An Example of XML Processing in AFL

Here's a simple example of an XML parser written in AFL, and its use:

# xmlparsers.afl

include "patterns.afl";

# This sample illustrates the use of different kinds of XML parsers:
# "push", "pull" and "DOM(-like)", and how a single parser implemenation
# can be used in all three modes given appropriate language facilities.

# ----------------------------------------------------------------------

# This is a very primative XML parser.  It's written as a "push" parser.
# That is, it invokes a client-provided method (passed as "yield") for
# each XML event encountered.  Note that events can be yielded when
# deeply nested within the XML parser, making it difficult to use it
# as a "pull" parser if written in most conventional programming languages.
# (It would have been easy to put all the "yield" calls at the parser's
# top level in such a small and simple parser, but I've nested the calls
# to make its implementation reflect the implementation of real-world
# push parsers.  The point being made here is that how the parser is
# implemented doesn't impact how it can be used.)

# Note that pattern matching is used extensively to simplify the task
# of parsing.   Although the XML parser is provided only with string
# input in the sample, it equally well accepts a file or other streaming
# input.

def xmlLetter: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
def xmlDigit:  "0123456789";
def xmlName:   anyOf xmlLetter &: anyOf (xmlLetter ++ "-._" ++ xmlDigit):*;
def xmlWs:     anyOf " \{tab}\{cr}\{lf}":+;
def xmlData :
    anyOf (xmlLetter ++ xmlDigit ++ " ~`!@#$%^*()_-+={}|[]\\:;',>.?/"):+;

def xmlParser (sourceText) (yield):
  {
    def parseContent (elementName):
      loop (exit):
        if ~: ="<" &: xmlName :> "gi" then
          {
            def args: hashtable ();
            while ~: xmlWs &: xmlName :> "aname" &: xmlWs:? &: ="=" &:
                     xmlWs:? &: anyOf "\"'" :> "quote" &:
                     (noneOf "\"'":+ |: !: another "quote" &: anyOf "\"'"):*
                         :> "avalue" &: another "quote" do
              args [matched ["aname"]] = matched ["avalue"];
            ~: xmlWs:?;
            def tag : '((if ~: ="/" then "emptyTag" else "startTag"),
                        matched ["gi"], args);
            if ! (~: =">") then
              yield ('("error", "missing > on <\{matched ["gi"]} ..."));
            yield (tag);
            if tag.1 =="startTag" then
              {
                parseContent (tag.2);
                yield ('("endTag", tag.2))
              };
          }
        else if ~: ="</" &: xmlName :> "gi" &: xmlWs:? &: =">" then
          if matched ["gi"] == elementName then
            exit '{}
          else
            yield ('("error", "out-of-place end tag: </\{matched ["gi"]}>"))
        else if ~: ="&" &: xmlName :> "entityName" &: =";" then
          yield ('("entity", matched ["entityName"]))
        else if ~: ="&#" &: xmlDigit :> "number" &: =";" then
          yield ('("numericCharacter", toNumber matched ["number"]))
        else if ~: ="<!--" &:
                   (noneOf "-":+ |: ="-" &: !: ="->"):* :> "comment" &:
                   ="-->" then
          yield ('("comment", matched ["comment"]))
        else if ~: ="<?" &: (noneOf "?":+ |: ="?" &: !: =">"):* :> "pi"
                           &: "-->" then
          yield ('("processingInstruction", matched ["pi"]))
        else if ~: anyOf "<&" then
          yield ('("error", "found unmached \"\{matched []}\""))
        else if ~: noneOf "<&":+ then
          yield ('("data", matched []))
        else
          (
            if elementName != nil then
              yield ('("error", "missing </\{elementName}>"));
            exit '{}
          );
    withSource sourceText do
      parseContent (nil);
    yield ('("done"))
  };

# A simple displayer, just so that it's easy to see what's produced
# by the XML parser.

def displayNodeSubset (nodeSet, depth):
  for each nodeSet do (node):
    {
      printinline "  " ** depth ++ node.1;
      if node hasField "2" then
        (
          printinline ":";
          withSource node.2 do
            while ~: ?: any do
              printinline if ~: xmlData then " \"\{matched []}\""
                                        else " #\{ord (+: any)}";
        );
      print "";
      if node hasField "3" then
        for each node.3 do (k, v):
           print "  " ** depth + 2 ++ "\"\{k}\"=\"\{v}\"";
      if node hasField "4" then
        displayNodeSubset (node.4, depth + 1);
    };
def displayEvent [180 event]: displayNodeSubset ('[event], 0);
def displayNodeSet (nodeSet): displayNodeSubset (nodeSet, 0);

def case [180 n]: print "
                        "Case #\{n}:"

# ----------------------------------------------------------------------

# Now for using the parser.

# The issues of how processing is associated with individual XML events,
# method selection, rule selection, simple testing etc., are not dealt
# with in these implementations.  Note that any such technique can be
# applied in all of the cases.

def xmlDocument: <doc><title>title text</title>
                 <para>para text</para>
                 <para>more para text</para>
                 </doc>;

case 1;

# As a "push" parser (a.k.a. "SAX-like").  The XML parser is
# passed "processEvent" to be called when each XML event is encountered:

{
  def processEvent (event):
    displayEvent event;
  xmlParser (xmlDocument) (processEvent);
};

case 2;

# As a "generator" of XML events:

for xmlParser (xmlDocument) do (event):
  displayEvent event;

case 3;

# As a "pull" parser.  Each invocation of "pullParser" returns
# the next event.  In this example, the end of parsing is indicated
# by the parser returning a "done" event.

{
  def pullParser : ungenerate (xmlParser (xmlDocument), nil);
  loop (exit):
    {
      def event : pullParser (nil).1;
      displayEvent event;
      if event.1 == "done" then
        exit '{};
    };
};

case 4;

# As a "pull" parser again, with the parser exiting to "exit" once
# it's finished parsing (i.e. after the "done" event is returned).

catch exit:
  {
    def pullParser : ungenerate (xmlParser (xmlDocument), exit);
    while true do
      displayEvent pullParser (nil).1;
  };

case 5;

# As a "pull" parser yet again, with the parser signalling an
# exception when it's finished.

try
  {
    def handler : exceptionHandler;
    def pullParser : ungenerate (xmlParser (xmlDocument),
                                 '{}: handler '{"EndDocument"});
    while true do
      displayEvent pullParser (nil).1
  }
except (e):
  print ">> exception: " ++ e;

case 6;

# As a "DOM-like" parser, returning a tree-like data structure
# representing the whole parsed document.  "captureXMLDocument" uses
# the "push" parser and builds a tree of what it pushes:

{
  def captureXMLDocument (sourceText):
    {
      def startTagStack : '['("startTag", "dummy",
                              arraylist (0), arraylist (0))];
      def captureNodes (event):
        if event.1 == "startTag" then
          startTagStack [] = '("startTag", event.2, event.3, arraylist (0))
        else if event.1 == "endTag" then
          {
            def e : *startTagStack [];
            startTagStack [].remove ();
            (*startTagStack []).4 [] = e;
          }
        else
          (*startTagStack []).4 [] = event;
      xmlParser (sourceText) (captureNodes);
      (*startTagStack []).4
    };
  displayNodeSet (captureXMLDocument (xmlDocument));
};

case 7;

# As a "DOM-like" parser again, but this time "captureXMLDocument" uses
# the XML parser as a "pull" parser.  Implementation is a bit cleaner
# than when using the "push" parser.

{
  def captureXMLDocument (sourceText):
    {
      def pullParser : ungenerate (xmlParser (sourceText), nil);
      def captureNodeSet ():
        {
          def nodeSet : arraylist (0);
          loop (exit):
            {
              def event : pullParser (nil).1;
              if event.1 == "endTag" || event.1 == "done" then exit '{};
              nodeSet [] = if event.1 == "startTag" then
                             '("startTag", event.2, event.3, captureNodeSet ())
                           else
                             event;
            };
          nodeSet
        };
      captureNodeSet ();
    };
  displayNodeSet (captureXMLDocument (xmlDocument));
};

17 November 2005