Here's a simple example of an XML parser written in AFL, and its use:
# xmlparsers.afl
include "patterns.afl";
# This sample illustrates the use of different kinds of XML parsers:
# "push", "pull" and "DOM(-like)", and how a single parser implementation
# can be used in all three modes given appropriate language facilities.
# ----------------------------------------------------------------------
# This is a very primitive XML parser. It's written as a "push" parser.
# That is, it invokes a client-provided method (passed as "yield") for
# each XML event encountered. Note that events can be yielded when
# deeply nested within the XML parser, making it difficult to use it
# as a "pull" parser if written in most conventional programming languages.
# (It would have been easy to put all the "yield" calls at the parser's
# top level in such a small and simple parser, but I've nested the calls
# to make its implementation reflect the implementation of real-world
# push parsers. The point being made here is that how the parser is
# implemented doesn't impact how it can be used.)
# Note that pattern matching is used extensively to simplify the task
# of parsing. Although the XML parser is provided only with string
# input in the sample, it equally well accepts a file or other streaming
# input.
# Character sets and token patterns used by xmlParser and displayNodeSubset.
# NOTE(review): the readings of the pattern operators below (sequence,
# repetition) are inferred from how they are used in this file; confirm
# against patterns.afl.
# xmlLetter / xmlDigit: plain strings listing the ASCII letters and digits.
def xmlLetter: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
def xmlDigit: "0123456789";
# xmlName: one character from xmlLetter, then any run of characters drawn
# from the letters, "-", ".", "_" and the digits.
def xmlName: anyOf xmlLetter &: anyOf (xmlLetter ++ "-._" ++ xmlDigit):*;
# xmlWs: a run of space, tab, CR and LF characters (presumably non-empty).
def xmlWs: anyOf " \{tab}\{cr}\{lf}":+;
# xmlData: a run of letters, digits, spaces and the punctuation listed in
# the string below. The list omits the double quote, "<" and "&".
# displayNodeSubset uses it to pick the text it prints inside quotes.
def xmlData :
anyOf (xmlLetter ++ xmlDigit ++ " ~`!@#$%^*()_-+={}|[]\\:;',>.?/"):+;
# xmlParser (sourceText) (yield)
#
# Parses sourceText and calls yield once per XML event, in document order.
# Each event is a tuple whose first field names the event:
#   '("startTag", name, args)        - "<name attr=... >"; args is a hashtable
#                                      mapping attribute names to values
#   '("emptyTag", name, args)        - "<name attr=... />"
#   '("endTag", name)                - yielded after the content of a start
#                                      tag has been parsed
#   '("entity", name)                - "&name;"
#   '("numericCharacter", number)    - "&#digits;", number via toNumber
#   '("comment", text)               - "<!-- text -->"
#   '("processingInstruction", text) - "<? text ?>"
#   '("data", text)                  - a run of characters other than "<", "&"
#   '("error", message)              - a missing ">", an out-of-place end
#                                      tag, a stray "<" or "&", or an element
#                                      still open at the end of the input
#   '("done")                        - always the final event
#
# parseContent (elementName) parses the content of one element and calls
# itself for each nested start tag, so events are yielded from arbitrarily
# deep inside the parser. elementName is nil at the document level.
#
# The order of the alternatives in parseContent matters: a start tag is tried
# before an end tag, the named entity before the numeric reference, and the
# single-character "<" / "&" error case only after every construct that
# begins with one of those characters has been tried.
def xmlParser (sourceText) (yield):
{
    def parseContent (elementName):
        loop (exit):
            if ~: ="<" &: xmlName :> "gi" then
            {
                # Collect the attributes: name, "=", then a value closed by
                # the same quote character that opened it.
                def args: hashtable ();
                while ~: xmlWs &: xmlName :> "aname" &: xmlWs:? &: ="=" &:
                         xmlWs:? &: anyOf "\"'" :> "quote" &:
                         (noneOf "\"'":+ |: !: another "quote" &: anyOf "\"'"):*
                         :> "avalue" &: another "quote" do
                    args [matched ["aname"]] = matched ["avalue"];
                ~: xmlWs:?;
                # A "/" before the closing ">" marks an empty element.
                def tag : '((if ~: ="/" then "emptyTag" else "startTag"),
                            matched ["gi"], args);
                if ! (~: =">") then
                    yield ('("error", "missing > on <\{matched ["gi"]} ..."));
                yield (tag);
                # Only a start tag has content; the end tag event is yielded
                # once that content has been consumed.
                if tag.1 =="startTag" then
                {
                    parseContent (tag.2);
                    yield ('("endTag", tag.2))
                };
            }
            else if ~: ="</" &: xmlName :> "gi" &: xmlWs:? &: =">" then
                if matched ["gi"] == elementName then
                    exit '{}
                else
                    yield ('("error", "out-of-place end tag: </\{matched ["gi"]}>"))
            else if ~: ="&" &: xmlName :> "entityName" &: =";" then
                yield ('("entity", matched ["entityName"]))
            else if ~: ="&#" &: anyOf xmlDigit:+ :> "number" &: =";" then
                yield ('("numericCharacter", toNumber matched ["number"]))
            else if ~: ="<!--" &:
                       (noneOf "-":+ |: ="-" &: !: ="->"):* :> "comment" &:
                       ="-->" then
                yield ('("comment", matched ["comment"]))
            else if ~: ="<?" &: (noneOf "?":+ |: ="?" &: !: =">"):* :> "pi"
                       &: ="?>" then
                yield ('("processingInstruction", matched ["pi"]))
            else if ~: anyOf "<&" then
                yield ('("error", "found unmached \"\{matched []}\""))
            else if ~: noneOf "<&":+ then
                yield ('("data", matched []))
            else
            (
                # Nothing matched: the input is exhausted. Report the element
                # that is still open, if any, and stop parsing this level.
                if elementName != nil then
                    yield ('("error", "missing </\{elementName}>"));
                exit '{}
            );
    withSource sourceText do
        parseContent (nil);
    yield ('("done"))
};
# A simple displayer, just so that it's easy to see what's produced
# by the XML parser.
# displayNodeSubset (nodeSet, depth)
# Prints every node in nodeSet, one per line, and calls itself for nested
# node sets with depth + 1. A node is an event tuple as produced by
# xmlParser, optionally with a fourth field holding child nodes (as built
# by the captureXMLDocument functions later in this file).
def displayNodeSubset (nodeSet, depth):
for each nodeSet do (node):
{
# Prefix derived from depth (presumably " " repeated depth times), then
# the node's first field.
printinline " " ** depth ++ node.1;
if node hasField "2" then
(
printinline ":";
# Scan field 2 as pattern-matching input: each run matching xmlData is
# printed in double quotes, any other character is printed as "#"
# followed by the result of ord for that character.
withSource node.2 do
while ~: ?: any do
printinline if ~: xmlData then " \"\{matched []}\""
else " #\{ord (+: any)}";
);
# Finish the node's own line.
print "";
# Field 3, when present, is iterated as key/value pairs, each printed on
# its own line as "key"="value".
if node hasField "3" then
for each node.3 do (k, v):
print " " ** depth + 2 ++ "\"\{k}\"=\"\{v}\"";
# Field 4, when present, is displayed as a nested node set one level deeper.
if node hasField "4" then
displayNodeSubset (node.4, depth + 1);
};
# displayEvent: displays a single event by wrapping it in a one-element
# list and passing it to displayNodeSubset at depth 0. Used below in the
# form "displayEvent event".
def displayEvent [180 event]: displayNodeSubset ('[event], 0);
# displayNodeSet (nodeSet): displays a whole node set starting at depth 0.
def displayNodeSet (nodeSet): displayNodeSubset (nodeSet, 0);
# case: prints a line break followed by the heading "Case #n:" ahead of each
# demonstration below. Used in the form "case 1;".
def case [180 n]: print "\{lf}Case #\{n}:";
# ----------------------------------------------------------------------
# Now for using the parser.
# The issues of how processing is associated with individual XML events,
# method selection, rule selection, simple testing etc., are not dealt
# with in these implementations. Note that any such technique can be
# applied in all of the cases.
# xmlDocument: the sample XML text parsed by every case below. It is a
# string made of four lines joined by line feeds, so the parser also reports
# the line breaks between elements as "data" events.
def xmlDocument: "<doc><title>title text</title>\{lf}" ++
"<para>para text</para>\{lf}" ++
"<para>more para text</para>\{lf}" ++
"</doc>";
case 1;
# As a "push" parser (a.k.a. "SAX-like"). The XML parser is
# passed "processEvent" to be called when each XML event is encountered:
{
# processEvent receives one event tuple per call and displays it.
def processEvent (event):
displayEvent event;
# The second argument list supplies the parser's "yield" parameter.
xmlParser (xmlDocument) (processEvent);
};
case 2;
# As a "generator" of XML events:
# The parser is given only its first argument list here; presumably the
# "for" construct supplies the loop body as the parser's "yield" parameter,
# so the body runs once per event.
for xmlParser (xmlDocument) do (event):
displayEvent event;
case 3;
# As a "pull" parser. Each invocation of "pullParser" returns
# the next event. In this example, the end of parsing is indicated
# by the parser returning a "done" event.
{
# "ungenerate" is defined outside this file. It is given the parser
# (without its "yield" argument) and nil as a second argument; the later
# cases pass something to invoke at the end of parsing in that position.
def pullParser : ungenerate (xmlParser (xmlDocument), nil);
loop (exit):
{
# Field 1 of what pullParser returns is used as the event.
def event : pullParser (nil).1;
displayEvent event;
# The "done" event is displayed before the loop is left.
if event.1 == "done" then
exit '{};
};
};
case 4;
# As a "pull" parser again, with the parser exiting to "exit" once
# it's finished parsing (i.e. after the "done" event is returned).
catch exit:
{
# "exit" from the enclosing catch is passed as the second argument of
# ungenerate, so the loop below needs no termination test of its own.
def pullParser : ungenerate (xmlParser (xmlDocument), exit);
while true do
displayEvent pullParser (nil).1;
};
case 5;
# As a "pull" parser yet again, with the parser signalling an
# exception when it's finished.
try
{
# Keep a reference to exceptionHandler under a local name for use inside
# the function passed to ungenerate below.
def handler : exceptionHandler;
# The second argument is a parameterless function that calls handler with
# '{"EndDocument"}; presumably that is what raises the exception reported
# by the "except" clause below.
def pullParser : ungenerate (xmlParser (xmlDocument),
'{}: handler '{"EndDocument"});
while true do
displayEvent pullParser (nil).1
}
except (e):
print ">> exception: " ++ e;
case 6;
# As a "DOM-like" parser, returning a tree-like data structure
# representing the whole parsed document. "captureXMLDocument" uses
# the "push" parser and builds a tree of what it pushes:
{
def captureXMLDocument (sourceText):
{
# Stack of currently open elements. It starts with a "dummy" start tag
# whose fourth field collects the top-level nodes of the document.
def startTagStack : '['("startTag", "dummy",
arraylist (0), arraylist (0))];
# Callback passed to the parser as its "yield" parameter.
def captureNodes (event):
if event.1 == "startTag" then
# Open a new element: the event plus a fourth field for its children.
startTagStack [] = '("startTag", event.2, event.3, arraylist (0))
else if event.1 == "endTag" then
{
# Close the current element: take it off the stack and add it to the
# children (fourth field) of the element now on top.
def e : *startTagStack [];
startTagStack [].remove ();
(*startTagStack []).4 [] = e;
}
else
# Every other event, including the final "done" event, becomes a child
# of the element on top of the stack.
(*startTagStack []).4 [] = event;
xmlParser (sourceText) (captureNodes);
# The result is the child list of the element left on top of the stack.
(*startTagStack []).4
};
displayNodeSet (captureXMLDocument (xmlDocument));
};
case 7;
# As a "DOM-like" parser again, but this time "captureXMLDocument" uses
# the XML parser as a "pull" parser. Implementation is a bit cleaner
# than when using the "push" parser.
{
def captureXMLDocument (sourceText):
{
def pullParser : ungenerate (xmlParser (sourceText), nil);
# captureNodeSet collects events into a list until it meets an "endTag"
# or "done" event. For a "startTag" event it calls itself to collect that
# element's children.
def captureNodeSet ():
{
def nodeSet : arraylist (0);
loop (exit):
{
def event : pullParser (nil).1;
# "endTag" and "done" end the current list and are not stored in it.
if event.1 == "endTag" || event.1 == "done" then exit '{};
# A start tag is stored with a fourth field holding its children; any
# other event is stored unchanged.
nodeSet [] = if event.1 == "startTag" then
'("startTag", event.2, event.3, captureNodeSet ())
else
event;
};
nodeSet
};
captureNodeSet ();
};
displayNodeSet (captureXMLDocument (xmlDocument));
};
17 November 2005