| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
|
|---|
| 6 |
""" NanoSax is a very simple event-based XML processing library, much like SAX. |
|---|
| 7 |
|
|---|
| 8 |
It's written to accompany the Templess library, but can easily be used |
|---|
| 9 |
stand-alone for quick and easy XML parsing. |
|---|
| 10 |
""" |
|---|
| 11 |
|
|---|
| 12 |
import re |
|---|
| 13 |
|
|---|
| 14 |
class XMLError(Exception):
    """Raised when the parsed document is not well-formed.

    Attributes:
        lineno: line number the parser had reached when it found the problem
        message: human-readable description of the problem
    """

    def __init__(self, lineno, message):
        # hand both values to Exception as well, so that ``args``, ``repr()``
        # and pickling carry the error details instead of being empty
        super(XMLError, self).__init__(lineno, message)
        self.lineno = lineno
        self.message = message

    def __str__(self):
        return 'parse error in line %s: %s' % (self.lineno, self.message)
|---|
| 21 |
|
|---|
| 22 |
class nsparser(object):
    """Very simple event-based parser for XML.

    Emits events like SAX, except the API is a lot simpler: the handler
    passed to the constructor must provide the methods ``startdoc()``,
    ``enddoc()``, ``startel(name, attrs)``, ``endel(name)``, ``text(text)``,
    ``comment(text)`` and ``cdata(text)``.
    """

    # chunk types produced by _parse_into_chunks()
    TYPE_TEXT = 1
    TYPE_START = 2
    TYPE_END = 3
    TYPE_COMMENT = 4
    TYPE_CDATA = 5

    # a complete element or attribute name
    _reg_name = re.compile(r'^[\w\:\-]+$', re.U)
    # the full content of a start tag: a name followed by name="value" or
    # name='value' pairs; compiled with re.U like _reg_name so both accept
    # the same set of names
    _reg_start = re.compile(
        r'^([\w\:\-]+)(\s+(([\w\:\-]+)=(("(?=([^"]*)")[^"]*")|'
        "('(?=([^']*)')[^']*'))))*$", re.U)
    # a single attribute; consumes only name, '=' and the opening quote, the
    # value is captured by a lookahead: group 4 for double-quoted values,
    # group 6 for single-quoted ones
    _reg_attr = re.compile(
        r'([\w\:\-]+)((\="(?=([^">]*)"))|(\=\'(?=([^\'>]*)\')))', re.U)
    _reg_xml_decl = re.compile(r'<\?xml.*?>', re.S)
    # the encoding pseudo-attribute may use either kind of quote
    _reg_encoding = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']')
    _reg_pi = re.compile(r'<\?.*?>', re.S)
    _reg_dtd_1 = re.compile(r'<!DOCTYPE\s+[\w\:\-]+\s+\[.*?\]>', re.S)
    _reg_dtd_2 = re.compile(r'<!DOCTYPE\s+[\w\:\-]+\s+SYSTEM\s+.*?>')

    def __init__(self, handler):
        self.handler = handler

    def parse(self, xml):
        """parse the xml using self.handler

        xml is supposed to be either a unicode or ascii string, or a
        byte string with the character set as defined in the xml
        declaration

        raises XMLError if the document is not well-formed
        """
        xml = self._handle_pis(xml)
        handler = self.handler
        handler.startdoc()
        for kind, lineno, chunk in self._parse_into_chunks(xml):
            if kind == self.TYPE_TEXT:
                handler.text(chunk)
            elif kind == self.TYPE_START:
                handler.startel(*self._parse_start(lineno, chunk))
            elif kind == self.TYPE_END:
                handler.endel(chunk)
            elif kind == self.TYPE_COMMENT:
                handler.comment(chunk)
            elif kind == self.TYPE_CDATA:
                handler.cdata(chunk)
        handler.enddoc()

    def _handle_pis(self, xml):
        """handle processing instructions

        takes care of handling (if appropriate) the XML declaration, and
        of discarding any processing instructions and document type
        declarations etc. the lib can't deal with

        returns unicode, if the input is a byte string the charset
        mentioned in the XML declaration will be used for conversion
        (UTF-8 if there is none)
        """
        charset = 'UTF-8'
        is_bytes = isinstance(xml, bytes)
        # latin-1 maps every byte to a code point, so this decode can not
        # fail; the result is only used to find the declared encoding
        probe = xml.decode('latin-1') if is_bytes else xml
        match = self._reg_xml_decl.search(probe)
        if match:
            encmatch = self._reg_encoding.search(match.group(0))
            if encmatch:
                charset = encmatch.group(1)
        if is_bytes:
            # bytes.decode() exists on both Python 2 and 3, unlike unicode()
            xml = xml.decode(charset)
        for reg in (self._reg_xml_decl, self._reg_dtd_1, self._reg_dtd_2,
                    self._reg_pi):
            while True:
                match = reg.search(xml)
                if not match:
                    break
                xml = xml.replace(match.group(0), '')
        return xml

    def _parse_into_chunks(self, xml):
        """split the document into chunks

        generator that yields (type, lineno, data) tuples, where type is
        one of the TYPE_* constants; for start tags data is the raw tag
        content (without the angle brackets and the '/' of an empty
        element), for end tags it's the element name, an empty element
        produces both a start and an end chunk

        raises XMLError when the markup is not well-formed
        """
        xml = xml.strip()
        currline = 1
        namestack = []
        self._test(xml.startswith('<'), currline, 'text before document start')
        # walk through the string with a cursor rather than cutting the
        # handled part off on every iteration, which would copy the rest of
        # the document for each chunk
        pos = 0
        length = len(xml)
        while pos < length:
            if xml.startswith('<![CDATA[', pos):
                endpos = xml.find(']]>', pos + 9)
                self._test(endpos > -1, currline, 'CDATA section not closed')
                data = xml[pos + 9:endpos]
                pos = endpos + 3
                yield self.TYPE_CDATA, currline, data
                currline += data.count('\n')
            elif xml.startswith('<!--', pos):
                # search behind the opener, so the dashes of '<!--' are not
                # mistaken for the start of the terminator
                endpos = xml.find('-->', pos + 4)
                self._test(endpos > -1, currline, 'comment not closed')
                data = xml[pos + 4:endpos]
                pos = endpos + 3
                yield self.TYPE_COMMENT, currline, data
                currline += data.count('\n')
            elif xml.startswith('</', pos):
                endpos = xml.find('>', pos + 2)
                self._test(endpos > -1, currline, 'end tag not closed')
                data = xml[pos + 2:endpos]
                name = data.strip()
                self._test(self._reg_name.match(name), currline,
                           'illegal element name \'%s\' in end tag' % (name,))
                # an end tag without any open element is a parse error, not
                # an IndexError
                self._test(namestack, currline,
                           'closing tag \'%s\' has no opening tag' % (name,))
                startname = namestack.pop()
                self._test(name == startname, currline,
                           ('closing tag \'%s\' doesn\'t match opening '
                            'tag \'%s\'') % (name, startname))
                pos = endpos + 1
                yield self.TYPE_END, currline, name
                currline += data.count('\n')
            elif xml.startswith('<', pos):
                endpos = xml.find('>', pos + 1)
                self._test(endpos > -1, currline, 'start tag not closed')
                data = xml[pos + 1:endpos]
                issingle = data.endswith('/')
                if issingle:
                    data = data[:-1]
                # '<>' and '</>' have no name at all, report that as an
                # illegal (empty) name
                parts = data.split()
                name = parts[0] if parts else ''
                self._test(self._reg_name.match(name), currline,
                           'illegal element name \'%s\' for tag' % (name,))
                pos = endpos + 1
                yield self.TYPE_START, currline, data
                if issingle:
                    # empty element: close it right away
                    yield self.TYPE_END, currline, name
                else:
                    namestack.append(name)
                currline += data.count('\n')
            else:
                endpos = xml.find('<', pos)
                self._test(endpos > -1, currline, 'text after document end')
                data = xml[pos:endpos]
                pos = endpos
                yield self.TYPE_TEXT, currline, data
                currline += data.count('\n')
        self._test(not namestack, currline, 'document not closed')

    def _parse_start(self, lineno, data):
        """parse the content of a start tag

        returns a tuple (name, attrs), where attrs is a dict mapping
        attribute names to their (unprocessed) values

        raises XMLError if data is not a name followed by properly quoted
        attributes
        """
        match = self._reg_start.match(data.strip())
        self._test(match, lineno, 'illegal start tag content \'%s\'' % (data,))
        name = match.group(1)
        rest = match.group(0)[len(name):]
        attrs = {}
        pos = 0
        while True:
            match = self._reg_attr.search(rest, pos)
            if not match:
                break
            value = match.group(4)
            if value is None:
                # the attribute was written with single quotes
                value = match.group(6)
            attrs[match.group(1)] = value
            # the match ends behind the opening quote, continue behind the
            # value and its closing quote
            pos = match.end() + len(value) + 1
        return name, attrs

    def _test(self, assertion, lineno, message):
        """raises an exception with message as text when assertion is false"""
        if not assertion:
            raise XMLError(lineno, message)
|---|
| 185 |
|
|---|
| 186 |
class nshandler(object):
    """Do-nothing handler for nsparser.

    Defines the set of methods nsparser calls on its handler; subclass it
    and override only the events you are interested in.
    """

    def startdoc(self):
        """called once, before any other event"""

    def enddoc(self):
        """called once, after all other events"""

    def startel(self, name, attrs):
        """called for a start tag, attrs is a dict of its attributes"""

    def endel(self, name):
        """called for an end tag (also for empty elements)"""

    def text(self, text):
        """called for text between tags"""

    def comment(self, text):
        """called with the content of a comment"""

    def cdata(self, text):
        """called with the content of a CDATA section"""
|---|
| 212 |
|
|---|
| 213 |
class echohandler(nshandler):
    """Handler that writes the events it receives back out as XML.

    The pieces are collected in self.buffer (a list of strings); when the
    document ends the result is made available as self.xml. An element
    without any content is written as an empty-element tag ('<a/>').
    """

    def startdoc(self):
        self.buffer = []
        # True only while the last thing written is a start tag, endel()
        # uses this to decide whether the element can be collapsed
        self._tag_open = False

    def enddoc(self):
        self.xml = ''.join(self.buffer)

    def startel(self, name, attrs):
        self.buffer += ['<', name]
        for key, value in attrs.items():
            value = '%s' % (value,)
            # the value is written unprocessed, so pick the quote character
            # that doesn't appear in it
            quote = "'" if '"' in value else '"'
            self.buffer.append(' %s=%s%s%s' % (key, quote, value, quote))
        self.buffer.append('>')
        self._tag_open = True

    def endel(self, name):
        if self._tag_open:
            # nothing was written since the start tag: replace its closing
            # '>' so it becomes an empty-element tag
            self.buffer[-1] = '/>'
        else:
            self.buffer += ['</', name, '>']
        # checking the buffer for a trailing '>' instead of keeping a flag
        # would also trigger right after a nested end tag
        self._tag_open = False

    def text(self, text):
        self.buffer.append(text)
        self._tag_open = False

    def comment(self, text):
        self.buffer += ['<!--', text, '-->']
        self._tag_open = False

    def cdata(self, text):
        self.buffer += ['<![CDATA[', text, ']]>']
        self._tag_open = False
|---|
| 244 |
|
|---|
| 245 |
if __name__ == '__main__':
    # as an example we use the empty handler (base implementation), which
    # means nothing is produced, but the document is checked for
    # well-formedness (and the lib can be tested a bit)
    import sys

    if len(sys.argv) != 2:
        # sys.exit() with a string writes it to stderr and exits with
        # status 1, so a wrong invocation is not reported as success
        sys.exit('usage: %s <xmlfile>' % (sys.argv[0],))
    fname = sys.argv[1]
    # read the raw bytes and leave decoding to the parser; the 'with' block
    # makes sure the file is closed again
    with open(fname, 'rb') as xmlfile:
        xml = xmlfile.read()
    h = nshandler()
    p = nsparser(h)
    try:
        p.parse(xml)
    except XMLError as e:
        # report a document that is not well-formed without a traceback
        sys.exit(str(e))
|---|