root/trunk/nanosax.py

Revision 100 (checked in by johnny, 5 years ago)

Small optimization: replaced some interpolation by simple concatenation (thanks
fijal ;)

  • Property svn:eol-style set to native
Line 
1 # Copyright (c) 2005-2006 Guido Wesdorp. All rights reserved.
2 # This software is distributed under the terms of the Templess
3 # License. See LICENSE.txt for license text.
4 # E-mail: johnny@johnnydebris.net
5
6 """ NanoSax is a very simple event-based XML processing library, much like SAX.
7
8     It's written to accompany the Templess library, but can easily be used
9     stand-alone for quick and easy XML parsing
10 """
11
12 import re
13
class XMLError(Exception):
    """Error raised when a document turns out not to be well-formed.

    Attributes:
        lineno: line number the parser had reached when the problem was
            detected
        message: human-readable description of the problem
    """

    def __init__(self, lineno, message):
        """Remember where (lineno) and why (message) parsing failed."""
        self.lineno, self.message = lineno, message

    def __str__(self):
        """Return the line number and message as one readable sentence."""
        details = (self.lineno, self.message)
        return 'parse error in line %s: %s' % details
21
class nsparser(object):
    """Very simple event-based parser for XML.

    Emits events like SAX, except the API is a lot ('even' ;) simpler: the
    handler given to the constructor receives startdoc(), startel(name,
    attrs), endel(name), text(data), comment(data), cdata(data) and
    enddoc() calls while parse() walks the document.

    The XML declaration, processing instructions and document type
    declarations are discarded before parsing; entity references are
    handed to the handler untouched.  Works on Python 2.6+ and Python 3.
    """
    # chunk types produced by _parse_into_chunks()
    TYPE_TEXT = 1
    TYPE_START = 2
    TYPE_END = 3
    TYPE_COMMENT = 4
    TYPE_CDATA = 5

    # a complete element or attribute name
    _reg_name = re.compile(r'^[\w\:\-]+$', re.U)
    # the full content of a start tag: name followed by name=value pairs
    # (the lookaheads keep the quoted values from backtracking)
    _reg_start = re.compile(
        r'^([\w\:\-]+)(\s+(([\w\:\-]+)=(("(?=([^"]*)")[^"]*")|'
            "('(?=([^']*)')[^']*'))))*$", re.U)
    # one name=value pair; group 1 is the name, group 2 the value when
    # double-quoted, group 3 the value when single-quoted
    _reg_attr = re.compile(
        r'([\w\:\-]+)=(?:"([^"]*)"|\'([^\']*)\')', re.U)
    _reg_xml_decl = re.compile(r'<\?xml.*?>', re.S)
    # encoding pseudo-attribute, group 1 (double quotes) or 2 (single)
    _reg_encoding = re.compile(
        r'encoding\s*=\s*(?:"([^"]+)"|\'([^\']+)\')')
    # processing instructions end at '?>', they may contain a bare '>'
    _reg_pi = re.compile(r'<\?.*?\?>', re.S)
    _reg_dtd_1 = re.compile(r'<!DOCTYPE\s+[\w\:\-]+\s+\[.*?\]>', re.S)
    _reg_dtd_2 = re.compile(
        r'<!DOCTYPE\s+[\w\:\-]+\s+(?:SYSTEM|PUBLIC)\s+.*?>', re.S)

    def __init__(self, handler):
        """store the handler that will receive the parse events"""
        self.handler = handler

    def parse(self, xml):
        """parse the xml using self.handler

            xml is supposed to be either a unicode or ascii string, or a
            (byte) string with the character set as defined in the xml
            declaration

            raises XMLError if the document is not well-formed
        """
        xml = self._handle_pis(xml)
        self.handler.startdoc()
        for kind, lineno, chunk in self._parse_into_chunks(xml):
            if kind == self.TYPE_TEXT:
                self.handler.text(chunk)
            elif kind == self.TYPE_START:
                self.handler.startel(*self._parse_start(lineno, chunk))
            elif kind == self.TYPE_END:
                self.handler.endel(chunk)
            elif kind == self.TYPE_COMMENT:
                self.handler.comment(chunk)
            elif kind == self.TYPE_CDATA:
                self.handler.cdata(chunk)
        self.handler.enddoc()

    def _handle_pis(self, xml):
        """handle processing instructions

            takes care of handling (if appropriate) the XML declaration, and
            of discarding any processing instructions and document type
            declarations etc. the lib can't deal with

            returns unicode, if the input string is not already unicode the
            charset mentioned in the XML declaration will be used for
            conversion (UTF-8 if there is none)

            note that the removal is purely textual: it does not skip
            comments or CDATA sections, and line numbers reported later
            do not count lines that were removed here
        """
        if isinstance(xml, bytes):
            charset = 'UTF-8'
            # the regexps are text patterns, so look for the declaration
            # in a latin-1 decoded copy (every byte maps to exactly one
            # character, so this can not fail)
            match = self._reg_xml_decl.search(xml.decode('latin-1'))
            if match:
                encmatch = self._reg_encoding.search(match.group(0))
                if encmatch:
                    charset = encmatch.group(1) or encmatch.group(2)
            xml = xml.decode(charset)
        match = self._reg_xml_decl.search(xml)
        if match:
            xml = xml.replace(match.group(0), '')
        for reg in (self._reg_dtd_1, self._reg_dtd_2, self._reg_pi):
            xml = reg.sub('', xml)
        return xml

    def _parse_into_chunks(self, xml):
        """split the document into (type, lineno, data) tuples

            generator, type is one of the TYPE_* constants, lineno the line
            on which the chunk starts and data the text, the content of a
            comment or CDATA section, the inside of a start tag or the name
            of an end tag

            a singleton (<foo />) results in a TYPE_START chunk directly
            followed by a TYPE_END chunk

            raises XMLError when tags are not closed or not properly nested
        """
        xml = xml.strip()
        currline = 1
        namestack = [] # for error checking
        self._test(xml.startswith('<'), currline, 'text before document start')
        pos = 0 # index of the first character not yet processed
        end = len(xml)
        while pos < end:
            if xml.startswith('<![CDATA[', pos):
                endpos = xml.find(']]>', pos + 9)
                self._test(endpos > -1, currline, 'CDATA section not closed')
                data = xml[pos + 9:endpos]
                pos = endpos + 3
                yield self.TYPE_CDATA, currline, data
            elif xml.startswith('<!--', pos):
                # search behind the opener, so '<!-->' is not mistaken
                # for a complete comment
                endpos = xml.find('-->', pos + 4)
                self._test(endpos > -1, currline, 'comment not closed')
                data = xml[pos + 4:endpos]
                pos = endpos + 3
                yield self.TYPE_COMMENT, currline, data
            elif xml.startswith('</', pos):
                endpos = xml.find('>', pos)
                self._test(endpos > -1, currline, 'end tag not closed')
                data = xml[pos + 2:endpos]
                name = data.strip()
                self._test(self._reg_name.match(name), currline,
                            'illegal element name \'%s\' in end tag' % (name,))
                self._test(namestack, currline,
                            'closing tag \'%s\' without opening tag' % (name,))
                startname = namestack.pop()
                self._test(name == startname.strip(), currline,
                            ('closing tag \'%s\' doesn\'t match opening '
                                'tag \'%s\'') % (name, startname))
                pos = endpos + 1
                yield self.TYPE_END, currline, name
            elif xml.startswith('<', pos):
                endpos = xml.find('>', pos)
                self._test(endpos > -1, currline, 'start tag not closed')
                data = xml[pos + 1:endpos]
                issingle = data.endswith('/')
                if issingle:
                    data = data[:-1]
                # an empty tag ('<>') has no name at all, let the name
                # check below report that
                parts = data.split()
                name = parts and parts[0] or ''
                self._test(self._reg_name.match(name), currline,
                            'illegal element name \'%s\' for tag' % (name,))
                pos = endpos + 1
                yield self.TYPE_START, currline, data
                if issingle:
                    # singleton, close it right away
                    yield self.TYPE_END, currline, name
                else:
                    # opening tag, remember it to check the closing tag
                    namestack.append(name)
            else:
                endpos = xml.find('<', pos)
                self._test(endpos > -1, currline, 'text after document end')
                data = xml[pos:endpos]
                pos = endpos
                yield self.TYPE_TEXT, currline, data
            # chunks are reported with the line they start on
            currline += data.count('\n')
        self._test(not namestack, currline, 'document not closed')

    def _parse_start(self, lineno, data):
        """parse the inside of a start tag

            data is the text between '<' and '>' (or '/>'), returns a tuple
            (name, attrs) where attrs is a dict mapping attribute names to
            their (unprocessed) values, values can be quoted with either
            double or single quotes

            raises XMLError (using lineno) if data is not a name followed
            by name="value" pairs
        """
        match = self._reg_start.match(data.strip())
        self._test(match, lineno, 'illegal start tag content \'%s\'' % (data,))
        name = match.group(1)
        attrs = {}
        # the pattern consumes the quoted value, so something that looks
        # like an attribute inside a value is never picked up
        for attrmatch in self._reg_attr.finditer(match.group(0)[len(name):]):
            key, dquoted, squoted = attrmatch.groups()
            if dquoted is None:
                attrs[key] = squoted
            else:
                attrs[key] = dquoted
        return name, attrs

    def _test(self, assertion, lineno, message):
        """raises an exception with message as text when assertion is false"""
        if not assertion:
            raise XMLError(lineno, message)
185
class nshandler(object):
    """Do-nothing handler for nsparser.

        Defines the set of callbacks nsparser invokes on its handler. Every
        method here is a no-op, so a subclass only has to override the
        events it is interested in.
    """

    def startdoc(self):
        """called once, before any other event"""

    def enddoc(self):
        """called once, after the last chunk has been handled"""

    def startel(self, name, attrs):
        """called for a start tag; attrs is a dict of attribute values"""

    def endel(self, name):
        """called for an end tag (also directly after a singleton)"""

    def text(self, text):
        """called for character data found between tags"""

    def comment(self, text):
        """called with the content of a comment (without the markers)"""

    def cdata(self, text):
        """called with the content of a CDATA section"""
212
class echohandler(nshandler):
    """handler that writes the events it receives back out as XML

        the pieces are collected in self.buffer (a list of strings, created
        by startdoc()), when the document ends the joined result is made
        available as self.xml

        values are written out exactly as received, nothing is escaped
    """
    # True when the last thing written was a start tag, in which case an
    # end tag turns that start tag into a singleton ('<foo/>')
    _after_start = False

    def startdoc(self):
        """start with an empty buffer"""
        self.buffer = []
        self._after_start = False

    def enddoc(self):
        """join the buffer and store the result as self.xml"""
        self.xml = ''.join(self.buffer)

    def startel(self, name, attrs):
        """write a start tag with its attributes (always double-quoted)"""
        self.buffer += ['<', name]
        if attrs:
            self.buffer.append(' ')
            self.buffer.append(' '.join(['%s="%s"' % (k, v)
                                         for (k, v) in attrs.items()]))
        self.buffer.append('>')
        self._after_start = True

    def endel(self, name):
        """write an end tag, or close the start tag if the element is empty"""
        if self._after_start:
            # singleton: nothing was written since the start tag, so its
            # closing '>' is the last buffer item
            self.buffer[-1] = '/>'
        else:
            self.buffer += ['</', name, '>']
        self._after_start = False

    def text(self, text):
        """write character data"""
        self.buffer.append(text)
        self._after_start = False

    def comment(self, text):
        """write a comment"""
        self.buffer += ['<!--', text, '-->']
        self._after_start = False

    def cdata(self, text):
        """write a CDATA section"""
        self.buffer += ['<![CDATA[', text, ']]>']
        self._after_start = False
244
if __name__ == '__main__':
    # as an example we use the empty handler (base implementation), which
    # means nothing is produced, but the document is checked for
    # well-formedness (and the lib can be tested a bit)
    import sys
    if len(sys.argv) != 2:
        print('usage: %s <xmlfile>' % (sys.argv[0],))
        # wrong invocation is an error, tell the calling shell so
        sys.exit(1)
    fname = sys.argv[1]
    # read the raw bytes, so the parser decodes them using the charset
    # from the XML declaration instead of a platform default
    with open(fname, 'rb') as fp:
        xml = fp.read()
    h = nshandler()
    p = nsparser(h)
    p.parse(xml)
Note: See TracBrowser for help on using the browser.