OpenLayers OpenLayers

root/trunk/openlayers/tools/BeautifulSoup.py

Revision 5362, 67.9 kB (checked in by crschmidt, 9 months ago)

Merge the excellent documentation work done during foss4g into trunk. Many
thanks to all the contributors who helped put this together.
I'm not exactly sure of what's going to happen with this, but for now,
at http://openlayers.org/dev/doc/examples.html you can see links to all the
examples *with descriptions*. Hooray!

Line 
1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
5
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
9
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
15
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
19
20 * chardet, for auto-detecting character encodings
21   http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23   by stock Python.
24   http://cjkpython.i18n.org/
25
26 Beautiful Soup defines classes for two main parsing strategies:
27     
28  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29    language that kind of looks like XML.
30
31  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32    or invalid. This class has web browser-like heuristics for
33    obtaining a sensible parse tree in the face of common HTML errors.
34
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43 """
44 from __future__ import generators
45
46 __author__ = "Leonard Richardson (leonardr@segfault.org)"
47 __version__ = "3.0.4"
48 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
49 __license__ = "PSF"
50
51 from sgmllib import SGMLParser, SGMLParseError
52 import codecs
53 import types
54 import re
55 import sgmllib
56 try:
57   from htmlentitydefs import name2codepoint
58 except ImportError:
59   name2codepoint = {}
60
61 #This hack makes Beautiful Soup able to parse XML with namespaces
62 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
63
64 DEFAULT_OUTPUT_ENCODING = "utf-8"
65
66 # First, the classes that represent markup elements.
67
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element keeps five links, all initialised by setup():

    * parent -- the element whose `contents` list holds this one.
    * previous / next -- the neighbours in parse (document) order.
    * previousSibling / nextSibling -- the neighbours inside
      parent.contents.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has children, this element is linked as
        the next sibling of the parent's current last child. The
        element is NOT added to parent.contents here."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _identityIndex(self, element, siblings):
        """Returns the position of `element` itself in the list
        `siblings`, or None if it is not there.

        The comparison uses `is` rather than `==` so that an element
        is never confused with a different element that merely
        compares equal to it."""
        for i, candidate in enumerate(siblings):
            if candidate is element:
                return i
        return None

    def replaceWith(self, replaceWith):
        """Removes this element from its parent and puts `replaceWith`
        in the position this element occupied.

        `replaceWith` may be one of this element's own siblings; in
        that case insert() compensates for the index shift caused by
        pulling the sibling out of the list, so no adjustment is made
        here. Raises ValueError if this element is not in its parent's
        contents."""
        oldParent = self.parent
        myIndex = self._identityIndex(self, oldParent.contents)
        if myIndex is None:
            raise ValueError("element is not in its parent's contents")
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            # Remove this very object, not the first sibling that
            # happens to compare equal to it.
            siblings = self.parent.contents
            myIndex = self._identityIndex(self, siblings)
            if myIndex is not None:
                del siblings[myIndex]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        # The links are compared against None explicitly: an element
        # can be falsy (an empty string) and still need relinking.
        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts `newChild` into this element's contents at index
        `position` and rewires all five navigation links.

        A plain string is wrapped in a NavigableString first. A
        position past the end is clamped to the end. If `newChild` is
        already attached somewhere it is extracted first; `position`
        refers to the contents list as it is when insert() is called."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self._identityIndex(newChild, self.contents)
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child in document order is
            # the next sibling of the nearest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None: # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling is not None:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # kwargs are attribute criteria and must reach the strainer.
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if there is no such parent."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the find-all style `method` with a limit of 1 and
        returns its single result, or None if nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        `name` may be a ready-made SoupStrainer; otherwise one is
        built from name, attrs, text and kwargs. Stops as soon as
        `limit` (if truthy) results have been collected."""

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The navigation generators end with a None; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags. Each one follows a single link until
    #it runs out, and the final value yielded is that terminating
    #None. The loops test against None explicitly so that a falsy
    #element (an empty string) does not end the walk early.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns `str` with every "%SOUP-ENCODING%" placeholder
        replaced by the encoding name ("utf-8" when none is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With a truthy `encoding` the result is s (stringified first if
        it is not a string) encoded with it; without one the result is
        a unicode object."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
350
class NavigableString(unicode, PageElement):
    """A piece of text from the document: a unicode string that also
    carries the navigation links of a PageElement."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        """Returns the text unencoded."""
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text encoded in `encoding`, or this object
        itself when `encoding` is falsy."""
        if not encoding:
            return self
        return self.encode(encoding)
370        
class CData(NavigableString):
    """Text that is rendered inside a <![CDATA[...]]> section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, encoded as NavigableString.__str__ does,
        wrapped in a CDATA section."""
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
375
class ProcessingInstruction(NavigableString):
    """Text that is rendered as a <?...?> processing instruction."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the instruction wrapped in <? and ?>, after filling
        in any %SOUP-ENCODING% placeholder with the output encoding."""
        if "%SOUP-ENCODING%" in self:
            body = self.substituteEncoding(self, encoding)
        else:
            body = self
        return "<?%s?>" % self.toEncoding(body, encoding)
382
class Comment(NavigableString):
    """Text that is rendered as a <!--...--> comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, encoded as NavigableString.__str__ does,
        wrapped in comment delimiters."""
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
386
class Declaration(NavigableString):
    """Text that is rendered as a <!...> declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, encoded as NavigableString.__str__ does,
        wrapped in <! and >."""
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
390
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    Attributes are kept twice: `attrs` is the ordered list of
    (key, value) pairs, and `attrMap` is a dictionary built lazily
    from that list by _getAttrMap()."""

    # Only the "&", "<" and ">" entries are used by __str__ below.
    # NOTE(review): "squot" and "quote" are not the predefined XML
    # entity names (those are "apos" and "quot"). The table is left
    # unchanged because code outside this class may read it -- confirm
    # before correcting.
    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
                                      '"' : "quote",
                                      "&" : "amp",
                                      "<" : "lt",
                                      ">" : "gt" }

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        `parser` is only used to record its class and to ask whether
        `name` is a self-closing tag. `attrs` is a list of
        (key, value) pairs; `parent` and `previous` are passed on to
        setup()."""

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns whether the tag has a 'key' attribute."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "x in tag tests whether x is one of the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()[key] = value
        found = False
        for i, item in enumerate(self.attrs):
            # No break: bad HTML can define the same attribute
            # multiple times, and every occurrence is updated.
            if item[0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Build the filtered list first and assign it in place:
        # removing items from a list while looping over that same list
        # skips the element after each removal, which would leave
        # adjacent duplicate attributes behind.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.foo and tag.fooTag both return the first tag named
        'foo' beneath this tag (None if there isn't one).

        Names starting with a double underscore are never treated as
        tag names: they raise AttributeError, so that attribute probes
        for special methods see them as missing rather than getting
        None back."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.find(tag[:-3])
        elif not tag.startswith('__'):
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') \
           or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') \
           or self.name != other.name \
           or self.attrs != other.attrs \
           or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        With prettyPrint, the tag and its contents are put on separate
        lines and indented according to indentLevel. A hidden tag
        renders as its contents only.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        # This can't happen naturally, but it can happen
                        # if you modify an attribute value after parsing.
                        if "'" in val:
                            # NOTE(review): "&squot;" is not a
                            # predefined XML or HTML entity; the output
                            # is kept as-is pending a decision on the
                            # replacement ("&apos;" or "&#39;").
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = re.sub(r"([<>]|&(?![^\s]+;))",
                                 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
                                 val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        space = ''
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with pretty-printing switched on."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.

        With prettyPrint, text children are stripped, indented and
        put on their own lines; text that is empty after stripping is
        dropped."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria, or None if nothing matches."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive false, only direct children are examined."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        "Pre-3.x spelling of findAll(text=...)."
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        "Pre-3.x spelling of find(text=...)."
        return self.find(text=text, recursive=recursive)

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        Only the contents list is changed; the navigation links of
        `tag` are not touched here."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized, and returns it."""
        # Look in the instance dictionary directly. A plain getattr()
        # for a missing 'attrMap' would fall through to __getattr__
        # and run a search for a child tag named "attrMap".
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        # The generator simply ends when the loop does; an explicit
        # 'raise StopIteration' is an error inside a generator on
        # Pythons that implement PEP 479.
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag in document order
        (each tag before its own contents)."""
        # Each stack entry is (tag, index of the next child to visit).
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Remember where to resume in this tag, then
                        # descend into the child first.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
698
699 # Next, a couple classes to represent queries and their results.
700 class SoupStrainer:
701     """Encapsulates a number of ways of matching a markup element (tag or
702     text)."""
703
704     def __init__(self, name=None, attrs={}, text=None, **kwargs):
705         self.name = name
706         if isString(attrs):
707             kwargs['class'] = attrs
708             attrs = None
709         if kwargs:
710             if attrs:
711                 attrs = attrs.copy()
712                 attrs.update(kwargs)
713             else:
714                 attrs = kwargs
715         self.attrs = attrs
716         self.text = text
717
718     def __str__(self):
719         if self.text:
720             return self.text
721         else:
722             return "%s|%s" % (self.name, self.attrs)
723    
724     def searchTag(self, markupName=None, markupAttrs={}):
725         found = None
726         markup = None
727         if isinstance(markupName, Tag):
728             markup = markupName
729             markupAttrs = markup
730         callFunctionWithTagData = callable(self.name) \
731                                 and not isinstance(markupName, Tag)
732
733         if (not self.name) \
734                or callFunctionWithTagData \
735                or (markup and self._matches(markup, self.name)) \
736                or (not markup and self._matches(markupName, self.name)):
737             if callFunctionWithTagData:
738                 match = self.name(markupName, markupAttrs)
739             else:
740                 match = True           
741                 markupAttrMap = None
742                 for attr, matchAgainst in self.attrs.items():
743                     if not markupAttrMap:
744                          if hasattr(markupAttrs, 'get'):
745                             markupAttrMap = markupAttrs
746                          else:
747                             markupAttrMap = {}
748                             for k,v in markupAttrs:
749                                 markupAttrMap[k] = v
750                     attrValue = markupAttrMap.get(attr)
751                     if not self._matches(attrValue, matchAgainst):
752                         match = False
753                         break
754             if match:
755                 if markup:
756                     found = markup
757                 else:
758                     found = markupName
759         return found
760
761     def search(self, markup):
762         #print 'looking for %s in %s' % (self, markup)
763         found = None
764         # If given a list of items, scan it for a text element that
765         # matches.       
766         if isList(markup) and not isinstance(markup, Tag):
767             for element in markup:
768                 if isinstance(element, NavigableString) \
769                        and self.search(element):
770                     found = element
771                     break
772         # If it's a Tag, make sure its name or attributes match.
773         # Don't bother with Tags if we're searching for text.
774         elif isinstance(markup, Tag):
775             if not self.text:
776                 found = self.searchTag(markup)
777         # If it's text, make sure the text matches.
778         elif isinstance(markup, NavigableString) or \
779                  isString(markup):
780             if self._matches(markup, self.text):
781                 found = markup
782         else:
783             raise Exception, "I don't know how to match against a %s" \
784                   % markup.__class__
785