| 1 |
"""Beautiful Soup |
|---|
| 2 |
Elixir and Tonic |
|---|
| 3 |
"The Screen-Scraper's Friend" |
|---|
| 4 |
http://www.crummy.com/software/BeautifulSoup/ |
|---|
| 5 |
|
|---|
| 6 |
Beautiful Soup parses a (possibly invalid) XML or HTML document into a |
|---|
| 7 |
tree representation. It provides methods and Pythonic idioms that make |
|---|
| 8 |
it easy to navigate, search, and modify the tree. |
|---|
| 9 |
|
|---|
| 10 |
A well-formed XML/HTML document yields a well-formed data |
|---|
| 11 |
structure. An ill-formed XML/HTML document yields a correspondingly |
|---|
| 12 |
ill-formed data structure. If your document is only locally |
|---|
| 13 |
well-formed, you can use this library to find and process the |
|---|
| 14 |
well-formed part of it. The BeautifulSoup class uses browser-like heuristics to cope with common HTML errors. |
|---|
| 15 |
|
|---|
| 16 |
Beautiful Soup works with Python 2.2 and up. It has no external |
|---|
| 17 |
dependencies, but you'll have more success at converting data to UTF-8 |
|---|
| 18 |
if you also install these three packages: |
|---|
| 19 |
|
|---|
| 20 |
* chardet, for auto-detecting character encodings |
|---|
| 21 |
http://chardet.feedparser.org/ |
|---|
| 22 |
* cjkcodecs and iconv_codec, which add more encodings to the ones supported |
|---|
| 23 |
by stock Python. |
|---|
| 24 |
http://cjkpython.i18n.org/ |
|---|
| 25 |
|
|---|
| 26 |
Beautiful Soup defines classes for two main parsing strategies: |
|---|
| 27 |
|
|---|
| 28 |
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
|---|
| 29 |
language that kind of looks like XML. |
|---|
| 30 |
|
|---|
| 31 |
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid |
|---|
| 32 |
or invalid. This class has web browser-like heuristics for |
|---|
| 33 |
obtaining a sensible parse tree in the face of common HTML errors. |
|---|
| 34 |
|
|---|
| 35 |
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting |
|---|
| 36 |
the encoding of an HTML or XML document, and converting it to |
|---|
| 37 |
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. |
|---|
| 38 |
|
|---|
| 39 |
For more than you ever wanted to know about Beautiful Soup, see the |
|---|
| 40 |
documentation: |
|---|
| 41 |
http://www.crummy.com/software/BeautifulSoup/documentation.html |
|---|
| 42 |
|
|---|
| 43 |
""" |
|---|
| 44 |
from __future__ import generators |
|---|
| 45 |
|
|---|
| 46 |
__author__ = "Leonard Richardson (leonardr@segfault.org)" |
|---|
| 47 |
__version__ = "3.0.4" |
|---|
| 48 |
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" |
|---|
| 49 |
__license__ = "PSF" |
|---|
| 50 |
|
|---|
| 51 |
from sgmllib import SGMLParser, SGMLParseError |
|---|
| 52 |
import codecs |
|---|
| 53 |
import types |
|---|
| 54 |
import re |
|---|
| 55 |
import sgmllib |
|---|
| 56 |
try: |
|---|
| 57 |
from htmlentitydefs import name2codepoint |
|---|
| 58 |
except ImportError: |
|---|
| 59 |
name2codepoint = {} |
|---|
| 60 |
|
|---|
| 61 |
|
|---|
| 62 |
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') |
|---|
| 63 |
|
|---|
| 64 |
DEFAULT_OUTPUT_ENCODING = "utf-8" |
|---|
| 65 |
|
|---|
| 66 |
|
|---|
| 67 |
|
|---|
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    setup() gives every element five links: parent, previous/next
    (neighbours in document order) and previousSibling/nextSibling
    (neighbours inside the parent's 'contents' list). Several methods
    here read self.contents, which this class itself never creates;
    they only work on subclasses that provide it.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has children, this element is linked as
        the next sibling of the last one. Note that the element is not
        added to parent.contents here.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent is not None and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _identityIndex(self, siblings, element):
        """Returns the position of 'element' in the list 'siblings',
        or -1 if it is not there.

        Comparison is by identity. list.index() and list.remove()
        compare with ==, which would pick the first sibling that merely
        compares equal to the element.
        """
        for i in range(len(siblings)):
            if siblings[i] is element:
                return i
        return -1

    def replaceWith(self, replaceWith):
        """Puts 'replaceWith' where this element is in its parent, and
        removes this element from the tree.

        Raises ValueError if this element is not listed in its parent's
        contents.
        """
        oldParent = self.parent
        myIndex = self._identityIndex(oldParent.contents, self)
        if myIndex < 0:
            raise ValueError("element is not in its parent's contents")
        self.extract()
        # insert() itself compensates when the replacement is an earlier
        # sibling, so the index must not be adjusted a second time here.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            siblings = self.parent.contents
            index = self._identityIndex(siblings, self)
            # Best effort: an element missing from its parent's contents
            # is tolerated, as it always was.
            if index >= 0:
                del siblings[index]

        # Connect the two elements that would be adjacent in document
        # order if this element (and everything under it) did not exist.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into self.contents before the element
        currently at 'position', and rewires all navigation links.

        'position' is clamped to len(self.contents). A plain string is
        wrapped in a NavigableString. An element that already has a
        parent is extracted from it first.
        """
        # unicode is a subclass of basestring, so one check covers both.
        if isinstance(newChild, basestring) \
           and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if getattr(newChild, 'parent', None) is not None:
            if newChild.parent is self:
                # Moving one of our own children. If it currently sits
                # before the target slot, extracting it shifts that slot
                # down by one.
                index = self._identityIndex(self.contents, newChild)
                if index >= 0 and index < position:
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position - 1]
            newChild.previousSibling = previousChild
            previousChild.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None
            # Appending: the next element in document order is the next
            # sibling of the nearest ancestor that has one, or None at
            # the end of the document.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None:
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if nextChild is not None:
                nextChild.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # kwargs carry attribute filters and must reach the strainer.
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        r = None
        # kwargs carry attribute filters and must reach findParents.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style 'method' with a limit of 1 and
        returns its first result, or None if there is none."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        If 'name' is already a SoupStrainer it is used as the matcher
        and attrs/text/kwargs are ignored; otherwise a SoupStrainer is
        built from them. Returns a ResultSet, cut off once 'limit'
        matches have been collected.
        """
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators below end by yielding None; skip it, along
            # with any other falsy element.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    # The generators below walk one kind of link starting after this
    # element. Each one finishes by yielding a single None. They test
    # against None explicitly so that a falsy element (for example an
    # empty string) does not cut the walk short.

    def nextGenerator(self):
        "Yields the elements after this one in document order."
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        "Yields the siblings that come after this element."
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        "Yields the elements before this one, nearest first."
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        "Yields the siblings that come before this element, nearest first."
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        "Yields the ancestors of this element, nearest first."
        i = self
        while i is not None:
            i = i.parent
            yield i

    def substituteEncoding(self, str, encoding=None):
        """Returns 'str' with every %SOUP-ENCODING% placeholder replaced
        by the encoding name ("utf-8" when no encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, the result is s (stringified first if it is
        not already a string) encoded in that encoding. Without one,
        the result is s as a Unicode string.
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
| 350 |
|
|---|
class NavigableString(unicode, PageElement):
    """A Unicode string that also carries PageElement navigation."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        # Goes through __str__ so that subclass overrides are honoured.
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns this string encoded in 'encoding', or the string
        itself when 'encoding' is empty or None."""
        if not encoding:
            return self
        return self.encode(encoding)
| 370 |
|
|---|
class CData(NavigableString):
    """A string that renders itself wrapped in a CDATA marker."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Returns the NavigableString rendering inside <![CDATA[ ... ]]>."
        payload = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % payload
| 375 |
|
|---|
class ProcessingInstruction(NavigableString):
    """A string that renders itself between <? and ?>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the instruction between <? and ?>, with any
        %SOUP-ENCODING% placeholder replaced by the output encoding."""
        if "%SOUP-ENCODING%" in self:
            body = self.substituteEncoding(self, encoding)
        else:
            body = self
        return "<?%s?>" % self.toEncoding(body, encoding)
| 382 |
|
|---|
class Comment(NavigableString):
    """A string that renders itself between <!-- and -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Returns the NavigableString rendering inside <!-- ... -->."
        payload = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % payload
| 386 |
|
|---|
class Declaration(NavigableString):
    """A string that renders itself between <! and >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Returns the NavigableString rendering inside <! ... >."
        payload = NavigableString.__str__(self, encoding)
        return "<!%s>" % payload
| 390 |
|
|---|
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    # NOTE(review): "squot" and "quote" are not real entity names (the
    # XML names are "apos" and "quot"). In this class the table is only
    # indexed with "<", ">" and "&". It is left untouched because other
    # code may read it - confirm before correcting.
    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
                                      '"' : "quote",
                                      "&" : "amp",
                                      "<" : "lt",
                                      ">" : "gt" }

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        parser   -- only its class and its isSelfClosingTag(name) answer
                    are kept, not the parser object itself.
        name     -- the tag name.
        attrs    -- list of (key, value) pairs; defaults to a new list.
        parent, previous -- passed on to setup().
        """
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns whether the tag has a 'key' attribute."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "x in tag tests membership in the tag's list of contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Filter in place instead of removing while iterating, which
        # skipped the entry following each removed one. Slice assignment
        # keeps the same list object.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.fooTag and tag.foo both return the first 'foo' tag found
        beneath this tag, or None if there is none.

        Names that start with a double underscore are never treated as
        tag names and raise AttributeError like any missing attribute.
        """
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.find(tag[:-3])
        elif not tag.startswith('__'):
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') or self.name != other.name \
           or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        With prettyPrint, tags go on their own lines, indented by
        indentLevel - 1 spaces. A hidden tag renders only its contents.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # Values are wrapped in double quotes unless they
                    # contain one, in which case single quotes are used.
                    if '"' in val:
                        fmt = "%s='%s'"
                        # A value holding both kinds of quote needs its
                        # apostrophes escaped. A numeric reference is
                        # used because it is valid in HTML 4 as well as
                        # XML.
                        if "'" in val:
                            val = val.replace("'", "&#39;")

                    # Escape angle brackets, and ampersands that do not
                    # already start something that looks like an entity.
                    val = re.sub(r"([<>]|&(?![^\s]+;))",
                                 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
                                 val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Returns the pretty-printed rendering of this tag."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Older names for the same methods.
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        "Same as findAll(text=text, recursive=recursive, limit=limit)."
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        "Same as find(text=text, recursive=recursive)."
        return self.find(text=text, recursive=recursive)

    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        Only the contents list is changed; the navigation links of the
        appended object are not touched here."""
        self.contents.append(tag)

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly: getattr() would fall
        # through to __getattr__ and search for a child tag named
        # 'attrMap'.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]
        # Falling off the end finishes the generator; an explicit
        # 'raise StopIteration' is an error on newer Pythons (PEP 479).

    def recursiveChildGenerator(self):
        """Yields everything beneath this tag in document order, using
        an explicit stack rather than recursion."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Descend into this child now; remember where to
                        # resume in the current tag afterwards.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
| 698 |
|
|---|
| 699 |
|
|---|
| 700 |
class SoupStrainer: |
|---|
| 701 |
"""Encapsulates a number of ways of matching a markup element (tag or |
|---|
| 702 |
text).""" |
|---|
| 703 |
|
|---|
| 704 |
def __init__(self, name=None, attrs={}, text=None, **kwargs): |
|---|
| 705 |
self.name = name |
|---|
| 706 |
if isString(attrs): |
|---|
| 707 |
kwargs['class'] = attrs |
|---|
| 708 |
attrs = None |
|---|
| 709 |
if kwargs: |
|---|
| 710 |
if attrs: |
|---|
| 711 |
attrs = attrs.copy() |
|---|
| 712 |
attrs.update(kwargs) |
|---|
| 713 |
else: |
|---|
| 714 |
attrs = kwargs |
|---|
| 715 |
self.attrs = attrs |
|---|
| 716 |
self.text = text |
|---|
| 717 |
|
|---|
| 718 |
def __str__(self): |
|---|
| 719 |
if self.text: |
|---|
| 720 |
return self.text |
|---|
| 721 |
else: |
|---|
| 722 |
return "%s|%s" % (self.name, self.attrs) |
|---|
| 723 |
|
|---|
| 724 |
def searchTag(self, markupName=None, markupAttrs={}): |
|---|
| 725 |
found = None |
|---|
| 726 |
markup = None |
|---|
| 727 |
if isinstance(markupName, Tag): |
|---|
| 728 |
markup = markupName |
|---|
| 729 |
markupAttrs = markup |
|---|
| 730 |
callFunctionWithTagData = callable(self.name) \ |
|---|
| 731 |
and not isinstance(markupName, Tag) |
|---|
| 732 |
|
|---|
| 733 |
if (not self.name) \ |
|---|
| 734 |
or callFunctionWithTagData \ |
|---|
| 735 |
or (markup and self._matches(markup, self.name)) \ |
|---|
| 736 |
or (not markup and self._matches(markupName, self.name)): |
|---|
| 737 |
if callFunctionWithTagData: |
|---|
| 738 |
match = self.name(markupName, markupAttrs) |
|---|
| 739 |
else: |
|---|
| 740 |
match = True |
|---|
| 741 |
markupAttrMap = None |
|---|
| 742 |
for attr, matchAgainst in self.attrs.items(): |
|---|
| 743 |
if not markupAttrMap: |
|---|
| 744 |
if hasattr(markupAttrs, 'get'): |
|---|
| 745 |
markupAttrMap = markupAttrs |
|---|
| 746 |
else: |
|---|
| 747 |
markupAttrMap = {} |
|---|
| 748 |
for k,v in markupAttrs: |
|---|
| 749 |
markupAttrMap[k] = v |
|---|
| 750 |
attrValue = markupAttrMap.get(attr) |
|---|
| 751 |
if not self._matches(attrValue, matchAgainst): |
|---|
| 752 |
match = False |
|---|
| 753 |
break |
|---|
| 754 |
if match: |
|---|
| 755 |
if markup: |
|---|
| 756 |
found = markup |
|---|
| 757 |
else: |
|---|
| 758 |
found = markupName |
|---|
| 759 |
return found |
|---|
| 760 |
|
|---|
| 761 |
def search(self, markup): |
|---|
| 762 |
|
|---|
| 763 |
found = None |
|---|
| 764 |
|
|---|
| 765 |
|
|---|
| 766 |
if isList(markup) and not isinstance(markup, Tag): |
|---|
| 767 |
for element in markup: |
|---|
| 768 |
if isinstance(element, NavigableString) \ |
|---|
| 769 |
and self.search(element): |
|---|
| 770 |
found = element |
|---|
| 771 |
break |
|---|
| 772 |
|
|---|
| 773 |
|
|---|
| 774 |
elif isinstance(markup, Tag): |
|---|
| 775 |
if not self.text: |
|---|
| 776 |
found = self.searchTag(markup) |
|---|
| 777 |
|
|---|
| 778 |
elif isinstance(markup, NavigableString) or \ |
|---|
| 779 |
isString(markup): |
|---|
| 780 |
if self._matches(markup, self.text): |
|---|
| 781 |
found = markup |
|---|
| 782 |
else: |
|---|
| 783 |
raise Exception, "I don't know how to match against a %s" \ |
|---|
| 784 |
% markup.__class__ |
|---|
| 785 |
|
|---|