|
|
from __future__ import annotations |
|
|
|
|
|
|
|
|
__license__ = "MIT" |
|
|
|
|
|
import re |
|
|
import warnings |
|
|
|
|
|
from bs4.css import CSS |
|
|
from bs4._deprecation import ( |
|
|
_deprecated, |
|
|
_deprecated_alias, |
|
|
_deprecated_function_alias, |
|
|
) |
|
|
from bs4.formatter import ( |
|
|
Formatter, |
|
|
HTMLFormatter, |
|
|
XMLFormatter, |
|
|
) |
|
|
from bs4._warnings import AttributeResemblesVariableWarning |
|
|
|
|
|
from typing import ( |
|
|
Any, |
|
|
Callable, |
|
|
Dict, |
|
|
Generic, |
|
|
Iterable, |
|
|
Iterator, |
|
|
List, |
|
|
Mapping, |
|
|
Optional, |
|
|
Pattern, |
|
|
Set, |
|
|
TYPE_CHECKING, |
|
|
Tuple, |
|
|
Type, |
|
|
TypeVar, |
|
|
Union, |
|
|
cast, |
|
|
) |
|
|
from typing_extensions import ( |
|
|
Self, |
|
|
TypeAlias, |
|
|
) |
|
|
|
|
|
if TYPE_CHECKING: |
|
|
from bs4 import BeautifulSoup |
|
|
from bs4.builder import TreeBuilder |
|
|
from bs4.filter import ElementFilter |
|
|
from bs4.formatter import ( |
|
|
_EntitySubstitutionFunction, |
|
|
_FormatterOrName, |
|
|
) |
|
|
from bs4._typing import ( |
|
|
_AtMostOneElement, |
|
|
_AttributeValue, |
|
|
_AttributeValues, |
|
|
_Encoding, |
|
|
_InsertableElement, |
|
|
_OneElement, |
|
|
_QueryResults, |
|
|
_RawOrProcessedAttributeValues, |
|
|
_StrainableElement, |
|
|
_StrainableAttribute, |
|
|
_StrainableAttributes, |
|
|
_StrainableString, |
|
|
) |
|
|
|
|
|
# Type of an argument that selects which kinds of strings to consider:
# either a single NavigableString subclass or an iterable of them.
_OneOrMoreStringTypes: TypeAlias = Union[
    Type["NavigableString"], Iterable[Type["NavigableString"]]
]

# Type of the ``name`` argument accepted by the find_* methods in this
# module: optional, and given either as a strainable element or as an
# ElementFilter.
_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]]
|
|
|
|
|
|
|
|
|
|
|
# Module attributes that can still be looked up, but only with a
# DeprecationWarning. Keys are the public attribute names; values are
# warning templates in which ``{name}`` is replaced by the attribute
# name. The object actually returned for ``<name>`` is the module global
# called ``_deprecated_<name>``.
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy.",
}

# Backing object for the deprecated ``whitespace_re`` module attribute:
# matches a run of one or more whitespace characters.
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")
|
|
|
|
|
|
|
|
def __getattr__(name: str) -> Any:
    """Resolve deprecated module-level attributes on demand.

    Python calls this hook only when normal lookup of ``name`` on the
    module fails.

    :param name: The attribute name being looked up on this module.
    :return: The module global named ``_deprecated_<name>``, after a
        `DeprecationWarning` has been issued.
    :raises AttributeError: If ``name`` is not a key of
        ``_deprecated_names``.
    """
    if name not in _deprecated_names:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    template = _deprecated_names[name]
    # stacklevel=2 attributes the warning to the code that accessed the
    # attribute rather than to this function.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Name of the default output encoding.
# NOTE(review): presumably the encoding used when a tree is encoded to
# bytes without an explicit encoding -- confirm against the callers.
DEFAULT_OUTPUT_ENCODING: str = "utf-8"

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Encoding names that receive special treatment when a <meta> tag is
# rewritten: the substitute_encoding() implementations in this module
# emit an empty charset for these names instead of the name itself.
# Both the underscore and the hyphen spellings are listed where both
# appear in the original set.
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
|
|
|
|
|
|
|
|
class NamespacedAttribute(str):
    """A string naming an attribute that may carry a namespace prefix.

    For ``xml:lang="en"`` the string value is ``'xml:lang'``, while the
    pieces it was built from stay available as `prefix` (``'xml'``),
    `name` (``'lang'``) and `namespace`.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the attribute name from its parts.

        :param prefix: The namespace prefix; may be empty or None.
        :param name: The local name. Any falsy value (such as the empty
            string) is stored as None.
        :param namespace: The namespace, stored unchanged.
        :return: A string equal to ``prefix`` when there is no name, to
            ``name`` when there is no prefix, and to ``prefix:name``
            otherwise.
        """
        # Normalize "no name" (None, "") to None so it is stored uniformly.
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # A name without a prefix is used as-is.
            text = name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
|
|
|
|
|
|
|
|
class AttributeValueWithCharsetSubstitution(str):
    """Abstract base for an attribute value that names a character
    encoding inside an HTML ``<meta>`` tag.

    There is one subclass per place such an encoding can appear: the
    ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This lets Beautiful Soup replace that part of the HTML file with a
    different encoding when outputting a tree as a string.
    """

    # The string this value was created from; the subclasses assign it
    # in __new__.
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this attribute value rewritten to mention
        ``eventual_encoding``.

        Each subclass does whatever is necessary for its own portion of
        an HTML document.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :raises NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError()
|
|
|
|
|
|
|
|
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``charset``
    attribute.

    When Beautiful Soup parses ``<meta charset="utf8">``, the value of
    the ``charset`` attribute becomes one of these objects, so that a
    document later encoded to a different encoding can mention the new
    encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value and remember the string it was built from.

        :param original_value: The attribute value as found in the markup.
        """
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the charset name to write when the document is encoded
        as ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: ``eventual_encoding`` itself, or the empty string when
            it is listed in `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return (
            ""
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS
            else eventual_encoding
        )
|
|
|
|
|
|
|
|
class AttributeValueList(List[str]):
    """The list type used for attributes that hold several values at
    once (HTML's 'class', for example).

    It adds nothing to a regular list. It exists so that you can
    subclass it and pass your subclass to the TreeBuilder constructor
    as attribute_value_list_class, to have your subclass instantiated
    instead.
    """
|
|
|
|
|
|
|
|
class AttributeDict(Dict[Any, Any]):
    """Base class of the dictionaries that hold a tag's attributes.

    It can be used directly, but it is only a regular dict with no
    special logic of its own.
    """
|
|
|
|
|
|
|
|
class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Store an attribute value, converting common non-string values
        to strings first.

        XML attributes may have "any literal string as a value", so:

        * None is stored as the empty string.
        * bool values are stored unchanged.
        * int and float values are stored as ``str(value)``.
        * Anything else is stored unchanged.

        :param key: The attribute name.
        :param value: The value to store.
        """
        if value is None:
            stored = ""
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # bool is a subclass of int, so it has to be excluded
            # explicitly; only genuine numbers are stringified, and any
            # other (possibly string-like) object passes through.
            stored = str(value)
        else:
            stored = value

        super().__setitem__(key, stored)
|
|
|
|
|
|
|
|
class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply with
        the HTML spec.

        * ``False`` and ``None`` remove the attribute (if present) and
          store nothing: a false boolean attribute is represented by
          leaving the attribute out.
        * ``True`` stores the attribute's own name as its value (the
          unprefixed name for a `NamespacedAttribute` key).
        * int and float values, including ``0`` and ``0.0``, are stored
          as ``str(value)``.
        * Anything else is stored unchanged.

        :param key: The attribute name.
        :param value: The value to store.
        """
        # Identity checks on purpose: ``value in (False, None)`` compares
        # with ``==``, and ``0 == False`` is true in Python, so numeric
        # zero used to be mistaken for "remove this attribute" instead of
        # being stored as "0".
        if value is False or value is None:
            if key in self:
                del self[key]
            return

        if isinstance(value, bool):
            # Only True can reach this point. A present boolean attribute
            # is given its own name as its value.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # Only numbers are stringified; other objects may be
            # string-like values that must be kept as they are.
            value = str(value)
        super().__setitem__(key, value)
|
|
|
|
|
|
|
|
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    the value of the ``content`` attribute becomes one of these objects,
    so that a document later encoded to a different encoding can mention
    the new encoding instead of ``utf8``.
    """

    # Finds a ``charset=...`` parameter at the start of a line or after a
    # semicolon. Group 1 is everything up to and including ``charset=``;
    # group 3 is the encoding name (everything up to the next semicolon).
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value and remember the string it was built from.

        :param original_value: The attribute value as found in the markup.
        """
        # NOTE(review): the match result is discarded, so for a str
        # argument this call has no visible effect. Kept to preserve
        # behavior exactly.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the ``content`` value to write when the document is
        encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: The original value with the encoding name after
            ``charset=`` replaced by ``eventual_encoding``. When
            ``eventual_encoding`` is in `PYTHON_SPECIFIC_ENCODINGS`, the
            whole ``charset=...`` parameter is removed instead.
        """
        pattern = self.CHARSET_RE
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return pattern.sub("", self.original_value)

        # Keep the "charset=" prefix (group 1), swap in the new name.
        return pattern.sub(
            lambda match: match.group(1) + eventual_encoding,
            self.original_value,
        )
|
|
|
|
|
|
|
|
class PageElement(object): |
|
|
"""An abstract class representing a single element in the parse tree. |
|
|
|
|
|
`NavigableString`, `Tag`, etc. are all subclasses of |
|
|
`PageElement`. For this reason you'll see a lot of methods that |
|
|
return `PageElement`, but you'll never see an actual `PageElement` |
|
|
object. For the most part you can think of `PageElement` as |
|
|
meaning "a `Tag` or a `NavigableString`." |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Whether this element is known to belong to an XML document. When it is
# not None, the _is_xml property returns it directly; None means "not
# known", in which case _is_xml consults the parent chain.
known_xml: Optional[bool] = None

# Set to True by decompose() on every element it wipes.
_decomposed: bool

# Links to neighboring objects in the parse tree. All five are assigned
# by setup() and cleared by extract().
parent: Optional[Tag]
next_element: _AtMostOneElement
previous_element: _AtMostOneElement
next_sibling: _AtMostOneElement
previous_sibling: _AtMostOneElement

# NOTE(review): presumably marks an element that is left out of rendered
# output; nothing in this part of the file reads it -- confirm.
hidden: bool = False
|
|
|
|
|
def setup( |
|
|
self, |
|
|
parent: Optional[Tag] = None, |
|
|
previous_element: _AtMostOneElement = None, |
|
|
next_element: _AtMostOneElement = None, |
|
|
previous_sibling: _AtMostOneElement = None, |
|
|
next_sibling: _AtMostOneElement = None, |
|
|
) -> None: |
|
|
"""Sets up the initial relations between this element and |
|
|
other elements. |
|
|
|
|
|
:param parent: The parent of this element. |
|
|
|
|
|
:param previous_element: The element parsed immediately before |
|
|
this one. |
|
|
|
|
|
:param next_element: The element parsed immediately after |
|
|
this one. |
|
|
|
|
|
:param previous_sibling: The most recently encountered element |
|
|
on the same level of the parse tree as this one. |
|
|
|
|
|
:param previous_sibling: The next element to be encountered |
|
|
on the same level of the parse tree as this one. |
|
|
""" |
|
|
self.parent = parent |
|
|
|
|
|
self.previous_element = previous_element |
|
|
if self.previous_element is not None: |
|
|
self.previous_element.next_element = self |
|
|
|
|
|
self.next_element = next_element |
|
|
if self.next_element is not None: |
|
|
self.next_element.previous_element = self |
|
|
|
|
|
self.next_sibling = next_sibling |
|
|
if self.next_sibling is not None: |
|
|
self.next_sibling.previous_sibling = self |
|
|
|
|
|
if ( |
|
|
previous_sibling is None |
|
|
and self.parent is not None |
|
|
and self.parent.contents |
|
|
): |
|
|
previous_sibling = self.parent.contents[-1] |
|
|
|
|
|
self.previous_sibling = previous_sibling |
|
|
if self.previous_sibling is not None: |
|
|
self.previous_sibling.next_sibling = self |
|
|
|
|
|
def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. None means "no formatting".
    :return: ``s`` unchanged when ``formatter`` is None; otherwise the
        result of the formatter's ``substitute`` method.
    """
    if formatter is None:
        return s

    # Anything that is not already a Formatter is resolved to one.
    resolved = (
        formatter
        if isinstance(formatter, Formatter)
        else self.formatter_for_name(formatter)
    )
    return resolved.substitute(s)
|
|
|
|
|
def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Look up or create a Formatter for the given identifier.

    Whether the XML or the HTML family of formatters is used depends on
    this element's ``_is_xml`` property.

    :param formatter_name: Can be a `Formatter` object (returned
        as-is), a function (used as the entity substitution hook for a
        new `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look up
        a formatter in the ``REGISTRY`` of the appropriate class).
    :return: A `Formatter`.
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    formatter_class: type[Formatter] = (
        XMLFormatter if self._is_xml else HTMLFormatter
    )

    if callable(formatter_name):
        # A bare function becomes the entity-substitution hook of a
        # brand-new formatter.
        return formatter_class(entity_substitution=formatter_name)

    # Otherwise the identifier is a key into the class's registry.
    return formatter_class.REGISTRY[formatter_name]
|
|
|
|
|
@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    formatter_for_name reads this to choose between XMLFormatter and
    HTMLFormatter. The answer may require walking up the whole chain
    of parents.

    :return: ``known_xml`` when it is not None. Otherwise the parent's
        answer, or, for an element without a parent, the element's own
        ``is_xml`` attribute (False if it has none).
    """
    known = self.known_xml
    if known is not None:
        return known

    container = self.parent
    if container is not None:
        # Not recorded on this element: defer to the parent.
        return container._is_xml

    # Top of the tree: use its is_xml attribute, defaulting to HTML.
    return getattr(self, "is_xml", False)
|
|
|
|
|
# Deprecated camelCase aliases for the next_sibling and previous_sibling
# attributes.
# NOTE(review): the third argument is presumably the version in which
# the alias was deprecated -- confirm against bs4._deprecation.
nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
|
|
|
|
|
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Deep-copy this element. Not implemented at this level.

    :param memo: The memo dictionary of the copy protocol.
    :param recursive: Unused here.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()
|
|
|
|
|
def __copy__(self) -> Self:
    """Return a copy of this element.

    A copy of a PageElement can only be a deep copy, because only one
    PageElement can occupy a given place in a parse tree; so this simply
    delegates to ``__deepcopy__`` with a fresh, empty memo.
    """
    memo: Dict[Any, Any] = {}
    return self.__deepcopy__(memo)
|
|
|
|
|
# Empty tuple used as the default value of the ``types`` argument of
# _all_strings() and get_text().
default: Iterable[type[NavigableString]] = ()
|
|
|
|
|
def _all_strings(
    self, strip: bool = False, types: Iterable[type[NavigableString]] = default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    Not implemented at this level; `Tag` and `NavigableString` each
    provide their own implementation.

    :param strip: Whether the strings should be stripped first.
    :param types: The string classes to consider.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()
|
|
|
|
|
@property
def stripped_strings(self) -> Iterator[str]:
    """Yield all interesting strings in this PageElement, stripping them
    first.

    This is ``_all_strings`` with stripping switched on. See `Tag` for
    information on which strings are considered interesting in a given
    context.
    """
    yield from self._all_strings(True)
|
|
|
|
|
def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Return every child string of this PageElement, joined into one
    string.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
|
|
|
|
|
# camelCase alias: the same function object as get_text.
getText = get_text
# Read-only property that calls get_text() with its default arguments.
text = property(get_text)
|
|
|
|
|
def replace_with(self, *args: PageElement) -> Self:
    """Put one or more other `PageElement` objects where this one is,
    leaving the rest of the tree untouched.

    :param args: The replacement elements, inserted in the given order
        starting at this element's old position.
    :return: This `PageElement`, no longer part of the tree.
    :raises ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )

    # Replacing an element with just itself changes nothing.
    if len(args) == 1 and args[0] is self:
        return self

    for candidate in args:
        if candidate is container:
            raise ValueError("Cannot replace a Tag with its parent.")

    position = container.index(self)
    # The index is already known, so hand it to extract().
    self.extract(_self_index=position)
    for offset, replacement in enumerate(args):
        container.insert(position + offset, replacement)
    return self
|
|
|
|
|
# Deprecated camelCase alias for replace_with().
# NOTE(review): "4.0.0" is presumably the version in which the alias was
# deprecated -- confirm against bs4._deprecation.
replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
|
|
|
|
|
def wrap(self, wrap_inside: Tag) -> Tag:
    """Wrap this `PageElement` inside a `Tag`.

    ``wrap_inside`` first replaces this element in the tree, and this
    element is then appended to ``wrap_inside``.

    :param wrap_inside: The tag that will contain this element.
    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside
|
|
|
|
|
def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rips this element out of the tree.

    The element keeps its own descendants; only its links to the
    surrounding tree are cut.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    if self.parent is not None:
        # Look the index up only when the caller did not supply it.
        if _self_index is None:
            _self_index = self.parent.index(self)
        del self.parent.contents[_self_index]

    # Find the last element of the run being removed (this element plus
    # everything beneath it), so that the element before the run can be
    # joined to the element after it.
    last_child = self._last_descendant()

    # _last_descendant() is called with its default accept_self=True, so
    # it does not return None here; the cast only narrows the type.
    last_child = cast(PageElement, last_child)
    next_element = last_child.next_element

    # Connect the parse-order neighbors on either side of the removed run.
    if self.previous_element is not None:
        if self.previous_element is not next_element:
            self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    # Cut the removed run's own outward parse-order links.
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Connect the former siblings to each other.
    if (
        self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling
    ):
        self.previous_sibling.next_sibling = self.next_sibling
    if (
        self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling
    ):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
|
|
|
|
|
def decompose(self) -> None:
    """Recursively destroys this `PageElement` and its children.

    The element is first removed from the tree; then it and every
    element reachable from it through ``next_element`` are wiped out.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()

    current: _AtMostOneElement = self
    while current is not None:
        # Remember the successor before the links are erased.
        following: _AtMostOneElement = current.next_element
        current.__dict__.clear()
        if isinstance(current, Tag):
            current.contents = []
        current._decomposed = True
        current = following
|
|
|
|
|
def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Finds the last element beneath this object to be parsed.

    A note for type checking: although the return type is
    _AtMostOneElement rather than PageElement, the only time this
    method returns None is if `accept_self` is False and the
    `PageElement` has no children--either it's a NavigableString
    or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
        this `PageElement` yet?

    :param accept_self: Is ``self`` an acceptable answer to the
        question?
    """
    following_sibling = self.next_sibling if is_initialized else None

    candidate: _AtMostOneElement
    if following_sibling is not None:
        # Shortcut: take the element parsed right before the next sibling.
        candidate = following_sibling.previous_element
    else:
        # Otherwise keep descending into the last child.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]

    if candidate is self and not accept_self:
        return None
    return candidate
|
|
|
|
|
# Deprecated camelCase alias for _last_descendant.
# NOTE(review): "4.0.0" is presumably the version in which the alias was
# deprecated -- confirm against bs4._deprecation.
_lastRecursiveChild = _deprecated_alias(
    "_lastRecursiveChild", "_last_descendant", "4.0.0"
)
|
|
|
|
|
def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate predecessor of this one.

    The inserted elements end up with the same `PageElement.parent` as
    this one, directly in front of it and in the order given.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself among ``args``.
    """
    container = self.parent
    if container is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for element in args:
        # Detach the element first; this element's index is looked up
        # afterwards, so it reflects the tree without the moved element.
        if isinstance(element, PageElement):
            element.extract()
        position = container.index(self)
        inserted.extend(container.insert(position, element))

    return inserted
|
|
|
|
|
def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate successor of this one.

    The inserted elements end up with the same `PageElement.parent` as
    this one, directly behind it and in the order given.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself among ``args``.
    """
    container = self.parent
    if container is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    inserted: List[PageElement] = []
    # ``offset`` counts the successors already processed, so each new
    # one lands behind the previous ones.
    for offset, element in enumerate(args):
        # Detach the element first; this element's index is looked up
        # afterwards, so it reflects the tree without the moved element.
        if isinstance(element, PageElement):
            element.extract()
        position = container.index(self)
        inserted.extend(container.insert(position + 1 + offset, element))

    return inserted
|
|
|
|
|
def find_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the first PageElement matching the given criteria among
    those that appear later in the document than this one.

    This is the single-result form of `find_all_next`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, string, **kwargs)
|
|
|
|
|
# Deprecated camelCase alias for find_next().
# NOTE(review): "4.0.0" is presumably the version in which the alias was
# deprecated -- confirm against bs4._deprecation.
findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
|
|
|
|
|
def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every `PageElement` matching the given criteria among
    those that appear later in the document than this one.

    The search runs over ``self.next_elements``.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    later_elements = self.next_elements
    # One more stack level, to account for this method's own frame.
    return self._find_all(
        name, attrs, string, limit, later_elements,
        _stacklevel=_stacklevel + 1, **kwargs,
    )
|
|
|
|
|
# Deprecated camelCase alias for find_all_next().
# NOTE(review): "4.0.0" is presumably the version in which the alias was
# deprecated -- confirm against bs4._deprecation.
findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
|
|
|
|
|
def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the closest later sibling of this PageElement that
    matches the given criteria.

    This is the single-result form of `find_next_siblings`.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)
|
|
|
|
|
# Deprecated camelCase alias for find_next_sibling().
# NOTE(review): "4.0.0" is presumably the version in which the alias was
# deprecated -- confirm against bs4._deprecation.
findNextSibling = _deprecated_function_alias(
    "findNextSibling", "find_next_sibling", "4.0.0"
)
|
|
|
|
|
def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every later sibling of this `PageElement` that matches
    the given criteria.

    The search runs over ``self.next_siblings``.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    later_siblings = self.next_siblings
    # One more stack level, to account for this method's own frame.
    return self._find_all(
        name, attrs, string, limit, later_siblings,
        _stacklevel=_stacklevel + 1, **kwargs,
    )
|
|
|
|
|
# Deprecated aliases for find_next_siblings(): the camelCase spelling
# and the older "fetch" spelling.
# NOTE(review): the version strings are presumably the versions in which
# each alias was deprecated -- confirm against bs4._deprecation.
findNextSiblings = _deprecated_function_alias(
    "findNextSiblings", "find_next_siblings", "4.0.0"
)
fetchNextSiblings = _deprecated_function_alias(
    "fetchNextSiblings", "find_next_siblings", "3.0.0"
)
|
|
|
|
|
def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Search backwards through the document from this `PageElement`
    and return the first `PageElement` that matches the given criteria.

    This is the single-result form of `find_all_previous`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, string, **kwargs)
|
|
|
|
|
# Deprecated camelCase alias for find_previous().
# NOTE(review): this alias passes "3.0.0" whereas the other camelCase
# find* aliases in this class pass "4.0.0" -- confirm which is intended.
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
|
|
|
|
|
def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Search backwards through the document from this `PageElement`
    and return every `PageElement` that matches the given criteria.

    The search runs over ``self.previous_elements``.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_elements = self.previous_elements
    # One more stack level, to account for this method's own frame.
    return self._find_all(
        name, attrs, string, limit, earlier_elements,
        _stacklevel=_stacklevel + 1, **kwargs,
    )
|
|
|
|
|
# Deprecated aliases for find_all_previous(): the camelCase spelling and
# the older "fetch" spelling.
# NOTE(review): the version strings are presumably the versions in which
# each alias was deprecated -- confirm against bs4._deprecation.
findAllPrevious = _deprecated_function_alias(
    "findAllPrevious", "find_all_previous", "4.0.0"
)
fetchAllPrevious = _deprecated_function_alias(
    "fetchAllPrevious", "find_all_previous", "3.0.0"
)
|
|
|
|
|
def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the closest earlier sibling of this `PageElement` that
    matches the given criteria.

    This is the single-result form of `find_previous_siblings`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)
|
|
|
|
|
# Deprecated camelCase alias for find_previous_sibling().
# NOTE(review): "4.0.0" is presumably the version in which the alias was
# deprecated -- confirm against bs4._deprecation.
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling", "find_previous_sibling", "4.0.0"
)
|
|
|
|
|
def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every earlier sibling of this PageElement that matches
    the given criteria.

    The search runs over ``self.previous_siblings``.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_siblings = self.previous_siblings
    # One more stack level, to account for this method's own frame.
    return self._find_all(
        name, attrs, string, limit, earlier_siblings,
        _stacklevel=_stacklevel + 1, **kwargs,
    )
|
|
|
|
|
# Deprecated aliases for find_previous_siblings(): the camelCase
# spelling and the older "fetch" spelling.
# NOTE(review): the version strings are presumably the versions in which
# each alias was deprecated -- confirm against bs4._deprecation.
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings", "find_previous_siblings", "4.0.0"
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
)
|
|
|
|
|
def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    :return: The first result of `find_parents` with a limit of 1, or
        None when there is no result.
    """
    # find_parents has no `string` parameter: its positional arguments are
    # (name, attrs, limit), so the literal 1 below is the limit.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
|
|
|
|
|
# Deprecated camelCase spelling of `find_parent`, registered through
# `_deprecated_function_alias` with the version string "4.0.0".
findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")
|
|
|
|
|
def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect every parent of this `PageElement` that matches the given
    criteria.

    Candidates come from `parents`; the matching itself is delegated to
    `_find_all`, always with no string filter.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # The third positional argument of _find_all is the string filter;
    # this method never supplies one.
    no_string_filter = None
    return self._find_all(
        name,
        attrs,
        no_string_filter,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
|
|
|
|
|
# Two older spellings of `find_parents`, both registered through
# `_deprecated_function_alias`; the last argument is the version string
# passed to the helper.
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
|
|
|
|
|
@property
def next(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just after this one.

    This is a read-only view of `next_element`.
    """
    following = self.next_element
    return following
|
|
|
|
|
@property
def previous(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just before this one.

    This is a read-only view of `previous_element`.
    """
    preceding = self.previous_element
    return preceding
|
|
|
|
|
|
|
|
|
|
|
def _find_one(
    self,
    method: Callable,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Run a find-all style `method` with a limit of 1 and unwrap the result.

    :param method: A callable accepting
        ``(name, attrs, string, limit, _stacklevel=..., **kwargs)``.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first element of what `method` returns, or None when
        that result is empty.
    """
    matches: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    if not matches:
        return None
    return matches[0]
|
|
|
|
|
def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Iterates over a generator looking for things that match.

    :param name: A filter on tag name, or an `ElementFilter` to use as the
        matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The candidate elements to examine.
    :param _stacklevel: Passed as ``stacklevel`` to the warnings issued here.
    :kwargs: Additional filters on attribute values.
    """

    # Backwards compatibility: accept the old 'text' keyword as a stand-in
    # for 'string' (only when 'string' was not given), and warn about it.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # Warn about a '_class' keyword, suggesting 'class_'. The keyword is
    # left in kwargs; only a warning is emitted.
    if "_class" in kwargs:
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE
            % dict(
                original="_class",
                autocorrect="class_",
            ),
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    # Imported here rather than at module level (the module-level import
    # is under TYPE_CHECKING only).
    from bs4.filter import ElementFilter

    # A ready-made ElementFilter is used as-is; anything else is wrapped
    # in a SoupStrainer built from the filter arguments.
    if isinstance(name, ElementFilter):
        matcher = name
    else:
        matcher = SoupStrainer(name, attrs, string, **kwargs)

    result: Iterable[_OneElement]
    # Shortcuts taken only when the tag name is the sole criterion.
    # Note that `not limit` treats a limit of 0 like "no limit".
    if string is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Every Tag matches: filter lazily by type without consulting
            # the matcher.
            result = (element for element in generator if isinstance(element, Tag))
            return ResultSet(matcher, result)
        elif isinstance(name, str):
            # Match by tag name only.
            if name.count(":") == 1:
                # "prefix:local" form: a tag matches either on its full
                # name or on local name plus prefix (see the test below).
                prefix, local_name = name.split(":", 1)
            else:
                prefix = None
                local_name = name
            result = []
            for element in generator:
                if not isinstance(element, Tag):
                    continue
                if element.name == name or (
                    element.name == local_name
                    and (prefix is None or element.prefix == prefix)
                ):
                    result.append(element)
            return ResultSet(matcher, result)
    # General case: let the matcher walk the candidates.
    return matcher.find_all(generator, limit)
|
|
|
|
|
|
|
|
|
|
|
@property
def next_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed after this one.

    :yield: Each element reached by following `next_element` links,
        starting with this element's `next_element`.
    """
    node = self.next_element
    while True:
        if node is None:
            return
        # Read the follower before yielding, so the walk continues from
        # the link as it was before the consumer saw `node`.
        following = node.next_element
        yield node
        node = following
|
|
|
|
|
@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all PageElements that were parsed after it.

    Built by passing `next_elements` to `_self_and`.
    """
    later_elements = self.next_elements
    return self._self_and(later_elements)
|
|
|
|
|
@property
def next_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: Each element reached by following `next_sibling` links,
        starting with this element's `next_sibling`.
    """
    sibling = self.next_sibling
    while True:
        if sibling is None:
            return
        # Read the follower before yielding, so the walk continues from
        # the link as it was before the consumer saw `sibling`.
        upcoming = sibling.next_sibling
        yield sibling
        sibling = upcoming
|
|
|
|
|
@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that come after it.

    Built by passing `next_siblings` to `_self_and`.
    """
    later_siblings = self.next_siblings
    return self._self_and(later_siblings)
|
|
|
|
|
@property
def previous_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed before this one.

    :yield: A sequence of PageElements, reached by following
        `previous_element` links backwards from this element.
    """
    node = self.previous_element
    while True:
        if node is None:
            return
        # Read the predecessor before yielding, so the walk continues from
        # the link as it was before the consumer saw `node`.
        preceding = node.previous_element
        yield node
        node = preceding
|
|
|
|
|
@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all elements that were parsed
    earlier.

    Built by passing `previous_elements` to `_self_and`.
    """
    earlier_elements = self.previous_elements
    return self._self_and(earlier_elements)
|
|
|
|
|
@property
def previous_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    earlier.

    :yield: A sequence of PageElements, reached by following
        `previous_sibling` links backwards from this element.
    """
    sibling = self.previous_sibling
    while True:
        if sibling is None:
            return
        # Read the predecessor before yielding, so the walk continues from
        # the link as it was before the consumer saw `sibling`.
        preceding = sibling.previous_sibling
        yield sibling
        sibling = preceding
|
|
|
|
|
@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed
    earlier.

    Built by passing `previous_siblings` to `_self_and`.
    """
    earlier_siblings = self.previous_siblings
    return self._self_and(earlier_siblings)
|
|
|
|
|
@property
def parents(self) -> Iterator[Tag]:
    """All elements that are parents of this PageElement.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
        Each one is reached by following `parent` links upwards.
    """
    ancestor = self.parent
    while True:
        if ancestor is None:
            return
        # Read the next ancestor before yielding, so the walk continues
        # from the link as it was before the consumer saw `ancestor`.
        higher = ancestor.parent
        yield ancestor
        ancestor = higher
|
|
|
|
|
@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element, then all of its parents.

    Built by passing `parents` to `_self_and`.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)
|
|
|
|
|
def _self_and(self, other_generator:Iterator[PageElement]) -> Iterator[PageElement]:
    """Prefix another generator's output with this element.

    This element is yielded first unless its `hidden` attribute is
    truthy; after that, everything from `other_generator` is passed
    through unchanged.
    """
    include_self = not self.hidden
    if include_self:
        yield self
    for element in other_generator:
        yield element
|
|
|
|
|
@property
def decomposed(self) -> bool:
    """Check whether a PageElement has been decomposed.

    Reads the `_decomposed` attribute. A missing or falsy value is
    reported as False; a truthy value is returned as-is.
    """
    flag = getattr(self, "_decomposed", False)
    if flag:
        return flag
    return False
|
|
|
|
|
@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `next_elements`.

    :meta private:
    """
    elements = self.next_elements
    return elements
|
|
|
|
|
@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `next_siblings`.

    :meta private:
    """
    siblings = self.next_siblings
    return siblings
|
|
|
|
|
@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `previous_elements`.

    :meta private:
    """
    elements = self.previous_elements
    return elements
|
|
|
|
|
@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `previous_siblings`.

    :meta private:
    """
    siblings = self.previous_siblings
    return siblings
|
|
|
|
|
@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `parents`.

    :meta private:
    """
    ancestors = self.parents
    return ancestors
|
|
|
|
|
|
|
|
class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    # Text that `output_ready` puts in front of the string. Empty for a
    # plain NavigableString.
    PREFIX: str = ""

    # Text that `output_ready` puts after the string. Empty for a plain
    # NavigableString.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text, either as a `str` or as bytes to be
            decoded with DEFAULT_OUTPUT_ENCODING.
        """
        # Only a non-str value needs the extra encoding argument.
        decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Copy this string into a new object of the same class.

        The copy has the same contents and class as the original, but it
        is not connected to the parse tree.

        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        own_class = type(self)
        return own_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the constructor argument: this string as a plain `str`."""
        plain = str(self)
        return (plain,)

    def __getitem__(self, key: Union[int|slice]) -> str:
        """Index or slice like a normal string, rejecting string keys.

        :raise TypeError: If `key` is a string, which suggests the caller
            is treating this object like a Tag.
        """
        if isinstance(key, str):
            message = "string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?"
            raise TypeError(message.format(key.__class__.__name__))
        return super().__getitem__(key)

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Run the string through the provided formatter, making it
        ready for output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        :return: The formatted text, wrapped in PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A tuple of NavigableString subclasses. If this
            NavigableString isn't one of those subclasses, the
            sequence will be empty. By default, the subclasses
            considered are NavigableString and CData objects. That
            means no comments, processing instructions, etc.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # The default set of types is stored on Tag.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # The comparison is on the exact type (identity or membership),
        # not isinstance(), so a subclass of a wanted type does not match.
        own_type = type(self)
        if types is not None:
            if isinstance(types, type):
                # A single wanted type.
                wanted = own_type is types
            else:
                # A collection of wanted types.
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
|
|
|
|
|
|
|
|
class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    # Text written before and after the string by `output_ready`.
    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
           suffix added on.
        """
        formatter_given = formatter is not None
        if formatter_given:
            # Called for its side effects only; the result is discarded
            # and the raw text is what gets written out.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
|
|
|
|
|
|
|
|
class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    Written out between ``<![CDATA[`` and ``]]>``.
    """

    # Delimiters added around the text on output.
    PREFIX: str = "<![CDATA["
    SUFFIX: str = "]]>"
|
|
|
|
|
|
|
|
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    Written out between ``<?`` and ``>``.
    """

    # Delimiters added around the text on output.
    PREFIX: str = "<?"
    SUFFIX: str = ">"
|
|
|
|
|
|
|
|
class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Written out between ``<?`` and ``?>``.
    """

    # Delimiters added around the text on output; only the suffix differs
    # from ProcessingInstruction.
    PREFIX: str = "<?"
    SUFFIX: str = "?>"
|
|
|
|
|
|
|
|
class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    Written out between ``<!--`` and ``-->``.
    """

    # Delimiters added around the text on output.
    PREFIX: str = "<!--"
    SUFFIX: str = "-->"
|
|
|
|
|
|
|
|
class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    Written out between ``<?`` and ``?>``.
    """

    # Delimiters added around the text on output.
    PREFIX: str = "<?"
    SUFFIX: str = "?>"
|
|
|
|
|
|
|
|
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_.

    Written out between ``<!DOCTYPE `` and ``>`` followed by a newline.
    """

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: An instance of the class this method was called on.
        """
        # Instantiate `cls` rather than a hard-coded Doctype, so that a
        # subclass calling this alternate constructor gets its own type.
        return cls(cls._string_for_name_and_ids(name, pub_id, system_id))

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :param name: The root element name; a falsy value is treated as "".
        :param pub_id: Public identifier, or None to omit it.
        :param system_id: System identifier, or None to omit it.
        :return: The name, followed by ``PUBLIC "<pub_id>"`` (plus the
            quoted system ID if given), or by ``SYSTEM "<system_id>"``
            when only a system ID is given.
        """
        value = name or ""
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id
        return value

    # Delimiters added around the text on output.
    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"
|
|
|
|
|
|
|
|
class Stylesheet(NavigableString):
    """A `NavigableString` representing the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    This class adds no behavior of its own; only the type differs.
    """
|
|
|
|
|
|
|
|
class Script(NavigableString):
    """A `NavigableString` representing the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Used to distinguish executable code from textual content.
    This class adds no behavior of its own; only the type differs.
    """
|
|
|
|
|
|
|
|
class TemplateString(NavigableString):
    """A `NavigableString` representing a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    This class adds no behavior of its own; only the type differs.
    """
|
|
|
|
|
|
|
|
class RubyTextString(NavigableString):
    """A `NavigableString` representing the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Can be used to distinguish such strings from the strings they're
    annotating. This class adds no behavior of its own; only the type
    differs.
    """
|
|
|
|
|
|
|
|
class RubyParenthesisString(NavigableString):
    """A `NavigableString` representing the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.

    This class adds no behavior of its own; only the type differs.
    """
|
|
|
|
|
|
|
|
class Tag(PageElement): |
|
|
"""An HTML or XML tag that is part of a parse tree, along with its |
|
|
attributes, contents, and relationships to other parts of the tree. |
|
|
|
|
|
When Beautiful Soup parses the markup ``<b>penguin</b>``, it will |
|
|
create a `Tag` object representing the ``<b>`` tag. You can |
|
|
instantiate `Tag` objects directly, but it's not necessary unless |
|
|
you're adding entirely new markup to a parsed document. Most of |
|
|
the constructor arguments are intended for use by the `TreeBuilder` |
|
|
that's parsing a document. |
|
|
|
|
|
:param parser: A `BeautifulSoup` object representing the parse tree this |
|
|
`Tag` will be part of. |
|
|
:param builder: The `TreeBuilder` being used to build the tree. |
|
|
:param name: The name of the tag. |
|
|
:param namespace: The URI of this tag's XML namespace, if any. |
|
|
:param prefix: The prefix for this tag's XML namespace, if any. |
|
|
:param attrs: A dictionary of attribute values. |
|
|
:param parent: The `Tag` to use as the parent of this `Tag`. May be |
|
|
the `BeautifulSoup` object itself. |
|
|
:param previous: The `PageElement` that was parsed immediately before |
|
|
parsing this tag. |
|
|
:param is_xml: If True, this is an XML tag. Otherwise, this is an |
|
|
HTML tag. |
|
|
:param sourceline: The line number where this tag was found in its |
|
|
source document. |
|
|
:param sourcepos: The character position within ``sourceline`` where this |
|
|
tag was found. |
|
|
:param can_be_empty_element: If True, this tag should be |
|
|
represented as <tag/>. If False, this tag should be represented |
|
|
as <tag></tag>. |
|
|
:param cdata_list_attributes: A dictionary of attributes whose values should |
|
|
be parsed as lists of strings if they ever show up on this tag. |
|
|
:param preserve_whitespace_tags: Names of tags whose contents |
|
|
should have their whitespace preserved if they are encountered inside |
|
|
this tag. |
|
|
:param interesting_string_types: When iterating over this tag's |
|
|
string contents in methods like `Tag.strings` or |
|
|
`PageElement.get_text`, these are the types of strings that are |
|
|
interesting enough to be considered. By default, |
|
|
`NavigableString` (normal strings) and `CData` (CDATA |
|
|
sections) are the only interesting string subtypes. |
|
|
:param namespaces: A dictionary mapping currently active |
|
|
namespace prefixes to URIs, as of the point in the parsing process when |
|
|
this tag was encountered. This can be used later to |
|
|
construct CSS selectors. |
|
|
|
|
|
""" |
|
|
|
|
|
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: copy_self passes these arguments through by position and
    # keyword, so a new argument here needs to be mirrored there.
):
    """Set up a new Tag.

    The meaning of each argument is described in the `Tag` class
    docstring. When a `builder` is given, most tag-level settings are
    taken from it; otherwise the values passed in here are used.

    :raise ValueError: If `name` is None.
    """
    # Only the parser's class is kept, not the parser object itself.
    if parser is None:
        self.parser_class = None
    else:
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix
    # NOTE(review): both branches below assign the same values, so the
    # condition currently has no effect on the result (it does read
    # builder.store_line_numbers when a builder is given). Confirm
    # whether the else branch was meant to store None.
    if (not builder or builder.store_line_numbers) and (
        sourceline is not None or sourcepos is not None
    ):
        self.sourceline = sourceline
        self.sourcepos = sourcepos
    else:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    # Pick the container classes for attributes: from the builder when
    # there is one, otherwise XML/HTML defaults chosen by `is_xml`.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            # Let the builder convert the incoming attribute values.
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Copy the attributes over. List values are rebuilt as new
            # lists of the same class, so this tag does not share list
            # objects with the caller's mapping.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # Record whether this is an XML tag: the builder's answer wins when
    # a builder is present.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # No builder: use whatever was passed in (copy_self passes the
        # source tag's values here).
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # NOTE(review): this repeats the assignment made above with the
        # same builder value.
        self.attribute_value_list_class = builder.attribute_value_list_class
        # Give the builder a chance to set up substitutions on this tag.
        builder.set_up_substitutions(self)

        # Ask the builder whether a tag with this name may be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Remember the builder's settings so they remain available on
        # the tag itself.
        self.cdata_list_attributes = builder.cdata_list_attributes

        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # The builder names a specific string class for this tag
            # name; only that class counts as interesting.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
|
|
|
|
|
# Type declarations for the instance attributes that __init__ assigns.
# These lines declare types only; no class-level values are created.
parser_class: Optional[type[BeautifulSoup]]  # class of the parser passed in, if any
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]  # starts out as an empty list
hidden: bool  # set to False by __init__
interesting_string_types: Optional[Set[Type[NavigableString]]]

# Settings that come from the TreeBuilder when there is one, and from the
# constructor arguments otherwise.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]
|
|
|
|
|
|
|
|
# Deprecated camelCase spelling of the `parser_class` attribute, registered
# through `_deprecated_alias` with the version string "4.0.0".
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
|
|
|
|
|
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :param memo: Passed through to the `__deepcopy__` of each descendant.
    :param recursive: If False, only this tag is copied (via `copy_self`)
        and the copy has no contents.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Descendants are copied with an explicit stack instead of recursive
    # calls: the last entry is the copy currently receiving children.
    open_tags: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # This tag is finished; later copies go to its parent.
            open_tags.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Attach the copy to the innermost open tag.
        open_tags[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # A tag was opened: its children will be appended to it.
            open_tags.append(cast(Tag, copied))
    return clone
|
|
|
|
|
def copy_self(self) -> Self:
    """Create a new Tag just like this one, but with no
    contents and unattached to any parse tree.

    This is the first step in the deepcopy process, but you can
    call it on its own to create a copy of a Tag without copying its
    contents.

    :return: A new object of the same class as this one.
    """
    # No parser and no builder are handed to the constructor, so the
    # tag-level settings have to be passed explicitly.
    positional = (
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
    )
    settings = dict(
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    clone = type(self)(*positional, **settings)
    # Carried over after construction as well (the constructor always
    # sets hidden to False).
    clone.can_be_empty_element = self.can_be_empty_element
    clone.hidden = self.hidden
    return clone
|
|
|
|
|
@property
def is_empty_element(self) -> bool:
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the `TreeBuilder` used to create the
    tag. If the builder has a designated list of empty-element
    tags, then only a tag whose name shows up in that list is
    considered an empty-element tag. This is usually the case
    for HTML documents.

    If the builder has no designated list of empty-element, then
    any tag with no contents is an empty-element tag. This is usually
    the case for XML documents.
    """
    if len(self.contents) != 0:
        return False
    # Only the exact value True qualifies; None and other values do not.
    return self.can_be_empty_element is True
|
|
|
|
|
@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    """Old-style accessor that returns `is_empty_element`.

    :meta private:
    """
    answer = self.is_empty_element
    return answer
|
|
|
|
|
@property
def string(self) -> Optional[str]:
    """Convenience property to get the single string within this
    `Tag`, assuming there is just one.

    :return: If this `Tag` has a single child that's a
     `NavigableString`, the return value is that string. If this
     element has one child `Tag`, the return value is that child's
     `Tag.string`, recursively. If this `Tag` has no children,
     or has more than one child, the return value is ``None``.

     If this property is unexpectedly returning ``None`` for you,
     it's probably because your `Tag` has more than one thing
     inside it.
    """
    children = self.contents
    if len(children) == 1:
        only_child = children[0]
        if isinstance(only_child, NavigableString):
            return only_child
        if isinstance(only_child, Tag):
            # Defer to the nested tag's own answer.
            return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string.

    A `NavigableString` (or subclass) value is re-created as the same
    class; any other value is wrapped in a plain `NavigableString`.
    """
    self.clear()
    wrapper = string.__class__ if isinstance(string, NavigableString) else NavigableString
    self.append(wrapper(string))
|
|
|
|
|
|
|
|
# The string classes used when no more specific set of interesting string
# types applies: plain NavigableString and CData.
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}
|
|
|
|
|
def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that are empty after stripping are skipped.

    :param types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, the subclasses considered are the ones found in
        self.interesting_string_types. If that's not specified,
        only NavigableString and CData objects will be
        considered. That means no comments, processing
        instructions, etc.

    :yield: The matching strings found among this tag's descendants.
    """
    if types is self.default:
        configured = self.interesting_string_types
        types = self.MAIN_CONTENT_STRING_TYPES if configured is None else configured

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue

        # Strings are matched on their exact class, not with isinstance().
        node_type = type(node)
        if isinstance(types, type):
            if node_type is not types:
                # Not the single class being asked for.
                continue
        elif types is not None and node_type not in types:
            # Not one of the classes being asked for.
            continue

        if not strip:
            yield node
            continue

        text = node.strip()
        if len(text) == 0:
            continue
        yield text
|
|
|
|
|
# Read-only property: accessing `.strings` calls `_all_strings` with its
# default arguments.
strings = property(_all_strings)
|
|
|
|
|
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    # Each successive child is handed to _insert one position further on
    # than the previous one.
    for offset, child in enumerate(new_children):
        inserted += self._insert(position + offset, child)
    return inserted
|
|
|
|
|
def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single object at `position` and rewire the tree links.

    Besides updating `contents`, this keeps the `parent`,
    `previous_sibling`/`next_sibling` and
    `previous_element`/`next_element` links of the affected nodes
    consistent.

    :param position: Index in `contents` the new child should occupy;
        values past the end are clamped to the end.
    :param new_child: The object to insert. A plain `str` is wrapped in
        a `NavigableString`; a `BeautifulSoup` object is replaced by its
        contents.
    :return: A list of the elements that were inserted.
    :raise ValueError: If `new_child` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level.
    from bs4 import BeautifulSoup

    if isinstance(new_child, BeautifulSoup):
        # Insert the contents of the BeautifulSoup object rather than the
        # object itself. The list() copy is taken because inserting a
        # child extracts it from its current parent, which would mutate
        # new_child.contents while it is being unpacked.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # The element already lives somewhere in a tree; it has to be
        # detached before it can be attached here.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # The element currently sits before the target slot, so
                # extracting it shifts the target index down by one.
                position -= 1
            elif current_index == position:
                # The element is already exactly where it was asked to
                # go; nothing to do.
                return [new_child]
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element that comes
        # immediately before it is this tag.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        # The element just before the new child is the last descendant
        # of the preceding sibling.
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # accept_self=True is passed above; the cast tells the type checker
    # the result is treated as a PageElement (never None) from here on.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        new_child.next_sibling = None

        # Walk up through this tag and its ancestors until one of them
        # has a next sibling; that sibling is the element that follows
        # the new child's subtree.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # Found the element that comes next.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # No ancestor has a next sibling: the new child's last
            # element is the last element reachable from here.
            new_childs_last_element.next_element = None
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]
|
|
|
|
|
def unwrap(self) -> Self:
    """Remove this element from the tree, leaving its children in its place.

    :return: This object, which is no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Work on a snapshot: the children are re-inserted into `container`
    # one at a time. Inserting them back-to-front at the same slot keeps
    # their original order.
    snapshot = list(self.contents)
    for child in snapshot[::-1]:
        container.insert(slot, child)
    return self


replace_with_children = unwrap
|
|
|
|
|
@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    """Deprecated spelling of `unwrap`; simply delegates to it.

    :meta private:
    """
    return self.unwrap()
|
|
|
|
|
def append(self, tag: _InsertableElement) -> PageElement:
    """Add the given `PageElement` after the last child of this `Tag`.

    :param tag: A PageElement.

    :return: The newly appended PageElement.
    """
    end_position = len(self.contents)
    appended = self.insert(end_position, tag)
    return appended[0]
|
|
|
|
|
def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raise TypeError: If ``tags`` is neither a `Tag`, a single
        element/string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Snapshot the children: appending each one removes it from
        # tags.contents, so iterating the live list would skip items.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # A lone string or non-Tag element was passed. Iterating it
        # directly would be wrong (a str would be split into
        # characters), so treat it as a one-item list and warn.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Materialize the iterable first in case it is a view that
        # changes as elements are moved into this tag.
        tag_list = list(tags)
    else:
        # Previously this fell through with `tag_list` unbound and
        # surfaced as a confusing UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    results: List[PageElement] = []
    for tag in tag_list:
        results.append(self.append(tag))

    return results
|
|
|
|
|
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: When true, each child is removed with
        `PageElement.decompose`; otherwise `PageElement.extract` is
        used.
    """
    # Iterate over a copy, since removing a child changes self.contents.
    for child in list(self.contents):
        remove = child.decompose if decompose else child.extract
        remove()
|
|
|
|
|
def smooth(self) -> None:
    """Merge runs of adjacent strings among this `Tag`'s children.

    Child tags are smoothed recursively. Two neighbouring children are
    merged when both are `NavigableString` objects and neither is a
    `PreformattedString`.

    If you perform a lot of operations that modify the tree,
    calling this method afterwards can make pretty-printed output
    look more natural.
    """

    def _mergeable(node: PageElement) -> bool:
        # Only plain strings take part in merging.
        return isinstance(node, NavigableString) and not isinstance(
            node, PreformattedString
        )

    # First pass: recurse into child tags and remember every position
    # whose child can be merged with the one right after it. Nothing in
    # self.contents is modified during this pass.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbour.
            continue
        following = self.contents[position + 1]
        if _mergeable(current) and _mergeable(following):
            merge_points.append(position)

    # Second pass: merge from the end backwards so the positions
    # recorded earlier in the list remain valid.
    for position in merge_points[::-1]:
        left = cast(NavigableString, self.contents[position])
        right = cast(NavigableString, self.contents[position + 1])
        right.extract()
        left.replace_with(NavigableString(left + right))
|
|
|
|
|
def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, matched by identity.

    Matching with ``is`` rather than ``==`` gives the right answer even
    when a `Tag` has several children that compare equal.

    :param element: Look for this `PageElement` in this object's contents.
    :raise ValueError: If `element` is not one of this tag's children.
    """
    found = next(
        (
            position
            for position, child in enumerate(self.contents)
            if child is element
        ),
        None,
    )
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
|
|
|
|
|
def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the attribute named `key` on this tag.

    :param key: The attribute to look for.
    :param default: Returned when this `Tag` has no such attribute.
    :return: The attribute's value, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
|
|
|
|
|
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), but the result is always a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value.
    """
    found = self.get(key, default)
    if found is None:
        # No value and no default: hand back a fresh empty list.
        return self.attribute_value_list_class()
    if isinstance(found, list):
        # Already a list; return it as-is (not a copy).
        return found
    # A single value: wrap it. The cast only informs the type checker.
    return self.attribute_value_list_class([cast(str, found)])
|
|
|
|
|
def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute named `key`."""
    attributes = self.attrs
    return key in attributes
|
|
|
|
|
def __hash__(self) -> int:
    """Hash this object by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)
|
|
|
|
|
def __getitem__(self, key: str) -> _AttributeValue:
    """Return the value of the `key` attribute via ``tag[key]``.

    The lookup is a plain ``self.attrs[key]``, so a missing attribute
    raises whatever that mapping raises (`KeyError` for a dict).
    """
    attributes = self.attrs
    return attributes[key]
|
|
|
|
|
def __iter__(self) -> Iterator[PageElement]:
    """Iterate over this Tag's direct contents."""
    contents_iterator = iter(self.contents)
    return contents_iterator
|
|
|
|
|
def __len__(self) -> int:
    """Return the number of direct children in this Tag's contents."""
    child_count = len(self.contents)
    return child_count
|
|
|
|
|
def __contains__(self, x: Any) -> bool:
    """Report whether `x` is among this Tag's direct contents."""
    members = self.contents
    return x in members
|
|
|
|
|
def __bool__(self) -> bool:
    """Always true: a tag is truthy even when it has no contents.

    Without this, truth testing would fall back to `__len__` and an
    empty tag would be falsy.
    """
    return True
|
|
|
|
|
def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Set the `key` attribute of this tag via ``tag[key] = value``."""
    attributes = self.attrs
    attributes[key] = value
|
|
|
|
|
def __delitem__(self, key: str) -> None:
    """Remove the `key` attribute via ``del tag[key]``.

    Deleting an attribute that is not present is silently ignored.
    """
    attributes = self.attrs
    attributes.pop(key, None)
|
|
|
|
|
def __call__(
    self,
    name: Optional[_StrainableElement] = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Shorthand for find_all(): ``tag('a')`` is ``tag.find_all('a')``.

    All arguments are forwarded to `find_all` unchanged and in the
    same order.
    """
    forwarded = (name, attrs, recursive, string, limit, _stacklevel)
    return self.find_all(*forwarded, **kwargs)
|
|
|
|
|
def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Treat ``tag.subtag`` as ``tag.find("subtag")``.

    Names ending in ``Tag`` (e.g. ``tag.bTag``) use a deprecated
    spelling: a `DeprecationWarning` is issued and the suffix is
    stripped before searching.

    :raise AttributeError: For names starting with a double underscore
        and for ``contents``, which are never treated as tag names.
    """
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # Deprecated ".fooTag" spelling: search for "foo".
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % {"name": tag_name},
            DeprecationWarning,
            stacklevel=2,
        )
        return cast(Optional[Tag], self.find(tag_name))

    if subtag.startswith("__") or subtag == "contents":
        # Dunder names and "contents" are reported as ordinary missing
        # attributes rather than being turned into a tree search.
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )

    return cast(Optional[Tag], self.find(subtag))
|
|
|
|
|
def __eq__(self, other: Any) -> bool:
    """Structural equality: same name, same attributes and pairwise-equal
    contents (compared recursively) as `other`."""
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False

    has_structure = (
        hasattr(other, "name")
        and hasattr(other, "attrs")
        and hasattr(other, "contents")
    )
    if not has_structure:
        return False
    if (
        self.name != other.name
        or self.attrs != other.attrs
        or len(self) != len(other)
    ):
        return False

    # Lengths match at this point, so the children pair up one-to-one.
    return not any(
        mine != theirs for mine, theirs in zip(self.contents, other.contents)
    )
|
|
|
|
|
def __ne__(self, other: Any) -> bool:
    """Negation of ``==``: true iff this Tag is not equal to `other`
    as defined by `__eq__`."""
    are_equal = self == other
    return not are_equal
|
|
|
|
|
def __repr__(self) -> str:
    """Return this `Tag` rendered as a string by `decode` with its
    default arguments."""
    rendered = self.decode()
    return rendered


# str() (and the legacy __unicode__ name) share the implementation.
__str__ = __unicode__ = __repr__
|
|
|
|
|
def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered with `decode` and the resulting string
    is then encoded.

    :param encoding: The encoding to use when converting to
       a bytestring. It is also passed to `decode` as the eventual
       encoding, so it may affect the text of the document,
       specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)
|
|
|
|
|
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The rendering is built iteratively from the events produced by
    `_event_stream`, not by recursive calls.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing. ``None`` disables pretty-printing.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    """
    # Rendered fragments, joined once at the end.
    pieces = []

    # A formatter given by name is resolved to a Formatter object.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Accept a literal True as "pretty-print starting at level 0".
    if indent_level is True:
        indent_level = 0

    # While this is set, we are inside a tag whose
    # _should_pretty_print() returned False: nothing inside it is
    # indented or followed by a newline until that tag is closed.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # Dedent before the closing tag is indented below.
            if indent_level is not None:
                indent_level -= 1
        else:
            # Anything else is a string event.
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Default whitespace policy for this piece: none inside a
        # string-literal tag, indentation plus newline otherwise.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # Opening a tag that must not be pretty-printed: indent
            # the opening tag itself, but add no newline after it,
            # and remember the tag so its contents are left alone.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # Closing that same tag: no indentation before the closing
            # tag, a newline after it, and normal rules resume.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Apply the whitespace policy only when pretty-printing.
        if indent_level is not None:
            if indent_before or indent_after:
                if isinstance(element, NavigableString):
                    # Surrounding whitespace of strings is replaced by
                    # the indentation added below.
                    piece = piece.strip()
                # Pieces that are (or became) empty get no whitespace.
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything after an opening tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)
|
|
|
|
|
class _TreeTraversalEvent:
    """Marker type for the events produced while walking a parse tree.

    Instances carry no data; they are told apart by identity.

    :meta private:
    """


# One sentinel per kind of traversal event.
START_ELEMENT_EVENT = _TreeTraversalEvent()
END_ELEMENT_EVENT = _TreeTraversalEvent()
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()
STRING_ELEMENT_EVENT = _TreeTraversalEvent()
|
|
|
|
|
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
     the tree. Defaults to `self_and_descendants`.
    :return: ``(event, element)`` pairs, where ``event`` is one of the
     ``*_ELEMENT_EVENT`` sentinels.
    """
    # Tags that have been opened but not yet closed, innermost last.
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not the
        # tag currently on top of the stack, that tag has no more
        # children: close it. Repeat until the parent is on top.
        #
        # This is an identity test on purpose. `!=` would go through
        # Tag.__eq__, which compares name, attributes and contents
        # recursively -- needlessly expensive here, and the question
        # being asked is "is this the same node?", not "does it look
        # the same?".
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                # Empty elements are never pushed, so they get no
                # matching end event.
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # The iterator is exhausted: close whatever is still open.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag
|
|
|
|
|
def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Surround a string with indentation and/or a trailing newline.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; the leading whitespace
       is ``formatter.indent`` repeated this many times.
    :param formatter: Supplies the ``indent`` unit.
    :param indent_before: Whether or not to add whitespace
       before the string.
    :param indent_after: Whether or not to add whitespace
       (a newline) after the string.
    """
    if indent_before and indent_level:
        leading = formatter.indent * indent_level
    else:
        leading = ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing
|
|
|
|
|
def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render this tag's opening or closing markup (not its contents).

    :param eventual_encoding: Encoding substituted into attribute values
        that are `AttributeValueWithCharsetSubstitution` objects.
    :param formatter: Supplies the attributes to output, value
        escaping and quoting, and the void-element closing prefix.
    :param opening: True for the opening tag, False for the closing tag.
    :return: The markup, or an empty string for a hidden tag.
    """
    if self.hidden:
        # A hidden tag produces no markup of its own.
        return ""

    # A closing tag has a slash after the opening angle bracket.
    closing_slash = "" if opening else "/"

    # Namespace prefix, if any.
    prefix = self.prefix + ":" if self.prefix else ""

    # Attributes only appear on the opening tag.
    rendered_attributes = []
    if opening:
        for key, val in formatter.attributes(self):
            if val is None:
                # Valueless attribute: output the bare name.
                rendered_attributes.append(key)
                continue
            if isinstance(val, (list, tuple)):
                # Multi-valued attribute: space-separated.
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)
            text = formatter.attribute_value(val)
            rendered_attributes.append(
                str(key) + "=" + formatter.quoted_attribute_value(text)
            )
    attribute_string = ""
    if rendered_attributes:
        attribute_string = " " + " ".join(rendered_attributes)

    # For an empty element the formatter decides what (if anything)
    # goes right before the closing angle bracket.
    void_element_closing_slash = ""
    if self.is_empty_element:
        void_element_closing_slash = formatter.void_element_close_prefix or ""

    markup = "<" + closing_slash + prefix + self.name
    markup = markup + attribute_string + void_element_closing_slash + ">"
    return markup
|
|
|
|
|
def _should_pretty_print(self, indent_level: int = 1) -> bool:
    """Decide whether this tag may be pretty-printed.

    The answer is no when `indent_level` is None, or when this tag's
    name is listed in `preserve_whitespace_tags` (such as <pre> in
    HTML documents); otherwise yes.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    if not preserved:
        return True
    return self.name not in preserved
|
|
|
|
|
def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    Rendering starts at indent level 0.

    :param encoding: The encoding of the bytestring, or None if you want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)
|
|
|
|
|
def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag -- but not the tag itself -- as a
    Unicode string.

    This is `decode` run over `descendants` instead of over the tag
    and its descendants.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The formatter decides what a
       'level' means in terms of spaces or other characters
       output.) Used internally in recursive calls while
       pretty-printing.

    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. decode_contents() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    inner_elements = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=inner_elements
    )
|
|
|
|
|
def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this PageElement as a bytestring.

    The contents are rendered with `decode_contents` and the result is
    encoded with :py:meth:`str.encode` using its default error handling.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)
|
|
|
|
|
@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility.

    Delegates to `encode_contents`; `indentLevel` is ignored unless
    `prettyPrint` is true.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)
|
|
|
|
|
|
|
|
|
|
|
def find(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Look in the children of this PageElement and return the first
    PageElement that matches the given criteria, or None.

    This is `find_all` with a limit of one result.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    matches = self.find_all(
        name, attrs, recursive, string, 1, _stacklevel=3, **kwargs
    )
    return matches[0] if matches else None
|
|
|
|
|
# Legacy camelCase name for `find`. The arguments give the alias name, the
# replacement method name and a version string (presumably the release in
# which the alias was deprecated -- confirm in bs4._deprecation).
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
|
|
|
|
|
def find_all(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Look in the children of this `PageElement` and return all
    `PageElement` objects that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will search
        `descendants`. Otherwise, only the direct `children` will be
        considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.descendants if recursive else self.children
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
|
|
|
|
|
# Legacy camelCase names for `find_all`. Each call gives the alias name, the
# replacement method name and a version string (presumably the release in
# which the alias was deprecated -- confirm in bs4._deprecation).
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
|
|
|
|
|
|
|
|
@property
def children(self) -> Iterator[PageElement]:
    """A generator over the direct children of this `PageElement`."""
    contents_iterator = iter(self.contents)
    return (child for child in contents_iterator)
|
|
|
|
|
@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` followed by everything beneath it.

    The elements come from `descendants`, i.e. in document order,
    with this tag added by `_self_and`.
    """
    below = self.descendants
    return self._self_and(below)
|
|
|
|
|
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`, in document order.

    The walk starts at the first child and follows the `next_element`
    links, so each element is followed by its own descendants before
    its next sibling. It stops at the element that follows this
    tag's last descendant.
    """
    if len(self.contents) == 0:
        return

    # The element right after our last descendant is the first one
    # that is *not* beneath this tag; stop when we reach it.
    final_node = cast(PageElement, self._last_descendant(accept_self=True))
    boundary = final_node.next_element
    node: _AtMostOneElement = self.contents[0]
    while node is not None and node is not boundary:
        # Fetch the successor before yielding, so the walk continues
        # from the right place even if the consumer detaches `node`.
        upcoming = node.next_element
        yield node
        node = upcoming
|
|
|
|
|
|
|
|
def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element and return a
    single result.

    Delegates to ``self.css.select_one``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
|
|
|
|
|
def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element and return all
    results.

    This uses the SoupSieve library, by delegating to ``self.css.select``.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
|
|
|
|
|
@property
def css(self) -> CSS:
    """An interface to the CSS selector API for this element.

    A new `CSS` object wrapping this element is built on every access.
    """
    selector_api = CSS(self)
    return selector_api
|
|
|
|
|
|
|
|
@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns the `children` property.

    :meta private:
    """
    direct_children = self.children
    return direct_children
|
|
|
|
|
@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns the `descendants` property.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below
|
|
|
|
|
@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; delegates to `has_attr`.

    The name was misleading: has_key() looked at attributes, while
    the ``in`` operator (`__contains__`) looks at contents.

    has_key() is gone in Python 3, anyway.

    :meta private:
    """
    present = self.has_attr(key)
    return present
|
|
|
|
|
|
|
|
# Type variable for the element type held by a ResultSet; restricted to
# PageElement and its subclasses.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)
|
|
|
|
|
|
|
|
class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects gathered by matching an
    :py:class:`ElementFilter` against a parse tree -- in other words,
    a list of search results.
    """

    # The filter that produced these results, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Create a result list.

        :param source: The filter that produced the results.
        :param result: The initial elements of the list.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        This is only called for attributes that normal lookup did not
        find, and it always raises `AttributeError`.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from bs4.filter import SoupStrainer |
|
|
|