Karim shoair committed on
Commit ·
6eaebde
1
Parent(s): 20efe8c
docs: improve All `Adaptor` class doc strings
Browse files- scrapling/parser.py +101 -100
scrapling/parser.py
CHANGED
|
@@ -67,21 +67,21 @@ class Adaptor(SelectorsGeneration):
|
|
| 67 |
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
| 68 |
|
| 69 |
Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface, We are not
|
| 70 |
-
inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable which makes a lot of reference jobs
|
| 71 |
not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
|
| 72 |
It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
| 73 |
|
| 74 |
:param text: HTML body passed as text.
|
| 75 |
-
:param url: allows storing a URL with the
|
| 76 |
-
:param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
|
| 77 |
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
| 78 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 79 |
-
|
| 80 |
-
:param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
|
| 81 |
Don't use it unless you know what you are doing!
|
| 82 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
| 83 |
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
|
| 84 |
-
:param auto_match: Globally turn
|
| 85 |
priority over all auto-match related arguments/functions in the class.
|
| 86 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 87 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
|
@@ -125,7 +125,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 125 |
self.__text = TextHandler(text or body.decode())
|
| 126 |
|
| 127 |
else:
|
| 128 |
-
# All
|
| 129 |
if not issubclass(type(root), html.HtmlMixin):
|
| 130 |
raise TypeError(
|
| 131 |
f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
|
|
@@ -181,15 +181,15 @@ class Adaptor(SelectorsGeneration):
|
|
| 181 |
else {}
|
| 182 |
)
|
| 183 |
|
| 184 |
-
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
| 185 |
@staticmethod
|
| 186 |
def _is_text_node(
|
| 187 |
element: Union[html.HtmlElement, etree._ElementUnicodeResult],
|
| 188 |
) -> bool:
|
| 189 |
-
"""Return True if given element is a result of a string expression
|
| 190 |
Examples:
|
| 191 |
-
XPath -> '/text()', '/@attribute' etc...
|
| 192 |
-
CSS3
|
| 193 |
"""
|
| 194 |
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
| 195 |
return issubclass(type(element), etree._ElementUnicodeResult)
|
|
@@ -200,7 +200,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 200 |
) -> TextHandler:
|
| 201 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 202 |
|
| 203 |
-
This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
|
| 204 |
"""
|
| 205 |
return TextHandler(str(element))
|
| 206 |
|
|
@@ -209,7 +209,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 209 |
return Adaptor(
|
| 210 |
root=element,
|
| 211 |
text="",
|
| 212 |
-
body=b"", # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 213 |
url=self.url,
|
| 214 |
encoding=self.encoding,
|
| 215 |
auto_match=self.__auto_match_enabled,
|
|
@@ -240,8 +240,8 @@ class Adaptor(SelectorsGeneration):
|
|
| 240 |
): # Lxml will give a warning if I used something like `not result`
|
| 241 |
return Adaptors([])
|
| 242 |
|
| 243 |
-
# From within the code, this method will always get a list of the same type
|
| 244 |
-
# so we will continue without checks for slight performance boost
|
| 245 |
if self._is_text_node(result[0]):
|
| 246 |
return TextHandlers(list(map(self.__content_convertor, result)))
|
| 247 |
|
|
@@ -253,12 +253,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 253 |
|
| 254 |
# The following four properties I made them into functions instead of variables directly
|
| 255 |
# So they don't slow down the process of initializing many instances of the class and gets executed only
|
| 256 |
-
# when the user
|
| 257 |
# Doing that only made the library performance test skyrocket multiple times faster than before
|
| 258 |
# because I was executing them on initialization before :))
|
| 259 |
@property
|
| 260 |
def tag(self) -> str:
|
| 261 |
-
"""Get tag name of the element"""
|
| 262 |
if not self.__tag:
|
| 263 |
self.__tag = self._root.tag
|
| 264 |
return self.__tag
|
|
@@ -267,8 +267,8 @@ class Adaptor(SelectorsGeneration):
|
|
| 267 |
def text(self) -> TextHandler:
|
| 268 |
"""Get text content of the element"""
|
| 269 |
if not self.__text:
|
| 270 |
-
# If you want to escape lxml default
|
| 271 |
-
# before extracting text then keep `keep_comments` set to False while initializing the first class
|
| 272 |
self.__text = TextHandler(self._root.text)
|
| 273 |
return self.__text
|
| 274 |
|
|
@@ -322,7 +322,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 322 |
|
| 323 |
@property
|
| 324 |
def html_content(self) -> TextHandler:
|
| 325 |
-
"""Return the inner
|
| 326 |
return TextHandler(
|
| 327 |
etree.tostring(
|
| 328 |
self._root, encoding="unicode", method="html", with_tail=False
|
|
@@ -344,7 +344,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 344 |
)
|
| 345 |
|
| 346 |
def has_class(self, class_name: str) -> bool:
|
| 347 |
-
"""Check if element has a specific class
|
| 348 |
:param class_name: The class name to check for
|
| 349 |
:return: True if element has class with that name otherwise False
|
| 350 |
"""
|
|
@@ -382,7 +382,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 382 |
return Adaptors([])
|
| 383 |
|
| 384 |
def iterancestors(self) -> Generator["Adaptor", None, None]:
|
| 385 |
-
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
| 386 |
for ancestor in self._root.iterancestors():
|
| 387 |
yield self.__element_convertor(ancestor)
|
| 388 |
|
|
@@ -400,7 +400,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 400 |
|
| 401 |
@property
|
| 402 |
def path(self) -> "Adaptors[Adaptor]":
|
| 403 |
-
"""Returns list of type
|
| 404 |
lst = list(self.iterancestors())
|
| 405 |
return Adaptors(lst)
|
| 406 |
|
|
@@ -410,7 +410,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 410 |
next_element = self._root.getnext()
|
| 411 |
if next_element is not None:
|
| 412 |
while type(next_element) in html_forbidden:
|
| 413 |
-
# Ignore
|
| 414 |
next_element = next_element.getnext()
|
| 415 |
|
| 416 |
return self.__handle_element(next_element)
|
|
@@ -421,7 +421,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 421 |
prev_element = self._root.getprevious()
|
| 422 |
if prev_element is not None:
|
| 423 |
while type(prev_element) in html_forbidden:
|
| 424 |
-
# Ignore
|
| 425 |
prev_element = prev_element.getprevious()
|
| 426 |
|
| 427 |
return self.__handle_element(prev_element)
|
|
@@ -456,7 +456,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 456 |
|
| 457 |
return data + ">"
|
| 458 |
|
| 459 |
-
# From here we start the selecting functions
|
| 460 |
def relocate(
|
| 461 |
self,
|
| 462 |
element: Union[Dict, html.HtmlElement, "Adaptor"],
|
|
@@ -467,13 +467,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 467 |
|
| 468 |
:param element: The element we want to relocate in the tree
|
| 469 |
:param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
|
| 470 |
-
calculation depends solely on the page structure so don't play with this number unless you must know
|
| 471 |
what you are doing!
|
| 472 |
:param adaptor_type: If True, the return result will be converted to `Adaptors` object
|
| 473 |
:return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
|
| 474 |
"""
|
| 475 |
score_table = {}
|
| 476 |
-
# Note: `element` will
|
| 477 |
if isinstance(element, self.__class__):
|
| 478 |
element = element._root
|
| 479 |
|
|
@@ -481,7 +481,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 481 |
element = _StorageTools.element_to_dict(element)
|
| 482 |
|
| 483 |
for node in self._root.xpath(".//*"):
|
| 484 |
-
# Collect all elements in the page then for each element get the matching score of it against the node.
|
| 485 |
# Hence: the code doesn't stop even if the score was 100%
|
| 486 |
# because there might be another element(s) left in page with the same score
|
| 487 |
score = self.__calculate_similarity_score(element, node)
|
|
@@ -491,7 +491,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 491 |
highest_probability = max(score_table.keys())
|
| 492 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 493 |
if log.getEffectiveLevel() < 20:
|
| 494 |
-
# No need to execute this part if logging level is not debugging
|
| 495 |
log.debug(f"Highest probability was {highest_probability}%")
|
| 496 |
log.debug("Top 5 best matching elements are: ")
|
| 497 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
|
@@ -512,19 +512,19 @@ class Adaptor(SelectorsGeneration):
|
|
| 512 |
auto_save: bool = False,
|
| 513 |
percentage: int = 0,
|
| 514 |
) -> Union["Adaptor", "TextHandler", None]:
|
| 515 |
-
"""Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 516 |
|
| 517 |
**Important:
|
| 518 |
-
It's recommended to use the identifier argument if you plan to use different selector later
|
| 519 |
and want to relocate the same element(s)**
|
| 520 |
|
| 521 |
:param selector: The CSS3 selector to be used.
|
| 522 |
-
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
| 523 |
-
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 524 |
otherwise the selector will be used.
|
| 525 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 526 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 527 |
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 528 |
number unless you must know what you are doing!
|
| 529 |
"""
|
| 530 |
for element in self.css(
|
|
@@ -542,21 +542,21 @@ class Adaptor(SelectorsGeneration):
|
|
| 542 |
percentage: int = 0,
|
| 543 |
**kwargs: Any,
|
| 544 |
) -> Union["Adaptor", "TextHandler", None]:
|
| 545 |
-
"""Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 546 |
|
| 547 |
**Important:
|
| 548 |
-
It's recommended to use the identifier argument if you plan to use different selector later
|
| 549 |
and want to relocate the same element(s)**
|
| 550 |
|
| 551 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 552 |
|
| 553 |
:param selector: The XPath selector to be used.
|
| 554 |
-
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
| 555 |
-
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 556 |
otherwise the selector will be used.
|
| 557 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 558 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 559 |
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 560 |
number unless you must know what you are doing!
|
| 561 |
"""
|
| 562 |
for element in self.xpath(
|
|
@@ -573,22 +573,22 @@ class Adaptor(SelectorsGeneration):
|
|
| 573 |
auto_save: bool = False,
|
| 574 |
percentage: int = 0,
|
| 575 |
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 576 |
-
"""Search current tree with CSS3 selectors
|
| 577 |
|
| 578 |
**Important:
|
| 579 |
-
It's recommended to use the identifier argument if you plan to use different selector later
|
| 580 |
and want to relocate the same element(s)**
|
| 581 |
|
| 582 |
:param selector: The CSS3 selector to be used.
|
| 583 |
-
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
| 584 |
-
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 585 |
otherwise the selector will be used.
|
| 586 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 587 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 588 |
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 589 |
number unless you must know what you are doing!
|
| 590 |
|
| 591 |
-
:return:
|
| 592 |
"""
|
| 593 |
try:
|
| 594 |
if not self.__auto_match_enabled or "," not in selector:
|
|
@@ -605,7 +605,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 605 |
results = []
|
| 606 |
if "," in selector:
|
| 607 |
for single_selector in split_selectors(selector):
|
| 608 |
-
# I'm doing this only so the `save` function
|
| 609 |
# Like using the ',' to combine two different selectors that point to different elements.
|
| 610 |
xpath_selector = translator_instance.css_to_xpath(
|
| 611 |
single_selector.canonical()
|
|
@@ -634,24 +634,24 @@ class Adaptor(SelectorsGeneration):
|
|
| 634 |
percentage: int = 0,
|
| 635 |
**kwargs: Any,
|
| 636 |
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 637 |
-
"""Search current tree with XPath selectors
|
| 638 |
|
| 639 |
**Important:
|
| 640 |
-
It's recommended to use the identifier argument if you plan to use different selector later
|
| 641 |
and want to relocate the same element(s)**
|
| 642 |
|
| 643 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 644 |
|
| 645 |
:param selector: The XPath selector to be used.
|
| 646 |
-
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
| 647 |
-
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 648 |
otherwise the selector will be used.
|
| 649 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 650 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 651 |
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 652 |
number unless you must know what you are doing!
|
| 653 |
|
| 654 |
-
:return:
|
| 655 |
"""
|
| 656 |
try:
|
| 657 |
elements = self._root.xpath(selector, **kwargs)
|
|
@@ -700,9 +700,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 700 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 701 |
**kwargs: str,
|
| 702 |
) -> "Adaptors":
|
| 703 |
-
"""Find elements by filters of your creations for ease.
|
| 704 |
|
| 705 |
-
:param args: Tag name(s),
|
| 706 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 707 |
:return: The `Adaptors` object of the elements or empty list
|
| 708 |
"""
|
|
@@ -796,7 +796,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 796 |
for pattern in patterns:
|
| 797 |
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
| 798 |
|
| 799 |
-
# Collect element if it fulfills passed function otherwise
|
| 800 |
for function in functions:
|
| 801 |
results = results.filter(function)
|
| 802 |
|
|
@@ -807,9 +807,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 807 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 808 |
**kwargs: str,
|
| 809 |
) -> Union["Adaptor", None]:
|
| 810 |
-
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
| 811 |
|
| 812 |
-
:param args: Tag name(s),
|
| 813 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 814 |
:return: The `Adaptor` object of the element or `None` if the result didn't match
|
| 815 |
"""
|
|
@@ -820,7 +820,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 820 |
def __calculate_similarity_score(
|
| 821 |
self, original: Dict, candidate: html.HtmlElement
|
| 822 |
) -> float:
|
| 823 |
-
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 824 |
|
| 825 |
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
| 826 |
:param candidate: The element to compare with the original element.
|
|
@@ -841,7 +841,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 841 |
).ratio() # * 0.3 # 30%
|
| 842 |
checks += 1
|
| 843 |
|
| 844 |
-
# if both
|
| 845 |
score += self.__calculate_dict_diff(
|
| 846 |
original["attributes"], candidate["attributes"]
|
| 847 |
) # * 0.3 # 30%
|
|
@@ -888,7 +888,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 888 |
).ratio() # * 0.1 # 10%
|
| 889 |
checks += 1
|
| 890 |
# else:
|
| 891 |
-
# # The original element
|
| 892 |
# score -= 0.1
|
| 893 |
|
| 894 |
if original.get("siblings"):
|
|
@@ -902,7 +902,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 902 |
|
| 903 |
@staticmethod
|
| 904 |
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
|
| 905 |
-
"""Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
| 906 |
score = (
|
| 907 |
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
|
| 908 |
* 0.5
|
|
@@ -918,7 +918,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 918 |
) -> None:
|
| 919 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 920 |
|
| 921 |
-
:param element: The element itself that we want to save to storage, it can be
|
| 922 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 923 |
the docs for more info.
|
| 924 |
"""
|
|
@@ -948,10 +948,11 @@ class Adaptor(SelectorsGeneration):
|
|
| 948 |
log.critical(
|
| 949 |
"Can't use Auto-match features while disabled globally, you have to start a new class instance."
|
| 950 |
)
|
|
|
|
| 951 |
|
| 952 |
# Operations on text functions
|
| 953 |
def json(self) -> Dict:
|
| 954 |
-
"""Return
|
| 955 |
if self.text:
|
| 956 |
return self.text.json()
|
| 957 |
else:
|
|
@@ -967,9 +968,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 967 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 968 |
|
| 969 |
:param regex: Can be either a compiled regular expression or a string.
|
| 970 |
-
:param replace_entities:
|
| 971 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 972 |
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 973 |
"""
|
| 974 |
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 975 |
|
|
@@ -987,7 +988,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 987 |
:param default: The default value to be returned if there is no match
|
| 988 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 989 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 990 |
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 991 |
"""
|
| 992 |
return self.text.re_first(
|
| 993 |
regex, default, replace_entities, clean_match, case_sensitive
|
|
@@ -1003,22 +1004,22 @@ class Adaptor(SelectorsGeneration):
|
|
| 1003 |
match_text: bool = False,
|
| 1004 |
) -> Union["Adaptors[Adaptor]", List]:
|
| 1005 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1006 |
-
then return the ones that match the current element attributes with percentage higher than the input threshold.
|
| 1007 |
|
| 1008 |
This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
|
| 1009 |
-
a products-list container and want to find other products using that
|
| 1010 |
this function works in any case without depending on the element type.
|
| 1011 |
|
| 1012 |
-
:param similarity_threshold: The percentage to use while comparing
|
| 1013 |
Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
|
| 1014 |
-
same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless
|
| 1015 |
-
extremely unlucky then attributes matching comes into play so
|
| 1016 |
you are getting the results you don't want.
|
| 1017 |
-
Also, if current element doesn't have attributes and the similar element as well, then it's a 100% match.
|
| 1018 |
-
:param ignore_attributes: Attribute names passed will be ignored while matching the attributes in last step.
|
| 1019 |
-
The default value is to ignore `href` and `src` as URLs can change a lot between elements so it's unreliable
|
| 1020 |
-
:param match_text: If True,
|
| 1021 |
-
Not recommended to use in normal cases but it depends.
|
| 1022 |
|
| 1023 |
:return: An ``Adaptors`` container of ``Adaptor`` objects or empty list
|
| 1024 |
"""
|
|
@@ -1035,7 +1036,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1035 |
candidate: html.HtmlElement,
|
| 1036 |
) -> bool:
|
| 1037 |
"""Calculate a score of how much these elements are alike and return True
|
| 1038 |
-
if score is higher or
|
| 1039 |
candidate_attributes = (
|
| 1040 |
get_attributes(candidate) if ignore_attributes else candidate.attrib
|
| 1041 |
)
|
|
@@ -1049,7 +1050,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1049 |
checks += len(candidate_attributes)
|
| 1050 |
else:
|
| 1051 |
if not candidate_attributes:
|
| 1052 |
-
# Both
|
| 1053 |
score += 1
|
| 1054 |
checks += 1
|
| 1055 |
|
|
@@ -1065,7 +1066,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1065 |
return round(score / checks, 2) >= similarity_threshold
|
| 1066 |
return False
|
| 1067 |
|
| 1068 |
-
# We will use the elements root from now on to get the speed boost of using Lxml directly
|
| 1069 |
root = self._root
|
| 1070 |
current_depth = len(list(root.iterancestors()))
|
| 1071 |
target_attrs = get_attributes(root) if ignore_attributes else root.attrib
|
|
@@ -1105,9 +1106,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 1105 |
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1106 |
"""Find elements whose text content fully/partially matches the input.
|
| 1107 |
:param text: Text query to match
|
| 1108 |
-
:param first_match:
|
| 1109 |
-
:param partial: If enabled, function
|
| 1110 |
-
:param case_sensitive: if enabled, letters case will be taken into consideration
|
| 1111 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1112 |
"""
|
| 1113 |
|
|
@@ -1151,9 +1152,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 1151 |
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1152 |
"""Find elements whose text content matches the input regex pattern.
|
| 1153 |
:param query: Regex query/pattern to match
|
| 1154 |
-
:param first_match: Return first element that matches conditions
|
| 1155 |
-
:param case_sensitive:
|
| 1156 |
-
:param clean_match:
|
| 1157 |
"""
|
| 1158 |
results = Adaptors([])
|
| 1159 |
|
|
@@ -1182,7 +1183,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1182 |
|
| 1183 |
class Adaptors(List[Adaptor]):
|
| 1184 |
"""
|
| 1185 |
-
The
|
| 1186 |
"""
|
| 1187 |
|
| 1188 |
__slots__ = ()
|
|
@@ -1214,23 +1215,23 @@ class Adaptors(List[Adaptor]):
|
|
| 1214 |
) -> "Adaptors[Adaptor]":
|
| 1215 |
"""
|
| 1216 |
Call the ``.xpath()`` method for each element in this list and return
|
| 1217 |
-
their results as another
|
| 1218 |
|
| 1219 |
**Important:
|
| 1220 |
-
It's recommended to use the identifier argument if you plan to use different selector later
|
| 1221 |
and want to relocate the same element(s)**
|
| 1222 |
|
| 1223 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 1224 |
|
| 1225 |
:param selector: The XPath selector to be used.
|
| 1226 |
-
:param identifier: A string that will be used to retrieve element's data in auto-matching
|
| 1227 |
otherwise the selector will be used.
|
| 1228 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 1229 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1230 |
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 1231 |
number unless you must know what you are doing!
|
| 1232 |
|
| 1233 |
-
:return:
|
| 1234 |
"""
|
| 1235 |
results = [
|
| 1236 |
n.xpath(
|
|
@@ -1249,21 +1250,21 @@ class Adaptors(List[Adaptor]):
|
|
| 1249 |
) -> "Adaptors[Adaptor]":
|
| 1250 |
"""
|
| 1251 |
Call the ``.css()`` method for each element in this list and return
|
| 1252 |
-
their results flattened as another
|
| 1253 |
|
| 1254 |
**Important:
|
| 1255 |
-
It's recommended to use the identifier argument if you plan to use different selector later
|
| 1256 |
and want to relocate the same element(s)**
|
| 1257 |
|
| 1258 |
:param selector: The CSS3 selector to be used.
|
| 1259 |
-
:param identifier: A string that will be used to retrieve element's data in auto-matching
|
| 1260 |
otherwise the selector will be used.
|
| 1261 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 1262 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1263 |
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 1264 |
number unless you must know what you are doing!
|
| 1265 |
|
| 1266 |
-
:return:
|
| 1267 |
"""
|
| 1268 |
results = [
|
| 1269 |
n.css(selector, identifier or selector, False, auto_save, percentage)
|
|
@@ -1282,9 +1283,9 @@ class Adaptors(List[Adaptor]):
|
|
| 1282 |
their results flattened as List of TextHandler.
|
| 1283 |
|
| 1284 |
:param regex: Can be either a compiled regular expression or a string.
|
| 1285 |
-
:param replace_entities:
|
| 1286 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1287 |
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 1288 |
"""
|
| 1289 |
results = [
|
| 1290 |
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
|
@@ -1307,7 +1308,7 @@ class Adaptors(List[Adaptor]):
|
|
| 1307 |
:param default: The default value to be returned if there is no match
|
| 1308 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 1309 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1310 |
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 1311 |
"""
|
| 1312 |
for n in self:
|
| 1313 |
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
|
|
|
| 67 |
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
| 68 |
|
| 69 |
Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface, We are not
|
| 70 |
+
inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
|
| 71 |
not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
|
| 72 |
It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
| 73 |
|
| 74 |
:param text: HTML body passed as text.
|
| 75 |
+
:param url: It allows storing a URL with the HTML data for retrieving later.
|
| 76 |
+
:param body: HTML body as a ``bytes`` object. It can be used instead of the ``text`` argument.
|
| 77 |
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
| 78 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 79 |
+
the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
| 80 |
+
:param root: Used internally to pass etree objects instead of text/body arguments, it takes the highest priority.
|
| 81 |
Don't use it unless you know what you are doing!
|
| 82 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
| 83 |
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
|
| 84 |
+
:param auto_match: Globally turn off the auto-match feature in all functions, this argument takes higher
|
| 85 |
priority over all auto-match related arguments/functions in the class.
|
| 86 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 87 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
|
|
|
| 125 |
self.__text = TextHandler(text or body.decode())
|
| 126 |
|
| 127 |
else:
|
| 128 |
+
# All HTML types inherit from HtmlMixin so this to check for all at once
|
| 129 |
if not issubclass(type(root), html.HtmlMixin):
|
| 130 |
raise TypeError(
|
| 131 |
f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
|
|
|
|
| 181 |
else {}
|
| 182 |
)
|
| 183 |
|
| 184 |
+
# Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
|
| 185 |
@staticmethod
|
| 186 |
def _is_text_node(
|
| 187 |
element: Union[html.HtmlElement, etree._ElementUnicodeResult],
|
| 188 |
) -> bool:
|
| 189 |
+
"""Return True if the given element is a result of a string expression
|
| 190 |
Examples:
|
| 191 |
+
XPath -> '/text()', '/@attribute', etc...
|
| 192 |
+
CSS3 -> '::text', '::attr(attrib)'...
|
| 193 |
"""
|
| 194 |
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
| 195 |
return issubclass(type(element), etree._ElementUnicodeResult)
|
|
|
|
| 200 |
) -> TextHandler:
|
| 201 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 202 |
|
| 203 |
+
This single line has been isolated like this, so when it's used with `map` we get that slight performance boost vs. list comprehension
|
| 204 |
"""
|
| 205 |
return TextHandler(str(element))
|
| 206 |
|
|
|
|
| 209 |
return Adaptor(
|
| 210 |
root=element,
|
| 211 |
text="",
|
| 212 |
+
body=b"", # Since the root argument is provided, both `text` and `body` will be ignored, so this is just a filler
|
| 213 |
url=self.url,
|
| 214 |
encoding=self.encoding,
|
| 215 |
auto_match=self.__auto_match_enabled,
|
|
|
|
| 240 |
): # Lxml will give a warning if I used something like `not result`
|
| 241 |
return Adaptors([])
|
| 242 |
|
| 243 |
+
# From within the code, this method will always get a list of the same type,
|
| 244 |
+
# so we will continue without checks for a slight performance boost
|
| 245 |
if self._is_text_node(result[0]):
|
| 246 |
return TextHandlers(list(map(self.__content_convertor, result)))
|
| 247 |
|
|
|
|
| 253 |
|
| 254 |
# The following four properties I made them into functions instead of variables directly
|
| 255 |
# So they don't slow down the process of initializing many instances of the class and gets executed only
|
| 256 |
+
# when the user needs them for the first time for that specific element and gets cached for next times
|
| 257 |
# Doing that only made the library performance test skyrocket multiple times faster than before
|
| 258 |
# because I was executing them on initialization before :))
|
| 259 |
@property
|
| 260 |
def tag(self) -> str:
|
| 261 |
+
"""Get the tag name of the element"""
|
| 262 |
if not self.__tag:
|
| 263 |
self.__tag = self._root.tag
|
| 264 |
return self.__tag
|
|
|
|
| 267 |
def text(self) -> TextHandler:
|
| 268 |
"""Get text content of the element"""
|
| 269 |
if not self.__text:
|
| 270 |
+
# If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
| 271 |
+
# before extracting text, then keep `keep_comments` set to False while initializing the first class
|
| 272 |
self.__text = TextHandler(self._root.text)
|
| 273 |
return self.__text
|
| 274 |
|
|
|
|
| 322 |
|
| 323 |
@property
|
| 324 |
def html_content(self) -> TextHandler:
|
| 325 |
+
"""Return the inner HTML code of the element"""
|
| 326 |
return TextHandler(
|
| 327 |
etree.tostring(
|
| 328 |
self._root, encoding="unicode", method="html", with_tail=False
|
|
|
|
| 344 |
)
|
| 345 |
|
| 346 |
def has_class(self, class_name: str) -> bool:
|
| 347 |
+
"""Check if the element has a specific class
|
| 348 |
:param class_name: The class name to check for
|
| 349 |
:return: True if element has class with that name otherwise False
|
| 350 |
"""
|
|
|
|
| 382 |
return Adaptors([])
|
| 383 |
|
| 384 |
def iterancestors(self) -> Generator["Adaptor", None, None]:
|
| 385 |
+
"""Return a generator that loops over all ancestors of the element, starting with the element's parent."""
|
| 386 |
for ancestor in self._root.iterancestors():
|
| 387 |
yield self.__element_convertor(ancestor)
|
| 388 |
|
|
|
|
| 400 |
|
| 401 |
@property
|
| 402 |
def path(self) -> "Adaptors[Adaptor]":
|
| 403 |
+
"""Return an `Adaptors` list of the current element's ancestors, starting with its parent and going up toward the root."""
|
| 404 |
lst = list(self.iterancestors())
|
| 405 |
return Adaptors(lst)
|
| 406 |
|
|
|
|
| 410 |
next_element = self._root.getnext()
|
| 411 |
if next_element is not None:
|
| 412 |
while type(next_element) in html_forbidden:
|
| 413 |
+
# Ignore HTML comments and unwanted types
|
| 414 |
next_element = next_element.getnext()
|
| 415 |
|
| 416 |
return self.__handle_element(next_element)
|
|
|
|
| 421 |
prev_element = self._root.getprevious()
|
| 422 |
if prev_element is not None:
|
| 423 |
while type(prev_element) in html_forbidden:
|
| 424 |
+
# Ignore HTML comments and unwanted types
|
| 425 |
prev_element = prev_element.getprevious()
|
| 426 |
|
| 427 |
return self.__handle_element(prev_element)
|
|
|
|
| 456 |
|
| 457 |
return data + ">"
|
| 458 |
|
| 459 |
+
# From here we start with the selecting functions
|
| 460 |
def relocate(
|
| 461 |
self,
|
| 462 |
element: Union[Dict, html.HtmlElement, "Adaptor"],
|
|
|
|
| 467 |
|
| 468 |
:param element: The element we want to relocate in the tree
|
| 469 |
:param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
|
| 470 |
+
calculation depends solely on the page structure, so don't play with this number unless you know
|
| 471 |
what you are doing!
|
| 472 |
:param adaptor_type: If True, the return result will be converted to `Adaptors` object
|
| 473 |
:return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
|
| 474 |
"""
|
| 475 |
score_table = {}
|
| 476 |
+
# Note: `element` will most likely always be a dictionary at this point.
|
| 477 |
if isinstance(element, self.__class__):
|
| 478 |
element = element._root
|
| 479 |
|
|
|
|
| 481 |
element = _StorageTools.element_to_dict(element)
|
| 482 |
|
| 483 |
for node in self._root.xpath(".//*"):
|
| 484 |
+
# Collect all elements in the page, then for each element get the matching score of it against the node.
|
| 485 |
# Hence: the code doesn't stop even if the score was 100%
|
| 486 |
# because there might be another element(s) left in page with the same score
|
| 487 |
score = self.__calculate_similarity_score(element, node)
|
|
|
|
| 491 |
highest_probability = max(score_table.keys())
|
| 492 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 493 |
if log.getEffectiveLevel() < 20:
|
| 494 |
+
# No need to execute this part if the logging level is not debugging
|
| 495 |
log.debug(f"Highest probability was {highest_probability}%")
|
| 496 |
log.debug("Top 5 best matching elements are: ")
|
| 497 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
|
|
|
| 512 |
auto_save: bool = False,
|
| 513 |
percentage: int = 0,
|
| 514 |
) -> Union["Adaptor", "TextHandler", None]:
|
| 515 |
+
"""Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 516 |
|
| 517 |
**Important:
|
| 518 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 519 |
and want to relocate the same element(s)**
|
| 520 |
|
| 521 |
:param selector: The CSS3 selector to be used.
|
| 522 |
+
:param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
|
| 523 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 524 |
otherwise the selector will be used.
|
| 525 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 526 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 527 |
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 528 |
number unless you know what you are doing!
|
| 529 |
"""
|
| 530 |
for element in self.css(
|
|
|
|
| 542 |
percentage: int = 0,
|
| 543 |
**kwargs: Any,
|
| 544 |
) -> Union["Adaptor", "TextHandler", None]:
|
| 545 |
+
"""Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 546 |
|
| 547 |
**Important:
|
| 548 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 549 |
and want to relocate the same element(s)**
|
| 550 |
|
| 551 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 552 |
|
| 553 |
:param selector: The XPath selector to be used.
|
| 554 |
+
:param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
|
| 555 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 556 |
otherwise the selector will be used.
|
| 557 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 558 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 559 |
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 560 |
number unless you know what you are doing!
|
| 561 |
"""
|
| 562 |
for element in self.xpath(
|
|
|
|
| 573 |
auto_save: bool = False,
|
| 574 |
percentage: int = 0,
|
| 575 |
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 576 |
+
"""Search the current tree with CSS3 selectors
|
| 577 |
|
| 578 |
**Important:
|
| 579 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 580 |
and want to relocate the same element(s)**
|
| 581 |
|
| 582 |
:param selector: The CSS3 selector to be used.
|
| 583 |
+
:param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
|
| 584 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 585 |
otherwise the selector will be used.
|
| 586 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 587 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 588 |
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 589 |
number unless you know what you are doing!
|
| 590 |
|
| 591 |
+
:return: `Adaptors` object, or `TextHandlers` object when the selector returns text results.
|
| 592 |
"""
|
| 593 |
try:
|
| 594 |
if not self.__auto_match_enabled or "," not in selector:
|
|
|
|
| 605 |
results = []
|
| 606 |
if "," in selector:
|
| 607 |
for single_selector in split_selectors(selector):
|
| 608 |
+
# I'm doing this only so the `save` function saves data correctly for combined selectors
|
| 609 |
# Like using the ',' to combine two different selectors that point to different elements.
|
| 610 |
xpath_selector = translator_instance.css_to_xpath(
|
| 611 |
single_selector.canonical()
|
|
|
|
| 634 |
percentage: int = 0,
|
| 635 |
**kwargs: Any,
|
| 636 |
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 637 |
+
"""Search the current tree with XPath selectors
|
| 638 |
|
| 639 |
**Important:
|
| 640 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 641 |
and want to relocate the same element(s)**
|
| 642 |
|
| 643 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 644 |
|
| 645 |
:param selector: The XPath selector to be used.
|
| 646 |
+
:param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
|
| 647 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 648 |
otherwise the selector will be used.
|
| 649 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 650 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 651 |
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 652 |
number unless you know what you are doing!
|
| 653 |
|
| 654 |
+
:return: `Adaptors` object, or `TextHandlers` object when the selector returns text results.
|
| 655 |
"""
|
| 656 |
try:
|
| 657 |
elements = self._root.xpath(selector, **kwargs)
|
|
|
|
| 700 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 701 |
**kwargs: str,
|
| 702 |
) -> "Adaptors":
|
| 703 |
+
"""Find elements using any combination of filters you choose.
|
| 704 |
|
| 705 |
+
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 706 |
:param kwargs: The attributes you want to filter elements by.
|
| 707 |
:return: The `Adaptors` object of the elements or empty list
|
| 708 |
"""
|
|
|
|
| 796 |
for pattern in patterns:
|
| 797 |
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
| 798 |
|
| 799 |
+
# Collect an element only if it fulfills all the passed functions
|
| 800 |
for function in functions:
|
| 801 |
results = results.filter(function)
|
| 802 |
|
|
|
|
| 807 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 808 |
**kwargs: str,
|
| 809 |
) -> Union["Adaptor", None]:
|
| 810 |
+
"""Find elements using any combination of filters you choose, then return the first result. Otherwise, return `None`.
|
| 811 |
|
| 812 |
+
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 813 |
:param kwargs: The attributes you want to filter elements by.
|
| 814 |
:return: The `Adaptor` object of the element or `None` if the result didn't match
|
| 815 |
"""
|
|
|
|
| 820 |
def __calculate_similarity_score(
|
| 821 |
self, original: Dict, candidate: html.HtmlElement
|
| 822 |
) -> float:
|
| 823 |
+
"""Used internally to calculate a score that shows how similar a candidate element is to the original one
|
| 824 |
|
| 825 |
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
| 826 |
:param candidate: The element to compare with the original element.
|
|
|
|
| 841 |
).ratio() # * 0.3 # 30%
|
| 842 |
checks += 1
|
| 843 |
|
| 844 |
+
# if both don't have attributes, it still counts for something!
|
| 845 |
score += self.__calculate_dict_diff(
|
| 846 |
original["attributes"], candidate["attributes"]
|
| 847 |
) # * 0.3 # 30%
|
|
|
|
| 888 |
).ratio() # * 0.1 # 10%
|
| 889 |
checks += 1
|
| 890 |
# else:
|
| 891 |
+
# # The original element has a parent and this one not, this is not a good sign
|
| 892 |
# score -= 0.1
|
| 893 |
|
| 894 |
if original.get("siblings"):
|
|
|
|
| 902 |
|
| 903 |
@staticmethod
|
| 904 |
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
|
| 905 |
+
"""Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
| 906 |
score = (
|
| 907 |
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
|
| 908 |
* 0.5
|
|
|
|
| 918 |
) -> None:
|
| 919 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 920 |
|
| 921 |
+
:param element: The element itself that we want to save to storage; it can be an `Adaptor` or a pure `HtmlElement`
|
| 922 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 923 |
the docs for more info.
|
| 924 |
"""
|
|
|
|
| 948 |
log.critical(
|
| 949 |
"Can't use Auto-match features while disabled globally, you have to start a new class instance."
|
| 950 |
)
|
| 951 |
+
return None
|
| 952 |
|
| 953 |
# Operations on text functions
|
| 954 |
def json(self) -> Dict:
|
| 955 |
+
"""Return the JSON response if the response is jsonable; otherwise, throw an error"""
|
| 956 |
if self.text:
|
| 957 |
return self.text.json()
|
| 958 |
else:
|
|
|
|
| 968 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 969 |
|
| 970 |
:param regex: Can be either a compiled regular expression or a string.
|
| 971 |
+
:param replace_entities: If enabled character entity references are replaced by their corresponding character
|
| 972 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 973 |
+
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
| 974 |
"""
|
| 975 |
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 976 |
|
|
|
|
| 988 |
:param default: The default value to be returned if there is no match
|
| 989 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 990 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 991 |
+
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
| 992 |
"""
|
| 993 |
return self.text.re_first(
|
| 994 |
regex, default, replace_entities, clean_match, case_sensitive
|
|
|
|
| 1004 |
match_text: bool = False,
|
| 1005 |
) -> Union["Adaptors[Adaptor]", List]:
|
| 1006 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1007 |
+
then return the ones that match the current element attributes with a percentage higher than the input threshold.
|
| 1008 |
|
| 1009 |
This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
|
| 1010 |
+
a products-list container and want to find other products using that element as a starting point EXCEPT
|
| 1011 |
this function works in any case without depending on the element type.
|
| 1012 |
|
| 1013 |
+
:param similarity_threshold: The percentage to use while comparing element attributes.
|
| 1014 |
Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
|
| 1015 |
+
same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
|
| 1016 |
+
extremely unlucky, then attributes matching comes into play, so don't play with this number unless
|
| 1017 |
you are getting the results you don't want.
|
| 1018 |
+
Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.
|
| 1019 |
+
:param ignore_attributes: Attribute names passed will be ignored while matching the attributes in the last step.
|
| 1020 |
+
The default value is to ignore `href` and `src` as URLs can change a lot between elements, so it's unreliable
|
| 1021 |
+
:param match_text: If True, element text content will be taken into calculation while matching.
|
| 1022 |
+
Not recommended to use in normal cases, but it depends.
|
| 1023 |
|
| 1024 |
:return: An ``Adaptors`` container of ``Adaptor`` objects or an empty list
|
| 1025 |
"""
|
|
|
|
| 1036 |
candidate: html.HtmlElement,
|
| 1037 |
) -> bool:
|
| 1038 |
"""Calculate a score of how much these elements are alike and return True
|
| 1039 |
+
if the score is higher than or equal to the threshold"""
|
| 1040 |
candidate_attributes = (
|
| 1041 |
get_attributes(candidate) if ignore_attributes else candidate.attrib
|
| 1042 |
)
|
|
|
|
| 1050 |
checks += len(candidate_attributes)
|
| 1051 |
else:
|
| 1052 |
if not candidate_attributes:
|
| 1053 |
+
# Both don't have attributes, this must mean something
|
| 1054 |
score += 1
|
| 1055 |
checks += 1
|
| 1056 |
|
|
|
|
| 1066 |
return round(score / checks, 2) >= similarity_threshold
|
| 1067 |
return False
|
| 1068 |
|
| 1069 |
+
# We will use the elements' root from now on to get the speed boost of using Lxml directly
|
| 1070 |
root = self._root
|
| 1071 |
current_depth = len(list(root.iterancestors()))
|
| 1072 |
target_attrs = get_attributes(root) if ignore_attributes else root.attrib
|
|
|
|
| 1106 |
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1107 |
"""Find elements whose text content fully/partially matches the input.
|
| 1108 |
:param text: Text query to match
|
| 1109 |
+
:param first_match: Returns the first element that matches conditions, enabled by default
|
| 1110 |
+
:param partial: If enabled, the function returns elements that contain the input text
|
| 1111 |
+
:param case_sensitive: if enabled, the letters case will be taken into consideration
|
| 1112 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1113 |
"""
|
| 1114 |
|
|
|
|
| 1152 |
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1153 |
"""Find elements whose text content matches the input regex pattern.
|
| 1154 |
:param query: Regex query/pattern to match
|
| 1155 |
+
:param first_match: Return the first element that matches conditions; enabled by default.
|
| 1156 |
+
:param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
|
| 1157 |
+
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
|
| 1158 |
"""
|
| 1159 |
results = Adaptors([])
|
| 1160 |
|
|
|
|
| 1183 |
|
| 1184 |
class Adaptors(List[Adaptor]):
|
| 1185 |
"""
|
| 1186 |
+
The `Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 1187 |
"""
|
| 1188 |
|
| 1189 |
__slots__ = ()
|
|
|
|
| 1215 |
) -> "Adaptors[Adaptor]":
|
| 1216 |
"""
|
| 1217 |
Call the ``.xpath()`` method for each element in this list and return
|
| 1218 |
+
their results as another `Adaptors` class.
|
| 1219 |
|
| 1220 |
**Important:
|
| 1221 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 1222 |
and want to relocate the same element(s)**
|
| 1223 |
|
| 1224 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 1225 |
|
| 1226 |
:param selector: The XPath selector to be used.
|
| 1227 |
+
:param identifier: A string that will be used to retrieve element's data in auto-matching,
|
| 1228 |
otherwise the selector will be used.
|
| 1229 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 1230 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1231 |
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 1232 |
number unless you know what you are doing!
|
| 1233 |
|
| 1234 |
+
:return: `Adaptors` class.
|
| 1235 |
"""
|
| 1236 |
results = [
|
| 1237 |
n.xpath(
|
|
|
|
| 1250 |
) -> "Adaptors[Adaptor]":
|
| 1251 |
"""
|
| 1252 |
Call the ``.css()`` method for each element in this list and return
|
| 1253 |
+
their results flattened as another `Adaptors` class.
|
| 1254 |
|
| 1255 |
**Important:
|
| 1256 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 1257 |
and want to relocate the same element(s)**
|
| 1258 |
|
| 1259 |
:param selector: The CSS3 selector to be used.
|
| 1260 |
+
:param identifier: A string that will be used to retrieve element's data in auto-matching,
|
| 1261 |
otherwise the selector will be used.
|
| 1262 |
:param auto_save: Automatically save new elements for `auto_match` later
|
| 1263 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1264 |
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 1265 |
number unless you know what you are doing!
|
| 1266 |
|
| 1267 |
+
:return: `Adaptors` class.
|
| 1268 |
"""
|
| 1269 |
results = [
|
| 1270 |
n.css(selector, identifier or selector, False, auto_save, percentage)
|
|
|
|
| 1283 |
their results flattened as List of TextHandler.
|
| 1284 |
|
| 1285 |
:param regex: Can be either a compiled regular expression or a string.
|
| 1286 |
+
:param replace_entities: If enabled character entity references are replaced by their corresponding character
|
| 1287 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1288 |
+
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
| 1289 |
"""
|
| 1290 |
results = [
|
| 1291 |
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
|
|
|
| 1308 |
:param default: The default value to be returned if there is no match
|
| 1309 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 1310 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1311 |
+
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
| 1312 |
"""
|
| 1313 |
for n in self:
|
| 1314 |
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|