Karim shoair committed on
Commit
6eaebde
·
1 Parent(s): 20efe8c

docs: improve All `Adaptor` class doc strings

Browse files
Files changed (1) hide show
  1. scrapling/parser.py +101 -100
scrapling/parser.py CHANGED
@@ -67,21 +67,21 @@ class Adaptor(SelectorsGeneration):
67
  with expressions in CSS, XPath, or with simply text. Check the docs for more info.
68
 
69
  Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface. We are not
70
- inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable which makes a lot of reference jobs
71
  not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
72
  It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
73
 
74
  :param text: HTML body passed as text.
75
- :param url: allows storing a URL with the html data for retrieving later.
76
- :param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
77
  :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
78
  :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
79
- libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
80
- :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
81
  Don't use it unless you know what you are doing!
82
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
83
  :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
84
- :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
85
  priority over all auto-match related arguments/functions in the class.
86
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
87
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
@@ -125,7 +125,7 @@ class Adaptor(SelectorsGeneration):
125
  self.__text = TextHandler(text or body.decode())
126
 
127
  else:
128
- # All html types inherits from HtmlMixin so this to check for all at once
129
  if not issubclass(type(root), html.HtmlMixin):
130
  raise TypeError(
131
  f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
@@ -181,15 +181,15 @@ class Adaptor(SelectorsGeneration):
181
  else {}
182
  )
183
 
184
- # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
185
  @staticmethod
186
  def _is_text_node(
187
  element: Union[html.HtmlElement, etree._ElementUnicodeResult],
188
  ) -> bool:
189
- """Return True if given element is a result of a string expression
190
  Examples:
191
- XPath -> '/text()', '/@attribute' etc...
192
- CSS3 -> '::text', '::attr(attrib)'...
193
  """
194
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
195
  return issubclass(type(element), etree._ElementUnicodeResult)
@@ -200,7 +200,7 @@ class Adaptor(SelectorsGeneration):
200
  ) -> TextHandler:
201
  """Used internally to convert a single element's text content to TextHandler directly without checks
202
 
203
- This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
204
  """
205
  return TextHandler(str(element))
206
 
@@ -209,7 +209,7 @@ class Adaptor(SelectorsGeneration):
209
  return Adaptor(
210
  root=element,
211
  text="",
212
- body=b"", # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
213
  url=self.url,
214
  encoding=self.encoding,
215
  auto_match=self.__auto_match_enabled,
@@ -240,8 +240,8 @@ class Adaptor(SelectorsGeneration):
240
  ): # Lxml will give a warning if I used something like `not result`
241
  return Adaptors([])
242
 
243
- # From within the code, this method will always get a list of the same type
244
- # so we will continue without checks for slight performance boost
245
  if self._is_text_node(result[0]):
246
  return TextHandlers(list(map(self.__content_convertor, result)))
247
 
@@ -253,12 +253,12 @@ class Adaptor(SelectorsGeneration):
253
 
254
  # The following four properties I made them into functions instead of variables directly
255
  # So they don't slow down the process of initializing many instances of the class and gets executed only
256
- # when the user need them for the first time for that specific element and gets cached for next times
257
  # Doing just that made the library's performance test results skyrocket to multiple times faster than before
258
  # because I was executing them on initialization before :))
259
  @property
260
  def tag(self) -> str:
261
- """Get tag name of the element"""
262
  if not self.__tag:
263
  self.__tag = self._root.tag
264
  return self.__tag
@@ -267,8 +267,8 @@ class Adaptor(SelectorsGeneration):
267
  def text(self) -> TextHandler:
268
  """Get text content of the element"""
269
  if not self.__text:
270
- # If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
271
- # before extracting text then keep `keep_comments` set to False while initializing the first class
272
  self.__text = TextHandler(self._root.text)
273
  return self.__text
274
 
@@ -322,7 +322,7 @@ class Adaptor(SelectorsGeneration):
322
 
323
  @property
324
  def html_content(self) -> TextHandler:
325
- """Return the inner html code of the element"""
326
  return TextHandler(
327
  etree.tostring(
328
  self._root, encoding="unicode", method="html", with_tail=False
@@ -344,7 +344,7 @@ class Adaptor(SelectorsGeneration):
344
  )
345
 
346
  def has_class(self, class_name: str) -> bool:
347
- """Check if element has a specific class
348
  :param class_name: The class name to check for
349
  :return: True if element has class with that name otherwise False
350
  """
@@ -382,7 +382,7 @@ class Adaptor(SelectorsGeneration):
382
  return Adaptors([])
383
 
384
  def iterancestors(self) -> Generator["Adaptor", None, None]:
385
- """Return a generator that loops over all ancestors of the element, starting with element's parent."""
386
  for ancestor in self._root.iterancestors():
387
  yield self.__element_convertor(ancestor)
388
 
@@ -400,7 +400,7 @@ class Adaptor(SelectorsGeneration):
400
 
401
  @property
402
  def path(self) -> "Adaptors[Adaptor]":
403
- """Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
404
  lst = list(self.iterancestors())
405
  return Adaptors(lst)
406
 
@@ -410,7 +410,7 @@ class Adaptor(SelectorsGeneration):
410
  next_element = self._root.getnext()
411
  if next_element is not None:
412
  while type(next_element) in html_forbidden:
413
- # Ignore html comments and unwanted types
414
  next_element = next_element.getnext()
415
 
416
  return self.__handle_element(next_element)
@@ -421,7 +421,7 @@ class Adaptor(SelectorsGeneration):
421
  prev_element = self._root.getprevious()
422
  if prev_element is not None:
423
  while type(prev_element) in html_forbidden:
424
- # Ignore html comments and unwanted types
425
  prev_element = prev_element.getprevious()
426
 
427
  return self.__handle_element(prev_element)
@@ -456,7 +456,7 @@ class Adaptor(SelectorsGeneration):
456
 
457
  return data + ">"
458
 
459
- # From here we start the selecting functions
460
  def relocate(
461
  self,
462
  element: Union[Dict, html.HtmlElement, "Adaptor"],
@@ -467,13 +467,13 @@ class Adaptor(SelectorsGeneration):
467
 
468
  :param element: The element we want to relocate in the tree
469
  :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
470
- calculation depends solely on the page structure so don't play with this number unless you must know
471
  what you are doing!
472
  :param adaptor_type: If True, the return result will be converted to `Adaptors` object
473
  :return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
474
  """
475
  score_table = {}
476
- # Note: `element` will be most likely always be a dictionary at this point.
477
  if isinstance(element, self.__class__):
478
  element = element._root
479
 
@@ -481,7 +481,7 @@ class Adaptor(SelectorsGeneration):
481
  element = _StorageTools.element_to_dict(element)
482
 
483
  for node in self._root.xpath(".//*"):
484
- # Collect all elements in the page then for each element get the matching score of it against the node.
485
  # Hence: the code doesn't stop even if the score was 100%
486
  # because there might be another element(s) left in page with the same score
487
  score = self.__calculate_similarity_score(element, node)
@@ -491,7 +491,7 @@ class Adaptor(SelectorsGeneration):
491
  highest_probability = max(score_table.keys())
492
  if score_table[highest_probability] and highest_probability >= percentage:
493
  if log.getEffectiveLevel() < 20:
494
- # No need to execute this part if logging level is not debugging
495
  log.debug(f"Highest probability was {highest_probability}%")
496
  log.debug("Top 5 best matching elements are: ")
497
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
@@ -512,19 +512,19 @@ class Adaptor(SelectorsGeneration):
512
  auto_save: bool = False,
513
  percentage: int = 0,
514
  ) -> Union["Adaptor", "TextHandler", None]:
515
- """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
516
 
517
  **Important:
518
- It's recommended to use the identifier argument if you plan to use different selector later
519
  and want to relocate the same element(s)**
520
 
521
  :param selector: The CSS3 selector to be used.
522
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
523
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
524
  otherwise the selector will be used.
525
  :param auto_save: Automatically save new elements for `auto_match` later
526
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
527
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
528
  number unless you must know what you are doing!
529
  """
530
  for element in self.css(
@@ -542,21 +542,21 @@ class Adaptor(SelectorsGeneration):
542
  percentage: int = 0,
543
  **kwargs: Any,
544
  ) -> Union["Adaptor", "TextHandler", None]:
545
- """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
546
 
547
  **Important:
548
- It's recommended to use the identifier argument if you plan to use different selector later
549
  and want to relocate the same element(s)**
550
 
551
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
552
 
553
  :param selector: The XPath selector to be used.
554
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
555
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
556
  otherwise the selector will be used.
557
  :param auto_save: Automatically save new elements for `auto_match` later
558
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
559
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
560
  number unless you must know what you are doing!
561
  """
562
  for element in self.xpath(
@@ -573,22 +573,22 @@ class Adaptor(SelectorsGeneration):
573
  auto_save: bool = False,
574
  percentage: int = 0,
575
  ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
576
- """Search current tree with CSS3 selectors
577
 
578
  **Important:
579
- It's recommended to use the identifier argument if you plan to use different selector later
580
  and want to relocate the same element(s)**
581
 
582
  :param selector: The CSS3 selector to be used.
583
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
584
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
585
  otherwise the selector will be used.
586
  :param auto_save: Automatically save new elements for `auto_match` later
587
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
588
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
589
  number unless you must know what you are doing!
590
 
591
- :return: List as :class:`Adaptors`
592
  """
593
  try:
594
  if not self.__auto_match_enabled or "," not in selector:
@@ -605,7 +605,7 @@ class Adaptor(SelectorsGeneration):
605
  results = []
606
  if "," in selector:
607
  for single_selector in split_selectors(selector):
608
- # I'm doing this only so the `save` function save data correctly for combined selectors
609
  # Like using the ',' to combine two different selectors that point to different elements.
610
  xpath_selector = translator_instance.css_to_xpath(
611
  single_selector.canonical()
@@ -634,24 +634,24 @@ class Adaptor(SelectorsGeneration):
634
  percentage: int = 0,
635
  **kwargs: Any,
636
  ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
637
- """Search current tree with XPath selectors
638
 
639
  **Important:
640
- It's recommended to use the identifier argument if you plan to use different selector later
641
  and want to relocate the same element(s)**
642
 
643
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
644
 
645
  :param selector: The XPath selector to be used.
646
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
647
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
648
  otherwise the selector will be used.
649
  :param auto_save: Automatically save new elements for `auto_match` later
650
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
651
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
652
  number unless you must know what you are doing!
653
 
654
- :return: List as :class:`Adaptors`
655
  """
656
  try:
657
  elements = self._root.xpath(selector, **kwargs)
@@ -700,9 +700,9 @@ class Adaptor(SelectorsGeneration):
700
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
701
  **kwargs: str,
702
  ) -> "Adaptors":
703
- """Find elements by filters of your creations for ease..
704
 
705
- :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
706
  :param kwargs: The attributes you want to filter elements by.
707
  :return: The `Adaptors` object of the elements or empty list
708
  """
@@ -796,7 +796,7 @@ class Adaptor(SelectorsGeneration):
796
  for pattern in patterns:
797
  results = results.filter(lambda e: e.text.re(pattern, check_match=True))
798
 
799
- # Collect element if it fulfills passed function otherwise
800
  for function in functions:
801
  results = results.filter(function)
802
 
@@ -807,9 +807,9 @@ class Adaptor(SelectorsGeneration):
807
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
808
  **kwargs: str,
809
  ) -> Union["Adaptor", None]:
810
- """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
811
 
812
- :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
813
  :param kwargs: The attributes you want to filter elements by.
814
  :return: The `Adaptor` object of the element or `None` if the result didn't match
815
  """
@@ -820,7 +820,7 @@ class Adaptor(SelectorsGeneration):
820
  def __calculate_similarity_score(
821
  self, original: Dict, candidate: html.HtmlElement
822
  ) -> float:
823
- """Used internally to calculate a score that shows how candidate element similar to the original one
824
 
825
  :param original: The original element in the form of the dictionary generated from `element_to_dict` function
826
  :param candidate: The element to compare with the original element.
@@ -841,7 +841,7 @@ class Adaptor(SelectorsGeneration):
841
  ).ratio() # * 0.3 # 30%
842
  checks += 1
843
 
844
- # if both doesn't have attributes, it still count for something!
845
  score += self.__calculate_dict_diff(
846
  original["attributes"], candidate["attributes"]
847
  ) # * 0.3 # 30%
@@ -888,7 +888,7 @@ class Adaptor(SelectorsGeneration):
888
  ).ratio() # * 0.1 # 10%
889
  checks += 1
890
  # else:
891
- # # The original element have a parent and this one not, this is not a good sign
892
  # score -= 0.1
893
 
894
  if original.get("siblings"):
@@ -902,7 +902,7 @@ class Adaptor(SelectorsGeneration):
902
 
903
  @staticmethod
904
  def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
905
- """Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
906
  score = (
907
  SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
908
  * 0.5
@@ -918,7 +918,7 @@ class Adaptor(SelectorsGeneration):
918
  ) -> None:
919
  """Saves the element's unique properties to the storage for retrieval and relocation later
920
 
921
- :param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
922
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
923
  the docs for more info.
924
  """
@@ -948,10 +948,11 @@ class Adaptor(SelectorsGeneration):
948
  log.critical(
949
  "Can't use Auto-match features while disabled globally, you have to start a new class instance."
950
  )
 
951
 
952
  # Operations on text functions
953
  def json(self) -> Dict:
954
- """Return json response if the response is jsonable otherwise throws error"""
955
  if self.text:
956
  return self.text.json()
957
  else:
@@ -967,9 +968,9 @@ class Adaptor(SelectorsGeneration):
967
  """Apply the given regex to the current text and return a list of strings with the matches.
968
 
969
  :param regex: Can be either a compiled regular expression or a string.
970
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
971
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
972
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
973
  """
974
  return self.text.re(regex, replace_entities, clean_match, case_sensitive)
975
 
@@ -987,7 +988,7 @@ class Adaptor(SelectorsGeneration):
987
  :param default: The default value to be returned if there is no match
988
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
989
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
990
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
991
  """
992
  return self.text.re_first(
993
  regex, default, replace_entities, clean_match, case_sensitive
@@ -1003,22 +1004,22 @@ class Adaptor(SelectorsGeneration):
1003
  match_text: bool = False,
1004
  ) -> Union["Adaptors[Adaptor]", List]:
1005
  """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
1006
- then return the ones that match the current element attributes with percentage higher than the input threshold.
1007
 
1008
  This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
1009
- a products-list container and want to find other products using that that element as a starting point EXCEPT
1010
  this function works in any case without depending on the element type.
1011
 
1012
- :param similarity_threshold: The percentage to use while comparing elements attributes.
1013
  Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
1014
- same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless your are
1015
- extremely unlucky then attributes matching comes into play so basically don't play with this number unless
1016
  you are getting the results you don't want.
1017
- Also, if current element doesn't have attributes and the similar element as well, then it's a 100% match.
1018
- :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in last step.
1019
- The default value is to ignore `href` and `src` as URLs can change a lot between elements so it's unreliable
1020
- :param match_text: If True, elements text content will be taken into calculation while matching.
1021
- Not recommended to use in normal cases but it depends.
1022
 
1023
  :return: An ``Adaptors`` container of ``Adaptor`` objects or an empty list
1024
  """
@@ -1035,7 +1036,7 @@ class Adaptor(SelectorsGeneration):
1035
  candidate: html.HtmlElement,
1036
  ) -> bool:
1037
  """Calculate a score of how much these elements are alike and return True
1038
- if score is higher or equal the threshold"""
1039
  candidate_attributes = (
1040
  get_attributes(candidate) if ignore_attributes else candidate.attrib
1041
  )
@@ -1049,7 +1050,7 @@ class Adaptor(SelectorsGeneration):
1049
  checks += len(candidate_attributes)
1050
  else:
1051
  if not candidate_attributes:
1052
- # Both doesn't have attributes, this must mean something
1053
  score += 1
1054
  checks += 1
1055
 
@@ -1065,7 +1066,7 @@ class Adaptor(SelectorsGeneration):
1065
  return round(score / checks, 2) >= similarity_threshold
1066
  return False
1067
 
1068
- # We will use the elements root from now on to get the speed boost of using Lxml directly
1069
  root = self._root
1070
  current_depth = len(list(root.iterancestors()))
1071
  target_attrs = get_attributes(root) if ignore_attributes else root.attrib
@@ -1105,9 +1106,9 @@ class Adaptor(SelectorsGeneration):
1105
  ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1106
  """Find elements whose text content fully/partially matches the input.
1107
  :param text: Text query to match
1108
- :param first_match: Return first element that matches conditions, enabled by default
1109
- :param partial: If enabled, function return elements that contains the input text
1110
- :param case_sensitive: if enabled, letters case will be taken into consideration
1111
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1112
  """
1113
 
@@ -1151,9 +1152,9 @@ class Adaptor(SelectorsGeneration):
1151
  ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1152
  """Find elements whose text content matches the input regex pattern.
1153
  :param query: Regex query/pattern to match
1154
- :param first_match: Return first element that matches conditions, enabled by default
1155
- :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
1156
- :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1157
  """
1158
  results = Adaptors([])
1159
 
@@ -1182,7 +1183,7 @@ class Adaptor(SelectorsGeneration):
1182
 
1183
  class Adaptors(List[Adaptor]):
1184
  """
1185
- The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
1186
  """
1187
 
1188
  __slots__ = ()
@@ -1214,23 +1215,23 @@ class Adaptors(List[Adaptor]):
1214
  ) -> "Adaptors[Adaptor]":
1215
  """
1216
  Call the ``.xpath()`` method for each element in this list and return
1217
- their results as another :class:`Adaptors`.
1218
 
1219
  **Important:
1220
- It's recommended to use the identifier argument if you plan to use different selector later
1221
  and want to relocate the same element(s)**
1222
 
1223
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
1224
 
1225
  :param selector: The XPath selector to be used.
1226
- :param identifier: A string that will be used to retrieve element's data in auto-matching
1227
  otherwise the selector will be used.
1228
  :param auto_save: Automatically save new elements for `auto_match` later
1229
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1230
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
1231
  number unless you must know what you are doing!
1232
 
1233
- :return: List as :class:`Adaptors`
1234
  """
1235
  results = [
1236
  n.xpath(
@@ -1249,21 +1250,21 @@ class Adaptors(List[Adaptor]):
1249
  ) -> "Adaptors[Adaptor]":
1250
  """
1251
  Call the ``.css()`` method for each element in this list and return
1252
- their results flattened as another :class:`Adaptors`.
1253
 
1254
  **Important:
1255
- It's recommended to use the identifier argument if you plan to use different selector later
1256
  and want to relocate the same element(s)**
1257
 
1258
  :param selector: The CSS3 selector to be used.
1259
- :param identifier: A string that will be used to retrieve element's data in auto-matching
1260
  otherwise the selector will be used.
1261
  :param auto_save: Automatically save new elements for `auto_match` later
1262
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1263
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
1264
  number unless you must know what you are doing!
1265
 
1266
- :return: List as :class:`Adaptors`
1267
  """
1268
  results = [
1269
  n.css(selector, identifier or selector, False, auto_save, percentage)
@@ -1282,9 +1283,9 @@ class Adaptors(List[Adaptor]):
1282
  their results flattened as List of TextHandler.
1283
 
1284
  :param regex: Can be either a compiled regular expression or a string.
1285
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
1286
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1287
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1288
  """
1289
  results = [
1290
  n.text.re(regex, replace_entities, clean_match, case_sensitive)
@@ -1307,7 +1308,7 @@ class Adaptors(List[Adaptor]):
1307
  :param default: The default value to be returned if there is no match
1308
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
1309
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1310
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1311
  """
1312
  for n in self:
1313
  for result in n.re(regex, replace_entities, clean_match, case_sensitive):
 
67
  with expressions in CSS, XPath, or with simply text. Check the docs for more info.
68
 
69
  Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface, We are not
70
+ inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
71
  not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
72
  It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
73
 
74
  :param text: HTML body passed as text.
75
+ :param url: It allows storing a URL with the HTML data for retrieving later.
76
+ :param body: HTML body as a ``bytes`` object. It can be used instead of the ``text`` argument.
77
  :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
78
  :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
79
+ the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
80
+ :param root: Used internally to pass etree objects instead of text/body arguments, it takes the highest priority.
81
  Don't use it unless you know what you are doing!
82
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
83
  :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
84
+ :param auto_match: Globally turn off the auto-match feature in all functions, this argument takes higher
85
  priority over all auto-match related arguments/functions in the class.
86
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
87
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
 
125
  self.__text = TextHandler(text or body.decode())
126
 
127
  else:
128
+ # All HTML types inherit from HtmlMixin, so this checks for all of them at once
129
  if not issubclass(type(root), html.HtmlMixin):
130
  raise TypeError(
131
  f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
 
181
  else {}
182
  )
183
 
184
+ # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
185
  @staticmethod
186
  def _is_text_node(
187
  element: Union[html.HtmlElement, etree._ElementUnicodeResult],
188
  ) -> bool:
189
+ """Return True if the given element is a result of a string expression
190
  Examples:
191
+ XPath -> '/text()', '/@attribute', etc...
192
+ CSS3 -> '::text', '::attr(attrib)'...
193
  """
194
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
195
  return issubclass(type(element), etree._ElementUnicodeResult)
 
200
  ) -> TextHandler:
201
  """Used internally to convert a single element's text content to TextHandler directly without checks
202
 
203
+ This single line has been isolated like this, so when it's used with `map` we get that slight performance boost vs. list comprehension
204
  """
205
  return TextHandler(str(element))
206
 
 
209
  return Adaptor(
210
  root=element,
211
  text="",
212
+ body=b"", # Since the root argument is provided, both `text` and `body` will be ignored, so this is just a filler
213
  url=self.url,
214
  encoding=self.encoding,
215
  auto_match=self.__auto_match_enabled,
 
240
  ): # Lxml will give a warning if I used something like `not result`
241
  return Adaptors([])
242
 
243
+ # From within the code, this method will always get a list of the same type,
244
+ # so we will continue without checks for a slight performance boost
245
  if self._is_text_node(result[0]):
246
  return TextHandlers(list(map(self.__content_convertor, result)))
247
 
 
253
 
254
  # The following four properties I made them into functions instead of variables directly
255
  # So they don't slow down the process of initializing many instances of the class and gets executed only
256
+ # when the user needs them for the first time for that specific element and gets cached for next times
257
  # Doing that only made the library performance test sky rocked multiple times faster than before
258
  # because I was executing them on initialization before :))
259
  @property
260
  def tag(self) -> str:
261
+ """Get the tag name of the element"""
262
  if not self.__tag:
263
  self.__tag = self._root.tag
264
  return self.__tag
 
267
  def text(self) -> TextHandler:
268
  """Get text content of the element"""
269
  if not self.__text:
270
+ # If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
271
+ # before extracting text, then keep `keep_comments` set to False while initializing the first class
272
  self.__text = TextHandler(self._root.text)
273
  return self.__text
274
 
 
322
 
323
  @property
324
  def html_content(self) -> TextHandler:
325
+ """Return the inner HTML code of the element"""
326
  return TextHandler(
327
  etree.tostring(
328
  self._root, encoding="unicode", method="html", with_tail=False
 
344
  )
345
 
346
  def has_class(self, class_name: str) -> bool:
347
+ """Check if the element has a specific class
348
  :param class_name: The class name to check for
349
  :return: True if element has class with that name otherwise False
350
  """
 
382
  return Adaptors([])
383
 
384
  def iterancestors(self) -> Generator["Adaptor", None, None]:
385
+ """Return a generator that loops over all ancestors of the element, starting with the element's parent."""
386
  for ancestor in self._root.iterancestors():
387
  yield self.__element_convertor(ancestor)
388
 
 
400
 
401
  @property
402
  def path(self) -> "Adaptors[Adaptor]":
403
+ """Returns a list of type `Adaptors` that contains the path leading to the current element from the root."""
404
  lst = list(self.iterancestors())
405
  return Adaptors(lst)
406
 
 
410
  next_element = self._root.getnext()
411
  if next_element is not None:
412
  while type(next_element) in html_forbidden:
413
+ # Ignore HTML comments and unwanted types
414
  next_element = next_element.getnext()
415
 
416
  return self.__handle_element(next_element)
 
421
  prev_element = self._root.getprevious()
422
  if prev_element is not None:
423
  while type(prev_element) in html_forbidden:
424
+ # Ignore HTML comments and unwanted types
425
  prev_element = prev_element.getprevious()
426
 
427
  return self.__handle_element(prev_element)
 
456
 
457
  return data + ">"
458
 
459
+ # From here we start with the selecting functions
460
  def relocate(
461
  self,
462
  element: Union[Dict, html.HtmlElement, "Adaptor"],
 
467
 
468
  :param element: The element we want to relocate in the tree
469
  :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
470
+ calculation depends solely on the page structure, so don't play with this number unless you must know
471
  what you are doing!
472
  :param adaptor_type: If True, the return result will be converted to `Adaptors` object
473
  :return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
474
  """
475
  score_table = {}
476
+ # Note: `element` will most likely always be a dictionary at this point.
477
  if isinstance(element, self.__class__):
478
  element = element._root
479
 
 
481
  element = _StorageTools.element_to_dict(element)
482
 
483
  for node in self._root.xpath(".//*"):
484
+ # Collect all elements in the page, then for each element get the matching score of it against the node.
485
  # Hence: the code doesn't stop even if the score was 100%
486
  # because there might be another element(s) left in page with the same score
487
  score = self.__calculate_similarity_score(element, node)
 
491
  highest_probability = max(score_table.keys())
492
  if score_table[highest_probability] and highest_probability >= percentage:
493
  if log.getEffectiveLevel() < 20:
494
+ # No need to execute this part if the logging level is not debugging
495
  log.debug(f"Highest probability was {highest_probability}%")
496
  log.debug("Top 5 best matching elements are: ")
497
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
 
512
  auto_save: bool = False,
513
  percentage: int = 0,
514
  ) -> Union["Adaptor", "TextHandler", None]:
515
+ """Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
516
 
517
  **Important:
518
+ It's recommended to use the identifier argument if you plan to use a different selector later
519
  and want to relocate the same element(s)**
520
 
521
  :param selector: The CSS3 selector to be used.
522
+ :param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
523
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
524
  otherwise the selector will be used.
525
  :param auto_save: Automatically save new elements for `auto_match` later
526
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
527
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
528
  number unless you must know what you are doing!
529
  """
530
  for element in self.css(
 
542
  percentage: int = 0,
543
  **kwargs: Any,
544
  ) -> Union["Adaptor", "TextHandler", None]:
545
+ """Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
546
 
547
  **Important:
548
+ It's recommended to use the identifier argument if you plan to use a different selector later
549
  and want to relocate the same element(s)**
550
 
551
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
552
 
553
  :param selector: The XPath selector to be used.
554
+ :param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
555
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
556
  otherwise the selector will be used.
557
  :param auto_save: Automatically save new elements for `auto_match` later
558
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
559
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
560
  number unless you must know what you are doing!
561
  """
562
  for element in self.xpath(
 
573
  auto_save: bool = False,
574
  percentage: int = 0,
575
  ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
576
+ """Search the current tree with CSS3 selectors
577
 
578
  **Important:
579
+ It's recommended to use the identifier argument if you plan to use a different selector later
580
  and want to relocate the same element(s)**
581
 
582
  :param selector: The CSS3 selector to be used.
583
+ :param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
584
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
585
  otherwise the selector will be used.
586
  :param auto_save: Automatically save new elements for `auto_match` later
587
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
588
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
589
  number unless you must know what you are doing!
590
 
591
+ :return: `Adaptors` class.
592
  """
593
  try:
594
  if not self.__auto_match_enabled or "," not in selector:
 
605
  results = []
606
  if "," in selector:
607
  for single_selector in split_selectors(selector):
608
+ # I'm doing this only so the `save` function saves data correctly for combined selectors
609
  # Like using the ',' to combine two different selectors that point to different elements.
610
  xpath_selector = translator_instance.css_to_xpath(
611
  single_selector.canonical()
 
634
  percentage: int = 0,
635
  **kwargs: Any,
636
  ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
637
+ """Search the current tree with XPath selectors
638
 
639
  **Important:
640
+ It's recommended to use the identifier argument if you plan to use a different selector later
641
  and want to relocate the same element(s)**
642
 
643
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
644
 
645
  :param selector: The XPath selector to be used.
646
+ :param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
647
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
648
  otherwise the selector will be used.
649
  :param auto_save: Automatically save new elements for `auto_match` later
650
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
651
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
652
  number unless you must know what you are doing!
653
 
654
+ :return: `Adaptors` class.
655
  """
656
  try:
657
  elements = self._root.xpath(selector, **kwargs)
 
700
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
701
  **kwargs: str,
702
  ) -> "Adaptors":
703
+ """Find elements by filters of your creations for ease.
704
 
705
+ :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
706
  :param kwargs: The attributes you want to filter elements based on.
707
  :return: The `Adaptors` object of the elements or empty list
708
  """
 
796
  for pattern in patterns:
797
  results = results.filter(lambda e: e.text.re(pattern, check_match=True))
798
 
799
+ # Collect an element only if it fulfills the passed function(s)
800
  for function in functions:
801
  results = results.filter(function)
802
 
 
807
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
808
  **kwargs: str,
809
  ) -> Union["Adaptor", None]:
810
+ """Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
811
 
812
+ :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
813
  :param kwargs: The attributes you want to filter elements based on.
814
  :return: The `Adaptor` object of the element or `None` if the result didn't match
815
  """
 
820
  def __calculate_similarity_score(
821
  self, original: Dict, candidate: html.HtmlElement
822
  ) -> float:
823
+ """Used internally to calculate a score that shows how similar a candidate element is to the original one
824
 
825
  :param original: The original element in the form of the dictionary generated from `element_to_dict` function
826
  :param candidate: The element to compare with the original element.
 
841
  ).ratio() # * 0.3 # 30%
842
  checks += 1
843
 
844
+ # if both don't have attributes, it still counts for something!
845
  score += self.__calculate_dict_diff(
846
  original["attributes"], candidate["attributes"]
847
  ) # * 0.3 # 30%
 
888
  ).ratio() # * 0.1 # 10%
889
  checks += 1
890
  # else:
891
+ # # The original element has a parent and this one not, this is not a good sign
892
  # score -= 0.1
893
 
894
  if original.get("siblings"):
 
902
 
903
  @staticmethod
904
  def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
905
+ """Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
906
  score = (
907
  SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
908
  * 0.5
 
918
  ) -> None:
919
  """Saves the element's unique properties to the storage for retrieval and relocation later
920
 
921
+ :param element: The element itself that we want to save to storage, it can be an `Adaptor` or pure `HtmlElement`
922
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
923
  the docs for more info.
924
  """
 
948
  log.critical(
949
  "Can't use Auto-match features while disabled globally, you have to start a new class instance."
950
  )
951
+ return None
952
 
953
  # Operations on text functions
954
  def json(self) -> Dict:
955
+ """Return JSON response if the response is jsonable, otherwise throw an error"""
956
  if self.text:
957
  return self.text.json()
958
  else:
 
968
  """Apply the given regex to the current text and return a list of strings with the matches.
969
 
970
  :param regex: Can be either a compiled regular expression or a string.
971
+ :param replace_entities: If enabled, character entity references are replaced by their corresponding character
972
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
973
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
974
  """
975
  return self.text.re(regex, replace_entities, clean_match, case_sensitive)
976
 
 
988
  :param default: The default value to be returned if there is no match
989
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
990
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
991
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
992
  """
993
  return self.text.re_first(
994
  regex, default, replace_entities, clean_match, case_sensitive
 
1004
  match_text: bool = False,
1005
  ) -> Union["Adaptors[Adaptor]", List]:
1006
  """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
1007
+ then return the ones that match the current element attributes with a percentage higher than the input threshold.
1008
 
1009
  This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
1010
+ a products-list container and want to find other products using that element as a starting point EXCEPT
1011
  this function works in any case without depending on the element type.
1012
 
1013
+ :param similarity_threshold: The percentage to use while comparing element attributes.
1014
  Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
1015
+ same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
1016
+ extremely unlucky, then attributes matching comes into play, so don't play with this number unless
1017
  you are getting the results you don't want.
1018
+ Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.
1019
+ :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in the last step.
1020
+ The default value is to ignore `href` and `src` as URLs can change a lot between elements, so it's unreliable
1021
+ :param match_text: If True, element text content will be taken into calculation while matching.
1022
+ Not recommended to use in normal cases, but it depends.
1023
 
1024
  :return: An ``Adaptors`` container of ``Adaptor`` objects or empty list
1025
  """
 
1036
  candidate: html.HtmlElement,
1037
  ) -> bool:
1038
  """Calculate a score of how much these elements are alike and return True
1039
+ if the score is higher than or equal to the threshold"""
1040
  candidate_attributes = (
1041
  get_attributes(candidate) if ignore_attributes else candidate.attrib
1042
  )
 
1050
  checks += len(candidate_attributes)
1051
  else:
1052
  if not candidate_attributes:
1053
+ # Both don't have attributes, this must mean something
1054
  score += 1
1055
  checks += 1
1056
 
 
1066
  return round(score / checks, 2) >= similarity_threshold
1067
  return False
1068
 
1069
+ # We will use the elements' root from now on to get the speed boost of using Lxml directly
1070
  root = self._root
1071
  current_depth = len(list(root.iterancestors()))
1072
  target_attrs = get_attributes(root) if ignore_attributes else root.attrib
 
1106
  ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1107
  """Find elements whose text content fully/partially matches the input.
1108
  :param text: Text query to match
1109
+ :param first_match: Returns the first element that matches conditions, enabled by default
1110
+ :param partial: If enabled, the function returns elements that contain the input text
1111
+ :param case_sensitive: if enabled, the letters case will be taken into consideration
1112
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1113
  """
1114
 
 
1152
  ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1153
  """Find elements whose text content matches the input regex pattern.
1154
  :param query: Regex query/pattern to match
1155
+ :param first_match: Return the first element that matches conditions; enabled by default.
1156
+ :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
1157
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
1158
  """
1159
  results = Adaptors([])
1160
 
 
1183
 
1184
  class Adaptors(List[Adaptor]):
1185
  """
1186
+ The `Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
1187
  """
1188
 
1189
  __slots__ = ()
 
1215
  ) -> "Adaptors[Adaptor]":
1216
  """
1217
  Call the ``.xpath()`` method for each element in this list and return
1218
+ their results as another `Adaptors` class.
1219
 
1220
  **Important:
1221
+ It's recommended to use the identifier argument if you plan to use a different selector later
1222
  and want to relocate the same element(s)**
1223
 
1224
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
1225
 
1226
  :param selector: The XPath selector to be used.
1227
+ :param identifier: A string that will be used to retrieve element's data in auto-matching,
1228
  otherwise the selector will be used.
1229
  :param auto_save: Automatically save new elements for `auto_match` later
1230
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1231
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
1232
  number unless you must know what you are doing!
1233
 
1234
+ :return: `Adaptors` class.
1235
  """
1236
  results = [
1237
  n.xpath(
 
1250
  ) -> "Adaptors[Adaptor]":
1251
  """
1252
  Call the ``.css()`` method for each element in this list and return
1253
+ their results flattened as another `Adaptors` class.
1254
 
1255
  **Important:
1256
+ It's recommended to use the identifier argument if you plan to use a different selector later
1257
  and want to relocate the same element(s)**
1258
 
1259
  :param selector: The CSS3 selector to be used.
1260
+ :param identifier: A string that will be used to retrieve element's data in auto-matching,
1261
  otherwise the selector will be used.
1262
  :param auto_save: Automatically save new elements for `auto_match` later
1263
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1264
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
1265
  number unless you must know what you are doing!
1266
 
1267
+ :return: `Adaptors` class.
1268
  """
1269
  results = [
1270
  n.css(selector, identifier or selector, False, auto_save, percentage)
 
1283
  their results flattened as List of TextHandler.
1284
 
1285
  :param regex: Can be either a compiled regular expression or a string.
1286
+ :param replace_entities: If enabled, character entity references are replaced by their corresponding character
1287
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1288
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
1289
  """
1290
  results = [
1291
  n.text.re(regex, replace_entities, clean_match, case_sensitive)
 
1308
  :param default: The default value to be returned if there is no match
1309
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
1310
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1311
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
1312
  """
1313
  for n in self:
1314
  for result in n.re(regex, replace_entities, clean_match, case_sensitive):