Karim shoair committed on
Commit
07abdd8
·
1 Parent(s): e0c7c32

fix(parser): Code restructure for speed boost & better type hints

Browse files

- Return types are now consistent across the entire parser engine.
- Parser got a 5-30% performance boost across different methods.
- Renamed some of the internal methods for clearer code.
- A much better auto-completion experience after numerous adjustments.

Files changed (1) hide show
  1. scrapling/parser.py +90 -82
scrapling/parser.py CHANGED
@@ -1,6 +1,7 @@
1
  import inspect
2
  import os
3
  import re
 
4
  from difflib import SequenceMatcher
5
  from urllib.parse import urljoin
6
 
@@ -145,47 +146,46 @@ class Adaptor(SelectorsGeneration):
145
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
146
  return issubclass(type(element), etree._ElementUnicodeResult)
147
 
148
- def __get_correct_result(
149
- self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
150
- ) -> Union[TextHandler, html.HtmlElement, 'Adaptor', str]:
151
- """Used internally in all functions to convert results to type (Adaptor|Adaptors) when possible"""
152
- if self._is_text_node(element):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # etree._ElementUnicodeResult basically inherit from `str` so it's fine
154
- return TextHandler(str(element))
155
  else:
156
- if issubclass(type(element), html.HtmlMixin):
157
-
158
- return Adaptor(
159
- root=element,
160
- text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
161
- url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
162
- keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
163
- huge_tree=self.__huge_tree_enabled,
164
- **self.__response_data
165
- )
166
- return element
167
 
168
- def __convert_results(
169
- self, result: Union[List[html.HtmlElement], html.HtmlElement]
170
- ) -> Union['Adaptors[Adaptor]', 'Adaptor', List, None]:
171
- """Used internally in all functions to convert results to type (Adaptor|Adaptors) in bulk when possible"""
172
- if result is None:
173
- return None
174
- elif result == []: # Lxml will give a warning if I used something like `not result`
175
- return []
176
-
177
- if isinstance(result, Adaptors):
178
- return result
179
-
180
- if type(result) is list:
181
- results = [self.__get_correct_result(n) for n in result]
182
- if all(isinstance(res, self.__class__) for res in results):
183
- return Adaptors(results)
184
- elif all(isinstance(res, TextHandler) for res in results):
185
- return TextHandlers(results)
186
- return results
187
 
188
- return self.__get_correct_result(result)
 
 
 
 
 
189
 
190
  def __getstate__(self) -> Any:
191
  # lxml don't like it :)
@@ -282,14 +282,14 @@ class Adaptor(SelectorsGeneration):
282
  @property
283
  def parent(self) -> Union['Adaptor', None]:
284
  """Return the direct parent of the element or ``None`` otherwise"""
285
- return self.__convert_results(self._root.getparent())
286
 
287
  @property
288
  def children(self) -> Union['Adaptors[Adaptor]', List]:
289
  """Return the children elements of the current element or empty list otherwise"""
290
- return self.__convert_results(list(
291
- child for child in self._root.iterchildren() if type(child) not in html_forbidden
292
- ))
293
 
294
  @property
295
  def siblings(self) -> Union['Adaptors[Adaptor]', List]:
@@ -301,7 +301,7 @@ class Adaptor(SelectorsGeneration):
301
  def iterancestors(self) -> Generator['Adaptor', None, None]:
302
  """Return a generator that loops over all ancestors of the element, starting with element's parent."""
303
  for ancestor in self._root.iterancestors():
304
- yield self.__convert_results(ancestor)
305
 
306
  def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
307
  """Loop over all ancestors of the element till one match the passed function
@@ -328,7 +328,7 @@ class Adaptor(SelectorsGeneration):
328
  # Ignore html comments and unwanted types
329
  next_element = next_element.getnext()
330
 
331
- return self.__convert_results(next_element)
332
 
333
  @property
334
  def previous(self) -> Union['Adaptor', None]:
@@ -339,7 +339,7 @@ class Adaptor(SelectorsGeneration):
339
  # Ignore html comments and unwanted types
340
  prev_element = prev_element.getprevious()
341
 
342
- return self.__convert_results(prev_element)
343
 
344
  # For easy copy-paste from Scrapy/parsel code when needed :)
345
  def get(self, default=None):
@@ -413,13 +413,16 @@ class Adaptor(SelectorsGeneration):
413
  if score_table:
414
  highest_probability = max(score_table.keys())
415
  if score_table[highest_probability] and highest_probability >= percentage:
416
- log.debug(f'Highest probability was {highest_probability}%')
417
- log.debug('Top 5 best matching elements are: ')
418
- for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
419
- log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
 
 
 
420
  if not adaptor_type:
421
  return score_table[highest_probability]
422
- return self.__convert_results(score_table[highest_probability])
423
  return []
424
 
425
  def css_first(self, selector: str, identifier: str = '',
@@ -493,7 +496,7 @@ class Adaptor(SelectorsGeneration):
493
  :return: List as :class:`Adaptors`
494
  """
495
  try:
496
- if not self.__auto_match_enabled:
497
  # No need to split selectors in this case, let's save some CPU cycles :)
498
  xpath_selector = HTMLTranslator().css_to_xpath(selector)
499
  return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
@@ -507,11 +510,8 @@ class Adaptor(SelectorsGeneration):
507
  results += self.xpath(
508
  xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
509
  )
510
- else:
511
- xpath_selector = HTMLTranslator().css_to_xpath(selector)
512
- return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
513
 
514
- return self.__convert_results(results)
515
  except (SelectorError, SelectorSyntaxError,):
516
  raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
517
 
@@ -538,37 +538,37 @@ class Adaptor(SelectorsGeneration):
538
  :return: List as :class:`Adaptors`
539
  """
540
  try:
541
- selected_elements = self._root.xpath(selector, **kwargs)
542
 
543
- if selected_elements:
544
- if not self.__auto_match_enabled and auto_save:
545
- log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
546
-
547
- elif self.__auto_match_enabled and auto_save:
548
- self.save(selected_elements[0], identifier or selector)
549
 
550
- return self.__convert_results(selected_elements)
551
- else:
552
- if self.__auto_match_enabled and auto_match:
553
  element_data = self.retrieve(identifier or selector)
554
  if element_data:
555
- relocated = self.relocate(element_data, percentage)
556
- if relocated is not None and auto_save:
557
- self.save(relocated[0], identifier or selector)
558
 
559
- return self.__convert_results(relocated)
560
- else:
561
- return self.__convert_results(selected_elements)
562
-
563
- elif not self.__auto_match_enabled and auto_match:
564
  log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
 
 
565
 
566
- return self.__convert_results(selected_elements)
567
 
568
  except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
569
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
570
 
571
- def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
572
  """Find elements by filters of your creations for ease..
573
 
574
  :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
@@ -588,7 +588,7 @@ class Adaptor(SelectorsGeneration):
588
 
589
  attributes = dict()
590
  tags, patterns = set(), set()
591
- results, functions, selectors = [], [], []
592
 
593
  def _search_tree(element: Adaptor, filter_function: Callable) -> None:
594
  """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
@@ -662,7 +662,7 @@ class Adaptor(SelectorsGeneration):
662
  for function in functions:
663
  _search_tree(result, function)
664
 
665
- return self.__convert_results(results)
666
 
667
  def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
668
  """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
@@ -894,7 +894,7 @@ class Adaptor(SelectorsGeneration):
894
  if potential_match != root and are_alike(root, target_attrs, potential_match):
895
  similar_elements.append(potential_match)
896
 
897
- return self.__convert_results(similar_elements)
898
 
899
  def find_by_text(
900
  self, text: str, first_match: bool = True, partial: bool = False,
@@ -908,7 +908,7 @@ class Adaptor(SelectorsGeneration):
908
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
909
  """
910
 
911
- results = []
912
  if not case_sensitive:
913
  text = text.lower()
914
 
@@ -942,7 +942,7 @@ class Adaptor(SelectorsGeneration):
942
  if first_match:
943
  if results:
944
  return results[0]
945
- return self.__convert_results(results)
946
 
947
  def find_by_regex(
948
  self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
@@ -953,7 +953,7 @@ class Adaptor(SelectorsGeneration):
953
  :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
954
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
955
  """
956
- results = []
957
 
958
  def _traverse(node: Adaptor) -> None:
959
  """Check if element matches given regex otherwise, traverse the children tree and iterate"""
@@ -975,7 +975,7 @@ class Adaptor(SelectorsGeneration):
975
 
976
  if results and first_match:
977
  return results[0]
978
- return self.__convert_results(results)
979
 
980
 
981
  class Adaptors(List[Adaptor]):
@@ -984,7 +984,15 @@ class Adaptors(List[Adaptor]):
984
  """
985
  __slots__ = ()
986
 
987
- def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors[Adaptor]"]:
 
 
 
 
 
 
 
 
988
  lst = super().__getitem__(pos)
989
  if isinstance(pos, slice):
990
  return self.__class__(lst)
 
1
  import inspect
2
  import os
3
  import re
4
+ import typing
5
  from difflib import SequenceMatcher
6
  from urllib.parse import urljoin
7
 
 
146
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
147
  return issubclass(type(element), etree._ElementUnicodeResult)
148
 
149
+ @staticmethod
150
+ def __content_convertor(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> TextHandler:
151
+ """Used internally to convert a single element's text content to TextHandler directly without checks
152
+
153
+ This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
154
+ """
155
+ return TextHandler(str(element))
156
+
157
+ def __element_convertor(self, element: html.HtmlElement) -> 'Adaptor':
158
+ """Used internally to convert a single HtmlElement to Adaptor directly without checks"""
159
+ return Adaptor(
160
+ root=element,
161
+ text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
162
+ url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
163
+ keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
164
+ huge_tree=self.__huge_tree_enabled,
165
+ **self.__response_data
166
+ )
167
+
168
+ def __handle_element(self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> Union[TextHandler, 'Adaptor', None]:
169
+ """Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
170
+ if element is None:
171
+ return None
172
+ elif self._is_text_node(element):
173
  # etree._ElementUnicodeResult basically inherit from `str` so it's fine
174
+ return self.__content_convertor(element)
175
  else:
176
+ return self.__element_convertor(element)
 
 
 
 
 
 
 
 
 
 
177
 
178
+ def __handle_elements(self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]) -> Union['Adaptors', 'TextHandlers', List]:
179
+ """Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
180
+ if not len(result): # Lxml will give a warning if I used something like `not result`
181
+ return Adaptors([])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
+ # From within the code, this method will always get a list of the same type
184
+ # so we will continue without checks for slight performance boost
185
+ if self._is_text_node(result[0]):
186
+ return TextHandlers(list(map(self.__content_convertor, result)))
187
+
188
+ return Adaptors(list(map(self.__element_convertor, result)))
189
 
190
  def __getstate__(self) -> Any:
191
  # lxml don't like it :)
 
282
  @property
283
  def parent(self) -> Union['Adaptor', None]:
284
  """Return the direct parent of the element or ``None`` otherwise"""
285
+ return self.__handle_element(self._root.getparent())
286
 
287
  @property
288
  def children(self) -> Union['Adaptors[Adaptor]', List]:
289
  """Return the children elements of the current element or empty list otherwise"""
290
+ return Adaptors([
291
+ self.__element_convertor(child) for child in self._root.iterchildren() if type(child) not in html_forbidden
292
+ ])
293
 
294
  @property
295
  def siblings(self) -> Union['Adaptors[Adaptor]', List]:
 
301
  def iterancestors(self) -> Generator['Adaptor', None, None]:
302
  """Return a generator that loops over all ancestors of the element, starting with element's parent."""
303
  for ancestor in self._root.iterancestors():
304
+ yield self.__element_convertor(ancestor)
305
 
306
  def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
307
  """Loop over all ancestors of the element till one match the passed function
 
328
  # Ignore html comments and unwanted types
329
  next_element = next_element.getnext()
330
 
331
+ return self.__handle_element(next_element)
332
 
333
  @property
334
  def previous(self) -> Union['Adaptor', None]:
 
339
  # Ignore html comments and unwanted types
340
  prev_element = prev_element.getprevious()
341
 
342
+ return self.__handle_element(prev_element)
343
 
344
  # For easy copy-paste from Scrapy/parsel code when needed :)
345
  def get(self, default=None):
 
413
  if score_table:
414
  highest_probability = max(score_table.keys())
415
  if score_table[highest_probability] and highest_probability >= percentage:
416
+ if log.getEffectiveLevel() < 20:
417
+ # No need to execute this part if logging level is not debugging
418
+ log.debug(f'Highest probability was {highest_probability}%')
419
+ log.debug('Top 5 best matching elements are: ')
420
+ for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
421
+ log.debug(f'{percent} -> {self.__handle_elements(score_table[percent])}')
422
+
423
  if not adaptor_type:
424
  return score_table[highest_probability]
425
+ return self.__handle_elements(score_table[highest_probability])
426
  return []
427
 
428
  def css_first(self, selector: str, identifier: str = '',
 
496
  :return: List as :class:`Adaptors`
497
  """
498
  try:
499
+ if not self.__auto_match_enabled or ',' not in selector:
500
  # No need to split selectors in this case, let's save some CPU cycles :)
501
  xpath_selector = HTMLTranslator().css_to_xpath(selector)
502
  return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
510
  results += self.xpath(
511
  xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
512
  )
 
 
 
513
 
514
+ return results
515
  except (SelectorError, SelectorSyntaxError,):
516
  raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
517
 
 
538
  :return: List as :class:`Adaptors`
539
  """
540
  try:
541
+ elements = self._root.xpath(selector, **kwargs)
542
 
543
+ if elements:
544
+ if auto_save:
545
+ if not self.__auto_match_enabled:
546
+ log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
547
+ else:
548
+ self.save(elements[0], identifier or selector)
549
 
550
+ return self.__handle_elements(elements)
551
+ elif self.__auto_match_enabled:
552
+ if auto_match:
553
  element_data = self.retrieve(identifier or selector)
554
  if element_data:
555
+ elements = self.relocate(element_data, percentage)
556
+ if elements is not None and auto_save:
557
+ self.save(elements[0], identifier or selector)
558
 
559
+ return self.__handle_elements(elements)
560
+ else:
561
+ if auto_match:
 
 
562
  log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
563
+ elif auto_save:
564
+ log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
565
 
566
+ return self.__handle_elements(elements)
567
 
568
  except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
569
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
570
 
571
+ def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> 'Adaptors':
572
  """Find elements by filters of your creations for ease..
573
 
574
  :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
 
588
 
589
  attributes = dict()
590
  tags, patterns = set(), set()
591
+ results, functions, selectors = Adaptors([]), [], []
592
 
593
  def _search_tree(element: Adaptor, filter_function: Callable) -> None:
594
  """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
 
662
  for function in functions:
663
  _search_tree(result, function)
664
 
665
+ return results
666
 
667
  def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
668
  """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
 
894
  if potential_match != root and are_alike(root, target_attrs, potential_match):
895
  similar_elements.append(potential_match)
896
 
897
+ return self.__handle_elements(similar_elements)
898
 
899
  def find_by_text(
900
  self, text: str, first_match: bool = True, partial: bool = False,
 
908
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
909
  """
910
 
911
+ results = Adaptors([])
912
  if not case_sensitive:
913
  text = text.lower()
914
 
 
942
  if first_match:
943
  if results:
944
  return results[0]
945
+ return results
946
 
947
  def find_by_regex(
948
  self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
 
953
  :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
954
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
955
  """
956
+ results = Adaptors([])
957
 
958
  def _traverse(node: Adaptor) -> None:
959
  """Check if element matches given regex otherwise, traverse the children tree and iterate"""
 
975
 
976
  if results and first_match:
977
  return results[0]
978
+ return results
979
 
980
 
981
  class Adaptors(List[Adaptor]):
 
984
  """
985
  __slots__ = ()
986
 
987
+ @typing.overload
988
+ def __getitem__(self, pos: SupportsIndex) -> Adaptor:
989
+ pass
990
+
991
+ @typing.overload
992
+ def __getitem__(self, pos: slice) -> "Adaptors":
993
+ pass
994
+
995
+ def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors"]:
996
  lst = super().__getitem__(pos)
997
  if isinstance(pos, slice):
998
  return self.__class__(lst)