Karim shoair committed on
Commit
1fee013
·
1 Parent(s): 6988daa

perf(parser): A lot of optimizations to speed things up

Browse files
scrapling/core/custom_types.py CHANGED
@@ -31,9 +31,6 @@ class TextHandler(str):
31
 
32
  __slots__ = ()
33
 
34
- def __new__(cls, string):
35
- return super().__new__(cls, str(string))
36
-
37
  def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":
38
  lst = super().__getitem__(key)
39
  return cast(_TextHandlerType, TextHandler(lst))
 
31
 
32
  __slots__ = ()
33
 
 
 
 
34
  def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":
35
  lst = super().__getitem__(key)
36
  return cast(_TextHandlerType, TextHandler(lst))
scrapling/parser.py CHANGED
@@ -40,6 +40,13 @@ from scrapling.core.translator import translator as _translator
40
  from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
41
 
42
  __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
 
 
 
 
 
 
 
43
 
44
 
45
  class Selector(SelectorsGeneration):
@@ -101,7 +108,7 @@ class Selector(SelectorsGeneration):
101
  "Selector class needs HTML content, or root arguments to work"
102
  )
103
 
104
- self.__text = ""
105
  if root is None:
106
  if isinstance(content, str):
107
  body = (
@@ -284,10 +291,10 @@ class Selector(SelectorsGeneration):
284
  @property
285
  def text(self) -> TextHandler:
286
  """Get text content of the element"""
287
- if not self.__text:
288
  # If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
289
  # before extracting text, then keep `keep_comments` set to False while initializing the first class
290
- self.__text = TextHandler(self._root.text)
291
  return self.__text
292
 
293
  def get_all_text(
@@ -613,20 +620,17 @@ class Selector(SelectorsGeneration):
613
  )
614
 
615
  results = []
616
- if "," in selector:
617
- for single_selector in split_selectors(selector):
618
- # I'm doing this only so the `save` function saves data correctly for combined selectors
619
- # Like using the ',' to combine two different selectors that point to different elements.
620
- xpath_selector = _translator.css_to_xpath(
621
- single_selector.canonical()
622
- )
623
- results += self.xpath(
624
- xpath_selector,
625
- identifier or single_selector.canonical(),
626
- adaptive,
627
- auto_save,
628
- percentage,
629
- )
630
 
631
  return results
632
  except (
@@ -666,16 +670,13 @@ class Selector(SelectorsGeneration):
666
  :return: `Selectors` class.
667
  """
668
  try:
669
- elements = self._root.xpath(selector, **kwargs)
670
-
671
- if elements:
672
- if auto_save:
673
- if not self.__adaptive_enabled:
674
- log.warning(
675
- "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
676
- )
677
- else:
678
- self.save(elements[0], identifier or selector)
679
 
680
  return self.__handle_elements(elements)
681
  elif self.__adaptive_enabled:
@@ -718,13 +719,6 @@ class Selector(SelectorsGeneration):
718
  :param kwargs: The attributes you want to filter elements based on it.
719
  :return: The `Selectors` object of the elements or empty list
720
  """
721
- # Attributes that are Python reserved words and can't be used directly
722
- # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
723
- # https://www.w3schools.com/python/python_ref_keywords.asp
724
- whitelisted = {
725
- "class_": "class",
726
- "for_": "for",
727
- }
728
 
729
  if not args and not kwargs:
730
  raise TypeError(
@@ -782,7 +776,7 @@ class Selector(SelectorsGeneration):
782
 
783
  for attribute_name, value in kwargs.items():
784
  # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
785
- attribute_name = whitelisted.get(attribute_name, attribute_name)
786
  attributes[attribute_name] = value
787
 
788
  # It's easier and faster to build a selector than traversing the tree
 
40
  from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
41
 
42
  __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
43
+ # Attributes that are Python reserved words and can't be used directly
44
+ # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
45
+ # https://www.w3schools.com/python/python_ref_keywords.asp
46
+ _whitelisted = {
47
+ "class_": "class",
48
+ "for_": "for",
49
+ }
50
 
51
 
52
  class Selector(SelectorsGeneration):
 
108
  "Selector class needs HTML content, or root arguments to work"
109
  )
110
 
111
+ self.__text = None
112
  if root is None:
113
  if isinstance(content, str):
114
  body = (
 
291
  @property
292
  def text(self) -> TextHandler:
293
  """Get text content of the element"""
294
+ if self.__text is None:
295
  # If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
296
  # before extracting text, then keep `keep_comments` set to False while initializing the first class
297
+ self.__text = TextHandler(self._root.text or "")
298
  return self.__text
299
 
300
  def get_all_text(
 
620
  )
621
 
622
  results = []
623
+ for single_selector in split_selectors(selector):
624
+ # I'm doing this only so the `save` function saves data correctly for combined selectors
625
+ # Like using the ',' to combine two different selectors that point to different elements.
626
+ xpath_selector = _translator.css_to_xpath(single_selector.canonical())
627
+ results += self.xpath(
628
+ xpath_selector,
629
+ identifier or single_selector.canonical(),
630
+ adaptive,
631
+ auto_save,
632
+ percentage,
633
+ )
 
 
 
634
 
635
  return results
636
  except (
 
670
  :return: `Selectors` class.
671
  """
672
  try:
673
+ if elements := self._root.xpath(selector, **kwargs):
674
+ if not self.__adaptive_enabled and auto_save:
675
+ log.warning(
676
+ "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
677
+ )
678
+ elif self.__adaptive_enabled and auto_save:
679
+ self.save(elements[0], identifier or selector)
 
 
 
680
 
681
  return self.__handle_elements(elements)
682
  elif self.__adaptive_enabled:
 
719
  :param kwargs: The attributes you want to filter elements based on it.
720
  :return: The `Selectors` object of the elements or empty list
721
  """
 
 
 
 
 
 
 
722
 
723
  if not args and not kwargs:
724
  raise TypeError(
 
776
 
777
  for attribute_name, value in kwargs.items():
778
  # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
779
+ attribute_name = _whitelisted.get(attribute_name, attribute_name)
780
  attributes[attribute_name] = value
781
 
782
  # It's easier and faster to build a selector than traversing the tree