Karim shoair committed on
Commit ·
1fee013
1
Parent(s): 6988daa
perf(parser): A lot of optimizations to speed things up
Browse files
- scrapling/core/custom_types.py +0 -3
- scrapling/parser.py +29 -35
scrapling/core/custom_types.py
CHANGED
|
@@ -31,9 +31,6 @@ class TextHandler(str):
|
|
| 31 |
|
| 32 |
__slots__ = ()
|
| 33 |
|
| 34 |
-
def __new__(cls, string):
|
| 35 |
-
return super().__new__(cls, str(string))
|
| 36 |
-
|
| 37 |
def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":
|
| 38 |
lst = super().__getitem__(key)
|
| 39 |
return cast(_TextHandlerType, TextHandler(lst))
|
|
|
|
| 31 |
|
| 32 |
__slots__ = ()
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":
|
| 35 |
lst = super().__getitem__(key)
|
| 36 |
return cast(_TextHandlerType, TextHandler(lst))
|
scrapling/parser.py
CHANGED
|
@@ -40,6 +40,13 @@ from scrapling.core.translator import translator as _translator
|
|
| 40 |
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
|
| 41 |
|
| 42 |
__DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
class Selector(SelectorsGeneration):
|
|
@@ -101,7 +108,7 @@ class Selector(SelectorsGeneration):
|
|
| 101 |
"Selector class needs HTML content, or root arguments to work"
|
| 102 |
)
|
| 103 |
|
| 104 |
-
self.__text =
|
| 105 |
if root is None:
|
| 106 |
if isinstance(content, str):
|
| 107 |
body = (
|
|
@@ -284,10 +291,10 @@ class Selector(SelectorsGeneration):
|
|
| 284 |
@property
|
| 285 |
def text(self) -> TextHandler:
|
| 286 |
"""Get text content of the element"""
|
| 287 |
-
if
|
| 288 |
# If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
| 289 |
# before extracting text, then keep `keep_comments` set to False while initializing the first class
|
| 290 |
-
self.__text = TextHandler(self._root.text)
|
| 291 |
return self.__text
|
| 292 |
|
| 293 |
def get_all_text(
|
|
@@ -613,20 +620,17 @@ class Selector(SelectorsGeneration):
|
|
| 613 |
)
|
| 614 |
|
| 615 |
results = []
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
)
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
auto_save,
|
| 628 |
-
percentage,
|
| 629 |
-
)
|
| 630 |
|
| 631 |
return results
|
| 632 |
except (
|
|
@@ -666,16 +670,13 @@ class Selector(SelectorsGeneration):
|
|
| 666 |
:return: `Selectors` class.
|
| 667 |
"""
|
| 668 |
try:
|
| 669 |
-
elements = self._root.xpath(selector, **kwargs)
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
)
|
| 677 |
-
else:
|
| 678 |
-
self.save(elements[0], identifier or selector)
|
| 679 |
|
| 680 |
return self.__handle_elements(elements)
|
| 681 |
elif self.__adaptive_enabled:
|
|
@@ -718,13 +719,6 @@ class Selector(SelectorsGeneration):
|
|
| 718 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 719 |
:return: The `Selectors` object of the elements or empty list
|
| 720 |
"""
|
| 721 |
-
# Attributes that are Python reserved words and can't be used directly
|
| 722 |
-
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 723 |
-
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 724 |
-
whitelisted = {
|
| 725 |
-
"class_": "class",
|
| 726 |
-
"for_": "for",
|
| 727 |
-
}
|
| 728 |
|
| 729 |
if not args and not kwargs:
|
| 730 |
raise TypeError(
|
|
@@ -782,7 +776,7 @@ class Selector(SelectorsGeneration):
|
|
| 782 |
|
| 783 |
for attribute_name, value in kwargs.items():
|
| 784 |
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
| 785 |
-
attribute_name =
|
| 786 |
attributes[attribute_name] = value
|
| 787 |
|
| 788 |
# It's easier and faster to build a selector than traversing the tree
|
|
|
|
| 40 |
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
|
| 41 |
|
| 42 |
__DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
|
| 43 |
+
# Attributes that are Python reserved words and can't be used directly
|
| 44 |
+
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 45 |
+
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 46 |
+
_whitelisted = {
|
| 47 |
+
"class_": "class",
|
| 48 |
+
"for_": "for",
|
| 49 |
+
}
|
| 50 |
|
| 51 |
|
| 52 |
class Selector(SelectorsGeneration):
|
|
|
|
| 108 |
"Selector class needs HTML content, or root arguments to work"
|
| 109 |
)
|
| 110 |
|
| 111 |
+
self.__text = None
|
| 112 |
if root is None:
|
| 113 |
if isinstance(content, str):
|
| 114 |
body = (
|
|
|
|
| 291 |
@property
|
| 292 |
def text(self) -> TextHandler:
|
| 293 |
"""Get text content of the element"""
|
| 294 |
+
if self.__text is None:
|
| 295 |
# If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
| 296 |
# before extracting text, then keep `keep_comments` set to False while initializing the first class
|
| 297 |
+
self.__text = TextHandler(self._root.text or "")
|
| 298 |
return self.__text
|
| 299 |
|
| 300 |
def get_all_text(
|
|
|
|
| 620 |
)
|
| 621 |
|
| 622 |
results = []
|
| 623 |
+
for single_selector in split_selectors(selector):
|
| 624 |
+
# I'm doing this only so the `save` function saves data correctly for combined selectors
|
| 625 |
+
# Like using the ',' to combine two different selectors that point to different elements.
|
| 626 |
+
xpath_selector = _translator.css_to_xpath(single_selector.canonical())
|
| 627 |
+
results += self.xpath(
|
| 628 |
+
xpath_selector,
|
| 629 |
+
identifier or single_selector.canonical(),
|
| 630 |
+
adaptive,
|
| 631 |
+
auto_save,
|
| 632 |
+
percentage,
|
| 633 |
+
)
|
|
|
|
|
|
|
|
|
|
| 634 |
|
| 635 |
return results
|
| 636 |
except (
|
|
|
|
| 670 |
:return: `Selectors` class.
|
| 671 |
"""
|
| 672 |
try:
|
| 673 |
+
if elements := self._root.xpath(selector, **kwargs):
|
| 674 |
+
if not self.__adaptive_enabled and auto_save:
|
| 675 |
+
log.warning(
|
| 676 |
+
"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
| 677 |
+
)
|
| 678 |
+
elif self.__adaptive_enabled and auto_save:
|
| 679 |
+
self.save(elements[0], identifier or selector)
|
|
|
|
|
|
|
|
|
|
| 680 |
|
| 681 |
return self.__handle_elements(elements)
|
| 682 |
elif self.__adaptive_enabled:
|
|
|
|
| 719 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 720 |
:return: The `Selectors` object of the elements or empty list
|
| 721 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
|
| 723 |
if not args and not kwargs:
|
| 724 |
raise TypeError(
|
|
|
|
| 776 |
|
| 777 |
for attribute_name, value in kwargs.items():
|
| 778 |
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
| 779 |
+
attribute_name = _whitelisted.get(attribute_name, attribute_name)
|
| 780 |
attributes[attribute_name] = value
|
| 781 |
|
| 782 |
# It's easier and faster to build a selector than traversing the tree
|