Karim shoair committed on
Commit ·
ff3cdb9
1
Parent(s): 3db9c55
refactor(parser): optimize imports
Browse files- scrapling/parser.py +31 -28
scrapling/parser.py
CHANGED
|
@@ -7,7 +7,14 @@ from urllib.parse import urljoin
|
|
| 7 |
|
| 8 |
from cssselect import SelectorError, SelectorSyntaxError
|
| 9 |
from cssselect import parse as split_selectors
|
| 10 |
-
from lxml import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
from scrapling.core._types import (
|
| 13 |
Any,
|
|
@@ -54,7 +61,7 @@ class Selector(SelectorsGeneration):
|
|
| 54 |
url: Optional[str] = None,
|
| 55 |
encoding: str = "utf8",
|
| 56 |
huge_tree: bool = True,
|
| 57 |
-
root: Optional[
|
| 58 |
keep_comments: Optional[bool] = False,
|
| 59 |
keep_cdata: Optional[bool] = False,
|
| 60 |
adaptive: Optional[bool] = False,
|
|
@@ -105,7 +112,7 @@ class Selector(SelectorsGeneration):
|
|
| 105 |
)
|
| 106 |
|
| 107 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 108 |
-
parser =
|
| 109 |
recover=True,
|
| 110 |
remove_blank_text=True,
|
| 111 |
remove_comments=(not keep_comments),
|
|
@@ -115,7 +122,7 @@ class Selector(SelectorsGeneration):
|
|
| 115 |
default_doctype=True,
|
| 116 |
strip_cdata=(not keep_cdata),
|
| 117 |
)
|
| 118 |
-
self._root =
|
| 119 |
|
| 120 |
jsonable_text = content if isinstance(content, str) else body.decode()
|
| 121 |
if is_jsonable(jsonable_text):
|
|
@@ -123,7 +130,7 @@ class Selector(SelectorsGeneration):
|
|
| 123 |
|
| 124 |
else:
|
| 125 |
# All HTML types inherit from HtmlMixin so this to check for all at once
|
| 126 |
-
if not issubclass(type(root),
|
| 127 |
raise TypeError(
|
| 128 |
f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
|
| 129 |
)
|
|
@@ -190,7 +197,7 @@ class Selector(SelectorsGeneration):
|
|
| 190 |
# Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
|
| 191 |
@staticmethod
|
| 192 |
def _is_text_node(
|
| 193 |
-
element: Union[
|
| 194 |
) -> bool:
|
| 195 |
"""Return True if the given element is a result of a string expression
|
| 196 |
Examples:
|
|
@@ -198,11 +205,11 @@ class Selector(SelectorsGeneration):
|
|
| 198 |
CSS3 -> '::text', '::attr(attrib)'...
|
| 199 |
"""
|
| 200 |
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
| 201 |
-
return issubclass(type(element),
|
| 202 |
|
| 203 |
@staticmethod
|
| 204 |
def __content_convertor(
|
| 205 |
-
element: Union[
|
| 206 |
) -> TextHandler:
|
| 207 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 208 |
|
|
@@ -210,7 +217,7 @@ class Selector(SelectorsGeneration):
|
|
| 210 |
"""
|
| 211 |
return TextHandler(str(element))
|
| 212 |
|
| 213 |
-
def __element_convertor(self, element:
|
| 214 |
"""Used internally to convert a single HtmlElement to Selector directly without checks"""
|
| 215 |
db_instance = (
|
| 216 |
self._storage if (hasattr(self, "_storage") and self._storage) else None
|
|
@@ -228,19 +235,19 @@ class Selector(SelectorsGeneration):
|
|
| 228 |
)
|
| 229 |
|
| 230 |
def __handle_element(
|
| 231 |
-
self, element: Union[
|
| 232 |
) -> Union[TextHandler, "Selector", None]:
|
| 233 |
"""Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
|
| 234 |
if element is None:
|
| 235 |
return None
|
| 236 |
elif self._is_text_node(element):
|
| 237 |
-
#
|
| 238 |
return self.__content_convertor(element)
|
| 239 |
else:
|
| 240 |
return self.__element_convertor(element)
|
| 241 |
|
| 242 |
def __handle_elements(
|
| 243 |
-
self, result: List[Union[
|
| 244 |
) -> Union["Selectors", "TextHandlers", List]:
|
| 245 |
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
| 246 |
if not len(
|
|
@@ -332,9 +339,7 @@ class Selector(SelectorsGeneration):
|
|
| 332 |
def html_content(self) -> TextHandler:
|
| 333 |
"""Return the inner HTML code of the element"""
|
| 334 |
return TextHandler(
|
| 335 |
-
|
| 336 |
-
self._root, encoding="unicode", method="html", with_tail=False
|
| 337 |
-
)
|
| 338 |
)
|
| 339 |
|
| 340 |
body = html_content
|
|
@@ -342,7 +347,7 @@ class Selector(SelectorsGeneration):
|
|
| 342 |
def prettify(self) -> TextHandler:
|
| 343 |
"""Return a prettified version of the element's inner html-code"""
|
| 344 |
return TextHandler(
|
| 345 |
-
|
| 346 |
self._root,
|
| 347 |
encoding="unicode",
|
| 348 |
pretty_print=True,
|
|
@@ -467,10 +472,10 @@ class Selector(SelectorsGeneration):
|
|
| 467 |
# From here we start with the selecting functions
|
| 468 |
def relocate(
|
| 469 |
self,
|
| 470 |
-
element: Union[Dict,
|
| 471 |
percentage: int = 0,
|
| 472 |
selector_type: bool = False,
|
| 473 |
-
) -> Union[List[Union[
|
| 474 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 475 |
|
| 476 |
:param element: The element we want to relocate in the tree
|
|
@@ -485,7 +490,7 @@ class Selector(SelectorsGeneration):
|
|
| 485 |
if isinstance(element, self.__class__):
|
| 486 |
element = element._root
|
| 487 |
|
| 488 |
-
if issubclass(type(element),
|
| 489 |
element = _StorageTools.element_to_dict(element)
|
| 490 |
|
| 491 |
for node in self._root.xpath(".//*"):
|
|
@@ -698,8 +703,8 @@ class Selector(SelectorsGeneration):
|
|
| 698 |
except (
|
| 699 |
SelectorError,
|
| 700 |
SelectorSyntaxError,
|
| 701 |
-
|
| 702 |
-
|
| 703 |
) as e:
|
| 704 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
|
| 705 |
|
|
@@ -826,7 +831,7 @@ class Selector(SelectorsGeneration):
|
|
| 826 |
return None
|
| 827 |
|
| 828 |
def __calculate_similarity_score(
|
| 829 |
-
self, original: Dict, candidate:
|
| 830 |
) -> float:
|
| 831 |
"""Used internally to calculate a score that shows how a candidate element similar to the original one
|
| 832 |
|
|
@@ -921,9 +926,7 @@ class Selector(SelectorsGeneration):
|
|
| 921 |
)
|
| 922 |
return score
|
| 923 |
|
| 924 |
-
def save(
|
| 925 |
-
self, element: Union["Selector", html.HtmlElement], identifier: str
|
| 926 |
-
) -> None:
|
| 927 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 928 |
|
| 929 |
:param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
|
|
@@ -1004,16 +1007,16 @@ class Selector(SelectorsGeneration):
|
|
| 1004 |
|
| 1005 |
@staticmethod
|
| 1006 |
def __get_attributes(
|
| 1007 |
-
element:
|
| 1008 |
) -> Dict:
|
| 1009 |
"""Return attributes dictionary without the ignored list"""
|
| 1010 |
return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
|
| 1011 |
|
| 1012 |
def __are_alike(
|
| 1013 |
self,
|
| 1014 |
-
original:
|
| 1015 |
original_attributes: Dict,
|
| 1016 |
-
candidate:
|
| 1017 |
ignore_attributes: Union[List, Tuple],
|
| 1018 |
similarity_threshold: float,
|
| 1019 |
match_text: bool = False,
|
|
|
|
| 7 |
|
| 8 |
from cssselect import SelectorError, SelectorSyntaxError
|
| 9 |
from cssselect import parse as split_selectors
|
| 10 |
+
from lxml.html import HtmlElement, HtmlMixin, HTMLParser
|
| 11 |
+
from lxml.etree import (
|
| 12 |
+
tostring,
|
| 13 |
+
fromstring,
|
| 14 |
+
XPathError,
|
| 15 |
+
XPathEvalError,
|
| 16 |
+
_ElementUnicodeResult,
|
| 17 |
+
)
|
| 18 |
|
| 19 |
from scrapling.core._types import (
|
| 20 |
Any,
|
|
|
|
| 61 |
url: Optional[str] = None,
|
| 62 |
encoding: str = "utf8",
|
| 63 |
huge_tree: bool = True,
|
| 64 |
+
root: Optional[HtmlElement] = None,
|
| 65 |
keep_comments: Optional[bool] = False,
|
| 66 |
keep_cdata: Optional[bool] = False,
|
| 67 |
adaptive: Optional[bool] = False,
|
|
|
|
| 112 |
)
|
| 113 |
|
| 114 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 115 |
+
parser = HTMLParser(
|
| 116 |
recover=True,
|
| 117 |
remove_blank_text=True,
|
| 118 |
remove_comments=(not keep_comments),
|
|
|
|
| 122 |
default_doctype=True,
|
| 123 |
strip_cdata=(not keep_cdata),
|
| 124 |
)
|
| 125 |
+
self._root = fromstring(body, parser=parser, base_url=url)
|
| 126 |
|
| 127 |
jsonable_text = content if isinstance(content, str) else body.decode()
|
| 128 |
if is_jsonable(jsonable_text):
|
|
|
|
| 130 |
|
| 131 |
else:
|
| 132 |
# All HTML types inherit from HtmlMixin so this to check for all at once
|
| 133 |
+
if not issubclass(type(root), HtmlMixin):
|
| 134 |
raise TypeError(
|
| 135 |
f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
|
| 136 |
)
|
|
|
|
| 197 |
# Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
|
| 198 |
@staticmethod
|
| 199 |
def _is_text_node(
|
| 200 |
+
element: Union[HtmlElement, _ElementUnicodeResult],
|
| 201 |
) -> bool:
|
| 202 |
"""Return True if the given element is a result of a string expression
|
| 203 |
Examples:
|
|
|
|
| 205 |
CSS3 -> '::text', '::attr(attrib)'...
|
| 206 |
"""
|
| 207 |
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
| 208 |
+
return issubclass(type(element), _ElementUnicodeResult)
|
| 209 |
|
| 210 |
@staticmethod
|
| 211 |
def __content_convertor(
|
| 212 |
+
element: Union[HtmlElement, _ElementUnicodeResult],
|
| 213 |
) -> TextHandler:
|
| 214 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 215 |
|
|
|
|
| 217 |
"""
|
| 218 |
return TextHandler(str(element))
|
| 219 |
|
| 220 |
+
def __element_convertor(self, element: HtmlElement) -> "Selector":
|
| 221 |
"""Used internally to convert a single HtmlElement to Selector directly without checks"""
|
| 222 |
db_instance = (
|
| 223 |
self._storage if (hasattr(self, "_storage") and self._storage) else None
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
def __handle_element(
|
| 238 |
+
self, element: Union[HtmlElement, _ElementUnicodeResult]
|
| 239 |
) -> Union[TextHandler, "Selector", None]:
|
| 240 |
"""Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
|
| 241 |
if element is None:
|
| 242 |
return None
|
| 243 |
elif self._is_text_node(element):
|
| 244 |
+
# `_ElementUnicodeResult` basically inherit from `str` so it's fine
|
| 245 |
return self.__content_convertor(element)
|
| 246 |
else:
|
| 247 |
return self.__element_convertor(element)
|
| 248 |
|
| 249 |
def __handle_elements(
|
| 250 |
+
self, result: List[Union[HtmlElement, _ElementUnicodeResult]]
|
| 251 |
) -> Union["Selectors", "TextHandlers", List]:
|
| 252 |
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
| 253 |
if not len(
|
|
|
|
| 339 |
def html_content(self) -> TextHandler:
|
| 340 |
"""Return the inner HTML code of the element"""
|
| 341 |
return TextHandler(
|
| 342 |
+
tostring(self._root, encoding="unicode", method="html", with_tail=False)
|
|
|
|
|
|
|
| 343 |
)
|
| 344 |
|
| 345 |
body = html_content
|
|
|
|
| 347 |
def prettify(self) -> TextHandler:
|
| 348 |
"""Return a prettified version of the element's inner html-code"""
|
| 349 |
return TextHandler(
|
| 350 |
+
tostring(
|
| 351 |
self._root,
|
| 352 |
encoding="unicode",
|
| 353 |
pretty_print=True,
|
|
|
|
| 472 |
# From here we start with the selecting functions
|
| 473 |
def relocate(
|
| 474 |
self,
|
| 475 |
+
element: Union[Dict, HtmlElement, "Selector"],
|
| 476 |
percentage: int = 0,
|
| 477 |
selector_type: bool = False,
|
| 478 |
+
) -> Union[List[Union[HtmlElement, None]], "Selectors"]:
|
| 479 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 480 |
|
| 481 |
:param element: The element we want to relocate in the tree
|
|
|
|
| 490 |
if isinstance(element, self.__class__):
|
| 491 |
element = element._root
|
| 492 |
|
| 493 |
+
if issubclass(type(element), HtmlElement):
|
| 494 |
element = _StorageTools.element_to_dict(element)
|
| 495 |
|
| 496 |
for node in self._root.xpath(".//*"):
|
|
|
|
| 703 |
except (
|
| 704 |
SelectorError,
|
| 705 |
SelectorSyntaxError,
|
| 706 |
+
XPathError,
|
| 707 |
+
XPathEvalError,
|
| 708 |
) as e:
|
| 709 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
|
| 710 |
|
|
|
|
| 831 |
return None
|
| 832 |
|
| 833 |
def __calculate_similarity_score(
|
| 834 |
+
self, original: Dict, candidate: HtmlElement
|
| 835 |
) -> float:
|
| 836 |
"""Used internally to calculate a score that shows how a candidate element similar to the original one
|
| 837 |
|
|
|
|
| 926 |
)
|
| 927 |
return score
|
| 928 |
|
| 929 |
+
def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
|
|
|
|
|
|
|
| 930 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 931 |
|
| 932 |
:param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
|
|
|
|
| 1007 |
|
| 1008 |
@staticmethod
|
| 1009 |
def __get_attributes(
|
| 1010 |
+
element: HtmlElement, ignore_attributes: Union[List, Tuple]
|
| 1011 |
) -> Dict:
|
| 1012 |
"""Return attributes dictionary without the ignored list"""
|
| 1013 |
return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
|
| 1014 |
|
| 1015 |
def __are_alike(
|
| 1016 |
self,
|
| 1017 |
+
original: HtmlElement,
|
| 1018 |
original_attributes: Dict,
|
| 1019 |
+
candidate: HtmlElement,
|
| 1020 |
ignore_attributes: Union[List, Tuple],
|
| 1021 |
similarity_threshold: float,
|
| 1022 |
match_text: bool = False,
|