Karim shoair committed on
Commit
ff3cdb9
·
1 Parent(s): 3db9c55

refactor(parser): optimize imports

Browse files
Files changed (1) hide show
  1. scrapling/parser.py +31 -28
scrapling/parser.py CHANGED
@@ -7,7 +7,14 @@ from urllib.parse import urljoin
7
 
8
  from cssselect import SelectorError, SelectorSyntaxError
9
  from cssselect import parse as split_selectors
10
- from lxml import etree, html
 
 
 
 
 
 
 
11
 
12
  from scrapling.core._types import (
13
  Any,
@@ -54,7 +61,7 @@ class Selector(SelectorsGeneration):
54
  url: Optional[str] = None,
55
  encoding: str = "utf8",
56
  huge_tree: bool = True,
57
- root: Optional[html.HtmlElement] = None,
58
  keep_comments: Optional[bool] = False,
59
  keep_cdata: Optional[bool] = False,
60
  adaptive: Optional[bool] = False,
@@ -105,7 +112,7 @@ class Selector(SelectorsGeneration):
105
  )
106
 
107
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
108
- parser = html.HTMLParser(
109
  recover=True,
110
  remove_blank_text=True,
111
  remove_comments=(not keep_comments),
@@ -115,7 +122,7 @@ class Selector(SelectorsGeneration):
115
  default_doctype=True,
116
  strip_cdata=(not keep_cdata),
117
  )
118
- self._root = etree.fromstring(body, parser=parser, base_url=url)
119
 
120
  jsonable_text = content if isinstance(content, str) else body.decode()
121
  if is_jsonable(jsonable_text):
@@ -123,7 +130,7 @@ class Selector(SelectorsGeneration):
123
 
124
  else:
125
  # All HTML types inherit from HtmlMixin so this to check for all at once
126
- if not issubclass(type(root), html.HtmlMixin):
127
  raise TypeError(
128
  f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
129
  )
@@ -190,7 +197,7 @@ class Selector(SelectorsGeneration):
190
  # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
191
  @staticmethod
192
  def _is_text_node(
193
- element: Union[html.HtmlElement, etree._ElementUnicodeResult],
194
  ) -> bool:
195
  """Return True if the given element is a result of a string expression
196
  Examples:
@@ -198,11 +205,11 @@ class Selector(SelectorsGeneration):
198
  CSS3 -> '::text', '::attr(attrib)'...
199
  """
200
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
201
- return issubclass(type(element), etree._ElementUnicodeResult)
202
 
203
  @staticmethod
204
  def __content_convertor(
205
- element: Union[html.HtmlElement, etree._ElementUnicodeResult],
206
  ) -> TextHandler:
207
  """Used internally to convert a single element's text content to TextHandler directly without checks
208
 
@@ -210,7 +217,7 @@ class Selector(SelectorsGeneration):
210
  """
211
  return TextHandler(str(element))
212
 
213
- def __element_convertor(self, element: html.HtmlElement) -> "Selector":
214
  """Used internally to convert a single HtmlElement to Selector directly without checks"""
215
  db_instance = (
216
  self._storage if (hasattr(self, "_storage") and self._storage) else None
@@ -228,19 +235,19 @@ class Selector(SelectorsGeneration):
228
  )
229
 
230
  def __handle_element(
231
- self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
232
  ) -> Union[TextHandler, "Selector", None]:
233
  """Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
234
  if element is None:
235
  return None
236
  elif self._is_text_node(element):
237
- # etree._ElementUnicodeResult basically inherit from `str` so it's fine
238
  return self.__content_convertor(element)
239
  else:
240
  return self.__element_convertor(element)
241
 
242
  def __handle_elements(
243
- self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
244
  ) -> Union["Selectors", "TextHandlers", List]:
245
  """Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
246
  if not len(
@@ -332,9 +339,7 @@ class Selector(SelectorsGeneration):
332
  def html_content(self) -> TextHandler:
333
  """Return the inner HTML code of the element"""
334
  return TextHandler(
335
- etree.tostring(
336
- self._root, encoding="unicode", method="html", with_tail=False
337
- )
338
  )
339
 
340
  body = html_content
@@ -342,7 +347,7 @@ class Selector(SelectorsGeneration):
342
  def prettify(self) -> TextHandler:
343
  """Return a prettified version of the element's inner html-code"""
344
  return TextHandler(
345
- etree.tostring(
346
  self._root,
347
  encoding="unicode",
348
  pretty_print=True,
@@ -467,10 +472,10 @@ class Selector(SelectorsGeneration):
467
  # From here we start with the selecting functions
468
  def relocate(
469
  self,
470
- element: Union[Dict, html.HtmlElement, "Selector"],
471
  percentage: int = 0,
472
  selector_type: bool = False,
473
- ) -> Union[List[Union[html.HtmlElement, None]], "Selectors"]:
474
  """This function will search again for the element in the page tree, used automatically on page structure change
475
 
476
  :param element: The element we want to relocate in the tree
@@ -485,7 +490,7 @@ class Selector(SelectorsGeneration):
485
  if isinstance(element, self.__class__):
486
  element = element._root
487
 
488
- if issubclass(type(element), html.HtmlElement):
489
  element = _StorageTools.element_to_dict(element)
490
 
491
  for node in self._root.xpath(".//*"):
@@ -698,8 +703,8 @@ class Selector(SelectorsGeneration):
698
  except (
699
  SelectorError,
700
  SelectorSyntaxError,
701
- etree.XPathError,
702
- etree.XPathEvalError,
703
  ) as e:
704
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
705
 
@@ -826,7 +831,7 @@ class Selector(SelectorsGeneration):
826
  return None
827
 
828
  def __calculate_similarity_score(
829
- self, original: Dict, candidate: html.HtmlElement
830
  ) -> float:
831
  """Used internally to calculate a score that shows how a candidate element similar to the original one
832
 
@@ -921,9 +926,7 @@ class Selector(SelectorsGeneration):
921
  )
922
  return score
923
 
924
- def save(
925
- self, element: Union["Selector", html.HtmlElement], identifier: str
926
- ) -> None:
927
  """Saves the element's unique properties to the storage for retrieval and relocation later
928
 
929
  :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
@@ -1004,16 +1007,16 @@ class Selector(SelectorsGeneration):
1004
 
1005
  @staticmethod
1006
  def __get_attributes(
1007
- element: html.HtmlElement, ignore_attributes: Union[List, Tuple]
1008
  ) -> Dict:
1009
  """Return attributes dictionary without the ignored list"""
1010
  return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
1011
 
1012
  def __are_alike(
1013
  self,
1014
- original: html.HtmlElement,
1015
  original_attributes: Dict,
1016
- candidate: html.HtmlElement,
1017
  ignore_attributes: Union[List, Tuple],
1018
  similarity_threshold: float,
1019
  match_text: bool = False,
 
7
 
8
  from cssselect import SelectorError, SelectorSyntaxError
9
  from cssselect import parse as split_selectors
10
+ from lxml.html import HtmlElement, HtmlMixin, HTMLParser
11
+ from lxml.etree import (
12
+ tostring,
13
+ fromstring,
14
+ XPathError,
15
+ XPathEvalError,
16
+ _ElementUnicodeResult,
17
+ )
18
 
19
  from scrapling.core._types import (
20
  Any,
 
61
  url: Optional[str] = None,
62
  encoding: str = "utf8",
63
  huge_tree: bool = True,
64
+ root: Optional[HtmlElement] = None,
65
  keep_comments: Optional[bool] = False,
66
  keep_cdata: Optional[bool] = False,
67
  adaptive: Optional[bool] = False,
 
112
  )
113
 
114
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
115
+ parser = HTMLParser(
116
  recover=True,
117
  remove_blank_text=True,
118
  remove_comments=(not keep_comments),
 
122
  default_doctype=True,
123
  strip_cdata=(not keep_cdata),
124
  )
125
+ self._root = fromstring(body, parser=parser, base_url=url)
126
 
127
  jsonable_text = content if isinstance(content, str) else body.decode()
128
  if is_jsonable(jsonable_text):
 
130
 
131
  else:
132
  # All HTML types inherit from HtmlMixin so this to check for all at once
133
+ if not issubclass(type(root), HtmlMixin):
134
  raise TypeError(
135
  f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
136
  )
 
197
  # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
198
  @staticmethod
199
  def _is_text_node(
200
+ element: Union[HtmlElement, _ElementUnicodeResult],
201
  ) -> bool:
202
  """Return True if the given element is a result of a string expression
203
  Examples:
 
205
  CSS3 -> '::text', '::attr(attrib)'...
206
  """
207
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
208
+ return issubclass(type(element), _ElementUnicodeResult)
209
 
210
  @staticmethod
211
  def __content_convertor(
212
+ element: Union[HtmlElement, _ElementUnicodeResult],
213
  ) -> TextHandler:
214
  """Used internally to convert a single element's text content to TextHandler directly without checks
215
 
 
217
  """
218
  return TextHandler(str(element))
219
 
220
+ def __element_convertor(self, element: HtmlElement) -> "Selector":
221
  """Used internally to convert a single HtmlElement to Selector directly without checks"""
222
  db_instance = (
223
  self._storage if (hasattr(self, "_storage") and self._storage) else None
 
235
  )
236
 
237
  def __handle_element(
238
+ self, element: Union[HtmlElement, _ElementUnicodeResult]
239
  ) -> Union[TextHandler, "Selector", None]:
240
  """Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
241
  if element is None:
242
  return None
243
  elif self._is_text_node(element):
244
+ # `_ElementUnicodeResult` basically inherit from `str` so it's fine
245
  return self.__content_convertor(element)
246
  else:
247
  return self.__element_convertor(element)
248
 
249
  def __handle_elements(
250
+ self, result: List[Union[HtmlElement, _ElementUnicodeResult]]
251
  ) -> Union["Selectors", "TextHandlers", List]:
252
  """Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
253
  if not len(
 
339
  def html_content(self) -> TextHandler:
340
  """Return the inner HTML code of the element"""
341
  return TextHandler(
342
+ tostring(self._root, encoding="unicode", method="html", with_tail=False)
 
 
343
  )
344
 
345
  body = html_content
 
347
  def prettify(self) -> TextHandler:
348
  """Return a prettified version of the element's inner html-code"""
349
  return TextHandler(
350
+ tostring(
351
  self._root,
352
  encoding="unicode",
353
  pretty_print=True,
 
472
  # From here we start with the selecting functions
473
  def relocate(
474
  self,
475
+ element: Union[Dict, HtmlElement, "Selector"],
476
  percentage: int = 0,
477
  selector_type: bool = False,
478
+ ) -> Union[List[Union[HtmlElement, None]], "Selectors"]:
479
  """This function will search again for the element in the page tree, used automatically on page structure change
480
 
481
  :param element: The element we want to relocate in the tree
 
490
  if isinstance(element, self.__class__):
491
  element = element._root
492
 
493
+ if issubclass(type(element), HtmlElement):
494
  element = _StorageTools.element_to_dict(element)
495
 
496
  for node in self._root.xpath(".//*"):
 
703
  except (
704
  SelectorError,
705
  SelectorSyntaxError,
706
+ XPathError,
707
+ XPathEvalError,
708
  ) as e:
709
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
710
 
 
831
  return None
832
 
833
  def __calculate_similarity_score(
834
+ self, original: Dict, candidate: HtmlElement
835
  ) -> float:
836
  """Used internally to calculate a score that shows how a candidate element similar to the original one
837
 
 
926
  )
927
  return score
928
 
929
+ def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
 
 
930
  """Saves the element's unique properties to the storage for retrieval and relocation later
931
 
932
  :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
 
1007
 
1008
  @staticmethod
1009
  def __get_attributes(
1010
+ element: HtmlElement, ignore_attributes: Union[List, Tuple]
1011
  ) -> Dict:
1012
  """Return attributes dictionary without the ignored list"""
1013
  return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
1014
 
1015
  def __are_alike(
1016
  self,
1017
+ original: HtmlElement,
1018
  original_attributes: Dict,
1019
+ candidate: HtmlElement,
1020
  ignore_attributes: Union[List, Tuple],
1021
  similarity_threshold: float,
1022
  match_text: bool = False,