Karim shoair committed on
Commit ·
e3e46c8
1
Parent(s): 11165c4
style(parser): optimize selectors instances creation
Browse files - scrapling/parser.py +10 -12
scrapling/parser.py
CHANGED
|
@@ -253,14 +253,14 @@ class Selector(SelectorsGeneration):
|
|
| 253 |
if not len(
|
| 254 |
result
|
| 255 |
): # Lxml will give a warning if I used something like `not result`
|
| 256 |
-
return Selectors(
|
| 257 |
|
| 258 |
# From within the code, this method will always get a list of the same type,
|
| 259 |
# so we will continue without checks for a slight performance boost
|
| 260 |
if self._is_text_node(result[0]):
|
| 261 |
return TextHandlers(list(map(self.__content_convertor, result)))
|
| 262 |
|
| 263 |
-
return Selectors(
|
| 264 |
|
| 265 |
def __getstate__(self) -> Any:
|
| 266 |
# lxml don't like it :)
|
|
@@ -378,11 +378,9 @@ class Selector(SelectorsGeneration):
|
|
| 378 |
def children(self) -> "Selectors[Selector]":
|
| 379 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 380 |
return Selectors(
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
if type(child) not in html_forbidden
|
| 385 |
-
]
|
| 386 |
)
|
| 387 |
|
| 388 |
@property
|
|
@@ -390,9 +388,9 @@ class Selector(SelectorsGeneration):
|
|
| 390 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 391 |
if self.parent:
|
| 392 |
return Selectors(
|
| 393 |
-
|
| 394 |
)
|
| 395 |
-
return Selectors(
|
| 396 |
|
| 397 |
def iterancestors(self) -> Generator["Selector", None, None]:
|
| 398 |
"""Return a generator that loops over all ancestors of the element, starting with the element's parent."""
|
|
@@ -734,7 +732,7 @@ class Selector(SelectorsGeneration):
|
|
| 734 |
|
| 735 |
attributes = dict()
|
| 736 |
tags, patterns = set(), set()
|
| 737 |
-
results, functions, selectors = Selectors(
|
| 738 |
|
| 739 |
# Brace yourself for a wonderful journey!
|
| 740 |
for arg in args:
|
|
@@ -1134,7 +1132,7 @@ class Selector(SelectorsGeneration):
|
|
| 1134 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1135 |
"""
|
| 1136 |
|
| 1137 |
-
results = Selectors(
|
| 1138 |
if not case_sensitive:
|
| 1139 |
text = text.lower()
|
| 1140 |
|
|
@@ -1178,7 +1176,7 @@ class Selector(SelectorsGeneration):
|
|
| 1178 |
:param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
|
| 1179 |
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
|
| 1180 |
"""
|
| 1181 |
-
results = Selectors(
|
| 1182 |
|
| 1183 |
# This selector gets all elements with text content
|
| 1184 |
for node in self.__handle_elements(
|
|
|
|
| 253 |
if not len(
|
| 254 |
result
|
| 255 |
): # Lxml will give a warning if I used something like `not result`
|
| 256 |
+
return Selectors()
|
| 257 |
|
| 258 |
# From within the code, this method will always get a list of the same type,
|
| 259 |
# so we will continue without checks for a slight performance boost
|
| 260 |
if self._is_text_node(result[0]):
|
| 261 |
return TextHandlers(list(map(self.__content_convertor, result)))
|
| 262 |
|
| 263 |
+
return Selectors(map(self.__element_convertor, result))
|
| 264 |
|
| 265 |
def __getstate__(self) -> Any:
|
| 266 |
# lxml don't like it :)
|
|
|
|
| 378 |
def children(self) -> "Selectors[Selector]":
|
| 379 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 380 |
return Selectors(
|
| 381 |
+
self.__element_convertor(child)
|
| 382 |
+
for child in self._root.iterchildren()
|
| 383 |
+
if type(child) not in html_forbidden
|
|
|
|
|
|
|
| 384 |
)
|
| 385 |
|
| 386 |
@property
|
|
|
|
| 388 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 389 |
if self.parent:
|
| 390 |
return Selectors(
|
| 391 |
+
child for child in self.parent.children if child._root != self._root
|
| 392 |
)
|
| 393 |
+
return Selectors()
|
| 394 |
|
| 395 |
def iterancestors(self) -> Generator["Selector", None, None]:
|
| 396 |
"""Return a generator that loops over all ancestors of the element, starting with the element's parent."""
|
|
|
|
| 732 |
|
| 733 |
attributes = dict()
|
| 734 |
tags, patterns = set(), set()
|
| 735 |
+
results, functions, selectors = Selectors(), [], []
|
| 736 |
|
| 737 |
# Brace yourself for a wonderful journey!
|
| 738 |
for arg in args:
|
|
|
|
| 1132 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1133 |
"""
|
| 1134 |
|
| 1135 |
+
results = Selectors()
|
| 1136 |
if not case_sensitive:
|
| 1137 |
text = text.lower()
|
| 1138 |
|
|
|
|
| 1176 |
:param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
|
| 1177 |
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
|
| 1178 |
"""
|
| 1179 |
+
results = Selectors()
|
| 1180 |
|
| 1181 |
# This selector gets all elements with text content
|
| 1182 |
for node in self.__handle_elements(
|