- Add pre-compiled XPath text selector
Browse files
- scrapling/parser.py +23 -19
scrapling/parser.py
CHANGED
|
@@ -58,6 +58,7 @@ _find_all_elements = XPath(".//*")
|
|
| 58 |
_find_all_elements_with_spaces = XPath(
|
| 59 |
".//*[normalize-space(text())]"
|
| 60 |
) # This selector gets all elements with text content
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
class Selector(SelectorsGeneration):
|
|
@@ -299,28 +300,31 @@ class Selector(SelectorsGeneration):
|
|
| 299 |
|
| 300 |
ignored_elements: set[Any] = set()
|
| 301 |
if ignore_tags:
|
| 302 |
-
|
| 303 |
-
ignored_elements.add(element)
|
| 304 |
-
ignored_elements.update(cast(list, _find_all_elements(element)))
|
| 305 |
|
| 306 |
_all_strings = []
|
| 307 |
|
| 308 |
-
def append_text(text:
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
if
|
| 316 |
-
return
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
return cast(TextHandler, TextHandler(separator).join(_all_strings))
|
| 326 |
|
|
|
|
| 58 |
_find_all_elements_with_spaces = XPath(
|
| 59 |
".//*[normalize-space(text())]"
|
| 60 |
) # This selector gets all elements with text content
|
| 61 |
+
_find_all_text_nodes = XPath(".//text()")
|
| 62 |
|
| 63 |
|
| 64 |
class Selector(SelectorsGeneration):
|
|
|
|
| 300 |
|
| 301 |
ignored_elements: set[Any] = set()
|
| 302 |
if ignore_tags:
|
| 303 |
+
ignored_elements.update(self._root.iter(*ignore_tags))
|
|
|
|
|
|
|
| 304 |
|
| 305 |
_all_strings = []
|
| 306 |
|
| 307 |
+
def append_text(text: str) -> None:
|
| 308 |
+
processed_text = text.strip() if strip else text
|
| 309 |
+
if not valid_values or processed_text.strip():
|
| 310 |
+
_all_strings.append(processed_text)
|
| 311 |
+
|
| 312 |
+
def is_visible_text_node(text_node: _ElementUnicodeResult) -> bool:
|
| 313 |
+
parent = text_node.getparent()
|
| 314 |
+
if parent is None:
|
| 315 |
+
return False
|
| 316 |
+
|
| 317 |
+
owner = parent.getparent() if text_node.is_tail else parent
|
| 318 |
+
while owner is not None:
|
| 319 |
+
if owner in ignored_elements:
|
| 320 |
+
return False
|
| 321 |
+
owner = owner.getparent()
|
| 322 |
+
return True
|
| 323 |
+
|
| 324 |
+
for text_node in cast(list[_ElementUnicodeResult], _find_all_text_nodes(self._root)):
|
| 325 |
+
text = str(text_node)
|
| 326 |
+
if text and is_visible_text_node(text_node):
|
| 327 |
+
append_text(text)
|
| 328 |
|
| 329 |
return cast(TextHandler, TextHandler(separator).join(_all_strings))
|
| 330 |
|