Karim shoair committed on
Commit ·
20efe8c
1
Parent(s): 8400659
fix(parser): Solve the ignored elements children issue while keeping speed
Browse files - scrapling/parser.py +12 -6
scrapling/parser.py
CHANGED
|
@@ -291,15 +291,21 @@ class Adaptor(SelectorsGeneration):
|
|
| 291 |
|
| 292 |
:return: A TextHandler
|
| 293 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
_all_strings = []
|
| 295 |
for node in self._root.xpath(".//*"):
|
| 296 |
-
if node
|
| 297 |
text = node.text
|
| 298 |
-
if text and
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
_all_strings.append(text if not strip else text.strip())
|
| 303 |
|
| 304 |
return TextHandler(separator.join(_all_strings))
|
| 305 |
|
|
|
|
| 291 |
|
| 292 |
:return: A TextHandler
|
| 293 |
"""
|
| 294 |
+
ignored_elements = set()
|
| 295 |
+
if ignore_tags:
|
| 296 |
+
for tag in ignore_tags:
|
| 297 |
+
for element in self._root.xpath(f".//{tag}"):
|
| 298 |
+
ignored_elements.add(element)
|
| 299 |
+
ignored_elements.update(element.xpath(".//*"))
|
| 300 |
+
|
| 301 |
_all_strings = []
|
| 302 |
for node in self._root.xpath(".//*"):
|
| 303 |
+
if node not in ignored_elements:
|
| 304 |
text = node.text
|
| 305 |
+
if text and isinstance(text, str):
|
| 306 |
+
processed_text = text.strip() if strip else text
|
| 307 |
+
if not valid_values or processed_text.strip():
|
| 308 |
+
_all_strings.append(processed_text)
|
|
|
|
| 309 |
|
| 310 |
return TextHandler(separator.join(_all_strings))
|
| 311 |
|