fix: Selector.get_all_text() doesn't get all text #167
Browse files
- Updated text extraction using recursion
- Added new unit test
- scrapling/parser.py +17 -7
- tests/parser/test_general.py +24 -0
scrapling/parser.py
CHANGED
|
@@ -304,13 +304,23 @@ class Selector(SelectorsGeneration):
|
|
| 304 |
ignored_elements.update(cast(list, _find_all_elements(element)))
|
| 305 |
|
| 306 |
_all_strings = []
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
return cast(TextHandler, TextHandler(separator).join(_all_strings))
|
| 316 |
|
|
|
|
| 304 |
ignored_elements.update(cast(list, _find_all_elements(element)))
|
| 305 |
|
| 306 |
_all_strings = []
|
| 307 |
+
|
| 308 |
+
def append_text(text: Any) -> None:
    """Collect one text fragment into ``_all_strings`` if it passes the filters.

    Reads ``strip``, ``valid_values`` and ``_all_strings`` from the
    enclosing scope.

    :param text: Candidate fragment; anything that is not a non-empty
        ``str`` is ignored.
    """
    # Reject None, empty strings and non-string objects up front.
    if not text or not isinstance(text, str):
        return

    fragment = text.strip() if strip else text

    # With ``valid_values`` enabled, whitespace-only fragments are dropped
    # even when ``strip`` is off.
    if valid_values and not fragment.strip():
        return

    _all_strings.append(fragment)
|
| 313 |
+
|
| 314 |
+
def walk(node: Any) -> None:
    """Feed every text fragment under ``node`` to ``append_text`` in document order.

    For each visited node its ``.text`` is emitted first, then each child's
    subtree followed by that child's ``.tail``. A node found in
    ``ignored_elements`` contributes neither its text nor its descendants,
    but when it is reached as a child its tail is still emitted, because the
    tail is text of the parent rather than of the ignored node.

    The traversal uses an explicit stack instead of recursion so that very
    deeply nested trees cannot exceed the interpreter recursion limit and
    raise ``RecursionError``.

    :param node: Root of the subtree to traverse; must expose ``.text``,
        ``.tail`` and iteration over its children.
    """
    # NOTE(review): every child yielded by iteration has its ``.text``
    # collected; if the tree can hold comment nodes, confirm whether their
    # text should be collected here.
    # Each entry is (node, emit_tail): emit_tail=False means "visit this
    # node", emit_tail=True means "emit this node's tail text".
    pending = [(node, False)]
    while pending:
        current, emit_tail = pending.pop()
        if emit_tail:
            append_text(current.tail)
            continue
        if current in ignored_elements:
            continue

        append_text(current.text)
        # Push children last-to-first so they pop first-to-last. Each child's
        # tail entry sits beneath its visit entry, so the tail is emitted
        # only after the child's whole subtree has been processed.
        for child in reversed(list(current)):
            pending.append((child, True))
            pending.append((child, False))
|
| 322 |
+
|
| 323 |
+
walk(self._root)
|
| 324 |
|
| 325 |
return cast(TextHandler, TextHandler(separator).join(_all_strings))
|
| 326 |
|
tests/parser/test_general.py
CHANGED
|
@@ -327,6 +327,30 @@ def test_getting_all_text(page):
|
|
| 327 |
assert page.get_all_text() != ""
|
| 328 |
|
| 329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
def test_regex_on_text(page):
|
| 331 |
"""Test regex operations on text"""
|
| 332 |
element = page.css('[data-id="1"] .price')[0]
|
|
|
|
| 327 |
assert page.get_all_text() != ""
|
| 328 |
|
| 329 |
|
| 330 |
+
def test_getting_all_text_from_nested_content():
    """Text placed between child elements must come back in document order"""
    # <main> mixes direct text (string1/3/5) with text nested one and two
    # levels down (string2, string4).
    html = """
<html>
<body>
<main>
string1
<b>string2</b>
string3
<div>
<span>string4</span>
</div>
string5
</main>
</body>
</html>
"""

    main_node = Selector(html, adaptive=False).css("main")[0]
    extracted = main_node.get_all_text("\n", strip=True)
    expected = "string1\nstring2\nstring3\nstring4\nstring5"

    assert extracted == expected
|
| 352 |
+
|
| 353 |
+
|
| 354 |
def test_regex_on_text(page):
|
| 355 |
"""Test regex operations on text"""
|
| 356 |
element = page.css('[data-id="1"] .price')[0]
|