Spaces:

lenson78
/

Scrapling

Paused

mph committed on Mar 6

Commit

c9a1787

1 Parent(s): a0dac68

fix: Selector.get_all_text() doesn't get all text #167

- Updated text extraction using recursion
- Added new unit test

Files changed (2) hide show

scrapling/parser.py CHANGED Viewed

@@ -304,13 +304,23 @@ class Selector(SelectorsGeneration):
                 ignored_elements.update(cast(list, _find_all_elements(element)))
         _all_strings = []
-        for node in self._root.iter():
-            if node not in ignored_elements:
-                text = node.text
-                if text and isinstance(text, str):
-                    processed_text = text.strip() if strip else text
-                    if not valid_values or processed_text.strip():
-                        _all_strings.append(processed_text)
         return cast(TextHandler, TextHandler(separator).join(_all_strings))

                 ignored_elements.update(cast(list, _find_all_elements(element)))
         _all_strings = []
+        def append_text(text: Any) -> None:
+            if text and isinstance(text, str):
+                processed_text = text.strip() if strip else text
+                if not valid_values or processed_text.strip():
+                    _all_strings.append(processed_text)
+        def walk(node: Any) -> None:
+            if node in ignored_elements:
+                return
+            append_text(node.text)
+            for child in node:
+                walk(child)
+                append_text(child.tail)
+        walk(self._root)
         return cast(TextHandler, TextHandler(separator).join(_all_strings))

tests/parser/test_general.py CHANGED Viewed

@@ -327,6 +327,30 @@ def test_getting_all_text(page):
     assert page.get_all_text() != ""
 def test_regex_on_text(page):
     """Test regex operations on text"""
     element = page.css('[data-id="1"] .price')[0]

     assert page.get_all_text() != ""
+def test_getting_all_text_from_nested_content():
+    """Test getting all text preserves interleaved text nodes"""
+    html = """
+    <html>
+    <body>
+        <main>
+            string1
+            <b>string2</b>
+            string3
+            <div>
+                <span>string4</span>
+            </div>
+            string5
+        </main>
+    </body>
+    </html>
+    """
+    page = Selector(html, adaptive=False)
+    node = page.css("main")[0]
+    assert node.get_all_text("\n", strip=True) == "string1\nstring2\nstring3\nstring4\nstring5"
 def test_regex_on_text(page):
     """Test regex operations on text"""
     element = page.css('[data-id="1"] .price')[0]