mph committed on
Commit
c9a1787
·
1 Parent(s): a0dac68

fix: Selector.get_all_text() doesn't get all text #167

Browse files

- Updated text extraction using recursion
- Added new unit test

Files changed (2) hide show
  1. scrapling/parser.py +17 -7
  2. tests/parser/test_general.py +24 -0
scrapling/parser.py CHANGED
@@ -304,13 +304,23 @@ class Selector(SelectorsGeneration):
304
  ignored_elements.update(cast(list, _find_all_elements(element)))
305
 
306
  _all_strings = []
307
- for node in self._root.iter():
308
- if node not in ignored_elements:
309
- text = node.text
310
- if text and isinstance(text, str):
311
- processed_text = text.strip() if strip else text
312
- if not valid_values or processed_text.strip():
313
- _all_strings.append(processed_text)
 
 
 
 
 
 
 
 
 
 
314
 
315
  return cast(TextHandler, TextHandler(separator).join(_all_strings))
316
 
 
304
  ignored_elements.update(cast(list, _find_all_elements(element)))
305
 
306
  _all_strings = []
307
+
308
+ def append_text(text: Any) -> None:
309
+ if text and isinstance(text, str):
310
+ processed_text = text.strip() if strip else text
311
+ if not valid_values or processed_text.strip():
312
+ _all_strings.append(processed_text)
313
+
314
+ def walk(node: Any) -> None:
315
+ if node in ignored_elements:
316
+ return
317
+
318
+ append_text(node.text)
319
+ for child in node:
320
+ walk(child)
321
+ append_text(child.tail)
322
+
323
+ walk(self._root)
324
 
325
  return cast(TextHandler, TextHandler(separator).join(_all_strings))
326
 
tests/parser/test_general.py CHANGED
@@ -327,6 +327,30 @@ def test_getting_all_text(page):
327
  assert page.get_all_text() != ""
328
 
329
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  def test_regex_on_text(page):
331
  """Test regex operations on text"""
332
  element = page.css('[data-id="1"] .price')[0]
 
327
  assert page.get_all_text() != ""
328
 
329
 
330
+ def test_getting_all_text_from_nested_content():
331
+ """Test getting all text preserves interleaved text nodes"""
332
+ html = """
333
+ <html>
334
+ <body>
335
+ <main>
336
+ string1
337
+ <b>string2</b>
338
+ string3
339
+ <div>
340
+ <span>string4</span>
341
+ </div>
342
+ string5
343
+ </main>
344
+ </body>
345
+ </html>
346
+ """
347
+
348
+ page = Selector(html, adaptive=False)
349
+ node = page.css("main")[0]
350
+
351
+ assert node.get_all_text("\n", strip=True) == "string1\nstring2\nstring3\nstring4\nstring5"
352
+
353
+
354
  def test_regex_on_text(page):
355
  """Test regex operations on text"""
356
  element = page.css('[data-id="1"] .price')[0]