mph committed on
Commit
133ff8f
·
1 Parent(s): c9a1787

- Add pre-compiled XPath text selector

Browse files
Files changed (1) hide show
  1. scrapling/parser.py +23 -19
scrapling/parser.py CHANGED
@@ -58,6 +58,7 @@ _find_all_elements = XPath(".//*")
58
  _find_all_elements_with_spaces = XPath(
59
  ".//*[normalize-space(text())]"
60
  ) # This selector gets all elements with text content
 
61
 
62
 
63
  class Selector(SelectorsGeneration):
@@ -299,28 +300,31 @@ class Selector(SelectorsGeneration):
299
 
300
  ignored_elements: set[Any] = set()
301
  if ignore_tags:
302
- for element in self._root.iter(*ignore_tags):
303
- ignored_elements.add(element)
304
- ignored_elements.update(cast(list, _find_all_elements(element)))
305
 
306
  _all_strings = []
307
 
308
- def append_text(text: Any) -> None:
309
- if text and isinstance(text, str):
310
- processed_text = text.strip() if strip else text
311
- if not valid_values or processed_text.strip():
312
- _all_strings.append(processed_text)
313
-
314
- def walk(node: Any) -> None:
315
- if node in ignored_elements:
316
- return
317
-
318
- append_text(node.text)
319
- for child in node:
320
- walk(child)
321
- append_text(child.tail)
322
-
323
- walk(self._root)
 
 
 
 
 
324
 
325
  return cast(TextHandler, TextHandler(separator).join(_all_strings))
326
 
 
58
  _find_all_elements_with_spaces = XPath(
59
  ".//*[normalize-space(text())]"
60
  ) # This selector gets all elements with text content
61
+ _find_all_text_nodes = XPath(".//text()")
62
 
63
 
64
  class Selector(SelectorsGeneration):
 
300
 
301
  ignored_elements: set[Any] = set()
302
  if ignore_tags:
303
+ ignored_elements.update(self._root.iter(*ignore_tags))
 
 
304
 
305
  _all_strings = []
306
 
307
+ def append_text(text: str) -> None:
308
+ processed_text = text.strip() if strip else text
309
+ if not valid_values or processed_text.strip():
310
+ _all_strings.append(processed_text)
311
+
312
+ def is_visible_text_node(text_node: _ElementUnicodeResult) -> bool:
313
+ parent = text_node.getparent()
314
+ if parent is None:
315
+ return False
316
+
317
+ owner = parent.getparent() if text_node.is_tail else parent
318
+ while owner is not None:
319
+ if owner in ignored_elements:
320
+ return False
321
+ owner = owner.getparent()
322
+ return True
323
+
324
+ for text_node in cast(list[_ElementUnicodeResult], _find_all_text_nodes(self._root)):
325
+ text = str(text_node)
326
+ if text and is_visible_text_node(text_node):
327
+ append_text(text)
328
 
329
  return cast(TextHandler, TextHandler(separator).join(_all_strings))
330