Karim shoair committed on
Commit ·
9d5f1f4
1
Parent(s): 1003d98
fix(parser/find_all): Logic issues caused the conditions to be combined sometimes in an AND fashion and other times in an OR fashion
Browse files
- scrapling/parser.py +8 -8
scrapling/parser.py
CHANGED
|
@@ -19,7 +19,7 @@ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
|
|
| 19 |
StorageSystemMixin, _StorageTools)
|
| 20 |
from scrapling.core.translator import HTMLTranslator
|
| 21 |
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
|
| 22 |
-
is_jsonable, log
|
| 23 |
|
| 24 |
|
| 25 |
class Adaptor(SelectorsGeneration):
|
|
@@ -285,7 +285,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 285 |
return self.__handle_element(self._root.getparent())
|
| 286 |
|
| 287 |
@property
|
| 288 |
-
@lru_cache(None, True)
|
| 289 |
def below_elements(self) -> 'Adaptors[Adaptor]':
|
| 290 |
"""Return all elements under the current element in the DOM tree"""
|
| 291 |
below = self._root.xpath('.//*')
|
|
@@ -603,12 +602,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 603 |
raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
|
| 604 |
tags.update(set(arg))
|
| 605 |
|
| 606 |
-
elif
|
| 607 |
if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
|
| 608 |
raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
|
| 609 |
attributes.update(arg)
|
| 610 |
|
| 611 |
-
elif
|
| 612 |
patterns.add(arg)
|
| 613 |
|
| 614 |
elif callable(arg):
|
|
@@ -629,14 +628,14 @@ class Adaptor(SelectorsGeneration):
|
|
| 629 |
attributes[attribute_name] = value
|
| 630 |
|
| 631 |
# It's easier and faster to build a selector than traversing the tree
|
| 632 |
-
tags = tags or ['']
|
| 633 |
for tag in tags:
|
| 634 |
selector = tag
|
| 635 |
for key, value in attributes.items():
|
| 636 |
value = value.replace('"', r'\"') # Escape double quotes in user input
|
| 637 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 638 |
selector += '[{}="{}"]'.format(key, value)
|
| 639 |
-
if selector:
|
| 640 |
selectors.append(selector)
|
| 641 |
|
| 642 |
if selectors:
|
|
@@ -650,12 +649,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 650 |
for function in functions:
|
| 651 |
results = results.filter(function)
|
| 652 |
else:
|
|
|
|
| 653 |
for pattern in patterns:
|
| 654 |
-
results.
|
| 655 |
|
| 656 |
# Collect element if it fulfills passed function otherwise
|
| 657 |
for function in functions:
|
| 658 |
-
results
|
| 659 |
|
| 660 |
return results
|
| 661 |
|
|
|
|
| 19 |
StorageSystemMixin, _StorageTools)
|
| 20 |
from scrapling.core.translator import HTMLTranslator
|
| 21 |
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
|
| 22 |
+
is_jsonable, log)
|
| 23 |
|
| 24 |
|
| 25 |
class Adaptor(SelectorsGeneration):
|
|
|
|
| 285 |
return self.__handle_element(self._root.getparent())
|
| 286 |
|
| 287 |
@property
|
|
|
|
| 288 |
def below_elements(self) -> 'Adaptors[Adaptor]':
|
| 289 |
"""Return all elements under the current element in the DOM tree"""
|
| 290 |
below = self._root.xpath('.//*')
|
|
|
|
| 602 |
raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
|
| 603 |
tags.update(set(arg))
|
| 604 |
|
| 605 |
+
elif isinstance(arg, dict):
|
| 606 |
if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
|
| 607 |
raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
|
| 608 |
attributes.update(arg)
|
| 609 |
|
| 610 |
+
elif isinstance(arg, re.Pattern):
|
| 611 |
patterns.add(arg)
|
| 612 |
|
| 613 |
elif callable(arg):
|
|
|
|
| 628 |
attributes[attribute_name] = value
|
| 629 |
|
| 630 |
# It's easier and faster to build a selector than traversing the tree
|
| 631 |
+
tags = tags or ['*']
|
| 632 |
for tag in tags:
|
| 633 |
selector = tag
|
| 634 |
for key, value in attributes.items():
|
| 635 |
value = value.replace('"', r'\"') # Escape double quotes in user input
|
| 636 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 637 |
selector += '[{}="{}"]'.format(key, value)
|
| 638 |
+
if selector != '*':
|
| 639 |
selectors.append(selector)
|
| 640 |
|
| 641 |
if selectors:
|
|
|
|
| 649 |
for function in functions:
|
| 650 |
results = results.filter(function)
|
| 651 |
else:
|
| 652 |
+
results = results or self.below_elements
|
| 653 |
for pattern in patterns:
|
| 654 |
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
| 655 |
|
| 656 |
# Keep only the elements that fulfill each of the passed functions
|
| 657 |
for function in functions:
|
| 658 |
+
results = results.filter(function)
|
| 659 |
|
| 660 |
return results
|
| 661 |
|