Karim shoair committed on
Commit ·
fd61b76
1
Parent(s): da52163
Making `find_all`/`find` methods on steroids
Browse files
Adding the ability to find elements by regex patterns and functions.
- scrapling/parser.py +51 -14
scrapling/parser.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
from difflib import SequenceMatcher
|
| 3 |
|
| 4 |
from scrapling.core.translator import HTMLTranslator
|
| 5 |
from scrapling.core.mixins import SelectorsGeneration
|
| 6 |
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 7 |
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 8 |
-
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten,
|
| 9 |
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
| 10 |
-
|
| 11 |
from lxml import etree, html
|
| 12 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 13 |
|
|
@@ -542,10 +543,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 542 |
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
| 543 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 544 |
|
| 545 |
-
def find_all(self, *args, **kwargs) -> Union['Adaptors[Adaptor]', List]:
|
| 546 |
-
"""Find elements by
|
| 547 |
|
| 548 |
-
:param args: Tag name(s), an iterable of tag names, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 549 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 550 |
:return: The `Adaptors` object of the elements or empty list
|
| 551 |
"""
|
|
@@ -560,9 +561,18 @@ class Adaptor(SelectorsGeneration):
|
|
| 560 |
if not args and not kwargs:
|
| 561 |
raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
|
| 562 |
|
| 563 |
-
tags = set()
|
| 564 |
-
selectors = []
|
| 565 |
attributes = dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
# Brace yourself for a wonderful journey!
|
| 567 |
for arg in args:
|
| 568 |
if type(arg) is str:
|
|
@@ -578,6 +588,15 @@ class Adaptor(SelectorsGeneration):
|
|
| 578 |
raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
|
| 579 |
attributes.update(arg)
|
| 580 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
else:
|
| 582 |
raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
|
| 583 |
|
|
@@ -597,14 +616,32 @@ class Adaptor(SelectorsGeneration):
|
|
| 597 |
value = value.replace('"', r'\"') # Escape double quotes in user input
|
| 598 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 599 |
selector += '[{}="{}"]'.format(key, value)
|
| 600 |
-
|
|
|
|
| 601 |
|
| 602 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 603 |
|
| 604 |
-
def find(self, *args, **kwargs) -> Union['Adaptor', None]:
|
| 605 |
-
"""Find elements by
|
| 606 |
|
| 607 |
-
:param args: Tag name(s), an iterable of tag names, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 608 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 609 |
:return: The `Adaptor` object of the element or `None` if the result didn't match
|
| 610 |
"""
|
|
@@ -882,10 +919,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 882 |
return self.__convert_results(results)
|
| 883 |
|
| 884 |
def find_by_regex(
|
| 885 |
-
self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
| 886 |
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
|
| 887 |
"""Find elements that its text content matches the input regex pattern.
|
| 888 |
-
:param query: Regex query to match
|
| 889 |
:param first_match: Return first element that matches conditions, enabled by default
|
| 890 |
:param case_sensitive: if enabled, letters case will be taken into consideration in the regex
|
| 891 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
+
import inspect
|
| 4 |
from difflib import SequenceMatcher
|
| 5 |
|
| 6 |
from scrapling.core.translator import HTMLTranslator
|
| 7 |
from scrapling.core.mixins import SelectorsGeneration
|
| 8 |
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 9 |
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 10 |
+
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
| 11 |
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
|
|
|
| 12 |
from lxml import etree, html
|
| 13 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 14 |
|
|
|
|
| 543 |
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
| 544 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 545 |
|
| 546 |
+
def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
|
| 547 |
+
"""Find elements by filters of your creations for ease..
|
| 548 |
|
| 549 |
+
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 550 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 551 |
:return: The `Adaptors` object of the elements or empty list
|
| 552 |
"""
|
|
|
|
| 561 |
if not args and not kwargs:
|
| 562 |
raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
|
| 563 |
|
|
|
|
|
|
|
| 564 |
attributes = dict()
|
| 565 |
+
tags, patterns = set(), set()
|
| 566 |
+
results, functions, selectors = [], [], []
|
| 567 |
+
|
| 568 |
+
def _search_tree(element: Adaptor, filter_function: Callable) -> None:
|
| 569 |
+
"""Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
|
| 570 |
+
if filter_function(element):
|
| 571 |
+
results.append(element)
|
| 572 |
+
|
| 573 |
+
for branch in element.children:
|
| 574 |
+
_search_tree(branch, filter_function)
|
| 575 |
+
|
| 576 |
# Brace yourself for a wonderful journey!
|
| 577 |
for arg in args:
|
| 578 |
if type(arg) is str:
|
|
|
|
| 588 |
raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
|
| 589 |
attributes.update(arg)
|
| 590 |
|
| 591 |
+
elif type(arg) is re.Pattern:
|
| 592 |
+
patterns.add(arg)
|
| 593 |
+
|
| 594 |
+
elif callable(arg):
|
| 595 |
+
if len(inspect.signature(arg).parameters) > 0:
|
| 596 |
+
functions.append(arg)
|
| 597 |
+
else:
|
| 598 |
+
raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
|
| 599 |
+
|
| 600 |
else:
|
| 601 |
raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
|
| 602 |
|
|
|
|
| 616 |
value = value.replace('"', r'\"') # Escape double quotes in user input
|
| 617 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 618 |
selector += '[{}="{}"]'.format(key, value)
|
| 619 |
+
if selector:
|
| 620 |
+
selectors.append(selector)
|
| 621 |
|
| 622 |
+
if selectors:
|
| 623 |
+
results = self.css(', '.join(selectors))
|
| 624 |
+
if results:
|
| 625 |
+
# From the results, get the ones that fulfill passed regex patterns
|
| 626 |
+
for pattern in patterns:
|
| 627 |
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
| 628 |
+
|
| 629 |
+
# From the results, get the ones that fulfill passed functions
|
| 630 |
+
for function in functions:
|
| 631 |
+
results = results.filter(function)
|
| 632 |
+
else:
|
| 633 |
+
for pattern in patterns:
|
| 634 |
+
results.extend(self.find_by_regex(pattern, first_match=False))
|
| 635 |
+
|
| 636 |
+
for function in functions:
|
| 637 |
+
_search_tree(self, function)
|
| 638 |
+
|
| 639 |
+
return self.__convert_results(results)
|
| 640 |
|
| 641 |
+
def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
|
| 642 |
+
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
| 643 |
|
| 644 |
+
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 645 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 646 |
:return: The `Adaptor` object of the element or `None` if the result didn't match
|
| 647 |
"""
|
|
|
|
| 919 |
return self.__convert_results(results)
|
| 920 |
|
| 921 |
def find_by_regex(
|
| 922 |
+
self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
| 923 |
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
|
| 924 |
"""Find elements that its text content matches the input regex pattern.
|
| 925 |
+
:param query: Regex query/pattern to match
|
| 926 |
:param first_match: Return first element that matches conditions, enabled by default
|
| 927 |
:param case_sensitive: if enabled, letters case will be taken into consideration in the regex
|
| 928 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|