Karim shoair committed on
Commit
fd61b76
·
1 Parent(s): da52163

Putting the `find_all`/`find` methods on steroids

Browse files

Adding the ability to find elements by regex patterns and functions.

Files changed (1) hide show
  1. scrapling/parser.py +51 -14
scrapling/parser.py CHANGED
@@ -1,13 +1,14 @@
1
  import os
 
 
2
  from difflib import SequenceMatcher
3
 
4
  from scrapling.core.translator import HTMLTranslator
5
  from scrapling.core.mixins import SelectorsGeneration
6
  from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
7
  from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
8
- from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, _is_iterable, html_forbidden
9
  from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
10
-
11
  from lxml import etree, html
12
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
13
 
@@ -542,10 +543,10 @@ class Adaptor(SelectorsGeneration):
542
  except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
543
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
544
 
545
- def find_all(self, *args, **kwargs) -> Union['Adaptors[Adaptor]', List]:
546
- """Find elements by their tag name and filter them based on attributes for ease..
547
 
548
- :param args: Tag name(s), an iterable of tag names, or a dictionary of elements' attributes. Leave empty for selecting all.
549
  :param kwargs: The attributes you want to filter elements based on it.
550
  :return: The `Adaptors` object of the elements or empty list
551
  """
@@ -560,9 +561,18 @@ class Adaptor(SelectorsGeneration):
560
  if not args and not kwargs:
561
  raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
562
 
563
- tags = set()
564
- selectors = []
565
  attributes = dict()
 
 
 
 
 
 
 
 
 
 
 
566
  # Brace yourself for a wonderful journey!
567
  for arg in args:
568
  if type(arg) is str:
@@ -578,6 +588,15 @@ class Adaptor(SelectorsGeneration):
578
  raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
579
  attributes.update(arg)
580
 
 
 
 
 
 
 
 
 
 
581
  else:
582
  raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
583
 
@@ -597,14 +616,32 @@ class Adaptor(SelectorsGeneration):
597
  value = value.replace('"', r'\"') # Escape double quotes in user input
598
  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
599
  selector += '[{}="{}"]'.format(key, value)
600
- selectors.append(selector)
 
601
 
602
- return self.css(', '.join(selectors))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
603
 
604
- def find(self, *args, **kwargs) -> Union['Adaptor', None]:
605
- """Find elements by their tag name and filter them based on attributes for ease then return the first result. Otherwise return `None`.
606
 
607
- :param args: Tag name(s), an iterable of tag names, or a dictionary of elements' attributes. Leave empty for selecting all.
608
  :param kwargs: The attributes you want to filter elements based on it.
609
  :return: The `Adaptor` object of the element or `None` if the result didn't match
610
  """
@@ -882,10 +919,10 @@ class Adaptor(SelectorsGeneration):
882
  return self.__convert_results(results)
883
 
884
  def find_by_regex(
885
- self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
886
  ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
887
  """Find elements that its text content matches the input regex pattern.
888
- :param query: Regex query to match
889
  :param first_match: Return first element that matches conditions, enabled by default
890
  :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
891
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
 
1
  import os
2
+ import re
3
+ import inspect
4
  from difflib import SequenceMatcher
5
 
6
  from scrapling.core.translator import HTMLTranslator
7
  from scrapling.core.mixins import SelectorsGeneration
8
  from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
9
  from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
10
+ from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
11
  from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
 
12
  from lxml import etree, html
13
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
14
 
 
543
  except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
544
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
545
 
546
+ def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
547
+ """Find elements by filters of your creations for ease..
548
 
549
+ :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
550
  :param kwargs: The attributes you want to filter elements based on it.
551
  :return: The `Adaptors` object of the elements or empty list
552
  """
 
561
  if not args and not kwargs:
562
  raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
563
 
 
 
564
  attributes = dict()
565
+ tags, patterns = set(), set()
566
+ results, functions, selectors = [], [], []
567
+
568
+ def _search_tree(element: Adaptor, filter_function: Callable) -> None:
569
+ """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
570
+ if filter_function(element):
571
+ results.append(element)
572
+
573
+ for branch in element.children:
574
+ _search_tree(branch, filter_function)
575
+
576
  # Brace yourself for a wonderful journey!
577
  for arg in args:
578
  if type(arg) is str:
 
588
  raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
589
  attributes.update(arg)
590
 
591
+ elif type(arg) is re.Pattern:
592
+ patterns.add(arg)
593
+
594
+ elif callable(arg):
595
+ if len(inspect.signature(arg).parameters) > 0:
596
+ functions.append(arg)
597
+ else:
598
+ raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
599
+
600
  else:
601
  raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
602
 
 
616
  value = value.replace('"', r'\"') # Escape double quotes in user input
617
  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
618
  selector += '[{}="{}"]'.format(key, value)
619
+ if selector:
620
+ selectors.append(selector)
621
 
622
+ if selectors:
623
+ results = self.css(', '.join(selectors))
624
+ if results:
625
+ # From the results, get the ones that fulfill passed regex patterns
626
+ for pattern in patterns:
627
+ results = results.filter(lambda e: e.text.re(pattern, check_match=True))
628
+
629
+ # From the results, get the ones that fulfill passed functions
630
+ for function in functions:
631
+ results = results.filter(function)
632
+ else:
633
+ for pattern in patterns:
634
+ results.extend(self.find_by_regex(pattern, first_match=False))
635
+
636
+ for function in functions:
637
+ _search_tree(self, function)
638
+
639
+ return self.__convert_results(results)
640
 
641
+ def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
642
+ """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
643
 
644
+ :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
645
  :param kwargs: The attributes you want to filter elements based on it.
646
  :return: The `Adaptor` object of the element or `None` if the result didn't match
647
  """
 
919
  return self.__convert_results(results)
920
 
921
  def find_by_regex(
922
+ self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
923
  ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
924
  """Find elements that its text content matches the input regex pattern.
925
+ :param query: Regex query/pattern to match
926
  :param first_match: Return first element that matches conditions, enabled by default
927
  :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
928
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching