Karim shoair committed on
Commit
9d5f1f4
·
1 Parent(s): 1003d98

fix(parser/find_all): Logic issues caused the passed conditions to be combined sometimes in an (AND) fashion and other times in an (OR) fashion

Browse files
Files changed (1) hide show
  1. scrapling/parser.py +8 -8
scrapling/parser.py CHANGED
@@ -19,7 +19,7 @@ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
19
  StorageSystemMixin, _StorageTools)
20
  from scrapling.core.translator import HTMLTranslator
21
  from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
22
- is_jsonable, log, lru_cache)
23
 
24
 
25
  class Adaptor(SelectorsGeneration):
@@ -285,7 +285,6 @@ class Adaptor(SelectorsGeneration):
285
  return self.__handle_element(self._root.getparent())
286
 
287
  @property
288
- @lru_cache(None, True)
289
  def below_elements(self) -> 'Adaptors[Adaptor]':
290
  """Return all elements under the current element in the DOM tree"""
291
  below = self._root.xpath('.//*')
@@ -603,12 +602,12 @@ class Adaptor(SelectorsGeneration):
603
  raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
604
  tags.update(set(arg))
605
 
606
- elif type(arg) is dict:
607
  if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
608
  raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
609
  attributes.update(arg)
610
 
611
- elif type(arg) is re.Pattern:
612
  patterns.add(arg)
613
 
614
  elif callable(arg):
@@ -629,14 +628,14 @@ class Adaptor(SelectorsGeneration):
629
  attributes[attribute_name] = value
630
 
631
  # It's easier and faster to build a selector than traversing the tree
632
- tags = tags or ['']
633
  for tag in tags:
634
  selector = tag
635
  for key, value in attributes.items():
636
  value = value.replace('"', r'\"') # Escape double quotes in user input
637
  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
638
  selector += '[{}="{}"]'.format(key, value)
639
- if selector:
640
  selectors.append(selector)
641
 
642
  if selectors:
@@ -650,12 +649,13 @@ class Adaptor(SelectorsGeneration):
650
  for function in functions:
651
  results = results.filter(function)
652
  else:
 
653
  for pattern in patterns:
654
- results.extend(self.find_by_regex(pattern, first_match=False))
655
 
656
  # Collect element if it fulfills passed function otherwise
657
  for function in functions:
658
- results.extend((results or self.below_elements).filter(function))
659
 
660
  return results
661
 
 
19
  StorageSystemMixin, _StorageTools)
20
  from scrapling.core.translator import HTMLTranslator
21
  from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
22
+ is_jsonable, log)
23
 
24
 
25
  class Adaptor(SelectorsGeneration):
 
285
  return self.__handle_element(self._root.getparent())
286
 
287
  @property
 
288
  def below_elements(self) -> 'Adaptors[Adaptor]':
289
  """Return all elements under the current element in the DOM tree"""
290
  below = self._root.xpath('.//*')
 
602
  raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
603
  tags.update(set(arg))
604
 
605
+ elif isinstance(arg, dict):
606
  if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
607
  raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
608
  attributes.update(arg)
609
 
610
+ elif isinstance(arg, re.Pattern):
611
  patterns.add(arg)
612
 
613
  elif callable(arg):
 
628
  attributes[attribute_name] = value
629
 
630
  # It's easier and faster to build a selector than traversing the tree
631
+ tags = tags or ['*']
632
  for tag in tags:
633
  selector = tag
634
  for key, value in attributes.items():
635
  value = value.replace('"', r'\"') # Escape double quotes in user input
636
  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
637
  selector += '[{}="{}"]'.format(key, value)
638
+ if selector != '*':
639
  selectors.append(selector)
640
 
641
  if selectors:
 
649
  for function in functions:
650
  results = results.filter(function)
651
  else:
652
+ results = results or self.below_elements
653
  for pattern in patterns:
654
+ results = results.filter(lambda e: e.text.re(pattern, check_match=True))
655
 
656
  # Collect element if it fulfills passed function otherwise
657
  for function in functions:
658
+ results = results.filter(function)
659
 
660
  return results
661