Karim shoair committed on
Commit
da52163
·
1 Parent(s): 5468f28

Adding the argument `automatch_domain` to all fetchers

Browse files
scrapling/engines/toolbelt/custom.py CHANGED
@@ -27,10 +27,11 @@ class Response:
27
  @property
28
  def adaptor(self) -> Union[Adaptor, None]:
29
  """Generate Adaptor instance from this response if possible, otherwise return None"""
 
30
  if self.content:
31
- return Adaptor(body=self.content, url=self.url, encoding=self.encoding, **self.adaptor_arguments)
32
  elif self.text:
33
- return Adaptor(text=self.text, url=self.url, encoding=self.encoding, **self.adaptor_arguments)
34
  return None
35
 
36
  def __repr__(self):
@@ -41,6 +42,7 @@ class BaseFetcher:
41
  def __init__(
42
  self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
43
  storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
 
44
  ):
45
  """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
46
  are detected and passed automatically from the Fetcher based on the response for accessibility.
@@ -53,6 +55,8 @@ class BaseFetcher:
53
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
54
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
55
  If empty, default values will be used.
 
 
56
  :param debug: Enable debug mode
57
  """
58
  # Adaptor class parameters
@@ -67,6 +71,11 @@ class BaseFetcher:
67
  )
68
  # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
69
  setup_basic_logging(level='debug' if debug else 'info')
 
 
 
 
 
70
 
71
 
72
  def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
 
27
  @property
28
  def adaptor(self) -> Union[Adaptor, None]:
29
  """Generate Adaptor instance from this response if possible, otherwise return None"""
30
+ automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
31
  if self.content:
32
+ return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
33
  elif self.text:
34
+ return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
35
  return None
36
 
37
  def __repr__(self):
 
42
  def __init__(
43
  self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
44
  storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
45
+ automatch_domain: Optional[str] = None,
46
  ):
47
  """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
48
  are detected and passed automatically from the Fetcher based on the response for accessibility.
 
55
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
56
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
57
  If empty, default values will be used.
58
+ :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
59
+ Otherwise, the domain of the request is used by default.
60
  :param debug: Enable debug mode
61
  """
62
  # Adaptor class parameters
 
71
  )
72
  # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
73
  setup_basic_logging(level='debug' if debug else 'info')
74
+ if automatch_domain:
75
+ if type(automatch_domain) is not str:
76
+ logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
77
+ else:
78
+ self.adaptor_arguments.update({'automatch_domain': automatch_domain})
79
 
80
 
81
  def check_if_engine_usable(engine: Callable) -> Union[Callable, None]: