Karim shoair committed on
Commit ·
da52163
1
Parent(s): 5468f28
Adding the argument `automatch_domain` to all fetchers
Browse files
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -27,10 +27,11 @@ class Response:
|
|
| 27 |
@property
|
| 28 |
def adaptor(self) -> Union[Adaptor, None]:
|
| 29 |
"""Generate Adaptor instance from this response if possible, otherwise return None"""
|
|
|
|
| 30 |
if self.content:
|
| 31 |
-
return Adaptor(body=self.content, url=self.url, encoding=self.encoding, **self.adaptor_arguments)
|
| 32 |
elif self.text:
|
| 33 |
-
return Adaptor(text=self.text, url=self.url, encoding=self.encoding, **self.adaptor_arguments)
|
| 34 |
return None
|
| 35 |
|
| 36 |
def __repr__(self):
|
|
@@ -41,6 +42,7 @@ class BaseFetcher:
|
|
| 41 |
def __init__(
|
| 42 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 43 |
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
|
|
|
|
| 44 |
):
|
| 45 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
| 46 |
are detected and passed automatically from the Fetcher based on the response for accessibility.
|
|
@@ -53,6 +55,8 @@ class BaseFetcher:
|
|
| 53 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 54 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 55 |
If empty, default values will be used.
|
|
|
|
|
|
|
| 56 |
:param debug: Enable debug mode
|
| 57 |
"""
|
| 58 |
# Adaptor class parameters
|
|
@@ -67,6 +71,11 @@ class BaseFetcher:
|
|
| 67 |
)
|
| 68 |
# If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
|
| 69 |
setup_basic_logging(level='debug' if debug else 'info')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|
|
|
|
| 27 |
@property
|
| 28 |
def adaptor(self) -> Union[Adaptor, None]:
|
| 29 |
"""Generate Adaptor instance from this response if possible, otherwise return None"""
|
| 30 |
+
automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
|
| 31 |
if self.content:
|
| 32 |
+
return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
|
| 33 |
elif self.text:
|
| 34 |
+
return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
|
| 35 |
return None
|
| 36 |
|
| 37 |
def __repr__(self):
|
|
|
|
| 42 |
def __init__(
|
| 43 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 44 |
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
|
| 45 |
+
automatch_domain: Optional[str] = None,
|
| 46 |
):
|
| 47 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
| 48 |
are detected and passed automatically from the Fetcher based on the response for accessibility.
|
|
|
|
| 55 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 56 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 57 |
If empty, default values will be used.
|
| 58 |
+
:param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
|
| 59 |
+
Otherwise, the domain of the request is used by default.
|
| 60 |
:param debug: Enable debug mode
|
| 61 |
"""
|
| 62 |
# Adaptor class parameters
|
|
|
|
| 71 |
)
|
| 72 |
# If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
|
| 73 |
setup_basic_logging(level='debug' if debug else 'info')
|
| 74 |
+
if automatch_domain:
|
| 75 |
+
if type(automatch_domain) is not str:
|
| 76 |
+
logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
|
| 77 |
+
else:
|
| 78 |
+
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
| 79 |
|
| 80 |
|
| 81 |
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|