Karim shoair committed on
Commit ·
0de8025
1
Parent(s): 9c87ce8
feat(Dynamic fetcher): Add `additional_args` option to customize browser as in StealthyFetcher
Browse files
docs/fetching/dynamic.md
CHANGED
|
@@ -89,6 +89,7 @@ Scrapling provides many options with this fetcher and its session classes. To ma
|
|
| 89 |
| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
|
| 90 |
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
|
| 91 |
| user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only works with sessions.** | ✔️ |
|
|
|
|
| 92 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
| 93 |
|
| 94 |
In session classes, all of these arguments can be set globally for the session. You can still configure each request individually by passing the arguments that can be configured at the browser-tab level, such as: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, and `selector_config`.
|
|
|
|
| 89 |
| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
|
| 90 |
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
|
| 91 |
| user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only works with sessions.** | ✔️ |
|
| 92 |
+
| additional_args | A dictionary of extra arguments passed to Playwright's browser context as additional settings. These take priority over Scrapling's own settings. | ✔️ |
|
| 93 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
| 94 |
|
| 95 |
In session classes, all of these arguments can be set globally for the session. You can still configure each request individually by passing the arguments that can be configured at the browser-tab level, such as: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, and `selector_config`.
|
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -148,6 +148,7 @@ class DynamicSessionMixin:
|
|
| 148 |
self.init_script = config.init_script
|
| 149 |
self.wait_selector_state = config.wait_selector_state
|
| 150 |
self.selector_config = config.selector_config
|
|
|
|
| 151 |
self.page_action = config.page_action
|
| 152 |
self.user_data_dir = config.user_data_dir
|
| 153 |
self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
|
|
@@ -175,6 +176,7 @@ class DynamicSessionMixin:
|
|
| 175 |
self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
|
| 176 |
self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
|
| 177 |
self.launch_options["user_data_dir"] = self.user_data_dir
|
|
|
|
| 178 |
self.context_options = dict()
|
| 179 |
else:
|
| 180 |
# while `context_options` is left to be used when cdp mode is enabled
|
|
@@ -190,6 +192,7 @@ class DynamicSessionMixin:
|
|
| 190 |
)
|
| 191 |
self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
|
| 192 |
self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
class StealthySessionMixin:
|
|
|
|
| 148 |
self.init_script = config.init_script
|
| 149 |
self.wait_selector_state = config.wait_selector_state
|
| 150 |
self.selector_config = config.selector_config
|
| 151 |
+
self.additional_args = config.additional_args
|
| 152 |
self.page_action = config.page_action
|
| 153 |
self.user_data_dir = config.user_data_dir
|
| 154 |
self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
|
|
|
|
| 176 |
self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
|
| 177 |
self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
|
| 178 |
self.launch_options["user_data_dir"] = self.user_data_dir
|
| 179 |
+
self.launch_options.update(cast(Dict, self.additional_args))
|
| 180 |
self.context_options = dict()
|
| 181 |
else:
|
| 182 |
# while `context_options` is left to be used when cdp mode is enabled
|
|
|
|
| 192 |
)
|
| 193 |
self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
|
| 194 |
self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
|
| 195 |
+
self.context_options.update(cast(Dict, self.additional_args))
|
| 196 |
|
| 197 |
|
| 198 |
class StealthySessionMixin:
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -99,6 +99,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 99 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 100 |
user_data_dir: str = "",
|
| 101 |
selector_config: Optional[Dict] = None,
|
|
|
|
| 102 |
):
|
| 103 |
"""A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
|
| 104 |
|
|
@@ -127,6 +128,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 127 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 128 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 129 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
|
|
| 130 |
"""
|
| 131 |
self.__validate__(
|
| 132 |
wait=wait,
|
|
@@ -151,6 +153,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 151 |
wait_selector=wait_selector,
|
| 152 |
disable_webgl=disable_webgl,
|
| 153 |
selector_config=selector_config,
|
|
|
|
| 154 |
disable_resources=disable_resources,
|
| 155 |
wait_selector_state=wait_selector_state,
|
| 156 |
)
|
|
@@ -306,7 +309,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 306 |
page_info.page, first_response, final_response, params.selector_config
|
| 307 |
)
|
| 308 |
|
| 309 |
-
# Close the page
|
| 310 |
page_info.page.close()
|
| 311 |
self.page_pool.pages.remove(page_info)
|
| 312 |
|
|
@@ -346,6 +349,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 346 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 347 |
user_data_dir: str = "",
|
| 348 |
selector_config: Optional[Dict] = None,
|
|
|
|
| 349 |
):
|
| 350 |
"""A Browser session manager with page pooling
|
| 351 |
|
|
@@ -375,6 +379,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 375 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 376 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 377 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
|
|
| 378 |
"""
|
| 379 |
|
| 380 |
self.__validate__(
|
|
@@ -400,6 +405,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 400 |
wait_selector=wait_selector,
|
| 401 |
disable_webgl=disable_webgl,
|
| 402 |
selector_config=selector_config,
|
|
|
|
| 403 |
disable_resources=disable_resources,
|
| 404 |
wait_selector_state=wait_selector_state,
|
| 405 |
)
|
|
@@ -560,7 +566,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 560 |
page_info.page, first_response, final_response, params.selector_config
|
| 561 |
)
|
| 562 |
|
| 563 |
-
# Close the page
|
| 564 |
await page_info.page.close()
|
| 565 |
self.page_pool.pages.remove(page_info)
|
| 566 |
return response
|
|
|
|
| 99 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 100 |
user_data_dir: str = "",
|
| 101 |
selector_config: Optional[Dict] = None,
|
| 102 |
+
additional_args: Optional[Dict] = None,
|
| 103 |
):
|
| 104 |
"""A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
|
| 105 |
|
|
|
|
| 128 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 129 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 130 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 131 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 132 |
"""
|
| 133 |
self.__validate__(
|
| 134 |
wait=wait,
|
|
|
|
| 153 |
wait_selector=wait_selector,
|
| 154 |
disable_webgl=disable_webgl,
|
| 155 |
selector_config=selector_config,
|
| 156 |
+
additional_args=additional_args,
|
| 157 |
disable_resources=disable_resources,
|
| 158 |
wait_selector_state=wait_selector_state,
|
| 159 |
)
|
|
|
|
| 309 |
page_info.page, first_response, final_response, params.selector_config
|
| 310 |
)
|
| 311 |
|
| 312 |
+
# Close the page to free up resources
|
| 313 |
page_info.page.close()
|
| 314 |
self.page_pool.pages.remove(page_info)
|
| 315 |
|
|
|
|
| 349 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 350 |
user_data_dir: str = "",
|
| 351 |
selector_config: Optional[Dict] = None,
|
| 352 |
+
additional_args: Optional[Dict] = None,
|
| 353 |
):
|
| 354 |
"""A Browser session manager with page pooling
|
| 355 |
|
|
|
|
| 379 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 380 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 381 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 382 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 383 |
"""
|
| 384 |
|
| 385 |
self.__validate__(
|
|
|
|
| 405 |
wait_selector=wait_selector,
|
| 406 |
disable_webgl=disable_webgl,
|
| 407 |
selector_config=selector_config,
|
| 408 |
+
additional_args=additional_args,
|
| 409 |
disable_resources=disable_resources,
|
| 410 |
wait_selector_state=wait_selector_state,
|
| 411 |
)
|
|
|
|
| 566 |
page_info.page, first_response, final_response, params.selector_config
|
| 567 |
)
|
| 568 |
|
| 569 |
+
# Close the page to free up resources
|
| 570 |
await page_info.page.close()
|
| 571 |
self.page_pool.pages.remove(page_info)
|
| 572 |
return response
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -89,6 +89,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 89 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 90 |
user_data_dir: str = ""
|
| 91 |
selector_config: Optional[Dict] = {}
|
|
|
|
| 92 |
|
| 93 |
def __post_init__(self):
|
| 94 |
"""Custom validation after msgspec validation"""
|
|
@@ -103,6 +104,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 103 |
self.cookies = []
|
| 104 |
if not self.selector_config:
|
| 105 |
self.selector_config = {}
|
|
|
|
|
|
|
| 106 |
|
| 107 |
if self.init_script is not None:
|
| 108 |
_validate_file_path(self.init_script)
|
|
|
|
| 89 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 90 |
user_data_dir: str = ""
|
| 91 |
selector_config: Optional[Dict] = {}
|
| 92 |
+
additional_args: Optional[Dict] = {}
|
| 93 |
|
| 94 |
def __post_init__(self):
|
| 95 |
"""Custom validation after msgspec validation"""
|
|
|
|
| 104 |
self.cookies = []
|
| 105 |
if not self.selector_config:
|
| 106 |
self.selector_config = {}
|
| 107 |
+
if not self.additional_args:
|
| 108 |
+
self.additional_args = {}
|
| 109 |
|
| 110 |
if self.init_script is not None:
|
| 111 |
_validate_file_path(self.init_script)
|
scrapling/fetchers/chrome.py
CHANGED
|
@@ -50,6 +50,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 50 |
network_idle: bool = False,
|
| 51 |
load_dom: bool = True,
|
| 52 |
wait_selector_state: SelectorWaitStates = "attached",
|
|
|
|
| 53 |
custom_config: Optional[Dict] = None,
|
| 54 |
) -> Response:
|
| 55 |
"""Opens up a browser and do your request based on your chosen options below.
|
|
@@ -79,6 +80,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 79 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 80 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 81 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
|
|
|
| 82 |
:return: A `Response` object.
|
| 83 |
"""
|
| 84 |
if not custom_config:
|
|
@@ -106,6 +108,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 106 |
extra_headers=extra_headers,
|
| 107 |
wait_selector=wait_selector,
|
| 108 |
disable_webgl=disable_webgl,
|
|
|
|
| 109 |
disable_resources=disable_resources,
|
| 110 |
wait_selector_state=wait_selector_state,
|
| 111 |
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
|
@@ -137,6 +140,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 137 |
network_idle: bool = False,
|
| 138 |
load_dom: bool = True,
|
| 139 |
wait_selector_state: SelectorWaitStates = "attached",
|
|
|
|
| 140 |
custom_config: Optional[Dict] = None,
|
| 141 |
) -> Response:
|
| 142 |
"""Opens up a browser and do your request based on your chosen options below.
|
|
@@ -166,6 +170,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 166 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 167 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 168 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
|
|
|
| 169 |
:return: A `Response` object.
|
| 170 |
"""
|
| 171 |
if not custom_config:
|
|
@@ -194,6 +199,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 194 |
extra_headers=extra_headers,
|
| 195 |
wait_selector=wait_selector,
|
| 196 |
disable_webgl=disable_webgl,
|
|
|
|
| 197 |
disable_resources=disable_resources,
|
| 198 |
wait_selector_state=wait_selector_state,
|
| 199 |
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
|
|
|
| 50 |
network_idle: bool = False,
|
| 51 |
load_dom: bool = True,
|
| 52 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 53 |
+
additional_args: Optional[Dict] = None,
|
| 54 |
custom_config: Optional[Dict] = None,
|
| 55 |
) -> Response:
|
| 56 |
"""Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 80 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 81 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 82 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 83 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 84 |
:return: A `Response` object.
|
| 85 |
"""
|
| 86 |
if not custom_config:
|
|
|
|
| 108 |
extra_headers=extra_headers,
|
| 109 |
wait_selector=wait_selector,
|
| 110 |
disable_webgl=disable_webgl,
|
| 111 |
+
additional_args=additional_args,
|
| 112 |
disable_resources=disable_resources,
|
| 113 |
wait_selector_state=wait_selector_state,
|
| 114 |
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
|
|
|
| 140 |
network_idle: bool = False,
|
| 141 |
load_dom: bool = True,
|
| 142 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 143 |
+
additional_args: Optional[Dict] = None,
|
| 144 |
custom_config: Optional[Dict] = None,
|
| 145 |
) -> Response:
|
| 146 |
"""Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 170 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 171 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 172 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 173 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 174 |
:return: A `Response` object.
|
| 175 |
"""
|
| 176 |
if not custom_config:
|
|
|
|
| 199 |
extra_headers=extra_headers,
|
| 200 |
wait_selector=wait_selector,
|
| 201 |
disable_webgl=disable_webgl,
|
| 202 |
+
additional_args=additional_args,
|
| 203 |
disable_resources=disable_resources,
|
| 204 |
wait_selector_state=wait_selector_state,
|
| 205 |
selector_config={**cls._generate_parser_arguments(), **custom_config},
|