Karim shoair committed on
Commit ·
b6969b2
1
Parent(s): e7d399c
feat(DynamicSession): New option to add extra browser flags
Browse files
docs/fetching/dynamic.md
CHANGED
|
@@ -89,6 +89,7 @@ Scrapling provides many options with this fetcher and its session classes. To ma
|
|
| 89 |
| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
|
| 90 |
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
|
| 91 |
| user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions** | ✔️ |
|
|
|
|
| 92 |
| additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
|
| 93 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
| 94 |
|
|
|
|
| 89 |
| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
|
| 90 |
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
|
| 91 |
| user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions** | ✔️ |
|
| 92 |
+
| extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ |
|
| 93 |
| additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
|
| 94 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
| 95 |
|
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -276,6 +276,7 @@ class DynamicSessionMixin:
|
|
| 276 |
self.wait_selector = config.wait_selector
|
| 277 |
self.init_script = config.init_script
|
| 278 |
self.wait_selector_state = config.wait_selector_state
|
|
|
|
| 279 |
self.selector_config = config.selector_config
|
| 280 |
self.additional_args = config.additional_args
|
| 281 |
self.page_action = config.page_action
|
|
@@ -300,6 +301,7 @@ class DynamicSessionMixin:
|
|
| 300 |
self.stealth,
|
| 301 |
self.hide_canvas,
|
| 302 |
self.disable_webgl,
|
|
|
|
| 303 |
)
|
| 304 |
)
|
| 305 |
self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
|
|
|
|
| 276 |
self.wait_selector = config.wait_selector
|
| 277 |
self.init_script = config.init_script
|
| 278 |
self.wait_selector_state = config.wait_selector_state
|
| 279 |
+
self.extra_flags = config.extra_flags
|
| 280 |
self.selector_config = config.selector_config
|
| 281 |
self.additional_args = config.additional_args
|
| 282 |
self.page_action = config.page_action
|
|
|
|
| 301 |
self.stealth,
|
| 302 |
self.hide_canvas,
|
| 303 |
self.disable_webgl,
|
| 304 |
+
tuple(self.extra_flags) if self.extra_flags else tuple(),
|
| 305 |
)
|
| 306 |
)
|
| 307 |
self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
|
scrapling/engines/_browsers/_config_tools.py
CHANGED
|
@@ -70,12 +70,17 @@ def _launch_kwargs(
|
|
| 70 |
stealth,
|
| 71 |
hide_canvas,
|
| 72 |
disable_webgl,
|
|
|
|
| 73 |
) -> Tuple:
|
| 74 |
"""Creates the arguments we will use while launching playwright's browser"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
launch_kwargs = {
|
| 76 |
"locale": locale,
|
| 77 |
"headless": headless,
|
| 78 |
-
"args":
|
| 79 |
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
| 80 |
"proxy": proxy or tuple(),
|
| 81 |
"device_scale_factor": 2,
|
|
@@ -85,9 +90,10 @@ def _launch_kwargs(
|
|
| 85 |
"user_agent": useragent or __default_useragent__,
|
| 86 |
}
|
| 87 |
if stealth:
|
|
|
|
| 88 |
launch_kwargs.update(
|
| 89 |
{
|
| 90 |
-
"args":
|
| 91 |
"chromium_sandbox": True,
|
| 92 |
"is_mobile": False,
|
| 93 |
"has_touch": False,
|
|
|
|
| 70 |
stealth,
|
| 71 |
hide_canvas,
|
| 72 |
disable_webgl,
|
| 73 |
+
extra_flags: Tuple,
|
| 74 |
) -> Tuple:
|
| 75 |
"""Creates the arguments we will use while launching playwright's browser"""
|
| 76 |
+
base_args = DEFAULT_FLAGS
|
| 77 |
+
if extra_flags:
|
| 78 |
+
base_args = base_args + extra_flags
|
| 79 |
+
|
| 80 |
launch_kwargs = {
|
| 81 |
"locale": locale,
|
| 82 |
"headless": headless,
|
| 83 |
+
"args": base_args,
|
| 84 |
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
| 85 |
"proxy": proxy or tuple(),
|
| 86 |
"device_scale_factor": 2,
|
|
|
|
| 90 |
"user_agent": useragent or __default_useragent__,
|
| 91 |
}
|
| 92 |
if stealth:
|
| 93 |
+
stealth_args = base_args + _set_flags(hide_canvas, disable_webgl)
|
| 94 |
launch_kwargs.update(
|
| 95 |
{
|
| 96 |
+
"args": stealth_args,
|
| 97 |
"chromium_sandbox": True,
|
| 98 |
"is_mobile": False,
|
| 99 |
"has_touch": False,
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -95,6 +95,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 95 |
load_dom: bool = True,
|
| 96 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 97 |
user_data_dir: str = "",
|
|
|
|
| 98 |
selector_config: Optional[Dict] = None,
|
| 99 |
additional_args: Optional[Dict] = None,
|
| 100 |
):
|
|
@@ -124,6 +125,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 124 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 125 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 126 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
|
|
| 127 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 128 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 129 |
"""
|
|
@@ -149,6 +151,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 149 |
extra_headers=extra_headers,
|
| 150 |
wait_selector=wait_selector,
|
| 151 |
disable_webgl=disable_webgl,
|
|
|
|
| 152 |
selector_config=selector_config,
|
| 153 |
additional_args=additional_args,
|
| 154 |
disable_resources=disable_resources,
|
|
@@ -306,6 +309,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 306 |
load_dom: bool = True,
|
| 307 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 308 |
user_data_dir: str = "",
|
|
|
|
| 309 |
selector_config: Optional[Dict] = None,
|
| 310 |
additional_args: Optional[Dict] = None,
|
| 311 |
):
|
|
@@ -336,6 +340,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 336 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 337 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 338 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
|
|
| 339 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 340 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 341 |
"""
|
|
@@ -362,6 +367,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 362 |
extra_headers=extra_headers,
|
| 363 |
wait_selector=wait_selector,
|
| 364 |
disable_webgl=disable_webgl,
|
|
|
|
| 365 |
selector_config=selector_config,
|
| 366 |
additional_args=additional_args,
|
| 367 |
disable_resources=disable_resources,
|
|
|
|
| 95 |
load_dom: bool = True,
|
| 96 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 97 |
user_data_dir: str = "",
|
| 98 |
+
extra_flags: Optional[List[str]] = None,
|
| 99 |
selector_config: Optional[Dict] = None,
|
| 100 |
additional_args: Optional[Dict] = None,
|
| 101 |
):
|
|
|
|
| 125 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 126 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 127 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 128 |
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
| 129 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 130 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 131 |
"""
|
|
|
|
| 151 |
extra_headers=extra_headers,
|
| 152 |
wait_selector=wait_selector,
|
| 153 |
disable_webgl=disable_webgl,
|
| 154 |
+
extra_flags=extra_flags,
|
| 155 |
selector_config=selector_config,
|
| 156 |
additional_args=additional_args,
|
| 157 |
disable_resources=disable_resources,
|
|
|
|
| 309 |
load_dom: bool = True,
|
| 310 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 311 |
user_data_dir: str = "",
|
| 312 |
+
extra_flags: Optional[List[str]] = None,
|
| 313 |
selector_config: Optional[Dict] = None,
|
| 314 |
additional_args: Optional[Dict] = None,
|
| 315 |
):
|
|
|
|
| 340 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 341 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 342 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 343 |
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
| 344 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 345 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 346 |
"""
|
|
|
|
| 367 |
extra_headers=extra_headers,
|
| 368 |
wait_selector=wait_selector,
|
| 369 |
disable_webgl=disable_webgl,
|
| 370 |
+
extra_flags=extra_flags,
|
| 371 |
selector_config=selector_config,
|
| 372 |
additional_args=additional_args,
|
| 373 |
disable_resources=disable_resources,
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -88,6 +88,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 88 |
load_dom: bool = True
|
| 89 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 90 |
user_data_dir: str = ""
|
|
|
|
| 91 |
selector_config: Optional[Dict] = {}
|
| 92 |
additional_args: Optional[Dict] = {}
|
| 93 |
|
|
@@ -102,6 +103,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 102 |
|
| 103 |
if not self.cookies:
|
| 104 |
self.cookies = []
|
|
|
|
|
|
|
| 105 |
if not self.selector_config:
|
| 106 |
self.selector_config = {}
|
| 107 |
if not self.additional_args:
|
|
|
|
| 88 |
load_dom: bool = True
|
| 89 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 90 |
user_data_dir: str = ""
|
| 91 |
+
extra_flags: Optional[List[str]] = None
|
| 92 |
selector_config: Optional[Dict] = {}
|
| 93 |
additional_args: Optional[Dict] = {}
|
| 94 |
|
|
|
|
| 103 |
|
| 104 |
if not self.cookies:
|
| 105 |
self.cookies = []
|
| 106 |
+
if not self.extra_flags:
|
| 107 |
+
self.extra_flags = []
|
| 108 |
if not self.selector_config:
|
| 109 |
self.selector_config = {}
|
| 110 |
if not self.additional_args:
|
scrapling/fetchers/chrome.py
CHANGED
|
@@ -50,6 +50,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 50 |
network_idle: bool = False,
|
| 51 |
load_dom: bool = True,
|
| 52 |
wait_selector_state: SelectorWaitStates = "attached",
|
|
|
|
| 53 |
additional_args: Optional[Dict] = None,
|
| 54 |
custom_config: Optional[Dict] = None,
|
| 55 |
) -> Response:
|
|
@@ -79,6 +80,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 79 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 80 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 81 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
|
|
| 82 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 83 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 84 |
:return: A `Response` object.
|
|
@@ -108,6 +110,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 108 |
extra_headers=extra_headers,
|
| 109 |
wait_selector=wait_selector,
|
| 110 |
disable_webgl=disable_webgl,
|
|
|
|
| 111 |
additional_args=additional_args,
|
| 112 |
disable_resources=disable_resources,
|
| 113 |
wait_selector_state=wait_selector_state,
|
|
@@ -140,6 +143,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 140 |
network_idle: bool = False,
|
| 141 |
load_dom: bool = True,
|
| 142 |
wait_selector_state: SelectorWaitStates = "attached",
|
|
|
|
| 143 |
additional_args: Optional[Dict] = None,
|
| 144 |
custom_config: Optional[Dict] = None,
|
| 145 |
) -> Response:
|
|
@@ -169,6 +173,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 169 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 170 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 171 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
|
|
| 172 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 173 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 174 |
:return: A `Response` object.
|
|
@@ -199,6 +204,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 199 |
extra_headers=extra_headers,
|
| 200 |
wait_selector=wait_selector,
|
| 201 |
disable_webgl=disable_webgl,
|
|
|
|
| 202 |
additional_args=additional_args,
|
| 203 |
disable_resources=disable_resources,
|
| 204 |
wait_selector_state=wait_selector_state,
|
|
|
|
| 50 |
network_idle: bool = False,
|
| 51 |
load_dom: bool = True,
|
| 52 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 53 |
+
extra_flags: Optional[List[str]] = None,
|
| 54 |
additional_args: Optional[Dict] = None,
|
| 55 |
custom_config: Optional[Dict] = None,
|
| 56 |
) -> Response:
|
|
|
|
| 80 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 81 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 82 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 83 |
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
| 84 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 85 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 86 |
:return: A `Response` object.
|
|
|
|
| 110 |
extra_headers=extra_headers,
|
| 111 |
wait_selector=wait_selector,
|
| 112 |
disable_webgl=disable_webgl,
|
| 113 |
+
extra_flags=extra_flags,
|
| 114 |
additional_args=additional_args,
|
| 115 |
disable_resources=disable_resources,
|
| 116 |
wait_selector_state=wait_selector_state,
|
|
|
|
| 143 |
network_idle: bool = False,
|
| 144 |
load_dom: bool = True,
|
| 145 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 146 |
+
extra_flags: Optional[List[str]] = None,
|
| 147 |
additional_args: Optional[Dict] = None,
|
| 148 |
custom_config: Optional[Dict] = None,
|
| 149 |
) -> Response:
|
|
|
|
| 173 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 174 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 175 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 176 |
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
| 177 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 178 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 179 |
:return: A `Response` object.
|
|
|
|
| 204 |
extra_headers=extra_headers,
|
| 205 |
wait_selector=wait_selector,
|
| 206 |
disable_webgl=disable_webgl,
|
| 207 |
+
extra_flags=extra_flags,
|
| 208 |
additional_args=additional_args,
|
| 209 |
disable_resources=disable_resources,
|
| 210 |
wait_selector_state=wait_selector_state,
|