Karim shoair committed on
Commit
b6969b2
·
1 Parent(s): e7d399c

feat(DynamicSession): New option to add extra browser flags

Browse files
docs/fetching/dynamic.md CHANGED
@@ -89,6 +89,7 @@ Scrapling provides many options with this fetcher and its session classes. To ma
89
  | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
90
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
91
  | user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions** | ✔️ |
 
92
  | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
93
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
94
 
 
89
  | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
90
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
91
  | user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions** | ✔️ |
92
+ | extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ |
93
  | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
94
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
95
 
scrapling/engines/_browsers/_base.py CHANGED
@@ -276,6 +276,7 @@ class DynamicSessionMixin:
276
  self.wait_selector = config.wait_selector
277
  self.init_script = config.init_script
278
  self.wait_selector_state = config.wait_selector_state
 
279
  self.selector_config = config.selector_config
280
  self.additional_args = config.additional_args
281
  self.page_action = config.page_action
@@ -300,6 +301,7 @@ class DynamicSessionMixin:
300
  self.stealth,
301
  self.hide_canvas,
302
  self.disable_webgl,
 
303
  )
304
  )
305
  self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
 
276
  self.wait_selector = config.wait_selector
277
  self.init_script = config.init_script
278
  self.wait_selector_state = config.wait_selector_state
279
+ self.extra_flags = config.extra_flags
280
  self.selector_config = config.selector_config
281
  self.additional_args = config.additional_args
282
  self.page_action = config.page_action
 
301
  self.stealth,
302
  self.hide_canvas,
303
  self.disable_webgl,
304
+ tuple(self.extra_flags) if self.extra_flags else tuple(),
305
  )
306
  )
307
  self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
scrapling/engines/_browsers/_config_tools.py CHANGED
@@ -70,12 +70,17 @@ def _launch_kwargs(
70
  stealth,
71
  hide_canvas,
72
  disable_webgl,
 
73
  ) -> Tuple:
74
  """Creates the arguments we will use while launching playwright's browser"""
 
 
 
 
75
  launch_kwargs = {
76
  "locale": locale,
77
  "headless": headless,
78
- "args": DEFAULT_FLAGS,
79
  "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
80
  "proxy": proxy or tuple(),
81
  "device_scale_factor": 2,
@@ -85,9 +90,10 @@ def _launch_kwargs(
85
  "user_agent": useragent or __default_useragent__,
86
  }
87
  if stealth:
 
88
  launch_kwargs.update(
89
  {
90
- "args": DEFAULT_FLAGS + _set_flags(hide_canvas, disable_webgl),
91
  "chromium_sandbox": True,
92
  "is_mobile": False,
93
  "has_touch": False,
 
70
  stealth,
71
  hide_canvas,
72
  disable_webgl,
73
+ extra_flags: Tuple,
74
  ) -> Tuple:
75
  """Creates the arguments we will use while launching playwright's browser"""
76
+ base_args = DEFAULT_FLAGS
77
+ if extra_flags:
78
+ base_args = base_args + extra_flags
79
+
80
  launch_kwargs = {
81
  "locale": locale,
82
  "headless": headless,
83
+ "args": base_args,
84
  "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
85
  "proxy": proxy or tuple(),
86
  "device_scale_factor": 2,
 
90
  "user_agent": useragent or __default_useragent__,
91
  }
92
  if stealth:
93
+ stealth_args = base_args + _set_flags(hide_canvas, disable_webgl)
94
  launch_kwargs.update(
95
  {
96
+ "args": stealth_args,
97
  "chromium_sandbox": True,
98
  "is_mobile": False,
99
  "has_touch": False,
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -95,6 +95,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
95
  load_dom: bool = True,
96
  wait_selector_state: SelectorWaitStates = "attached",
97
  user_data_dir: str = "",
 
98
  selector_config: Optional[Dict] = None,
99
  additional_args: Optional[Dict] = None,
100
  ):
@@ -124,6 +125,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
124
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
125
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
126
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
 
127
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
128
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
129
  """
@@ -149,6 +151,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
149
  extra_headers=extra_headers,
150
  wait_selector=wait_selector,
151
  disable_webgl=disable_webgl,
 
152
  selector_config=selector_config,
153
  additional_args=additional_args,
154
  disable_resources=disable_resources,
@@ -306,6 +309,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
306
  load_dom: bool = True,
307
  wait_selector_state: SelectorWaitStates = "attached",
308
  user_data_dir: str = "",
 
309
  selector_config: Optional[Dict] = None,
310
  additional_args: Optional[Dict] = None,
311
  ):
@@ -336,6 +340,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
336
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
337
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
338
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
 
339
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
340
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
341
  """
@@ -362,6 +367,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
362
  extra_headers=extra_headers,
363
  wait_selector=wait_selector,
364
  disable_webgl=disable_webgl,
 
365
  selector_config=selector_config,
366
  additional_args=additional_args,
367
  disable_resources=disable_resources,
 
95
  load_dom: bool = True,
96
  wait_selector_state: SelectorWaitStates = "attached",
97
  user_data_dir: str = "",
98
+ extra_flags: Optional[List[str]] = None,
99
  selector_config: Optional[Dict] = None,
100
  additional_args: Optional[Dict] = None,
101
  ):
 
125
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
126
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
127
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
128
+ :param extra_flags: A list of additional browser flags to pass to the browser on launch.
129
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
130
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
131
  """
 
151
  extra_headers=extra_headers,
152
  wait_selector=wait_selector,
153
  disable_webgl=disable_webgl,
154
+ extra_flags=extra_flags,
155
  selector_config=selector_config,
156
  additional_args=additional_args,
157
  disable_resources=disable_resources,
 
309
  load_dom: bool = True,
310
  wait_selector_state: SelectorWaitStates = "attached",
311
  user_data_dir: str = "",
312
+ extra_flags: Optional[List[str]] = None,
313
  selector_config: Optional[Dict] = None,
314
  additional_args: Optional[Dict] = None,
315
  ):
 
340
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
341
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
342
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
343
+ :param extra_flags: A list of additional browser flags to pass to the browser on launch.
344
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
345
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
346
  """
 
367
  extra_headers=extra_headers,
368
  wait_selector=wait_selector,
369
  disable_webgl=disable_webgl,
370
+ extra_flags=extra_flags,
371
  selector_config=selector_config,
372
  additional_args=additional_args,
373
  disable_resources=disable_resources,
scrapling/engines/_browsers/_validators.py CHANGED
@@ -88,6 +88,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
88
  load_dom: bool = True
89
  wait_selector_state: SelectorWaitStates = "attached"
90
  user_data_dir: str = ""
 
91
  selector_config: Optional[Dict] = {}
92
  additional_args: Optional[Dict] = {}
93
 
@@ -102,6 +103,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
102
 
103
  if not self.cookies:
104
  self.cookies = []
 
 
105
  if not self.selector_config:
106
  self.selector_config = {}
107
  if not self.additional_args:
 
88
  load_dom: bool = True
89
  wait_selector_state: SelectorWaitStates = "attached"
90
  user_data_dir: str = ""
91
+ extra_flags: Optional[List[str]] = None
92
  selector_config: Optional[Dict] = {}
93
  additional_args: Optional[Dict] = {}
94
 
 
103
 
104
  if not self.cookies:
105
  self.cookies = []
106
+ if not self.extra_flags:
107
+ self.extra_flags = []
108
  if not self.selector_config:
109
  self.selector_config = {}
110
  if not self.additional_args:
scrapling/fetchers/chrome.py CHANGED
@@ -50,6 +50,7 @@ class DynamicFetcher(BaseFetcher):
50
  network_idle: bool = False,
51
  load_dom: bool = True,
52
  wait_selector_state: SelectorWaitStates = "attached",
 
53
  additional_args: Optional[Dict] = None,
54
  custom_config: Optional[Dict] = None,
55
  ) -> Response:
@@ -79,6 +80,7 @@ class DynamicFetcher(BaseFetcher):
79
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
80
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
81
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
 
82
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
83
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
84
  :return: A `Response` object.
@@ -108,6 +110,7 @@ class DynamicFetcher(BaseFetcher):
108
  extra_headers=extra_headers,
109
  wait_selector=wait_selector,
110
  disable_webgl=disable_webgl,
 
111
  additional_args=additional_args,
112
  disable_resources=disable_resources,
113
  wait_selector_state=wait_selector_state,
@@ -140,6 +143,7 @@ class DynamicFetcher(BaseFetcher):
140
  network_idle: bool = False,
141
  load_dom: bool = True,
142
  wait_selector_state: SelectorWaitStates = "attached",
 
143
  additional_args: Optional[Dict] = None,
144
  custom_config: Optional[Dict] = None,
145
  ) -> Response:
@@ -169,6 +173,7 @@ class DynamicFetcher(BaseFetcher):
169
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
170
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
171
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
 
172
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
173
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
174
  :return: A `Response` object.
@@ -199,6 +204,7 @@ class DynamicFetcher(BaseFetcher):
199
  extra_headers=extra_headers,
200
  wait_selector=wait_selector,
201
  disable_webgl=disable_webgl,
 
202
  additional_args=additional_args,
203
  disable_resources=disable_resources,
204
  wait_selector_state=wait_selector_state,
 
50
  network_idle: bool = False,
51
  load_dom: bool = True,
52
  wait_selector_state: SelectorWaitStates = "attached",
53
+ extra_flags: Optional[List[str]] = None,
54
  additional_args: Optional[Dict] = None,
55
  custom_config: Optional[Dict] = None,
56
  ) -> Response:
 
80
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
81
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
82
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
83
+ :param extra_flags: A list of additional browser flags to pass to the browser on launch.
84
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
85
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
86
  :return: A `Response` object.
 
110
  extra_headers=extra_headers,
111
  wait_selector=wait_selector,
112
  disable_webgl=disable_webgl,
113
+ extra_flags=extra_flags,
114
  additional_args=additional_args,
115
  disable_resources=disable_resources,
116
  wait_selector_state=wait_selector_state,
 
143
  network_idle: bool = False,
144
  load_dom: bool = True,
145
  wait_selector_state: SelectorWaitStates = "attached",
146
+ extra_flags: Optional[List[str]] = None,
147
  additional_args: Optional[Dict] = None,
148
  custom_config: Optional[Dict] = None,
149
  ) -> Response:
 
173
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
174
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
175
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
176
+ :param extra_flags: A list of additional browser flags to pass to the browser on launch.
177
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
178
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
179
  :return: A `Response` object.
 
204
  extra_headers=extra_headers,
205
  wait_selector=wait_selector,
206
  disable_webgl=disable_webgl,
207
+ extra_flags=extra_flags,
208
  additional_args=additional_args,
209
  disable_resources=disable_resources,
210
  wait_selector_state=wait_selector_state,