Karim shoair committed on
Commit
0de8025
·
1 Parent(s): 9c87ce8

feat(Dynamic fetcher): Add `additional_args` option to customize browser as in StealthyFetcher

Browse files
docs/fetching/dynamic.md CHANGED
@@ -89,6 +89,7 @@ Scrapling provides many options with this fetcher and its session classes. To ma
89
  | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
90
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
91
  | user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions** | ✔️ |
 
92
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
93
 
94
  In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, and `selector_config`.
 
89
  | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
90
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
91
  | user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions** | ✔️ |
92
+ | additional_args | Additional arguments passed to Playwright's context as extra settings; they take priority over Scrapling's own settings. | ✔️ |
93
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
94
 
95
  In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, and `selector_config`.
scrapling/engines/_browsers/_base.py CHANGED
@@ -148,6 +148,7 @@ class DynamicSessionMixin:
148
  self.init_script = config.init_script
149
  self.wait_selector_state = config.wait_selector_state
150
  self.selector_config = config.selector_config
 
151
  self.page_action = config.page_action
152
  self.user_data_dir = config.user_data_dir
153
  self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
@@ -175,6 +176,7 @@ class DynamicSessionMixin:
175
  self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
176
  self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
177
  self.launch_options["user_data_dir"] = self.user_data_dir
 
178
  self.context_options = dict()
179
  else:
180
  # while `context_options` is left to be used when cdp mode is enabled
@@ -190,6 +192,7 @@ class DynamicSessionMixin:
190
  )
191
  self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
192
  self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
 
193
 
194
 
195
  class StealthySessionMixin:
 
148
  self.init_script = config.init_script
149
  self.wait_selector_state = config.wait_selector_state
150
  self.selector_config = config.selector_config
151
+ self.additional_args = config.additional_args
152
  self.page_action = config.page_action
153
  self.user_data_dir = config.user_data_dir
154
  self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
 
176
  self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
177
  self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
178
  self.launch_options["user_data_dir"] = self.user_data_dir
179
+ self.launch_options.update(cast(Dict, self.additional_args))
180
  self.context_options = dict()
181
  else:
182
  # while `context_options` is left to be used when cdp mode is enabled
 
192
  )
193
  self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
194
  self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
195
+ self.context_options.update(cast(Dict, self.additional_args))
196
 
197
 
198
  class StealthySessionMixin:
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -99,6 +99,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
99
  wait_selector_state: SelectorWaitStates = "attached",
100
  user_data_dir: str = "",
101
  selector_config: Optional[Dict] = None,
 
102
  ):
103
  """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
104
 
@@ -127,6 +128,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
127
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
128
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
129
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
 
130
  """
131
  self.__validate__(
132
  wait=wait,
@@ -151,6 +153,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
151
  wait_selector=wait_selector,
152
  disable_webgl=disable_webgl,
153
  selector_config=selector_config,
 
154
  disable_resources=disable_resources,
155
  wait_selector_state=wait_selector_state,
156
  )
@@ -306,7 +309,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
306
  page_info.page, first_response, final_response, params.selector_config
307
  )
308
 
309
- # Close the page, to free up resources
310
  page_info.page.close()
311
  self.page_pool.pages.remove(page_info)
312
 
@@ -346,6 +349,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
346
  wait_selector_state: SelectorWaitStates = "attached",
347
  user_data_dir: str = "",
348
  selector_config: Optional[Dict] = None,
 
349
  ):
350
  """A Browser session manager with page pooling
351
 
@@ -375,6 +379,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
375
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
376
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
377
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
 
378
  """
379
 
380
  self.__validate__(
@@ -400,6 +405,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
400
  wait_selector=wait_selector,
401
  disable_webgl=disable_webgl,
402
  selector_config=selector_config,
 
403
  disable_resources=disable_resources,
404
  wait_selector_state=wait_selector_state,
405
  )
@@ -560,7 +566,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
560
  page_info.page, first_response, final_response, params.selector_config
561
  )
562
 
563
- # Close the page, to free up resources
564
  await page_info.page.close()
565
  self.page_pool.pages.remove(page_info)
566
  return response
 
99
  wait_selector_state: SelectorWaitStates = "attached",
100
  user_data_dir: str = "",
101
  selector_config: Optional[Dict] = None,
102
+ additional_args: Optional[Dict] = None,
103
  ):
104
  """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
105
 
 
128
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
129
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
130
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
131
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
132
  """
133
  self.__validate__(
134
  wait=wait,
 
153
  wait_selector=wait_selector,
154
  disable_webgl=disable_webgl,
155
  selector_config=selector_config,
156
+ additional_args=additional_args,
157
  disable_resources=disable_resources,
158
  wait_selector_state=wait_selector_state,
159
  )
 
309
  page_info.page, first_response, final_response, params.selector_config
310
  )
311
 
312
+ # Close the page to free up resources
313
  page_info.page.close()
314
  self.page_pool.pages.remove(page_info)
315
 
 
349
  wait_selector_state: SelectorWaitStates = "attached",
350
  user_data_dir: str = "",
351
  selector_config: Optional[Dict] = None,
352
+ additional_args: Optional[Dict] = None,
353
  ):
354
  """A Browser session manager with page pooling
355
 
 
379
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
380
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
381
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
382
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
383
  """
384
 
385
  self.__validate__(
 
405
  wait_selector=wait_selector,
406
  disable_webgl=disable_webgl,
407
  selector_config=selector_config,
408
+ additional_args=additional_args,
409
  disable_resources=disable_resources,
410
  wait_selector_state=wait_selector_state,
411
  )
 
566
  page_info.page, first_response, final_response, params.selector_config
567
  )
568
 
569
+ # Close the page to free up resources
570
  await page_info.page.close()
571
  self.page_pool.pages.remove(page_info)
572
  return response
scrapling/engines/_browsers/_validators.py CHANGED
@@ -89,6 +89,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
89
  wait_selector_state: SelectorWaitStates = "attached"
90
  user_data_dir: str = ""
91
  selector_config: Optional[Dict] = {}
 
92
 
93
  def __post_init__(self):
94
  """Custom validation after msgspec validation"""
@@ -103,6 +104,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
103
  self.cookies = []
104
  if not self.selector_config:
105
  self.selector_config = {}
 
 
106
 
107
  if self.init_script is not None:
108
  _validate_file_path(self.init_script)
 
89
  wait_selector_state: SelectorWaitStates = "attached"
90
  user_data_dir: str = ""
91
  selector_config: Optional[Dict] = {}
92
+ additional_args: Optional[Dict] = {}
93
 
94
  def __post_init__(self):
95
  """Custom validation after msgspec validation"""
 
104
  self.cookies = []
105
  if not self.selector_config:
106
  self.selector_config = {}
107
+ if not self.additional_args:
108
+ self.additional_args = {}
109
 
110
  if self.init_script is not None:
111
  _validate_file_path(self.init_script)
scrapling/fetchers/chrome.py CHANGED
@@ -50,6 +50,7 @@ class DynamicFetcher(BaseFetcher):
50
  network_idle: bool = False,
51
  load_dom: bool = True,
52
  wait_selector_state: SelectorWaitStates = "attached",
 
53
  custom_config: Optional[Dict] = None,
54
  ) -> Response:
55
  """Opens up a browser and do your request based on your chosen options below.
@@ -79,6 +80,7 @@ class DynamicFetcher(BaseFetcher):
79
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
80
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
81
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
 
82
  :return: A `Response` object.
83
  """
84
  if not custom_config:
@@ -106,6 +108,7 @@ class DynamicFetcher(BaseFetcher):
106
  extra_headers=extra_headers,
107
  wait_selector=wait_selector,
108
  disable_webgl=disable_webgl,
 
109
  disable_resources=disable_resources,
110
  wait_selector_state=wait_selector_state,
111
  selector_config={**cls._generate_parser_arguments(), **custom_config},
@@ -137,6 +140,7 @@ class DynamicFetcher(BaseFetcher):
137
  network_idle: bool = False,
138
  load_dom: bool = True,
139
  wait_selector_state: SelectorWaitStates = "attached",
 
140
  custom_config: Optional[Dict] = None,
141
  ) -> Response:
142
  """Opens up a browser and do your request based on your chosen options below.
@@ -166,6 +170,7 @@ class DynamicFetcher(BaseFetcher):
166
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
167
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
168
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
 
169
  :return: A `Response` object.
170
  """
171
  if not custom_config:
@@ -194,6 +199,7 @@ class DynamicFetcher(BaseFetcher):
194
  extra_headers=extra_headers,
195
  wait_selector=wait_selector,
196
  disable_webgl=disable_webgl,
 
197
  disable_resources=disable_resources,
198
  wait_selector_state=wait_selector_state,
199
  selector_config={**cls._generate_parser_arguments(), **custom_config},
 
50
  network_idle: bool = False,
51
  load_dom: bool = True,
52
  wait_selector_state: SelectorWaitStates = "attached",
53
+ additional_args: Optional[Dict] = None,
54
  custom_config: Optional[Dict] = None,
55
  ) -> Response:
56
  """Opens up a browser and do your request based on your chosen options below.
 
80
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
81
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
82
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
83
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
84
  :return: A `Response` object.
85
  """
86
  if not custom_config:
 
108
  extra_headers=extra_headers,
109
  wait_selector=wait_selector,
110
  disable_webgl=disable_webgl,
111
+ additional_args=additional_args,
112
  disable_resources=disable_resources,
113
  wait_selector_state=wait_selector_state,
114
  selector_config={**cls._generate_parser_arguments(), **custom_config},
 
140
  network_idle: bool = False,
141
  load_dom: bool = True,
142
  wait_selector_state: SelectorWaitStates = "attached",
143
+ additional_args: Optional[Dict] = None,
144
  custom_config: Optional[Dict] = None,
145
  ) -> Response:
146
  """Opens up a browser and do your request based on your chosen options below.
 
170
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
171
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
172
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
173
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
174
  :return: A `Response` object.
175
  """
176
  if not custom_config:
 
199
  extra_headers=extra_headers,
200
  wait_selector=wait_selector,
201
  disable_webgl=disable_webgl,
202
+ additional_args=additional_args,
203
  disable_resources=disable_resources,
204
  wait_selector_state=wait_selector_state,
205
  selector_config={**cls._generate_parser_arguments(), **custom_config},