Karim shoair committed on
Commit
3f0dddd
·
1 Parent(s): 83d9810

feat/refactor(fetchers): Replacing PlayWrightFetcher with DynamicFetcher and adding session classes

Browse files
scrapling/core/shell.py CHANGED
@@ -30,7 +30,7 @@ from scrapling.core._types import List, Optional, Dict, Tuple, Any, Union
30
  from scrapling.fetchers import (
31
  Fetcher,
32
  AsyncFetcher,
33
- PlayWrightFetcher,
34
  StealthyFetcher,
35
  Response,
36
  )
@@ -436,7 +436,7 @@ class CustomShell:
436
  return f"""
437
  -> Available Scrapling objects:
438
  - Fetcher/AsyncFetcher
439
- - PlayWrightFetcher
440
  - StealthyFetcher
441
  - Adaptor
442
 
@@ -445,7 +445,7 @@ class CustomShell:
445
  - {"post":<30} Shortcut for `Fetcher.post`
446
  - {"put":<30} Shortcut for `Fetcher.put`
447
  - {"delete":<30} Shortcut for `Fetcher.delete`
448
- - {"fetch":<30} Shortcut for `PlayWrightFetcher.fetch`
449
  - {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`
450
 
451
  -> Useful commands
@@ -493,7 +493,7 @@ Type 'exit' or press Ctrl+D to exit.
493
  post = self.create_wrapper(Fetcher.post)
494
  put = self.create_wrapper(Fetcher.put)
495
  delete = self.create_wrapper(Fetcher.delete)
496
- dynamic_fetch = self.create_wrapper(PlayWrightFetcher.fetch)
497
  stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
498
  curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
499
 
@@ -506,7 +506,7 @@ Type 'exit' or press Ctrl+D to exit.
506
  "Fetcher": Fetcher,
507
  "AsyncFetcher": AsyncFetcher,
508
  "fetch": dynamic_fetch,
509
- "PlayWrightFetcher": PlayWrightFetcher,
510
  "stealthy_fetch": stealthy_fetch,
511
  "StealthyFetcher": StealthyFetcher,
512
  "Adaptor": Adaptor,
 
30
  from scrapling.fetchers import (
31
  Fetcher,
32
  AsyncFetcher,
33
+ DynamicFetcher,
34
  StealthyFetcher,
35
  Response,
36
  )
 
436
  return f"""
437
  -> Available Scrapling objects:
438
  - Fetcher/AsyncFetcher
439
+ - DynamicFetcher
440
  - StealthyFetcher
441
  - Adaptor
442
 
 
445
  - {"post":<30} Shortcut for `Fetcher.post`
446
  - {"put":<30} Shortcut for `Fetcher.put`
447
  - {"delete":<30} Shortcut for `Fetcher.delete`
448
+ - {"fetch":<30} Shortcut for `DynamicFetcher.fetch`
449
  - {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`
450
 
451
  -> Useful commands
 
493
  post = self.create_wrapper(Fetcher.post)
494
  put = self.create_wrapper(Fetcher.put)
495
  delete = self.create_wrapper(Fetcher.delete)
496
+ dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
497
  stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
498
  curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
499
 
 
506
  "Fetcher": Fetcher,
507
  "AsyncFetcher": AsyncFetcher,
508
  "fetch": dynamic_fetch,
509
+ "DynamicFetcher": DynamicFetcher,
510
  "stealthy_fetch": stealthy_fetch,
511
  "StealthyFetcher": StealthyFetcher,
512
  "Adaptor": Adaptor,
scrapling/engines/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
  from .camo import CamoufoxEngine
2
  from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
3
- from .pw import PlaywrightEngine
4
  from .static import FetcherSession, FetcherClient, AsyncFetcherClient
5
  from .toolbelt import check_if_engine_usable
 
6
 
7
- __all__ = ["CamoufoxEngine", "PlaywrightEngine"]
 
1
  from .camo import CamoufoxEngine
2
  from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
 
3
  from .static import FetcherSession, FetcherClient, AsyncFetcherClient
4
  from .toolbelt import check_if_engine_usable
5
+ from ._browsers import DynamicSession, AsyncDynamicSession
6
 
7
+ __all__ = ["FetcherSession", "DynamicSession", "AsyncDynamicSession"]
scrapling/engines/pw.py DELETED
@@ -1,402 +0,0 @@
1
- import json
2
-
3
- from playwright.sync_api import sync_playwright
4
- from playwright.async_api import async_playwright
5
- from playwright.sync_api import Response as SyncPlaywrightResponse
6
- from playwright.async_api import Response as AsyncPlaywrightResponse
7
- from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
8
- from rebrowser_playwright.async_api import (
9
- async_playwright as async_rebrowser_playwright,
10
- )
11
-
12
- from scrapling.core._types import (
13
- Callable,
14
- Dict,
15
- Optional,
16
- SelectorWaitStates,
17
- Union,
18
- Iterable,
19
- )
20
- from scrapling.core.utils import log, lru_cache
21
- from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
22
- from scrapling.engines.toolbelt import (
23
- Response,
24
- ResponseFactory,
25
- async_intercept_route,
26
- check_type_validity,
27
- construct_cdp_url,
28
- construct_proxy_dict,
29
- generate_convincing_referer,
30
- generate_headers,
31
- intercept_route,
32
- js_bypass_path,
33
- )
34
-
35
-
36
- class PlaywrightEngine:
37
- def __init__(
38
- self,
39
- headless: Union[bool, str] = True,
40
- disable_resources: bool = False,
41
- useragent: Optional[str] = None,
42
- network_idle: bool = False,
43
- timeout: Optional[float] = 30000,
44
- wait: Optional[int] = 0,
45
- page_action: Callable = None,
46
- wait_selector: Optional[str] = None,
47
- locale: Optional[str] = "en-US",
48
- wait_selector_state: SelectorWaitStates = "attached",
49
- cookies: Optional[Iterable[Dict]] = None,
50
- stealth: bool = False,
51
- real_chrome: bool = False,
52
- hide_canvas: bool = False,
53
- disable_webgl: bool = False,
54
- cdp_url: Optional[str] = None,
55
- nstbrowser_mode: bool = False,
56
- nstbrowser_config: Optional[Dict] = None,
57
- google_search: bool = True,
58
- extra_headers: Optional[Dict[str, str]] = None,
59
- proxy: Optional[Union[str, Dict[str, str]]] = None,
60
- adaptor_arguments: Dict = None,
61
- ):
62
- """An engine that uses the PlayWright library checks the `PlayWrightFetcher` class for more documentation.
63
-
64
- :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
65
- :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
66
- Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
67
- This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
68
- :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
69
- :param cookies: Set cookies for the next request.
70
- :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
71
- :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
72
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
73
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
74
- :param wait_selector: Wait for a specific CSS selector to be in a specific state.
75
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
76
- :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
77
- :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
78
- :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
79
- :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
80
- :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
81
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
82
- :param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with the ` cdp_url ` argument, or it will get completely ignored.
83
- :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
84
- :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
85
- :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
86
- :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
87
- :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
88
- """
89
- self.headless = headless
90
- self.locale = check_type_validity(locale, [str], "en-US", param_name="locale")
91
- self.disable_resources = disable_resources
92
- self.network_idle = bool(network_idle)
93
- self.stealth = bool(stealth)
94
- self.hide_canvas = bool(hide_canvas)
95
- self.disable_webgl = bool(disable_webgl)
96
- self.real_chrome = bool(real_chrome)
97
- self.google_search = bool(google_search)
98
- self.extra_headers = extra_headers or {}
99
- self.proxy = construct_proxy_dict(proxy)
100
- self.cdp_url = cdp_url
101
- self.useragent = useragent
102
- self.cookies = cookies or []
103
- self.timeout = check_type_validity(timeout, [int, float], 30000)
104
- self.wait = check_type_validity(wait, [int, float], 0)
105
- if page_action is not None:
106
- if callable(page_action):
107
- self.page_action = page_action
108
- else:
109
- self.page_action = None
110
- log.error('[Ignored] Argument "page_action" must be callable')
111
- else:
112
- self.page_action = None
113
-
114
- self.wait_selector = wait_selector
115
- self.wait_selector_state = wait_selector_state
116
- self.nstbrowser_mode = bool(nstbrowser_mode)
117
- self.nstbrowser_config = nstbrowser_config
118
- self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
119
- self.harmful_default_args = [
120
- # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
121
- "--enable-automation",
122
- "--disable-popup-blocking",
123
- # '--disable-component-update',
124
- # '--disable-default-apps',
125
- # '--disable-extensions',
126
- ]
127
-
128
- def _cdp_url_logic(self) -> str:
129
- """Constructs a new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
130
- :return: CDP URL
131
- """
132
- cdp_url = self.cdp_url
133
- if self.nstbrowser_mode:
134
- if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
135
- config = self.nstbrowser_config
136
- else:
137
- query = NSTBROWSER_DEFAULT_QUERY.copy()
138
- if self.stealth:
139
- flags = self.__set_flags()
140
- query.update(
141
- {
142
- "args": dict(
143
- zip(flags, [""] * len(flags))
144
- ), # browser args should be a dictionary
145
- }
146
- )
147
-
148
- config = {
149
- "config": json.dumps(query),
150
- # 'token': ''
151
- }
152
- cdp_url = construct_cdp_url(cdp_url, config)
153
- else:
154
- # To validate it
155
- cdp_url = construct_cdp_url(cdp_url)
156
-
157
- return cdp_url
158
-
159
- @lru_cache(32, typed=True)
160
- def __set_flags(self):
161
- """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
162
- flags = DEFAULT_STEALTH_FLAGS
163
- if self.hide_canvas:
164
- flags += ("--fingerprinting-canvas-image-data-noise",)
165
- if self.disable_webgl:
166
- flags += (
167
- "--disable-webgl",
168
- "--disable-webgl-image-chromium",
169
- "--disable-webgl2",
170
- )
171
-
172
- return flags
173
-
174
- def __launch_kwargs(self):
175
- """Creates the arguments we will use while launching playwright's browser"""
176
- launch_kwargs = {
177
- "headless": self.headless,
178
- "ignore_default_args": self.harmful_default_args,
179
- "channel": "chrome" if self.real_chrome else "chromium",
180
- }
181
- if self.stealth:
182
- launch_kwargs.update({"args": self.__set_flags(), "chromium_sandbox": True})
183
-
184
- return launch_kwargs
185
-
186
- def __context_kwargs(self):
187
- """Creates the arguments for the browser context"""
188
- context_kwargs = {
189
- "proxy": self.proxy,
190
- "locale": self.locale,
191
- "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
192
- "device_scale_factor": 2,
193
- "extra_http_headers": self.extra_headers if self.extra_headers else {},
194
- "user_agent": self.useragent
195
- if self.useragent
196
- else generate_headers(browser_mode=True).get("User-Agent"),
197
- }
198
- if self.stealth:
199
- context_kwargs.update(
200
- {
201
- "is_mobile": False,
202
- "has_touch": False,
203
- # I'm thinking about disabling it to rest from all Service Workers headache, but let's keep it as it is for now
204
- "service_workers": "allow",
205
- "ignore_https_errors": True,
206
- "screen": {"width": 1920, "height": 1080},
207
- "viewport": {"width": 1920, "height": 1080},
208
- "permissions": ["geolocation", "notifications"],
209
- }
210
- )
211
-
212
- return context_kwargs
213
-
214
- @lru_cache(1)
215
- def __stealth_scripts(self):
216
- # Basic bypasses nothing fancy as I'm still working on it
217
- # But with adding these bypasses to the above config, it bypasses many online tests like
218
- # https://bot.sannysoft.com/
219
- # https://kaliiiiiiiiii.github.io/brotector/
220
- # https://pixelscan.net/
221
- # https://iphey.com/
222
- # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
223
- # https://arh.antoinevastel.com/bots/areyouheadless/
224
- # https://prescience-data.github.io/execution-monitor.html
225
- return tuple(
226
- js_bypass_path(script)
227
- for script in (
228
- # Order is important
229
- "webdriver_fully.js",
230
- "window_chrome.js",
231
- "navigator_plugins.js",
232
- "pdf_viewer.js",
233
- "notification_permission.js",
234
- "screen_props.js",
235
- "playwright_fingerprint.js",
236
- )
237
- )
238
-
239
- def fetch(self, url: str) -> Response:
240
- """Opens up the browser and do your request based on your chosen options.
241
-
242
- :param url: Target url.
243
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
244
- """
245
-
246
- sync_context = sync_rebrowser_playwright
247
- if not self.stealth or self.real_chrome:
248
- # Because rebrowser_playwright doesn't play well with real browsers
249
- sync_context = sync_playwright
250
-
251
- final_response = None
252
- referer = generate_convincing_referer(url) if self.google_search else None
253
-
254
- def handle_response(finished_response: SyncPlaywrightResponse):
255
- nonlocal final_response
256
- if (
257
- finished_response.request.resource_type == "document"
258
- and finished_response.request.is_navigation_request()
259
- ):
260
- final_response = finished_response
261
-
262
- with sync_context() as p:
263
- # Creating the browser
264
- if self.cdp_url:
265
- cdp_url = self._cdp_url_logic()
266
- browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
267
- else:
268
- browser = p.chromium.launch(**self.__launch_kwargs())
269
-
270
- context = browser.new_context(**self.__context_kwargs())
271
- if self.cookies:
272
- context.add_cookies(self.cookies)
273
-
274
- page = context.new_page()
275
- page.set_default_navigation_timeout(self.timeout)
276
- page.set_default_timeout(self.timeout)
277
- page.on("response", handle_response)
278
-
279
- if self.extra_headers:
280
- page.set_extra_http_headers(self.extra_headers)
281
-
282
- if self.disable_resources:
283
- page.route("**/*", intercept_route)
284
-
285
- if self.stealth:
286
- for script in self.__stealth_scripts():
287
- page.add_init_script(path=script)
288
-
289
- first_response = page.goto(url, referer=referer)
290
- page.wait_for_load_state(state="domcontentloaded")
291
-
292
- if self.network_idle:
293
- page.wait_for_load_state("networkidle")
294
-
295
- if self.page_action is not None:
296
- try:
297
- page = self.page_action(page)
298
- except Exception as e:
299
- log.error(f"Error executing page_action: {e}")
300
-
301
- if self.wait_selector and type(self.wait_selector) is str:
302
- try:
303
- waiter = page.locator(self.wait_selector)
304
- waiter.first.wait_for(state=self.wait_selector_state)
305
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
306
- page.wait_for_load_state(state="load")
307
- page.wait_for_load_state(state="domcontentloaded")
308
- if self.network_idle:
309
- page.wait_for_load_state("networkidle")
310
- except Exception as e:
311
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
312
-
313
- page.wait_for_timeout(self.wait)
314
- response = ResponseFactory.from_playwright_response(
315
- page, first_response, final_response, self.adaptor_arguments
316
- )
317
- page.close()
318
- context.close()
319
- return response
320
-
321
- async def async_fetch(self, url: str) -> Response:
322
- """Async version of `fetch`
323
-
324
- :param url: Target url.
325
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
326
- """
327
-
328
- async_context = async_rebrowser_playwright
329
- if not self.stealth or self.real_chrome:
330
- # Because rebrowser_playwright doesn't play well with real browsers
331
- async_context = async_playwright
332
-
333
- final_response = None
334
- referer = generate_convincing_referer(url) if self.google_search else None
335
-
336
- async def handle_response(finished_response: AsyncPlaywrightResponse):
337
- nonlocal final_response
338
- if (
339
- finished_response.request.resource_type == "document"
340
- and finished_response.request.is_navigation_request()
341
- ):
342
- final_response = finished_response
343
-
344
- async with async_context() as p:
345
- # Creating the browser
346
- if self.cdp_url:
347
- cdp_url = self._cdp_url_logic()
348
- browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
349
- else:
350
- browser = await p.chromium.launch(**self.__launch_kwargs())
351
-
352
- context = await browser.new_context(**self.__context_kwargs())
353
- if self.cookies:
354
- await context.add_cookies(self.cookies)
355
-
356
- page = await context.new_page()
357
- page.set_default_navigation_timeout(self.timeout)
358
- page.set_default_timeout(self.timeout)
359
- page.on("response", handle_response)
360
-
361
- if self.extra_headers:
362
- await page.set_extra_http_headers(self.extra_headers)
363
-
364
- if self.disable_resources:
365
- await page.route("**/*", async_intercept_route)
366
-
367
- if self.stealth:
368
- for script in self.__stealth_scripts():
369
- await page.add_init_script(path=script)
370
-
371
- first_response = await page.goto(url, referer=referer)
372
- await page.wait_for_load_state(state="domcontentloaded")
373
-
374
- if self.network_idle:
375
- await page.wait_for_load_state("networkidle")
376
-
377
- if self.page_action is not None:
378
- try:
379
- page = await self.page_action(page)
380
- except Exception as e:
381
- log.error(f"Error executing async page_action: {e}")
382
-
383
- if self.wait_selector and type(self.wait_selector) is str:
384
- try:
385
- waiter = page.locator(self.wait_selector)
386
- await waiter.first.wait_for(state=self.wait_selector_state)
387
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
388
- await page.wait_for_load_state(state="load")
389
- await page.wait_for_load_state(state="domcontentloaded")
390
- if self.network_idle:
391
- await page.wait_for_load_state("networkidle")
392
- except Exception as e:
393
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
394
-
395
- await page.wait_for_timeout(self.wait)
396
- response = await ResponseFactory.from_async_playwright_response(
397
- page, first_response, final_response, self.adaptor_arguments
398
- )
399
- await page.close()
400
- await context.close()
401
-
402
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/fetchers.py CHANGED
@@ -11,7 +11,8 @@ from scrapling.core._types import (
11
  from scrapling.engines import (
12
  FetcherSession,
13
  CamoufoxEngine,
14
- PlaywrightEngine,
 
15
  check_if_engine_usable,
16
  FetcherClient as _FetcherClient,
17
  AsyncFetcherClient as _AsyncFetcherClient,
@@ -237,7 +238,7 @@ class StealthyFetcher(BaseFetcher):
237
  return await engine.async_fetch(url)
238
 
239
 
240
- class PlayWrightFetcher(BaseFetcher):
241
  """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
242
 
243
  Using this Fetcher class, you can do requests with:
@@ -258,28 +259,27 @@ class PlayWrightFetcher(BaseFetcher):
258
  def fetch(
259
  cls,
260
  url: str,
261
- headless: Union[bool, str] = True,
262
- disable_resources: bool = None,
263
- useragent: Optional[str] = None,
264
- network_idle: bool = False,
265
- timeout: Optional[float] = 30000,
266
- wait: Optional[int] = 0,
267
- cookies: Optional[Iterable[Dict]] = None,
268
- page_action: Optional[Callable] = None,
269
- wait_selector: Optional[str] = None,
270
- wait_selector_state: SelectorWaitStates = "attached",
271
  hide_canvas: bool = False,
272
  disable_webgl: bool = False,
273
- extra_headers: Optional[Dict[str, str]] = None,
274
- google_search: bool = True,
275
- proxy: Optional[Union[str, Dict[str, str]]] = None,
276
- locale: Optional[str] = "en-US",
277
- stealth: bool = False,
278
  real_chrome: bool = False,
 
 
 
 
 
 
 
279
  cdp_url: Optional[str] = None,
280
- nstbrowser_mode: bool = False,
281
- nstbrowser_config: Optional[Dict] = None,
282
- custom_config: Dict = None,
 
 
 
 
283
  ) -> Response:
284
  """Opens up a browser and do your request based on your chosen options below.
285
 
@@ -289,10 +289,10 @@ class PlayWrightFetcher(BaseFetcher):
289
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
290
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
291
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
 
292
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
293
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
294
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
295
- :param cookies: Set cookies for the next request.
296
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
297
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
298
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -302,22 +302,21 @@ class PlayWrightFetcher(BaseFetcher):
302
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
303
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
304
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
305
- :param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with the ` cdp_url ` argument, or it will get completely ignored.
306
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
307
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
308
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
309
- :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
310
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
311
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
312
  """
313
  if not custom_config:
314
  custom_config = {}
315
  elif not isinstance(custom_config, dict):
316
- ValueError(
317
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
318
  )
319
 
320
- engine = PlaywrightEngine(
321
  wait=wait,
322
  proxy=proxy,
323
  locale=locale,
@@ -327,6 +326,7 @@ class PlayWrightFetcher(BaseFetcher):
327
  cookies=cookies,
328
  headless=headless,
329
  useragent=useragent,
 
330
  real_chrome=real_chrome,
331
  page_action=page_action,
332
  hide_canvas=hide_canvas,
@@ -335,40 +335,39 @@ class PlayWrightFetcher(BaseFetcher):
335
  extra_headers=extra_headers,
336
  wait_selector=wait_selector,
337
  disable_webgl=disable_webgl,
338
- nstbrowser_mode=nstbrowser_mode,
339
- nstbrowser_config=nstbrowser_config,
340
  disable_resources=disable_resources,
341
  wait_selector_state=wait_selector_state,
342
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
343
- )
344
- return engine.fetch(url)
 
 
345
 
346
  @classmethod
347
  async def async_fetch(
348
  cls,
349
  url: str,
350
- headless: Union[bool, str] = True,
351
- disable_resources: bool = None,
352
- useragent: Optional[str] = None,
353
- network_idle: bool = False,
354
- timeout: Optional[float] = 30000,
355
- wait: Optional[int] = 0,
356
- cookies: Optional[Iterable[Dict]] = None,
357
- page_action: Optional[Callable] = None,
358
- wait_selector: Optional[str] = None,
359
- wait_selector_state: SelectorWaitStates = "attached",
360
  hide_canvas: bool = False,
361
  disable_webgl: bool = False,
362
- extra_headers: Optional[Dict[str, str]] = None,
363
- google_search: bool = True,
364
- proxy: Optional[Union[str, Dict[str, str]]] = None,
365
- locale: Optional[str] = "en-US",
366
- stealth: bool = False,
367
  real_chrome: bool = False,
 
 
 
 
 
 
 
368
  cdp_url: Optional[str] = None,
369
- nstbrowser_mode: bool = False,
370
- nstbrowser_config: Optional[Dict] = None,
371
- custom_config: Dict = None,
 
 
 
 
372
  ) -> Response:
373
  """Opens up a browser and do your request based on your chosen options below.
374
 
@@ -378,8 +377,8 @@ class PlayWrightFetcher(BaseFetcher):
378
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
379
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
380
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
381
- :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
382
  :param cookies: Set cookies for the next request.
 
383
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
384
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
385
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -391,22 +390,21 @@ class PlayWrightFetcher(BaseFetcher):
391
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
392
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
393
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
394
- :param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with the ` cdp_url ` argument, or it will get completely ignored.
395
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
396
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
397
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
398
- :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
399
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
400
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
401
  """
402
  if not custom_config:
403
  custom_config = {}
404
  elif not isinstance(custom_config, dict):
405
- ValueError(
406
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
407
  )
408
 
409
- engine = PlaywrightEngine(
410
  wait=wait,
411
  proxy=proxy,
412
  locale=locale,
@@ -416,6 +414,7 @@ class PlayWrightFetcher(BaseFetcher):
416
  cookies=cookies,
417
  headless=headless,
418
  useragent=useragent,
 
419
  real_chrome=real_chrome,
420
  page_action=page_action,
421
  hide_canvas=hide_canvas,
@@ -424,13 +423,16 @@ class PlayWrightFetcher(BaseFetcher):
424
  extra_headers=extra_headers,
425
  wait_selector=wait_selector,
426
  disable_webgl=disable_webgl,
427
- nstbrowser_mode=nstbrowser_mode,
428
- nstbrowser_config=nstbrowser_config,
429
  disable_resources=disable_resources,
430
  wait_selector_state=wait_selector_state,
431
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
432
- )
433
- return await engine.async_fetch(url)
 
 
 
 
 
434
 
435
 
436
  class CustomFetcher(BaseFetcher):
 
11
  from scrapling.engines import (
12
  FetcherSession,
13
  CamoufoxEngine,
14
+ DynamicSession,
15
+ AsyncDynamicSession,
16
  check_if_engine_usable,
17
  FetcherClient as _FetcherClient,
18
  AsyncFetcherClient as _AsyncFetcherClient,
 
238
  return await engine.async_fetch(url)
239
 
240
 
241
+ class DynamicFetcher(BaseFetcher):
242
  """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
243
 
244
  Using this Fetcher class, you can do requests with:
 
259
  def fetch(
260
  cls,
261
  url: str,
262
+ max_pages: int = 1,
263
+ headless: bool = True,
264
+ google_search: bool = True,
 
 
 
 
 
 
 
265
  hide_canvas: bool = False,
266
  disable_webgl: bool = False,
 
 
 
 
 
267
  real_chrome: bool = False,
268
+ stealth: bool = False,
269
+ wait: Union[int, float] = 0,
270
+ page_action: Optional[Callable] = None,
271
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
272
+ locale: str = "en-US",
273
+ extra_headers: Optional[Dict[str, str]] = None,
274
+ useragent: Optional[str] = None,
275
  cdp_url: Optional[str] = None,
276
+ timeout: Union[int, float] = 30000,
277
+ disable_resources: bool = False,
278
+ wait_selector: Optional[str] = None,
279
+ cookies: Optional[Iterable[Dict]] = None,
280
+ network_idle: bool = False,
281
+ wait_selector_state: SelectorWaitStates = "attached",
282
+ custom_config: Optional[Dict] = None,
283
  ) -> Response:
284
  """Opens up a browser and do your request based on your chosen options below.
285
 
 
289
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
290
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
291
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
292
+ :param cookies: Set cookies for the next request.
293
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
294
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
295
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
 
296
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
297
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
298
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
 
302
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
303
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
304
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
 
305
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
306
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
307
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
308
+ :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
309
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
310
+ :return: A `Response` object.
311
  """
312
  if not custom_config:
313
  custom_config = {}
314
  elif not isinstance(custom_config, dict):
315
+ raise ValueError(
316
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
317
  )
318
 
319
+ with DynamicSession(
320
  wait=wait,
321
  proxy=proxy,
322
  locale=locale,
 
326
  cookies=cookies,
327
  headless=headless,
328
  useragent=useragent,
329
+ max_pages=max_pages,
330
  real_chrome=real_chrome,
331
  page_action=page_action,
332
  hide_canvas=hide_canvas,
 
335
  extra_headers=extra_headers,
336
  wait_selector=wait_selector,
337
  disable_webgl=disable_webgl,
 
 
338
  disable_resources=disable_resources,
339
  wait_selector_state=wait_selector_state,
340
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
341
+ ) as session:
342
+ response = session.fetch(url)
343
+
344
+ return response
345
 
346
  @classmethod
347
  async def async_fetch(
348
  cls,
349
  url: str,
350
+ max_pages: int = 1,
351
+ headless: bool = True,
352
+ google_search: bool = True,
 
 
 
 
 
 
 
353
  hide_canvas: bool = False,
354
  disable_webgl: bool = False,
 
 
 
 
 
355
  real_chrome: bool = False,
356
+ stealth: bool = False,
357
+ wait: Union[int, float] = 0,
358
+ page_action: Optional[Callable] = None,
359
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
360
+ locale: str = "en-US",
361
+ extra_headers: Optional[Dict[str, str]] = None,
362
+ useragent: Optional[str] = None,
363
  cdp_url: Optional[str] = None,
364
+ timeout: Union[int, float] = 30000,
365
+ disable_resources: bool = False,
366
+ wait_selector: Optional[str] = None,
367
+ cookies: Optional[Iterable[Dict]] = None,
368
+ network_idle: bool = False,
369
+ wait_selector_state: SelectorWaitStates = "attached",
370
+ custom_config: Optional[Dict] = None,
371
  ) -> Response:
372
  """Opens up a browser and do your request based on your chosen options below.
373
 
 
377
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
378
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
379
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
 
380
  :param cookies: Set cookies for the next request.
381
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
382
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
383
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
384
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
 
390
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
391
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
392
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
 
393
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
394
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
395
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
396
+ :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
397
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
398
+ :return: A `Response` object.
399
  """
400
  if not custom_config:
401
  custom_config = {}
402
  elif not isinstance(custom_config, dict):
403
+ raise ValueError(
404
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
405
  )
406
 
407
+ async with AsyncDynamicSession(
408
  wait=wait,
409
  proxy=proxy,
410
  locale=locale,
 
414
  cookies=cookies,
415
  headless=headless,
416
  useragent=useragent,
417
+ max_pages=max_pages,
418
  real_chrome=real_chrome,
419
  page_action=page_action,
420
  hide_canvas=hide_canvas,
 
423
  extra_headers=extra_headers,
424
  wait_selector=wait_selector,
425
  disable_webgl=disable_webgl,
 
 
426
  disable_resources=disable_resources,
427
  wait_selector_state=wait_selector_state,
428
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
429
+ ) as session:
430
+ response = await session.fetch(url)
431
+
432
+ return response
433
+
434
+
435
+ PlayWrightFetcher = DynamicFetcher # For backward-compatibility
436
 
437
 
438
  class CustomFetcher(BaseFetcher):