Karim shoair committed on
Commit
66fd35f
·
1 Parent(s): 70d6704

feat(browser fetchers): A new option to control page JS

Browse files
scrapling/engines/_browsers/_base.py CHANGED
@@ -162,6 +162,7 @@ class DynamicSessionMixin:
162
  self.disable_resources = config.disable_resources
163
  self.cdp_url = config.cdp_url
164
  self.network_idle = config.network_idle
 
165
  self.wait_selector = config.wait_selector
166
  self.init_script = config.init_script
167
  self.wait_selector_state = config.wait_selector_state
@@ -216,6 +217,7 @@ class StealthySessionMixin:
216
  self.block_webrtc = config.block_webrtc
217
  self.allow_webgl = config.allow_webgl
218
  self.network_idle = config.network_idle
 
219
  self.humanize = config.humanize
220
  self.solve_cloudflare = config.solve_cloudflare
221
  self.wait = config.wait
 
162
  self.disable_resources = config.disable_resources
163
  self.cdp_url = config.cdp_url
164
  self.network_idle = config.network_idle
165
+ self.load_dom = config.load_dom
166
  self.wait_selector = config.wait_selector
167
  self.init_script = config.init_script
168
  self.wait_selector_state = config.wait_selector_state
 
217
  self.block_webrtc = config.block_webrtc
218
  self.allow_webgl = config.allow_webgl
219
  self.network_idle = config.network_idle
220
+ self.load_dom = config.load_dom
221
  self.humanize = config.humanize
222
  self.solve_cloudflare = config.solve_cloudflare
223
  self.wait = config.wait
scrapling/engines/_browsers/_camoufox.py CHANGED
@@ -46,6 +46,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
46
  "block_webrtc",
47
  "allow_webgl",
48
  "network_idle",
 
49
  "humanize",
50
  "solve_cloudflare",
51
  "wait",
@@ -82,6 +83,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
82
  block_webrtc: bool = False,
83
  allow_webgl: bool = True,
84
  network_idle: bool = False,
 
85
  humanize: bool | float = True,
86
  solve_cloudflare: bool = False,
87
  wait: int | float = 0,
@@ -116,6 +118,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
116
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
117
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
118
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
119
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
120
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
121
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
@@ -142,6 +145,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
142
  cookies=cookies,
143
  headless=headless,
144
  humanize=humanize,
 
145
  max_pages=__max_pages,
146
  disable_ads=disable_ads,
147
  allow_webgl=allow_webgl,
@@ -259,6 +263,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
259
  wait_selector: Optional[str] = _UNSET,
260
  wait_selector_state: SelectorWaitStates = _UNSET,
261
  network_idle: bool = _UNSET,
 
262
  solve_cloudflare: bool = _UNSET,
263
  selector_config: Optional[Dict] = _UNSET,
264
  ) -> Response:
@@ -276,6 +281,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
276
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
277
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
278
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
279
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
280
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
281
  :return: A `Response` object.
@@ -292,6 +298,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
292
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
293
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
294
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
 
295
  solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
296
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
297
  ),
@@ -321,7 +328,8 @@ class StealthySession(StealthySessionMixin, SyncSession):
321
  # Navigate to URL and wait for a specified state
322
  page_info.page.on("response", handle_response)
323
  first_response = page_info.page.goto(url, referer=referer)
324
- page_info.page.wait_for_load_state(state="domcontentloaded")
 
325
 
326
  if params.network_idle:
327
  page_info.page.wait_for_load_state("networkidle")
@@ -333,7 +341,8 @@ class StealthySession(StealthySessionMixin, SyncSession):
333
  self._solve_cloudflare(page_info.page)
334
  # Make sure the page is fully loaded after the captcha
335
  page_info.page.wait_for_load_state(state="load")
336
- page_info.page.wait_for_load_state(state="domcontentloaded")
 
337
  if params.network_idle:
338
  page_info.page.wait_for_load_state("networkidle")
339
 
@@ -349,7 +358,8 @@ class StealthySession(StealthySessionMixin, SyncSession):
349
  waiter.first.wait_for(state=params.wait_selector_state)
350
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
351
  page_info.page.wait_for_load_state(state="load")
352
- page_info.page.wait_for_load_state(state="domcontentloaded")
 
353
  if params.network_idle:
354
  page_info.page.wait_for_load_state("networkidle")
355
  except Exception as e:
@@ -382,6 +392,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
382
  block_webrtc: bool = False,
383
  allow_webgl: bool = True,
384
  network_idle: bool = False,
 
385
  humanize: bool | float = True,
386
  solve_cloudflare: bool = False,
387
  wait: int | float = 0,
@@ -416,6 +427,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
416
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
417
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
418
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
419
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
420
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
421
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
@@ -441,6 +453,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
441
  timeout=timeout,
442
  cookies=cookies,
443
  headless=headless,
 
444
  humanize=humanize,
445
  max_pages=max_pages,
446
  disable_ads=disable_ads,
@@ -559,6 +572,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
559
  wait_selector: Optional[str] = _UNSET,
560
  wait_selector_state: SelectorWaitStates = _UNSET,
561
  network_idle: bool = _UNSET,
 
562
  solve_cloudflare: bool = _UNSET,
563
  selector_config: Optional[Dict] = _UNSET,
564
  ) -> Response:
@@ -576,6 +590,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
576
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
577
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
578
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
579
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
580
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
581
  :return: A `Response` object.
@@ -591,6 +606,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
591
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
592
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
593
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
 
594
  solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
595
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
596
  ),
@@ -620,7 +636,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
620
  # Navigate to URL and wait for a specified state
621
  page_info.page.on("response", handle_response)
622
  first_response = await page_info.page.goto(url, referer=referer)
623
- await page_info.page.wait_for_load_state(state="domcontentloaded")
 
624
 
625
  if params.network_idle:
626
  await page_info.page.wait_for_load_state("networkidle")
@@ -632,7 +649,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
632
  await self._solve_cloudflare(page_info.page)
633
  # Make sure the page is fully loaded after the captcha
634
  await page_info.page.wait_for_load_state(state="load")
635
- await page_info.page.wait_for_load_state(state="domcontentloaded")
 
636
  if params.network_idle:
637
  await page_info.page.wait_for_load_state("networkidle")
638
 
@@ -648,7 +666,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
648
  await waiter.first.wait_for(state=params.wait_selector_state)
649
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
650
  await page_info.page.wait_for_load_state(state="load")
651
- await page_info.page.wait_for_load_state(state="domcontentloaded")
 
652
  if params.network_idle:
653
  await page_info.page.wait_for_load_state("networkidle")
654
  except Exception as e:
 
46
  "block_webrtc",
47
  "allow_webgl",
48
  "network_idle",
49
+ "load_dom",
50
  "humanize",
51
  "solve_cloudflare",
52
  "wait",
 
83
  block_webrtc: bool = False,
84
  allow_webgl: bool = True,
85
  network_idle: bool = False,
86
+ load_dom: bool = True,
87
  humanize: bool | float = True,
88
  solve_cloudflare: bool = False,
89
  wait: int | float = 0,
 
118
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
119
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
120
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
121
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
122
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
123
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
124
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
 
145
  cookies=cookies,
146
  headless=headless,
147
  humanize=humanize,
148
+ load_dom=load_dom,
149
  max_pages=__max_pages,
150
  disable_ads=disable_ads,
151
  allow_webgl=allow_webgl,
 
263
  wait_selector: Optional[str] = _UNSET,
264
  wait_selector_state: SelectorWaitStates = _UNSET,
265
  network_idle: bool = _UNSET,
266
+ load_dom: bool = _UNSET,
267
  solve_cloudflare: bool = _UNSET,
268
  selector_config: Optional[Dict] = _UNSET,
269
  ) -> Response:
 
281
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
282
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
283
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
284
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
285
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
286
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
287
  :return: A `Response` object.
 
298
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
299
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
300
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
301
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
302
  solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
303
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
304
  ),
 
328
  # Navigate to URL and wait for a specified state
329
  page_info.page.on("response", handle_response)
330
  first_response = page_info.page.goto(url, referer=referer)
331
+ if params.load_dom:
332
+ page_info.page.wait_for_load_state(state="domcontentloaded")
333
 
334
  if params.network_idle:
335
  page_info.page.wait_for_load_state("networkidle")
 
341
  self._solve_cloudflare(page_info.page)
342
  # Make sure the page is fully loaded after the captcha
343
  page_info.page.wait_for_load_state(state="load")
344
+ if params.load_dom:
345
+ page_info.page.wait_for_load_state(state="domcontentloaded")
346
  if params.network_idle:
347
  page_info.page.wait_for_load_state("networkidle")
348
 
 
358
  waiter.first.wait_for(state=params.wait_selector_state)
359
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
360
  page_info.page.wait_for_load_state(state="load")
361
+ if params.load_dom:
362
+ page_info.page.wait_for_load_state(state="domcontentloaded")
363
  if params.network_idle:
364
  page_info.page.wait_for_load_state("networkidle")
365
  except Exception as e:
 
392
  block_webrtc: bool = False,
393
  allow_webgl: bool = True,
394
  network_idle: bool = False,
395
+ load_dom: bool = True,
396
  humanize: bool | float = True,
397
  solve_cloudflare: bool = False,
398
  wait: int | float = 0,
 
427
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
428
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
429
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
430
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
431
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
432
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
433
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
 
453
  timeout=timeout,
454
  cookies=cookies,
455
  headless=headless,
456
+ load_dom=load_dom,
457
  humanize=humanize,
458
  max_pages=max_pages,
459
  disable_ads=disable_ads,
 
572
  wait_selector: Optional[str] = _UNSET,
573
  wait_selector_state: SelectorWaitStates = _UNSET,
574
  network_idle: bool = _UNSET,
575
+ load_dom: bool = _UNSET,
576
  solve_cloudflare: bool = _UNSET,
577
  selector_config: Optional[Dict] = _UNSET,
578
  ) -> Response:
 
590
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
591
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
592
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
593
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
594
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
595
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
596
  :return: A `Response` object.
 
606
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
607
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
608
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
609
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
610
  solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
611
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
612
  ),
 
636
  # Navigate to URL and wait for a specified state
637
  page_info.page.on("response", handle_response)
638
  first_response = await page_info.page.goto(url, referer=referer)
639
+ if params.load_dom:
640
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
641
 
642
  if params.network_idle:
643
  await page_info.page.wait_for_load_state("networkidle")
 
649
  await self._solve_cloudflare(page_info.page)
650
  # Make sure the page is fully loaded after the captcha
651
  await page_info.page.wait_for_load_state(state="load")
652
+ if params.load_dom:
653
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
654
  if params.network_idle:
655
  await page_info.page.wait_for_load_state("networkidle")
656
 
 
666
  await waiter.first.wait_for(state=params.wait_selector_state)
667
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
668
  await page_info.page.wait_for_load_state(state="load")
669
+ if params.load_dom:
670
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
671
  if params.network_idle:
672
  await page_info.page.wait_for_load_state("networkidle")
673
  except Exception as e:
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -54,6 +54,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
54
  "cookies",
55
  "disable_resources",
56
  "network_idle",
 
57
  "wait_selector",
58
  "init_script",
59
  "wait_selector_state",
@@ -93,6 +94,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
93
  init_script: Optional[str] = None,
94
  cookies: Optional[List[Dict]] = None,
95
  network_idle: bool = False,
 
96
  wait_selector_state: SelectorWaitStates = "attached",
97
  selector_config: Optional[Dict] = None,
98
  ):
@@ -116,6 +118,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
116
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
117
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
118
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
 
119
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
120
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
121
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
@@ -130,6 +133,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
130
  stealth=stealth,
131
  cdp_url=cdp_url,
132
  cookies=cookies,
 
133
  headless=headless,
134
  useragent=useragent,
135
  max_pages=__max_pages,
@@ -208,6 +212,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
208
  wait_selector: Optional[str] = _UNSET,
209
  wait_selector_state: SelectorWaitStates = _UNSET,
210
  network_idle: bool = _UNSET,
 
211
  selector_config: Optional[Dict] = _UNSET,
212
  ) -> Response:
213
  """Opens up the browser and do your request based on your chosen options.
@@ -224,6 +229,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
224
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
225
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
226
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
227
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
228
  :return: A `Response` object.
229
  """
@@ -239,6 +245,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
239
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
240
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
241
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
 
242
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
243
  ),
244
  PlaywrightConfig,
@@ -267,7 +274,8 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
267
  # Navigate to URL and wait for a specified state
268
  page_info.page.on("response", handle_response)
269
  first_response = page_info.page.goto(url, referer=referer)
270
- page_info.page.wait_for_load_state(state="domcontentloaded")
 
271
 
272
  if params.network_idle:
273
  page_info.page.wait_for_load_state("networkidle")
@@ -287,7 +295,8 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
287
  waiter.first.wait_for(state=params.wait_selector_state)
288
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
289
  page_info.page.wait_for_load_state(state="load")
290
- page_info.page.wait_for_load_state(state="domcontentloaded")
 
291
  if params.network_idle:
292
  page_info.page.wait_for_load_state("networkidle")
293
  except Exception as e: # pragma: no cover
@@ -335,6 +344,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
335
  init_script: Optional[str] = None,
336
  cookies: Optional[List[Dict]] = None,
337
  network_idle: bool = False,
 
338
  wait_selector_state: SelectorWaitStates = "attached",
339
  selector_config: Optional[Dict] = None,
340
  ):
@@ -347,6 +357,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
347
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
348
  :param cookies: Set cookies for the next request.
349
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
350
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
351
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
352
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
@@ -374,6 +385,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
374
  stealth=stealth,
375
  cdp_url=cdp_url,
376
  cookies=cookies,
 
377
  headless=headless,
378
  useragent=useragent,
379
  max_pages=max_pages,
@@ -453,6 +465,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
453
  wait_selector: Optional[str] = _UNSET,
454
  wait_selector_state: SelectorWaitStates = _UNSET,
455
  network_idle: bool = _UNSET,
 
456
  selector_config: Optional[Dict] = _UNSET,
457
  ) -> Response:
458
  """Opens up the browser and do your request based on your chosen options.
@@ -469,6 +482,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
469
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
470
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
471
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
472
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
473
  :return: A `Response` object.
474
  """
@@ -484,6 +498,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
484
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
485
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
486
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
 
487
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
488
  ),
489
  PlaywrightConfig,
@@ -512,7 +527,8 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
512
  # Navigate to URL and wait for a specified state
513
  page_info.page.on("response", handle_response)
514
  first_response = await page_info.page.goto(url, referer=referer)
515
- await page_info.page.wait_for_load_state(state="domcontentloaded")
 
516
 
517
  if params.network_idle:
518
  await page_info.page.wait_for_load_state("networkidle")
@@ -532,7 +548,8 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
532
  await waiter.first.wait_for(state=params.wait_selector_state)
533
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
534
  await page_info.page.wait_for_load_state(state="load")
535
- await page_info.page.wait_for_load_state(state="domcontentloaded")
 
536
  if params.network_idle:
537
  await page_info.page.wait_for_load_state("networkidle")
538
  except Exception as e:
 
54
  "cookies",
55
  "disable_resources",
56
  "network_idle",
57
+ "load_dom",
58
  "wait_selector",
59
  "init_script",
60
  "wait_selector_state",
 
94
  init_script: Optional[str] = None,
95
  cookies: Optional[List[Dict]] = None,
96
  network_idle: bool = False,
97
+ load_dom: bool = True,
98
  wait_selector_state: SelectorWaitStates = "attached",
99
  selector_config: Optional[Dict] = None,
100
  ):
 
118
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
119
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
120
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
121
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
122
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
123
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
124
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
 
133
  stealth=stealth,
134
  cdp_url=cdp_url,
135
  cookies=cookies,
136
+ load_dom=load_dom,
137
  headless=headless,
138
  useragent=useragent,
139
  max_pages=__max_pages,
 
212
  wait_selector: Optional[str] = _UNSET,
213
  wait_selector_state: SelectorWaitStates = _UNSET,
214
  network_idle: bool = _UNSET,
215
+ load_dom: bool = _UNSET,
216
  selector_config: Optional[Dict] = _UNSET,
217
  ) -> Response:
218
  """Opens up the browser and do your request based on your chosen options.
 
229
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
230
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
231
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
232
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
233
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
234
  :return: A `Response` object.
235
  """
 
245
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
246
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
247
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
248
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
249
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
250
  ),
251
  PlaywrightConfig,
 
274
  # Navigate to URL and wait for a specified state
275
  page_info.page.on("response", handle_response)
276
  first_response = page_info.page.goto(url, referer=referer)
277
+ if params.load_dom:
278
+ page_info.page.wait_for_load_state(state="domcontentloaded")
279
 
280
  if params.network_idle:
281
  page_info.page.wait_for_load_state("networkidle")
 
295
  waiter.first.wait_for(state=params.wait_selector_state)
296
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
297
  page_info.page.wait_for_load_state(state="load")
298
+ if params.load_dom:
299
+ page_info.page.wait_for_load_state(state="domcontentloaded")
300
  if params.network_idle:
301
  page_info.page.wait_for_load_state("networkidle")
302
  except Exception as e: # pragma: no cover
 
344
  init_script: Optional[str] = None,
345
  cookies: Optional[List[Dict]] = None,
346
  network_idle: bool = False,
347
+ load_dom: bool = True,
348
  wait_selector_state: SelectorWaitStates = "attached",
349
  selector_config: Optional[Dict] = None,
350
  ):
 
357
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
358
  :param cookies: Set cookies for the next request.
359
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
360
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
361
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
362
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
363
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
 
385
  stealth=stealth,
386
  cdp_url=cdp_url,
387
  cookies=cookies,
388
+ load_dom=load_dom,
389
  headless=headless,
390
  useragent=useragent,
391
  max_pages=max_pages,
 
465
  wait_selector: Optional[str] = _UNSET,
466
  wait_selector_state: SelectorWaitStates = _UNSET,
467
  network_idle: bool = _UNSET,
468
+ load_dom: bool = _UNSET,
469
  selector_config: Optional[Dict] = _UNSET,
470
  ) -> Response:
471
  """Opens up the browser and do your request based on your chosen options.
 
482
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
483
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
484
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
485
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
486
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
487
  :return: A `Response` object.
488
  """
 
498
  wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
499
  wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
500
  network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
501
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
502
  selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
503
  ),
504
  PlaywrightConfig,
 
527
  # Navigate to URL and wait for a specified state
528
  page_info.page.on("response", handle_response)
529
  first_response = await page_info.page.goto(url, referer=referer)
530
+ if self.load_dom:
531
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
532
 
533
  if params.network_idle:
534
  await page_info.page.wait_for_load_state("networkidle")
 
548
  await waiter.first.wait_for(state=params.wait_selector_state)
549
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
550
  await page_info.page.wait_for_load_state(state="load")
551
+ if self.load_dom:
552
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
553
  if params.network_idle:
554
  await page_info.page.wait_for_load_state("networkidle")
555
  except Exception as e:
scrapling/engines/_browsers/_validators.py CHANGED
@@ -35,6 +35,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
35
  wait_selector: Optional[str] = None
36
  cookies: Optional[List[Dict]] = None
37
  network_idle: bool = False
 
38
  wait_selector_state: SelectorWaitStates = "attached"
39
  selector_config: Optional[Dict] = None
40
 
@@ -92,6 +93,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
92
  block_webrtc: bool = False
93
  allow_webgl: bool = True
94
  network_idle: bool = False
 
95
  humanize: bool | float = True
96
  solve_cloudflare: bool = False
97
  wait: int | float = 0
 
35
  wait_selector: Optional[str] = None
36
  cookies: Optional[List[Dict]] = None
37
  network_idle: bool = False
38
+ load_dom: bool = True
39
  wait_selector_state: SelectorWaitStates = "attached"
40
  selector_config: Optional[Dict] = None
41
 
 
93
  block_webrtc: bool = False
94
  allow_webgl: bool = True
95
  network_idle: bool = False
96
+ load_dom: bool = True
97
  humanize: bool | float = True
98
  solve_cloudflare: bool = False
99
  wait: int | float = 0
scrapling/fetchers.py CHANGED
@@ -56,6 +56,7 @@ class StealthyFetcher(BaseFetcher):
56
  block_webrtc: bool = False,
57
  allow_webgl: bool = True,
58
  network_idle: bool = False,
 
59
  humanize: bool | float = True,
60
  solve_cloudflare: bool = False,
61
  wait: int | float = 0,
@@ -92,6 +93,7 @@ class StealthyFetcher(BaseFetcher):
92
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
93
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
94
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
95
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
96
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
97
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
@@ -123,6 +125,7 @@ class StealthyFetcher(BaseFetcher):
123
  cookies=cookies,
124
  headless=headless,
125
  humanize=humanize,
 
126
  disable_ads=disable_ads,
127
  allow_webgl=allow_webgl,
128
  page_action=page_action,
@@ -152,6 +155,7 @@ class StealthyFetcher(BaseFetcher):
152
  block_webrtc: bool = False,
153
  allow_webgl: bool = True,
154
  network_idle: bool = False,
 
155
  humanize: bool | float = True,
156
  solve_cloudflare: bool = False,
157
  wait: int | float = 0,
@@ -188,6 +192,7 @@ class StealthyFetcher(BaseFetcher):
188
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
189
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
190
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
191
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
192
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
193
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
@@ -220,6 +225,7 @@ class StealthyFetcher(BaseFetcher):
220
  cookies=cookies,
221
  headless=headless,
222
  humanize=humanize,
 
223
  disable_ads=disable_ads,
224
  allow_webgl=allow_webgl,
225
  page_action=page_action,
@@ -280,6 +286,7 @@ class DynamicFetcher(BaseFetcher):
280
  init_script: Optional[str] = None,
281
  cookies: Optional[Iterable[Dict]] = None,
282
  network_idle: bool = False,
 
283
  wait_selector_state: SelectorWaitStates = "attached",
284
  custom_config: Optional[Dict] = None,
285
  ) -> Response:
@@ -293,6 +300,7 @@ class DynamicFetcher(BaseFetcher):
293
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
294
  :param cookies: Set cookies for the next request.
295
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
296
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
297
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
298
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
@@ -325,6 +333,7 @@ class DynamicFetcher(BaseFetcher):
325
  cdp_url=cdp_url,
326
  cookies=cookies,
327
  headless=headless,
 
328
  useragent=useragent,
329
  real_chrome=real_chrome,
330
  page_action=page_action,
@@ -364,6 +373,7 @@ class DynamicFetcher(BaseFetcher):
364
  init_script: Optional[str] = None,
365
  cookies: Optional[Iterable[Dict]] = None,
366
  network_idle: bool = False,
 
367
  wait_selector_state: SelectorWaitStates = "attached",
368
  custom_config: Optional[Dict] = None,
369
  ) -> Response:
@@ -377,6 +387,7 @@ class DynamicFetcher(BaseFetcher):
377
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
378
  :param cookies: Set cookies for the next request.
379
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
380
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
381
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
382
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
@@ -402,6 +413,7 @@ class DynamicFetcher(BaseFetcher):
402
 
403
  async with AsyncDynamicSession(
404
  wait=wait,
 
405
  proxy=proxy,
406
  locale=locale,
407
  timeout=timeout,
@@ -409,8 +421,8 @@ class DynamicFetcher(BaseFetcher):
409
  cdp_url=cdp_url,
410
  cookies=cookies,
411
  headless=headless,
 
412
  useragent=useragent,
413
- max_pages=1,
414
  real_chrome=real_chrome,
415
  page_action=page_action,
416
  hide_canvas=hide_canvas,
 
56
  block_webrtc: bool = False,
57
  allow_webgl: bool = True,
58
  network_idle: bool = False,
59
+ load_dom: bool = True,
60
  humanize: bool | float = True,
61
  solve_cloudflare: bool = False,
62
  wait: int | float = 0,
 
93
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
94
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
95
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
96
+ :param load_dom: Enabled by default. Waits for the page to reach the `domcontentloaded` load state so its JavaScript has loaded and executed before continuing; set it to `False` to skip that wait.
97
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
98
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
99
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
 
125
  cookies=cookies,
126
  headless=headless,
127
  humanize=humanize,
128
+ load_dom=load_dom,
129
  disable_ads=disable_ads,
130
  allow_webgl=allow_webgl,
131
  page_action=page_action,
 
155
  block_webrtc: bool = False,
156
  allow_webgl: bool = True,
157
  network_idle: bool = False,
158
+ load_dom: bool = True,
159
  humanize: bool | float = True,
160
  solve_cloudflare: bool = False,
161
  wait: int | float = 0,
 
192
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
193
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
194
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
195
+ :param load_dom: Enabled by default. Waits for the page to reach the `domcontentloaded` load state so its JavaScript has loaded and executed before continuing; set it to `False` to skip that wait.
196
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
197
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
198
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
 
225
  cookies=cookies,
226
  headless=headless,
227
  humanize=humanize,
228
+ load_dom=load_dom,
229
  disable_ads=disable_ads,
230
  allow_webgl=allow_webgl,
231
  page_action=page_action,
 
286
  init_script: Optional[str] = None,
287
  cookies: Optional[Iterable[Dict]] = None,
288
  network_idle: bool = False,
289
+ load_dom: bool = True,
290
  wait_selector_state: SelectorWaitStates = "attached",
291
  custom_config: Optional[Dict] = None,
292
  ) -> Response:
 
300
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
301
  :param cookies: Set cookies for the next request.
302
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
303
+ :param load_dom: Enabled by default. Waits for the page to reach the `domcontentloaded` load state so its JavaScript has loaded and executed before continuing; set it to `False` to skip that wait.
304
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
305
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
306
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
 
333
  cdp_url=cdp_url,
334
  cookies=cookies,
335
  headless=headless,
336
+ load_dom=load_dom,
337
  useragent=useragent,
338
  real_chrome=real_chrome,
339
  page_action=page_action,
 
373
  init_script: Optional[str] = None,
374
  cookies: Optional[Iterable[Dict]] = None,
375
  network_idle: bool = False,
376
+ load_dom: bool = True,
377
  wait_selector_state: SelectorWaitStates = "attached",
378
  custom_config: Optional[Dict] = None,
379
  ) -> Response:
 
387
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
388
  :param cookies: Set cookies for the next request.
389
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
390
+ :param load_dom: Enabled by default. Waits for the page to reach the `domcontentloaded` load state so its JavaScript has loaded and executed before continuing; set it to `False` to skip that wait.
391
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
392
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
393
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
 
413
 
414
  async with AsyncDynamicSession(
415
  wait=wait,
416
+ max_pages=1,
417
  proxy=proxy,
418
  locale=locale,
419
  timeout=timeout,
 
421
  cdp_url=cdp_url,
422
  cookies=cookies,
423
  headless=headless,
424
+ load_dom=load_dom,
425
  useragent=useragent,
 
426
  real_chrome=real_chrome,
427
  page_action=page_action,
428
  hide_canvas=hide_canvas,