Karim shoair commited on
Commit
42a1f3d
·
1 Parent(s): 4f7700a

feat(fetchers): Improve StealthyFetcher + Add StealthySession/AsyncStealthySession classes

Browse files
scrapling/engines/__init__.py CHANGED
@@ -1,7 +1,16 @@
1
- from .camo import CamoufoxEngine
2
  from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
3
  from .static import FetcherSession, FetcherClient, AsyncFetcherClient
4
- from .toolbelt import check_if_engine_usable
5
- from ._browsers import DynamicSession, AsyncDynamicSession
 
 
 
 
6
 
7
- __all__ = ["FetcherSession", "DynamicSession", "AsyncDynamicSession"]
 
 
 
 
 
 
 
 
1
  from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
2
  from .static import FetcherSession, FetcherClient, AsyncFetcherClient
3
+ from ._browsers import (
4
+ DynamicSession,
5
+ AsyncDynamicSession,
6
+ StealthySession,
7
+ AsyncStealthySession,
8
+ )
9
 
10
+ __all__ = [
11
+ "FetcherSession",
12
+ "DynamicSession",
13
+ "AsyncDynamicSession",
14
+ "StealthySession",
15
+ "AsyncStealthySession",
16
+ ]
scrapling/engines/_browsers/__init__.py CHANGED
@@ -1 +1,2 @@
1
  from ._controllers import DynamicSession, AsyncDynamicSession
 
 
1
  from ._controllers import DynamicSession, AsyncDynamicSession
2
+ from ._camoufox import StealthySession, AsyncStealthySession
scrapling/engines/{camo.py → _browsers/_camoufox.py} RENAMED
@@ -1,37 +1,93 @@
1
- import re
2
-
3
- from camoufox import DefaultAddons
4
- from playwright.sync_api import Page
5
- from camoufox.sync_api import Camoufox
6
- from camoufox.async_api import AsyncCamoufox
7
- from playwright.async_api import Page as async_Page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
 
9
  from scrapling.core._types import (
10
- Callable,
11
  Dict,
12
- List,
13
- Literal,
14
  Optional,
15
- SelectorWaitStates,
16
  Union,
17
- Iterable,
 
 
 
18
  )
19
- from scrapling.core.utils import log
20
  from scrapling.engines.toolbelt import (
21
  Response,
22
  ResponseFactory,
23
  async_intercept_route,
24
- check_type_validity,
25
- construct_proxy_dict,
26
  generate_convincing_referer,
27
  get_os_name,
28
  intercept_route,
29
  )
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- class CamoufoxEngine:
33
  def __init__(
34
  self,
 
35
  headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
36
  block_images: bool = False,
37
  disable_resources: bool = False,
@@ -39,29 +95,29 @@ class CamoufoxEngine:
39
  allow_webgl: bool = True,
40
  network_idle: bool = False,
41
  humanize: Union[bool, float] = True,
42
- solve_cloudflare: Optional[bool] = False,
43
- wait: Optional[int] = 0,
44
- timeout: Optional[float] = 30000,
45
- page_action: Callable = None,
46
  wait_selector: Optional[str] = None,
47
  addons: Optional[List[str]] = None,
48
  wait_selector_state: SelectorWaitStates = "attached",
49
- cookies: Optional[Iterable[Dict]] = None,
50
  google_search: bool = True,
51
  extra_headers: Optional[Dict[str, str]] = None,
52
  proxy: Optional[Union[str, Dict[str, str]]] = None,
53
  os_randomize: bool = False,
54
  disable_ads: bool = False,
55
  geoip: bool = False,
56
- adaptor_arguments: Dict = None,
57
- additional_arguments: Dict = None,
58
  ):
59
- """An engine that uses the Camoufox library; Check the `StealthyFetcher` class for more documentation.
60
 
61
  :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
62
  :param block_images: Prevent the loading of images through Firefox preferences.
63
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
64
- :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
65
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
66
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
67
  :param block_webrtc: Blocks WebRTC entirely.
@@ -76,65 +132,90 @@ class CamoufoxEngine:
76
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
77
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
78
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
79
- :param wait_selector: Wait for a specific css selector to be in a specific state.
80
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
81
  It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
82
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
83
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
84
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
85
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
 
86
  :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
87
  :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
88
  """
89
- self.headless = headless
90
- self.block_images = bool(block_images)
91
- self.disable_resources = bool(disable_resources)
92
- self.block_webrtc = bool(block_webrtc)
93
- self.allow_webgl = bool(allow_webgl)
94
- self.network_idle = bool(network_idle)
95
- self.google_search = bool(google_search)
96
- self.os_randomize = bool(os_randomize)
97
- self.disable_ads = bool(disable_ads)
98
- self.geoip = bool(geoip)
99
- self.extra_headers = extra_headers or {}
100
- self.additional_arguments = additional_arguments or {}
101
- self.proxy = construct_proxy_dict(proxy)
102
- self.addons = addons or []
103
- self.cookies = cookies or []
104
- self.humanize = humanize
105
- self.solve_cloudflare = solve_cloudflare
106
- self.timeout = check_type_validity(timeout, [int, float], 30_000)
107
- self.wait = check_type_validity(wait, [int, float], 0)
108
-
109
- if self.solve_cloudflare and self.timeout < 60_000:
110
- self.timeout = 60_000
111
-
112
- # Page action callable validation
113
- self.page_action = None
114
- if page_action is not None:
115
- if callable(page_action):
116
- self.page_action = page_action
117
- else:
118
- log.error('[Ignored] Argument "page_action" must be callable')
119
 
120
- self.wait_selector = wait_selector
121
- self.wait_selector_state = wait_selector_state
122
- self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
123
-
124
- def _get_camoufox_options(self):
125
- """Return consistent browser options dictionary for both sync and async methods"""
126
- humanize = self.humanize
127
- if self.solve_cloudflare:
128
- humanize = True
129
-
130
- return {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  "geoip": self.geoip,
132
  "proxy": self.proxy,
133
  "enable_cache": True,
134
  "addons": self.addons,
135
  "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
136
  "headless": self.headless,
137
- "humanize": humanize,
138
  "i_know_what_im_doing": True, # To turn warnings off with the user configurations
139
  "allow_webgl": self.allow_webgl,
140
  "block_webrtc": self.block_webrtc,
@@ -142,9 +223,76 @@ class CamoufoxEngine:
142
  "os": None if self.os_randomize else get_os_name(),
143
  **self.additional_arguments,
144
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  @staticmethod
147
- def __detect_cloudflare(page_content):
148
  """
149
  Detect the type of Cloudflare challenge present in the provided page content.
150
 
@@ -179,8 +327,7 @@ class CamoufoxEngine:
179
  :param page: The targeted page
180
  :return:
181
  """
182
- page_content = page.content()
183
- challenge_type = self.__detect_cloudflare(page_content)
184
  if not challenge_type:
185
  log.error("No Cloudflare challenge found.")
186
  return
@@ -199,11 +346,7 @@ class CamoufoxEngine:
199
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
200
  page.wait_for_timeout(500)
201
 
202
- iframe = page.frame(
203
- url=re.compile(
204
- "challenges.cloudflare.com/cdn-cgi/challenge-platform/.*"
205
- )
206
- )
207
  if iframe is None:
208
  log.info("Didn't find Cloudflare iframe!")
209
  return
@@ -224,14 +367,261 @@ class CamoufoxEngine:
224
  log.info("Cloudflare captcha is solved")
225
  return
226
 
227
- async def _async_solve_cloudflare(self, page: async_Page):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
229
 
230
  :param page: The async targeted page
231
  :return:
232
  """
233
- page_content = await page.content()
234
- challenge_type = self.__detect_cloudflare(page_content)
235
  if not challenge_type:
236
  log.error("No Cloudflare challenge found.")
237
  return
@@ -250,11 +640,7 @@ class CamoufoxEngine:
250
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
251
  await page.wait_for_timeout(500)
252
 
253
- iframe = page.frame(
254
- url=re.compile(
255
- "challenges.cloudflare.com/cdn-cgi/challenge-platform/.*"
256
- )
257
- )
258
  if iframe is None:
259
  log.info("Didn't find Cloudflare iframe!")
260
  return
@@ -277,90 +663,19 @@ class CamoufoxEngine:
277
  log.info("Cloudflare captcha is solved")
278
  return
279
 
280
- def fetch(self, url: str) -> Response:
281
  """Opens up the browser and do your request based on your chosen options.
282
 
283
- :param url: Target url.
284
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
285
  """
286
- final_response = None
287
- referer = generate_convincing_referer(url) if self.google_search else None
288
-
289
- def handle_response(finished_response):
290
- nonlocal final_response
291
- if (
292
- finished_response.request.resource_type == "document"
293
- and finished_response.request.is_navigation_request()
294
- ):
295
- final_response = finished_response
296
-
297
- with Camoufox(**self._get_camoufox_options()) as browser:
298
- context = browser.new_context()
299
- if self.cookies:
300
- context.add_cookies(self.cookies)
301
-
302
- page = context.new_page()
303
- page.set_default_navigation_timeout(self.timeout)
304
- page.set_default_timeout(self.timeout)
305
- page.on("response", handle_response)
306
-
307
- if self.disable_resources:
308
- page.route("**/*", intercept_route)
309
-
310
- if self.extra_headers:
311
- page.set_extra_http_headers(self.extra_headers)
312
-
313
- first_response = page.goto(url, referer=referer)
314
- page.wait_for_load_state(state="domcontentloaded")
315
-
316
- if self.network_idle:
317
- page.wait_for_load_state("networkidle")
318
-
319
- if self.solve_cloudflare:
320
- self._solve_cloudflare(page)
321
- # Make sure the page is fully loaded after the captcha
322
- page.wait_for_load_state(state="load")
323
- page.wait_for_load_state(state="domcontentloaded")
324
- if self.network_idle:
325
- page.wait_for_load_state("networkidle")
326
-
327
- if self.page_action is not None:
328
- try:
329
- page = self.page_action(page)
330
- except Exception as e:
331
- log.error(f"Error executing page_action: {e}")
332
-
333
- if self.wait_selector and type(self.wait_selector) is str:
334
- try:
335
- waiter = page.locator(self.wait_selector)
336
- waiter.first.wait_for(state=self.wait_selector_state)
337
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
338
- page.wait_for_load_state(state="load")
339
- page.wait_for_load_state(state="domcontentloaded")
340
- if self.network_idle:
341
- page.wait_for_load_state("networkidle")
342
- except Exception as e:
343
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
344
 
345
- page.wait_for_timeout(self.wait)
346
- response = ResponseFactory.from_playwright_response(
347
- page, first_response, final_response, self.adaptor_arguments
348
- )
349
- page.close()
350
- context.close()
351
-
352
- return response
353
-
354
- async def async_fetch(self, url: str) -> Response:
355
- """Opens up the browser and do your request based on your chosen options.
356
-
357
- :param url: Target url.
358
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
359
- """
360
  final_response = None
361
  referer = generate_convincing_referer(url) if self.google_search else None
362
 
363
- async def handle_response(finished_response):
364
  nonlocal final_response
365
  if (
366
  finished_response.request.resource_type == "document"
@@ -368,59 +683,59 @@ class CamoufoxEngine:
368
  ):
369
  final_response = finished_response
370
 
371
- async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
372
- context = await browser.new_context()
373
- if self.cookies:
374
- await context.add_cookies(self.cookies)
375
 
376
- page = await context.new_page()
377
- page.set_default_navigation_timeout(self.timeout)
378
- page.set_default_timeout(self.timeout)
379
- page.on("response", handle_response)
380
-
381
- if self.disable_resources:
382
- await page.route("**/*", async_intercept_route)
383
-
384
- if self.extra_headers:
385
- await page.set_extra_http_headers(self.extra_headers)
386
-
387
- first_response = await page.goto(url, referer=referer)
388
- await page.wait_for_load_state(state="domcontentloaded")
389
 
390
  if self.network_idle:
391
- await page.wait_for_load_state("networkidle")
 
 
 
392
 
393
  if self.solve_cloudflare:
394
- await self._async_solve_cloudflare(page)
395
  # Make sure the page is fully loaded after the captcha
396
- await page.wait_for_load_state(state="load")
397
- await page.wait_for_load_state(state="domcontentloaded")
398
  if self.network_idle:
399
- await page.wait_for_load_state("networkidle")
400
 
401
  if self.page_action is not None:
402
  try:
403
- page = await self.page_action(page)
404
  except Exception as e:
405
- log.error(f"Error executing async page_action: {e}")
406
 
407
- if self.wait_selector and type(self.wait_selector) is str:
408
  try:
409
- waiter = page.locator(self.wait_selector)
410
  await waiter.first.wait_for(state=self.wait_selector_state)
411
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
412
- await page.wait_for_load_state(state="load")
413
- await page.wait_for_load_state(state="domcontentloaded")
414
  if self.network_idle:
415
- await page.wait_for_load_state("networkidle")
416
  except Exception as e:
417
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
418
 
419
- await page.wait_for_timeout(self.wait)
 
 
420
  response = await ResponseFactory.from_async_playwright_response(
421
- page, first_response, final_response, self.adaptor_arguments
422
  )
423
- await page.close()
424
- await context.close()
425
 
426
- return response
 
 
 
 
 
 
 
 
1
+ from time import time, sleep
2
+ from re import compile as re_compile
3
+ from asyncio import sleep as asyncio_sleep, Lock
4
+
5
+ from camoufox import AsyncNewBrowser, NewBrowser, DefaultAddons
6
+ from playwright.sync_api import (
7
+ Response as SyncPlaywrightResponse,
8
+ sync_playwright,
9
+ BrowserType,
10
+ Browser,
11
+ BrowserContext,
12
+ Playwright,
13
+ Locator,
14
+ Page,
15
+ )
16
+ from playwright.async_api import (
17
+ async_playwright,
18
+ Response as AsyncPlaywrightResponse,
19
+ BrowserType as AsyncBrowserType,
20
+ Browser as AsyncBrowser,
21
+ BrowserContext as AsyncBrowserContext,
22
+ Playwright as AsyncPlaywright,
23
+ Locator as AsyncLocator,
24
+ Page as async_Page,
25
+ )
26
 
27
+ from scrapling.core.utils import log
28
+ from ._page import PageInfo, PagePool
29
+ from ._validators import validate, CamoufoxConfig
30
  from scrapling.core._types import (
 
31
  Dict,
 
 
32
  Optional,
 
33
  Union,
34
+ Callable,
35
+ Literal,
36
+ List,
37
+ SelectorWaitStates,
38
  )
 
39
  from scrapling.engines.toolbelt import (
40
  Response,
41
  ResponseFactory,
42
  async_intercept_route,
 
 
43
  generate_convincing_referer,
44
  get_os_name,
45
  intercept_route,
46
  )
47
 
48
+ __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
49
+
50
+
51
+ class StealthySession:
52
+ """A Stealthy session manager with page pooling."""
53
+
54
+ __slots__ = (
55
+ "max_pages",
56
+ "headless",
57
+ "block_images",
58
+ "disable_resources",
59
+ "block_webrtc",
60
+ "allow_webgl",
61
+ "network_idle",
62
+ "humanize",
63
+ "solve_cloudflare",
64
+ "wait",
65
+ "timeout",
66
+ "page_action",
67
+ "wait_selector",
68
+ "addons",
69
+ "wait_selector_state",
70
+ "cookies",
71
+ "google_search",
72
+ "extra_headers",
73
+ "proxy",
74
+ "os_randomize",
75
+ "disable_ads",
76
+ "geoip",
77
+ "adaptor_arguments",
78
+ "additional_arguments",
79
+ "playwright",
80
+ "browser",
81
+ "context",
82
+ "page_pool",
83
+ "_closed",
84
+ "launch_options",
85
+ "context_options",
86
+ )
87
 
 
88
  def __init__(
89
  self,
90
+ max_pages: int = 1,
91
  headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
92
  block_images: bool = False,
93
  disable_resources: bool = False,
 
95
  allow_webgl: bool = True,
96
  network_idle: bool = False,
97
  humanize: Union[bool, float] = True,
98
+ solve_cloudflare: bool = False,
99
+ wait: Union[int, float] = 0,
100
+ timeout: Union[int, float] = 30000,
101
+ page_action: Optional[Callable] = None,
102
  wait_selector: Optional[str] = None,
103
  addons: Optional[List[str]] = None,
104
  wait_selector_state: SelectorWaitStates = "attached",
105
+ cookies: Optional[List[Dict]] = None,
106
  google_search: bool = True,
107
  extra_headers: Optional[Dict[str, str]] = None,
108
  proxy: Optional[Union[str, Dict[str, str]]] = None,
109
  os_randomize: bool = False,
110
  disable_ads: bool = False,
111
  geoip: bool = False,
112
+ adaptor_arguments: Optional[Dict] = None,
113
+ additional_arguments: Optional[Dict] = None,
114
  ):
115
+ """A Browser session manager with page pooling
116
 
117
  :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
118
  :param block_images: Prevent the loading of images through Firefox preferences.
119
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
120
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
121
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
122
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
123
  :param block_webrtc: Blocks WebRTC entirely.
 
132
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
133
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
134
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
135
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
136
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
137
  It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
138
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
139
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
140
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
141
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
142
+ :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
143
  :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
144
  :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
145
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
+ params = {
148
+ "max_pages": max_pages,
149
+ "headless": headless,
150
+ "block_images": block_images,
151
+ "disable_resources": disable_resources,
152
+ "block_webrtc": block_webrtc,
153
+ "allow_webgl": allow_webgl,
154
+ "network_idle": network_idle,
155
+ "humanize": humanize,
156
+ "solve_cloudflare": solve_cloudflare,
157
+ "wait": wait,
158
+ "timeout": timeout,
159
+ "page_action": page_action,
160
+ "wait_selector": wait_selector,
161
+ "addons": addons,
162
+ "wait_selector_state": wait_selector_state,
163
+ "cookies": cookies,
164
+ "google_search": google_search,
165
+ "extra_headers": extra_headers,
166
+ "proxy": proxy,
167
+ "os_randomize": os_randomize,
168
+ "disable_ads": disable_ads,
169
+ "geoip": geoip,
170
+ "adaptor_arguments": adaptor_arguments,
171
+ "additional_arguments": additional_arguments,
172
+ }
173
+ config = validate(params, CamoufoxConfig)
174
+
175
+ self.max_pages = config.max_pages
176
+ self.headless = config.headless
177
+ self.block_images = config.block_images
178
+ self.disable_resources = config.disable_resources
179
+ self.block_webrtc = config.block_webrtc
180
+ self.allow_webgl = config.allow_webgl
181
+ self.network_idle = config.network_idle
182
+ self.humanize = config.humanize
183
+ self.solve_cloudflare = config.solve_cloudflare
184
+ self.wait = config.wait
185
+ self.timeout = config.timeout
186
+ self.page_action = config.page_action
187
+ self.wait_selector = config.wait_selector
188
+ self.addons = config.addons
189
+ self.wait_selector_state = config.wait_selector_state
190
+ self.cookies = config.cookies
191
+ self.google_search = config.google_search
192
+ self.extra_headers = config.extra_headers
193
+ self.proxy = config.proxy
194
+ self.os_randomize = config.os_randomize
195
+ self.disable_ads = config.disable_ads
196
+ self.geoip = config.geoip
197
+ self.adaptor_arguments = config.adaptor_arguments
198
+ self.additional_arguments = config.additional_arguments
199
+
200
+ self.playwright: Optional[Playwright] = None
201
+ self.browser: Optional[Union[BrowserType, Browser]] = None
202
+ self.context: Optional[BrowserContext] = None
203
+ self.page_pool = PagePool(self.max_pages)
204
+ self._closed = False
205
+ self.adaptor_arguments = config.adaptor_arguments
206
+ self.page_action = config.page_action
207
+ self.__initiate_browser_options__()
208
+
209
+ def __initiate_browser_options__(self):
210
+ """Initiate browser options."""
211
+ self.launch_options = {
212
  "geoip": self.geoip,
213
  "proxy": self.proxy,
214
  "enable_cache": True,
215
  "addons": self.addons,
216
  "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
217
  "headless": self.headless,
218
+ "humanize": True if self.solve_cloudflare else self.humanize,
219
  "i_know_what_im_doing": True, # To turn warnings off with the user configurations
220
  "allow_webgl": self.allow_webgl,
221
  "block_webrtc": self.block_webrtc,
 
223
  "os": None if self.os_randomize else get_os_name(),
224
  **self.additional_arguments,
225
  }
226
+ self.context_options = {}
227
+
228
+ def __create__(self):
229
+ """Create a browser for this instance and context."""
230
+ self.playwright = sync_playwright().start()
231
+ self.browser = NewBrowser(self.playwright, **self.launch_options)
232
+ self.context = self.browser.new_context(**self.context_options)
233
+ if self.cookies:
234
+ self.context.add_cookies(self.cookies)
235
+
236
+ def __enter__(self):
237
+ self.__create__()
238
+ return self
239
+
240
+ def __exit__(self, exc_type, exc_val, exc_tb):
241
+ self.close()
242
+
243
+ def close(self):
244
+ """Close all resources"""
245
+ if self._closed:
246
+ return
247
+
248
+ if self.context:
249
+ self.context.close()
250
+ self.context = None
251
+
252
+ if self.browser:
253
+ self.browser.close()
254
+ self.browser = None
255
+
256
+ if self.playwright:
257
+ self.playwright.stop()
258
+ self.playwright = None
259
+
260
+ self._closed = True
261
+
262
+ def _get_or_create_page(self) -> PageInfo:
263
+ """Get an available page or create a new one"""
264
+ # Try to get a ready page first
265
+ page_info = self.page_pool.get_ready_page()
266
+ if page_info:
267
+ return page_info
268
+
269
+ # Create a new page if under limit
270
+ if self.page_pool.pages_count < self.max_pages:
271
+ page = self.context.new_page()
272
+ page.set_default_navigation_timeout(self.timeout)
273
+ page.set_default_timeout(self.timeout)
274
+ if self.extra_headers:
275
+ page.set_extra_http_headers(self.extra_headers)
276
+
277
+ if self.disable_resources:
278
+ page.route("**/*", intercept_route)
279
+
280
+ return self.page_pool.add_page(page)
281
+
282
+ # Wait for a page to become available
283
+ max_wait = 30
284
+ start_time = time()
285
+
286
+ while time() - start_time < max_wait:
287
+ page_info = self.page_pool.get_ready_page()
288
+ if page_info:
289
+ return page_info
290
+ sleep(0.05)
291
+
292
+ raise TimeoutError("No pages available within timeout period")
293
 
294
  @staticmethod
295
+ def _detect_cloudflare(page_content):
296
  """
297
  Detect the type of Cloudflare challenge present in the provided page content.
298
 
 
327
  :param page: The targeted page
328
  :return:
329
  """
330
+ challenge_type = self._detect_cloudflare(page.content())
 
331
  if not challenge_type:
332
  log.error("No Cloudflare challenge found.")
333
  return
 
346
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
347
  page.wait_for_timeout(500)
348
 
349
+ iframe = page.frame(url=__CF_PATTERN__)
 
 
 
 
350
  if iframe is None:
351
  log.info("Didn't find Cloudflare iframe!")
352
  return
 
367
  log.info("Cloudflare captcha is solved")
368
  return
369
 
370
+ def fetch(self, url: str) -> Response:
371
+ """Opens up the browser and performs your request based on your chosen options.
372
+
373
+ :param url: The target URL.
374
+ :return: A `Response` object.
375
+ """
376
+ if self._closed:
377
+ raise RuntimeError("Context manager has been closed")
378
+
379
+ final_response = None
380
+ referer = generate_convincing_referer(url) if self.google_search else None
381
+
382
+ def handle_response(finished_response: SyncPlaywrightResponse):
383
+ nonlocal final_response
384
+ if (
385
+ finished_response.request.resource_type == "document"
386
+ and finished_response.request.is_navigation_request()
387
+ ):
388
+ final_response = finished_response
389
+
390
+ page_info = self._get_or_create_page()
391
+ page_info.mark_busy(url=url)
392
+
393
+ try:
394
+ # Navigate to URL and wait for a specified state
395
+ page_info.page.on("response", handle_response)
396
+ first_response = page_info.page.goto(url, referer=referer)
397
+ page_info.page.wait_for_load_state(state="domcontentloaded")
398
+
399
+ if self.network_idle:
400
+ page_info.page.wait_for_load_state("networkidle")
401
+
402
+ if not first_response:
403
+ raise RuntimeError(f"Failed to get response for {url}")
404
+
405
+ if self.solve_cloudflare:
406
+ self._solve_cloudflare(page_info.page)
407
+ # Make sure the page is fully loaded after the captcha
408
+ page_info.page.wait_for_load_state(state="load")
409
+ page_info.page.wait_for_load_state(state="domcontentloaded")
410
+ if self.network_idle:
411
+ page_info.page.wait_for_load_state("networkidle")
412
+
413
+ if self.page_action is not None:
414
+ try:
415
+ page_info.page = self.page_action(page_info.page)
416
+ except Exception as e:
417
+ log.error(f"Error executing page_action: {e}")
418
+
419
+ if self.wait_selector:
420
+ try:
421
+ waiter: Locator = page_info.page.locator(self.wait_selector)
422
+ waiter.first.wait_for(state=self.wait_selector_state)
423
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
424
+ page_info.page.wait_for_load_state(state="load")
425
+ page_info.page.wait_for_load_state(state="domcontentloaded")
426
+ if self.network_idle:
427
+ page_info.page.wait_for_load_state("networkidle")
428
+ except Exception as e:
429
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")
430
+
431
+ page_info.page.wait_for_timeout(self.wait)
432
+ response = ResponseFactory.from_playwright_response(
433
+ page_info.page, first_response, final_response, self.adaptor_arguments
434
+ )
435
+
436
+ # Mark the page as ready for next use
437
+ page_info.mark_ready()
438
+
439
+ return response
440
+
441
+ except Exception as e:
442
+ page_info.mark_error()
443
+ raise e
444
+
445
+ def get_pool_stats(self) -> Dict[str, int]:
446
+ """Get statistics about the current page pool"""
447
+ return {
448
+ "total_pages": self.page_pool.pages_count,
449
+ "ready_pages": self.page_pool.ready_count,
450
+ "busy_pages": self.page_pool.busy_count,
451
+ "max_pages": self.max_pages,
452
+ }
453
+
454
+
455
+ class AsyncStealthySession(StealthySession):
456
+ """A Stealthy session manager with page pooling."""
457
+
458
+ def __init__(
459
+ self,
460
+ max_pages: int = 1,
461
+ headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
462
+ block_images: bool = False,
463
+ disable_resources: bool = False,
464
+ block_webrtc: bool = False,
465
+ allow_webgl: bool = True,
466
+ network_idle: bool = False,
467
+ humanize: Union[bool, float] = True,
468
+ solve_cloudflare: bool = False,
469
+ wait: Union[int, float] = 0,
470
+ timeout: Union[int, float] = 30000,
471
+ page_action: Optional[Callable] = None,
472
+ wait_selector: Optional[str] = None,
473
+ addons: Optional[List[str]] = None,
474
+ wait_selector_state: SelectorWaitStates = "attached",
475
+ cookies: Optional[List[Dict]] = None,
476
+ google_search: bool = True,
477
+ extra_headers: Optional[Dict[str, str]] = None,
478
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
479
+ os_randomize: bool = False,
480
+ disable_ads: bool = False,
481
+ geoip: bool = False,
482
+ adaptor_arguments: Optional[Dict] = None,
483
+ additional_arguments: Optional[Dict] = None,
484
+ ):
485
+ """A Browser session manager with page pooling
486
+
487
+ :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
488
+ :param block_images: Prevent the loading of images through Firefox preferences.
489
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
490
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
491
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
492
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
493
+ :param block_webrtc: Blocks WebRTC entirely.
494
+ :param cookies: Set cookies for the next request.
495
+ :param addons: List of Firefox addons to use. Must be paths to extracted addons.
496
+ :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
497
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
498
+ :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
499
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
500
+ :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
501
+ :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
502
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
503
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
504
+ :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
505
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
506
+ :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
507
+ It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
508
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
509
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
510
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
511
+ :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
512
+ :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
513
+ :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
514
+ :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
515
+ """
516
+ super().__init__(
517
+ max_pages,
518
+ headless,
519
+ block_images,
520
+ disable_resources,
521
+ block_webrtc,
522
+ allow_webgl,
523
+ network_idle,
524
+ humanize,
525
+ solve_cloudflare,
526
+ wait,
527
+ timeout,
528
+ page_action,
529
+ wait_selector,
530
+ addons,
531
+ wait_selector_state,
532
+ cookies,
533
+ google_search,
534
+ extra_headers,
535
+ proxy,
536
+ os_randomize,
537
+ disable_ads,
538
+ geoip,
539
+ adaptor_arguments,
540
+ additional_arguments,
541
+ )
542
+ self.playwright: Optional[AsyncPlaywright] = None
543
+ self.browser: Optional[Union[AsyncBrowserType, AsyncBrowser]] = None
544
+ self.context: Optional[AsyncBrowserContext] = None
545
+ self._lock = Lock()
546
+ self.__enter__ = None
547
+ self.__exit__ = None
548
+
549
+ async def __create__(self):
550
+ """Create a browser for this instance and context."""
551
+ self.playwright: AsyncPlaywright = await async_playwright().start()
552
+ self.browser = await AsyncNewBrowser(self.playwright, **self.launch_options)
553
+ self.context: AsyncBrowserContext = await self.browser.new_context(
554
+ **self.context_options
555
+ )
556
+ if self.cookies:
557
+ await self.context.add_cookies(self.cookies)
558
+
559
+ async def __aenter__(self):
560
+ await self.__create__()
561
+ return self
562
+
563
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
564
+ await self.close()
565
+
566
+ async def close(self):
567
+ """Close all resources"""
568
+ if self._closed:
569
+ return
570
+
571
+ if self.context:
572
+ await self.context.close()
573
+ self.context = None
574
+
575
+ if self.browser:
576
+ await self.browser.close()
577
+ self.browser = None
578
+
579
+ if self.playwright:
580
+ await self.playwright.stop()
581
+ self.playwright = None
582
+
583
+ self._closed = True
584
+
585
+ async def _get_or_create_page(self) -> PageInfo:
586
+ """Get an available page or create a new one"""
587
+ async with self._lock:
588
+ # Try to get a ready page first
589
+ page_info = self.page_pool.get_ready_page()
590
+ if page_info:
591
+ return page_info
592
+
593
+ # Create a new page if under limit
594
+ if self.page_pool.pages_count < self.max_pages:
595
+ page = await self.context.new_page()
596
+ page.set_default_navigation_timeout(self.timeout)
597
+ page.set_default_timeout(self.timeout)
598
+ if self.extra_headers:
599
+ await page.set_extra_http_headers(self.extra_headers)
600
+
601
+ if self.disable_resources:
602
+ await page.route("**/*", async_intercept_route)
603
+
604
+ return self.page_pool.add_page(page)
605
+
606
+ # Wait for a page to become available
607
+ max_wait = 30
608
+ start_time = time()
609
+
610
+ while time() - start_time < max_wait:
611
+ page_info = self.page_pool.get_ready_page()
612
+ if page_info:
613
+ return page_info
614
+ await asyncio_sleep(0.05)
615
+
616
+ raise TimeoutError("No pages available within timeout period")
617
+
618
+ async def _solve_cloudflare(self, page: async_Page):
619
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
620
 
621
  :param page: The async targeted page
622
  :return:
623
  """
624
+ challenge_type = self._detect_cloudflare(await page.content())
 
625
  if not challenge_type:
626
  log.error("No Cloudflare challenge found.")
627
  return
 
640
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
641
  await page.wait_for_timeout(500)
642
 
643
+ iframe = page.frame(url=__CF_PATTERN__)
 
 
 
 
644
  if iframe is None:
645
  log.info("Didn't find Cloudflare iframe!")
646
  return
 
663
  log.info("Cloudflare captcha is solved")
664
  return
665
 
666
+ async def fetch(self, url: str) -> Response:
667
  """Opens up the browser and do your request based on your chosen options.
668
 
669
+ :param url: The Target url.
670
+ :return: A `Response` object.
671
  """
672
+ if self._closed:
673
+ raise RuntimeError("Context manager has been closed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
  final_response = None
676
  referer = generate_convincing_referer(url) if self.google_search else None
677
 
678
+ async def handle_response(finished_response: AsyncPlaywrightResponse):
679
  nonlocal final_response
680
  if (
681
  finished_response.request.resource_type == "document"
 
683
  ):
684
  final_response = finished_response
685
 
686
+ page_info = await self._get_or_create_page()
687
+ page_info.mark_busy(url=url)
 
 
688
 
689
+ try:
690
+ # Navigate to URL and wait for a specified state
691
+ page_info.page.on("response", handle_response)
692
+ first_response = await page_info.page.goto(url, referer=referer)
693
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
 
 
 
 
 
 
 
 
694
 
695
  if self.network_idle:
696
+ await page_info.page.wait_for_load_state("networkidle")
697
+
698
+ if not first_response:
699
+ raise RuntimeError(f"Failed to get response for {url}")
700
 
701
  if self.solve_cloudflare:
702
+ await self._solve_cloudflare(page_info.page)
703
  # Make sure the page is fully loaded after the captcha
704
+ await page_info.page.wait_for_load_state(state="load")
705
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
706
  if self.network_idle:
707
+ await page_info.page.wait_for_load_state("networkidle")
708
 
709
  if self.page_action is not None:
710
  try:
711
+ page_info.page = await self.page_action(page_info.page)
712
  except Exception as e:
713
+ log.error(f"Error executing page_action: {e}")
714
 
715
+ if self.wait_selector:
716
  try:
717
+ waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
718
  await waiter.first.wait_for(state=self.wait_selector_state)
719
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
720
+ await page_info.page.wait_for_load_state(state="load")
721
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
722
  if self.network_idle:
723
+ await page_info.page.wait_for_load_state("networkidle")
724
  except Exception as e:
725
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
726
 
727
+ await page_info.page.wait_for_timeout(self.wait)
728
+
729
+ # Create response object
730
  response = await ResponseFactory.from_async_playwright_response(
731
+ page_info.page, first_response, final_response, self.adaptor_arguments
732
  )
 
 
733
 
734
+ # Mark the page as ready for next use
735
+ page_info.mark_ready()
736
+
737
+ return response
738
+
739
+ except Exception as e:
740
+ page_info.mark_error()
741
+ raise e
scrapling/engines/_browsers/_validators.py CHANGED
@@ -1,5 +1,6 @@
1
  from msgspec import Struct, convert, ValidationError
2
  from urllib.parse import urlparse
 
3
 
4
  from scrapling.core._types import (
5
  Optional,
@@ -78,6 +79,70 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
78
  raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
79
 
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def validate(params, model):
82
  try:
83
  config = convert(params, model)
 
1
  from msgspec import Struct, convert, ValidationError
2
  from urllib.parse import urlparse
3
+ from os.path import exists, isdir
4
 
5
  from scrapling.core._types import (
6
  Optional,
 
79
  raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
80
 
81
 
82
+ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
83
+ """Configuration struct for validation"""
84
+
85
+ max_pages: int = 1
86
+ headless: Union[bool, Literal["virtual"]] = True # noqa: F821
87
+ block_images: bool = False
88
+ disable_resources: bool = False
89
+ block_webrtc: bool = False
90
+ allow_webgl: bool = True
91
+ network_idle: bool = False
92
+ humanize: Union[bool, float] = True
93
+ solve_cloudflare: bool = False
94
+ wait: Union[int, float] = 0
95
+ timeout: Union[int, float] = 30000
96
+ page_action: Optional[Callable] = None
97
+ wait_selector: Optional[str] = None
98
+ addons: Optional[List[str]] = None
99
+ wait_selector_state: SelectorWaitStates = "attached"
100
+ cookies: Optional[List[Dict]] = None
101
+ google_search: bool = True
102
+ extra_headers: Optional[Dict[str, str]] = None
103
+ proxy: Optional[Union[str, Dict[str, str]]] = (
104
+ None # The default value for proxy in Playwright's source is `None`
105
+ )
106
+ os_randomize: bool = False
107
+ disable_ads: bool = False
108
+ geoip: bool = False
109
+ adaptor_arguments: Optional[Dict] = None
110
+ additional_arguments: Optional[Dict] = None
111
+
112
+ def __post_init__(self):
113
+ """Custom validation after msgspec validation"""
114
+ if self.max_pages < 1 or self.max_pages > 50:
115
+ raise ValueError("max_pages must be between 1 and 50")
116
+ if self.timeout < 0:
117
+ raise ValueError("timeout must be >= 0")
118
+ if self.page_action is not None and not callable(self.page_action):
119
+ raise TypeError(
120
+ f"page_action must be callable, got {type(self.page_action).__name__}"
121
+ )
122
+ if self.proxy:
123
+ self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
124
+
125
+ if not self.addons:
126
+ self.addons = []
127
+ else:
128
+ for addon in self.addons:
129
+ if not exists(addon):
130
+ raise FileNotFoundError(f"Addon's path not found: {addon}")
131
+ elif not isdir(addon):
132
+ raise ValueError(
133
+ f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
134
+ )
135
+
136
+ if not self.cookies:
137
+ self.cookies = []
138
+ if self.solve_cloudflare and self.timeout < 60_000:
139
+ self.timeout = 60_000
140
+ if not self.adaptor_arguments:
141
+ self.adaptor_arguments = {}
142
+ if not self.additional_arguments:
143
+ self.additional_arguments = {}
144
+
145
+
146
  def validate(params, model):
147
  try:
148
  config = convert(params, model)
scrapling/fetchers.py CHANGED
@@ -10,10 +10,10 @@ from scrapling.core._types import (
10
  )
11
  from scrapling.engines import (
12
  FetcherSession,
13
- CamoufoxEngine,
 
14
  DynamicSession,
15
  AsyncDynamicSession,
16
- check_if_engine_usable,
17
  FetcherClient as _FetcherClient,
18
  AsyncFetcherClient as _AsyncFetcherClient,
19
  )
@@ -57,23 +57,23 @@ class StealthyFetcher(BaseFetcher):
57
  block_webrtc: bool = False,
58
  allow_webgl: bool = True,
59
  network_idle: bool = False,
60
- addons: Optional[List[str]] = None,
61
- cookies: Optional[Iterable[Dict]] = None,
62
- wait: Optional[int] = 0,
63
- timeout: Optional[float] = 30000,
64
- page_action: Callable = None,
65
  wait_selector: Optional[str] = None,
66
- humanize: Optional[Union[bool, float]] = True,
67
- solve_cloudflare: Optional[bool] = False,
68
  wait_selector_state: SelectorWaitStates = "attached",
 
69
  google_search: bool = True,
70
  extra_headers: Optional[Dict[str, str]] = None,
71
  proxy: Optional[Union[str, Dict[str, str]]] = None,
72
  os_randomize: bool = False,
73
  disable_ads: bool = False,
74
  geoip: bool = False,
75
- custom_config: Dict = None,
76
- additional_arguments: Dict = None,
77
  ) -> Response:
78
  """
79
  Opens up a browser and do your request based on your chosen options below.
@@ -106,7 +106,7 @@ class StealthyFetcher(BaseFetcher):
106
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
107
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
108
  :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
109
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
110
  """
111
  if not custom_config:
112
  custom_config = {}
@@ -115,8 +115,9 @@ class StealthyFetcher(BaseFetcher):
115
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
116
  )
117
 
118
- engine = CamoufoxEngine(
119
  wait=wait,
 
120
  proxy=proxy,
121
  geoip=geoip,
122
  addons=addons,
@@ -139,8 +140,8 @@ class StealthyFetcher(BaseFetcher):
139
  wait_selector_state=wait_selector_state,
140
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
141
  additional_arguments=additional_arguments or {},
142
- )
143
- return engine.fetch(url)
144
 
145
  @classmethod
146
  async def async_fetch(
@@ -150,25 +151,25 @@ class StealthyFetcher(BaseFetcher):
150
  block_images: bool = False,
151
  disable_resources: bool = False,
152
  block_webrtc: bool = False,
153
- cookies: Optional[Iterable[Dict]] = None,
154
  allow_webgl: bool = True,
155
  network_idle: bool = False,
156
- addons: Optional[List[str]] = None,
157
- wait: Optional[int] = 0,
158
- timeout: Optional[float] = 30000,
159
- page_action: Callable = None,
 
160
  wait_selector: Optional[str] = None,
161
- humanize: Optional[Union[bool, float]] = True,
162
- solve_cloudflare: Optional[bool] = False,
163
  wait_selector_state: SelectorWaitStates = "attached",
 
164
  google_search: bool = True,
165
  extra_headers: Optional[Dict[str, str]] = None,
166
  proxy: Optional[Union[str, Dict[str, str]]] = None,
167
  os_randomize: bool = False,
168
  disable_ads: bool = False,
169
  geoip: bool = False,
170
- custom_config: Dict = None,
171
- additional_arguments: Dict = None,
172
  ) -> Response:
173
  """
174
  Opens up a browser and do your request based on your chosen options below.
@@ -201,7 +202,7 @@ class StealthyFetcher(BaseFetcher):
201
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
202
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
203
  :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
204
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
205
  """
206
  if not custom_config:
207
  custom_config = {}
@@ -210,8 +211,9 @@ class StealthyFetcher(BaseFetcher):
210
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
211
  )
212
 
213
- engine = CamoufoxEngine(
214
  wait=wait,
 
215
  proxy=proxy,
216
  geoip=geoip,
217
  addons=addons,
@@ -234,8 +236,8 @@ class StealthyFetcher(BaseFetcher):
234
  wait_selector_state=wait_selector_state,
235
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
236
  additional_arguments=additional_arguments or {},
237
- )
238
- return await engine.async_fetch(url)
239
 
240
 
241
  class DynamicFetcher(BaseFetcher):
@@ -425,12 +427,3 @@ class DynamicFetcher(BaseFetcher):
425
 
426
 
427
  PlayWrightFetcher = DynamicFetcher # For backward-compatibility
428
-
429
-
430
- class CustomFetcher(BaseFetcher):
431
- @classmethod
432
- def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
433
- engine = check_if_engine_usable(browser_engine)(
434
- adaptor_arguments=cls._generate_parser_arguments(), **kwargs
435
- )
436
- return engine.fetch(url)
 
10
  )
11
  from scrapling.engines import (
12
  FetcherSession,
13
+ StealthySession,
14
+ AsyncStealthySession,
15
  DynamicSession,
16
  AsyncDynamicSession,
 
17
  FetcherClient as _FetcherClient,
18
  AsyncFetcherClient as _AsyncFetcherClient,
19
  )
 
57
  block_webrtc: bool = False,
58
  allow_webgl: bool = True,
59
  network_idle: bool = False,
60
+ humanize: Union[bool, float] = True,
61
+ solve_cloudflare: bool = False,
62
+ wait: Union[int, float] = 0,
63
+ timeout: Union[int, float] = 30000,
64
+ page_action: Optional[Callable] = None,
65
  wait_selector: Optional[str] = None,
66
+ addons: Optional[List[str]] = None,
 
67
  wait_selector_state: SelectorWaitStates = "attached",
68
+ cookies: Optional[List[Dict]] = None,
69
  google_search: bool = True,
70
  extra_headers: Optional[Dict[str, str]] = None,
71
  proxy: Optional[Union[str, Dict[str, str]]] = None,
72
  os_randomize: bool = False,
73
  disable_ads: bool = False,
74
  geoip: bool = False,
75
+ custom_config: Optional[Dict] = None,
76
+ additional_arguments: Optional[Dict] = None,
77
  ) -> Response:
78
  """
79
  Opens up a browser and do your request based on your chosen options below.
 
106
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
107
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
108
  :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
109
+ :return: A `Response` object.
110
  """
111
  if not custom_config:
112
  custom_config = {}
 
115
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
116
  )
117
 
118
+ with StealthySession(
119
  wait=wait,
120
+ max_pages=1,
121
  proxy=proxy,
122
  geoip=geoip,
123
  addons=addons,
 
140
  wait_selector_state=wait_selector_state,
141
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
142
  additional_arguments=additional_arguments or {},
143
+ ) as engine:
144
+ return engine.fetch(url)
145
 
146
  @classmethod
147
  async def async_fetch(
 
151
  block_images: bool = False,
152
  disable_resources: bool = False,
153
  block_webrtc: bool = False,
 
154
  allow_webgl: bool = True,
155
  network_idle: bool = False,
156
+ humanize: Union[bool, float] = True,
157
+ solve_cloudflare: bool = False,
158
+ wait: Union[int, float] = 0,
159
+ timeout: Union[int, float] = 30000,
160
+ page_action: Optional[Callable] = None,
161
  wait_selector: Optional[str] = None,
162
+ addons: Optional[List[str]] = None,
 
163
  wait_selector_state: SelectorWaitStates = "attached",
164
+ cookies: Optional[List[Dict]] = None,
165
  google_search: bool = True,
166
  extra_headers: Optional[Dict[str, str]] = None,
167
  proxy: Optional[Union[str, Dict[str, str]]] = None,
168
  os_randomize: bool = False,
169
  disable_ads: bool = False,
170
  geoip: bool = False,
171
+ custom_config: Optional[Dict] = None,
172
+ additional_arguments: Optional[Dict] = None,
173
  ) -> Response:
174
  """
175
  Opens up a browser and do your request based on your chosen options below.
 
202
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
203
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
204
  :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
205
+ :return: A `Response` object.
206
  """
207
  if not custom_config:
208
  custom_config = {}
 
211
  f"The custom parser config must be of type dictionary, got {cls.__class__}"
212
  )
213
 
214
+ async with AsyncStealthySession(
215
  wait=wait,
216
+ max_pages=1,
217
  proxy=proxy,
218
  geoip=geoip,
219
  addons=addons,
 
236
  wait_selector_state=wait_selector_state,
237
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
238
  additional_arguments=additional_arguments or {},
239
+ ) as engine:
240
+ return await engine.fetch(url)
241
 
242
 
243
  class DynamicFetcher(BaseFetcher):
 
427
 
428
 
429
  PlayWrightFetcher = DynamicFetcher # For backward-compatibility