Karim shoair committed on
Commit
20dd99a
·
1 Parent(s): 2c0ab17

refactor: Restructure the fetchers code to not use more memory than needed

Browse files
scrapling/fetchers/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Lazy public interface for ``scrapling.fetchers``.

Fetcher classes are imported only on first attribute access (PEP 562), so
importing this package does not load the HTTP/browser engines until one of
them is actually used.
"""
from importlib import import_module
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
    from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
    from scrapling.fetchers.firefox import StealthyFetcher, StealthySession, AsyncStealthySession


# Lazy import mapping: public attribute name -> (module path, attribute name).
_LAZY_IMPORTS = {
    "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),
    "AsyncFetcher": ("scrapling.fetchers.requests", "AsyncFetcher"),
    "FetcherSession": ("scrapling.fetchers.requests", "FetcherSession"),
    "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
    "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
    "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
    "StealthyFetcher": ("scrapling.fetchers.firefox", "StealthyFetcher"),
    "StealthySession": ("scrapling.fetchers.firefox", "StealthySession"),
    "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
}

# NOTE(review): only the fetcher classes are star-exported; the session classes
# stay reachable lazily by explicit name — presumably intentional, confirm.
__all__ = ["Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]


def __getattr__(name: str) -> Any:
    """Resolve `name` lazily on first access (PEP 562 module ``__getattr__``).

    The resolved object is cached in the module globals so subsequent
    lookups bypass this hook entirely.

    :param name: Attribute being looked up on the package.
    :return: The lazily imported class.
    :raises AttributeError: If `name` is not a known lazy attribute.
    """
    try:
        module_path, class_name = _LAZY_IMPORTS[name]
    except KeyError:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None

    value = getattr(import_module(module_path), class_name)
    globals()[name] = value  # Cache: later accesses skip __getattr__ entirely
    return value


def __dir__() -> list[str]:
    """Support for dir() and autocomplete."""
    return sorted(_LAZY_IMPORTS)
scrapling/fetchers/chrome.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.core._types import (
2
+ Callable,
3
+ Dict,
4
+ List,
5
+ Optional,
6
+ SelectorWaitStates,
7
+ Iterable,
8
+ )
9
+ from scrapling.engines.toolbelt.custom import BaseFetcher, Response
10
+ from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
11
+
12
+
13
class DynamicFetcher(BaseFetcher):
    """A `Fetcher` class type that provide many options, all of them are based on PlayWright.

    Using this Fetcher class, you can do requests with:
        - Vanilla Playwright without any modifications other than the ones you chose.
        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
            Some of the things stealth mode does include:
                1) Patches the CDP runtime fingerprint.
                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
                3) Using custom flags on launch to hide Playwright even more and make it faster.
                4) Generates real browser's headers of the same type and same user OS, then append it to the request.
        - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.

    > Note that these are the main options with PlayWright, but it can be mixed.
    """

    @classmethod
    def _build_selector_config(cls, custom_config: Optional[Dict]) -> Dict:
        """Validate `custom_config` and merge it over the class-level parser arguments.

        Shared by `fetch` and `async_fetch` so the validation logic exists once.

        :param custom_config: Optional dictionary of parser-argument overrides.
        :return: The merged selector/parser configuration dictionary.
        :raises ValueError: If `custom_config` is given but is not a dictionary.
        """
        if not custom_config:
            custom_config = {}
        elif not isinstance(custom_config, dict):
            # Report the type of the offending value; the previous message
            # interpolated `cls.__class__` (the metaclass), which was useless.
            raise ValueError(f"The custom parser config must be of type dictionary, got {type(custom_config)}")

        # Per-request overrides take priority over class-level parser arguments.
        return {**cls._generate_parser_arguments(), **custom_config}

    @classmethod
    def fetch(
        cls,
        url: str,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        init_script: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        load_dom: bool = True,
        wait_selector_state: SelectorWaitStates = "attached",
        custom_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
        :return: A `Response` object.
        """
        with DynamicSession(
            wait=wait,
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            stealth=stealth,
            cdp_url=cdp_url,
            cookies=cookies,
            headless=headless,
            load_dom=load_dom,
            useragent=useragent,
            real_chrome=real_chrome,
            page_action=page_action,
            hide_canvas=hide_canvas,
            init_script=init_script,
            network_idle=network_idle,
            google_search=google_search,
            extra_headers=extra_headers,
            wait_selector=wait_selector,
            disable_webgl=disable_webgl,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            selector_config=cls._build_selector_config(custom_config),
        ) as session:
            return session.fetch(url)

    @classmethod
    async def async_fetch(
        cls,
        url: str,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        init_script: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        load_dom: bool = True,
        wait_selector_state: SelectorWaitStates = "attached",
        custom_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
        :return: A `Response` object.
        """
        async with AsyncDynamicSession(
            wait=wait,
            max_pages=1,  # One-shot request: a single page is all this call needs
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            stealth=stealth,
            cdp_url=cdp_url,
            cookies=cookies,
            headless=headless,
            load_dom=load_dom,
            useragent=useragent,
            real_chrome=real_chrome,
            page_action=page_action,
            hide_canvas=hide_canvas,
            init_script=init_script,
            network_idle=network_idle,
            google_search=google_search,
            extra_headers=extra_headers,
            wait_selector=wait_selector,
            disable_webgl=disable_webgl,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            selector_config=cls._build_selector_config(custom_config),
        ) as session:
            return await session.fetch(url)


PlayWrightFetcher = DynamicFetcher  # For backward-compatibility
scrapling/{fetchers.py → fetchers/firefox.py} RENAMED
@@ -4,41 +4,9 @@ from scrapling.core._types import (
4
  List,
5
  Optional,
6
  SelectorWaitStates,
7
- Iterable,
8
- )
9
- from scrapling.engines.static import (
10
- FetcherSession,
11
- FetcherClient as _FetcherClient,
12
- AsyncFetcherClient as _AsyncFetcherClient,
13
- )
14
- from scrapling.engines._browsers import (
15
- DynamicSession,
16
- StealthySession,
17
- AsyncDynamicSession,
18
- AsyncStealthySession,
19
  )
20
  from scrapling.engines.toolbelt.custom import BaseFetcher, Response
21
-
22
- __FetcherClientInstance__ = _FetcherClient()
23
- __AsyncFetcherClientInstance__ = _AsyncFetcherClient()
24
-
25
-
26
- class Fetcher(BaseFetcher):
27
- """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
28
-
29
- get = __FetcherClientInstance__.get
30
- post = __FetcherClientInstance__.post
31
- put = __FetcherClientInstance__.put
32
- delete = __FetcherClientInstance__.delete
33
-
34
-
35
- class AsyncFetcher(BaseFetcher):
36
- """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
37
-
38
- get = __AsyncFetcherClientInstance__.get
39
- post = __AsyncFetcherClientInstance__.post
40
- put = __AsyncFetcherClientInstance__.put
41
- delete = __AsyncFetcherClientInstance__.delete
42
 
43
 
44
  class StealthyFetcher(BaseFetcher):
@@ -246,198 +214,3 @@ class StealthyFetcher(BaseFetcher):
246
  additional_args=additional_args or {},
247
  ) as engine:
248
  return await engine.fetch(url)
249
-
250
-
251
- class DynamicFetcher(BaseFetcher):
252
- """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
253
-
254
- Using this Fetcher class, you can do requests with:
255
- - Vanilla Playwright without any modifications other than the ones you chose.
256
- - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
257
- Some of the things stealth mode does include:
258
- 1) Patches the CDP runtime fingerprint.
259
- 2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
260
- 3) Using custom flags on launch to hide Playwright even more and make it faster.
261
- 4) Generates real browser's headers of the same type and same user OS, then append it to the request.
262
- - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.
263
-
264
- > Note that these are the main options with PlayWright, but it can be mixed.
265
- """
266
-
267
- @classmethod
268
- def fetch(
269
- cls,
270
- url: str,
271
- headless: bool = True,
272
- google_search: bool = True,
273
- hide_canvas: bool = False,
274
- disable_webgl: bool = False,
275
- real_chrome: bool = False,
276
- stealth: bool = False,
277
- wait: int | float = 0,
278
- page_action: Optional[Callable] = None,
279
- proxy: Optional[str | Dict[str, str]] = None,
280
- locale: str = "en-US",
281
- extra_headers: Optional[Dict[str, str]] = None,
282
- useragent: Optional[str] = None,
283
- cdp_url: Optional[str] = None,
284
- timeout: int | float = 30000,
285
- disable_resources: bool = False,
286
- wait_selector: Optional[str] = None,
287
- init_script: Optional[str] = None,
288
- cookies: Optional[Iterable[Dict]] = None,
289
- network_idle: bool = False,
290
- load_dom: bool = True,
291
- wait_selector_state: SelectorWaitStates = "attached",
292
- custom_config: Optional[Dict] = None,
293
- ) -> Response:
294
- """Opens up a browser and do your request based on your chosen options below.
295
-
296
- :param url: Target url.
297
- :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
298
- :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
299
- Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
300
- This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
301
- :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
302
- :param cookies: Set cookies for the next request.
303
- :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
304
- :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
305
- :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
306
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
307
- :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
308
- :param wait_selector: Wait for a specific CSS selector to be in a specific state.
309
- :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
310
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
311
- :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
312
- :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
313
- :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
314
- :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
315
- :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
316
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
317
- :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
318
- :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
319
- :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
320
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
321
- :return: A `Response` object.
322
- """
323
- if not custom_config:
324
- custom_config = {}
325
- elif not isinstance(custom_config, dict):
326
- raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
327
-
328
- with DynamicSession(
329
- wait=wait,
330
- proxy=proxy,
331
- locale=locale,
332
- timeout=timeout,
333
- stealth=stealth,
334
- cdp_url=cdp_url,
335
- cookies=cookies,
336
- headless=headless,
337
- load_dom=load_dom,
338
- useragent=useragent,
339
- real_chrome=real_chrome,
340
- page_action=page_action,
341
- hide_canvas=hide_canvas,
342
- init_script=init_script,
343
- network_idle=network_idle,
344
- google_search=google_search,
345
- extra_headers=extra_headers,
346
- wait_selector=wait_selector,
347
- disable_webgl=disable_webgl,
348
- disable_resources=disable_resources,
349
- wait_selector_state=wait_selector_state,
350
- selector_config={**cls._generate_parser_arguments(), **custom_config},
351
- ) as session:
352
- return session.fetch(url)
353
-
354
- @classmethod
355
- async def async_fetch(
356
- cls,
357
- url: str,
358
- headless: bool = True,
359
- google_search: bool = True,
360
- hide_canvas: bool = False,
361
- disable_webgl: bool = False,
362
- real_chrome: bool = False,
363
- stealth: bool = False,
364
- wait: int | float = 0,
365
- page_action: Optional[Callable] = None,
366
- proxy: Optional[str | Dict[str, str]] = None,
367
- locale: str = "en-US",
368
- extra_headers: Optional[Dict[str, str]] = None,
369
- useragent: Optional[str] = None,
370
- cdp_url: Optional[str] = None,
371
- timeout: int | float = 30000,
372
- disable_resources: bool = False,
373
- wait_selector: Optional[str] = None,
374
- init_script: Optional[str] = None,
375
- cookies: Optional[Iterable[Dict]] = None,
376
- network_idle: bool = False,
377
- load_dom: bool = True,
378
- wait_selector_state: SelectorWaitStates = "attached",
379
- custom_config: Optional[Dict] = None,
380
- ) -> Response:
381
- """Opens up a browser and do your request based on your chosen options below.
382
-
383
- :param url: Target url.
384
- :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
385
- :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
386
- Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
387
- This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
388
- :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
389
- :param cookies: Set cookies for the next request.
390
- :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
391
- :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
392
- :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
393
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
394
- :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
395
- :param wait_selector: Wait for a specific CSS selector to be in a specific state.
396
- :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
397
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
398
- :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
399
- :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
400
- :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
401
- :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
402
- :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
403
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
404
- :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
405
- :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
406
- :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
407
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
408
- :return: A `Response` object.
409
- """
410
- if not custom_config:
411
- custom_config = {}
412
- elif not isinstance(custom_config, dict):
413
- raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
414
-
415
- async with AsyncDynamicSession(
416
- wait=wait,
417
- max_pages=1,
418
- proxy=proxy,
419
- locale=locale,
420
- timeout=timeout,
421
- stealth=stealth,
422
- cdp_url=cdp_url,
423
- cookies=cookies,
424
- headless=headless,
425
- load_dom=load_dom,
426
- useragent=useragent,
427
- real_chrome=real_chrome,
428
- page_action=page_action,
429
- hide_canvas=hide_canvas,
430
- init_script=init_script,
431
- network_idle=network_idle,
432
- google_search=google_search,
433
- extra_headers=extra_headers,
434
- wait_selector=wait_selector,
435
- disable_webgl=disable_webgl,
436
- disable_resources=disable_resources,
437
- wait_selector_state=wait_selector_state,
438
- selector_config={**cls._generate_parser_arguments(), **custom_config},
439
- ) as session:
440
- return await session.fetch(url)
441
-
442
-
443
- PlayWrightFetcher = DynamicFetcher # For backward-compatibility
 
4
  List,
5
  Optional,
6
  SelectorWaitStates,
 
 
 
 
 
 
 
 
 
 
 
 
7
  )
8
  from scrapling.engines.toolbelt.custom import BaseFetcher, Response
9
+ from scrapling.engines._browsers._camoufox import StealthySession, AsyncStealthySession
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  class StealthyFetcher(BaseFetcher):
 
214
  additional_args=additional_args or {},
215
  ) as engine:
216
  return await engine.fetch(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/fetchers/requests.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.core._types import (
2
+ Callable,
3
+ Dict,
4
+ List,
5
+ Optional,
6
+ SelectorWaitStates,
7
+ Iterable,
8
+ )
9
+ from scrapling.engines.static import (
10
+ FetcherSession,
11
+ FetcherClient as _FetcherClient,
12
+ AsyncFetcherClient as _AsyncFetcherClient,
13
+ )
14
+ from scrapling.engines.toolbelt.custom import BaseFetcher, Response
15
+
16
+
17
# Module-level singleton clients, created once at import time so every
# `Fetcher`/`AsyncFetcher` call reuses the same underlying client object
# instead of constructing a new one per request.
__FetcherClientInstance__ = _FetcherClient()
__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
19
+
20
+
21
class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""

    # Every HTTP verb delegates straight to the shared synchronous client,
    # so all calls go through one client instance.
    get, post, put, delete = (
        __FetcherClientInstance__.get,
        __FetcherClientInstance__.post,
        __FetcherClientInstance__.put,
        __FetcherClientInstance__.delete,
    )
28
+
29
+
30
class AsyncFetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""

    # Async counterpart of `Fetcher`: the verbs delegate to the shared
    # asynchronous client instance.
    get, post, put, delete = (
        __AsyncFetcherClientInstance__.get,
        __AsyncFetcherClientInstance__.post,
        __AsyncFetcherClientInstance__.put,
        __AsyncFetcherClientInstance__.delete,
    )