Karim shoair committed on
Commit
0cd97d9
·
1 Parent(s): 3ced0d2

feat(fetchers): Adding the foundation of the new browser-based fetchers logic

Browse files
scrapling/core/_types.py CHANGED
@@ -24,6 +24,7 @@ from typing import (
24
 
25
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
26
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
 
27
  StrOrBytes = Union[str, bytes]
28
 
29
  try:
 
24
 
25
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
26
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
27
+ PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
28
  StrOrBytes = Union[str, bytes]
29
 
30
  try:
scrapling/engines/_browsers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from ._controllers import DynamicSession, AsyncDynamicSession
scrapling/engines/_browsers/_config_tools.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+
3
+ from scrapling.core._types import Tuple
4
+ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, HARMFUL_DEFAULT_ARGS
5
+ from scrapling.engines.toolbelt import js_bypass_path, generate_headers
6
+
7
+ __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
8
+
9
+
10
@lru_cache(1)
def _compiled_stealth_scripts():
    """Read the stealth bypass scripts once and cache their contents.

    Returns a tuple of JS source strings (not paths), ready to be passed to
    ``page.add_init_script(script=...)``.
    """
    # Basic bypasses nothing fancy as I'm still working on it
    # But with adding these bypasses to the above config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    script_names = (
        # Order is important
        "webdriver_fully.js",
        "window_chrome.js",
        "navigator_plugins.js",
        "pdf_viewer.js",
        "notification_permission.js",
        "screen_props.js",
        "playwright_fingerprint.js",
    )
    scripts = []
    for script_name in script_names:
        # Explicit UTF-8: the default locale encoding (e.g. cp1252 on Windows)
        # could corrupt non-ASCII characters inside the JS payloads.
        with open(js_bypass_path(script_name), "r", encoding="utf-8") as f:
            scripts.append(f.read())
    return tuple(scripts)
40
+
41
+
42
@lru_cache(2, typed=True)
def _set_flags(hide_canvas, disable_webgl):
    """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
    # Copy into a fresh tuple so the `+=` below can never mutate the shared
    # DEFAULT_STEALTH_FLAGS constant in place (it would if it were a list).
    flags = tuple(DEFAULT_STEALTH_FLAGS)
    if hide_canvas:
        flags += ("--fingerprinting-canvas-image-data-noise",)
    if disable_webgl:
        flags += (
            "--disable-webgl",
            "--disable-webgl-image-chromium",
            "--disable-webgl2",
        )

    return flags
56
+
57
+
58
@lru_cache(2, typed=True)
def _launch_kwargs(headless, real_chrome, stealth, hide_canvas, disable_webgl) -> Tuple:
    """Build the keyword arguments used while launching playwright's browser.

    The result is returned as a hashable tuple of items (so it can live in the
    lru_cache); callers convert it back to a dict before passing to Playwright.
    """
    options = {
        "headless": headless,
        "ignore_default_args": HARMFUL_DEFAULT_ARGS,
        "channel": "chrome" if real_chrome else "chromium",
    }
    if stealth:
        # Stealth launches get the anti-fingerprinting flags and the sandbox
        options["args"] = _set_flags(hide_canvas, disable_webgl)
        options["chromium_sandbox"] = True

    return tuple(options.items())
72
+
73
+
74
@lru_cache(2, typed=True)
def _context_kwargs(proxy, locale, extra_headers, useragent, stealth) -> Tuple:
    """Build the keyword arguments for the browser context.

    Returned as a hashable tuple of items for lru_cache compatibility; callers
    convert it back to a dict before use.
    """
    options = {
        "proxy": proxy or tuple(),
        "locale": locale,
        "color_scheme": "dark",  # Bypasses the 'prefersLightColor' check in creepjs
        "device_scale_factor": 2,
        "extra_http_headers": extra_headers or tuple(),
        "user_agent": useragent or __default_useragent__,
    }
    if stealth:
        options["is_mobile"] = False
        options["has_touch"] = False
        # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
        options["service_workers"] = "allow"
        options["ignore_https_errors"] = True
        options["screen"] = {"width": 1920, "height": 1080}
        options["viewport"] = {"width": 1920, "height": 1080}
        options["permissions"] = ["geolocation", "notifications"]

    return tuple(options.items())
scrapling/engines/_browsers/_controllers.py ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import asyncio
3
+
4
+ # from camoufox import AsyncNewBrowser, NewBrowser
5
+ from playwright.sync_api import (
6
+ sync_playwright,
7
+ BrowserType,
8
+ Browser,
9
+ BrowserContext,
10
+ Playwright,
11
+ Locator,
12
+ )
13
+ from playwright.async_api import (
14
+ async_playwright,
15
+ BrowserType as AsyncBrowserType,
16
+ Browser as AsyncBrowser,
17
+ BrowserContext as AsyncBrowserContext,
18
+ Playwright as AsyncPlaywright,
19
+ Locator as AsyncLocator,
20
+ )
21
+ from playwright.sync_api import Response as SyncPlaywrightResponse
22
+ from playwright.async_api import Response as AsyncPlaywrightResponse
23
+ from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
24
+ from rebrowser_playwright.async_api import (
25
+ async_playwright as async_rebrowser_playwright,
26
+ )
27
+
28
+ from scrapling.core.utils import log
29
+ from ._page import PageInfo, PagePool
30
+ from ._validators import validate, PlaywrightConfig
31
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
32
+ from scrapling.core._types import (
33
+ Dict,
34
+ Optional,
35
+ Union,
36
+ Iterable,
37
+ Callable,
38
+ SelectorWaitStates,
39
+ )
40
+ from scrapling.engines.toolbelt import (
41
+ Response,
42
+ ResponseFactory,
43
+ generate_convincing_referer,
44
+ intercept_route,
45
+ async_intercept_route,
46
+ )
47
+
48
+
49
class DynamicSession:
    """A Browser session manager with page pooling."""

    __slots__ = (
        "max_pages",
        "headless",
        "hide_canvas",
        "disable_webgl",
        "real_chrome",
        "stealth",
        "google_search",
        "proxy",
        "locale",
        "extra_headers",
        "useragent",
        "timeout",
        "cookies",
        "disable_resources",
        "network_idle",
        "wait_selector",
        "wait_selector_state",
        "wait",
        "playwright",
        "browser",
        "context",
        "page_pool",
        "_closed",
        "adaptor_arguments",
        "page_action",
        "launch_options",
        "context_options",
        "cdp_url",
    )

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        adaptor_arguments: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of pages to be opened at the same time. It will be used in rotation through a PagePool.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """

        params = {
            "max_pages": max_pages,
            "headless": headless,
            "google_search": google_search,
            "hide_canvas": hide_canvas,
            "disable_webgl": disable_webgl,
            "real_chrome": real_chrome,
            "stealth": stealth,
            "wait": wait,
            "page_action": page_action,
            "proxy": proxy,
            "locale": locale,
            "extra_headers": extra_headers,
            "useragent": useragent,
            "timeout": timeout,
            "adaptor_arguments": adaptor_arguments,
            "disable_resources": disable_resources,
            "wait_selector": wait_selector,
            "cookies": cookies,
            "network_idle": network_idle,
            "wait_selector_state": wait_selector_state,
            "cdp_url": cdp_url,
        }
        # Centralized validation; raises on out-of-range/invalid arguments
        config = validate(params, PlaywrightConfig)

        self.max_pages = config.max_pages
        self.headless = config.headless
        self.hide_canvas = config.hide_canvas
        self.disable_webgl = config.disable_webgl
        self.real_chrome = config.real_chrome
        self.stealth = config.stealth
        self.google_search = config.google_search
        self.wait = config.wait
        self.proxy = config.proxy
        self.locale = config.locale
        self.extra_headers = config.extra_headers
        self.useragent = config.useragent
        self.timeout = config.timeout
        self.cookies = list(config.cookies) if config.cookies else []
        self.disable_resources = config.disable_resources
        self.cdp_url = config.cdp_url
        self.network_idle = config.network_idle
        self.wait_selector = config.wait_selector
        self.wait_selector_state = config.wait_selector_state

        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Union[BrowserType, Browser]] = None
        self.context: Optional[BrowserContext] = None
        self.page_pool = PagePool(self.max_pages)
        self._closed = False
        self.adaptor_arguments = config.adaptor_arguments or {}
        self.page_action = config.page_action
        self.__initiate_browser_options__()

    def __initiate_browser_options__(self):
        """Build the launch/context option dicts from the cached kwarg tuples."""
        # The kwargs helpers return hashable tuples of items (for lru_cache);
        # convert them back into the dicts Playwright expects.
        self.launch_options = dict(
            _launch_kwargs(
                self.headless,
                self.real_chrome,
                self.stealth,
                self.hide_canvas,
                self.disable_webgl,
            )
        )
        self.context_options = dict(
            _context_kwargs(
                self.proxy,
                self.locale,
                tuple(self.extra_headers.items()) if self.extra_headers else tuple(),
                self.useragent,
                self.stealth,
            )
        )
        self.context_options["extra_http_headers"] = dict(
            self.context_options["extra_http_headers"]
        )
        # Playwright expects `proxy=None`, not an empty mapping, when unset
        self.context_options["proxy"] = dict(self.context_options["proxy"]) or None

    def __create__(self):
        """Create a browser for this instance and context."""
        sync_context = sync_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            sync_context = sync_playwright

        self.playwright = sync_context().start()

        browser_launcher = getattr(
            self.playwright, "chrome" if self.real_chrome else "chromium"
        )
        if self.cdp_url:
            self.browser = browser_launcher.connect_over_cdp(endpoint_url=self.cdp_url)
        else:
            self.browser = browser_launcher.launch(**self.launch_options)

        self.context = self.browser.new_context(**self.context_options)
        if self.cookies:
            self.context.add_cookies(self.cookies)

    def __enter__(self):
        self.__create__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close all resources"""
        if self._closed:
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.browser:
            self.browser.close()
            self.browser = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None

        self._closed = True

    def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one"""
        # Try to get a ready page first
        page_info = self.page_pool.get_ready_page()
        if page_info:
            return page_info

        # Create new page if under limit
        if self.page_pool.pages_count < self.max_pages:
            page = self.context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                for script in _compiled_stealth_scripts():
                    # `_compiled_stealth_scripts` yields the scripts' source
                    # code, not file paths, so pass it through `script=`
                    page.add_init_script(script=script)

            return self.page_pool.add_page(page)

        # Wait for a page to become available
        max_wait = 30
        start_time = time.time()

        while time.time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            time.sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        """
        if self._closed:
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response: SyncPlaywrightResponse):
            # Track the last navigation response for the main document so
            # redirects end up reported correctly
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = self._get_or_create_page()
        page_info.mark_busy(url=url)

        # Register the listener for this request only; it's removed in the
        # `finally` below so reused pool pages don't accumulate handlers
        page_info.page.on("response", handle_response)
        try:
            # Navigate to URL and wait for a specified state
            first_response = page_info.page.goto(url, referer=referer)
            page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: Locator = page_info.page.locator(self.wait_selector)
                    waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    page_info.page.wait_for_load_state(state="load")
                    page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = ResponseFactory.from_playwright_response(
                page_info.page, first_response, final_response, self.adaptor_arguments
            )

            # Mark page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:
            page_info.mark_error()
            raise
        finally:
            page_info.page.remove_listener("response", handle_response)

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "ready_pages": self.page_pool.ready_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }
372
+
373
+
374
class AsyncDynamicSession(DynamicSession):
    """A Browser session manager with page pooling"""

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        adaptor_arguments: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of pages to be opened at the same time. It will be used in rotation through a PagePool.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """

        super().__init__(
            max_pages,
            headless,
            google_search,
            hide_canvas,
            disable_webgl,
            real_chrome,
            stealth,
            wait,
            page_action,
            proxy,
            locale,
            extra_headers,
            useragent,
            cdp_url,
            timeout,
            disable_resources,
            wait_selector,
            cookies,
            network_idle,
            wait_selector_state,
            adaptor_arguments,
        )

        self.playwright: Optional[AsyncPlaywright] = None
        self.browser: Optional[Union[AsyncBrowserType, AsyncBrowser]] = None
        self.context: Optional[AsyncBrowserContext] = None
        # Serializes page creation/lookup across concurrent fetches
        self._lock = asyncio.Lock()

    def __enter__(self):
        # Dunder methods are looked up on the type, so assigning
        # `self.__enter__ = None` (as instance attributes) cannot disable the
        # inherited sync protocol; override it to fail loudly instead.
        raise NotImplementedError(
            "AsyncDynamicSession only supports `async with`, not `with`"
        )

    def __exit__(self, exc_type, exc_val, exc_tb):
        raise NotImplementedError(
            "AsyncDynamicSession only supports `async with`, not `with`"
        )

    async def __create__(self):
        """Create a browser for this instance and context."""
        async_context = async_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            async_context = async_playwright

        self.playwright: AsyncPlaywright = await async_context().start()

        browser_launcher: AsyncBrowserType = getattr(
            self.playwright, "chrome" if self.real_chrome else "chromium"
        )
        if self.cdp_url:
            self.browser = await browser_launcher.connect_over_cdp(
                endpoint_url=self.cdp_url
            )
        else:
            self.browser = await browser_launcher.launch(**self.launch_options)

        self.context: AsyncBrowserContext = await self.browser.new_context(
            **self.context_options
        )

        if self.cookies:
            await self.context.add_cookies(self.cookies)

    async def __aenter__(self):
        await self.__create__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def close(self):
        """Close all resources"""
        if self._closed:
            return

        if self.context:
            await self.context.close()
            self.context = None

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

        self._closed = True

    async def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one"""
        async with self._lock:
            # Try to get a ready page first
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info

            # Create new page if under limit
            if self.page_pool.pages_count < self.max_pages:
                page = await self.context.new_page()
                page.set_default_navigation_timeout(self.timeout)
                page.set_default_timeout(self.timeout)
                if self.extra_headers:
                    await page.set_extra_http_headers(self.extra_headers)

                if self.disable_resources:
                    await page.route("**/*", async_intercept_route)

                if self.stealth:
                    for script in _compiled_stealth_scripts():
                        # `_compiled_stealth_scripts` yields the scripts'
                        # source code, not file paths, so pass `script=`
                        await page.add_init_script(script=script)

                return self.page_pool.add_page(page)

        # Wait for a page to become available
        max_wait = 30  # seconds
        start_time = time.time()

        while time.time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            await asyncio.sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    async def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        """
        if self._closed:
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = generate_convincing_referer(url) if self.google_search else None

        async def handle_response(finished_response: AsyncPlaywrightResponse):
            # Track the last navigation response for the main document so
            # redirects end up reported correctly
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = await self._get_or_create_page()
        page_info.mark_busy(url=url)

        # Register the listener for this request only; it's removed in the
        # `finally` below so reused pool pages don't accumulate handlers
        page_info.page.on("response", handle_response)
        try:
            # Navigate to URL and wait for a specified state
            first_response = await page_info.page.goto(url, referer=referer)
            await page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                await page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = await self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
                    await waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    await page_info.page.wait_for_load_state(state="load")
                    await page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        await page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            await page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = await ResponseFactory.from_async_playwright_response(
                page_info.page, first_response, final_response, self.adaptor_arguments
            )

            # Mark page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:
            page_info.mark_error()
            raise
        finally:
            page_info.page.remove_listener("response", handle_response)
scrapling/engines/_browsers/_page.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import RLock
2
+ from dataclasses import dataclass
3
+
4
+ from playwright.sync_api import Page as SyncPage
5
+ from playwright.async_api import Page as AsyncPage
6
+
7
+ from scrapling.core._types import Optional, Union, List, Literal
8
+
9
+ PageState = Literal["ready", "busy", "error"] # States that a page can be in
10
+
11
+
12
@dataclass
class PageInfo:
    """Information about the page and its current state"""

    __slots__ = ("page", "state", "url")
    page: Union[SyncPage, AsyncPage]  # the wrapped Playwright page/tab
    state: PageState  # one of "ready" / "busy" / "error"
    url: Optional[str]  # URL currently being served when busy, "" otherwise

    def mark_busy(self, url: str = ""):
        """Mark the page as busy"""
        self.state = "busy"
        self.url = url

    def mark_ready(self):
        """Mark the page as ready for new requests"""
        self.state = "ready"
        self.url = ""

    def mark_error(self):
        """Mark the page as having an error"""
        self.state = "error"

    def __repr__(self):
        # `!r` already wraps the value in quotes; wrapping it in literal
        # quotes as well produced doubled quoting like URL="'...'"
        return f"Page(url={self.url!r}, state={self.state!r})"

    def __eq__(self, other_page):
        """Comparing this page to another page object."""
        if other_page.__class__ is not self.__class__:
            return NotImplemented
        return self.page == other_page.page
43
+
44
+
45
class PagePool:
    """Manages a pool of browser pages/tabs with state tracking"""

    __slots__ = ("max_pages", "pages", "_lock")

    def __init__(self, max_pages: int = 5):
        self.max_pages = max_pages  # hard cap on concurrently tracked pages
        self.pages: List[PageInfo] = []
        self._lock = RLock()  # guards all reads/writes of `pages`

    def add_page(self, page: Union[SyncPage, AsyncPage]) -> PageInfo:
        """Register a new page with the pool and return its tracking record."""
        with self._lock:
            if len(self.pages) >= self.max_pages:
                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")

            info = PageInfo(page, "ready", "")
            self.pages.append(info)
            return info

    def get_ready_page(self) -> Optional[PageInfo]:
        """Return the first page in the 'ready' state, or None if all are taken."""
        with self._lock:
            return next(
                (info for info in self.pages if info.state == "ready"), None
            )

    @property
    def pages_count(self) -> int:
        """Total number of pages currently tracked by the pool."""
        return len(self.pages)

    @property
    def ready_count(self) -> int:
        """Number of pages currently in the 'ready' state."""
        with self._lock:
            return sum(info.state == "ready" for info in self.pages)

    @property
    def busy_count(self) -> int:
        """Number of pages currently in the 'busy' state."""
        with self._lock:
            return sum(info.state == "busy" for info in self.pages)

    def cleanup_error_pages(self):
        """Drop every page that has been marked as errored."""
        with self._lock:
            self.pages = [info for info in self.pages if info.state != "error"]
scrapling/engines/_browsers/_validators.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import msgspec
2
+ from urllib.parse import urlparse
3
+
4
+ from scrapling.core._types import (
5
+ Optional,
6
+ Union,
7
+ Dict,
8
+ Callable,
9
+ Iterable,
10
+ SelectorWaitStates,
11
+ )
12
+ from scrapling.engines.toolbelt import construct_proxy_dict
13
+
14
+
15
class PlaywrightConfig(msgspec.Struct, kw_only=True, frozen=False):
    """Configuration struct for validating DynamicFetcher/Playwright arguments."""

    max_pages: int = 1
    cdp_url: Optional[str] = None
    headless: bool = True
    google_search: bool = True
    hide_canvas: bool = False
    disable_webgl: bool = False
    real_chrome: bool = False
    stealth: bool = False
    wait: Union[int, float] = 0
    page_action: Optional[Callable] = None
    proxy: Optional[Union[str, Dict[str, str]]] = (
        None  # The default value for proxy in Playwright's source is `None`
    )
    locale: str = "en-US"
    extra_headers: Optional[Dict[str, str]] = None
    useragent: Optional[str] = None
    timeout: Union[int, float] = 30000
    disable_resources: bool = False
    wait_selector: Optional[str] = None
    cookies: Optional[Iterable[Dict]] = None
    network_idle: bool = False
    wait_selector_state: SelectorWaitStates = "attached"
    adaptor_arguments: Optional[Dict] = None

    def __post_init__(self):
        """Custom validation after msgspec validation.

        :raises ValueError: On out-of-range or malformed values.
        :raises TypeError: If `page_action` is not callable.
        """
        if self.max_pages < 1 or self.max_pages > 50:
            raise ValueError("max_pages must be between 1 and 50")
        # Safety net for direct construction; `msgspec.convert` already enforces the Literal
        if self.wait_selector_state not in (
            "attached",
            "detached",
            "hidden",
            "visible",
        ):
            raise ValueError(f"Invalid wait_selector_state: {self.wait_selector_state}")
        if self.timeout < 0:
            raise ValueError("timeout must be >= 0")
        if self.page_action is not None and not callable(self.page_action):
            raise TypeError(
                f"page_action must be callable, got {type(self.page_action).__name__}"
            )
        if self.proxy:
            # Normalize whatever shape the user passed into Playwright's proxy format
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
        if self.cdp_url:
            self.__validate_cdp(self.cdp_url)

    @staticmethod
    def __validate_cdp(cdp_url):
        """Validate that `cdp_url` looks like a usable CDP websocket endpoint.

        :raises ValueError: If the URL has a wrong scheme, no hostname, or is malformed.
        """
        try:
            # Check the scheme
            if not cdp_url.startswith(("ws://", "wss://")):
                raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

            # Validate hostname and port
            if not urlparse(cdp_url).netloc:
                raise ValueError("Invalid hostname for the CDP URL")

        except ValueError:
            # Fix: the blanket `except Exception` below used to catch the two
            # descriptive ValueErrors raised above and re-wrap them into a second,
            # redundant message. Let them propagate as-is.
            raise

        except AttributeError as e:
            # e.g. a non-string was passed, so `.startswith` does not exist
            raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")

        except Exception as e:
            raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
80
+
81
+
82
def validate(params, model):
    """Convert and validate `params` into an instance of `model`.

    :param params: Mapping of raw keyword arguments to validate.
    :param model: The `msgspec.Struct` config class to validate against
        (e.g. `PlaywrightConfig`); its `__post_init__` runs as part of conversion.
    :return: A validated instance of `model`.
    :raises TypeError: If any argument has an invalid type.
    """
    try:
        config = msgspec.convert(params, model)
    except msgspec.ValidationError as e:
        # Chain the original error so the offending field/path stays visible
        raise TypeError(f"Invalid argument type: {e}") from e

    return config
scrapling/engines/constants.py CHANGED
@@ -12,6 +12,15 @@ DEFAULT_DISABLED_RESOURCES = {
12
  "stylesheet",
13
  }
14
 
 
 
 
 
 
 
 
 
 
15
  DEFAULT_STEALTH_FLAGS = (
16
  # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
17
  # Generally this will make the browser faster and less detectable
 
12
  "stylesheet",
13
  }
14
 
15
HARMFUL_DEFAULT_ARGS = (
    # Chromium launch flags that are stripped from the browser defaults: they make
    # automation easier to detect, and `--disable-popup-blocking` enables abuse of
    # the popup crashing bug: https://issues.chromium.org/issues/340836884
    "--enable-automation",
    "--disable-popup-blocking",
    # Candidate flags kept for reference; deliberately not stripped for now:
    # '--disable-component-update',
    # '--disable-default-apps',
    # '--disable-extensions',
)
23
+
24
  DEFAULT_STEALTH_FLAGS = (
25
  # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
26
  # Generally this will make the browser faster and less detectable