Karim shoair commited on
Commit ·
42a1f3d
1
Parent(s): 4f7700a
feat(fetchers): Improve StealthyFetcher + Adding StealthySession/AsyncStealthySession classes
Browse files
scrapling/engines/__init__.py
CHANGED
|
@@ -1,7 +1,16 @@
|
|
| 1 |
-
from .camo import CamoufoxEngine
|
| 2 |
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
|
| 3 |
from .static import FetcherSession, FetcherClient, AsyncFetcherClient
|
| 4 |
-
from .
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
|
| 2 |
from .static import FetcherSession, FetcherClient, AsyncFetcherClient
|
| 3 |
+
from ._browsers import (
|
| 4 |
+
DynamicSession,
|
| 5 |
+
AsyncDynamicSession,
|
| 6 |
+
StealthySession,
|
| 7 |
+
AsyncStealthySession,
|
| 8 |
+
)
|
| 9 |
|
| 10 |
+
__all__ = [
|
| 11 |
+
"FetcherSession",
|
| 12 |
+
"DynamicSession",
|
| 13 |
+
"AsyncDynamicSession",
|
| 14 |
+
"StealthySession",
|
| 15 |
+
"AsyncStealthySession",
|
| 16 |
+
]
|
scrapling/engines/_browsers/__init__.py
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
from ._controllers import DynamicSession, AsyncDynamicSession
|
|
|
|
|
|
| 1 |
from ._controllers import DynamicSession, AsyncDynamicSession
|
| 2 |
+
from ._camoufox import StealthySession, AsyncStealthySession
|
scrapling/engines/{camo.py → _browsers/_camoufox.py}
RENAMED
|
@@ -1,37 +1,93 @@
|
|
| 1 |
-
import
|
| 2 |
-
|
| 3 |
-
from
|
| 4 |
-
|
| 5 |
-
from camoufox
|
| 6 |
-
from
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
from scrapling.core._types import (
|
| 10 |
-
Callable,
|
| 11 |
Dict,
|
| 12 |
-
List,
|
| 13 |
-
Literal,
|
| 14 |
Optional,
|
| 15 |
-
SelectorWaitStates,
|
| 16 |
Union,
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
| 18 |
)
|
| 19 |
-
from scrapling.core.utils import log
|
| 20 |
from scrapling.engines.toolbelt import (
|
| 21 |
Response,
|
| 22 |
ResponseFactory,
|
| 23 |
async_intercept_route,
|
| 24 |
-
check_type_validity,
|
| 25 |
-
construct_proxy_dict,
|
| 26 |
generate_convincing_referer,
|
| 27 |
get_os_name,
|
| 28 |
intercept_route,
|
| 29 |
)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
class CamoufoxEngine:
|
| 33 |
def __init__(
|
| 34 |
self,
|
|
|
|
| 35 |
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 36 |
block_images: bool = False,
|
| 37 |
disable_resources: bool = False,
|
|
@@ -39,29 +95,29 @@ class CamoufoxEngine:
|
|
| 39 |
allow_webgl: bool = True,
|
| 40 |
network_idle: bool = False,
|
| 41 |
humanize: Union[bool, float] = True,
|
| 42 |
-
solve_cloudflare:
|
| 43 |
-
wait:
|
| 44 |
-
timeout:
|
| 45 |
-
page_action: Callable = None,
|
| 46 |
wait_selector: Optional[str] = None,
|
| 47 |
addons: Optional[List[str]] = None,
|
| 48 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 49 |
-
cookies: Optional[
|
| 50 |
google_search: bool = True,
|
| 51 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 52 |
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 53 |
os_randomize: bool = False,
|
| 54 |
disable_ads: bool = False,
|
| 55 |
geoip: bool = False,
|
| 56 |
-
adaptor_arguments: Dict = None,
|
| 57 |
-
additional_arguments: Dict = None,
|
| 58 |
):
|
| 59 |
-
"""
|
| 60 |
|
| 61 |
:param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
|
| 62 |
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 63 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 64 |
-
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
|
| 65 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 66 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 67 |
:param block_webrtc: Blocks WebRTC entirely.
|
|
@@ -76,65 +132,90 @@ class CamoufoxEngine:
|
|
| 76 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 77 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 78 |
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
| 79 |
-
:param wait_selector: Wait for a specific
|
| 80 |
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
| 81 |
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
| 82 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 83 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
|
| 84 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 85 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
|
|
| 86 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 87 |
:param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 88 |
"""
|
| 89 |
-
self.headless = headless
|
| 90 |
-
self.block_images = bool(block_images)
|
| 91 |
-
self.disable_resources = bool(disable_resources)
|
| 92 |
-
self.block_webrtc = bool(block_webrtc)
|
| 93 |
-
self.allow_webgl = bool(allow_webgl)
|
| 94 |
-
self.network_idle = bool(network_idle)
|
| 95 |
-
self.google_search = bool(google_search)
|
| 96 |
-
self.os_randomize = bool(os_randomize)
|
| 97 |
-
self.disable_ads = bool(disable_ads)
|
| 98 |
-
self.geoip = bool(geoip)
|
| 99 |
-
self.extra_headers = extra_headers or {}
|
| 100 |
-
self.additional_arguments = additional_arguments or {}
|
| 101 |
-
self.proxy = construct_proxy_dict(proxy)
|
| 102 |
-
self.addons = addons or []
|
| 103 |
-
self.cookies = cookies or []
|
| 104 |
-
self.humanize = humanize
|
| 105 |
-
self.solve_cloudflare = solve_cloudflare
|
| 106 |
-
self.timeout = check_type_validity(timeout, [int, float], 30_000)
|
| 107 |
-
self.wait = check_type_validity(wait, [int, float], 0)
|
| 108 |
-
|
| 109 |
-
if self.solve_cloudflare and self.timeout < 60_000:
|
| 110 |
-
self.timeout = 60_000
|
| 111 |
-
|
| 112 |
-
# Page action callable validation
|
| 113 |
-
self.page_action = None
|
| 114 |
-
if page_action is not None:
|
| 115 |
-
if callable(page_action):
|
| 116 |
-
self.page_action = page_action
|
| 117 |
-
else:
|
| 118 |
-
log.error('[Ignored] Argument "page_action" must be callable')
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
humanize
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
"geoip": self.geoip,
|
| 132 |
"proxy": self.proxy,
|
| 133 |
"enable_cache": True,
|
| 134 |
"addons": self.addons,
|
| 135 |
"exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
|
| 136 |
"headless": self.headless,
|
| 137 |
-
"humanize": humanize,
|
| 138 |
"i_know_what_im_doing": True, # To turn warnings off with the user configurations
|
| 139 |
"allow_webgl": self.allow_webgl,
|
| 140 |
"block_webrtc": self.block_webrtc,
|
|
@@ -142,9 +223,76 @@ class CamoufoxEngine:
|
|
| 142 |
"os": None if self.os_randomize else get_os_name(),
|
| 143 |
**self.additional_arguments,
|
| 144 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
@staticmethod
|
| 147 |
-
def
|
| 148 |
"""
|
| 149 |
Detect the type of Cloudflare challenge present in the provided page content.
|
| 150 |
|
|
@@ -179,8 +327,7 @@ class CamoufoxEngine:
|
|
| 179 |
:param page: The targeted page
|
| 180 |
:return:
|
| 181 |
"""
|
| 182 |
-
|
| 183 |
-
challenge_type = self.__detect_cloudflare(page_content)
|
| 184 |
if not challenge_type:
|
| 185 |
log.error("No Cloudflare challenge found.")
|
| 186 |
return
|
|
@@ -199,11 +346,7 @@ class CamoufoxEngine:
|
|
| 199 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 200 |
page.wait_for_timeout(500)
|
| 201 |
|
| 202 |
-
iframe = page.frame(
|
| 203 |
-
url=re.compile(
|
| 204 |
-
"challenges.cloudflare.com/cdn-cgi/challenge-platform/.*"
|
| 205 |
-
)
|
| 206 |
-
)
|
| 207 |
if iframe is None:
|
| 208 |
log.info("Didn't find Cloudflare iframe!")
|
| 209 |
return
|
|
@@ -224,14 +367,261 @@ class CamoufoxEngine:
|
|
| 224 |
log.info("Cloudflare captcha is solved")
|
| 225 |
return
|
| 226 |
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
| 229 |
|
| 230 |
:param page: The async targeted page
|
| 231 |
:return:
|
| 232 |
"""
|
| 233 |
-
|
| 234 |
-
challenge_type = self.__detect_cloudflare(page_content)
|
| 235 |
if not challenge_type:
|
| 236 |
log.error("No Cloudflare challenge found.")
|
| 237 |
return
|
|
@@ -250,11 +640,7 @@ class CamoufoxEngine:
|
|
| 250 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 251 |
await page.wait_for_timeout(500)
|
| 252 |
|
| 253 |
-
iframe = page.frame(
|
| 254 |
-
url=re.compile(
|
| 255 |
-
"challenges.cloudflare.com/cdn-cgi/challenge-platform/.*"
|
| 256 |
-
)
|
| 257 |
-
)
|
| 258 |
if iframe is None:
|
| 259 |
log.info("Didn't find Cloudflare iframe!")
|
| 260 |
return
|
|
@@ -277,90 +663,19 @@ class CamoufoxEngine:
|
|
| 277 |
log.info("Cloudflare captcha is solved")
|
| 278 |
return
|
| 279 |
|
| 280 |
-
def fetch(self, url: str) -> Response:
|
| 281 |
"""Opens up the browser and do your request based on your chosen options.
|
| 282 |
|
| 283 |
-
:param url: Target url.
|
| 284 |
-
:return: A `Response` object
|
| 285 |
"""
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
def handle_response(finished_response):
|
| 290 |
-
nonlocal final_response
|
| 291 |
-
if (
|
| 292 |
-
finished_response.request.resource_type == "document"
|
| 293 |
-
and finished_response.request.is_navigation_request()
|
| 294 |
-
):
|
| 295 |
-
final_response = finished_response
|
| 296 |
-
|
| 297 |
-
with Camoufox(**self._get_camoufox_options()) as browser:
|
| 298 |
-
context = browser.new_context()
|
| 299 |
-
if self.cookies:
|
| 300 |
-
context.add_cookies(self.cookies)
|
| 301 |
-
|
| 302 |
-
page = context.new_page()
|
| 303 |
-
page.set_default_navigation_timeout(self.timeout)
|
| 304 |
-
page.set_default_timeout(self.timeout)
|
| 305 |
-
page.on("response", handle_response)
|
| 306 |
-
|
| 307 |
-
if self.disable_resources:
|
| 308 |
-
page.route("**/*", intercept_route)
|
| 309 |
-
|
| 310 |
-
if self.extra_headers:
|
| 311 |
-
page.set_extra_http_headers(self.extra_headers)
|
| 312 |
-
|
| 313 |
-
first_response = page.goto(url, referer=referer)
|
| 314 |
-
page.wait_for_load_state(state="domcontentloaded")
|
| 315 |
-
|
| 316 |
-
if self.network_idle:
|
| 317 |
-
page.wait_for_load_state("networkidle")
|
| 318 |
-
|
| 319 |
-
if self.solve_cloudflare:
|
| 320 |
-
self._solve_cloudflare(page)
|
| 321 |
-
# Make sure the page is fully loaded after the captcha
|
| 322 |
-
page.wait_for_load_state(state="load")
|
| 323 |
-
page.wait_for_load_state(state="domcontentloaded")
|
| 324 |
-
if self.network_idle:
|
| 325 |
-
page.wait_for_load_state("networkidle")
|
| 326 |
-
|
| 327 |
-
if self.page_action is not None:
|
| 328 |
-
try:
|
| 329 |
-
page = self.page_action(page)
|
| 330 |
-
except Exception as e:
|
| 331 |
-
log.error(f"Error executing page_action: {e}")
|
| 332 |
-
|
| 333 |
-
if self.wait_selector and type(self.wait_selector) is str:
|
| 334 |
-
try:
|
| 335 |
-
waiter = page.locator(self.wait_selector)
|
| 336 |
-
waiter.first.wait_for(state=self.wait_selector_state)
|
| 337 |
-
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
| 338 |
-
page.wait_for_load_state(state="load")
|
| 339 |
-
page.wait_for_load_state(state="domcontentloaded")
|
| 340 |
-
if self.network_idle:
|
| 341 |
-
page.wait_for_load_state("networkidle")
|
| 342 |
-
except Exception as e:
|
| 343 |
-
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 344 |
|
| 345 |
-
page.wait_for_timeout(self.wait)
|
| 346 |
-
response = ResponseFactory.from_playwright_response(
|
| 347 |
-
page, first_response, final_response, self.adaptor_arguments
|
| 348 |
-
)
|
| 349 |
-
page.close()
|
| 350 |
-
context.close()
|
| 351 |
-
|
| 352 |
-
return response
|
| 353 |
-
|
| 354 |
-
async def async_fetch(self, url: str) -> Response:
|
| 355 |
-
"""Opens up the browser and do your request based on your chosen options.
|
| 356 |
-
|
| 357 |
-
:param url: Target url.
|
| 358 |
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 359 |
-
"""
|
| 360 |
final_response = None
|
| 361 |
referer = generate_convincing_referer(url) if self.google_search else None
|
| 362 |
|
| 363 |
-
async def handle_response(finished_response):
|
| 364 |
nonlocal final_response
|
| 365 |
if (
|
| 366 |
finished_response.request.resource_type == "document"
|
|
@@ -368,59 +683,59 @@ class CamoufoxEngine:
|
|
| 368 |
):
|
| 369 |
final_response = finished_response
|
| 370 |
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
if self.cookies:
|
| 374 |
-
await context.add_cookies(self.cookies)
|
| 375 |
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
page.
|
| 379 |
-
page.
|
| 380 |
-
|
| 381 |
-
if self.disable_resources:
|
| 382 |
-
await page.route("**/*", async_intercept_route)
|
| 383 |
-
|
| 384 |
-
if self.extra_headers:
|
| 385 |
-
await page.set_extra_http_headers(self.extra_headers)
|
| 386 |
-
|
| 387 |
-
first_response = await page.goto(url, referer=referer)
|
| 388 |
-
await page.wait_for_load_state(state="domcontentloaded")
|
| 389 |
|
| 390 |
if self.network_idle:
|
| 391 |
-
await page.wait_for_load_state("networkidle")
|
|
|
|
|
|
|
|
|
|
| 392 |
|
| 393 |
if self.solve_cloudflare:
|
| 394 |
-
await self.
|
| 395 |
# Make sure the page is fully loaded after the captcha
|
| 396 |
-
await page.wait_for_load_state(state="load")
|
| 397 |
-
await page.wait_for_load_state(state="domcontentloaded")
|
| 398 |
if self.network_idle:
|
| 399 |
-
await page.wait_for_load_state("networkidle")
|
| 400 |
|
| 401 |
if self.page_action is not None:
|
| 402 |
try:
|
| 403 |
-
page = await self.page_action(page)
|
| 404 |
except Exception as e:
|
| 405 |
-
log.error(f"Error executing
|
| 406 |
|
| 407 |
-
if self.wait_selector
|
| 408 |
try:
|
| 409 |
-
waiter = page.locator(self.wait_selector)
|
| 410 |
await waiter.first.wait_for(state=self.wait_selector_state)
|
| 411 |
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
| 412 |
-
await page.wait_for_load_state(state="load")
|
| 413 |
-
await page.wait_for_load_state(state="domcontentloaded")
|
| 414 |
if self.network_idle:
|
| 415 |
-
await page.wait_for_load_state("networkidle")
|
| 416 |
except Exception as e:
|
| 417 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 418 |
|
| 419 |
-
await page.wait_for_timeout(self.wait)
|
|
|
|
|
|
|
| 420 |
response = await ResponseFactory.from_async_playwright_response(
|
| 421 |
-
page, first_response, final_response, self.adaptor_arguments
|
| 422 |
)
|
| 423 |
-
await page.close()
|
| 424 |
-
await context.close()
|
| 425 |
|
| 426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from time import time, sleep
|
| 2 |
+
from re import compile as re_compile
|
| 3 |
+
from asyncio import sleep as asyncio_sleep, Lock
|
| 4 |
+
|
| 5 |
+
from camoufox import AsyncNewBrowser, NewBrowser, DefaultAddons
|
| 6 |
+
from playwright.sync_api import (
|
| 7 |
+
Response as SyncPlaywrightResponse,
|
| 8 |
+
sync_playwright,
|
| 9 |
+
BrowserType,
|
| 10 |
+
Browser,
|
| 11 |
+
BrowserContext,
|
| 12 |
+
Playwright,
|
| 13 |
+
Locator,
|
| 14 |
+
Page,
|
| 15 |
+
)
|
| 16 |
+
from playwright.async_api import (
|
| 17 |
+
async_playwright,
|
| 18 |
+
Response as AsyncPlaywrightResponse,
|
| 19 |
+
BrowserType as AsyncBrowserType,
|
| 20 |
+
Browser as AsyncBrowser,
|
| 21 |
+
BrowserContext as AsyncBrowserContext,
|
| 22 |
+
Playwright as AsyncPlaywright,
|
| 23 |
+
Locator as AsyncLocator,
|
| 24 |
+
Page as async_Page,
|
| 25 |
+
)
|
| 26 |
|
| 27 |
+
from scrapling.core.utils import log
|
| 28 |
+
from ._page import PageInfo, PagePool
|
| 29 |
+
from ._validators import validate, CamoufoxConfig
|
| 30 |
from scrapling.core._types import (
|
|
|
|
| 31 |
Dict,
|
|
|
|
|
|
|
| 32 |
Optional,
|
|
|
|
| 33 |
Union,
|
| 34 |
+
Callable,
|
| 35 |
+
Literal,
|
| 36 |
+
List,
|
| 37 |
+
SelectorWaitStates,
|
| 38 |
)
|
|
|
|
| 39 |
from scrapling.engines.toolbelt import (
|
| 40 |
Response,
|
| 41 |
ResponseFactory,
|
| 42 |
async_intercept_route,
|
|
|
|
|
|
|
| 43 |
generate_convincing_referer,
|
| 44 |
get_os_name,
|
| 45 |
intercept_route,
|
| 46 |
)
|
| 47 |
|
| 48 |
+
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class StealthySession:
|
| 52 |
+
"""A Stealthy session manager with page pooling."""
|
| 53 |
+
|
| 54 |
+
__slots__ = (
|
| 55 |
+
"max_pages",
|
| 56 |
+
"headless",
|
| 57 |
+
"block_images",
|
| 58 |
+
"disable_resources",
|
| 59 |
+
"block_webrtc",
|
| 60 |
+
"allow_webgl",
|
| 61 |
+
"network_idle",
|
| 62 |
+
"humanize",
|
| 63 |
+
"solve_cloudflare",
|
| 64 |
+
"wait",
|
| 65 |
+
"timeout",
|
| 66 |
+
"page_action",
|
| 67 |
+
"wait_selector",
|
| 68 |
+
"addons",
|
| 69 |
+
"wait_selector_state",
|
| 70 |
+
"cookies",
|
| 71 |
+
"google_search",
|
| 72 |
+
"extra_headers",
|
| 73 |
+
"proxy",
|
| 74 |
+
"os_randomize",
|
| 75 |
+
"disable_ads",
|
| 76 |
+
"geoip",
|
| 77 |
+
"adaptor_arguments",
|
| 78 |
+
"additional_arguments",
|
| 79 |
+
"playwright",
|
| 80 |
+
"browser",
|
| 81 |
+
"context",
|
| 82 |
+
"page_pool",
|
| 83 |
+
"_closed",
|
| 84 |
+
"launch_options",
|
| 85 |
+
"context_options",
|
| 86 |
+
)
|
| 87 |
|
|
|
|
| 88 |
def __init__(
|
| 89 |
self,
|
| 90 |
+
max_pages: int = 1,
|
| 91 |
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 92 |
block_images: bool = False,
|
| 93 |
disable_resources: bool = False,
|
|
|
|
| 95 |
allow_webgl: bool = True,
|
| 96 |
network_idle: bool = False,
|
| 97 |
humanize: Union[bool, float] = True,
|
| 98 |
+
solve_cloudflare: bool = False,
|
| 99 |
+
wait: Union[int, float] = 0,
|
| 100 |
+
timeout: Union[int, float] = 30000,
|
| 101 |
+
page_action: Optional[Callable] = None,
|
| 102 |
wait_selector: Optional[str] = None,
|
| 103 |
addons: Optional[List[str]] = None,
|
| 104 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 105 |
+
cookies: Optional[List[Dict]] = None,
|
| 106 |
google_search: bool = True,
|
| 107 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 108 |
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 109 |
os_randomize: bool = False,
|
| 110 |
disable_ads: bool = False,
|
| 111 |
geoip: bool = False,
|
| 112 |
+
adaptor_arguments: Optional[Dict] = None,
|
| 113 |
+
additional_arguments: Optional[Dict] = None,
|
| 114 |
):
|
| 115 |
+
"""A Browser session manager with page pooling
|
| 116 |
|
| 117 |
:param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
|
| 118 |
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 119 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 120 |
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 121 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 122 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 123 |
:param block_webrtc: Blocks WebRTC entirely.
|
|
|
|
| 132 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 133 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 134 |
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
| 135 |
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 136 |
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
| 137 |
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
| 138 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 139 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
|
| 140 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 141 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 142 |
+
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 143 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 144 |
:param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 145 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
params = {
|
| 148 |
+
"max_pages": max_pages,
|
| 149 |
+
"headless": headless,
|
| 150 |
+
"block_images": block_images,
|
| 151 |
+
"disable_resources": disable_resources,
|
| 152 |
+
"block_webrtc": block_webrtc,
|
| 153 |
+
"allow_webgl": allow_webgl,
|
| 154 |
+
"network_idle": network_idle,
|
| 155 |
+
"humanize": humanize,
|
| 156 |
+
"solve_cloudflare": solve_cloudflare,
|
| 157 |
+
"wait": wait,
|
| 158 |
+
"timeout": timeout,
|
| 159 |
+
"page_action": page_action,
|
| 160 |
+
"wait_selector": wait_selector,
|
| 161 |
+
"addons": addons,
|
| 162 |
+
"wait_selector_state": wait_selector_state,
|
| 163 |
+
"cookies": cookies,
|
| 164 |
+
"google_search": google_search,
|
| 165 |
+
"extra_headers": extra_headers,
|
| 166 |
+
"proxy": proxy,
|
| 167 |
+
"os_randomize": os_randomize,
|
| 168 |
+
"disable_ads": disable_ads,
|
| 169 |
+
"geoip": geoip,
|
| 170 |
+
"adaptor_arguments": adaptor_arguments,
|
| 171 |
+
"additional_arguments": additional_arguments,
|
| 172 |
+
}
|
| 173 |
+
config = validate(params, CamoufoxConfig)
|
| 174 |
+
|
| 175 |
+
self.max_pages = config.max_pages
|
| 176 |
+
self.headless = config.headless
|
| 177 |
+
self.block_images = config.block_images
|
| 178 |
+
self.disable_resources = config.disable_resources
|
| 179 |
+
self.block_webrtc = config.block_webrtc
|
| 180 |
+
self.allow_webgl = config.allow_webgl
|
| 181 |
+
self.network_idle = config.network_idle
|
| 182 |
+
self.humanize = config.humanize
|
| 183 |
+
self.solve_cloudflare = config.solve_cloudflare
|
| 184 |
+
self.wait = config.wait
|
| 185 |
+
self.timeout = config.timeout
|
| 186 |
+
self.page_action = config.page_action
|
| 187 |
+
self.wait_selector = config.wait_selector
|
| 188 |
+
self.addons = config.addons
|
| 189 |
+
self.wait_selector_state = config.wait_selector_state
|
| 190 |
+
self.cookies = config.cookies
|
| 191 |
+
self.google_search = config.google_search
|
| 192 |
+
self.extra_headers = config.extra_headers
|
| 193 |
+
self.proxy = config.proxy
|
| 194 |
+
self.os_randomize = config.os_randomize
|
| 195 |
+
self.disable_ads = config.disable_ads
|
| 196 |
+
self.geoip = config.geoip
|
| 197 |
+
self.adaptor_arguments = config.adaptor_arguments
|
| 198 |
+
self.additional_arguments = config.additional_arguments
|
| 199 |
+
|
| 200 |
+
self.playwright: Optional[Playwright] = None
|
| 201 |
+
self.browser: Optional[Union[BrowserType, Browser]] = None
|
| 202 |
+
self.context: Optional[BrowserContext] = None
|
| 203 |
+
self.page_pool = PagePool(self.max_pages)
|
| 204 |
+
self._closed = False
|
| 205 |
+
self.adaptor_arguments = config.adaptor_arguments
|
| 206 |
+
self.page_action = config.page_action
|
| 207 |
+
self.__initiate_browser_options__()
|
| 208 |
+
|
| 209 |
+
def __initiate_browser_options__(self):
|
| 210 |
+
"""Initiate browser options."""
|
| 211 |
+
self.launch_options = {
|
| 212 |
"geoip": self.geoip,
|
| 213 |
"proxy": self.proxy,
|
| 214 |
"enable_cache": True,
|
| 215 |
"addons": self.addons,
|
| 216 |
"exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
|
| 217 |
"headless": self.headless,
|
| 218 |
+
"humanize": True if self.solve_cloudflare else self.humanize,
|
| 219 |
"i_know_what_im_doing": True, # To turn warnings off with the user configurations
|
| 220 |
"allow_webgl": self.allow_webgl,
|
| 221 |
"block_webrtc": self.block_webrtc,
|
|
|
|
| 223 |
"os": None if self.os_randomize else get_os_name(),
|
| 224 |
**self.additional_arguments,
|
| 225 |
}
|
| 226 |
+
self.context_options = {}
|
| 227 |
+
|
| 228 |
+
def __create__(self):
|
| 229 |
+
"""Create a browser for this instance and context."""
|
| 230 |
+
self.playwright = sync_playwright().start()
|
| 231 |
+
self.browser = NewBrowser(self.playwright, **self.launch_options)
|
| 232 |
+
self.context = self.browser.new_context(**self.context_options)
|
| 233 |
+
if self.cookies:
|
| 234 |
+
self.context.add_cookies(self.cookies)
|
| 235 |
+
|
| 236 |
+
def __enter__(self):
|
| 237 |
+
self.__create__()
|
| 238 |
+
return self
|
| 239 |
+
|
| 240 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 241 |
+
self.close()
|
| 242 |
+
|
| 243 |
+
def close(self):
|
| 244 |
+
"""Close all resources"""
|
| 245 |
+
if self._closed:
|
| 246 |
+
return
|
| 247 |
+
|
| 248 |
+
if self.context:
|
| 249 |
+
self.context.close()
|
| 250 |
+
self.context = None
|
| 251 |
+
|
| 252 |
+
if self.browser:
|
| 253 |
+
self.browser.close()
|
| 254 |
+
self.browser = None
|
| 255 |
+
|
| 256 |
+
if self.playwright:
|
| 257 |
+
self.playwright.stop()
|
| 258 |
+
self.playwright = None
|
| 259 |
+
|
| 260 |
+
self._closed = True
|
| 261 |
+
|
| 262 |
+
def _get_or_create_page(self) -> PageInfo:
|
| 263 |
+
"""Get an available page or create a new one"""
|
| 264 |
+
# Try to get a ready page first
|
| 265 |
+
page_info = self.page_pool.get_ready_page()
|
| 266 |
+
if page_info:
|
| 267 |
+
return page_info
|
| 268 |
+
|
| 269 |
+
# Create a new page if under limit
|
| 270 |
+
if self.page_pool.pages_count < self.max_pages:
|
| 271 |
+
page = self.context.new_page()
|
| 272 |
+
page.set_default_navigation_timeout(self.timeout)
|
| 273 |
+
page.set_default_timeout(self.timeout)
|
| 274 |
+
if self.extra_headers:
|
| 275 |
+
page.set_extra_http_headers(self.extra_headers)
|
| 276 |
+
|
| 277 |
+
if self.disable_resources:
|
| 278 |
+
page.route("**/*", intercept_route)
|
| 279 |
+
|
| 280 |
+
return self.page_pool.add_page(page)
|
| 281 |
+
|
| 282 |
+
# Wait for a page to become available
|
| 283 |
+
max_wait = 30
|
| 284 |
+
start_time = time()
|
| 285 |
+
|
| 286 |
+
while time() - start_time < max_wait:
|
| 287 |
+
page_info = self.page_pool.get_ready_page()
|
| 288 |
+
if page_info:
|
| 289 |
+
return page_info
|
| 290 |
+
sleep(0.05)
|
| 291 |
+
|
| 292 |
+
raise TimeoutError("No pages available within timeout period")
|
| 293 |
|
| 294 |
@staticmethod
|
| 295 |
+
def _detect_cloudflare(page_content):
|
| 296 |
"""
|
| 297 |
Detect the type of Cloudflare challenge present in the provided page content.
|
| 298 |
|
|
|
|
| 327 |
:param page: The targeted page
|
| 328 |
:return:
|
| 329 |
"""
|
| 330 |
+
challenge_type = self._detect_cloudflare(page.content())
|
|
|
|
| 331 |
if not challenge_type:
|
| 332 |
log.error("No Cloudflare challenge found.")
|
| 333 |
return
|
|
|
|
| 346 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 347 |
page.wait_for_timeout(500)
|
| 348 |
|
| 349 |
+
iframe = page.frame(url=__CF_PATTERN__)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
if iframe is None:
|
| 351 |
log.info("Didn't find Cloudflare iframe!")
|
| 352 |
return
|
|
|
|
| 367 |
log.info("Cloudflare captcha is solved")
|
| 368 |
return
|
| 369 |
|
| 370 |
+
def fetch(self, url: str) -> Response:
    """Opens up the browser and do your request based on your chosen options.

    :param url: The Target url.
    :return: A `Response` object.
    :raises RuntimeError: If the session was already closed, or no response
        could be obtained for the navigation.
    """
    if self._closed:
        raise RuntimeError("Context manager has been closed")

    final_response = None
    referer = generate_convincing_referer(url) if self.google_search else None

    def handle_response(finished_response: SyncPlaywrightResponse):
        # Track the last top-level document response; with redirects the
        # response returned by `goto()` is not necessarily the final one.
        nonlocal final_response
        if (
            finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()
        ):
            final_response = finished_response

    page_info = self._get_or_create_page()
    page_info.mark_busy(url=url)

    try:
        # Navigate to URL and wait for a specified state
        page_info.page.on("response", handle_response)
        first_response = page_info.page.goto(url, referer=referer)
        page_info.page.wait_for_load_state(state="domcontentloaded")

        if self.network_idle:
            page_info.page.wait_for_load_state("networkidle")

        if not first_response:
            raise RuntimeError(f"Failed to get response for {url}")

        if self.solve_cloudflare:
            self._solve_cloudflare(page_info.page)
            # Make sure the page is fully loaded after the captcha
            page_info.page.wait_for_load_state(state="load")
            page_info.page.wait_for_load_state(state="domcontentloaded")
            if self.network_idle:
                page_info.page.wait_for_load_state("networkidle")

        if self.page_action is not None:
            try:
                page_info.page = self.page_action(page_info.page)
            except Exception as e:
                log.error(f"Error executing page_action: {e}")

        if self.wait_selector:
            try:
                waiter: Locator = page_info.page.locator(self.wait_selector)
                waiter.first.wait_for(state=self.wait_selector_state)
                # Wait again after waiting for the selector, helpful with protections like Cloudflare
                page_info.page.wait_for_load_state(state="load")
                page_info.page.wait_for_load_state(state="domcontentloaded")
                if self.network_idle:
                    page_info.page.wait_for_load_state("networkidle")
            except Exception as e:
                log.error(f"Error waiting for selector {self.wait_selector}: {e}")

        page_info.page.wait_for_timeout(self.wait)
        response = ResponseFactory.from_playwright_response(
            page_info.page, first_response, final_response, self.adaptor_arguments
        )

        # Mark the page as ready for next use
        page_info.mark_ready()

        return response

    except Exception:
        page_info.mark_error()
        # Bare `raise` preserves the original traceback cleanly.
        raise
    finally:
        # Pages are pooled and reused across fetches; without detaching the
        # handler here, every fetch on a reused page would stack one more
        # "response" listener on top of the previous ones.
        page_info.page.remove_listener("response", handle_response)
|
| 444 |
+
|
| 445 |
+
def get_pool_stats(self) -> Dict[str, int]:
    """Report a snapshot of the page pool's occupancy.

    :return: Counts of total/ready/busy pages plus the configured maximum.
    """
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        ready_pages=pool.ready_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
class AsyncStealthySession(StealthySession):
|
| 456 |
+
"""A Stealthy session manager with page pooling."""
|
| 457 |
+
|
| 458 |
+
def __init__(
    self,
    max_pages: int = 1,
    headless: Union[bool, Literal["virtual"]] = True,  # noqa: F821
    block_images: bool = False,
    disable_resources: bool = False,
    block_webrtc: bool = False,
    allow_webgl: bool = True,
    network_idle: bool = False,
    humanize: Union[bool, float] = True,
    solve_cloudflare: bool = False,
    wait: Union[int, float] = 0,
    timeout: Union[int, float] = 30000,
    page_action: Optional[Callable] = None,
    wait_selector: Optional[str] = None,
    addons: Optional[List[str]] = None,
    wait_selector_state: SelectorWaitStates = "attached",
    cookies: Optional[List[Dict]] = None,
    google_search: bool = True,
    extra_headers: Optional[Dict[str, str]] = None,
    proxy: Optional[Union[str, Dict[str, str]]] = None,
    os_randomize: bool = False,
    disable_ads: bool = False,
    geoip: bool = False,
    adaptor_arguments: Optional[Dict] = None,
    additional_arguments: Optional[Dict] = None,
):
    """A Browser session manager with page pooling

    :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
    :param block_images: Prevent the loading of images through Firefox preferences.
        This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
    :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
    :param block_webrtc: Blocks WebRTC entirely.
    :param cookies: Set cookies for the next request.
    :param addons: List of Firefox addons to use. Must be paths to extracted addons.
    :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
    :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
    :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
    :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
        It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
    :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
    :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # Delegate all shared configuration/validation to the sync parent; the
    # arguments are positional, so the order must mirror the parent signature.
    super().__init__(
        max_pages,
        headless,
        block_images,
        disable_resources,
        block_webrtc,
        allow_webgl,
        network_idle,
        humanize,
        solve_cloudflare,
        wait,
        timeout,
        page_action,
        wait_selector,
        addons,
        wait_selector_state,
        cookies,
        google_search,
        extra_headers,
        proxy,
        os_randomize,
        disable_ads,
        geoip,
        adaptor_arguments,
        additional_arguments,
    )
    # Async driver/browser/context handles; populated later in `__create__`,
    # torn down in `close`.
    self.playwright: Optional[AsyncPlaywright] = None
    self.browser: Optional[Union[AsyncBrowserType, AsyncBrowser]] = None
    self.context: Optional[AsyncBrowserContext] = None
    # Serializes page acquisition/creation across concurrent fetches.
    self._lock = Lock()
    # NOTE(review): assigning instance attributes does not disable the
    # inherited sync context-manager protocol — `with` looks dunders up on
    # the type, not the instance. Presumably meant to block sync usage of
    # this async session; confirm intent.
    self.__enter__ = None
    self.__exit__ = None
|
| 548 |
+
|
| 549 |
+
async def __create__(self):
    """Boot the async Playwright driver, launch Camoufox, and open a context."""
    driver = await async_playwright().start()
    self.playwright = driver
    browser = await AsyncNewBrowser(driver, **self.launch_options)
    self.browser = browser
    context = await browser.new_context(**self.context_options)
    self.context = context
    # Seed the context with any cookies configured at construction time.
    if self.cookies:
        await context.add_cookies(self.cookies)
|
| 558 |
+
|
| 559 |
+
async def __aenter__(self):
    """Async context entry: launch the browser stack, then hand back the session."""
    await self.__create__()
    return self
|
| 562 |
+
|
| 563 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Async context exit: delegate all cleanup to `close()`."""
    await self.close()
|
| 565 |
+
|
| 566 |
+
async def close(self):
    """Release the browser context, the browser itself, and the Playwright
    driver — in that order — then flag the session as closed. Idempotent:
    repeated calls after the first are no-ops."""
    if self._closed:
        return

    context = self.context
    if context:
        await context.close()
        self.context = None

    browser = self.browser
    if browser:
        await browser.close()
        self.browser = None

    driver = self.playwright
    if driver:
        await driver.stop()
        self.playwright = None

    self._closed = True
|
| 584 |
+
|
| 585 |
+
async def _get_or_create_page(self) -> PageInfo:
    """Get an available page or create a new one"""
    # NOTE(review): the lock appears to span the whole body, including the
    # polling loop below — confirm that holding it while waiting cannot
    # starve concurrent fetches (pages are marked ready outside this lock).
    async with self._lock:
        # Try to get a ready page first
        page_info = self.page_pool.get_ready_page()
        if page_info:
            return page_info

        # Create a new page if under limit
        if self.page_pool.pages_count < self.max_pages:
            page = await self.context.new_page()
            # Apply the session-wide timeout to navigation and all other waits.
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            if self.extra_headers:
                await page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                # Route everything through the interceptor that drops
                # non-essential resource types.
                await page.route("**/*", async_intercept_route)

            return self.page_pool.add_page(page)

        # Wait for a page to become available
        max_wait = 30  # seconds; poll every 50 ms below
        start_time = time()

        while time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            await asyncio_sleep(0.05)

        raise TimeoutError("No pages available within timeout period")
|
| 617 |
+
|
| 618 |
+
async def _solve_cloudflare(self, page: async_Page):
|
| 619 |
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
| 620 |
|
| 621 |
:param page: The async targeted page
|
| 622 |
:return:
|
| 623 |
"""
|
| 624 |
+
challenge_type = self._detect_cloudflare(await page.content())
|
|
|
|
| 625 |
if not challenge_type:
|
| 626 |
log.error("No Cloudflare challenge found.")
|
| 627 |
return
|
|
|
|
| 640 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 641 |
await page.wait_for_timeout(500)
|
| 642 |
|
| 643 |
+
iframe = page.frame(url=__CF_PATTERN__)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
if iframe is None:
|
| 645 |
log.info("Didn't find Cloudflare iframe!")
|
| 646 |
return
|
|
|
|
| 663 |
log.info("Cloudflare captcha is solved")
|
| 664 |
return
|
| 665 |
|
| 666 |
+
async def fetch(self, url: str) -> Response:
    """Opens up the browser and do your request based on your chosen options.

    :param url: The Target url.
    :return: A `Response` object.
    :raises RuntimeError: If the session was already closed, or no response
        could be obtained for the navigation.
    """
    if self._closed:
        raise RuntimeError("Context manager has been closed")

    final_response = None
    referer = generate_convincing_referer(url) if self.google_search else None

    async def handle_response(finished_response: AsyncPlaywrightResponse):
        # Track the last top-level document response; with redirects the
        # response returned by `goto()` is not necessarily the final one.
        nonlocal final_response
        if (
            finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()
        ):
            final_response = finished_response

    page_info = await self._get_or_create_page()
    page_info.mark_busy(url=url)

    try:
        # Navigate to URL and wait for a specified state
        page_info.page.on("response", handle_response)
        first_response = await page_info.page.goto(url, referer=referer)
        await page_info.page.wait_for_load_state(state="domcontentloaded")

        if self.network_idle:
            await page_info.page.wait_for_load_state("networkidle")

        if not first_response:
            raise RuntimeError(f"Failed to get response for {url}")

        if self.solve_cloudflare:
            await self._solve_cloudflare(page_info.page)
            # Make sure the page is fully loaded after the captcha
            await page_info.page.wait_for_load_state(state="load")
            await page_info.page.wait_for_load_state(state="domcontentloaded")
            if self.network_idle:
                await page_info.page.wait_for_load_state("networkidle")

        if self.page_action is not None:
            try:
                page_info.page = await self.page_action(page_info.page)
            except Exception as e:
                log.error(f"Error executing page_action: {e}")

        if self.wait_selector:
            try:
                waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
                await waiter.first.wait_for(state=self.wait_selector_state)
                # Wait again after waiting for the selector, helpful with protections like Cloudflare
                await page_info.page.wait_for_load_state(state="load")
                await page_info.page.wait_for_load_state(state="domcontentloaded")
                if self.network_idle:
                    await page_info.page.wait_for_load_state("networkidle")
            except Exception as e:
                log.error(f"Error waiting for selector {self.wait_selector}: {e}")

        await page_info.page.wait_for_timeout(self.wait)

        # Create response object
        response = await ResponseFactory.from_async_playwright_response(
            page_info.page, first_response, final_response, self.adaptor_arguments
        )

        # Mark the page as ready for next use
        page_info.mark_ready()

        return response

    except Exception:
        page_info.mark_error()
        # Bare `raise` preserves the original traceback cleanly.
        raise
    finally:
        # Pooled pages are reused; drop the handler so repeated fetches don't
        # accumulate duplicate "response" listeners on the same page.
        page_info.page.remove_listener("response", handle_response)
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from msgspec import Struct, convert, ValidationError
|
| 2 |
from urllib.parse import urlparse
|
|
|
|
| 3 |
|
| 4 |
from scrapling.core._types import (
|
| 5 |
Optional,
|
|
@@ -78,6 +79,70 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 78 |
raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
|
| 79 |
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
def validate(params, model):
|
| 82 |
try:
|
| 83 |
config = convert(params, model)
|
|
|
|
| 1 |
from msgspec import Struct, convert, ValidationError
|
| 2 |
from urllib.parse import urlparse
|
| 3 |
+
from os.path import exists, isdir
|
| 4 |
|
| 5 |
from scrapling.core._types import (
|
| 6 |
Optional,
|
|
|
|
| 79 |
raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
|
| 80 |
|
| 81 |
|
| 82 |
+
class CamoufoxConfig(Struct, kw_only=True, frozen=False):
    """Configuration struct for validation"""

    max_pages: int = 1
    headless: Union[bool, Literal["virtual"]] = True  # noqa: F821
    block_images: bool = False
    disable_resources: bool = False
    block_webrtc: bool = False
    allow_webgl: bool = True
    network_idle: bool = False
    humanize: Union[bool, float] = True
    solve_cloudflare: bool = False
    wait: Union[int, float] = 0
    timeout: Union[int, float] = 30000
    page_action: Optional[Callable] = None
    wait_selector: Optional[str] = None
    addons: Optional[List[str]] = None
    wait_selector_state: SelectorWaitStates = "attached"
    cookies: Optional[List[Dict]] = None
    google_search: bool = True
    extra_headers: Optional[Dict[str, str]] = None
    # The default value for proxy in Playwright's source is `None`
    proxy: Optional[Union[str, Dict[str, str]]] = None
    os_randomize: bool = False
    disable_ads: bool = False
    geoip: bool = False
    adaptor_arguments: Optional[Dict] = None
    additional_arguments: Optional[Dict] = None

    def __post_init__(self):
        """Custom validation after msgspec validation"""
        # Range/sanity checks that msgspec's type coercion doesn't cover.
        if not (1 <= self.max_pages <= 50):
            raise ValueError("max_pages must be between 1 and 50")
        if self.timeout < 0:
            raise ValueError("timeout must be >= 0")
        if self.page_action is not None and not callable(self.page_action):
            raise TypeError(
                f"page_action must be callable, got {type(self.page_action).__name__}"
            )
        if self.proxy:
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)

        if not self.addons:
            self.addons = []
        else:
            # Each addon must be an existing directory (an extracted addon).
            for addon_path in self.addons:
                if not exists(addon_path):
                    raise FileNotFoundError(f"Addon's path not found: {addon_path}")
                elif not isdir(addon_path):
                    raise ValueError(
                        f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon_path}"
                    )

        # Normalize optional containers so downstream code never sees `None`.
        self.cookies = self.cookies or []
        if self.solve_cloudflare and self.timeout < 60_000:
            # Cloudflare solving needs more headroom than the default timeout.
            self.timeout = 60_000
        self.adaptor_arguments = self.adaptor_arguments or {}
        self.additional_arguments = self.additional_arguments or {}
|
| 144 |
+
|
| 145 |
+
|
| 146 |
def validate(params, model):
|
| 147 |
try:
|
| 148 |
config = convert(params, model)
|
scrapling/fetchers.py
CHANGED
|
@@ -10,10 +10,10 @@ from scrapling.core._types import (
|
|
| 10 |
)
|
| 11 |
from scrapling.engines import (
|
| 12 |
FetcherSession,
|
| 13 |
-
|
|
|
|
| 14 |
DynamicSession,
|
| 15 |
AsyncDynamicSession,
|
| 16 |
-
check_if_engine_usable,
|
| 17 |
FetcherClient as _FetcherClient,
|
| 18 |
AsyncFetcherClient as _AsyncFetcherClient,
|
| 19 |
)
|
|
@@ -57,23 +57,23 @@ class StealthyFetcher(BaseFetcher):
|
|
| 57 |
block_webrtc: bool = False,
|
| 58 |
allow_webgl: bool = True,
|
| 59 |
network_idle: bool = False,
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
wait:
|
| 63 |
-
timeout:
|
| 64 |
-
page_action: Callable = None,
|
| 65 |
wait_selector: Optional[str] = None,
|
| 66 |
-
|
| 67 |
-
solve_cloudflare: Optional[bool] = False,
|
| 68 |
wait_selector_state: SelectorWaitStates = "attached",
|
|
|
|
| 69 |
google_search: bool = True,
|
| 70 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 71 |
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 72 |
os_randomize: bool = False,
|
| 73 |
disable_ads: bool = False,
|
| 74 |
geoip: bool = False,
|
| 75 |
-
custom_config: Dict = None,
|
| 76 |
-
additional_arguments: Dict = None,
|
| 77 |
) -> Response:
|
| 78 |
"""
|
| 79 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -106,7 +106,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 106 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 107 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 108 |
:param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 109 |
-
:return: A `Response` object
|
| 110 |
"""
|
| 111 |
if not custom_config:
|
| 112 |
custom_config = {}
|
|
@@ -115,8 +115,9 @@ class StealthyFetcher(BaseFetcher):
|
|
| 115 |
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 116 |
)
|
| 117 |
|
| 118 |
-
|
| 119 |
wait=wait,
|
|
|
|
| 120 |
proxy=proxy,
|
| 121 |
geoip=geoip,
|
| 122 |
addons=addons,
|
|
@@ -139,8 +140,8 @@ class StealthyFetcher(BaseFetcher):
|
|
| 139 |
wait_selector_state=wait_selector_state,
|
| 140 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 141 |
additional_arguments=additional_arguments or {},
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
|
| 145 |
@classmethod
|
| 146 |
async def async_fetch(
|
|
@@ -150,25 +151,25 @@ class StealthyFetcher(BaseFetcher):
|
|
| 150 |
block_images: bool = False,
|
| 151 |
disable_resources: bool = False,
|
| 152 |
block_webrtc: bool = False,
|
| 153 |
-
cookies: Optional[Iterable[Dict]] = None,
|
| 154 |
allow_webgl: bool = True,
|
| 155 |
network_idle: bool = False,
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
| 160 |
wait_selector: Optional[str] = None,
|
| 161 |
-
|
| 162 |
-
solve_cloudflare: Optional[bool] = False,
|
| 163 |
wait_selector_state: SelectorWaitStates = "attached",
|
|
|
|
| 164 |
google_search: bool = True,
|
| 165 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 166 |
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 167 |
os_randomize: bool = False,
|
| 168 |
disable_ads: bool = False,
|
| 169 |
geoip: bool = False,
|
| 170 |
-
custom_config: Dict = None,
|
| 171 |
-
additional_arguments: Dict = None,
|
| 172 |
) -> Response:
|
| 173 |
"""
|
| 174 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -201,7 +202,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 201 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 202 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 203 |
:param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 204 |
-
:return: A `Response` object
|
| 205 |
"""
|
| 206 |
if not custom_config:
|
| 207 |
custom_config = {}
|
|
@@ -210,8 +211,9 @@ class StealthyFetcher(BaseFetcher):
|
|
| 210 |
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 211 |
)
|
| 212 |
|
| 213 |
-
|
| 214 |
wait=wait,
|
|
|
|
| 215 |
proxy=proxy,
|
| 216 |
geoip=geoip,
|
| 217 |
addons=addons,
|
|
@@ -234,8 +236,8 @@ class StealthyFetcher(BaseFetcher):
|
|
| 234 |
wait_selector_state=wait_selector_state,
|
| 235 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 236 |
additional_arguments=additional_arguments or {},
|
| 237 |
-
)
|
| 238 |
-
|
| 239 |
|
| 240 |
|
| 241 |
class DynamicFetcher(BaseFetcher):
|
|
@@ -425,12 +427,3 @@ class DynamicFetcher(BaseFetcher):
|
|
| 425 |
|
| 426 |
|
| 427 |
PlayWrightFetcher = DynamicFetcher # For backward-compatibility
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
class CustomFetcher(BaseFetcher):
|
| 431 |
-
@classmethod
|
| 432 |
-
def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
|
| 433 |
-
engine = check_if_engine_usable(browser_engine)(
|
| 434 |
-
adaptor_arguments=cls._generate_parser_arguments(), **kwargs
|
| 435 |
-
)
|
| 436 |
-
return engine.fetch(url)
|
|
|
|
| 10 |
)
|
| 11 |
from scrapling.engines import (
|
| 12 |
FetcherSession,
|
| 13 |
+
StealthySession,
|
| 14 |
+
AsyncStealthySession,
|
| 15 |
DynamicSession,
|
| 16 |
AsyncDynamicSession,
|
|
|
|
| 17 |
FetcherClient as _FetcherClient,
|
| 18 |
AsyncFetcherClient as _AsyncFetcherClient,
|
| 19 |
)
|
|
|
|
| 57 |
block_webrtc: bool = False,
|
| 58 |
allow_webgl: bool = True,
|
| 59 |
network_idle: bool = False,
|
| 60 |
+
humanize: Union[bool, float] = True,
|
| 61 |
+
solve_cloudflare: bool = False,
|
| 62 |
+
wait: Union[int, float] = 0,
|
| 63 |
+
timeout: Union[int, float] = 30000,
|
| 64 |
+
page_action: Optional[Callable] = None,
|
| 65 |
wait_selector: Optional[str] = None,
|
| 66 |
+
addons: Optional[List[str]] = None,
|
|
|
|
| 67 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 68 |
+
cookies: Optional[List[Dict]] = None,
|
| 69 |
google_search: bool = True,
|
| 70 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 71 |
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 72 |
os_randomize: bool = False,
|
| 73 |
disable_ads: bool = False,
|
| 74 |
geoip: bool = False,
|
| 75 |
+
custom_config: Optional[Dict] = None,
|
| 76 |
+
additional_arguments: Optional[Dict] = None,
|
| 77 |
) -> Response:
|
| 78 |
"""
|
| 79 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 106 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 107 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 108 |
:param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 109 |
+
:return: A `Response` object.
|
| 110 |
"""
|
| 111 |
if not custom_config:
|
| 112 |
custom_config = {}
|
|
|
|
| 115 |
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 116 |
)
|
| 117 |
|
| 118 |
+
with StealthySession(
|
| 119 |
wait=wait,
|
| 120 |
+
max_pages=1,
|
| 121 |
proxy=proxy,
|
| 122 |
geoip=geoip,
|
| 123 |
addons=addons,
|
|
|
|
| 140 |
wait_selector_state=wait_selector_state,
|
| 141 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 142 |
additional_arguments=additional_arguments or {},
|
| 143 |
+
) as engine:
|
| 144 |
+
return engine.fetch(url)
|
| 145 |
|
| 146 |
@classmethod
|
| 147 |
async def async_fetch(
|
|
|
|
| 151 |
block_images: bool = False,
|
| 152 |
disable_resources: bool = False,
|
| 153 |
block_webrtc: bool = False,
|
|
|
|
| 154 |
allow_webgl: bool = True,
|
| 155 |
network_idle: bool = False,
|
| 156 |
+
humanize: Union[bool, float] = True,
|
| 157 |
+
solve_cloudflare: bool = False,
|
| 158 |
+
wait: Union[int, float] = 0,
|
| 159 |
+
timeout: Union[int, float] = 30000,
|
| 160 |
+
page_action: Optional[Callable] = None,
|
| 161 |
wait_selector: Optional[str] = None,
|
| 162 |
+
addons: Optional[List[str]] = None,
|
|
|
|
| 163 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 164 |
+
cookies: Optional[List[Dict]] = None,
|
| 165 |
google_search: bool = True,
|
| 166 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 167 |
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 168 |
os_randomize: bool = False,
|
| 169 |
disable_ads: bool = False,
|
| 170 |
geoip: bool = False,
|
| 171 |
+
custom_config: Optional[Dict] = None,
|
| 172 |
+
additional_arguments: Optional[Dict] = None,
|
| 173 |
) -> Response:
|
| 174 |
"""
|
| 175 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 202 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 203 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 204 |
:param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 205 |
+
:return: A `Response` object.
|
| 206 |
"""
|
| 207 |
if not custom_config:
|
| 208 |
custom_config = {}
|
|
|
|
| 211 |
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 212 |
)
|
| 213 |
|
| 214 |
+
async with AsyncStealthySession(
|
| 215 |
wait=wait,
|
| 216 |
+
max_pages=1,
|
| 217 |
proxy=proxy,
|
| 218 |
geoip=geoip,
|
| 219 |
addons=addons,
|
|
|
|
| 236 |
wait_selector_state=wait_selector_state,
|
| 237 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 238 |
additional_arguments=additional_arguments or {},
|
| 239 |
+
) as engine:
|
| 240 |
+
return await engine.fetch(url)
|
| 241 |
|
| 242 |
|
| 243 |
class DynamicFetcher(BaseFetcher):
|
|
|
|
| 427 |
|
| 428 |
|
| 429 |
PlayWrightFetcher = DynamicFetcher # For backward-compatibility
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|