# Scrapling/scrapling/engines/static.py
# Karim shoair
# fix: update code and docstrings to remove the old google referer logic
# a28879b
from abc import ABC
from random import choice
from time import sleep as time_sleep
from asyncio import sleep as asyncio_sleep
from curl_cffi.curl import CurlError
from curl_cffi import CurlHttpVersion
from curl_cffi.requests import (
BrowserTypeLiteral,
Session as CurlSession,
AsyncSession as AsyncCurlSession,
)
from scrapling.core.utils import log
from scrapling.core._types import (
Any,
Dict,
Tuple,
Unpack,
Optional,
Awaitable,
SUPPORTED_HTTP_METHODS,
)
from .toolbelt.custom import Response
from .toolbelt.convertor import ResponseFactory
from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error
from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
from .toolbelt.fingerprints import generate_headers, __default_useragent__
# Unique sentinel stored as the session attribute by `FetcherClient` / `AsyncFetcherClient`.
# `_make_request` compares against it by identity to decide whether a one-off curl session
# has to be created (and closed) for the current request.
_NO_SESSION: Any = object()
def _select_random_browser(impersonate: ImpersonateType) -> Optional[BrowserTypeLiteral]:
"""
Handle browser selection logic for the ` impersonate ` parameter.
If impersonate is a list, randomly select one browser from it.
If it's a string or None, return as is.
"""
if isinstance(impersonate, list):
if not impersonate:
return None
return choice(impersonate)
return impersonate
class _ConfigurationLogic(ABC):
# Core Logic Handler (Internal Engine)
__slots__ = (
"_default_impersonate",
"_stealth",
"_default_proxies",
"_default_proxy",
"_default_proxy_auth",
"_default_timeout",
"_default_headers",
"_default_retries",
"_default_retry_delay",
"_default_follow_redirects",
"_default_max_redirects",
"_default_verify",
"_default_cert",
"_default_http3",
"selector_config",
"_is_alive",
"_proxy_rotator",
)
def __init__(self, **kwargs: Unpack[RequestsSession]):
self._default_impersonate = kwargs.get("impersonate", "chrome")
self._stealth = kwargs.get("stealthy_headers", True)
self._default_proxies = kwargs.get("proxies") or {}
self._default_proxy = kwargs.get("proxy") or None
self._default_proxy_auth = kwargs.get("proxy_auth") or None
self._default_timeout = kwargs.get("timeout", 30)
self._default_headers = kwargs.get("headers") or {}
self._default_retries = kwargs.get("retries", 3)
self._default_retry_delay = kwargs.get("retry_delay", 1)
self._default_follow_redirects = kwargs.get("follow_redirects", True)
self._default_max_redirects = kwargs.get("max_redirects", 30)
self._default_verify = kwargs.get("verify", True)
self._default_cert = kwargs.get("cert") or None
self._default_http3 = kwargs.get("http3", False)
self.selector_config = kwargs.get("selector_config") or {}
self._is_alive = False
self._proxy_rotator: Optional[ProxyRotator] = kwargs.get("proxy_rotator")
if self._proxy_rotator and (self._default_proxy or self._default_proxies):
raise ValueError(
"Cannot use 'proxy_rotator' together with 'proxy' or 'proxies'. "
"Use either a static proxy or proxy rotation, not both."
)
@staticmethod
def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
"""Get parameter from kwargs if present, otherwise return default."""
return kwargs[key] if key in kwargs else default
def _merge_request_args(self, **method_kwargs) -> Dict[str, Any]:
"""Merge request-specific arguments with default session arguments."""
url = method_kwargs.pop("url")
# Get parameters from kwargs or use defaults
impersonate = self._get_param(method_kwargs, "impersonate", self._default_impersonate)
impersonate = _select_random_browser(impersonate)
http3_enabled = self._get_param(method_kwargs, "http3", self._default_http3)
stealth = self._get_param(method_kwargs, "stealth", self._stealth)
final_args = {
"url": url,
# Curl automatically generates the suitable browser headers when you use `impersonate`
"headers": self._headers_job(
url,
self._get_param(method_kwargs, "headers", self._default_headers),
stealth,
bool(impersonate),
),
"proxies": self._get_param(method_kwargs, "proxies", self._default_proxies),
"proxy": self._get_param(method_kwargs, "proxy", self._default_proxy),
"proxy_auth": self._get_param(method_kwargs, "proxy_auth", self._default_proxy_auth),
"timeout": self._get_param(method_kwargs, "timeout", self._default_timeout),
"allow_redirects": self._get_param(method_kwargs, "follow_redirects", self._default_follow_redirects),
"max_redirects": self._get_param(method_kwargs, "max_redirects", self._default_max_redirects),
"verify": self._get_param(method_kwargs, "verify", self._default_verify),
"cert": self._get_param(method_kwargs, "cert", self._default_cert),
"impersonate": impersonate,
}
# Add any remaining parameters that weren't explicitly handled above
# Skip the ones we already processed plus internal params
skip_keys = {
"impersonate",
"http3",
"stealth",
"headers",
"proxies",
"proxy",
"proxy_auth",
"timeout",
"follow_redirects",
"max_redirects",
"verify",
"cert",
"retries",
"retry_delay",
"selector_config",
# Browser session params (ignored by HTTP sessions)
"extra_headers",
"google_search",
}
for k, v in method_kwargs.items():
if k not in skip_keys and v is not None:
final_args[k] = v
if http3_enabled: # pragma: no cover
final_args["http_version"] = CurlHttpVersion.V3ONLY
if impersonate:
log.warning(
"The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
)
return final_args
def _headers_job(self, url, headers: Dict, stealth: bool, impersonate_enabled: bool) -> Dict:
"""
1. Adds a useragent to the headers if it doesn't have one
2. Generates real headers and append them to current headers
3. Sets a Google referer header.
"""
# Merge session headers with request headers, request takes precedence (if it was set)
final_headers = {**self._default_headers, **(headers if headers else {})}
headers_keys = {k.lower() for k in final_headers}
if stealth:
if "referer" not in headers_keys:
final_headers["referer"] = "https://www.google.com/"
if not impersonate_enabled: # Curl will generate the suitable headers
extra_headers = generate_headers(browser_mode=False)
final_headers.update(
{k: v for k, v in extra_headers.items() if k.lower() not in headers_keys}
) # Don't overwrite user-supplied headers
elif "user-agent" not in headers_keys and not impersonate_enabled: # pragma: no cover
final_headers["User-Agent"] = __default_useragent__
log.debug(f"Can't find useragent in headers so '{final_headers['User-Agent']}' was used.")
return final_headers
class _SyncSessionLogic(_ConfigurationLogic):
    """Synchronous HTTP session logic built on a curl session, with retries and optional proxy rotation."""

    __slots__ = ("_curl_session",)

    def __init__(self, **kwargs: Unpack[RequestsSession]):
        """Store the session defaults; the curl session itself is only created on `__enter__`."""
        super().__init__(**kwargs)
        self._curl_session: Optional[CurlSession] = None

    def __enter__(self):
        """
        Creates and returns a new synchronous Fetcher Session

        :raises RuntimeError: If this instance already has an active session.
        """
        if self._is_alive:
            raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

        self._curl_session = CurlSession()
        self._is_alive = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Closes the active synchronous session managed by this instance, if any."""
        # For type checking (not accessed error)
        _ = (
            exc_type,
            exc_val,
            exc_tb,
        )

        if self._curl_session:
            self._curl_session.close()
            self._curl_session = None

        self._is_alive = False

    def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
        """
        Perform an HTTP request using the configured session.

        :param method: The HTTP method to use.
        :param stealth: Per-request override of the session's stealthy headers setting; None keeps the session value.
        :param kwargs: Request arguments; must contain `url`. `retries`, `retry_delay`, `selector_config`
            and `proxy` are consumed here, the rest is merged with the session defaults.
        :return: A `Response` object.
        :raises RuntimeError: If there is no active session to send the request with.
        :raises CurlError: Re-raised when the last attempt fails.
        """
        stealth = self._stealth if stealth is None else stealth

        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
        max_retries = self._get_param(kwargs, "retries", self._default_retries)
        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
        # `proxy` is always forwarded explicitly below, which bypasses the default lookup in `_merge_request_args`,
        # so the session-level proxy has to be applied here. An explicit per-request `proxy=None` still wins.
        static_proxy = kwargs.pop("proxy", self._default_proxy)
        # `retries` is the total number of attempts; always send the request at least once
        attempts = max(1, max_retries or 0)

        session = self._curl_session
        one_off_request = False
        if session is _NO_SESSION and self.__enter__ is None:
            # For usage inside FetcherClient
            # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
            session = CurlSession()
            one_off_request = True

        if not session:
            raise RuntimeError("No active session available.")  # pragma: no cover

        try:
            for attempt in range(attempts):
                # The rotator is only consulted when no static proxy applies to this request
                if self._proxy_rotator and static_proxy is None:
                    proxy = self._proxy_rotator.get_proxy()
                else:
                    proxy = static_proxy

                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
                try:
                    response = session.request(method, **request_args)
                    return ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
                except CurlError as e:  # pragma: no cover
                    if attempt < attempts - 1:
                        # Now if the rotator is enabled, we will try again with the new proxy
                        # If it's not enabled, then we will try again with the same proxy
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
                            )
                        else:
                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                        time_sleep(retry_delay)
                    else:
                        log.error(f"Failed after {attempts} attempts: {e}")
                        raise  # Raise the exception if all retries fail
        finally:
            # Sessions created only for this request must not outlive it
            if session and one_off_request:
                session.close()

        # Not reachable in practice (the loop either returns or raises); kept for type checkers
        raise RuntimeError("No active session available.")  # pragma: no cover

    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Response:
        """
        Perform a GET request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)

    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a POST request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)

    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a PUT request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)

    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a DELETE request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
        # But some websites accept it, it depends on the implementation used.
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
class _ASyncSessionLogic(_ConfigurationLogic):
    """Asynchronous HTTP session logic built on an async curl session, with retries and optional proxy rotation."""

    __slots__ = ("_async_curl_session",)

    def __init__(self, **kwargs: Unpack[RequestsSession]):
        """Store the session defaults; the curl session itself is only created on `__aenter__`."""
        super().__init__(**kwargs)
        self._async_curl_session: Optional[AsyncCurlSession] = None

    async def __aenter__(self):  # pragma: no cover
        """
        Creates and returns a new asynchronous Session.

        :raises RuntimeError: If this instance already has an active session.
        """
        if self._is_alive:
            raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

        self._async_curl_session = AsyncCurlSession()
        self._is_alive = True
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Closes the active asynchronous session managed by this instance, if any."""
        # For type checking (not accessed error)
        _ = (
            exc_type,
            exc_val,
            exc_tb,
        )

        if self._async_curl_session:
            await self._async_curl_session.close()
            self._async_curl_session = None

        self._is_alive = False

    async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
        """
        Perform an HTTP request using the configured session.

        :param method: The HTTP method to use.
        :param stealth: Per-request override of the session's stealthy headers setting; None keeps the session value.
        :param kwargs: Request arguments; must contain `url`. `retries`, `retry_delay`, `selector_config`
            and `proxy` are consumed here, the rest is merged with the session defaults.
        :return: A `Response` object.
        :raises RuntimeError: If there is no active session to send the request with.
        :raises CurlError: Re-raised when the last attempt fails.
        """
        stealth = self._stealth if stealth is None else stealth

        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
        max_retries = self._get_param(kwargs, "retries", self._default_retries)
        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
        # `proxy` is always forwarded explicitly below, which bypasses the default lookup in `_merge_request_args`,
        # so the session-level proxy has to be applied here. An explicit per-request `proxy=None` still wins.
        static_proxy = kwargs.pop("proxy", self._default_proxy)
        # `retries` is the total number of attempts; always send the request at least once
        attempts = max(1, max_retries or 0)

        session = self._async_curl_session
        one_off_request = False
        if session is _NO_SESSION and self.__aenter__ is None:
            # For usage inside the `AsyncFetcherClient` class, and that's for several reasons
            # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
            # 2. `curl_cffi` doesn't support making async requests without sessions
            # 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
            session = AsyncCurlSession()
            one_off_request = True

        if not session:
            raise RuntimeError("No active session available.")  # pragma: no cover

        try:
            for attempt in range(attempts):
                # The rotator is only consulted when no static proxy applies to this request
                if self._proxy_rotator and static_proxy is None:
                    proxy = self._proxy_rotator.get_proxy()
                else:
                    proxy = static_proxy

                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
                try:
                    response = await session.request(method, **request_args)
                    return ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
                except CurlError as e:  # pragma: no cover
                    if attempt < attempts - 1:
                        # Now if the rotator is enabled, we will try again with the new proxy
                        # If it's not enabled, then we will try again with the same proxy
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
                            )
                        else:
                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                        await asyncio_sleep(retry_delay)
                    else:
                        log.error(f"Failed after {attempts} attempts: {e}")
                        raise  # Raise the exception if all retries fail
        finally:
            # Sessions created only for this request must not outlive it
            if session and one_off_request:
                await session.close()

        # Not reachable in practice (the loop either returns or raises); kept for type checkers
        raise RuntimeError("No active session available.")  # pragma: no cover

    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Awaitable[Response]:
        """
        Perform a GET request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)

    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a POST request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)

    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a PUT request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)

    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a DELETE request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of attempts to make (at least one is always made). Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
        # But some websites accept it, it depends on the implementation used.
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
class FetcherSession:
    """
    A factory context manager that provides configured Fetcher sessions.

    Entering it with `with` yields a synchronous session and entering it with `async with`
    yields an asynchronous one, both configured with the defaults given to this manager.

    One manager instance serves a single active session at a time (sequential reuse is fine);
    entering it again while a session is still active raises a `RuntimeError`.
    """

    __slots__ = (
        "_default_impersonate",
        "_stealth",
        "_default_proxies",
        "_default_proxy",
        "_default_proxy_auth",
        "_default_timeout",
        "_default_headers",
        "_default_retries",
        "_default_retry_delay",
        "_default_follow_redirects",
        "_default_max_redirects",
        "_default_verify",
        "_default_cert",
        "_default_http3",
        "selector_config",
        "_client",
        "_is_alive",
        "_proxy_rotator",
    )

    def __init__(
        self,
        impersonate: ImpersonateType = "chrome",
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
        proxies: Optional[Dict[str, str]] = None,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[int | float] = 30,
        headers: Optional[Dict[str, str]] = None,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        verify: bool = True,
        cert: Optional[str | Tuple[str, str]] = None,
        selector_config: Optional[Dict] = None,
        proxy_rotator: Optional[ProxyRotator] = None,
    ):
        """
        :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
        :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
                     Cannot be used together with the `proxies` parameter.
        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
        :param timeout: Number of seconds to wait before timing out.
        :param headers: Headers to include in the session with every request.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param verify: Whether to verify HTTPS certificates. Defaults to True.
        :param cert: Tuple of (cert, key) filenames for the client certificate.
        :param selector_config: Arguments passed when creating the final Selector class.
        :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.
        """
        # Runtime state: no session exists until the manager is entered
        self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
        self._is_alive = False

        # Browser emulation and protocol
        self._default_impersonate: ImpersonateType = impersonate
        self._default_http3 = http3
        self._stealth = stealthy_headers
        self._default_headers = headers or {}

        # Proxy settings
        self._default_proxies = proxies or {}
        self._default_proxy = proxy or None
        self._default_proxy_auth = proxy_auth or None
        self._proxy_rotator = proxy_rotator

        # Transport behavior
        self._default_timeout = timeout
        self._default_retries = retries
        self._default_retry_delay = retry_delay
        self._default_follow_redirects = follow_redirects
        self._default_max_redirects = max_redirects
        self._default_verify = verify
        self._default_cert = cert

        self.selector_config = selector_config or {}

    def _session_config(self) -> Dict:
        """Collect the keyword arguments used to build the underlying session logic object."""
        prefix = "_default_"
        # Every `_default_*` slot maps to the option of the same name without the prefix
        settings = {
            slot.removeprefix(prefix): getattr(self, slot) for slot in self.__slots__ if slot.startswith(prefix)
        }
        settings["stealthy_headers"] = self._stealth
        settings["selector_config"] = self.selector_config
        settings["proxy_rotator"] = self._proxy_rotator
        return settings

    def __enter__(self) -> _SyncSessionLogic:
        """Creates and returns a new synchronous Fetcher Session"""
        if self._client is not None:
            raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

        self._client = _SyncSessionLogic(**self._session_config())
        self._is_alive = True
        return self._client.__enter__()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the active synchronous session; raise `RuntimeError` when there is none to close."""
        client = self._client
        if client is None or not isinstance(client, _SyncSessionLogic):
            raise RuntimeError("Cannot exit invalid session")

        client.__exit__(exc_type, exc_val, exc_tb)
        self._client = None
        self._is_alive = False

    async def __aenter__(self) -> _ASyncSessionLogic:
        """Creates and returns a new asynchronous Session."""
        if self._client is not None:
            raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

        self._client = _ASyncSessionLogic(**self._session_config())
        self._is_alive = True
        return await self._client.__aenter__()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the active asynchronous session; raise `RuntimeError` when there is none to close."""
        client = self._client
        if client is None or not isinstance(client, _ASyncSessionLogic):
            raise RuntimeError("Cannot exit invalid session")

        await client.__aexit__(exc_type, exc_val, exc_tb)
        self._client = None
        self._is_alive = False
class FetcherClient(_SyncSessionLogic):
    """
    Synchronous client that is used without a `with` block.

    The `_NO_SESSION` sentinel together with `__enter__` set to None is the combination
    `_make_request` checks for before creating a curl session for a single request and
    closing it once that request is done.
    """

    # Declaring the context-manager names as slots allows them to be overwritten per instance
    __slots__ = ("__enter__", "__exit__")

    def __init__(self, **kwargs: Any) -> None:
        """Store the session defaults and mark this instance as having no persistent session."""
        super().__init__(**kwargs)
        # `None` here is what `_make_request` tests for (`self.__enter__ is None`)
        self.__enter__: Any = None
        self.__exit__: Any = None
        # Must be assigned after `super().__init__`, which sets `_curl_session` to None
        self._curl_session: Any = _NO_SESSION
class AsyncFetcherClient(_ASyncSessionLogic):
    """
    Asynchronous client that is used without an `async with` block.

    The `_NO_SESSION` sentinel together with `__aenter__` set to None is the combination
    `_make_request` checks for before creating an async curl session for a single request
    and closing it once that request is done.
    """

    # Declaring the context-manager names as slots allows them to be overwritten per instance
    __slots__ = ("__aenter__", "__aexit__")

    def __init__(self, **kwargs: Any) -> None:
        """Store the session defaults and mark this instance as having no persistent session."""
        super().__init__(**kwargs)
        # `None` here is what `_make_request` tests for (`self.__aenter__ is None`)
        self.__aenter__: Any = None
        self.__aexit__: Any = None
        # Must be assigned after `super().__init__`, which sets `_async_curl_session` to None
        self._async_curl_session: Any = _NO_SESSION