File size: 22,568 Bytes
024cbba 0cd97d9 6f2d7b6 0cd97d9 31c2447 ed96cdc c908f33 ed96cdc 0cd97d9 ee2299e 0cd97d9 ee2299e ed96cdc ee2299e 0cd97d9 c908f33 e39bf62 0cd97d9 1812d2b 0cd97d9 47dd985 0cd97d9 60d0c55 0cd97d9 03de577 ee2299e 0cd97d9 66fd35f f58c872 a28879b 0cd97d9 c61a805 b6969b2 8e67a4c 0de8025 0cd97d9 c908f33 ee2299e 0cd97d9 a17a010 0cd97d9 a17a010 8ff23b3 0cd97d9 8ff23b3 ed96cdc 8ff23b3 ed96cdc c908f33 ed96cdc 1803348 8ff23b3 a17a010 c908f33 0cd97d9 a28879b daaad4e a28879b daaad4e 47dd985 daaad4e 32daccc 0cd97d9 32daccc c908f33 1803348 0cd97d9 c7e573a e23e9c6 a28879b e23e9c6 0cd97d9 024cbba 31c2447 32daccc ed96cdc 47dd985 ed96cdc 32daccc ed96cdc 0cd97d9 ee2299e e39bf62 0cd97d9 ed96cdc c908f33 0cd97d9 1812d2b 0cd97d9 47dd985 0cd97d9 66fd35f 0cd97d9 60d0c55 0cd97d9 03de577 ee2299e 0cd97d9 f58c872 a28879b 0cd97d9 c10c240 c61a805 b6969b2 8e67a4c 0de8025 0cd97d9 c908f33 ee2299e 0cd97d9 31c2447 0cd97d9 a17a010 8ff23b3 ed96cdc 31c2447 ed96cdc 8ff23b3 ed96cdc 31c2447 8ff23b3 ed96cdc 8ff23b3 a17a010 c908f33 0cd97d9 a28879b daaad4e a28879b daaad4e 47dd985 daaad4e 32daccc 0cd97d9 32daccc c908f33 c181b7d 1803348 0cd97d9 c7e573a e23e9c6 a28879b e23e9c6 0cd97d9 024cbba 31c2447 32daccc ed96cdc 47dd985 ed96cdc 32daccc ed96cdc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 | from time import sleep as time_sleep
from asyncio import sleep as asyncio_sleep
from playwright.sync_api import (
Locator,
sync_playwright,
)
from playwright.async_api import (
async_playwright,
Locator as AsyncLocator,
)
from scrapling.core.utils import log
from scrapling.core._types import Optional, ProxyType, Unpack
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig
class DynamicSession(SyncSession, DynamicSessionMixin):
    """A Browser session manager with page pooling."""

    __slots__ = (
        "_config",            # validated config object — presumably built by __validate__ (mixin); confirm there
        "_context_options",   # kwargs forwarded to the browser context
        "_browser_options",   # kwargs forwarded to the browser launcher
        "_user_data_dir",     # profile directory (temporary by default)
        "_headers_keys",
        "max_pages",
        "page_pool",
        "_max_wait_for_page",
        "playwright",
        "context",
    )

    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
        """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # __validate__ comes from DynamicSessionMixin; it is expected to normalize kwargs
        # and populate the option slots above before the base-class setup runs — confirm in mixin.
        self.__validate__(**kwargs)
        super().__init__()

    def start(self) -> None:
        """Create a browser for this instance and context."""
        if not self.playwright:
            self.playwright = sync_playwright().start()
            try:
                if self._config.cdp_url:  # pragma: no cover
                    # Attach to an already-running browser over CDP instead of launching one.
                    self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)

                if not self._config.proxy_rotator and self.browser:
                    # CDP path without proxy rotation: one shared context on the remote browser.
                    self.context = self.browser.new_context(**self._context_options)
                elif self._config.proxy_rotator:
                    # With a proxy rotator, only the browser is launched here — contexts are
                    # presumably created per request with the rotated proxy; see `fetch`.
                    self.browser = self.playwright.chromium.launch(**self._browser_options)
                else:
                    # Default path: a persistent context backed by `_user_data_dir`.
                    persistent_options = (
                        self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
                    )
                    self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)

                if self.context:
                    # Mixin hook: applies session-level setup (cookies, init scripts, ...) — see mixin.
                    self.context = self._initialize_context(self._config, self.context)

                self._is_alive = True
            except Exception:
                # Clean up playwright if browser setup fails
                self.playwright.stop()
                self.playwright = None
                raise
        else:
            raise RuntimeError("Session has been already started")

    def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
        :return: A `Response` object.
        """
        # Popped before validation so a per-request proxy never reaches the validator.
        static_proxy = kwargs.pop("proxy", None)
        params = _validate(kwargs, self, PlaywrightConfig)

        if not self._is_alive:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        # A caller-supplied Referer header takes priority over the Google-search referer.
        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
        referer = (
            "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
        )

        for attempt in range(self._config.retries):
            # Proxy priority: explicit per-request proxy > rotator-supplied proxy > none.
            proxy: Optional[ProxyType] = None
            if self._config.proxy_rotator and static_proxy is None:
                proxy = self._config.proxy_rotator.get_proxy()
            else:
                proxy = static_proxy

            with self._page_generator(
                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
            ) as page_info:
                # One-element list so the response-handler closure can write the final
                # (post-redirect) response back to this scope.
                final_response = [None]
                page = page_info.page
                # Registered before `goto` so no response event is missed.
                page.on("response", self._create_response_handler(page_info, final_response))
                try:
                    first_response = page.goto(url, referer=referer)
                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if not first_response:
                        raise RuntimeError(f"Failed to get response for {url}")

                    if params.page_action:
                        try:
                            _ = params.page_action(page)
                        except Exception as e:  # pragma: no cover
                            # Automation failures are logged, not fatal — best-effort by design.
                            log.error(f"Error executing page_action: {e}")

                    if params.wait_selector:
                        try:
                            waiter: Locator = page.locator(params.wait_selector)
                            waiter.first.wait_for(state=params.wait_selector_state)
                            # The selector appearing may trigger further loading, so re-check stability.
                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                    # Caller-requested grace period before the page is captured and closed.
                    page.wait_for_timeout(params.wait)
                    response = ResponseFactory.from_playwright_response(
                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                    )
                    return response
                except Exception as e:
                    # Flag the page so the pool discards it instead of reusing it.
                    page_info.mark_error()
                    if attempt < self._config.retries - 1:
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                            )
                        else:
                            log.warning(
                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                            )
                        time_sleep(self._config.retry_delay)
                    else:
                        log.error(f"Failed after {self._config.retries} attempts: {e}")
                        raise

        raise RuntimeError("Request failed")  # pragma: no cover
class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
    """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""

    __slots__ = (
        "_config",            # validated config object — presumably built by __validate__ (mixin); confirm there
        "_context_options",   # kwargs forwarded to the browser context
        "_browser_options",   # kwargs forwarded to the browser launcher
        "_user_data_dir",     # profile directory (temporary by default)
        "_headers_keys",
    )

    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # __validate__ comes from DynamicSessionMixin; it is expected to normalize kwargs and
        # populate `_config` (used immediately below for the pool size) — confirm in mixin.
        self.__validate__(**kwargs)
        super().__init__(max_pages=self._config.max_pages)

    async def start(self) -> None:
        """Create a browser for this instance and context."""
        if not self.playwright:
            self.playwright = await async_playwright().start()
            try:
                if self._config.cdp_url:
                    # Attach to an already-running browser over CDP instead of launching one.
                    self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)

                if not self._config.proxy_rotator and self.browser:
                    # CDP path without proxy rotation: one shared context on the remote browser.
                    self.context = await self.browser.new_context(**self._context_options)
                elif self._config.proxy_rotator:
                    # With a proxy rotator, only the browser is launched here — contexts are
                    # presumably created per request with the rotated proxy; see `fetch`.
                    self.browser = await self.playwright.chromium.launch(**self._browser_options)
                else:
                    # Default path: a persistent context backed by `_user_data_dir`.
                    persistent_options = (
                        self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
                    )
                    self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)

                if self.context:
                    # Mixin hook: applies session-level setup (cookies, init scripts, ...) — see mixin.
                    self.context = await self._initialize_context(self._config, self.context)

                self._is_alive = True
            except Exception:
                # Clean up playwright if browser setup fails
                await self.playwright.stop()
                self.playwright = None
                raise
        else:
            raise RuntimeError("Session has been already started")

    async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
        :return: A `Response` object.
        """
        # Popped before validation so a per-request proxy never reaches the validator.
        static_proxy = kwargs.pop("proxy", None)
        params = _validate(kwargs, self, PlaywrightConfig)

        if not self._is_alive:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        # A caller-supplied Referer header takes priority over the Google-search referer.
        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
        referer = (
            "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
        )

        for attempt in range(self._config.retries):
            # Proxy priority: explicit per-request proxy > rotator-supplied proxy > none.
            proxy: Optional[ProxyType] = None
            if self._config.proxy_rotator and static_proxy is None:
                proxy = self._config.proxy_rotator.get_proxy()
            else:
                proxy = static_proxy

            async with self._page_generator(
                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
            ) as page_info:
                # One-element list so the response-handler closure can write the final
                # (post-redirect) response back to this scope.
                final_response = [None]
                page = page_info.page
                # Registered before `goto` so no response event is missed.
                page.on("response", self._create_response_handler(page_info, final_response))
                try:
                    first_response = await page.goto(url, referer=referer)
                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if not first_response:
                        raise RuntimeError(f"Failed to get response for {url}")

                    if params.page_action:
                        try:
                            _ = await params.page_action(page)
                        except Exception as e:  # pragma: no cover
                            # Automation failures are logged, not fatal — best-effort by design.
                            log.error(f"Error executing page_action: {e}")

                    if params.wait_selector:
                        try:
                            waiter: AsyncLocator = page.locator(params.wait_selector)
                            await waiter.first.wait_for(state=params.wait_selector_state)
                            # The selector appearing may trigger further loading, so re-check stability.
                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                    # Caller-requested grace period before the page is captured and closed.
                    await page.wait_for_timeout(params.wait)
                    response = await ResponseFactory.from_async_playwright_response(
                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                    )
                    return response
                except Exception as e:
                    # Flag the page so the pool discards it instead of reusing it.
                    page_info.mark_error()
                    if attempt < self._config.retries - 1:
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                            )
                        else:
                            log.warning(
                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                            )
                        await asyncio_sleep(self._config.retry_delay)
                    else:
                        log.error(f"Failed after {self._config.retries} attempts: {e}")
                        raise

        raise RuntimeError("Request failed")  # pragma: no cover
|