Spaces:

lenson78
/

Scrapling

Paused

Scrapling / scrapling /engines /_browsers /_controllers.py

Karim shoair

fix: update code and docstrings to remove the old google referer logic

a28879b 26 days ago

22.6 kB

	from time import sleep as time_sleep
	from asyncio import sleep as asyncio_sleep

	from playwright.sync_api import (
	Locator,
	sync_playwright,
	)
	from playwright.async_api import (
	async_playwright,
	Locator as AsyncLocator,
	)

	from scrapling.core.utils import log
	from scrapling.core._types import Optional, ProxyType, Unpack
	from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
	from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
	from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
	from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
	from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig


	class DynamicSession(SyncSession, DynamicSessionMixin):
	    """A synchronous Browser session manager with page pooling.

	    Call `start()` once to create the browser/context, then use `fetch()` for each request.
	    """

	    __slots__ = (
	        "_config",
	        "_context_options",
	        "_browser_options",
	        "_user_data_dir",
	        "_headers_keys",
	        "max_pages",
	        "page_pool",
	        "_max_wait_for_page",
	        "playwright",
	        "context",
	    )

	    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
	        """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.

	        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
	        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
	            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
	        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
	        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
	        :param cookies: Set cookies for the next request.
	        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
	        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
	        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
	        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
	        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
	        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
	        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
	            rules. Defaults to the system default locale.
	        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
	        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
	        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
	        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
	        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
	        :param google_search: Enabled by default, Scrapling will set a Google referer header.
	        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `Referer` header, the Google referer from `google_search` is not applied._
	        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
	        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
	        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
	        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
	        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
	        """
	        # Validation runs first; it comes from the mixin and presumably populates `_config`
	        # and the option dictionaries used by `start()` -- confirm against DynamicSessionMixin.
	        self.__validate__(**kwargs)
	        super().__init__()

	    def start(self) -> None:
	        """Create a browser for this instance and context.

	        One of three setups is used, in this order of precedence:

	        * ``cdp_url`` is set: connect to that browser over CDP, and create a context right away
	          only when no proxy rotator is configured.
	        * a proxy rotator is configured: launch a plain browser without a session-wide context.
	        * otherwise: launch a persistent context using the configured user data directory.

	        :raises RuntimeError: If the session has already been started.
	        """
	        if self.playwright:
	            raise RuntimeError("Session has been already started")

	        self.playwright = sync_playwright().start()
	        try:
	            if self._config.cdp_url:  # pragma: no cover
	                self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
	                if not self._config.proxy_rotator and self.browser:
	                    self.context = self.browser.new_context(**self._context_options)
	            elif self._config.proxy_rotator:
	                # No context is created here; NOTE(review): presumably one is created per proxy
	                # by the page generator -- confirm against the base session.
	                self.browser = self.playwright.chromium.launch(**self._browser_options)
	            else:
	                # Later dictionaries win on key collisions: context options override browser options.
	                persistent_options = (
	                    self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
	                )
	                self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)

	            if self.context:
	                self.context = self._initialize_context(self._config, self.context)

	            self._is_alive = True
	        except Exception:
	            # Clean up playwright if browser setup fails, and reset the attribute so `start()` can be retried
	            self.playwright.stop()
	            self.playwright = None
	            raise

	    def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
	        """Open the page in the browser and perform the request based on the chosen options.

	        The request is attempted up to the configured number of retries. Errors raised by
	        `page_action` or while waiting for `wait_selector` are logged and do not fail the request.

	        :param url: The Target url.
	        :param google_search: Enabled by default, Scrapling will set a Google referer header.
	        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
	        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
	        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
	        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `Referer` header, the Google referer from `google_search` is not applied._
	        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
	            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
	        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
	        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
	        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
	        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
	        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
	        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
	        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
	        :return: A `Response` object.
	        :raises RuntimeError: If the session is not alive, or (after the last attempt) if navigation returned no response.
	        """
	        # `proxy` is handled here rather than by the validator, so remove it before validation.
	        static_proxy = kwargs.pop("proxy", None)

	        params = _validate(kwargs, self, PlaywrightConfig)
	        if not self._is_alive:  # pragma: no cover
	            raise RuntimeError("Context manager has been closed")

	        # A caller-supplied Referer header (any letter case) wins over the Google referer.
	        request_headers_keys = {h.lower() for h in params.extra_headers} if params.extra_headers else set()
	        referer = (
	            "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
	        )

	        for attempt in range(self._config.retries):
	            # A static proxy passed to this call overrides the rotator; otherwise rotate per attempt.
	            proxy: Optional[ProxyType]
	            if self._config.proxy_rotator and static_proxy is None:
	                proxy = self._config.proxy_rotator.get_proxy()
	            else:
	                proxy = static_proxy

	            with self._page_generator(
	                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
	            ) as page_info:
	                # Single-item list so the response handler can hand back the last response it saw.
	                final_response = [None]
	                page = page_info.page
	                page.on("response", self._create_response_handler(page_info, final_response))

	                try:
	                    first_response = page.goto(url, referer=referer)
	                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)

	                    if not first_response:
	                        raise RuntimeError(f"Failed to get response for {url}")

	                    if params.page_action:
	                        try:
	                            _ = params.page_action(page)
	                        except Exception as e:  # pragma: no cover
	                            # Best effort: a failing automation step must not discard the fetched page.
	                            log.error(f"Error executing page_action: {e}")

	                    if params.wait_selector:
	                        try:
	                            waiter: Locator = page.locator(params.wait_selector)
	                            waiter.first.wait_for(state=params.wait_selector_state)
	                            # The selector showing up may have triggered more loading, so wait again.
	                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)
	                        except Exception as e:  # pragma: no cover
	                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

	                    page.wait_for_timeout(params.wait)

	                    response = ResponseFactory.from_playwright_response(
	                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
	                    )
	                    return response

	                except Exception as e:
	                    page_info.mark_error()
	                    if attempt < self._config.retries - 1:
	                        if is_proxy_error(e):
	                            log.warning(
	                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
	                            )
	                        else:
	                            log.warning(
	                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
	                            )
	                        time_sleep(self._config.retry_delay)
	                    else:
	                        # Last attempt: surface the original exception to the caller.
	                        log.error(f"Failed after {self._config.retries} attempts: {e}")
	                        raise

	        # Only reachable when the retry loop never ran or never returned/raised.
	        raise RuntimeError("Request failed")  # pragma: no cover


	class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
	    """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""

	    __slots__ = (
	        "_config",
	        "_context_options",
	        "_browser_options",
	        "_user_data_dir",
	        "_headers_keys",
	    )

	    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
	        """A Browser session manager with page pooling

	        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
	        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
	            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
	        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
	        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
	        :param cookies: Set cookies for the next request.
	        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
	        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
	        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
	        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
	        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
	        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
	        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
	        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
	            rules. Defaults to the system default locale.
	        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
	        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
	        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
	        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
	        :param google_search: Enabled by default, Scrapling will set a Google referer header.
	        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `Referer` header, the Google referer from `google_search` is not applied._
	        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
	        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
	        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
	        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
	        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
	        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
	        """
	        # Validation must run before the base initializer because `max_pages` is read from `_config`.
	        self.__validate__(**kwargs)
	        super().__init__(max_pages=self._config.max_pages)

	    async def start(self) -> None:
	        """Create a browser for this instance and context.

	        One of three setups is used, in this order of precedence:

	        * ``cdp_url`` is set: connect to that browser over CDP, and create a context right away
	          only when no proxy rotator is configured.
	        * a proxy rotator is configured: launch a plain browser without a session-wide context.
	        * otherwise: launch a persistent context using the configured user data directory.

	        :raises RuntimeError: If the session has already been started.
	        """
	        if self.playwright:
	            raise RuntimeError("Session has been already started")

	        self.playwright = await async_playwright().start()
	        try:
	            if self._config.cdp_url:
	                self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
	                if not self._config.proxy_rotator and self.browser:
	                    self.context = await self.browser.new_context(**self._context_options)
	            elif self._config.proxy_rotator:
	                # No context is created here; NOTE(review): presumably one is created per proxy
	                # by the page generator -- confirm against the base session.
	                self.browser = await self.playwright.chromium.launch(**self._browser_options)
	            else:
	                # Later dictionaries win on key collisions: context options override browser options.
	                persistent_options = (
	                    self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
	                )
	                self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)

	            if self.context:
	                self.context = await self._initialize_context(self._config, self.context)

	            self._is_alive = True
	        except Exception:
	            # Clean up playwright if browser setup fails, and reset the attribute so `start()` can be retried
	            await self.playwright.stop()
	            self.playwright = None
	            raise

	    async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
	        """Open the page in the browser and perform the request based on the chosen options.

	        The request is attempted up to the configured number of retries. Errors raised by
	        `page_action` or while waiting for `wait_selector` are logged and do not fail the request.

	        :param url: The Target url.
	        :param google_search: Enabled by default, Scrapling will set a Google referer header.
	        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
	        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
	        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need. It is awaited, so it must return an awaitable.
	        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `Referer` header, the Google referer from `google_search` is not applied._
	        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
	            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
	        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
	        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
	        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
	        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
	        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
	        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
	        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
	        :return: A `Response` object.
	        :raises RuntimeError: If the session is not alive, or (after the last attempt) if navigation returned no response.
	        """
	        # `proxy` is handled here rather than by the validator, so remove it before validation.
	        static_proxy = kwargs.pop("proxy", None)

	        params = _validate(kwargs, self, PlaywrightConfig)

	        if not self._is_alive:  # pragma: no cover
	            raise RuntimeError("Context manager has been closed")

	        # A caller-supplied Referer header (any letter case) wins over the Google referer.
	        request_headers_keys = {h.lower() for h in params.extra_headers} if params.extra_headers else set()
	        referer = (
	            "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
	        )

	        for attempt in range(self._config.retries):
	            # A static proxy passed to this call overrides the rotator; otherwise rotate per attempt.
	            proxy: Optional[ProxyType]
	            if self._config.proxy_rotator and static_proxy is None:
	                proxy = self._config.proxy_rotator.get_proxy()
	            else:
	                proxy = static_proxy

	            async with self._page_generator(
	                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
	            ) as page_info:
	                # Single-item list so the response handler can hand back the last response it saw.
	                final_response = [None]
	                page = page_info.page
	                page.on("response", self._create_response_handler(page_info, final_response))

	                try:
	                    first_response = await page.goto(url, referer=referer)
	                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

	                    if not first_response:
	                        raise RuntimeError(f"Failed to get response for {url}")

	                    if params.page_action:
	                        try:
	                            _ = await params.page_action(page)
	                        except Exception as e:  # pragma: no cover
	                            # Best effort: a failing automation step must not discard the fetched page.
	                            log.error(f"Error executing page_action: {e}")

	                    if params.wait_selector:
	                        try:
	                            waiter: AsyncLocator = page.locator(params.wait_selector)
	                            await waiter.first.wait_for(state=params.wait_selector_state)
	                            # The selector showing up may have triggered more loading, so wait again.
	                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
	                        except Exception as e:  # pragma: no cover
	                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

	                    await page.wait_for_timeout(params.wait)

	                    response = await ResponseFactory.from_async_playwright_response(
	                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
	                    )
	                    return response

	                except Exception as e:
	                    page_info.mark_error()
	                    if attempt < self._config.retries - 1:
	                        if is_proxy_error(e):
	                            log.warning(
	                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
	                            )
	                        else:
	                            log.warning(
	                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
	                            )
	                        # Non-blocking pause so other tasks on the event loop keep running.
	                        await asyncio_sleep(self._config.retry_delay)
	                    else:
	                        # Last attempt: surface the original exception to the caller.
	                        log.error(f"Failed after {self._config.retries} attempts: {e}")
	                        raise

	        # Only reachable when the retry loop never ran or never returned/raised.
	        raise RuntimeError("Request failed")  # pragma: no cover