| from asyncio import gather |
|
|
| from mcp.server.fastmcp import FastMCP |
| from pydantic import BaseModel, Field |
|
|
| from scrapling.core.shell import Convertor |
| from scrapling.engines.toolbelt.custom import Response as _ScraplingResponse |
| from scrapling.engines.static import ImpersonateType |
| from scrapling.fetchers import ( |
| Fetcher, |
| FetcherSession, |
| DynamicFetcher, |
| AsyncDynamicSession, |
| StealthyFetcher, |
| AsyncStealthySession, |
| ) |
| from scrapling.core._types import ( |
| Optional, |
| Tuple, |
| Mapping, |
| Dict, |
| List, |
| Any, |
| Generator, |
| Sequence, |
| SetCookieParam, |
| extraction_types, |
| SelectorWaitStates, |
| ) |
|
|
|
|
class ResponseModel(BaseModel):
    """Request's response information structure."""

    # Status code reported for the fetched page.
    status: int = Field(
        description="The status code returned by the website.",
    )
    # Extracted page content, stored as a list of strings.
    content: list[str] = Field(
        description="The content as Markdown/HTML or the text content of the page.",
    )
    # URL associated with this response.
    url: str = Field(
        description="The URL given by the user that resulted in this response.",
    )
|
|
|
|
def _content_translator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:
    """Build a single ResponseModel from extracted content and the page it came from.

    :param content: Generator yielding the extracted content pieces; it is fully consumed here.
    :param page: The fetched response whose `status` and `url` are copied into the model.
    :return: A ResponseModel holding the page status, the collected content pieces, and the page URL.
    """
    # `list(...)` drains the generator directly; no per-item comprehension is needed for a plain copy.
    return ResponseModel(status=page.status, content=list(content), url=page.url)
|
|
|
|
| def _normalize_credentials(credentials: Optional[Dict[str, str]]) -> Optional[Tuple[str, str]]: |
| """Convert a credentials dictionary to a tuple accepted by fetchers.""" |
| if not credentials: |
| return None |
|
|
| username = credentials.get("username") |
| password = credentials.get("password") |
|
|
| if username is None or password is None: |
| raise ValueError("Credentials dictionary must contain both 'username' and 'password' keys") |
|
|
| return username, password |
|
|
|
|
class ScraplingMCPServer:
    """Expose Scrapling's fetchers as MCP tools.

    Every public static method is registered as a tool by `serve`, and its docstring is sent
    to the client as the tool description. Each tool fetches one or more pages and returns the
    extracted content wrapped in `ResponseModel` objects.
    """

    @staticmethod
    def _build_response(
        page: _ScraplingResponse,
        extraction_type: extraction_types,
        css_selector: Optional[str],
        main_content_only: bool,
    ) -> ResponseModel:
        """Extract the requested content from a fetched page and wrap it in a ResponseModel.

        Shared by all tools so the extraction call is written in exactly one place.

        :param page: The response object returned by one of the fetchers.
        :param extraction_type: The extraction type forwarded to the convertor.
        :param css_selector: Optional CSS selector forwarded to the convertor.
        :param main_content_only: Forwarded to the convertor unchanged.
        :return: The structured result for `page`.
        """
        extracted = Convertor._extract_content(
            page,
            css_selector=css_selector,
            extraction_type=extraction_type,
            main_content_only=main_content_only,
        )
        return _content_translator(extracted, page)

    @staticmethod
    def get(
        url: str,
        impersonate: ImpersonateType = "chrome",
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        params: Optional[Dict] = None,
        headers: Optional[Mapping[str, Optional[str]]] = None,
        cookies: Optional[Dict[str, str]] = None,
        timeout: Optional[int | float] = 30,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Dict[str, str]] = None,
        auth: Optional[Dict[str, str]] = None,
        verify: Optional[bool] = True,
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
    ) -> ResponseModel:
        """Make GET HTTP request to a URL and return a structured output of the result.
        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param params: Query string parameters for the request.
        :param headers: Headers to include in the request.
        :param cookies: Cookies to use in the request.
        :param timeout: Number of seconds to wait before timing out.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            Cannot be used together with the `proxies` parameter.
        :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
        :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
        :param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
        """
        # Tools receive credentials as dictionaries; the fetcher is given (username, password) tuples.
        normalized_proxy_auth = _normalize_credentials(proxy_auth)
        normalized_auth = _normalize_credentials(auth)

        page = Fetcher.get(
            url,
            auth=normalized_auth,
            proxy=proxy,
            http3=http3,
            verify=verify,
            params=params,
            proxy_auth=normalized_proxy_auth,
            retry_delay=retry_delay,
            stealthy_headers=stealthy_headers,
            impersonate=impersonate,
            headers=headers,
            cookies=cookies,
            timeout=timeout,
            retries=retries,
            max_redirects=max_redirects,
            follow_redirects=follow_redirects,
        )
        return ScraplingMCPServer._build_response(page, extraction_type, css_selector, main_content_only)

    @staticmethod
    async def bulk_get(
        urls: List[str],
        impersonate: ImpersonateType = "chrome",
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        params: Optional[Dict] = None,
        headers: Optional[Mapping[str, Optional[str]]] = None,
        cookies: Optional[Dict[str, str]] = None,
        timeout: Optional[int | float] = 30,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Dict[str, str]] = None,
        auth: Optional[Dict[str, str]] = None,
        verify: Optional[bool] = True,
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
    ) -> List[ResponseModel]:
        """Make GET HTTP request to a group of URLs and for each URL, return a structured output of the result.
        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A list of the URLs to request.
        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param params: Query string parameters for the request.
        :param headers: Headers to include in the request.
        :param cookies: Cookies to use in the request.
        :param timeout: Number of seconds to wait before timing out.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            Cannot be used together with the `proxies` parameter.
        :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
        :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
        :param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
        """
        # Credentials are validated first so malformed input is still reported for an empty URL list.
        normalized_proxy_auth = _normalize_credentials(proxy_auth)
        normalized_auth = _normalize_credentials(auth)

        # Nothing to request: skip opening a session altogether.
        if not urls:
            return []

        async with FetcherSession() as session:
            tasks: List[Any] = [
                session.get(
                    url,
                    auth=normalized_auth,
                    proxy=proxy,
                    http3=http3,
                    verify=verify,
                    params=params,
                    headers=headers,
                    cookies=cookies,
                    timeout=timeout,
                    retries=retries,
                    proxy_auth=normalized_proxy_auth,
                    retry_delay=retry_delay,
                    impersonate=impersonate,
                    max_redirects=max_redirects,
                    follow_redirects=follow_redirects,
                    stealthy_headers=stealthy_headers,
                )
                for url in urls
            ]
            # All requests are awaited together; results come back in the same order as `urls`.
            responses = await gather(*tasks)
            return [
                ScraplingMCPServer._build_response(page, extraction_type, css_selector, main_content_only)
                for page in responses
            ]

    @staticmethod
    async def fetch(
        url: str,
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = True,
        google_search: bool = True,
        real_chrome: bool = False,
        wait: int | float = 0,
        proxy: Optional[str | Dict[str, str]] = None,
        timezone_id: str | None = None,
        locale: str | None = None,
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Sequence[SetCookieParam] | None = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
    ) -> ResponseModel:
        """Use playwright to open a browser to fetch a URL and return a structured output of the result.
        Note: This is only suitable for low-mid protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        """
        page = await DynamicFetcher.async_fetch(
            url,
            wait=wait,
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            cookies=cookies,
            cdp_url=cdp_url,
            headless=headless,
            useragent=useragent,
            timezone_id=timezone_id,
            real_chrome=real_chrome,
            network_idle=network_idle,
            wait_selector=wait_selector,
            extra_headers=extra_headers,
            google_search=google_search,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        )
        return ScraplingMCPServer._build_response(page, extraction_type, css_selector, main_content_only)

    @staticmethod
    async def bulk_fetch(
        urls: List[str],
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = True,
        google_search: bool = True,
        real_chrome: bool = False,
        wait: int | float = 0,
        proxy: Optional[str | Dict[str, str]] = None,
        timezone_id: str | None = None,
        locale: str | None = None,
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Sequence[SetCookieParam] | None = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
    ) -> List[ResponseModel]:
        """Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.
        Note: This is only suitable for low-mid protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A list of the URLs to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        """
        # Nothing to fetch: avoid starting a browser session sized for zero pages.
        if not urls:
            return []

        async with AsyncDynamicSession(
            wait=wait,
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            cookies=cookies,
            cdp_url=cdp_url,
            headless=headless,
            # One page slot per URL so every URL can be fetched at the same time.
            max_pages=len(urls),
            useragent=useragent,
            timezone_id=timezone_id,
            real_chrome=real_chrome,
            network_idle=network_idle,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        ) as session:
            tasks = [session.fetch(url) for url in urls]
            responses = await gather(*tasks)
            return [
                ScraplingMCPServer._build_response(page, extraction_type, css_selector, main_content_only)
                for page in responses
            ]

    @staticmethod
    async def stealthy_fetch(
        url: str,
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = True,
        google_search: bool = True,
        real_chrome: bool = False,
        wait: int | float = 0,
        proxy: Optional[str | Dict[str, str]] = None,
        timezone_id: str | None = None,
        locale: str | None = None,
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        hide_canvas: bool = False,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Sequence[SetCookieParam] | None = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        solve_cloudflare: bool = False,
        additional_args: Optional[Dict] = None,
    ) -> ResponseModel:
        """Use the stealthy fetcher to fetch a URL and return a structured output of the result.
        Note: This is the only suitable fetcher for high protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        page = await StealthyFetcher.async_fetch(
            url,
            wait=wait,
            proxy=proxy,
            locale=locale,
            cdp_url=cdp_url,
            timeout=timeout,
            cookies=cookies,
            headless=headless,
            useragent=useragent,
            timezone_id=timezone_id,
            real_chrome=real_chrome,
            hide_canvas=hide_canvas,
            allow_webgl=allow_webgl,
            network_idle=network_idle,
            block_webrtc=block_webrtc,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            additional_args=additional_args,
            solve_cloudflare=solve_cloudflare,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        )
        return ScraplingMCPServer._build_response(page, extraction_type, css_selector, main_content_only)

    @staticmethod
    async def bulk_stealthy_fetch(
        urls: List[str],
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = True,
        google_search: bool = True,
        real_chrome: bool = False,
        wait: int | float = 0,
        proxy: Optional[str | Dict[str, str]] = None,
        timezone_id: str | None = None,
        locale: str | None = None,
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        hide_canvas: bool = False,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Sequence[SetCookieParam] | None = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        solve_cloudflare: bool = False,
        additional_args: Optional[Dict] = None,
    ) -> List[ResponseModel]:
        """Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.
        Note: This is the only suitable fetcher for high protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A list of the URLs to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set a Google referer header.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # Nothing to fetch: avoid starting a browser session for no work.
        if not urls:
            return []

        async with AsyncStealthySession(
            wait=wait,
            proxy=proxy,
            locale=locale,
            cdp_url=cdp_url,
            timeout=timeout,
            cookies=cookies,
            headless=headless,
            useragent=useragent,
            timezone_id=timezone_id,
            real_chrome=real_chrome,
            hide_canvas=hide_canvas,
            allow_webgl=allow_webgl,
            network_idle=network_idle,
            block_webrtc=block_webrtc,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            additional_args=additional_args,
            solve_cloudflare=solve_cloudflare,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        ) as session:
            tasks = [session.fetch(url) for url in urls]
            responses = await gather(*tasks)
            return [
                ScraplingMCPServer._build_response(page, extraction_type, css_selector, main_content_only)
                for page in responses
            ]

    def serve(self, http: bool, host: str, port: int) -> None:
        """Serve the MCP server.

        Registers the six public tools on a FastMCP server named "Scrapling" and runs it.

        :param http: When True, run with the "streamable-http" transport; otherwise use "stdio".
        :param host: Host passed to the FastMCP server.
        :param port: Port passed to the FastMCP server.
        """
        server = FastMCP(name="Scrapling", host=host, port=port)
        tools = (
            self.get,
            self.bulk_get,
            self.fetch,
            self.bulk_fetch,
            self.stealthy_fetch,
            self.bulk_stealthy_fetch,
        )
        for tool in tools:
            # Each tool's title is its function name and its description is its docstring.
            server.add_tool(tool, title=tool.__name__, description=tool.__doc__, structured_output=True)
        server.run(transport="streamable-http" if http else "stdio")
|
|