from asyncio import gather
from mcp.server.fastmcp import FastMCP
from pydantic import BaseModel, Field
from scrapling.core.shell import Convertor
from scrapling.engines.toolbelt.custom import Response as _ScraplingResponse
from scrapling.engines.static import ImpersonateType
from scrapling.fetchers import (
Fetcher,
FetcherSession,
DynamicFetcher,
AsyncDynamicSession,
StealthyFetcher,
AsyncStealthySession,
)
from scrapling.core._types import (
Optional,
Tuple,
Mapping,
Dict,
List,
Any,
Generator,
Sequence,
SetCookieParam,
extraction_types,
SelectorWaitStates,
)


class ResponseModel(BaseModel):
"""Request's response information structure."""
status: int = Field(description="The status code returned by the website.")
content: list[str] = Field(description="The content as Markdown/HTML or the text content of the page.")
url: str = Field(description="The URL given by the user that resulted in this response.")


def _content_translator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:
    """Collect the extracted content chunks and their source page into a single ResponseModel."""
    return ResponseModel(status=page.status, content=list(content), url=page.url)


def _normalize_credentials(credentials: Optional[Dict[str, str]]) -> Optional[Tuple[str, str]]:
"""Convert a credentials dictionary to a tuple accepted by fetchers."""
if not credentials:
return None
username = credentials.get("username")
password = credentials.get("password")
if username is None or password is None:
raise ValueError("Credentials dictionary must contain both 'username' and 'password' keys")
return username, password
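
# Illustrative behavior of the helper above (the credential values are made up):
#     _normalize_credentials({"username": "user", "password": "pass"})  # -> ("user", "pass")
#     _normalize_credentials(None)                                      # -> None
#     _normalize_credentials({"username": "user"})                      # raises ValueError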


class ScraplingMCPServer:
@staticmethod
def get(
url: str,
impersonate: ImpersonateType = "chrome",
extraction_type: extraction_types = "markdown",
css_selector: Optional[str] = None,
main_content_only: bool = True,
params: Optional[Dict] = None,
headers: Optional[Mapping[str, Optional[str]]] = None,
cookies: Optional[Dict[str, str]] = None,
timeout: Optional[int | float] = 30,
follow_redirects: bool = True,
max_redirects: int = 30,
retries: Optional[int] = 3,
retry_delay: Optional[int] = 1,
proxy: Optional[str] = None,
proxy_auth: Optional[Dict[str, str]] = None,
auth: Optional[Dict[str, str]] = None,
verify: Optional[bool] = True,
http3: Optional[bool] = False,
stealthy_headers: Optional[bool] = True,
) -> ResponseModel:
"""Make GET HTTP request to a URL and return a structured output of the result.
Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
:param url: The URL to request.
        :param impersonate: Browser version whose fingerprint to impersonate. Defaults to the latest Chrome version.
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
- Markdown will convert the page content to Markdown format.
- HTML will return the raw HTML content of the page.
- Text will return the text content of the page.
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
:param params: Query string parameters for the request.
:param headers: Headers to include in the request.
:param cookies: Cookies to use in the request.
:param timeout: Number of seconds to wait before timing out.
:param follow_redirects: Whether to follow redirects. Defaults to True.
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
:param retries: Number of retry attempts. Defaults to 3.
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
:param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
:param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
:param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It can be problematic when used with `impersonate`.
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
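
        Example (a minimal call sketch; the URL and option values are illustrative)::

            ScraplingMCPServer.get(
                "https://example.com",
                extraction_type="text",
                css_selector="article",
            )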
"""
normalized_proxy_auth = _normalize_credentials(proxy_auth)
normalized_auth = _normalize_credentials(auth)
page = Fetcher.get(
url,
auth=normalized_auth,
proxy=proxy,
http3=http3,
verify=verify,
params=params,
proxy_auth=normalized_proxy_auth,
retry_delay=retry_delay,
stealthy_headers=stealthy_headers,
impersonate=impersonate,
headers=headers,
cookies=cookies,
timeout=timeout,
retries=retries,
max_redirects=max_redirects,
follow_redirects=follow_redirects,
)
return _content_translator(
Convertor._extract_content(
page,
css_selector=css_selector,
extraction_type=extraction_type,
main_content_only=main_content_only,
),
page,
)

    @staticmethod
async def bulk_get(
urls: List[str],
impersonate: ImpersonateType = "chrome",
extraction_type: extraction_types = "markdown",
css_selector: Optional[str] = None,
main_content_only: bool = True,
params: Optional[Dict] = None,
headers: Optional[Mapping[str, Optional[str]]] = None,
cookies: Optional[Dict[str, str]] = None,
timeout: Optional[int | float] = 30,
follow_redirects: bool = True,
max_redirects: int = 30,
retries: Optional[int] = 3,
retry_delay: Optional[int] = 1,
proxy: Optional[str] = None,
proxy_auth: Optional[Dict[str, str]] = None,
auth: Optional[Dict[str, str]] = None,
verify: Optional[bool] = True,
http3: Optional[bool] = False,
stealthy_headers: Optional[bool] = True,
) -> List[ResponseModel]:
"""Make GET HTTP request to a group of URLs and for each URL, return a structured output of the result.
Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
:param urls: A list of the URLs to request.
        :param impersonate: Browser version whose fingerprint to impersonate. Defaults to the latest Chrome version.
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
- Markdown will convert the page content to Markdown format.
- HTML will return the raw HTML content of the page.
- Text will return the text content of the page.
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
:param params: Query string parameters for the request.
:param headers: Headers to include in the request.
:param cookies: Cookies to use in the request.
:param timeout: Number of seconds to wait before timing out.
:param follow_redirects: Whether to follow redirects. Defaults to True.
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
:param retries: Number of retry attempts. Defaults to 3.
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
:param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
:param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
:param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It can be problematic when used with `impersonate`.
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
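
        Example (a minimal call sketch; the URLs are illustrative; must be awaited)::

            await ScraplingMCPServer.bulk_get(
                ["https://example.com/a", "https://example.com/b"],
                extraction_type="markdown",
            )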
"""
normalized_proxy_auth = _normalize_credentials(proxy_auth)
normalized_auth = _normalize_credentials(auth)
async with FetcherSession() as session:
tasks: List[Any] = [
session.get(
url,
auth=normalized_auth,
proxy=proxy,
http3=http3,
verify=verify,
params=params,
headers=headers,
cookies=cookies,
timeout=timeout,
retries=retries,
proxy_auth=normalized_proxy_auth,
retry_delay=retry_delay,
impersonate=impersonate,
max_redirects=max_redirects,
follow_redirects=follow_redirects,
stealthy_headers=stealthy_headers,
)
for url in urls
]
responses = await gather(*tasks)
return [
_content_translator(
Convertor._extract_content(
page,
css_selector=css_selector,
extraction_type=extraction_type,
main_content_only=main_content_only,
),
page,
)
for page in responses
]

    @staticmethod
async def fetch(
url: str,
extraction_type: extraction_types = "markdown",
css_selector: Optional[str] = None,
main_content_only: bool = True,
headless: bool = True, # noqa: F821
google_search: bool = True,
real_chrome: bool = False,
wait: int | float = 0,
proxy: Optional[str | Dict[str, str]] = None,
timezone_id: str | None = None,
locale: str | None = None,
extra_headers: Optional[Dict[str, str]] = None,
useragent: Optional[str] = None,
cdp_url: Optional[str] = None,
timeout: int | float = 30000,
disable_resources: bool = False,
wait_selector: Optional[str] = None,
cookies: Sequence[SetCookieParam] | None = None,
network_idle: bool = False,
wait_selector_state: SelectorWaitStates = "attached",
) -> ResponseModel:
"""Use playwright to open a browser to fetch a URL and return a structured output of the result.
Note: This is only suitable for low-mid protection levels.
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
:param url: The URL to request.
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
- Markdown will convert the page content to Markdown format.
- HTML will return the raw HTML content of the page.
- Text will return the text content of the page.
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: A user agent string to use. If omitted, the fetcher generates and uses a real user agent for the same browser.
        :param cookies: Cookies to set for the next request, as a sequence of cookie dictionaries in the format Playwright accepts.
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds used in all operations and waits through the page. Defaults to 30000.
        :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify the user locale, for example, `en-GB`, `de-DE`, etc. The locale affects the navigator.language value, the Accept-Language request header, and number and date formatting rules. Defaults to the system default locale.
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
:param google_search: Enabled by default, Scrapling will set a Google referer header.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
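
        Example (a minimal call sketch; the URL and option values are illustrative; must be awaited)::

            await ScraplingMCPServer.fetch(
                "https://example.com",
                wait_selector="#content",
                network_idle=True,
            )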
"""
page = await DynamicFetcher.async_fetch(
url,
wait=wait,
proxy=proxy,
locale=locale,
timeout=timeout,
cookies=cookies,
cdp_url=cdp_url,
headless=headless,
useragent=useragent,
timezone_id=timezone_id,
real_chrome=real_chrome,
network_idle=network_idle,
wait_selector=wait_selector,
extra_headers=extra_headers,
google_search=google_search,
disable_resources=disable_resources,
wait_selector_state=wait_selector_state,
)
return _content_translator(
Convertor._extract_content(
page,
css_selector=css_selector,
extraction_type=extraction_type,
main_content_only=main_content_only,
),
page,
)

    @staticmethod
async def bulk_fetch(
urls: List[str],
extraction_type: extraction_types = "markdown",
css_selector: Optional[str] = None,
main_content_only: bool = True,
headless: bool = True, # noqa: F821
google_search: bool = True,
real_chrome: bool = False,
wait: int | float = 0,
proxy: Optional[str | Dict[str, str]] = None,
timezone_id: str | None = None,
locale: str | None = None,
extra_headers: Optional[Dict[str, str]] = None,
useragent: Optional[str] = None,
cdp_url: Optional[str] = None,
timeout: int | float = 30000,
disable_resources: bool = False,
wait_selector: Optional[str] = None,
cookies: Sequence[SetCookieParam] | None = None,
network_idle: bool = False,
wait_selector_state: SelectorWaitStates = "attached",
) -> List[ResponseModel]:
"""Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.
Note: This is only suitable for low-mid protection levels.
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
:param urls: A list of the URLs to request.
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
- Markdown will convert the page content to Markdown format.
- HTML will return the raw HTML content of the page.
- Text will return the text content of the page.
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: A user agent string to use. If omitted, the fetcher generates and uses a real user agent for the same browser.
        :param cookies: Cookies to set for the next request, as a sequence of cookie dictionaries in the format Playwright accepts.
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds used in all operations and waits through the page. Defaults to 30000.
        :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify the user locale, for example, `en-GB`, `de-DE`, etc. The locale affects the navigator.language value, the Accept-Language request header, and number and date formatting rules. Defaults to the system default locale.
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
:param google_search: Enabled by default, Scrapling will set a Google referer header.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
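
        Example (a minimal call sketch; the URLs are illustrative; must be awaited)::

            await ScraplingMCPServer.bulk_fetch(
                ["https://example.com/a", "https://example.com/b"],
                disable_resources=True,
            )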
"""
async with AsyncDynamicSession(
wait=wait,
proxy=proxy,
locale=locale,
timeout=timeout,
cookies=cookies,
cdp_url=cdp_url,
headless=headless,
max_pages=len(urls),
useragent=useragent,
timezone_id=timezone_id,
real_chrome=real_chrome,
network_idle=network_idle,
wait_selector=wait_selector,
google_search=google_search,
extra_headers=extra_headers,
disable_resources=disable_resources,
wait_selector_state=wait_selector_state,
) as session:
tasks = [session.fetch(url) for url in urls]
responses = await gather(*tasks)
return [
_content_translator(
Convertor._extract_content(
page,
css_selector=css_selector,
extraction_type=extraction_type,
main_content_only=main_content_only,
),
page,
)
for page in responses
]

    @staticmethod
async def stealthy_fetch(
url: str,
extraction_type: extraction_types = "markdown",
css_selector: Optional[str] = None,
main_content_only: bool = True,
headless: bool = True, # noqa: F821
google_search: bool = True,
real_chrome: bool = False,
wait: int | float = 0,
proxy: Optional[str | Dict[str, str]] = None,
timezone_id: str | None = None,
locale: str | None = None,
extra_headers: Optional[Dict[str, str]] = None,
useragent: Optional[str] = None,
hide_canvas: bool = False,
cdp_url: Optional[str] = None,
timeout: int | float = 30000,
disable_resources: bool = False,
wait_selector: Optional[str] = None,
cookies: Sequence[SetCookieParam] | None = None,
network_idle: bool = False,
wait_selector_state: SelectorWaitStates = "attached",
block_webrtc: bool = False,
allow_webgl: bool = True,
solve_cloudflare: bool = False,
additional_args: Optional[Dict] = None,
) -> ResponseModel:
"""Use the stealthy fetcher to fetch a URL and return a structured output of the result.
Note: This is the only suitable fetcher for high protection levels.
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
:param url: The URL to request.
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
- Markdown will convert the page content to Markdown format.
- HTML will return the raw HTML content of the page.
- Text will return the text content of the page.
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: A user agent string to use. If omitted, the fetcher generates and uses a real user agent for the same browser.
:param cookies: Set cookies for the next request.
        :param solve_cloudflare: Solve all types of Cloudflare's Turnstile/Interstitial challenges before returning the response.
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
        :param timeout: The timeout in milliseconds used in all operations and waits through the page. Defaults to 30000.
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify the user locale, for example, `en-GB`, `de-DE`, etc. The locale affects the navigator.language value, the Accept-Language request header, and number and date formatting rules. Defaults to the system default locale.
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
:param google_search: Enabled by default, Scrapling will set a Google referer header.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_args: Additional arguments passed to Playwright's context as extra settings; they take priority over Scrapling's settings.
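
        Example (a minimal call sketch; the URL and option values are illustrative; must be awaited)::

            await ScraplingMCPServer.stealthy_fetch(
                "https://example.com",
                solve_cloudflare=True,
            )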
"""
page = await StealthyFetcher.async_fetch(
url,
wait=wait,
proxy=proxy,
locale=locale,
cdp_url=cdp_url,
timeout=timeout,
cookies=cookies,
headless=headless,
useragent=useragent,
timezone_id=timezone_id,
real_chrome=real_chrome,
hide_canvas=hide_canvas,
allow_webgl=allow_webgl,
network_idle=network_idle,
block_webrtc=block_webrtc,
wait_selector=wait_selector,
google_search=google_search,
extra_headers=extra_headers,
additional_args=additional_args,
solve_cloudflare=solve_cloudflare,
disable_resources=disable_resources,
wait_selector_state=wait_selector_state,
)
return _content_translator(
Convertor._extract_content(
page,
css_selector=css_selector,
extraction_type=extraction_type,
main_content_only=main_content_only,
),
page,
)

    @staticmethod
async def bulk_stealthy_fetch(
urls: List[str],
extraction_type: extraction_types = "markdown",
css_selector: Optional[str] = None,
main_content_only: bool = True,
headless: bool = True, # noqa: F821
google_search: bool = True,
real_chrome: bool = False,
wait: int | float = 0,
proxy: Optional[str | Dict[str, str]] = None,
timezone_id: str | None = None,
locale: str | None = None,
extra_headers: Optional[Dict[str, str]] = None,
useragent: Optional[str] = None,
hide_canvas: bool = False,
cdp_url: Optional[str] = None,
timeout: int | float = 30000,
disable_resources: bool = False,
wait_selector: Optional[str] = None,
cookies: Sequence[SetCookieParam] | None = None,
network_idle: bool = False,
wait_selector_state: SelectorWaitStates = "attached",
block_webrtc: bool = False,
allow_webgl: bool = True,
solve_cloudflare: bool = False,
additional_args: Optional[Dict] = None,
) -> List[ResponseModel]:
"""Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.
Note: This is the only suitable fetcher for high protection levels.
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
:param urls: A list of the URLs to request.
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
- Markdown will convert the page content to Markdown format.
- HTML will return the raw HTML content of the page.
- Text will return the text content of the page.
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param useragent: A user agent string to use. If omitted, the fetcher generates and uses a real user agent for the same browser.
:param cookies: Set cookies for the next request.
        :param solve_cloudflare: Solve all types of Cloudflare's Turnstile/Interstitial challenges before returning the response.
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
        :param timeout: The timeout in milliseconds used in all operations and waits through the page. Defaults to 30000.
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param locale: Specify the user locale, for example, `en-GB`, `de-DE`, etc. The locale affects the navigator.language value, the Accept-Language request header, and number and date formatting rules. Defaults to the system default locale.
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
:param google_search: Enabled by default, Scrapling will set a Google referer header.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_args: Additional arguments passed to Playwright's context as extra settings; they take priority over Scrapling's settings.
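
        Example (a minimal call sketch; the URLs are illustrative; must be awaited)::

            await ScraplingMCPServer.bulk_stealthy_fetch(
                ["https://example.com/a", "https://example.com/b"],
                block_webrtc=True,
            )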
"""
async with AsyncStealthySession(
wait=wait,
proxy=proxy,
locale=locale,
cdp_url=cdp_url,
timeout=timeout,
cookies=cookies,
headless=headless,
useragent=useragent,
timezone_id=timezone_id,
real_chrome=real_chrome,
hide_canvas=hide_canvas,
allow_webgl=allow_webgl,
network_idle=network_idle,
block_webrtc=block_webrtc,
wait_selector=wait_selector,
google_search=google_search,
extra_headers=extra_headers,
additional_args=additional_args,
solve_cloudflare=solve_cloudflare,
disable_resources=disable_resources,
wait_selector_state=wait_selector_state,
) as session:
tasks = [session.fetch(url) for url in urls]
responses = await gather(*tasks)
return [
_content_translator(
Convertor._extract_content(
page,
css_selector=css_selector,
extraction_type=extraction_type,
main_content_only=main_content_only,
),
page,
)
for page in responses
]

    def serve(self, http: bool, host: str, port: int):
"""Serve the MCP server."""
server = FastMCP(name="Scrapling", host=host, port=port)
server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
server.add_tool(
self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
)
server.add_tool(
self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
)
server.add_tool(
self.bulk_stealthy_fetch,
title="bulk_stealthy_fetch",
description=self.bulk_stealthy_fetch.__doc__,
structured_output=True,
)
server.run(transport="stdio" if not http else "streamable-http")