Spaces:
Running
Running
| from pathlib import Path | |
| from subprocess import check_output | |
| from sys import executable as python_executable | |
| from scrapling.core.utils import log | |
| from scrapling.engines.toolbelt.custom import Response | |
| from scrapling.core.utils._shell import _CookieParser, _ParseHeaders | |
| from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable | |
| from orjson import loads as json_loads, JSONDecodeError | |
| try: | |
| from click import command, option, Choice, group, argument | |
| except (ImportError, ModuleNotFoundError) as e: | |
| raise ModuleNotFoundError( | |
| "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation" | |
| ) from e | |
# Shared help text for the `output_file` argument of the extract subcommands.
__OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
# Directory of this package; used to locate the `.scrapling_dependencies_installed`
# sentinel file that `install` checks/creates.
__PACKAGE_DIR__ = Path(__file__).parent
def __Execute(cmd: List[str], help_line: str) -> None:  # pragma: no cover
    """Announce and run an installation command as a subprocess.

    Raises ``subprocess.CalledProcessError`` if the command exits non-zero.
    """
    print(f"Installing {help_line}...")
    check_output(cmd, shell=False)  # nosec B603
def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Parse a JSON string into a Python object.

    :param json_string: Raw JSON text; ``None`` or empty returns ``None``.
    :return: The decoded object, or ``None`` when no input was given.
    :raises ValueError: If the string is not valid JSON.
    """
    if not json_string:
        return None
    try:
        return json_loads(json_string)
    except JSONDecodeError as err:  # pragma: no cover
        # Chain the decode error so the traceback keeps the original cause.
        raise ValueError(f"Invalid JSON data '{json_string}': {err}") from err
def __Request_and_Save(
    fetcher_func: Callable[..., Response],
    url: str,
    output_file: str,
    css_selector: Optional[str] = None,
    **kwargs,
) -> None:
    """Fetch `url` with `fetcher_func` and persist the content to `output_file`."""
    from scrapling.core.shell import Convertor

    # Resolve relative output paths against the current working directory.
    target = Path(output_file)
    if not target.is_absolute():
        target = Path.cwd() / output_file

    response = fetcher_func(url, **kwargs)
    Convertor.write_content_to_file(response, str(target), css_selector)
    log.info(f"Content successfully saved to '{target}'")
def __ParseExtractArguments(
    headers: List[str], cookies: str, params: List[str], json: Optional[str] = None
) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, str]]]:
    """Parse CLI arguments for the extract commands.

    :param headers: Raw "Key: Value" header strings.
    :param cookies: Cookie-header style string (e.g. "k1=v1; k2=v2").
    :param params: Raw "key=value" query-string parameter strings.
    :param json: Optional raw JSON request body.
    :return: (headers, cookies, params, json) parsed into dicts.
    :raises ValueError: If the cookie string or JSON body cannot be parsed.
    """
    parsed_headers, parsed_cookies = _ParseHeaders(headers)
    if cookies:
        # The parsing call itself is what can fail, so it must be inside the
        # `try` (previously only the dict assignment was guarded, which cannot
        # raise, leaving the "Could not parse cookies" path unreachable).
        try:
            for key, value in _CookieParser(cookies):
                parsed_cookies[key] = value
        except Exception as err:
            raise ValueError(f"Could not parse cookies '{cookies}': {err}") from err

    parsed_json = __ParseJSONData(json)
    parsed_params = {}
    for param in params:
        if "=" in param:
            key, value = param.split("=", 1)
            parsed_params[key] = value

    return parsed_headers, parsed_cookies, parsed_params, parsed_json
def __BuildRequest(headers: List[str], cookies: str, params: List[str], json: Optional[str] = None, **kwargs) -> Dict:
    """Build the keyword-argument dict for a fetcher call from raw CLI values.

    :param headers: Raw "Key: Value" header strings.
    :param cookies: Cookie-header style string.
    :param params: Raw "key=value" query-string parameter strings
        (annotation fixed: this is iterated as a list, not a single string).
    :param json: Optional raw JSON request body.
    :param kwargs: Extra fetcher options passed through unchanged.
    :return: Merged request kwargs ready to be splatted into a fetcher call.
    """
    # Parse the raw CLI values into dictionaries.
    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)

    # Build request arguments; empty header/cookie dicts become None so the
    # fetcher falls back to its defaults.
    request_kwargs: Dict[str, Any] = {
        "headers": parsed_headers if parsed_headers else None,
        "cookies": parsed_cookies if parsed_cookies else None,
    }
    if parsed_json:
        request_kwargs["json"] = parsed_json
    if parsed_params:
        request_kwargs["params"] = parsed_params
    if "proxy" in kwargs:
        request_kwargs["proxy"] = kwargs.pop("proxy")

    # A comma-separated impersonate value means "choose among these browsers".
    if "impersonate" in kwargs and "," in (kwargs.get("impersonate") or ""):
        kwargs["impersonate"] = [browser.strip() for browser in kwargs["impersonate"].split(",")]

    return {**request_kwargs, **kwargs}
def install(force):  # pragma: no cover
    # Sentinel file marking a previously completed installation.
    marker = __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed")
    if not force and marker.exists():
        print("The dependencies are already installed")
        return

    __Execute(
        [python_executable, "-m", "playwright", "install", "chromium"],
        "Playwright browsers",
    )
    __Execute(
        [python_executable, "-m", "playwright", "install-deps", "chromium"],
        "Playwright dependencies",
    )

    from tld.utils import update_tld_names

    update_tld_names(fail_silently=True)
    # Only create the marker if none of the commands above raised.
    marker.touch()
def mcp(http, host, port):
    # Deferred import keeps CLI startup fast when the MCP server is unused.
    from scrapling.core.ai import ScraplingMCPServer

    ScraplingMCPServer().serve(http, host, port)
def shell(code, level):
    # Deferred import keeps CLI startup fast when the shell is unused.
    from scrapling.core.shell import CustomShell

    CustomShell(code=code, log_level=level).start()
def extract():
    """Extract content from web pages and save to files"""
    # Group entry point only; subcommands (get/post/put/delete/fetch/...) do the work.
    return None
def get(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a GET request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headers: HTTP headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use. (Format: "http://username:password@localhost:8030")
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    from scrapling.fetchers import Fetcher

    request_kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        None,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
    )
    __Request_and_Save(Fetcher.get, url, output_file, css_selector, **request_kwargs)
def post(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a POST request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param data: Form data to include in the request body. (as string, ex: "param1=value1&param2=value2")
    :param json: A JSON serializable object to include in the body of the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    from scrapling.fetchers import Fetcher

    request_kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        json,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
        data=data,
    )
    __Request_and_Save(Fetcher.post, url, output_file, css_selector, **request_kwargs)
def put(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a PUT request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param data: Form data to include in the request body.
    :param json: A JSON serializable object to include in the body of the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    from scrapling.fetchers import Fetcher

    request_kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        json,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
        data=data,
    )
    __Request_and_Save(Fetcher.put, url, output_file, css_selector, **request_kwargs)
def delete(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a DELETE request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    from scrapling.fetchers import Fetcher

    request_kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        None,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
    )
    __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **request_kwargs)
def fetch(
    url,
    output_file,
    headless,
    disable_resources,
    network_idle,
    timeout,
    wait,
    css_selector,
    wait_selector,
    locale,
    real_chrome,
    proxy,
    extra_headers,
):
    """
    Opens up a browser and fetch content using DynamicFetcher.

    :param url: Target url.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headless: Run the browser in headless/hidden or headful/visible mode.
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
    :param css_selector: CSS selector to extract specific content.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param locale: Set the locale for the browser.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param proxy: The proxy to be used with requests.
    :param extra_headers: Extra headers to add to the request.
    """
    # Headers come in as raw "Key: Value" strings; cookies are ignored here.
    parsed_headers, _ = _ParseHeaders(extra_headers, False)

    # Mandatory browser options first, then the optional ones only when set.
    request_kwargs = dict(
        headless=headless,
        disable_resources=disable_resources,
        network_idle=network_idle,
        timeout=timeout,
        locale=locale,
        real_chrome=real_chrome,
    )
    if wait > 0:
        request_kwargs["wait"] = wait
    if wait_selector:
        request_kwargs["wait_selector"] = wait_selector
    if proxy:
        request_kwargs["proxy"] = proxy
    if parsed_headers:
        request_kwargs["extra_headers"] = parsed_headers

    from scrapling.fetchers import DynamicFetcher

    __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **request_kwargs)
def stealthy_fetch(
    url,
    output_file,
    headless,
    disable_resources,
    block_webrtc,
    solve_cloudflare,
    allow_webgl,
    network_idle,
    real_chrome,
    hide_canvas,
    timeout,
    wait,
    css_selector,
    wait_selector,
    proxy,
    extra_headers,
):
    """
    Opens up a browser with advanced stealth features and fetch content using StealthyFetcher.

    :param url: Target url.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headless: Run the browser in headless/hidden, or headful/visible mode.
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param block_webrtc: Blocks WebRTC entirely.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
    :param allow_webgl: Allow WebGL (recommended to keep enabled).
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
    :param css_selector: CSS selector to extract specific content.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param proxy: The proxy to be used with requests.
    :param extra_headers: Extra headers to add to the request.
    """
    # Headers come in as raw "Key: Value" strings; cookies are ignored here.
    parsed_headers, _ = _ParseHeaders(extra_headers, False)

    # Mandatory stealth options first, then the optional ones only when set.
    request_kwargs = dict(
        headless=headless,
        disable_resources=disable_resources,
        block_webrtc=block_webrtc,
        solve_cloudflare=solve_cloudflare,
        allow_webgl=allow_webgl,
        network_idle=network_idle,
        real_chrome=real_chrome,
        hide_canvas=hide_canvas,
        timeout=timeout,
    )
    if wait > 0:
        request_kwargs["wait"] = wait
    if wait_selector:
        request_kwargs["wait_selector"] = wait_selector
    if proxy:
        request_kwargs["proxy"] = proxy
    if parsed_headers:
        request_kwargs["extra_headers"] = parsed_headers

    from scrapling.fetchers import StealthyFetcher

    __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **request_kwargs)
def main():
    # Root CLI entry point; subcommands are attached below via `add_command`.
    return None
# Adding commands
# NOTE(review): `add_command` implies `main` (and these functions) are click
# groups/commands — presumably the click decorators were stripped from this
# view of the file; confirm against the full source.
main.add_command(install)
main.add_command(shell)
main.add_command(extract)
main.add_command(mcp)