Spaces:
Sleeping
Sleeping
| { | |
| "id": "http_fetch", | |
| "name": "HTTP Fetch", | |
| "version": "1.0.0", | |
| "description": "Fetch a URL and return cleaned text content. Strips HTML tags, scripts, and styles. Useful for agents that need to read web pages, docs, or APIs.", | |
| "author": "Chris4K", | |
| "tags": ["web", "scraping", "http", "fetch", "parsing"], | |
| "dependencies": ["requests"], | |
| "schema": { | |
| "input": { | |
| "url": "str — URL to fetch", | |
| "max_chars": "int — max characters to return (default: 4000)", | |
| "return_raw": "bool — return raw HTML instead of cleaned text (default: false)" | |
| }, | |
| "output": { | |
| "url": "str", | |
| "content": "str", | |
| "status_code": "int", | |
| "content_length": "int", | |
| "truncated": "bool" | |
| } | |
| }, | |
| "code": "import re\nimport requests\nfrom urllib.parse import urlparse\n\n\ndef _strip_html(html: str) -> str:\n \"\"\"Remove tags, scripts, styles; collapse whitespace.\"\"\"\n html = re.sub(r\"<(script|style)[^>]*>.*?</\\1>\", \"\", html, flags=re.DOTALL | re.IGNORECASE)\n html = re.sub(r\"<!--.*?-->\", \"\", html, flags=re.DOTALL)\n html = re.sub(r\"<[^>]+>\", \" \", html)\n html = re.sub(r\" \", \" \", html)\n html = re.sub(r\"&[a-z]{2,6};\", \"\", html)\n html = re.sub(r\"[ \\t]+\", \" \", html)\n html = re.sub(r\"\\n{3,}\", \"\\n\\n\", html)\n return html.strip()\n\n\ndef execute(\n url: str,\n max_chars: int = 4000,\n return_raw: bool = False,\n timeout: int = 10,\n) -> dict:\n \"\"\"Fetch a URL and return its content.\"\"\"\n if not url or not url.strip():\n return {\"error\": \"URL cannot be empty\"}\n\n # Basic URL validation\n parsed = urlparse(url)\n if parsed.scheme not in (\"http\", \"https\"):\n return {\"error\": f\"Only http/https URLs are supported (got '{parsed.scheme}')\"}\n\n headers = {\n \"User-Agent\": \"FORGE-Agent/1.0 Mozilla/5.0\",\n \"Accept\": \"text/html,application/xhtml+xml,text/plain\",\n \"Accept-Language\": \"en-US,en;q=0.5\",\n }\n\n try:\n r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)\n r.raise_for_status()\n except requests.exceptions.Timeout:\n return {\"error\": f\"Timeout after {timeout}s\", \"url\": url}\n except requests.exceptions.ConnectionError as e:\n return {\"error\": f\"Connection error: {e}\", \"url\": url}\n except requests.exceptions.HTTPError as e:\n return {\"error\": f\"HTTP {r.status_code}: {e}\", \"url\": url, \"status_code\": r.status_code}\n except Exception as e:\n return {\"error\": str(e), \"url\": url}\n\n raw = r.text\n content = raw if return_raw else _strip_html(raw)\n truncated = len(content) > max_chars\n content = content[:max_chars] + (\"...\" if truncated else \"\")\n\n return {\n \"url\": r.url,\n \"status_code\": r.status_code,\n \"content\": content,\n \"content_length\": len(r.text),\n \"returned_chars\": len(content),\n \"truncated\": truncated,\n \"content_type\": r.headers.get(\"content-type\", \"\"),\n }\n", | |
| "downloads": 0, | |
| "created_at": 1710000004 | |
| } | |