Spaces:

Chris4K
/

agent-forge-bak

Sleeping

App Files Files Community

agent-forge-bak / http_fetch.json

Chris4K

Upload 14 files

988c7cc verified 6 days ago

raw

history blame contribute delete

3.17 kB

	{
	"id": "http_fetch",
	"name": "HTTP Fetch",
	"version": "1.0.0",
	"description": "Fetch a URL and return cleaned text content. Strips HTML tags, scripts, and styles. Useful for agents that need to read web pages, docs, or APIs.",
	"author": "Chris4K",
	"tags": ["web", "scraping", "http", "fetch", "parsing"],
	"dependencies": ["requests"],
	"schema": {
	"input": {
	"url": "str — URL to fetch",
	"max_chars": "int — max characters to return (default: 4000)",
	"return_raw": "bool — return raw HTML instead of cleaned text (default: false)"
	},
	"output": {
	"url": "str",
	"content": "str",
	"status_code": "int",
	"content_length": "int",
	"truncated": "bool"
	}
	},
	"code": "import re\nimport requests\nfrom urllib.parse import urlparse\n\n\ndef _strip_html(html: str) -> str:\n \"\"\"Remove tags, scripts, styles; collapse whitespace.\"\"\"\n html = re.sub(r\"<(script\|style)[^>]>.?</\\1>\", \"\", html, flags=re.DOTALL \| re.IGNORECASE)\n html = re.sub(r\"<!--.*?-->\", \"\", html, flags=re.DOTALL)\n html = re.sub(r\"<[^>]+>\", \" \", html)\n html = re.sub(r\" \", \" \", html)\n html = re.sub(r\"&[a-z]{2,6};\", \"\", html)\n html = re.sub(r\"[ \\t]+\", \" \", html)\n html = re.sub(r\"\\n{3,}\", \"\\n\\n\", html)\n return html.strip()\n\n\ndef execute(\n url: str,\n max_chars: int = 4000,\n return_raw: bool = False,\n timeout: int = 10,\n) -> dict:\n \"\"\"Fetch a URL and return its content.\"\"\"\n if not url or not url.strip():\n return {\"error\": \"URL cannot be empty\"}\n\n # Basic URL validation\n parsed = urlparse(url)\n if parsed.scheme not in (\"http\", \"https\"):\n return {\"error\": f\"Only http/https URLs are supported (got '{parsed.scheme}')\"}\n\n headers = {\n \"User-Agent\": \"FORGE-Agent/1.0 Mozilla/5.0\",\n \"Accept\": \"text/html,application/xhtml+xml,text/plain\",\n \"Accept-Language\": \"en-US,en;q=0.5\",\n }\n\n try:\n r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)\n r.raise_for_status()\n except requests.exceptions.Timeout:\n return {\"error\": f\"Timeout after {timeout}s\", \"url\": url}\n except requests.exceptions.ConnectionError as e:\n return {\"error\": f\"Connection error: {e}\", \"url\": url}\n except requests.exceptions.HTTPError as e:\n return {\"error\": f\"HTTP {r.status_code}: {e}\", \"url\": url, \"status_code\": r.status_code}\n except Exception as e:\n return {\"error\": str(e), \"url\": url}\n\n raw = r.text\n content = raw if return_raw else _strip_html(raw)\n truncated = len(content) > max_chars\n content = content[:max_chars] + (\"...\" if truncated else \"\")\n\n return {\n \"url\": r.url,\n \"status_code\": r.status_code,\n \"content\": content,\n \"content_length\": len(r.text),\n \"returned_chars\": len(content),\n \"truncated\": truncated,\n \"content_type\": r.headers.get(\"content-type\", \"\"),\n }\n",
	"downloads": 0,
	"created_at": 1710000004
	}