{
  "id": "http_fetch",
  "name": "HTTP Fetch",
  "version": "1.0.0",
  "description": "Fetch a URL and return cleaned text content. Strips HTML tags, scripts, and styles. Useful for agents that need to read web pages, docs, or APIs.",
  "author": "Chris4K",
  "tags": ["web", "scraping", "http", "fetch", "parsing"],
  "dependencies": ["requests"],
  "schema": {
    "input": {
      "url": "str — URL to fetch",
      "max_chars": "int — max characters to return (default: 4000)",
      "return_raw": "bool — return raw HTML instead of cleaned text (default: false)",
      "timeout": "int — request timeout in seconds (default: 10)"
    },
    "output": {
      "url": "str",
      "content": "str",
      "status_code": "int",
      "content_length": "int",
      "returned_chars": "int",
      "truncated": "bool",
      "content_type": "str"
    }
  },
  "code": "import re\nimport requests\nfrom urllib.parse import urlparse\n\n\ndef _strip_html(html: str) -> str:\n    \"\"\"Remove tags, scripts, styles; collapse whitespace.\"\"\"\n    html = re.sub(r\"<(script|style)[^>]*>.*?</\\1>\", \"\", html, flags=re.DOTALL | re.IGNORECASE)\n    html = re.sub(r\"<!--.*?-->\", \"\", html, flags=re.DOTALL)\n    html = re.sub(r\"<[^>]+>\", \" \", html)\n    html = re.sub(r\"&nbsp;\", \" \", html)\n    html = re.sub(r\"&[a-z]{2,6};\", \"\", html)\n    html = re.sub(r\"[ \\t]+\", \" \", html)\n    html = re.sub(r\"\\n{3,}\", \"\\n\\n\", html)\n    return html.strip()\n\n\ndef execute(\n    url: str,\n    max_chars: int = 4000,\n    return_raw: bool = False,\n    timeout: int = 10,\n) -> dict:\n    \"\"\"Fetch a URL and return its content.\"\"\"\n    if not url or not url.strip():\n        return {\"error\": \"URL cannot be empty\"}\n\n    # Basic URL validation\n    parsed = urlparse(url)\n    if parsed.scheme not in (\"http\", \"https\"):\n        return {\"error\": f\"Only http/https URLs are supported (got '{parsed.scheme}')\"}\n\n    headers = {\n        \"User-Agent\": \"FORGE-Agent/1.0 Mozilla/5.0\",\n        \"Accept\": \"text/html,application/xhtml+xml,text/plain\",\n        \"Accept-Language\": \"en-US,en;q=0.5\",\n    }\n\n    try:\n        r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)\n        r.raise_for_status()\n    except requests.exceptions.Timeout:\n        return {\"error\": f\"Timeout after {timeout}s\", \"url\": url}\n    except requests.exceptions.ConnectionError as e:\n        return {\"error\": f\"Connection error: {e}\", \"url\": url}\n    except requests.exceptions.HTTPError as e:\n        return {\"error\": f\"HTTP {r.status_code}: {e}\", \"url\": url, \"status_code\": r.status_code}\n    except Exception as e:\n        return {\"error\": str(e), \"url\": url}\n\n    raw = r.text\n    content = raw if return_raw else _strip_html(raw)\n    truncated = len(content) > max_chars\n    content = content[:max_chars] + (\"...\" if truncated else \"\")\n\n    return {\n        \"url\": r.url,\n        \"status_code\": r.status_code,\n        \"content\": content,\n        \"content_length\": len(r.text),\n        \"returned_chars\": len(content),\n        \"truncated\": truncated,\n        \"content_type\": r.headers.get(\"content-type\", \"\"),\n    }\n",
  "downloads": 0,
  "created_at": 1710000004
}