self-trained2 / agent /tools /web_tools.py
DeepImagix's picture
Upload web_tools.py
d656581 verified
Raw
History Blame Contribute Delete
28.3 kB
"""
NeuraPrompt Agent β€” Web Tools v8.2 (Reliability + Per-Site Handlers)
====================================================================
Fixes over v8.1:
* _safe_request now accepts any 2xx status (DDG returns 202 with valid
content β€” previously discarded, breaking both DDG engines).
* DDG Lite engine replaced with DDG HTML endpoint (html.duckduckgo.com/html/),
which returns 200 with parseable result__a / result__snippet selectors.
DDG Lite's HTML structure had changed and the old selectors found nothing.
* BeautifulSoup parser falls back to html.parser if lxml is missing β€” works
regardless of environment (per spec).
* Per-site User-Agent strategy: Wikipedia etc. block generic browser UAs
from datacenter IPs (403). Bot UA with contact info returns 200.
* New SUPPORTED_SITES registry with per-site fetch handlers:
- Wikipedia β†’ REST API (reliable, no scraping)
- GitHub β†’ raw.githubusercontent.com (no JS, no auth needed)
* fetch_url now has a 4-step fallback chain: site handler β†’ bot UA β†’
browser UA β†’ web.archive.org cache. Previously gave up after one attempt.
Public API unchanged:
web_search(query) -> str
fetch_url(url) -> str
"""
import requests
import random
import time
import re
from urllib.parse import quote_plus, unquote, urlparse, parse_qs
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging
log = logging.getLogger("agent.tools.web.v8.2")
# ─────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────
# Timeouts passed as `timeout=` to session.get(): TIMEOUT_SEARCH by the search
# engines (some add a few seconds on top), TIMEOUT_FETCH by the page fetchers.
TIMEOUT_SEARCH = 8
TIMEOUT_FETCH = 15
# Retry policy knobs consumed by _build_session() (Retry total / backoff_factor).
MAX_RETRIES = 1
BACKOFF_FACTOR = 0.3
# Upper bound on the number of results taken from any single search engine.
MAX_RESULTS = 6
# Pool of browser User-Agent strings; callers pick one with random.choice().
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/125.0.0.0",
]
# Bot UA - used for sites that block generic browser UAs from datacenter IPs.
# Format follows Wikipedia's policy: <name>/<version> (<contact>)
BOT_USER_AGENT = "NeuraPromptAgent/1.0 (https://neuraprompt.ai; contact@neuraprompt.ai)"
# Sites that block generic browser UAs -> must use bot UA.
# Matched against the URL's netloc (case-insensitive substring), so an entry
# such as "wikimedia.org" already covers "meta.wikimedia.org" as well.
BOT_UA_SITES = (
    "wikipedia.org",
    "wikimedia.org",
    "wiktionary.org",
    "wikiquote.org",
    "wikibooks.org",
    "wikisource.org",
    "wikinews.org",
    "wikiversity.org",
    "mediawiki.org",
    "meta.wikimedia.org",
    "gov.uk",
    "nature.com",
    "sciencedirect.com",
    "springer.com",
)
# Base URLs of SearXNG instances; _search_searxng() shuffles a copy of this
# list and queries at most three of them per search.
SEARXNG_INSTANCES = [
    "https://searx.be",
    "https://search.ononoki.org",
    "https://searx.tiekoetter.com",
    "https://searx.prvcy.eu",
    "https://search.sapti.me",
    "https://darmarit.org/searx",
    "https://searxng.site",
]
# ─────────────────────────────────────────────────────────────
# SUPPORTED SITES (per-site fetch handlers — "supported web URLs")
# ─────────────────────────────────────────────────────────────
# Each handler: (url, session) -> str (page text) or None to fall through
# Add new sites here as needed. Handlers are tried BEFORE the generic fetcher.
def _fetch_wikipedia(url: str, session: requests.Session) -> str | None:
    """Fetch a Wikipedia article through the REST API instead of scraping.

    Maps ``https://<lang>.wikipedia.org/wiki/<Title>`` to
    ``https://<lang>.wikipedia.org/api/rest_v1/page/html/<Title>``.

    Args:
        url: Article URL as supplied by the caller.
        session: Session used to issue the HTTP request.

    Returns:
        A text report (title, URL, source marker, then at most 12000
        characters of page text), or None when the URL is not a ``/wiki/``
        article on a Wikipedia host, the request fails, or fewer than 100
        characters of text come back. None lets the caller fall through to
        the generic fetcher.
    """
    # Local import: the module header only imports quote_plus, which encodes
    # spaces as "+" (correct for query strings, wrong inside a URL path).
    from urllib.parse import quote

    parsed = urlparse(url)
    # Use the lower-cased hostname (no port / userinfo) and require a real
    # wikipedia.org host rather than any netloc that merely contains the text.
    host = (parsed.hostname or "").lower()
    if host != "wikipedia.org" and not host.endswith(".wikipedia.org"):
        return None

    match = re.match(r"^/wiki/(.+)$", parsed.path)
    if not match:
        return None
    title = unquote(match.group(1))

    # Language edition = first label in front of "wikipedia.org". Hosts without
    # a language label (wikipedia.org, www.wikipedia.org, m.wikipedia.org)
    # default to English instead of producing e.g. "www.wikipedia.org/api/...".
    subdomain = host[: -len("wikipedia.org")].rstrip(".")
    lang = subdomain.split(".")[0] if subdomain else ""
    if lang in ("", "www", "m"):
        lang = "en"

    # Page titles use underscores for spaces; everything else (including "/")
    # is percent-encoded so it stays a single path segment.
    encoded_title = quote(title.replace(" ", "_"), safe="")
    api_url = f"https://{lang}.wikipedia.org/api/rest_v1/page/html/{encoded_title}"
    resp = _safe_request(
        session, api_url,
        headers={"User-Agent": BOT_USER_AGENT, "Accept": "text/html"},
        timeout=TIMEOUT_FETCH,
        allow_redirects=True,
    )
    if not resp:
        return None

    soup = _make_soup(resp.text)
    # Drop non-content elements before extracting the text.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()
    text = soup.get_text(" ", strip=True)
    if len(text) < 100:
        return None

    title_tag = soup.find("title")
    page_title = title_tag.get_text(strip=True) if title_tag else title
    return f"Title: {page_title}\nURL: {url}\nSource: Wikipedia REST API\n{'='*60}\n\n{text[:12000]}"
def _fetch_github(url: str, session: requests.Session) -> str | None:
    """Fetch GitHub content, preferring the plain-text raw endpoint.

    ``github.com/{user}/{repo}/blob/{ref}/{path}`` URLs are rewritten to
    ``raw.githubusercontent.com/{user}/{repo}/{ref}/{path}``; any other
    GitHub or raw.githubusercontent.com URL is fetched as given.

    Args:
        url: URL supplied by the caller.
        session: Session used to issue the HTTP request.

    Returns:
        For non-HTML responses, a report with the fetched URL, content type
        and at most 12000 characters of the body. For HTML responses, the
        tag-stripped page text (at most 12000 characters). None when the
        host is not GitHub, the request fails, or too little text comes
        back, so the caller can fall through to the generic fetcher.
    """
    # Compare against the lower-cased hostname so "GitHub.com" and
    # "www.github.com" are recognised and look-alike hosts are not.
    host = (urlparse(url).hostname or "").lower()
    on_github = host == "github.com" or host.endswith(".github.com")
    if not on_github and host != "raw.githubusercontent.com":
        return None

    raw_url = url
    # Scheme and host are matched case-insensitively, with an optional "www.";
    # the path part stays case-sensitive because GitHub paths are.
    blob = re.match(
        r"^(?i:https?://(?:www\.)?github\.com)/([^/]+)/([^/]+)/blob/(.+)$",
        url,
    )
    if blob:
        owner, repo, ref_and_path = blob.groups()
        raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{ref_and_path}"

    resp = _safe_request(
        session, raw_url,
        headers={"User-Agent": BOT_USER_AGENT},
        timeout=TIMEOUT_FETCH,
        allow_redirects=True,
    )
    if not resp:
        return None

    content_type = resp.headers.get("content-type", "").lower()
    text = resp.text
    if len(text) < 10:
        return None

    # For raw files, the response IS the file content (no HTML wrapper).
    if "text/html" not in content_type:
        return f"URL: {raw_url}\nContent-Type: {content_type}\n{'='*60}\n\n{text[:12000]}"

    # HTML response (e.g. a repository page): strip tags and keep the text.
    soup = _make_soup(text)
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()
    body = soup.get_text(" ", strip=True)
    if len(body) < 100:
        return None
    return f"URL: {url}\n{'='*60}\n\n{body[:12000]}"
# Registry: netloc substring -> handler. Tried in order; first match wins
# (fetch_url stops at the first pattern contained in the URL's netloc).
# NOTE(review): _fetch_wikipedia returns None for any host that is not a
# wikipedia.org host, so the "wikimedia.org" entry always falls through to
# the generic fetcher - confirm whether a dedicated handler was intended.
SUPPORTED_SITES = [
    ("wikipedia.org", _fetch_wikipedia),
    ("wikimedia.org", _fetch_wikipedia),
    ("github.com", _fetch_github),
    ("raw.githubusercontent.com", _fetch_github),
]
# ─────────────────────────────────────────────────────────────
# SESSION BUILDER
# ─────────────────────────────────────────────────────────────
def _build_session() -> requests.Session:
    """Return a new requests session whose HTTP(S) adapters retry automatically.

    Retries apply to HEAD/GET/OPTIONS requests answered with 429 or a 5xx
    status from the force-list; once retries are exhausted the last response
    is returned instead of raising (``raise_on_status=False``).
    """
    retry_policy = Retry(
        total=MAX_RETRIES,
        backoff_factor=BACKOFF_FACTOR,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
        raise_on_status=False,
    )
    shared_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=10,
        pool_maxsize=10,
    )

    session = requests.Session()
    # One adapter instance serves both schemes.
    for scheme in ("https://", "http://"):
        session.mount(scheme, shared_adapter)
    return session
# ─────────────────────────────────────────────────────────────
# PARSER HELPER (lxml with html.parser fallback — portability)
# ─────────────────────────────────────────────────────────────
def _make_soup(html: str) -> BeautifulSoup:
    """Build a BeautifulSoup tree, preferring lxml over the stdlib parser.

    If constructing the tree with ``lxml`` raises for any reason (parser not
    installed, parse failure), the markup is parsed again with
    ``html.parser``, which ships with Python.
    """
    preferred, fallback = "lxml", "html.parser"
    try:
        soup = BeautifulSoup(html, preferred)
    except Exception:
        soup = BeautifulSoup(html, fallback)
    return soup
# ─────────────────────────────────────────────────────────────
# HEADERS (per-site UA strategy — fixes Wikipedia 403)
# ─────────────────────────────────────────────────────────────
def _ua_for_url(url: str) -> str:
    """Choose the User-Agent string to send to *url*.

    Returns BOT_USER_AGENT when the lower-cased netloc contains any entry of
    BOT_UA_SITES; otherwise (including when the URL cannot be parsed) a
    random entry from USER_AGENTS.
    """
    try:
        host = urlparse(url).netloc.lower()
    except Exception:
        host = None  # unparseable URL: treat like an ordinary site

    if host is not None and any(marker in host for marker in BOT_UA_SITES):
        return BOT_USER_AGENT
    return random.choice(USER_AGENTS)
def _headers(url: str = "", referer: str = "") -> dict:
    """Build browser-like request headers for *url*.

    Args:
        url: Target URL; when given, the User-Agent is chosen per site via
            _ua_for_url(), otherwise a random browser UA is used.
        referer: Optional Referer value. When set it is sent as the
            ``Referer`` header and ``Sec-Fetch-Site`` becomes ``same-origin``
            instead of ``none``.

    Returns:
        A new dict of header names to values.
    """
    # Local import: urllib3 is already a dependency of this module (Retry).
    from urllib3.util import make_headers

    # Only advertise encodings the installed HTTP stack can actually decode.
    # A hard-coded "br" makes servers answer with Brotli, which is left
    # undecoded (garbled resp.text) when no brotli package is installed.
    advertised = make_headers(accept_encoding=True)["accept-encoding"]
    accept_encoding = ", ".join(part.strip() for part in advertised.split(","))

    ua = _ua_for_url(url) if url else random.choice(USER_AGENTS)
    headers = {
        "User-Agent": ua,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": accept_encoding,
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none" if not referer else "same-origin",
        "Sec-Fetch-User": "?1",
        "Cache-Control": "max-age=0",
    }
    if referer:
        headers["Referer"] = referer
    return headers
# ─────────────────────────────────────────────────────────────
# SAFE REQUEST (FIXED — accepts any 2xx, not just 200)
# ─────────────────────────────────────────────────────────────
def _safe_request(session: requests.Session, url: str, **kwargs) -> requests.Response | None:
    """GET *url* through *session* without ever raising.

    Extra keyword arguments are forwarded to ``session.get``. The response is
    returned for any 2xx status (200-299). Every other status and every
    exception is logged as a warning and reported to the caller as None.
    """
    try:
        started = time.time()
        response = session.get(url, **kwargs)
        took = time.time() - started
        status = response.status_code
        log.debug(f"[{status}] {url[:80]}... ({took:.2f}s)")

        if 200 <= status < 300:
            return response
        if status in (403, 429):
            log.warning(f"Blocked [{status}]: {url[:80]}")
        else:
            log.warning(f"HTTP {status}: {url[:80]}")
    except Exception as exc:
        errors = requests.exceptions
        # Most specific classes first: SSLError and ProxyError are
        # ConnectionError subclasses, and everything from requests derives
        # from RequestException.
        if isinstance(exc, errors.SSLError):
            log.warning(f"SSL error for {url[:60]}: {exc}")
        elif isinstance(exc, errors.ProxyError):
            log.warning(f"Proxy error for {url[:60]}: {exc}")
        elif isinstance(exc, errors.ConnectionError):
            log.warning(f"Connection failed for {url[:60]}: {exc}")
        elif isinstance(exc, errors.Timeout):
            log.warning(f"Timeout for {url[:60]}")
        elif isinstance(exc, errors.TooManyRedirects):
            log.warning(f"Redirect loop for {url[:60]}")
        elif isinstance(exc, errors.RequestException):
            log.warning(f"Request failed for {url[:60]}: {exc}")
        else:
            log.warning(f"Unexpected error for {url[:60]}: {exc}")
    return None
# ─────────────────────────────────────────────────────────────
# SEARCH ENGINES
# ─────────────────────────────────────────────────────────────
def _ddg_decode_href(href: str) -> str:
"""DDG HTML endpoint wraps external URLs as /l/?uddg=<encoded>.
Decode them back to the original URL."""
if not href:
return ""
if href.startswith("/"):
href = f"https://html.duckduckgo.com{href}"
if "uddg=" in href:
try:
qs = parse_qs(urlparse(href).query)
return unquote(qs.get("uddg", [href])[0])
except Exception:
return href
return href
def _search_ddg_html(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 1: scrape results from the DuckDuckGo HTML endpoint.

    Reads the first MAX_RESULTS ``a.result__a`` anchors of
    ``html.duckduckgo.com/html/``, unwraps their redirect links and pairs
    each with the ``.result__snippet`` text of its enclosing result block.

    Returns:
        A list of ``{"title", "url", "snippet", "engine"}`` dicts, or None
        when the request fails or no usable result is found.
    """
    endpoint = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
    response = _safe_request(
        session, endpoint,
        headers=_headers(endpoint, referer="https://duckduckgo.com/"),
        timeout=TIMEOUT_SEARCH,
        allow_redirects=True,
    )
    if not response:
        return None

    ad_marker = "duckduckgo.com/y.js"  # DDG wraps ads as /y.js?ad_domain=...
    hits = []
    for anchor in _make_soup(response.text).select("a.result__a")[:MAX_RESULTS]:
        title = anchor.get_text(strip=True)
        wrapped = anchor.get("href", "")
        target = _ddg_decode_href(wrapped)

        if not (title and target):
            continue
        if ad_marker in target or ad_marker in wrapped:
            continue
        if not target.startswith(("http://", "https://")):
            continue

        # The snippet lives elsewhere inside the same result wrapper.
        container = anchor.find_parent("div", class_="result") or anchor.find_parent("div")
        snippet_el = container.select_one(".result__snippet") if container else None
        snippet = snippet_el.get_text(strip=True) if snippet_el else ""

        hits.append({
            "title": title,
            "url": target,
            "snippet": snippet,
            "engine": "ddg-html",
        })
    return hits or None
def _search_ddg_api(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 2: DuckDuckGo Instant Answer API (JSON, no scraping).

    Builds one result from the abstract/answer (when it comes with a URL)
    and up to ``MAX_RESULTS - 1`` results from ``RelatedTopics``.

    Returns:
        A list of ``{"title", "url", "snippet", "engine"}`` dicts, or None
        when the request fails, the body is not a JSON object, or nothing
        usable is found.
    """
    url = f"https://api.duckduckgo.com/?q={quote_plus(query)}&format=json&no_html=1&skip_disambig=1"
    response = _safe_request(
        session, url,
        headers=_headers(url),
        timeout=TIMEOUT_SEARCH,
    )
    if not response:
        return None
    try:
        data = response.json()
    except ValueError:
        return None
    if not isinstance(data, dict):
        return None

    def _text(value) -> str:
        # Fields are not guaranteed to be strings; anything that is not one
        # is treated as empty instead of crashing on .strip().
        return value.strip() if isinstance(value, str) else ""

    results = []
    abstract = _text(data.get("AbstractText")) or _text(data.get("Answer"))
    abstract_url = _text(data.get("AbstractURL"))
    if abstract and abstract_url:
        results.append({
            # Fall back when the heading is missing OR present but empty.
            "title": _text(data.get("Heading")) or "Quick Answer",
            "url": abstract_url,
            "snippet": abstract,
            "engine": "ddg-api",
        })

    related = data.get("RelatedTopics")
    if not isinstance(related, list):
        related = []
    for topic in related[:MAX_RESULTS - 1]:
        if not isinstance(topic, dict):
            continue
        # Entries without both Text and FirstURL (e.g. nested topic groups)
        # are skipped by the check below.
        text = _text(topic.get("Text"))
        first_url = _text(topic.get("FirstURL"))
        if text and first_url:
            results.append({
                "title": text.split(" - ")[0] if " - " in text else text[:60],
                "url": first_url,
                "snippet": text,
                "engine": "ddg-api",
            })
    return results if results else None
def _search_bing(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 3: scrape organic results from Bing's HTML search page.

    Looks at the first MAX_RESULTS ``li.b_algo`` items, taking the first
    anchor as title/link and the first matching snippet selector as the
    description.

    Returns:
        A list of ``{"title", "url", "snippet", "engine"}`` dicts, or None
        when the request fails or no usable result is found.
    """
    search_url = f"https://www.bing.com/search?q={quote_plus(query)}"
    response = _safe_request(
        session, search_url,
        headers=_headers(search_url, referer="https://www.bing.com/"),
        timeout=TIMEOUT_SEARCH + 3,
        allow_redirects=True,
    )
    if not response:
        return None

    # Tried in this order; the first selector that matches supplies the snippet.
    snippet_selectors = ("p", ".b_caption p", "div.b_attribution+div", ".b_snippet")

    hits = []
    for item in _make_soup(response.text).select("li.b_algo")[:MAX_RESULTS]:
        link = item.select_one("a")
        if not link:
            continue

        title = link.get_text(strip=True)
        href = link.get("href", "")

        matched = next(
            (el for el in (item.select_one(sel) for sel in snippet_selectors) if el),
            None,
        )
        snippet = matched.get_text(strip=True) if matched else ""

        if not (title and href and href.startswith("http")):
            continue
        hits.append({
            "title": title,
            "url": href,
            "snippet": snippet,
            "engine": "bing",
        })
    return hits or None
def _search_searxng(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 4: query SearXNG instances' JSON API (last-resort fallback).

    Shuffles a copy of SEARXNG_INSTANCES and asks up to three of them in
    turn; the first instance that yields at least one result wins.

    Returns:
        A list of ``{"title", "url", "snippet", "engine"}`` dicts from a
        single instance, or None if every attempted instance failed or
        returned nothing usable.
    """
    candidates = list(SEARXNG_INSTANCES)
    random.shuffle(candidates)

    for base in candidates[:3]:
        url = f"{base}/search?q={quote_plus(query)}&format=json&language=en"
        response = _safe_request(
            session, url,
            headers=_headers(url, referer=base),
            timeout=TIMEOUT_SEARCH + 5,
        )
        if not response:
            continue
        try:
            payload = response.json()
        except Exception:
            continue  # not JSON: move on to the next instance

        hits = []
        for entry in payload.get("results", [])[:MAX_RESULTS]:
            title, link = ((entry.get(key) or "").strip() for key in ("title", "url"))
            snippet = (entry.get("content") or entry.get("snippet") or "").strip()
            if not (title and link):
                continue
            hits.append({
                "title": title,
                "url": link,
                "snippet": snippet,
                # Label with the first host label of the instance, e.g. "searxng-searx".
                "engine": f"searxng-{base.split('//')[1].split('.')[0]}",
            })
        if hits:
            return hits
    return None
# ─────────────────────────────────────────────────────────────
# FETCH URL HELPERS (for fetch_url fallback chain)
# ─────────────────────────────────────────────────────────────
def _fetch_direct(url: str, session: requests.Session, ua: str) -> str | None:
    """Fetch *url* with a specific User-Agent and extract readable text.

    Args:
        url: Page to fetch.
        session: Session used to issue the HTTP request.
        ua: User-Agent value that overrides the one chosen by _headers().

    Returns:
        * For HTML: an optional ``Title:`` line, the URL, and at most 10000
          characters of extracted text.
        * For ``text/plain`` / ``application/json``: the URL, content type
          and at most 10000 characters of the body as received.
        * For any other content type: a short ``[Non-HTML content: ...]``
          preview.
        * None when the request fails or no text could be extracted.
    """
    resp = _safe_request(
        session, url,
        headers={**_headers(url), "User-Agent": ua},
        timeout=TIMEOUT_FETCH,
        allow_redirects=True,
    )
    if not resp:
        return None

    content_type = resp.headers.get("content-type", "").lower()
    is_html = "text/html" in content_type
    is_plain_text = "text/plain" in content_type or "application/json" in content_type

    if not (is_html or is_plain_text):
        # Non-text content: return a short preview.
        preview = resp.text[:4000] if resp.text else "[binary content]"
        return f"[Non-HTML content: {content_type}]\n\n{preview}"

    if not is_html:
        # Plain text / JSON has no markup to mine. Running it through the
        # HTML extraction below can find no tags at all and report the page
        # as unfetchable, so hand the body back directly.
        body = resp.text.strip()
        if not body:
            return None
        return f"URL: {url}\nContent-Type: {content_type}\n{'='*60}\n\n{body[:10000]}"

    soup = _make_soup(resp.text)

    # Strip noise elements. Calling soup([...]) matches TAG NAMES only ...
    for tag in soup(["script", "style", "nav", "header", "footer", "aside",
                     "form", "iframe", "noscript", "svg", "canvas",
                     "advertisement"]):
        tag.decompose()
    # ... so class-based noise needs CSS selectors to match anything.
    for element in soup.select(".ad, .ads, .cookie-banner"):
        element.decompose()

    # Prefer well-known main-content containers with a meaningful amount of text.
    content_blocks = []
    for selector in ["article", "main", "[role='main']", ".content", ".post", ".entry"]:
        for el in soup.select(selector):
            text = el.get_text(" ", strip=True)
            if len(text) > 300:
                content_blocks.append(text)

    # Otherwise fall back to individual headings, paragraphs, list items, cells.
    if not content_blocks:
        for tag in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "td"]):
            text = tag.get_text(" ", strip=True)
            if len(text) > 30:
                content_blocks.append(text)

    # De-duplicate blocks by their first 100 characters (case-insensitive),
    # keeping first occurrences in document order.
    seen = set()
    final_blocks = []
    for block in content_blocks:
        sig = block[:100].lower()
        if sig not in seen:
            seen.add(sig)
            final_blocks.append(block)

    text = "\n\n".join(final_blocks)
    if not text:
        return None

    title = ""
    title_tag = soup.find("title")
    if title_tag:
        title = title_tag.get_text(strip=True)
    header = (f"Title: {title}\nURL: {url}\n{'='*60}\n\n"
              if title else f"URL: {url}\n{'='*60}\n\n")
    return (header + text[:10000]).strip()
def _fetch_archive_org(url: str, session: requests.Session) -> str | None:
    """Last-resort fallback: read *url* from a Wayback Machine snapshot.

    Asks ``archive.org/wayback/available`` for the closest snapshot and, if
    one is reported as available, fetches it with _fetch_direct() using the
    bot User-Agent.

    Returns:
        The extracted snapshot text, or None when the lookup fails, the reply
        is not JSON, or no available snapshot is listed.
    """
    lookup_url = f"https://archive.org/wayback/available?url={quote_plus(url)}"
    resp = _safe_request(
        session, lookup_url,
        headers={"User-Agent": BOT_USER_AGENT},
        timeout=TIMEOUT_FETCH,
    )
    if not resp:
        return None

    try:
        payload = resp.json()
    except Exception:
        return None

    # Missing or null levels collapse to {} so the lookups below stay safe.
    closest = (payload.get("archived_snapshots") or {}).get("closest") or {}
    snapshot_url = closest.get("url")
    if snapshot_url and closest.get("available"):
        return _fetch_direct(snapshot_url, session, BOT_USER_AGENT)
    return None
# ─────────────────────────────────────────────────────────────
# PUBLIC API
# ─────────────────────────────────────────────────────────────
def web_search(query: str) -> str:
    """Search the web using several no-key engines with automatic fallback.

    Engines are tried in priority order - DDG HTML, DDG API, Bing, SearXNG -
    and the first one that returns results wins.

    Args:
        query: Search terms; surrounding whitespace is ignored.

    Returns:
        A formatted result list (title, optional snippet, link per entry)
        followed by a footer naming the engine, or a human-readable error
        message when the query is empty or every engine failed. Never raises
        for engine failures.
    """
    if not query or not query.strip():
        return "Error: search query cannot be empty."
    query = query.strip()

    engines = [
        ("DuckDuckGo HTML", _search_ddg_html),
        ("DuckDuckGo API", _search_ddg_api),
        ("Bing", _search_bing),
        ("SearXNG", _search_searxng),
    ]
    all_errors = []

    # The context manager closes the session (and its pooled connections) on
    # every exit path instead of leaking one pool per call.
    with _build_session() as session:
        for name, engine_func in engines:
            try:
                log.info(f"Trying {name} for: {query[:50]}...")
                results = engine_func(session, query)
                if results:
                    formatted = []
                    for r in results:
                        line = f"• {r['title']}"
                        if r.get("snippet"):
                            line += f"\n{r['snippet']}"
                        line += f"\n🔗 {r['url']}"
                        formatted.append(line)
                    footer = f"\n\n[Results via {name} | {len(results)} found]"
                    return "\n\n".join(formatted) + footer
            except Exception as e:
                # Boundary: one broken engine must not stop the fallback chain.
                log.error(f"Critical error in {name}: {e}")
                all_errors.append(f"{name}: {str(e)}")
                continue

    # All engines failed
    error_detail = " | ".join(all_errors) if all_errors else "All engines returned no results."
    return (
        f"Search failed for: '{query}'\n"
        f"All fallback engines exhausted.\n"
        f"Details: {error_detail}\n"
        f"Tip: Check your internet connection or try again later."
    )
def fetch_url(url: str) -> str:
"""
Fetch a webpage with per-site handlers and a 4-step fallback chain:
1. Site-specific handler (Wikipedia REST API, GitHub raw, etc.)
2. Direct fetch with bot UA (for sites that block browser UAs)
3. Direct fetch with browser UA (for sites that block bot UAs)
4. web.archive.org cached snapshot (last resort)
"""
if not url or not url.strip():
return "Error: URL cannot be empty."
url = url.strip()
if not url.startswith(("http://", "https://")):
return "Error: URL must start with http:// or https://"
session = _build_session()
# ---- Step 1: Try site-specific handlers (SUPPORTED_SITES) ----
try:
netloc = urlparse(url).netloc.lower()
except Exception:
netloc = ""
for site_pattern, handler in SUPPORTED_SITES:
if site_pattern in netloc:
try:
log.info(f"Trying site handler for '{site_pattern}': {url[:80]}")
result = handler(url, session)
if result:
return result
except Exception as e:
log.warning(f"Site handler {site_pattern} failed: {e}")
# Fall through to generic fetch
break # only try the first matching handler
# ---- Step 2: Direct fetch with bot UA ----
log.info(f"Trying direct fetch (bot UA): {url[:80]}")
result = _fetch_direct(url, session, BOT_USER_AGENT)
if result:
return result
# ---- Step 3: Direct fetch with browser UA ----
log.info(f"Trying direct fetch (browser UA): {url[:80]}")
browser_ua = random.choice(USER_AGENTS)
result = _fetch_direct(url, session, browser_ua)
if result:
return result
# ---- Step 4: web.archive.org cache ----
log.info(f"Trying archive.org cache: {url[:80]}")
result = _fetch_archive_org(url, session)
if result:
return f"[Fetched via archive.org cache]\n\n{result}"
# All fallbacks exhausted
return (
f"Could not fetch URL: {url}\n"
"All fetch strategies failed (site handler, bot UA, browser UA, archive.org).\n"
"The site may require JavaScript, require authentication, or be offline."
)
# ─────────────────────────────────────────────────────────────
# TEST / DEBUG
# ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: one search plus one fetch per site handler.
    import logging
    logging.basicConfig(level=logging.INFO)

    rule = "=" * 60
    # (function name, callable, argument, max characters to print or None for all)
    demos = [
        ("web_search", web_search, "Python programming language", None),
        ("fetch_url", fetch_url, "https://en.wikipedia.org/wiki/Python_(programming_language)", 1500),
        ("fetch_url", fetch_url, "https://github.com/python/cpython/blob/main/README.rst", 1500),
    ]
    for index, (label, func, arg, limit) in enumerate(demos):
        # Every banner after the first is preceded by a blank line.
        print(rule if index == 0 else "\n" + rule)
        print(f"TEST: {label}('{arg}')")
        print(rule)
        print(func(arg)[:limit])