Spaces:
Running
Running
| """ | |
NeuraPrompt Agent — Web Tools v8.2 (Reliability + Per-Site Handlers)
====================================================================
Fixes over v8.1:
  * _safe_request now accepts any 2xx status (DDG returns 202 with valid
    content — previously discarded, breaking both DDG engines).
  * DDG Lite engine replaced with DDG HTML endpoint (html.duckduckgo.com/html/),
    which returns 200 with parseable result__a / result__snippet selectors.
    DDG Lite's HTML structure had changed and the old selectors found nothing.
  * BeautifulSoup parser falls back to html.parser if lxml is missing — works
    regardless of environment (per spec).
  * Per-site User-Agent strategy: Wikipedia etc. block generic browser UAs
    from datacenter IPs (403). Bot UA with contact info returns 200.
  * New SUPPORTED_SITES registry with per-site fetch handlers:
      - Wikipedia → REST API (reliable, no scraping)
      - GitHub → raw.githubusercontent.com (no JS, no auth needed)
  * fetch_url now has a 4-step fallback chain: site handler → bot UA →
    browser UA → web.archive.org cache. Previously gave up after one attempt.
Public API unchanged:
    web_search(query) -> str
    fetch_url(url) -> str
| """ | |
| import requests | |
| import random | |
| import time | |
| import re | |
| from urllib.parse import quote_plus, unquote, urlparse, parse_qs | |
| from bs4 import BeautifulSoup | |
| from requests.adapters import HTTPAdapter | |
| from urllib3.util.retry import Retry | |
| import logging | |
log = logging.getLogger("agent.tools.web.v8.2")

# ─────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────
TIMEOUT_SEARCH = 8      # base request timeout (seconds) for search-engine calls
TIMEOUT_FETCH = 15      # request timeout (seconds) for page fetches
MAX_RETRIES = 1         # automatic retries configured on the HTTP adapter
BACKOFF_FACTOR = 0.3    # backoff factor handed to urllib3's Retry
MAX_RESULTS = 6         # cap on results taken from any single engine

# Browser User-Agents; one is picked at random for sites that expect a browser.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0",
    # Chromium-based Edge identifies itself with the full Chrome/Safari tokens
    # followed by "Edg/<version>". The previous "Edge/125.0.0.0" string (no
    # Chrome/Safari tokens) is not sent by any real browser.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0",
]

# Bot UA — used for sites that block generic browser UAs from datacenter IPs.
# Format follows Wikipedia's policy: <name>/<version> (<contact>)
BOT_USER_AGENT = "NeuraPromptAgent/1.0 (https://neuraprompt.ai; contact@neuraprompt.ai)"

# Sites that block generic browser UAs — must use bot UA.
# Matched against the URL's netloc (case-insensitive substring).
BOT_UA_SITES = (
    "wikipedia.org",
    "wikimedia.org",
    "wiktionary.org",
    "wikiquote.org",
    "wikibooks.org",
    "wikisource.org",
    "wikinews.org",
    "wikiversity.org",
    "mediawiki.org",
    "meta.wikimedia.org",
    "gov.uk",
    "nature.com",
    "sciencedirect.com",
    "springer.com",
)

# Public SearXNG instances; the SearXNG engine samples a few of these per search.
SEARXNG_INSTANCES = [
    "https://searx.be",
    "https://search.ononoki.org",
    "https://searx.tiekoetter.com",
    "https://searx.prvcy.eu",
    "https://search.sapti.me",
    "https://darmarit.org/searx",
    "https://searxng.site",
]
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # SUPPORTED SITES (per-site fetch handlers β "supported web URLs") | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Each handler: (url, session) -> str (page text) or None to fall through | |
| # Add new sites here as needed. Handlers are tried BEFORE the generic fetcher. | |
def _fetch_wikipedia(url: str, session: requests.Session) -> str | None:
    """Fetch a Wikipedia article through the Wikimedia REST API.

    Rewrites ``https://<lang>.wikipedia.org/wiki/<Title>`` to
    ``https://<lang>.wikipedia.org/api/rest_v1/page/html/<Title>`` and returns
    the article text with a short header.

    Args:
        url: The article URL as given by the caller.
        session: Session used for the HTTP request.

    Returns:
        Formatted page text (body capped at 12000 characters), or ``None``
        when the URL is not a Wikipedia article, the request fails, or the
        extracted text is shorter than 100 characters. ``None`` tells the
        caller to fall through to the generic fetcher.
    """
    # Function-scope import: the file-level import only brings in quote_plus.
    from urllib.parse import quote

    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    # Require a real wikipedia.org host; a bare substring test would also
    # accept look-alikes such as "notwikipedia.org".
    if host != "wikipedia.org" and not host.endswith(".wikipedia.org"):
        return None

    m = re.match(r"^/wiki/(.+)$", parsed.path)
    if not m:
        return None
    title = unquote(m.group(1))

    # Use the REST API for the same language edition. "www", "m" and the bare
    # domain carry no language code, so those default to English.
    labels = host.split(".")
    lang = labels[0] if len(labels) > 2 and labels[0] not in ("www", "m") else "en"

    # The title is a single *path segment*: spaces become underscores (the
    # wiki's canonical form) and everything else, "/" included, is
    # percent-encoded. quote_plus() would emit "+" for a space, which is a
    # literal plus sign inside a URL path.
    encoded_title = quote(title.replace(" ", "_"), safe="")
    api_url = f"https://{lang}.wikipedia.org/api/rest_v1/page/html/{encoded_title}"

    resp = _safe_request(
        session, api_url,
        headers={"User-Agent": BOT_USER_AGENT, "Accept": "text/html"},
        timeout=TIMEOUT_FETCH,
        allow_redirects=True,
    )
    if not resp:
        return None

    soup = _make_soup(resp.text)
    # Drop non-content elements before extracting text.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()
    text = soup.get_text(" ", strip=True)
    if len(text) < 100:
        return None

    title_tag = soup.find("title")
    page_title = title_tag.get_text(strip=True) if title_tag else title
    return f"Title: {page_title}\nURL: {url}\nSource: Wikipedia REST API\n{'='*60}\n\n{text[:12000]}"
def _fetch_github(url: str, session: requests.Session) -> str | None:
    """Fetch GitHub content as plain text.

    ``github.com/{user}/{repo}/blob/{ref}/{path}`` URLs are rewritten to their
    ``raw.githubusercontent.com`` equivalent; any other GitHub or raw URL is
    requested as-is.

    Returns:
        The raw file content with a small header, the tag-stripped text when
        the response is HTML, or ``None`` when the URL is not a GitHub URL,
        the request fails, or too little content comes back.
    """
    host = urlparse(url).netloc
    if not ("github.com" in host or "raw.githubusercontent.com" in host):
        return None

    # Convert: github.com/{user}/{repo}/blob/{ref}/{path}
    #       -> raw.githubusercontent.com/{user}/{repo}/{ref}/{path}
    blob = re.match(
        r"^https?://github\.com/([^/]+)/([^/]+)/blob/(.+)$",
        url,
    )
    if blob:
        owner, repo, ref_and_path = blob.groups()
        target = f"https://raw.githubusercontent.com/{owner}/{repo}/{ref_and_path}"
    else:
        target = url

    response = _safe_request(
        session, target,
        headers={"User-Agent": BOT_USER_AGENT},
        timeout=TIMEOUT_FETCH,
        allow_redirects=True,
    )
    if not response:
        return None

    mime = response.headers.get("content-type", "").lower()
    payload = response.text
    if len(payload) < 10:
        return None

    if "text/html" in mime:
        # An HTML response (e.g. a repository page): reduce it to visible text.
        soup = _make_soup(payload)
        for noise in soup(["script", "style", "nav", "header", "footer", "aside"]):
            noise.decompose()
        body = soup.get_text(" ", strip=True)
        if len(body) < 100:
            return None
        return f"URL: {url}\n{'='*60}\n\n{body[:12000]}"

    # Anything else is treated as the file content itself (no HTML wrapper).
    return f"URL: {target}\nContent-Type: {mime}\n{'='*60}\n\n{payload[:12000]}"
# Handler registry: (netloc substring, handler) pairs. fetch_url scans the
# list in order and only tries the first entry whose substring matches.
SUPPORTED_SITES = [
    ("wikipedia.org", _fetch_wikipedia),
    ("wikimedia.org", _fetch_wikipedia),
    ("github.com", _fetch_github),
    ("raw.githubusercontent.com", _fetch_github),
]
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # SESSION BUILDER | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _build_session() -> requests.Session:
    """Return a ``requests.Session`` whose HTTP(S) adapter retries failures.

    GET/HEAD/OPTIONS requests are retried up to ``MAX_RETRIES`` times on
    429/500/502/503/504 responses, with ``BACKOFF_FACTOR`` as the backoff
    factor. Exhausting the retries does not raise on status.
    """
    adapter = HTTPAdapter(
        max_retries=Retry(
            total=MAX_RETRIES,
            backoff_factor=BACKOFF_FACTOR,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
            raise_on_status=False,
        ),
        pool_connections=10,
        pool_maxsize=10,
    )
    session = requests.Session()
    # One shared adapter serves both schemes.
    for scheme in ("https://", "http://"):
        session.mount(scheme, adapter)
    return session
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # PARSER HELPER (lxml with html.parser fallback β portability) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _make_soup(html: str) -> BeautifulSoup:
    """Parse *html*, preferring the lxml parser.

    If building the soup with ``"lxml"`` raises for any reason (for example
    because lxml is not installed), the built-in ``"html.parser"`` is used
    instead.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # lxml unavailable or failed on this input; html.parser ships with Python.
        soup = BeautifulSoup(html, "html.parser")
    return soup
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HEADERS (per-site UA strategy β fixes Wikipedia 403) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _ua_for_url(url: str) -> str:
    """Choose the User-Agent string to send to *url*.

    Returns ``BOT_USER_AGENT`` when the lower-cased netloc contains any entry
    of ``BOT_UA_SITES``; otherwise a random entry of ``USER_AGENTS``. A URL
    that cannot be parsed also gets a random browser User-Agent.
    """
    try:
        host = urlparse(url).netloc.lower()
    except Exception:
        return random.choice(USER_AGENTS)
    if any(site in host for site in BOT_UA_SITES):
        return BOT_USER_AGENT
    return random.choice(USER_AGENTS)
def _brotli_available() -> bool:
    """Return True when a brotli decoder package is importable."""
    from importlib.util import find_spec

    try:
        return any(find_spec(name) is not None for name in ("brotli", "brotlicffi"))
    except (ImportError, ValueError):
        return False


def _headers(url: str = "", referer: str = "") -> dict:
    """Build browser-like request headers for *url*.

    Args:
        url: Target URL; selects the User-Agent via ``_ua_for_url``. When
            empty, a random browser User-Agent is used.
        referer: Optional ``Referer`` value. When given, ``Sec-Fetch-Site``
            is reported as ``same-origin`` instead of ``none``.

    Returns:
        A new dict of header names to values.
    """
    ua = _ua_for_url(url) if url else random.choice(USER_AGENTS)

    # Only advertise brotli when it can be decoded: requests/urllib3 decode
    # "br" bodies only if a brotli package is installed. Advertising it
    # unconditionally lets servers reply with a body that comes back as
    # undecoded bytes in response.text.
    accept_encoding = "gzip, deflate, br" if _brotli_available() else "gzip, deflate"

    headers = {
        "User-Agent": ua,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": accept_encoding,
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none" if not referer else "same-origin",
        "Sec-Fetch-User": "?1",
        "Cache-Control": "max-age=0",
    }
    if referer:
        headers["Referer"] = referer
    return headers
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # SAFE REQUEST (FIXED β accepts any 2xx, not just 200) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _safe_request(session: requests.Session, url: str, **kwargs) -> requests.Response | None:
    """GET *url* through *session*, logging request failures instead of raising.

    Keyword arguments are forwarded unchanged to ``session.get``.

    Returns:
        The response when its status code is in the 200-299 range (so 202
        and other non-200 successes are kept); ``None`` for any other status
        or when the request raises.
    """
    try:
        started = time.time()
        response = session.get(url, **kwargs)
        took = time.time() - started
        status = response.status_code
        log.debug(f"[{status}] {url[:80]}... ({took:.2f}s)")
        if 200 <= status < 300:
            return response
        if status in (403, 429):
            log.warning(f"Blocked [{status}]: {url[:80]}")
        else:
            log.warning(f"HTTP {status}: {url[:80]}")
    except Exception as exc:
        errors = requests.exceptions
        target = url[:60]
        # Most specific classes are tested first: SSLError and ProxyError
        # both derive from ConnectionError.
        if isinstance(exc, errors.SSLError):
            log.warning(f"SSL error for {target}: {exc}")
        elif isinstance(exc, errors.ProxyError):
            log.warning(f"Proxy error for {target}: {exc}")
        elif isinstance(exc, errors.ConnectionError):
            log.warning(f"Connection failed for {target}: {exc}")
        elif isinstance(exc, errors.Timeout):
            log.warning(f"Timeout for {target}")
        elif isinstance(exc, errors.TooManyRedirects):
            log.warning(f"Redirect loop for {target}")
        elif isinstance(exc, errors.RequestException):
            log.warning(f"Request failed for {target}: {exc}")
        else:
            log.warning(f"Unexpected error for {target}: {exc}")
    return None
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # SEARCH ENGINES | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _ddg_decode_href(href: str) -> str:
    """Resolve a DuckDuckGo result link to the URL it points at.

    The DDG HTML endpoint wraps external links as ``/l/?uddg=<encoded>``,
    often in protocol-relative form (``//duckduckgo.com/l/?uddg=...``).

    Args:
        href: The raw ``href`` attribute of a result anchor.

    Returns:
        The decoded target URL when a ``uddg`` query parameter is present;
        otherwise *href* made absolute. An empty *href* yields ``""``.
    """
    if not href:
        return ""
    if href.startswith("//"):
        # Protocol-relative link: it already names its host, only add a scheme.
        href = f"https:{href}"
    elif href.startswith("/"):
        href = f"https://html.duckduckgo.com{href}"
    if "uddg=" not in href:
        return href
    try:
        targets = parse_qs(urlparse(href).query).get("uddg")
    except ValueError:
        return href
    # parse_qs has already percent-decoded the value once. Decoding again
    # would corrupt targets that legitimately contain escapes (%20, %2B, ...).
    return targets[0] if targets else href
def _search_ddg_html(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 1: scrape the DuckDuckGo HTML endpoint.

    Result anchors are selected with ``a.result__a`` and snippets with
    ``.result__snippet`` inside the same result wrapper.

    Returns:
        A list of ``{"title", "url", "snippet", "engine"}`` dicts, or
        ``None`` when the request fails or nothing usable is found.
    """
    endpoint = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
    page = _safe_request(
        session, endpoint,
        headers=_headers(endpoint, referer="https://duckduckgo.com/"),
        timeout=TIMEOUT_SEARCH,
        allow_redirects=True,
    )
    if not page:
        return None

    soup = _make_soup(page.text)
    hits = []
    for anchor in soup.select("a.result__a")[:MAX_RESULTS]:
        title = anchor.get_text(strip=True)
        wrapped = anchor.get("href", "")
        target = _ddg_decode_href(wrapped)
        if not (title and target):
            continue
        # Ads are wrapped as /y.js?ad_domain=... links on DDG's own domain.
        if any("duckduckgo.com/y.js" in link for link in (target, wrapped)):
            continue
        if not target.startswith(("http://", "https://")):
            continue

        # The snippet lives elsewhere inside the same result wrapper.
        container = anchor.find_parent("div", class_="result") or anchor.find_parent("div")
        snippet_el = container.select_one(".result__snippet") if container else None
        snippet = snippet_el.get_text(strip=True) if snippet_el else ""

        hits.append({
            "title": title,
            "url": target,
            "snippet": snippet,
            "engine": "ddg-html",
        })
    return hits or None
def _search_ddg_api(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 2: DuckDuckGo Instant Answer API (JSON, no scraping).

    Emits the abstract/answer (when it has a source URL) followed by up to
    ``MAX_RESULTS - 1`` related topics.

    Returns:
        A list of result dicts, or ``None`` when the request fails, the body
        is not JSON, or no entry has both text and a URL.
    """
    endpoint = f"https://api.duckduckgo.com/?q={quote_plus(query)}&format=json&no_html=1&skip_disambig=1"
    reply = _safe_request(
        session, endpoint,
        headers=_headers(endpoint),
        timeout=TIMEOUT_SEARCH,
    )
    if not reply:
        return None
    try:
        payload = reply.json()
    except Exception:
        return None

    hits = []
    summary = (payload.get("AbstractText") or payload.get("Answer") or "").strip()
    source_url = payload.get("AbstractURL", "")
    if summary and source_url:
        hits.append({
            "title": payload.get("Heading", "Quick Answer"),
            "url": source_url,
            "snippet": summary,
            "engine": "ddg-api",
        })

    for entry in payload.get("RelatedTopics", [])[:MAX_RESULTS - 1]:
        if not isinstance(entry, dict):
            continue
        text = (entry.get("Text") or "").strip()
        link = entry.get("FirstURL", "")
        # Entries lacking text or a URL are skipped.
        if not (text and link):
            continue
        head, separator, _ = text.partition(" - ")
        hits.append({
            "title": head if separator else text[:60],
            "url": link,
            "snippet": text,
            "engine": "ddg-api",
        })
    return hits or None
def _search_bing(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 3: scrape Bing's HTML results page.

    Each ``li.b_algo`` item contributes its first anchor as title/URL and the
    first matching snippet element.

    Returns:
        A list of result dicts, or ``None`` when the request fails or no
        item has a title and an ``http``-prefixed URL.
    """
    endpoint = f"https://www.bing.com/search?q={quote_plus(query)}"
    page = _safe_request(
        session, endpoint,
        headers=_headers(endpoint, referer="https://www.bing.com/"),
        timeout=TIMEOUT_SEARCH + 3,
        allow_redirects=True,
    )
    if not page:
        return None

    snippet_selectors = ["p", ".b_caption p", "div.b_attribution+div", ".b_snippet"]
    soup = _make_soup(page.text)
    hits = []
    for item in soup.select("li.b_algo")[:MAX_RESULTS]:
        link = item.select_one("a")
        if not link:
            continue
        title = link.get_text(strip=True)
        href = link.get("href", "")

        # Lazily probe the selectors and stop at the first one that matches.
        candidates = (item.select_one(sel) for sel in snippet_selectors)
        snippet_el = next((el for el in candidates if el), None)
        snippet = snippet_el.get_text(strip=True) if snippet_el else ""

        if title and href and href.startswith("http"):
            hits.append({
                "title": title,
                "url": href,
                "snippet": snippet,
                "engine": "bing",
            })
    return hits or None
def _search_searxng(session: requests.Session, query: str) -> list[dict] | None:
    """Engine 4: query public SearXNG instances (last-resort fallback).

    Three instances are sampled from a shuffled copy of
    ``SEARXNG_INSTANCES``; the first one that yields results wins.

    Returns:
        A list of result dicts, or ``None`` when every sampled instance
        fails or returns nothing usable.
    """
    candidates = list(SEARXNG_INSTANCES)
    random.shuffle(candidates)

    for base in candidates[:3]:
        endpoint = f"{base}/search?q={quote_plus(query)}&format=json&language=en"
        reply = _safe_request(
            session, endpoint,
            headers=_headers(endpoint, referer=base),
            timeout=TIMEOUT_SEARCH + 5,
        )
        if not reply:
            continue
        try:
            payload = reply.json()
        except Exception:
            continue

        hits = []
        for item in payload.get("results", [])[:MAX_RESULTS]:
            title = (item.get("title") or "").strip()
            href = (item.get("url") or "").strip()
            snippet = (item.get("content") or item.get("snippet") or "").strip()
            if not (title and href):
                continue
            hits.append({
                "title": title,
                "url": href,
                # Label with the first host label of the instance URL.
                "engine": f"searxng-{base.split('//')[1].split('.')[0]}",
                "snippet": snippet,
            })
        if hits:
            return hits
    return None
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # FETCH URL HELPERS (for fetch_url fallback chain) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _fetch_direct(url: str, session: requests.Session, ua: str) -> str | None:
    """Fetch *url* with a specific User-Agent and extract readable text.

    Args:
        url: Page to fetch.
        session: Session used for the HTTP request.
        ua: User-Agent header value; overrides the one ``_headers`` picks.

    Returns:
        ``None`` when the request fails or no text can be extracted.
        For HTML: a ``Title:``/``URL:`` header followed by up to 10000
        characters of extracted text. For ``text/plain`` and
        ``application/json``: a ``URL:`` header followed by the raw body
        (same cap). For any other content type: a short preview prefixed
        with ``[Non-HTML content: ...]``.
    """
    resp = _safe_request(
        session, url,
        headers={**_headers(url), "User-Agent": ua},
        timeout=TIMEOUT_FETCH,
        allow_redirects=True,
    )
    if not resp:
        return None

    content_type = resp.headers.get("content-type", "").lower()
    is_html = "text/html" in content_type
    is_plain = "text/plain" in content_type or "application/json" in content_type

    if not (is_html or is_plain):
        # Non-text content — return a short preview.
        preview = resp.text[:4000] if resp.text else "[binary content]"
        return f"[Non-HTML content: {content_type}]\n\n{preview}"

    if is_plain and not is_html:
        # Plain text / JSON has no markup to extract from. Running it through
        # the HTML pipeline only worked by accident under lxml (which wraps
        # bare text in <p>) and returned nothing under html.parser.
        body = resp.text.strip()
        if not body:
            return None
        return f"URL: {url}\n{'='*60}\n\n{body[:10000]}".strip()

    soup = _make_soup(resp.text)

    # Strip noise elements by tag name.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside",
                     "form", "iframe", "noscript", "svg", "canvas",
                     "advertisement"]):
        tag.decompose()
    # Strip noise elements by class. These are CSS selectors, so they need
    # select(); passed to soup([...]) they are compared against tag names
    # and never match anything.
    for tag in soup.select(".ad, .ads, .cookie-banner"):
        tag.decompose()

    # Prefer well-known content containers that hold a meaningful amount of text.
    content_blocks = []
    for selector in ["article", "main", "[role='main']", ".content", ".post", ".entry"]:
        for el in soup.select(selector):
            text = el.get_text(" ", strip=True)
            if len(text) > 300:
                content_blocks.append(text)

    # Otherwise fall back to individual headings, paragraphs, list items and cells.
    if not content_blocks:
        for tag in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "td"]):
            text = tag.get_text(" ", strip=True)
            if len(text) > 30:
                content_blocks.append(text)

    # De-duplicate blocks that share the same (case-insensitive) first 100 chars.
    seen = set()
    final_blocks = []
    for block in content_blocks:
        sig = block[:100].lower()
        if sig not in seen:
            seen.add(sig)
            final_blocks.append(block)

    text = "\n\n".join(final_blocks)
    if not text:
        return None

    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else ""
    header = (f"Title: {title}\nURL: {url}\n{'='*60}\n\n"
              if title else f"URL: {url}\n{'='*60}\n\n")
    return (header + text[:10000]).strip()
def _fetch_archive_org(url: str, session: requests.Session) -> str | None:
    """Last-resort fallback: read *url* from a web.archive.org snapshot.

    Asks the Wayback availability API for the closest snapshot and, when one
    is reported as available, fetches it with ``_fetch_direct`` using the bot
    User-Agent.

    Returns:
        The extracted snapshot text, or ``None`` when the lookup fails, the
        reply is not JSON, or no snapshot is available.
    """
    lookup = f"https://archive.org/wayback/available?url={quote_plus(url)}"
    reply = _safe_request(
        session, lookup,
        headers={"User-Agent": BOT_USER_AGENT},
        timeout=TIMEOUT_FETCH,
    )
    if not reply:
        return None
    try:
        info = reply.json()
    except Exception:
        return None

    closest = (info.get("archived_snapshots") or {}).get("closest") or {}
    snapshot_url = closest.get("url")
    if snapshot_url and closest.get("available"):
        # Fetch the archived snapshot itself.
        return _fetch_direct(snapshot_url, session, BOT_USER_AGENT)
    return None
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # PUBLIC API | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def web_search(query: str) -> str:
    """Search the web using several no-key engines with automatic fallback.

    Engines are tried in priority order — DDG HTML → DDG API → Bing →
    SearXNG — and the first one that returns results wins.

    Args:
        query: The search query; surrounding whitespace is ignored.

    Returns:
        A formatted result list with a footer naming the engine used, or an
        error message when the query is empty or every engine fails. This
        function reports failures in its return value; engine exceptions are
        caught and summarised.
    """
    if not query or not query.strip():
        return "Error: search query cannot be empty."
    query = query.strip()

    engines = [
        ("DuckDuckGo HTML", _search_ddg_html),
        ("DuckDuckGo API", _search_ddg_api),
        ("Bing", _search_bing),
        ("SearXNG", _search_searxng),
    ]
    all_errors = []

    # The context manager closes the session (and its connection pools) on
    # every exit path; it was previously left open.
    with _build_session() as session:
        for name, engine_func in engines:
            try:
                log.info(f"Trying {name} for: {query[:50]}...")
                results = engine_func(session, query)
                if results:
                    formatted = []
                    for r in results:
                        # NOTE(review): both markers were mis-encoded in the
                        # source. The bullet is unambiguous; the link emoji is
                        # the presumed original glyph — confirm.
                        line = f"• {r['title']}"
                        if r.get("snippet"):
                            line += f"\n{r['snippet']}"
                        line += f"\n🔗 {r['url']}"
                        formatted.append(line)
                    footer = f"\n\n[Results via {name} | {len(results)} found]"
                    return "\n\n".join(formatted) + footer
            except Exception as e:
                # One broken engine must not stop the fallback chain.
                log.error(f"Critical error in {name}: {e}")
                all_errors.append(f"{name}: {str(e)}")
                continue

    # All engines failed
    error_detail = " | ".join(all_errors) if all_errors else "All engines returned no results."
    return (
        f"Search failed for: '{query}'\n"
        f"All fallback engines exhausted.\n"
        f"Details: {error_detail}\n"
        f"Tip: Check your internet connection or try again later."
    )
def fetch_url(url: str) -> str:
    """Fetch a webpage with per-site handlers and a 4-step fallback chain.

    1. Site-specific handler (Wikipedia REST API, GitHub raw, etc.)
    2. Direct fetch with bot UA (for sites that block browser UAs)
    3. Direct fetch with browser UA (for sites that block bot UAs)
    4. web.archive.org cached snapshot (last resort)

    Args:
        url: Absolute ``http://`` or ``https://`` URL; surrounding whitespace
            is ignored.

    Returns:
        The extracted page text from the first strategy that succeeds, or an
        error message when the URL is invalid or every strategy fails.
    """
    if not url or not url.strip():
        return "Error: URL cannot be empty."
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        return "Error: URL must start with http:// or https://"

    # The context manager closes the session (and its connection pools) on
    # every exit path; it was previously left open.
    with _build_session() as session:
        # ---- Step 1: Try site-specific handlers (SUPPORTED_SITES) ----
        try:
            netloc = urlparse(url).netloc.lower()
        except Exception:
            netloc = ""
        for site_pattern, handler in SUPPORTED_SITES:
            if site_pattern not in netloc:
                continue
            try:
                log.info(f"Trying site handler for '{site_pattern}': {url[:80]}")
                result = handler(url, session)
                if result:
                    return result
            except Exception as e:
                # A failing handler just falls through to the generic fetch.
                log.warning(f"Site handler {site_pattern} failed: {e}")
            break  # only try the first matching handler

        # ---- Step 2: Direct fetch with bot UA ----
        log.info(f"Trying direct fetch (bot UA): {url[:80]}")
        result = _fetch_direct(url, session, BOT_USER_AGENT)
        if result:
            return result

        # ---- Step 3: Direct fetch with browser UA ----
        log.info(f"Trying direct fetch (browser UA): {url[:80]}")
        browser_ua = random.choice(USER_AGENTS)
        result = _fetch_direct(url, session, browser_ua)
        if result:
            return result

        # ---- Step 4: web.archive.org cache ----
        log.info(f"Trying archive.org cache: {url[:80]}")
        result = _fetch_archive_org(url, session)
        if result:
            return f"[Fetched via archive.org cache]\n\n{result}"

    # All fallbacks exhausted
    return (
        f"Could not fetch URL: {url}\n"
        "All fetch strategies failed (site handler, bot UA, browser UA, archive.org).\n"
        "The site may require JavaScript, require authentication, or be offline."
    )
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TEST / DEBUG | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
if __name__ == "__main__":
    # Manual smoke test: one search plus one fetch per site handler.
    import logging
    logging.basicConfig(level=logging.INFO)

    rule = "=" * 60
    demos = [
        (
            "TEST: web_search('Python programming language')",
            lambda: web_search("Python programming language"),
        ),
        (
            "TEST: fetch_url('https://en.wikipedia.org/wiki/Python_(programming_language)')",
            lambda: fetch_url("https://en.wikipedia.org/wiki/Python_(programming_language)")[:1500],
        ),
        (
            "TEST: fetch_url('https://github.com/python/cpython/blob/main/README.rst')",
            lambda: fetch_url("https://github.com/python/cpython/blob/main/README.rst")[:1500],
        ),
    ]
    for position, (banner, run) in enumerate(demos):
        # Every section after the first is preceded by a blank line.
        print(rule if position == 0 else "\n" + rule)
        print(banner)
        print(rule)
        print(run())