Spaces:
Running on Zero
Running on Zero
| """Web research for campaign goals — ddgs search + BeautifulSoup page extraction. CPU-only. | |
| Used by forge_campaign (app.py) when the user ticks "research the web first": the goal is searched | |
| on DuckDuckGo (ddgs, no API key), the top pages are fetched and stripped to plain text, and the | |
| bounded snippets are handed to the LLM as grounding for the campaign's quest chain (the LLM never | |
| browses — code does; see CLAUDE.md's thin-LLM principle). Everything is best-effort: any network | |
| failure just shrinks the snippet list, and an empty result falls back to the model's own knowledge. | |
| Budgets below keep the total well under Nemotron's context and a ZeroGPU call's patience: | |
| ~3 fetched pages x 1500 chars + a few raw search snippets ≈ 6KB of grounding text. | |
| """ | |
| from __future__ import annotations | |
# --- search budget -------------------------------------------------------------
# ddgs hits kept per query
MAX_RESULTS = 6
# pages actually fetched + parsed
MAX_PAGES = 3

# --- text budget ---------------------------------------------------------------
# plain-text chars kept per fetched page
PAGE_CHARS = 1500
# chars kept per unfetched search-result snippet
SNIPPET_CHARS = 300

# --- HTTP settings for page fetches --------------------------------------------
# passed as the `timeout=` of the page request in fetch_text
TIMEOUT_S = 10
# request headers sent with every page fetch
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (FrogQuest hackathon research bot)",
}
def web_search(query: str, k: int = MAX_RESULTS) -> list[dict]:
    """Run a DuckDuckGo text search and normalise the hits.

    Args:
        query: Search phrase handed to ddgs.
        k: Maximum number of results requested.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts, with missing fields
        mapped to ``""``. Any failure (ddgs not importable, network error,
        unexpected hit shape) is printed and yields an empty list.
    """
    try:
        from ddgs import DDGS

        normalised: list[dict] = []
        with DDGS() as client:
            raw_hits = client.text(query, max_results=k) or []
            for hit in raw_hits:
                title = hit.get("title") or ""
                # the link is looked up under "href" first, then "url"
                link = hit.get("href") or hit.get("url") or ""
                summary = hit.get("body") or ""
                normalised.append({"title": title, "url": link, "snippet": summary})
        return normalised
    except Exception as err:
        # best-effort by design: the caller treats [] as "nothing found"
        print(f"[research] search failed for {query!r}: {err}")
        return []
def fetch_text(url: str, max_chars: int = PAGE_CHARS) -> str:
    """Fetch a page and reduce it to readable plain text.

    Args:
        url: Address of the page to download.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        The page's visible text with whitespace runs collapsed to single
        spaces, truncated to ``max_chars``. Empty string on any failure
        (missing dependency, network/HTTP error, non-text response), so the
        caller can fall back to something else.
    """
    try:
        import requests
        from bs4 import BeautifulSoup

        resp = requests.get(url, timeout=TIMEOUT_S, headers=_HEADERS)
        resp.raise_for_status()

        content_type = (resp.headers.get("Content-Type") or "").lower()
        # PDFs, images and other binaries would only produce garbage "text".
        # Report them as a failed fetch; a missing header gets the benefit of
        # the doubt and is parsed as before.
        if content_type and "html" not in content_type and "text" not in content_type:
            print(f"[research] fetch failed for {url!r}: unsupported content type {content_type!r}")
            return ""

        # requests decodes resp.text as ISO-8859-1 when a text/* response
        # declares no charset, which garbles UTF-8 pages. Hand BeautifulSoup
        # the raw bytes so it can detect the encoding itself (BOM / <meta>),
        # and force the HTTP-level charset only when the server sent one.
        declared = resp.encoding if "charset" in content_type else None
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared)

        # drop page chrome and non-content elements before extracting text
        for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form", "noscript"]):
            tag.decompose()

        # split()/join collapses every whitespace run to a single space
        text = " ".join(soup.get_text(" ").split())
        return text[:max_chars]
    except Exception as e:
        # best-effort by design: any failure just means "no text for this URL"
        print(f"[research] fetch failed for {url!r}: {e}")
        return ""
def research_goal(goal: str) -> dict:
    """Run one bounded web-research pass for a long-term goal.

    Two phrasings of the goal are searched, hits are de-duplicated by URL in
    rank order, the first ``MAX_PAGES`` are fetched in full (falling back to
    their search snippet when the fetch yields nothing), and the next four
    contribute their search snippet only.

    Returns:
        ``{"snippets": [str], "sources": [url]}`` as parallel lists. Each
        snippet is prefixed with its page title so sources can be told apart.
        Both lists are empty for a blank goal or when nothing could be found.
    """
    topic = (goal or "").strip()
    if not topic:
        return {"snippets": [], "sources": []}

    # first occurrence of each URL wins; dict insertion order keeps the ranking
    unique_hits: dict[str, dict] = {}
    for phrasing in (f"how to {topic}", f"{topic} step by step plan"):
        for hit in web_search(phrasing):
            link = hit["url"]
            if link:
                unique_hits.setdefault(link, hit)
    ranked = list(unique_hits.values())

    # (snippet, source) pairs collected together so the two lists stay aligned
    grounding: list[tuple[str, str]] = []

    # the top hits get a full fetch
    for hit in ranked[:MAX_PAGES]:
        text = fetch_text(hit["url"])
        if not text:
            text = hit["snippet"][:SNIPPET_CHARS]
        if text:
            grounding.append((f"{hit['title']}: {text}", hit["url"]))

    # a few more as cheap search-snippets only
    for hit in ranked[MAX_PAGES:MAX_PAGES + 4]:
        if hit["snippet"]:
            grounding.append((f"{hit['title']}: {hit['snippet'][:SNIPPET_CHARS]}", hit["url"]))

    return {
        "snippets": [snippet for snippet, _ in grounding],
        "sources": [source for _, source in grounding],
    }
if __name__ == "__main__":  # live smoke test (network required)
    # run one real research pass and print a short summary of what came back
    report = research_goal("run a 10k race")
    snippet_list, source_list = report["snippets"], report["sources"]
    print(f"{len(snippet_list)} snippets / {len(source_list)} sources")
    for text, link in zip(snippet_list, source_list):
        print(f"- [{link}] {text[:100]}...")