FrogQuest / research.py
VirusDumb's picture
Big Leagues Calling
c6815eb
Raw
History Blame Contribute Delete
3.9 kB
"""Web research for campaign goals — ddgs search + BeautifulSoup page extraction. CPU-only.
Used by forge_campaign (app.py) when the user ticks "research the web first": the goal is searched
on DuckDuckGo (ddgs, no API key), the top pages are fetched and stripped to plain text, and the
bounded snippets are handed to the LLM as grounding for the campaign's quest chain (the LLM never
browses — code does; see CLAUDE.md's thin-LLM principle). Everything is best-effort: any network
failure just shrinks the snippet list, and an empty result falls back to the model's own knowledge.
Budgets below keep the total well under Nemotron's context and a ZeroGPU call's patience:
~3 fetched pages x 1500 chars + a few raw search snippets ≈ 6KB of grounding text.
"""
from __future__ import annotations
# --- Research budgets -------------------------------------------------------
# Number of ddgs hits kept per search query.
MAX_RESULTS: int = 6
# Number of top-ranked pages that are actually downloaded and parsed.
MAX_PAGES: int = 3
# Plain-text characters kept from each downloaded page.
PAGE_CHARS: int = 1500
# Characters kept from a search-result snippet that is used without a fetch.
SNIPPET_CHARS: int = 300
# Timeout, in seconds, passed to each page request.
TIMEOUT_S: int = 10
# Request headers sent with every page fetch.
_HEADERS = {"User-Agent": "Mozilla/5.0 (FrogQuest hackathon research bot)"}
def web_search(query: str, k: int = MAX_RESULTS) -> list[dict]:
    """Run one DuckDuckGo text search and normalise the hits.

    Each returned dict has the keys ``title``, ``url`` and ``snippet``; a field
    missing from a raw hit becomes "". The URL is read from the hit's ``href``
    key, falling back to ``url``; the snippet comes from ``body``.

    Best-effort: any exception (missing package, network error, ...) is printed
    and an empty list is returned instead.
    """
    try:
        # Imported lazily so a missing ddgs package is handled like any other failure.
        from ddgs import DDGS

        normalised = []
        with DDGS() as client:
            raw_hits = client.text(query, max_results=k)
            for hit in raw_hits or []:
                title = hit.get("title") or ""
                link = hit.get("href") or hit.get("url") or ""
                blurb = hit.get("body") or ""
                normalised.append({"title": title, "url": link, "snippet": blurb})
        return normalised
    except Exception as err:
        print(f"[research] search failed for {query!r}: {err}")
        return []
def fetch_text(url: str, max_chars: int = PAGE_CHARS) -> str:
    """Fetch a page and reduce it to readable plain text. Empty string on any failure.

    Args:
        url: Address of the page to download.
        max_chars: Upper bound on the number of characters returned.

    Returns:
        The page's text with scripts, styles and page chrome removed and all
        whitespace runs collapsed to single spaces, truncated to ``max_chars``.
        Returns "" when the request fails, the server answers with an error
        status, the response is declared as a non-text type, or parsing raises.
    """
    try:
        # Imported lazily so a missing package is handled like any other failure.
        import requests
        from bs4 import BeautifulSoup

        resp = requests.get(url, timeout=TIMEOUT_S, headers=_HEADERS)
        resp.raise_for_status()

        content_type = resp.headers.get("Content-Type", "").lower()
        # A PDF/image/archive pushed through the HTML parser yields junk text;
        # returning "" lets the caller fall back to the search snippet instead.
        if content_type and not any(kind in content_type for kind in ("html", "xml", "text/")):
            print(f"[research] skipped non-text content {content_type!r} for {url!r}")
            return ""

        # requests falls back to ISO-8859-1 for text/* responses that declare no
        # charset, so resp.text garbles UTF-8 pages. Hand BeautifulSoup the raw
        # bytes: trust the header's charset only when one was really declared,
        # otherwise let the parser sniff the encoding from the document itself.
        declared = resp.encoding if "charset" in content_type else None
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared)

        # Drop non-content elements before extracting the text.
        for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form", "noscript"]):
            tag.decompose()
        text = " ".join(soup.get_text(" ").split())
        return text[:max_chars]
    except Exception as e:
        print(f"[research] fetch failed for {url!r}: {e}")
        return ""
def research_goal(goal: str) -> dict:
    """Run one bounded web-research pass for a long-term goal.

    Two search phrasings of the goal are issued and their hits merged, keeping
    the first hit seen for each non-empty URL. The first MAX_PAGES hits are
    fetched in full (falling back to their search snippet when the fetch yields
    nothing); up to four further hits contribute their search snippet only.

    Returns {"snippets": [str], "sources": [url]} as parallel lists, where each
    snippet is "<title>: <text>". Both lists are empty for a blank goal or when
    nothing usable was found.
    """
    topic = (goal or "").strip()
    if topic == "":
        return {"snippets": [], "sources": []}

    # dict preserves insertion order, so this keeps the first hit per URL in rank order.
    first_hit_by_url: dict = {}
    for phrasing in (f"how to {topic}", f"{topic} step by step plan"):
        for hit in web_search(phrasing):
            if hit["url"]:
                first_hit_by_url.setdefault(hit["url"], hit)
    ranked = list(first_hit_by_url.values())

    grounding: list = []  # (snippet, url) pairs, split into parallel lists at the end

    def keep(hit: dict, text: str) -> None:
        grounding.append((f"{hit['title']}: {text}", hit["url"]))

    # Top hits: full page fetch, search snippet as the fallback.
    for hit in ranked[:MAX_PAGES]:
        text = fetch_text(hit["url"])
        if not text:
            text = hit["snippet"][:SNIPPET_CHARS]
        if text:
            keep(hit, text)

    # A few more hits: cheap search snippets only, no fetch.
    for hit in ranked[MAX_PAGES:MAX_PAGES + 4]:
        if hit["snippet"]:
            keep(hit, hit["snippet"][:SNIPPET_CHARS])

    return {
        "snippets": [snippet for snippet, _ in grounding],
        "sources": [link for _, link in grounding],
    }
if __name__ == "__main__":
    # Live smoke test (network required): research a sample goal and print a
    # count line followed by one truncated line per snippet.
    report = research_goal("run a 10k race")
    texts = report["snippets"]
    links = report["sources"]
    print(f"{len(texts)} snippets / {len(links)} sources")
    for link, text in zip(links, texts):
        print(f"- [{link}] {text[:100]}...")