Spaces:

build-small-hackathon
/

FrogQuest

Running on Zero

App Files Files Community

FrogQuest / research.py

VirusDumb

Big Leagues Calling

c6815eb 9 days ago

Raw

History Blame Contribute Delete

3.9 kB

	"""Web research for campaign goals — ddgs search + BeautifulSoup page extraction. CPU-only.

	Used by forge_campaign (app.py) when the user ticks "research the web first": the goal is searched
	on DuckDuckGo (ddgs, no API key), the top pages are fetched and stripped to plain text, and the
	bounded snippets are handed to the LLM as grounding for the campaign's quest chain (the LLM never
	browses — code does; see CLAUDE.md's thin-LLM principle). Everything is best-effort: any network
	failure just shrinks the snippet list, and an empty result falls back to the model's own knowledge.

	Budgets below keep the total well under Nemotron's context and a ZeroGPU call's patience:
	~3 fetched pages x 1500 chars + a few raw search snippets ≈ 6KB of grounding text.
	"""
	from __future__ import annotations

	# Budgets for one research pass: caps on search hits, fetched pages and text kept per source.
	MAX_RESULTS = 6  # ddgs hits kept per query (default `k` of web_search)
	MAX_PAGES = 3  # top-ranked results that get a full page fetch + parse in research_goal
	PAGE_CHARS = 1500  # plain-text chars kept per fetched page (default `max_chars` of fetch_text)
	SNIPPET_CHARS = 300  # chars kept per search-result snippet (unfetched hits, or fallback when a fetch yields nothing)
	TIMEOUT_S = 10  # passed as `timeout=` to requests.get in fetch_text

	# HTTP headers sent with every page fetch in fetch_text; the User-Agent names this project.
	_HEADERS = {"User-Agent": "Mozilla/5.0 (FrogQuest hackathon research bot)"}


	def web_search(query: str, k: int = MAX_RESULTS) -> list[dict]:
	    """Run a DuckDuckGo text search through ddgs and normalise the hits.

	    Args:
	        query: Search phrase passed to ``DDGS.text``.
	        k: Maximum number of results requested (``max_results``).

	    Returns:
	        A list of ``{"title", "url", "snippet"}`` dicts, with missing fields as "".
	        Any exception (including ddgs not being importable) is logged with ``print``
	        and yields an empty list.
	    """
	    try:
	        from ddgs import DDGS

	        with DDGS() as client:
	            raw_hits = client.text(query, max_results=k) or []

	        normalised: list[dict] = []
	        for hit in raw_hits:
	            title = hit.get("title") or ""
	            # ddgs results carry the link under "href"; "url" is accepted as a fallback key.
	            link = hit.get("href") or hit.get("url") or ""
	            summary = hit.get("body") or ""
	            normalised.append({"title": title, "url": link, "snippet": summary})
	        return normalised
	    except Exception as e:
	        print(f"[research] search failed for {query!r}: {e}")
	        return []


	def fetch_text(url: str, max_chars: int = PAGE_CHARS) -> str:
	    """Fetch a page and reduce it to readable plain text.

	    Best-effort: every failure (missing dependency, network error, HTTP error status,
	    non-text response) is logged with ``print`` and reported as an empty string.

	    Args:
	        url: Address of the page to download.
	        max_chars: Upper bound on the length of the returned text. A value <= 0 returns ""
	            without touching the network.

	    Returns:
	        The page's visible text with whitespace collapsed to single spaces, truncated to
	        ``max_chars`` characters; "" on any failure.
	    """
	    try:
	        # A non-positive budget can never yield text; without this guard a negative value
	        # would slice from the end and return almost the entire page.
	        if max_chars <= 0:
	            return ""

	        import requests
	        from bs4 import BeautifulSoup

	        # stream=True defers the body download until .text is read, so binary responses
	        # (PDFs, images, archives) can be rejected from their headers alone. The context
	        # manager closes the connection on every exit path.
	        with requests.get(url, timeout=TIMEOUT_S, headers=_HEADERS, stream=True) as resp:
	            resp.raise_for_status()
	            content_type = (resp.headers.get("Content-Type") or "").lower()
	            # A missing Content-Type is given the benefit of the doubt and parsed as HTML.
	            textual = any(marker in content_type for marker in ("text/", "html", "xml", "json"))
	            if content_type and not textual:
	                print(f"[research] skipped non-text page {url!r}: {content_type}")
	                return ""
	            markup = resp.text

	        soup = BeautifulSoup(markup, "html.parser")
	        # Drop page chrome and non-content elements before extracting the text.
	        for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form", "noscript"]):
	            tag.decompose()
	        text = " ".join(soup.get_text(" ").split())
	        return text[:max_chars]
	    except Exception as e:
	        print(f"[research] fetch failed for {url!r}: {e}")
	        return ""


	def research_goal(goal: str) -> dict:
	    """Run one bounded web-research pass for a long-term goal.

	    The goal is searched under two phrasings, hits are de-duplicated by URL (first sighting
	    wins), the first MAX_PAGES hits are fetched in full and up to four more contribute only
	    their search snippet.

	    Returns:
	        ``{"snippets": [str], "sources": [url]}`` as parallel lists. Each snippet is prefixed
	        with its page title. Both lists are empty for a blank goal or when nothing could be
	        retrieved.
	    """
	    topic = (goal or "").strip()
	    if topic == "":
	        return {"snippets": [], "sources": []}

	    # Merge the hits of both phrasings in order; dict insertion order preserves the ranking.
	    by_url: dict[str, dict] = {}
	    for phrasing in (f"how to {topic}", f"{topic} step by step plan"):
	        for hit in web_search(phrasing):
	            link = hit["url"]
	            if link and link not in by_url:
	                by_url[link] = hit
	    ranked = list(by_url.values())
	    fetched_tier = ranked[:MAX_PAGES]
	    snippet_tier = ranked[MAX_PAGES:MAX_PAGES + 4]

	    # (snippet, source) pairs keep the two output lists aligned by construction.
	    grounded: list[tuple[str, str]] = []

	    # The top hits get a full fetch; fall back to the search snippet if the fetch yields nothing.
	    for hit in fetched_tier:
	        text = fetch_text(hit["url"]) or hit["snippet"][:SNIPPET_CHARS]
	        if not text:
	            continue
	        grounded.append((f"{hit['title']}: {text}", hit["url"]))

	    # A few more hits contribute only their cheap search snippet.
	    for hit in snippet_tier:
	        if not hit["snippet"]:
	            continue
	        grounded.append((f"{hit['title']}: {hit['snippet'][:SNIPPET_CHARS]}", hit["url"]))

	    return {
	        "snippets": [snippet for snippet, _ in grounded],
	        "sources": [source for _, source in grounded],
	    }


	if __name__ == "__main__":  # live smoke test (network required)
	    # Research a sample goal and print one line per snippet, tagged with its source URL.
	    report = research_goal("run a 10k race")
	    found_snippets, found_sources = report["snippets"], report["sources"]
	    print(f"{len(found_snippets)} snippets / {len(found_sources)} sources")
	    for source, snippet in zip(found_sources, found_snippets):
	        print(f"- [{source}] {snippet[:100]}...")