bbkdevops's picture
download
raw
5.18 kB
"""External research loop for source-grounded answers.
This is a practical retrieval policy, not a claim of autonomous perfect RL.
It searches the live web, fetches candidate pages, hashes evidence, and returns
only source-backed snippets that downstream answer guards may use.
"""
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from html.parser import HTMLParser
import hashlib
import json
from pathlib import Path
import re
from typing import Iterable
from urllib.parse import quote_plus, unquote, urlparse, parse_qs
import httpx
# Tokenizer used by _terms(): matches maximal runs of word characters plus any
# code point in U+0E00-U+0E7F (the Unicode Thai block).
# NOTE(review): the explicit Thai range presumably exists to keep Thai
# combining marks inside a token, since \w alone may not match them - confirm.
TOKEN_RE = re.compile(r"[\w\u0E00-\u0E7F]+", re.UNICODE)
def _sha256(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8.

    Characters that cannot be encoded (for example lone surrogates) are
    substituted instead of raising, so hashing never fails on odd input.
    """
    encoded = text.encode("utf-8", errors="replace")
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()
def _terms(text: str) -> set[str]:
    """Return the distinct lower-cased tokens of *text*.

    Tokens are the matches of ``TOKEN_RE``; matches shorter than two
    characters are dropped before lower-casing.
    """
    found: set[str] = set()
    for match in TOKEN_RE.finditer(text):
        token = match.group(0)
        if len(token) < 2:
            continue
        found.add(token.lower())
    return found
class _TextExtractor(HTMLParser):
    """Collect visible text and anchor hrefs from an HTML document.

    Feed markup with ``feed()``; afterwards ``links`` holds every non-empty
    ``href`` of an ``<a>`` tag in document order and ``text`` holds the
    whitespace-normalised text. Text inside ``<script>``, ``<style>`` and
    ``<noscript>`` elements is ignored.
    """

    # Elements whose text content must never reach ``parts``.
    _SKIPPED_TAGS = frozenset({"script", "style", "noscript"})

    def __init__(self) -> None:
        super().__init__()
        self.parts: list[str] = []
        self.links: list[str] = []
        # A depth counter instead of a boolean: with a flag, the closing tag
        # of an inner element (e.g. </style> inside <noscript>) re-enabled
        # capture while the outer skipped element was still open.
        self._skip_depth = 0

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Track entry into skipped elements and record anchor hrefs."""
        if tag in self._SKIPPED_TAGS:
            self._skip_depth += 1
        if tag == "a":
            href = dict(attrs).get("href")
            if href:
                self.links.append(href)

    def handle_endtag(self, tag: str) -> None:
        """Track exit from skipped elements."""
        # Clamp at zero so a stray closing tag cannot push the depth negative
        # and invert the skip logic for the rest of the document.
        if tag in self._SKIPPED_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data: str) -> None:
        """Store stripped, non-empty text found outside skipped elements."""
        if self._skip_depth:
            return
        stripped = data.strip()
        if stripped:
            self.parts.append(stripped)

    @property
    def text(self) -> str:
        """All collected text joined with single spaces."""
        return re.sub(r"\s+", " ", " ".join(self.parts)).strip()
@dataclass
class ResearchSource:
    """One fetched page kept as evidence for a research query.

    The field notes describe how ``ExternalResearcher.fetch_source`` fills
    the record; other producers may populate it differently.
    """

    # Address the page was requested from.
    url: str
    # Short label; fetch_source uses the leading characters of ``text``.
    title: str
    # Extracted (and length-capped) page text.
    text: str
    # Hex SHA-256 digest of ``text``.
    sha256: str
    # Sorted query terms that also occur in ``text``.
    matched_terms: list[str]
    # Fraction of query terms found in ``text``.
    score: float
class ExternalResearcher:
    """Search the live web, fetch candidate pages, and keep overlapping ones.

    The researcher queries the DuckDuckGo HTML endpoint, downloads each
    result, scores it by query-term overlap, and writes a JSON report.
    It can be used as a context manager; an HTTP client created by the
    researcher itself is closed on exit, a caller-supplied one is not.
    """

    def __init__(self, client: httpx.Client | None = None, timeout_s: float = 15.0):
        """Store *client*, or build a redirect-following one with *timeout_s*."""
        # Remember ownership so close() never shuts down a client that the
        # caller passed in and may still be using.
        self._owns_client = client is None
        if client is None:
            client = httpx.Client(
                timeout=timeout_s,
                follow_redirects=True,
                headers={"User-Agent": "TinyMindResearch/1.0 source-grounded evaluation"},
            )
        self.client = client

    def close(self) -> None:
        """Close the HTTP client if this instance created it."""
        if getattr(self, "_owns_client", False):
            self.client.close()

    def __enter__(self) -> ExternalResearcher:
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    @staticmethod
    def _resolve_result_link(href: str) -> str | None:
        """Return the external http(s) URL behind a result link, else None."""
        try:
            parsed = urlparse(href)
            if parsed.path == "/l/":
                # Redirect link: the real target is in the ``uddg`` parameter.
                # parse_qs already percent-decodes it; decoding a second time
                # would corrupt targets that contain a literal "%25".
                href = parse_qs(parsed.query).get("uddg", [""])[0]
                parsed = urlparse(href)
            host = (parsed.hostname or "").lower()
        except ValueError:
            # A single malformed link must not abort the whole search.
            return None
        if parsed.scheme not in {"http", "https"} or not host:
            return None
        # Compare the host, not the whole URL, so results that merely mention
        # the search engine in their path or query are kept.
        if host == "duckduckgo.com" or host.endswith(".duckduckgo.com"):
            return None
        return href

    def search_urls(self, query: str, max_results: int = 5) -> list[str]:
        """Return up to *max_results* unique result URLs for *query*.

        Raises whatever the client raises for a failed search request,
        including the error from ``raise_for_status()``.
        """
        if max_results <= 0:
            return []
        url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
        resp = self.client.get(url)
        resp.raise_for_status()
        parser = _TextExtractor()
        parser.feed(resp.text)
        # A dict keeps first-seen order and de-duplicates while collecting,
        # so the cap counts unique URLs rather than raw links.
        unique: dict[str, None] = {}
        for href in parser.links:
            target = self._resolve_result_link(href)
            if target is None:
                continue
            unique.setdefault(target, None)
            if len(unique) >= max_results:
                break
        return list(unique)

    def fetch_source(self, url: str, query: str) -> ResearchSource | None:
        """Fetch *url* and return it as a source if it overlaps *query*.

        Returns None when the request fails or when no query term occurs in
        the (length-capped) page text.
        """
        try:
            resp = self.client.get(url)
            resp.raise_for_status()
        except Exception:
            # Deliberate best-effort: one unreachable candidate page should
            # not fail the whole research run.
            return None
        content_type = resp.headers.get("content-type", "").lower()
        if "text/html" in content_type or "<html" in resp.text[:500].lower():
            parser = _TextExtractor()
            parser.feed(resp.text)
            text = parser.text
        else:
            text = resp.text
        text = text[:12000]
        q_terms = _terms(query)
        matched = sorted(q_terms & _terms(text))
        # max(..., 1) avoids dividing by zero for a query without tokens.
        score = len(matched) / max(len(q_terms), 1)
        if score <= 0:
            return None
        title = text[:90]
        return ResearchSource(
            url=url,
            title=title,
            text=text,
            sha256=_sha256(text),
            matched_terms=matched,
            score=score,
        )

    def research(self, query: str, out_dir: str | Path, max_results: int = 5) -> dict:
        """Run search and fetch for *query*, write the report, and return it.

        The report is written to ``external_research_report.json`` inside
        *out_dir* (created if missing); sources are ordered by descending
        score.
        """
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        urls = self.search_urls(query, max_results=max_results)
        sources: list[ResearchSource] = []
        for url in urls:
            source = self.fetch_source(url, query)
            if source is not None:
                sources.append(source)
        sources.sort(key=lambda src: src.score, reverse=True)
        report = {
            "schema_version": "tinymind-external-research-v1",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "query": query,
            "policy": "search_live_web_fetch_hash_verify_then_answer_only_from_sources",
            "deep_rl_policy_proxy": {
                "state": "missing_or_insufficient_internal_evidence",
                "actions": ["search", "fetch", "hash", "score_overlap", "answer_or_refuse"],
                "reward": "maximize grounded evidence coverage and minimize unsupported claims",
            },
            "searched_urls": urls,
            "sources": [dict(vars(source)) for source in sources],
            "source_count": len(sources),
        }
        path = out / "external_research_report.json"
        report["report_path"] = str(path)
        path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
        return report

Xet Storage Details

Size:
5.18 kB
·
Xet hash:
3a5444c75721f674bafca4a0d4021d63ede6a03acefca991183c4b22f8e3e6d3

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.