Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /external_research.py

bbkdevops

about 1 month ago

download

raw

5.18 kB

	"""External research loop for source-grounded answers.

	This is a practical retrieval policy, not a claim of autonomous perfect RL.
	It searches the live web, fetches candidate pages, hashes evidence, and returns
	only source-backed snippets that downstream answer guards may use.
	"""

	from __future__ import annotations

	from dataclasses import dataclass
	from datetime import datetime, timezone
	from html.parser import HTMLParser
	import hashlib
	import json
	from pathlib import Path
	import re
	from typing import Iterable
	from urllib.parse import quote_plus, unquote, urlparse, parse_qs

	import httpx


	# Tokenizer used for query/page term-overlap scoring: runs of ``\w`` characters
	# plus anything in U+0E00-U+0E7F (the Unicode Thai block).
	# NOTE(review): the explicit Thai range presumably keeps Thai combining vowel and
	# tone marks (which ``\w`` alone does not match) inside a token -- confirm intent.
	TOKEN_RE = re.compile(r"[\w\u0E00-\u0E7F]+", re.UNICODE)


	def _sha256(text: str) -> str:
	return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()


	def _terms(text: str) -> set[str]:
	    """Return the distinct lower-cased tokens of *text* that have two or more characters.

	    Tokens are whatever ``TOKEN_RE`` matches; single-character matches are dropped.
	    """
	    found: set[str] = set()
	    for match in TOKEN_RE.finditer(text):
	        word = match.group(0)
	        if len(word) < 2:
	            continue
	        found.add(word.lower())
	    return found


	class _TextExtractor(HTMLParser):
	def __init__(self):
	super().__init__()
	self.parts: list[str] = []
	self.links: list[str] = []
	self._skip = False

	def handle_starttag(self, tag, attrs):
	if tag in {"script", "style", "noscript"}:
	self._skip = True
	if tag == "a":
	attrs_dict = dict(attrs)
	href = attrs_dict.get("href")
	if href:
	self.links.append(href)

	def handle_endtag(self, tag):
	if tag in {"script", "style", "noscript"}:
	self._skip = False

	def handle_data(self, data):
	if not self._skip and data.strip():
	self.parts.append(data.strip())

	@property
	def text(self) -> str:
	return re.sub(r"\s+", " ", " ".join(self.parts)).strip()


	@dataclass
	class ResearchSource:
	    """One fetched page that shares at least one term with the research query."""

	    # Address the page was requested from.
	    url: str
	    # Short label; the producer in this file fills it with the leading page text.
	    title: str
	    # Extracted (and truncated) page text that was scored.
	    text: str
	    # Hex SHA-256 digest of ``text``.
	    sha256: str
	    # Query terms that were also found in ``text``.
	    matched_terms: list[str]
	    # Fraction of query terms that matched.
	    score: float


	class ExternalResearcher:
	    """Search the web for a query, fetch the hits, and keep only term-matching text.

	    The instance can be used as a context manager; on exit (or on ``close()``)
	    it closes the HTTP client, but only if it created that client itself.
	    """

	    def __init__(self, client: httpx.Client | None = None, timeout_s: float = 15.0):
	        """Use *client* if given, otherwise build one with *timeout_s* and redirects enabled."""
	        # An injected client stays owned by the caller and is never closed here.
	        self._owns_client = client is None
	        self.client = client if client is not None else httpx.Client(
	            timeout=timeout_s,
	            follow_redirects=True,
	            headers={"User-Agent": "TinyMindResearch/1.0 source-grounded evaluation"},
	        )

	    def close(self) -> None:
	        """Close the HTTP client if this instance created it."""
	        if self._owns_client:
	            self.client.close()

	    def __enter__(self) -> "ExternalResearcher":
	        return self

	    def __exit__(self, exc_type, exc, tb) -> None:
	        self.close()

	    def search_urls(self, query: str, max_results: int = 5) -> list[str]:
	        """Return up to *max_results* distinct result URLs from DuckDuckGo's HTML search.

	        Redirect links (path ``/l/`` with a ``uddg`` parameter) are unwrapped to
	        their target. Only http(s) URLs whose host is not duckduckgo.com are
	        kept, in page order.

	        Raises:
	            httpx.HTTPStatusError: via ``raise_for_status()`` on an error response.
	        """
	        if max_results <= 0:
	            return []
	        url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
	        resp = self.client.get(url)
	        resp.raise_for_status()
	        parser = _TextExtractor()
	        parser.feed(resp.text)
	        # A dict keeps insertion order and de-duplicates while collecting, so
	        # repeated anchors for one result do not use up the max_results budget.
	        unique: dict[str, None] = {}
	        for href in parser.links:
	            try:
	                parsed = urlparse(href)
	                if parsed.path == "/l/":
	                    # parse_qs already percent-decodes the value; decoding it a
	                    # second time would corrupt targets containing literal %xx.
	                    href = parse_qs(parsed.query).get("uddg", [""])[0]
	                    parsed = urlparse(href)
	                host = (parsed.hostname or "").lower()
	            except ValueError:
	                # Malformed href (e.g. a broken IPv6 literal): skip it.
	                continue
	            if parsed.scheme not in ("http", "https") or not host:
	                continue
	            # Compare the host, not the whole URL, so unrelated pages that merely
	            # mention duckduckgo.com in their path or query are not dropped.
	            if host == "duckduckgo.com" or host.endswith(".duckduckgo.com"):
	                continue
	            unique.setdefault(href)
	            if len(unique) >= max_results:
	                break
	        return list(unique)

	    def fetch_source(self, url: str, query: str) -> ResearchSource | None:
	        """Fetch *url* and return it as a ResearchSource if it shares terms with *query*.

	        HTML responses are reduced to their extracted text; other responses are
	        used as-is. The text is truncated to 12000 characters before scoring.
	        The score is the fraction of query terms present in that text.

	        Returns:
	            The source, or ``None`` when the query has no usable terms, the
	            request fails, or no query term occurs in the text.
	        """
	        q_terms = _terms(query)
	        if not q_terms:
	            # The score would be 0 whatever the page says; skip the request.
	            return None
	        try:
	            resp = self.client.get(url)
	            resp.raise_for_status()
	            body = resp.text
	        except Exception:
	            # Deliberate best-effort: one unreachable or broken source must not
	            # abort the whole research run.
	            return None
	        content_type = resp.headers.get("content-type", "")
	        if "text/html" in content_type or "<html" in body[:500].lower():
	            parser = _TextExtractor()
	            parser.feed(body)
	            # Flush any trailing text the parser is still buffering.
	            parser.close()
	            text = parser.text
	        else:
	            text = body
	        text = text[:12000]
	        matched = sorted(q_terms & _terms(text))
	        if not matched:
	            return None
	        score = len(matched) / len(q_terms)
	        # No separate title is extracted; the leading text stands in for it.
	        title = text[:90]
	        return ResearchSource(
	            url=url,
	            title=title,
	            text=text,
	            sha256=_sha256(text),
	            matched_terms=matched,
	            score=score,
	        )

	    def research(self, query: str, out_dir: str | Path, max_results: int = 5) -> dict:
	        """Run search and fetch for *query* and write a JSON report into *out_dir*.

	        The directory is created if missing. Sources are sorted by descending
	        score. The report is written to ``external_research_report.json`` and
	        also returned, with the file location under ``report_path``.

	        Raises:
	            httpx.HTTPStatusError: if the search request returns an error status.
	        """
	        out = Path(out_dir)
	        out.mkdir(parents=True, exist_ok=True)
	        urls = self.search_urls(query, max_results=max_results)
	        fetched = (self.fetch_source(url, query) for url in urls)
	        sources = [source for source in fetched if source is not None]
	        sources.sort(key=lambda src: src.score, reverse=True)
	        path = out / "external_research_report.json"
	        report = {
	            "schema_version": "tinymind-external-research-v1",
	            "created_at": datetime.now(timezone.utc).isoformat(),
	            "query": query,
	            "policy": "search_live_web_fetch_hash_verify_then_answer_only_from_sources",
	            "deep_rl_policy_proxy": {
	                "state": "missing_or_insufficient_internal_evidence",
	                "actions": ["search", "fetch", "hash", "score_overlap", "answer_or_refuse"],
	                "reward": "maximize grounded evidence coverage and minimize unsupported claims",
	            },
	            "searched_urls": urls,
	            "sources": [vars(source) for source in sources],
	            "source_count": len(sources),
	            "report_path": str(path),
	        }
	        path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	        return report

Xet Storage Details

Size: 5.18 kB
Xet hash: 3a5444c75721f674bafca4a0d4021d63ede6a03acefca991183c4b22f8e3e6d3

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.