Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /general_web_knowledge.py

bbkdevops

29 days ago

download

raw

11.8 kB

	"""Live-web knowledge access for TinyMind.

	This layer gives the model a disciplined path to external, current knowledge:
	search, fetch, hash, cache, extract snippets, and answer only from evidence.
	It is intentionally not a world-knowledge claim baked into weights.
	"""

	from __future__ import annotations

	from dataclasses import asdict, dataclass
	from datetime import datetime, timezone
	import argparse
	import hashlib
	import json
	from pathlib import Path
	import re
	from typing import Any

	from data.external_research import ExternalResearcher


	# Word tokens: runs of ``\w`` characters plus anything in the Thai Unicode
	# block (U+0E00-U+0E7F), so Thai combining marks stay attached to their word.
	TOKEN_RE = re.compile(r"[\w\u0E00-\u0E7F]+", re.UNICODE)

	# Sentence candidates: a run of non-terminator characters followed by at most
	# one terminator (ASCII or full-width). A newline also ends a run.
	SENTENCE_RE = re.compile(r"[^.!?\n。！？]+[.!?。！？]?", re.UNICODE)

	# Function words that carry no retrieval signal and are skipped when terms are
	# compared. Entries are lowercase because tokens are lowercased before lookup.
	STOPWORDS = {
	    # English
	    "the", "and", "for", "with", "that", "this",
	    "what", "when", "where", "why", "how",
	    # Thai
	    "คือ", "อะไร", "อย่างไร", "ของ", "และ", "ใน", "ที่", "ให้",
	}


	def _sha256(text: str) -> str:
	return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()


	def _terms(text: str) -> set[str]:
	    """Return the lowercase search terms found in *text*.

	    A token is kept whole when it has at least two characters and is not a
	    stopword. A token containing Thai characters additionally contributes
	    every 3-character window over its Thai characters, which gives partial
	    matches inside long unsegmented Thai runs.

	    NOTE(review): the text this was restored from had lost its indentation;
	    the trigram step is assumed to run for every Thai token, not only for
	    tokens that passed the stopword filter - confirm against the original.
	    """
	    found: set[str] = set()
	    for raw in TOKEN_RE.findall(text):
	        word = raw.lower()
	        if len(word) >= 2 and word not in STOPWORDS:
	            found.add(word)
	        if not re.search(r"[\u0E00-\u0E7F]", word):
	            continue
	        # Only the Thai characters take part in the sliding window.
	        thai = "".join(ch for ch in word if "\u0E00" <= ch <= "\u0E7F")
	        for start in range(len(thai) - 2):
	            found.add(thai[start:start + 3])
	    return found


	def _anchor_terms(text: str) -> set[str]:
	    """Return the anchor terms of *text*.

	    An anchor is a token of four or more characters that is not a stopword
	    and contains at least one ASCII letter or digit; it is returned
	    lowercased. Anchors are the terms that must stay present so that an
	    answer is not grounded on generic search noise.
	    """
	    kept: set[str] = set()
	    for raw in TOKEN_RE.findall(text):
	        # The length test applies to the token as found, before lowercasing.
	        if len(raw) < 4:
	            continue
	        folded = raw.lower()
	        if folded in STOPWORDS or not re.search(r"[A-Za-z0-9]", raw):
	            continue
	        kept.add(folded)
	    return kept


	def _sentences(text: str) -> list[str]:
	    """Return the sentence candidates of *text* that are 40-500 characters long.

	    Each candidate matched by ``SENTENCE_RE`` has its whitespace runs
	    collapsed to single spaces and is stripped before its length is checked.
	    Document order is preserved.
	    """
	    normalized = (
	        re.sub(r"\s+", " ", raw).strip()
	        for raw in SENTENCE_RE.findall(text)
	    )
	    return [candidate for candidate in normalized if 40 <= len(candidate) <= 500]


	@dataclass
	class WebEvidence:
	    """One web source retained as supporting evidence for an answer.

	    Instances are serialised with ``dataclasses.asdict``, so field names and
	    their order are part of the report format.
	    """

	    # Where the text came from.
	    source_url: str
	    # Human-readable label for the source.
	    title: str
	    # Hex digest recorded for the source.
	    sha256: str
	    # Relevance score; higher sorts first.
	    score: float
	    # Terms shared between the question and the source.
	    matched_terms: list[str]
	    # Text excerpt quoted in the answer.
	    snippet: str
	    # Location of the cached source text on disk, as a string.
	    cache_path: str


	class GeneralWebKnowledgeEngine:
	    """Evidence-first live-web knowledge adapter for ordinary language QA.

	    ``answer`` asks the injected researcher for sources, keeps only sources
	    that share terms with the question, caches their text on disk, and builds
	    an answer made solely of snippets extracted from those sources. Every
	    call also writes a JSON report and a one-sample SFT JSONL file.
	    """

	    # Answers returned when no source survives filtering.
	    _NO_EVIDENCE_TH = "ค้นเว็บแล้ว แต่ยังไม่พบหลักฐานที่ตรงและตรวจสอบได้พอ จึงไม่ควรสรุปเป็นข้อเท็จจริง"
	    _NO_EVIDENCE_EN = (
	        "The web was searched, but no sufficiently relevant and verifiable evidence was found, "
	        "so no factual conclusion should be drawn."
	    )

	    def __init__(self, researcher: ExternalResearcher | None = None):
	        """Store *researcher*, creating a default ``ExternalResearcher`` when it is None."""
	        # ``is not None`` keeps an injected researcher even if it happens to be falsy.
	        self.researcher = researcher if researcher is not None else ExternalResearcher()

	    def answer(
	        self,
	        question: str,
	        out_dir: str | Path,
	        *,
	        max_results: int = 6,
	        top_k: int = 4,
	        language: str = "auto",
	    ) -> dict[str, Any]:
	        """Research *question* and return an evidence-grounded result dict.

	        Args:
	            question: The user question to research.
	            out_dir: Directory for the report, the SFT sample and the
	                ``web_cache`` sub-directory; created if missing.
	            max_results: Forwarded to the researcher's ``research`` call.
	            top_k: Maximum number of evidence items kept.
	            language: ``"th"``, ``"en"`` or ``"auto"``; ``"auto"`` answers in
	                Thai when the question contains Thai characters.

	        Returns:
	            The result dict, with ``status`` set to ``"grounded"`` or
	            ``"insufficient_evidence"`` and with ``report_path`` and
	            ``sft_path`` pointing at the files that were written.
	        """
	        out = Path(out_dir)
	        out.mkdir(parents=True, exist_ok=True)
	        cache_dir = out / "web_cache"
	        cache_dir.mkdir(exist_ok=True)

	        report = self.researcher.research(question, out / "external_research", max_results=max_results)
	        # Tolerate a report whose "sources" value is missing or None.
	        evidence = self._build_evidence(question, report.get("sources") or [], cache_dir, top_k=top_k)
	        if not evidence:
	            # The refusal follows the same language rule as a grounded answer.
	            refusal = self._NO_EVIDENCE_TH if self._wants_thai(question, language) else self._NO_EVIDENCE_EN
	            result = {
	                "schema_version": "tinymind-general-web-knowledge-v1",
	                "created_at": datetime.now(timezone.utc).isoformat(),
	                "question": question,
	                "status": "insufficient_evidence",
	                "answer": refusal,
	                "evidence": [],
	                "source_report": report.get("report_path"),
	                "claim_gate": {
	                    "web_access_ready": True,
	                    "answer_grounded": False,
	                    "unsupported_answer_allowed": False,
	                    "model_weight_knowledge_claim": False,
	                },
	            }
	            return self._write_outputs(result, out)

	        answer = self._compose_answer(question, evidence, language=language)
	        result = {
	            "schema_version": "tinymind-general-web-knowledge-v1",
	            "created_at": datetime.now(timezone.utc).isoformat(),
	            "question": question,
	            "status": "grounded",
	            "answer": answer,
	            "evidence": [asdict(item) for item in evidence],
	            "source_report": report.get("report_path"),
	            "retrieval_policy": {
	                "steps": ["search_web", "fetch_pages", "hash_sources", "cache_raw_text", "extract_supported_snippets", "answer_with_citations"],
	                "top_k": top_k,
	                "max_results": max_results,
	            },
	            "claim_gate": {
	                "web_access_ready": True,
	                "answer_grounded": True,
	                "unsupported_answer_allowed": False,
	                "model_weight_knowledge_claim": False,
	                "world_best_claim_allowed": False,
	            },
	        }
	        return self._write_outputs(result, out)

	    @staticmethod
	    def _wants_thai(question: str, language: str) -> bool:
	        """Return True when the answer text should be written in Thai."""
	        if language == "th":
	            return True
	        return language == "auto" and bool(re.search(r"[\u0E00-\u0E7F]", question))

	    def _build_evidence(self, question: str, sources: list[dict[str, Any]], cache_dir: Path, *, top_k: int) -> list[WebEvidence]:
	        """Score *sources* against *question* and return the best ``top_k``.

	        A source is dropped when it has no text, shares no term with the
	        question, or - when the question has anchor terms - lacks an anchor in
	        its text, its matched terms or its chosen snippet. The text of every
	        source that reaches the scoring stage is written to *cache_dir*.

	        Returns:
	            Evidence sorted by descending score, ties broken by URL.
	        """
	        if not sources:
	            # Nothing to score, so skip tokenising the question.
	            return []
	        q_terms = _terms(question)
	        anchor_terms = _anchor_terms(question)
	        evidence: list[WebEvidence] = []
	        for source in sources:
	            text = str(source.get("text") or "")
	            if not text.strip():
	                continue
	            source_terms = _terms(text)
	            if anchor_terms and not (anchor_terms & source_terms):
	                continue
	            matched = sorted(q_terms & source_terms)
	            if not matched:
	                # Fall back to the terms reported with the source; coerce them
	                # to str because later steps call str methods on each term.
	                matched = [str(term) for term in (source.get("matched_terms") or [])]
	            score = len(set(matched)) / max(1, len(q_terms))
	            if score <= 0 or (anchor_terms and not (anchor_terms & set(matched))):
	                continue
	            digest = str(source.get("sha256") or _sha256(text))
	            # The digest may come from the source dict, so never let it inject
	            # path separators into the cache file name.
	            stem = digest[:16]
	            if not stem.isalnum():
	                stem = _sha256(text)[:16]
	            cache_path = cache_dir / f"{stem}.txt"
	            cache_path.write_text(text, encoding="utf-8")
	            snippet = self._snippet(question, text, matched)
	            if anchor_terms and not (anchor_terms & _terms(snippet)):
	                continue
	            evidence.append(
	                WebEvidence(
	                    source_url=str(source.get("url") or ""),
	                    title=str(source.get("title") or text[:90]),
	                    sha256=digest,
	                    score=round(score, 6),
	                    matched_terms=matched[:24],
	                    snippet=snippet,
	                    cache_path=str(cache_path),
	                )
	            )
	        evidence.sort(key=lambda item: (-item.score, item.source_url))
	        # Clamp so a negative top_k cannot slice items off the tail instead.
	        return evidence[:max(0, top_k)]

	    def _snippet(self, question: str, text: str, matched: list[str]) -> str:
	        """Return up to 900 characters of *text* that best cover *matched*.

	        Sentences are ranked by the number of matched terms they contain, with
	        ten extra points per anchor term of *question*; shorter sentences win
	        ties. The two best are joined with a space. When no sentence contains
	        a matched term, the whitespace-normalised start of *text* is returned.
	        """
	        matched_set = {m.lower() for m in matched}
	        anchor_set = _anchor_terms(question)
	        ranked: list[tuple[int, str]] = []
	        for sentence in _sentences(text):
	            terms = _terms(sentence)
	            overlap = len(terms & matched_set)
	            if overlap:
	                anchor_bonus = 10 * len(terms & anchor_set)
	                ranked.append((anchor_bonus + overlap, sentence))
	        ranked.sort(key=lambda item: (-item[0], len(item[1])))
	        if ranked:
	            return " ".join(sentence for _, sentence in ranked[:2])[:900]
	        return re.sub(r"\s+", " ", text).strip()[:900]

	    def _compose_answer(self, question: str, evidence: list[WebEvidence], *, language: str) -> str:
	        """Render *evidence* as a numbered, source-tagged answer.

	        The text is Thai when ``language`` is ``"th"``, or when it is
	        ``"auto"`` and *question* contains Thai characters; otherwise English.
	        Item ``i`` is tagged with source number ``i`` (1-based).
	        """
	        if self._wants_thai(question, language):
	            header = "คำตอบจากหลักฐานเว็บล่าสุด:"
	            label = "แหล่งที่มา"
	            footer = "ข้อจำกัด: คำตอบนี้ยึดเฉพาะหลักฐานที่ fetch และ hash แล้ว ไม่ถือเป็นความรู้ที่ฝังอยู่ในน้ำหนักโมเดล"
	        else:
	            header = "Evidence-grounded answer from live web sources:"
	            label = "source"
	            footer = "Limit: this answer is grounded in fetched, hashed evidence, not a claim that the model weights already knew it."
	        lines = [header]
	        lines.extend(f"{i}. {item.snippet} [{label} {i}]" for i, item in enumerate(evidence, start=1))
	        lines.append(footer)
	        return "\n".join(lines)

	    def _write_outputs(self, result: dict[str, Any], out: Path) -> dict[str, Any]:
	        """Write the JSON report and the SFT sample for *result* into *out*.

	        *result* is updated in place with ``report_path`` and ``sft_path`` and
	        is also returned. ``sft_path`` is added after the report is written,
	        so the report file itself does not contain that key. Both files are
	        overwritten on every call.
	        """
	        report_path = out / "general_web_knowledge_report.json"
	        result["report_path"] = str(report_path)
	        report_path.write_text(json.dumps(result, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")

	        sft_path = out / "general_web_knowledge_sft.jsonl"
	        with sft_path.open("w", encoding="utf-8", newline="\n") as f:
	            f.write(
	                json.dumps(
	                    {
	                        "source": "general_web_knowledge_live_evidence",
	                        "messages": [
	                            {
	                                "role": "system",
	                                "content": "When knowledge may be current or external, search web evidence first, cite sources, and refuse unsupported claims.",
	                            },
	                            {"role": "user", "content": str(result["question"])},
	                            {"role": "assistant", "content": str(result["answer"])},
	                        ],
	                        "metadata": {
	                            "status": result["status"],
	                            "evidence_count": len(result.get("evidence", [])),
	                            "report_path": str(report_path),
	                        },
	                    },
	                    ensure_ascii=False,
	                    sort_keys=True,
	                )
	                + "\n"
	            )
	        result["sft_path"] = str(sft_path)
	        return result


	def write_general_web_knowledge(
	    question: str,
	    out_dir: str | Path,
	    *,
	    max_results: int = 6,
	    top_k: int = 4,
	    language: str = "auto",
	    researcher: ExternalResearcher | None = None,
	) -> dict[str, Any]:
	    """Answer *question* with a freshly built ``GeneralWebKnowledgeEngine``.

	    Convenience wrapper: constructs the engine with *researcher* and forwards
	    every other argument unchanged to its ``answer`` method.

	    Args:
	        question: The question to research.
	        out_dir: Output directory handed to the engine.
	        max_results: Forwarded to ``answer``.
	        top_k: Forwarded to ``answer``.
	        language: Forwarded to ``answer``.
	        researcher: Optional researcher passed to the engine constructor.

	    Returns:
	        The result dict returned by ``GeneralWebKnowledgeEngine.answer``.
	    """
	    engine = GeneralWebKnowledgeEngine(researcher=researcher)
	    return engine.answer(
	        question,
	        out_dir,
	        max_results=max_results,
	        top_k=top_k,
	        language=language,
	    )


	def main() -> int:
	    """Command-line entry point: answer one question and print a JSON summary.

	    Reads ``--question`` (required), ``--out-dir``, ``--max-results``,
	    ``--top-k`` and ``--language`` from the command line, runs
	    ``write_general_web_knowledge`` and prints the report path, SFT path,
	    status, evidence count and grounding flag.

	    Returns:
	        Always 0.
	    """
	    cli = argparse.ArgumentParser(description="TinyMind live-web knowledge access with hash-verified evidence.")
	    cli.add_argument("--question", required=True)
	    cli.add_argument("--out-dir", default="reports/general_web_knowledge")
	    for flag, fallback in (("--max-results", 6), ("--top-k", 4)):
	        cli.add_argument(flag, type=int, default=fallback)
	    cli.add_argument("--language", choices=["auto", "th", "en"], default="auto")
	    opts = cli.parse_args()

	    outcome = write_general_web_knowledge(
	        opts.question,
	        opts.out_dir,
	        max_results=opts.max_results,
	        top_k=opts.top_k,
	        language=opts.language,
	    )

	    # Only a compact summary is printed; the full result lives in the report file.
	    summary = {
	        "report_path": outcome["report_path"],
	        "sft_path": outcome["sft_path"],
	        "status": outcome["status"],
	        "evidence_count": len(outcome.get("evidence", [])),
	        "answer_grounded": outcome["claim_gate"]["answer_grounded"],
	    }
	    print(json.dumps(summary, ensure_ascii=False, indent=2))
	    return 0


	if __name__ == "__main__":
	    # Use main()'s return value as the process exit status.
	    raise SystemExit(main())

Xet Storage Details

Size:: 11.8 kB
Xet hash:: 4d0a336bf74e4f1d47d356310dc561745700252242b93897c7ced4fe387226cb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.