# Web file-viewer page residue (not part of the module):
# "bbkdevops's picture" | download | raw | 11.8 kB
"""Live-web knowledge access for TinyMind.
This layer gives the model a disciplined path to external, current knowledge:
search, fetch, hash, cache, extract snippets, and answer only from evidence.
It is intentionally not a world-knowledge claim baked into weights.
"""
from __future__ import annotations
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
import argparse
import hashlib
import json
from pathlib import Path
import re
from typing import Any
from data.external_research import ExternalResearcher
# Tokens are runs of Unicode word characters and/or code points in the Thai
# block (U+0E00-U+0E7F).
TOKEN_RE = re.compile(r"[\w\u0E00-\u0E7F]+", re.UNICODE)

# A sentence is a run of characters that are not terminators (or newlines),
# followed by at most one terminator character.
# NOTE(review): "!?" appears twice in each character class; the second pair may
# originally have been fullwidth forms lost in transit - confirm upstream.
SENTENCE_RE = re.compile(r"[^.!?\n。!?]+[.!?。!?]?", re.UNICODE)

# Function words ignored when extracting terms, grouped by language.
_ENGLISH_STOPWORDS = (
    "the",
    "and",
    "for",
    "with",
    "that",
    "this",
    "what",
    "when",
    "where",
    "why",
    "how",
)
_THAI_STOPWORDS = (
    "คือ",
    "อะไร",
    "อย่างไร",
    "ของ",
    "และ",
    "ใน",
    "ที่",
    "ให้",
)
STOPWORDS = {*_ENGLISH_STOPWORDS, *_THAI_STOPWORDS}
def _sha256(text: str) -> str:
return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
def _terms(text: str) -> set[str]:
    """Collect the lowercase match terms found in ``text``.

    A token is kept as a term when it has at least two characters and is not
    a stopword. Independently of that filter, every token contributes each
    three-character window taken over its Thai characters (U+0E00-U+0E7F).
    """
    collected: set[str] = set()
    for raw in TOKEN_RE.findall(text):
        token = raw.lower()
        if len(token) >= 2 and token not in STOPWORDS:
            collected.add(token)
        # Only the Thai characters of the token take part in the trigrams;
        # fewer than three of them yields an empty range and no trigrams.
        thai_only = "".join(ch for ch in token if "\u0E00" <= ch <= "\u0E7F")
        for start in range(len(thai_only) - 2):
            collected.add(thai_only[start : start + 3])
    return collected
def _anchor_terms(text: str) -> set[str]:
    """Terms that must stay present to avoid grounding on generic search noise.

    An anchor is the lowercase form of a token that is at least four
    characters long, is not a stopword, and contains at least one ASCII
    letter or digit.
    """
    anchors: set[str] = set()
    for token in TOKEN_RE.findall(text):
        lowered = token.lower()
        if len(token) < 4 or lowered in STOPWORDS:
            continue
        if re.search(r"[A-Za-z0-9]", token):
            anchors.add(lowered)
    return anchors
def _sentences(text: str) -> list[str]:
    """Split ``text`` into whitespace-normalised sentences of usable length.

    Each ``SENTENCE_RE`` match has its whitespace runs collapsed to single
    spaces and is stripped; only results of 40 to 500 characters (inclusive)
    are returned, in document order.
    """
    normalised = (re.sub(r"\s+", " ", chunk).strip() for chunk in SENTENCE_RE.findall(text))
    return [sentence for sentence in normalised if 40 <= len(sentence) <= 500]
@dataclass
class WebEvidence:
    """One source that survived the relevance gates, ready for citation."""

    # URL of the source (empty string when the source had none).
    source_url: str
    # Source title, or the first characters of its text when missing.
    title: str
    # Digest supplied with the source, or computed from its text.
    sha256: str
    # Matched-term count divided by the number of question terms, rounded.
    score: float
    # Terms shared between the question and the source (truncated list).
    matched_terms: list[str]
    # Text excerpt quoted in the composed answer.
    snippet: str
    # Path of the file holding the cached source text.
    cache_path: str
class GeneralWebKnowledgeEngine:
    """Evidence-first live-web knowledge adapter for ordinary language QA.

    The engine asks a researcher for sources, keeps only those whose text
    overlaps the question, and writes a JSON report plus a one-record SFT
    file describing either a grounded answer or an explicit refusal.
    """

    def __init__(self, researcher: ExternalResearcher | None = None):
        """Store ``researcher``, creating a default one only when it is None."""
        # Compare against None explicitly so an injected researcher that
        # happens to be falsy is not silently swapped for the default.
        self.researcher = researcher if researcher is not None else ExternalResearcher()

    def answer(
        self,
        question: str,
        out_dir: str | Path,
        *,
        max_results: int = 6,
        top_k: int = 4,
        language: str = "auto",
    ) -> dict[str, Any]:
        """Research ``question`` and write a grounded answer or a refusal.

        Args:
            question: The user question to research.
            out_dir: Output directory; created if missing, together with a
                ``web_cache`` sub-directory.
            max_results: Forwarded to the researcher's ``research`` call.
            top_k: Maximum number of evidence items kept.
            language: ``"th"``, ``"en"`` or ``"auto"`` (Thai when the
                question contains Thai characters).

        Returns:
            The result dict, including ``report_path`` and ``sft_path`` of
            the files written.
        """
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        cache_dir = out / "web_cache"
        cache_dir.mkdir(exist_ok=True)

        report = self.researcher.research(question, out / "external_research", max_results=max_results)
        # ``or []`` also covers a report whose "sources" key is present but None.
        sources = report.get("sources") or []
        evidence = self._build_evidence(question, sources, cache_dir, top_k=top_k)

        result: dict[str, Any] = {
            "schema_version": "tinymind-general-web-knowledge-v1",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "question": question,
        }

        if not evidence:
            # The refusal follows the same language choice as a grounded
            # answer instead of always being Thai.
            if self._wants_thai(question, language):
                refusal = "ค้นเว็บแล้ว แต่ยังไม่พบหลักฐานที่ตรงและตรวจสอบได้พอ จึงไม่ควรสรุปเป็นข้อเท็จจริง"
            else:
                refusal = (
                    "Searched the web but found no sufficiently relevant, verifiable evidence, "
                    "so no factual conclusion should be drawn."
                )
            result.update(
                {
                    "status": "insufficient_evidence",
                    "answer": refusal,
                    "evidence": [],
                    "source_report": report.get("report_path"),
                    "claim_gate": {
                        "web_access_ready": True,
                        "answer_grounded": False,
                        "unsupported_answer_allowed": False,
                        "model_weight_knowledge_claim": False,
                        # Same key set as the grounded branch so consumers
                        # can read the gate without checking the status.
                        "world_best_claim_allowed": False,
                    },
                }
            )
            return self._write_outputs(result, out)

        result.update(
            {
                "status": "grounded",
                "answer": self._compose_answer(question, evidence, language=language),
                "evidence": [asdict(item) for item in evidence],
                "source_report": report.get("report_path"),
                "retrieval_policy": {
                    "steps": [
                        "search_web",
                        "fetch_pages",
                        "hash_sources",
                        "cache_raw_text",
                        "extract_supported_snippets",
                        "answer_with_citations",
                    ],
                    "top_k": top_k,
                    "max_results": max_results,
                },
                "claim_gate": {
                    "web_access_ready": True,
                    "answer_grounded": True,
                    "unsupported_answer_allowed": False,
                    "model_weight_knowledge_claim": False,
                    "world_best_claim_allowed": False,
                },
            }
        )
        return self._write_outputs(result, out)

    @staticmethod
    def _wants_thai(question: str, language: str) -> bool:
        """Return True when output should be Thai for this question/language."""
        if language == "th":
            return True
        return language == "auto" and re.search(r"[\u0E00-\u0E7F]", question) is not None

    def _build_evidence(
        self,
        question: str,
        sources: list[dict[str, Any]],
        cache_dir: Path,
        *,
        top_k: int,
    ) -> list[WebEvidence]:
        """Score ``sources`` against ``question`` and keep the best ``top_k``.

        A source is kept only if it has text, shares at least one term with
        the question (or brings its own ``matched_terms``), and - when the
        question has anchor terms - mentions an anchor both in the full text
        and in the chosen snippet. The text of each source that reaches the
        snippet stage is written to ``cache_dir``.
        """
        if not sources:
            return []

        q_terms = _terms(question)
        anchor_terms = _anchor_terms(question)
        evidence: list[WebEvidence] = []
        for source in sources:
            text = str(source.get("text") or "")
            if not text.strip():
                continue
            source_terms = _terms(text)
            if anchor_terms and anchor_terms.isdisjoint(source_terms):
                continue
            matched = sorted(q_terms & source_terms)
            if not matched:
                # Fall back to terms supplied with the source; coerce to str
                # because the snippet ranking lowercases every entry.
                matched = [str(term) for term in (source.get("matched_terms") or [])]
            score = len(set(matched)) / max(1, len(q_terms))
            if score <= 0 or (anchor_terms and anchor_terms.isdisjoint(matched)):
                continue
            digest = str(source.get("sha256") or _sha256(text))
            cache_path = cache_dir / f"{digest[:16]}.txt"
            # The text is cached before the snippet gate below, so a source
            # rejected there still leaves its cache file behind.
            cache_path.write_text(text, encoding="utf-8")
            snippet = self._snippet(question, text, matched)
            if anchor_terms and anchor_terms.isdisjoint(_terms(snippet)):
                continue
            evidence.append(
                WebEvidence(
                    source_url=str(source.get("url") or ""),
                    title=str(source.get("title") or text[:90]),
                    sha256=digest,
                    score=round(score, 6),
                    matched_terms=matched[:24],
                    snippet=snippet,
                    cache_path=str(cache_path),
                )
            )
        # Highest score first; URL breaks ties so the order is deterministic.
        evidence.sort(key=lambda item: (-item.score, item.source_url))
        # Clamp so a negative top_k yields nothing rather than "all but the last k".
        return evidence[: max(0, top_k)]

    def _snippet(self, question: str, text: str, matched: list[str]) -> str:
        """Return up to two best-matching sentences of ``text`` (max 900 chars).

        Sentences are ranked by overlap with ``matched``, with a bonus of 10
        per anchor term of ``question``; ties prefer the shorter sentence.
        When no sentence overlaps, the whitespace-normalised start of
        ``text`` is returned instead.
        """
        matched_set = {term.lower() for term in matched}
        anchor_set = _anchor_terms(question)
        ranked: list[tuple[int, str]] = []
        for sentence in _sentences(text):
            terms = _terms(sentence)
            overlap = len(terms & matched_set)
            if overlap:
                anchor_bonus = 10 * len(terms & anchor_set)
                ranked.append((anchor_bonus + overlap, sentence))
        ranked.sort(key=lambda pair: (-pair[0], len(pair[1])))
        if ranked:
            return " ".join(sentence for _, sentence in ranked[:2])[:900]
        return re.sub(r"\s+", " ", text).strip()[:900]

    def _compose_answer(self, question: str, evidence: list[WebEvidence], *, language: str) -> str:
        """Render ``evidence`` as a numbered, cited answer in Thai or English."""
        if self._wants_thai(question, language):
            header = "คำตอบจากหลักฐานเว็บล่าสุด:"
            label = "แหล่งที่มา"
            footer = "ข้อจำกัด: คำตอบนี้ยึดเฉพาะหลักฐานที่ fetch และ hash แล้ว ไม่ถือเป็นความรู้ที่ฝังอยู่ในน้ำหนักโมเดล"
        else:
            header = "Evidence-grounded answer from live web sources:"
            label = "source"
            footer = (
                "Limit: this answer is grounded in fetched, hashed evidence, "
                "not a claim that the model weights already knew it."
            )
        lines = [header]
        lines.extend(f"{i}. {item.snippet} [{label} {i}]" for i, item in enumerate(evidence, start=1))
        lines.append(footer)
        return "\n".join(lines)

    def _write_outputs(self, result: dict[str, Any], out: Path) -> dict[str, Any]:
        """Write the JSON report and the SFT record, then return ``result``.

        ``result`` is updated in place with ``report_path`` and ``sft_path``.
        Both files are overwritten on every call.
        """
        report_path = out / "general_web_knowledge_report.json"
        result["report_path"] = str(report_path)
        report_path.write_text(
            json.dumps(result, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )

        sft_record = {
            "source": "general_web_knowledge_live_evidence",
            "messages": [
                {
                    "role": "system",
                    "content": "When knowledge may be current or external, search web evidence first, cite sources, and refuse unsupported claims.",
                },
                {"role": "user", "content": str(result["question"])},
                {"role": "assistant", "content": str(result["answer"])},
            ],
            "metadata": {
                "status": result["status"],
                "evidence_count": len(result.get("evidence", [])),
                "report_path": str(report_path),
            },
        }
        sft_path = out / "general_web_knowledge_sft.jsonl"
        # newline="\n" keeps the JSONL line ending identical on every platform.
        with sft_path.open("w", encoding="utf-8", newline="\n") as handle:
            handle.write(json.dumps(sft_record, ensure_ascii=False, sort_keys=True) + "\n")
        result["sft_path"] = str(sft_path)
        return result
def write_general_web_knowledge(
    question: str,
    out_dir: str | Path,
    *,
    max_results: int = 6,
    top_k: int = 4,
    language: str = "auto",
    researcher: ExternalResearcher | None = None,
) -> dict[str, Any]:
    """Answer ``question`` with a freshly built ``GeneralWebKnowledgeEngine``.

    All keyword options are forwarded unchanged to the engine's ``answer``
    method; ``researcher`` is handed to the engine constructor. Returns
    whatever ``answer`` returns.
    """
    engine = GeneralWebKnowledgeEngine(researcher=researcher)
    options = {"max_results": max_results, "top_k": top_k, "language": language}
    return engine.answer(question, out_dir, **options)
def main() -> int:
    """Command-line entry point: answer one question and print a JSON summary.

    Parses the process arguments, runs ``write_general_web_knowledge`` and
    prints the report paths, status, evidence count and grounding flag.
    Returns 0.
    """
    cli = argparse.ArgumentParser(description="TinyMind live-web knowledge access with hash-verified evidence.")
    cli.add_argument("--question", required=True)
    cli.add_argument("--out-dir", default="reports/general_web_knowledge")
    for flag, default in (("--max-results", 6), ("--top-k", 4)):
        cli.add_argument(flag, type=int, default=default)
    cli.add_argument("--language", choices=["auto", "th", "en"], default="auto")
    opts = cli.parse_args()

    outcome = write_general_web_knowledge(
        opts.question,
        opts.out_dir,
        max_results=opts.max_results,
        top_k=opts.top_k,
        language=opts.language,
    )
    summary = {
        "report_path": outcome["report_path"],
        "sft_path": outcome["sft_path"],
        "status": outcome["status"],
        "evidence_count": len(outcome.get("evidence", [])),
        "answer_grounded": outcome["claim_gate"]["answer_grounded"],
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())

# Web file-viewer page residue (not part of the module):
# Xet Storage Details
# Size: 11.8 kB
# Xet hash: 4d0a336bf74e4f1d47d356310dc561745700252242b93897c7ced4fe387226cb
# Xet efficiently stores files, intelligently splitting them into unique chunks
# and accelerating uploads and downloads. More info.