Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /general_web_knowledge.py
| """Live-web knowledge access for TinyMind. | |
| This layer gives the model a disciplined path to external, current knowledge: | |
| search, fetch, hash, cache, extract snippets, and answer only from evidence. | |
| It is intentionally not a world-knowledge claim baked into weights. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import asdict, dataclass | |
| from datetime import datetime, timezone | |
| import argparse | |
| import hashlib | |
| import json | |
| from pathlib import Path | |
| import re | |
| from typing import Any | |
| from data.external_research import ExternalResearcher | |
# Matches runs of word characters or code points in the Thai block
# (U+0E00-U+0E7F).
TOKEN_RE = re.compile(r"[\w\u0E00-\u0E7F]+", re.UNICODE)

# Matches a run of non-terminator characters followed by at most one
# terminator (".", "!", "?", "。" or a newline boundary).
# NOTE(review): "!" and "?" appear twice in each character class; possibly
# full-width variants were intended in the second position - confirm.
SENTENCE_RE = re.compile(r"[^.!?\n。!?]+[.!?。!?]?", re.UNICODE)

# Lower-case tokens that are ignored when collecting terms.
STOPWORDS = {
    # English function and question words.
    "the", "and", "for", "with", "that", "this",
    "what", "when", "where", "why", "how",
    # Thai function and question words.
    "คือ", "อะไร", "อย่างไร", "ของ", "และ", "ใน", "ที่", "ให้",
}
| def _sha256(text: str) -> str: | |
| return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() | |
def _terms(text: str) -> set[str]:
    """Collect the lower-cased matching terms found in ``text``.

    Every ``TOKEN_RE`` match of length >= 2 that is not a stopword is kept.
    For any token containing Thai characters, each 3-character window over
    its Thai characters is added as well.
    """
    collected: set[str] = set()
    for raw in TOKEN_RE.findall(text):
        word = raw.lower()
        if len(word) >= 2 and word not in STOPWORDS:
            collected.add(word)
        if not re.search(r"[\u0E00-\u0E7F]", word):
            continue
        # Keep only the Thai code points, then slide a 3-character window.
        thai = [c for c in word if "\u0E00" <= c <= "\u0E7F"]
        for start in range(max(0, len(thai) - 2)):
            collected.add("".join(thai[start : start + 3]))
    return collected
def _anchor_terms(text: str) -> set[str]:
    """Terms that must stay present to avoid grounding on generic search noise.

    A token qualifies when it is at least 4 characters long, is not a
    stopword, and contains at least one ASCII letter or digit. Qualifying
    tokens are returned lower-cased.
    """
    found: set[str] = set()
    for raw in TOKEN_RE.findall(text):
        if len(raw) < 4:
            continue
        lowered = raw.lower()
        if lowered in STOPWORDS:
            continue
        if re.search(r"[A-Za-z0-9]", raw):
            found.add(lowered)
    return found
def _sentences(text: str) -> list[str]:
    """Split ``text`` with ``SENTENCE_RE`` and keep medium-length pieces.

    Each piece has whitespace runs collapsed to a single space and is
    stripped; only pieces of 40 to 500 characters (inclusive) are returned,
    in their original order.
    """
    normalized = (
        re.sub(r"\s+", " ", piece).strip()
        for piece in SENTENCE_RE.findall(text)
    )
    return [candidate for candidate in normalized if 40 <= len(candidate) <= 500]
@dataclass
class WebEvidence:
    """One scored piece of evidence taken from a fetched source.

    The class must be a dataclass: instances are created with keyword
    arguments and serialised with ``dataclasses.asdict``.
    """

    # URL of the source the evidence came from ("" when the source had none).
    source_url: str
    # Source title, or the leading characters of its text when no title exists.
    title: str
    # Hex digest identifying the source text.
    sha256: str
    # Share of question terms that matched the source, rounded by the producer.
    score: float
    # Matched terms (the producer truncates this list).
    matched_terms: list[str]
    # Extract of the source text chosen to support the answer.
    snippet: str
    # Filesystem path of the cached raw text.
    cache_path: str
class GeneralWebKnowledgeEngine:
    """Evidence-first live-web knowledge adapter for ordinary language QA.

    The engine asks a researcher object for sources, keeps only sources whose
    text overlaps the question terms, caches their raw text on disk, and
    composes an answer made solely of cited snippets.
    """

    def __init__(self, researcher: ExternalResearcher | None = None):
        """Store the research backend, creating a default one when omitted."""
        self.researcher = researcher or ExternalResearcher()

    def answer(
        self,
        question: str,
        out_dir: str | Path,
        *,
        max_results: int = 6,
        top_k: int = 4,
        language: str = "auto",
    ) -> dict[str, Any]:
        """Research ``question`` and write a grounded (or refusing) report.

        Args:
            question: The user question to research.
            out_dir: Directory for the report, SFT sample and ``web_cache``;
                created when missing.
            max_results: Forwarded to ``researcher.research``.
            top_k: Maximum number of evidence items kept.
            language: ``"th"``, ``"en"`` or ``"auto"`` (Thai is chosen when
                the question contains Thai characters).

        Returns:
            The result dict, extended with ``report_path`` and ``sft_path``.
            ``status`` is ``"grounded"`` when evidence was found and
            ``"insufficient_evidence"`` otherwise.
        """
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        cache_dir = out / "web_cache"
        cache_dir.mkdir(exist_ok=True)
        report = self.researcher.research(question, out / "external_research", max_results=max_results)
        # `or []` also covers a report whose "sources" key exists but is None.
        sources = report.get("sources") or []
        evidence = self._build_evidence(question, sources, cache_dir, top_k=top_k)
        if not evidence:
            result = {
                "schema_version": "tinymind-general-web-knowledge-v1",
                "created_at": datetime.now(timezone.utc).isoformat(),
                "question": question,
                "status": "insufficient_evidence",
                "answer": "ค้นเว็บแล้ว แต่ยังไม่พบหลักฐานที่ตรงและตรวจสอบได้พอ จึงไม่ควรสรุปเป็นข้อเท็จจริง",
                "evidence": [],
                "source_report": report.get("report_path"),
                "claim_gate": {
                    "web_access_ready": True,
                    "answer_grounded": False,
                    "unsupported_answer_allowed": False,
                    "model_weight_knowledge_claim": False,
                },
            }
            return self._write_outputs(result, out)
        answer = self._compose_answer(question, evidence, language=language)
        result = {
            "schema_version": "tinymind-general-web-knowledge-v1",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "question": question,
            "status": "grounded",
            "answer": answer,
            "evidence": [asdict(item) for item in evidence],
            "source_report": report.get("report_path"),
            "retrieval_policy": {
                "steps": ["search_web", "fetch_pages", "hash_sources", "cache_raw_text", "extract_supported_snippets", "answer_with_citations"],
                "top_k": top_k,
                "max_results": max_results,
            },
            "claim_gate": {
                "web_access_ready": True,
                "answer_grounded": True,
                "unsupported_answer_allowed": False,
                "model_weight_knowledge_claim": False,
                "world_best_claim_allowed": False,
            },
        }
        return self._write_outputs(result, out)

    def _build_evidence(self, question: str, sources: list[dict[str, Any]], cache_dir: Path, *, top_k: int) -> list[WebEvidence]:
        """Score ``sources`` against ``question`` and return the best ``top_k``.

        A source is dropped when its text is empty, when it shares no anchor
        term with the question (if the question has anchors), when no term
        matches, or when the chosen snippet lost every anchor term. The raw
        text of each source that passes the scoring gate is written to
        ``cache_dir``. Results are ordered by descending score, then URL.
        """
        q_terms = _terms(question)
        anchor_terms = _anchor_terms(question)
        evidence: list[WebEvidence] = []
        for source in sources:
            text = str(source.get("text") or "")
            if not text.strip():
                continue
            source_terms = _terms(text)
            if anchor_terms and not (anchor_terms & source_terms):
                continue
            matched = sorted(q_terms & source_terms)
            if not matched:
                # Fall back to whatever matches the source record itself carries.
                matched = list(source.get("matched_terms") or [])
            score = len(set(matched)) / max(1, len(q_terms))
            if score <= 0 or (anchor_terms and not (anchor_terms & set(matched))):
                continue
            digest = str(source.get("sha256") or _sha256(text))
            cache_path = cache_dir / f"{digest[:16]}.txt"
            # Same lossy encoding as _sha256: text containing unencodable
            # characters (e.g. lone surrogates) must not abort the run.
            cache_path.write_text(text, encoding="utf-8", errors="replace")
            snippet = self._snippet(question, text, matched)
            if anchor_terms and not (anchor_terms & _terms(snippet)):
                continue
            evidence.append(
                WebEvidence(
                    source_url=str(source.get("url") or ""),
                    title=str(source.get("title") or text[:90]),
                    sha256=digest,
                    score=round(score, 6),
                    matched_terms=matched[:24],
                    snippet=snippet,
                    cache_path=str(cache_path),
                )
            )
        evidence.sort(key=lambda item: (-item.score, item.source_url))
        return evidence[:top_k]

    def _snippet(self, question: str, text: str, matched: list[str]) -> str:
        """Pick up to two sentences of ``text`` that best cover ``matched``.

        Sentences are ranked by matched-term overlap plus a bonus of 10 per
        question anchor term; ties prefer the shorter sentence. When no
        sentence overlaps, the whitespace-normalised start of ``text`` is
        used. The result is capped at 900 characters.
        """
        matched_set = {m.lower() for m in matched}
        anchor_set = _anchor_terms(question)
        ranked: list[tuple[int, str]] = []
        for sentence in _sentences(text):
            terms = _terms(sentence)
            overlap = len(terms & matched_set)
            if overlap:
                anchor_bonus = 10 * len(terms & anchor_set)
                ranked.append((anchor_bonus + overlap, sentence))
        ranked.sort(key=lambda item: (-item[0], len(item[1])))
        if ranked:
            return " ".join(sentence for _, sentence in ranked[:2])[:900]
        return re.sub(r"\s+", " ", text).strip()[:900]

    def _compose_answer(self, question: str, evidence: list[WebEvidence], *, language: str) -> str:
        """Render the evidence snippets as a numbered, cited answer.

        Thai wording is used when ``language`` is ``"th"``, or when it is
        ``"auto"`` and ``question`` contains Thai characters; English
        otherwise.
        """
        thai = language == "th" or (language == "auto" and re.search(r"[\u0E00-\u0E7F]", question))
        if thai:
            lines = ["คำตอบจากหลักฐานเว็บล่าสุด:"]
            for i, item in enumerate(evidence, start=1):
                lines.append(f"{i}. {item.snippet} [แหล่งที่มา {i}]")
            lines.append("ข้อจำกัด: คำตอบนี้ยึดเฉพาะหลักฐานที่ fetch และ hash แล้ว ไม่ถือเป็นความรู้ที่ฝังอยู่ในน้ำหนักโมเดล")
        else:
            lines = ["Evidence-grounded answer from live web sources:"]
            for i, item in enumerate(evidence, start=1):
                lines.append(f"{i}. {item.snippet} [source {i}]")
            lines.append("Limit: this answer is grounded in fetched, hashed evidence, not a claim that the model weights already knew it.")
        return "\n".join(lines)

    def _write_outputs(self, result: dict[str, Any], out: Path) -> dict[str, Any]:
        """Write the JSON report and a one-line SFT sample into ``out``.

        ``result`` is updated in place with ``report_path`` and ``sft_path``
        and returned. Both files are written with ``errors="replace"`` so
        that unencodable characters in the question or snippets degrade to
        "?" instead of raising ``UnicodeEncodeError``.
        """
        report_path = out / "general_web_knowledge_report.json"
        result["report_path"] = str(report_path)
        report_path.write_text(
            json.dumps(result, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
            errors="replace",
        )
        sft_path = out / "general_web_knowledge_sft.jsonl"
        with sft_path.open("w", encoding="utf-8", errors="replace", newline="\n") as f:
            f.write(
                json.dumps(
                    {
                        "source": "general_web_knowledge_live_evidence",
                        "messages": [
                            {
                                "role": "system",
                                "content": "When knowledge may be current or external, search web evidence first, cite sources, and refuse unsupported claims.",
                            },
                            {"role": "user", "content": str(result["question"])},
                            {"role": "assistant", "content": str(result["answer"])},
                        ],
                        "metadata": {
                            "status": result["status"],
                            "evidence_count": len(result.get("evidence", [])),
                            "report_path": str(report_path),
                        },
                    },
                    ensure_ascii=False,
                    sort_keys=True,
                )
                + "\n"
            )
        result["sft_path"] = str(sft_path)
        return result
def write_general_web_knowledge(
    question: str,
    out_dir: str | Path,
    *,
    max_results: int = 6,
    top_k: int = 4,
    language: str = "auto",
    researcher: ExternalResearcher | None = None,
) -> dict[str, Any]:
    """Build a ``GeneralWebKnowledgeEngine`` and answer ``question`` once.

    All keyword options are forwarded unchanged to the engine's ``answer``
    method, whose result dict is returned. ``researcher`` is handed to the
    engine constructor.
    """
    engine = GeneralWebKnowledgeEngine(researcher=researcher)
    options = {"max_results": max_results, "top_k": top_k, "language": language}
    return engine.answer(question, out_dir, **options)
def main() -> int:
    """Command-line entry point: answer one question and print a summary.

    Parses the CLI options, runs ``write_general_web_knowledge`` and prints a
    JSON summary (output paths, status, evidence count, grounding flag) to
    stdout. Returns 0.
    """
    cli = argparse.ArgumentParser(description="TinyMind live-web knowledge access with hash-verified evidence.")
    cli.add_argument("--question", required=True)
    cli.add_argument("--out-dir", default="reports/general_web_knowledge")
    cli.add_argument("--max-results", type=int, default=6)
    cli.add_argument("--top-k", type=int, default=4)
    cli.add_argument("--language", choices=["auto", "th", "en"], default="auto")
    opts = cli.parse_args()

    outcome = write_general_web_knowledge(
        opts.question,
        opts.out_dir,
        max_results=opts.max_results,
        top_k=opts.top_k,
        language=opts.language,
    )

    summary = {
        "report_path": outcome["report_path"],
        "sft_path": outcome["sft_path"],
        "status": outcome["status"],
        "evidence_count": len(outcome.get("evidence", [])),
        "answer_grounded": outcome["claim_gate"]["answer_grounded"],
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Xet Storage Details
- Size:
- 11.8 kB
- Xet hash:
- 4d0a336bf74e4f1d47d356310dc561745700252242b93897c7ced4fe387226cb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.