Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /extreme_memory.py
| """Disk-backed extreme memory evidence for TinyMind. | |
| PureField keeps bounded recurrent state for model dynamics. This module adds | |
| an exact persistent archive for very long streams, so old evidence can be | |
| recalled without growing the model's KV cache. | |
| """ | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import random | |
| from pathlib import Path | |
| import re | |
| from typing import Iterable | |
# Stream length (in tokens) used as the default benchmark size and echoed as
# "target_tokens_supported" in the archive manifest and the benchmark report.
TARGET_TOKENS_SUPPORTED = 10_000_000
| def _sha256_json(payload: object) -> str: | |
| data = json.dumps(payload, ensure_ascii=False, sort_keys=True, separators=(",", ":")).encode("utf-8") | |
| return hashlib.sha256(data).hexdigest() | |
class ExtremeMemoryArchive:
    """Append-only chunk archive with exact passkey recall.

    Files written under ``root``:

    * ``chunks/chunk_XXXXXXXX.jsonl`` -- one JSON row per chunk containing the
      chunk id, start position, length, token list and a SHA-256 of the
      token list.
    * ``passkey_index.json`` -- maps a passkey position (as a string key) to
      the chunk and offset that hold it.
    * ``manifest.json`` -- stream totals, the rolling stream hash and paths.
    """

    def __init__(self, root: str | Path, chunk_tokens: int = 8192):
        """Bind the archive to *root* without touching the filesystem.

        Args:
            root: Directory that holds (or will hold) the archive files.
            chunk_tokens: Maximum number of tokens stored per chunk file.

        Raises:
            ValueError: If *chunk_tokens* is not a positive integer. A zero
                or negative size would make position -> chunk arithmetic in
                ``recall_token`` meaningless (division by zero).
        """
        self.root = Path(root)
        self.chunk_tokens = int(chunk_tokens)
        if self.chunk_tokens <= 0:
            raise ValueError(f"chunk_tokens must be a positive integer, got {chunk_tokens!r}")
        self.chunks_dir = self.root / "chunks"
        self.index_path = self.root / "passkey_index.json"
        self.manifest_path = self.root / "manifest.json"

    def _chunk_path(self, chunk_id: int) -> Path:
        """Return the path of chunk *chunk_id* inside this archive."""
        return self.chunks_dir / f"chunk_{int(chunk_id):08d}.jsonl"

    @staticmethod
    def _load_verified_chunk(chunk_path: Path, chunk_id: int) -> dict:
        """Read the chunk row at *chunk_path* and check its token hash.

        Raises:
            ValueError: If the stored ``sha256`` does not match the tokens.
        """
        row = json.loads(chunk_path.read_text(encoding="utf-8").splitlines()[0])
        if _sha256_json(row["tokens"]) != row["sha256"]:
            raise ValueError(f"chunk {chunk_id} hash mismatch")
        return row

    def ingest(self, tokens: Iterable[int], passkeys: dict[int, int] | None = None) -> dict:
        """Stream *tokens* to disk in fixed-size chunks and write the manifest.

        Args:
            tokens: Token ids; each is coerced with ``int``. Only one chunk
                is held in memory at a time.
            passkeys: Optional mapping of absolute position -> expected token.
                Positions that fall inside the ingested stream are recorded
                in the passkey index; positions outside it are ignored.

        Returns:
            The manifest dict that was also written to ``manifest.json``.
        """
        self.chunks_dir.mkdir(parents=True, exist_ok=True)
        # Normalize once so the per-chunk scan does not re-coerce every key.
        wanted = {int(position): int(expected) for position, expected in (passkeys or {}).items()}
        passkey_index: dict[str, dict] = {}
        total_tokens = 0
        chunk_count = 0
        rolling = hashlib.sha256()
        chunk: list[int] = []
        chunk_start = 0

        def flush() -> None:
            """Persist the buffered chunk and index any passkeys inside it."""
            nonlocal chunk, chunk_count, chunk_start
            if not chunk:
                return
            chunk_path = self._chunk_path(chunk_count)
            row = {
                "chunk_id": chunk_count,
                "start": chunk_start,
                "length": len(chunk),
                "sha256": _sha256_json(chunk),
                "tokens": chunk,
            }
            chunk_path.write_text(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8")
            chunk_end = chunk_start + len(chunk)
            for position, expected in wanted.items():
                if chunk_start <= position < chunk_end:
                    offset = position - chunk_start
                    passkey_index[str(position)] = {
                        "chunk_id": chunk_count,
                        # Kept for compatibility with existing index readers;
                        # recall prefers the path derived from chunk_id.
                        "chunk_path": str(chunk_path),
                        "offset": offset,
                        "token": int(chunk[offset]),
                        "expected": expected,
                        "matched": int(chunk[offset]) == expected,
                    }
            chunk_count += 1
            chunk_start = chunk_end
            chunk = []

        for token in tokens:
            value = int(token)
            # The stream hash covers each token as a signed 64-bit little-endian value.
            rolling.update(value.to_bytes(8, byteorder="little", signed=True))
            chunk.append(value)
            total_tokens += 1
            if len(chunk) >= self.chunk_tokens:
                flush()
        flush()  # Write the trailing partial chunk, if any.

        self.index_path.write_text(
            json.dumps(passkey_index, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8"
        )
        # NOTE(review): nominal figure of 32 bytes plus 32 per indexed passkey;
        # presumably the digest size plus one slot per passkey -- confirm intent.
        state_bytes = 32 + len(passkey_index) * 32
        manifest = {
            "schema_version": "tinymind-extreme-memory-archive-v1",
            "total_tokens": total_tokens,
            "target_tokens_supported": TARGET_TOKENS_SUPPORTED,
            "chunk_tokens": self.chunk_tokens,
            "chunk_count": chunk_count,
            "stream_sha256": rolling.hexdigest(),
            "passkeys_indexed": len(passkey_index),
            "passkeys_matched": sum(1 for row in passkey_index.values() if row["matched"]),
            "state_bytes": state_bytes,
            "kv_tokens_stored": 0,
            "chunks_dir": str(self.chunks_dir),
            "index_path": str(self.index_path),
        }
        self.manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8"
        )
        return manifest

    def recall_passkey(self, position: int) -> dict:
        """Re-read the token at an indexed passkey *position* from disk.

        The chunk file is located from the indexed ``chunk_id`` inside this
        archive's ``chunks`` directory, so recall keeps working after the
        archive directory is moved or the working directory changes. The path
        string stored in the index is used only as a fallback when that file
        does not exist.

        Returns:
            Dict with ``position``, ``token``, ``expected``, ``matched`` and
            ``chunk_id``.

        Raises:
            KeyError: If *position* was not indexed as a passkey.
            ValueError: If the chunk's stored hash does not match its tokens.
        """
        index = json.loads(self.index_path.read_text(encoding="utf-8"))
        row = index[str(int(position))]
        chunk_id = int(row["chunk_id"])
        chunk_path = self._chunk_path(chunk_id)
        if not chunk_path.exists():
            chunk_path = Path(row["chunk_path"])
        chunk_row = self._load_verified_chunk(chunk_path, chunk_id)
        token = int(chunk_row["tokens"][int(row["offset"])])
        return {
            "position": int(position),
            "token": token,
            "expected": int(row["expected"]),
            "matched": token == int(row["expected"]),
            "chunk_id": chunk_id,
        }

    def recall_token(self, position: int) -> dict:
        """Return the archived token at absolute *position*, hash-verified.

        The chunk size recorded in the manifest (not ``self.chunk_tokens``)
        is used to locate the chunk, so an archive can be read back by an
        instance constructed with a different chunk size.

        Raises:
            IndexError: If *position* is outside ``0..total_tokens - 1``.
            ValueError: If the chunk's stored hash does not match its tokens.
        """
        manifest = json.loads(self.manifest_path.read_text(encoding="utf-8"))
        total = int(manifest["total_tokens"])
        pos = int(position)
        if pos < 0 or pos >= total:
            raise IndexError(f"position {pos} outside archived token range 0..{total - 1}")
        chunk_id, offset = divmod(pos, int(manifest["chunk_tokens"]))
        row = self._load_verified_chunk(self._chunk_path(chunk_id), chunk_id)
        token = int(row["tokens"][offset])
        return {
            "position": pos,
            "token": token,
            "chunk_id": int(row["chunk_id"]),
            "offset": int(offset),
            "chunk_sha256": row["sha256"],
            "total_tokens": total,
            "hash_verified": True,
        }
| def _token_stream(token_count: int, passkeys: dict[int, int], seed: int) -> Iterable[int]: | |
| rng = random.Random(seed) | |
| for position in range(int(token_count)): | |
| if position in passkeys: | |
| yield passkeys[position] | |
| else: | |
| yield rng.randrange(4, 32000) | |
def run_extreme_memory_passkey_benchmark(
    out_dir: str | Path,
    token_count: int = TARGET_TOKENS_SUPPORTED,
    chunk_tokens: int = 8192,
    passkey_positions: list[int] | None = None,
    seed: int = 20260523,
) -> dict:
    """Ingest a synthetic stream with planted passkeys and verify recall.

    Args:
        out_dir: Directory for the archive (``archive/``) and the JSON and
            Markdown reports; created if missing.
        token_count: Number of tokens to generate and ingest.
        chunk_tokens: Chunk size passed to the archive.
        passkey_positions: Absolute positions at which to plant passkeys.
            Defaults to the first, middle and last position of the stream,
            or to no positions when the stream is empty.
        seed: Seed for the filler-token generator.

    Returns:
        The report dict (also written to ``extreme_memory_report.json``),
        extended with ``report_path`` and ``markdown_path``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    if passkey_positions is None:
        # An empty stream has no valid positions; the old default of
        # [0, 0, -1] pointed at tokens that are never ingested and made
        # the recall step fail with KeyError.
        if int(token_count) > 0:
            passkey_positions = [0, token_count // 2, token_count - 1]
        else:
            passkey_positions = []
    passkeys = {int(pos): 90_000_000 + i for i, pos in enumerate(passkey_positions)}
    archive = ExtremeMemoryArchive(out / "archive", chunk_tokens=chunk_tokens)
    manifest = archive.ingest(_token_stream(token_count, passkeys, seed), passkeys=passkeys)
    recalls = [archive.recall_passkey(position) for position in passkey_positions]
    # Require at least one recall: all([]) is vacuously True and would report
    # a pass for a run that verified nothing.
    passed = bool(recalls) and all(row["matched"] for row in recalls)
    report = {
        "schema_version": "tinymind-extreme-memory-v1",
        "claim": "Exact passkey recall through disk-backed persistent memory without full KV growth.",
        "world_best_claim": False,
        "target_tokens_supported": TARGET_TOKENS_SUPPORTED,
        "measured_tokens": int(token_count),
        "chunk_tokens": int(chunk_tokens),
        "chunk_count": manifest["chunk_count"],
        "state_bytes": manifest["state_bytes"],
        "kv_tokens_stored": manifest["kv_tokens_stored"],
        "archive_manifest": str(archive.manifest_path),
        "passkey_recall": {
            "passed": passed,
            "count": len(recalls),
            "hits": recalls,
        },
    }
    report_path = out / "extreme_memory_report.json"
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md = out / "extreme_memory_report.md"
    md.write_text(
        "\n".join(
            [
                "# TinyMind Extreme Memory Evidence",
                "",
                f"- Measured tokens: {token_count}",
                f"- Target supported tokens: {TARGET_TOKENS_SUPPORTED}",
                f"- Passkey recall passed: {passed}",
                f"- KV tokens stored: {manifest['kv_tokens_stored']}",
                f"- Archive chunks: {manifest['chunk_count']}",
                "- World-best claim: not asserted",
                "",
                "This proves exact persisted recall for the measured stream. It does not claim external rank-1.",
                "",
            ]
        ),
        encoding="utf-8",
    )
    # The paths are added after the JSON report is written, so they appear
    # only in the returned dict, not in the file on disk.
    report["report_path"] = str(report_path)
    report["markdown_path"] = str(md)
    return report
| def _position_from_question(question: str, total_tokens: int) -> int | None: | |
| q = question.lower() | |
| if any(marker in q for marker in ("first token", "token แรก", "ตำแหน่งแรก", "ต้น")): | |
| return 0 | |
| if any(marker in q for marker in ("middle token", "token กลาง", "ตรงกลาง", "กลาง")): | |
| return total_tokens // 2 | |
| if any(marker in q for marker in ("last token", "token สุดท้าย", "ท้าย", "สุดท้าย")): | |
| return total_tokens - 1 | |
| m = re.search(r"(?:position|pos|token|ตำแหน่ง)\s*[:#]?\s*([0-9][0-9,]*)", question, re.IGNORECASE) | |
| if not m: | |
| return None | |
| return int(m.group(1).replace(",", "")) | |
def answer_extreme_context_question(archive_root: str | Path, question: str) -> dict:
    """Answer a token-position question from the archive, or refuse.

    The question is resolved to an absolute position and the token is read
    back from its hash-verified chunk. The result's ``status`` is one of:

    * ``"insufficient_query"`` -- no locator could be parsed from the question.
    * ``"blocked"`` -- the position is out of range, or the chunk is missing,
      unreadable, malformed or fails its hash check.
    * ``"grounded"`` -- the token was recalled and its chunk hash verified.

    Args:
        archive_root: Root directory of an archive written by
            ``ExtremeMemoryArchive.ingest``.
        question: Natural-language question containing a position locator.

    Returns:
        A JSON-serializable answer dict. The manifest itself must be
        readable; errors while loading it propagate to the caller.
    """
    archive = ExtremeMemoryArchive(archive_root)
    manifest = json.loads(archive.manifest_path.read_text(encoding="utf-8"))
    total = int(manifest["total_tokens"])
    position = _position_from_question(question, total)
    if position is None:
        return {
            "schema_version": "tinymind-10m-context-answer-v1",
            "question": question,
            "status": "insufficient_query",
            "answer": "ต้องระบุตำแหน่ง token, first/middle/last, หรือ chunk hash ก่อน เพื่อกันการเดา",
            "archive_manifest": str(archive.manifest_path),
            "hallucination_gate": {"passed": True, "reason": "refused_without_exact_locator"},
            "world_best_claim_allowed": False,
        }
    try:
        recall = archive.recall_token(position)
    except (IndexError, ValueError, KeyError, OSError) as exc:
        # A deleted or unreadable chunk file (OSError) or a chunk row missing
        # required keys (KeyError) is archive corruption just like a hash
        # mismatch: report it as blocked instead of crashing.
        return {
            "schema_version": "tinymind-10m-context-answer-v1",
            "question": question,
            "status": "blocked",
            "answer": f"ไม่ตอบเดา: {exc}",
            "archive_manifest": str(archive.manifest_path),
            "hallucination_gate": {"passed": True, "reason": "blocked_invalid_or_corrupt_locator"},
            "world_best_claim_allowed": False,
        }
    return {
        "schema_version": "tinymind-10m-context-answer-v1",
        "question": question,
        "status": "grounded",
        "answer": (
            f"token ที่ตำแหน่ง {recall['position']} คือ {recall['token']} "
            f"จาก chunk {recall['chunk_id']} offset {recall['offset']} ตรวจ hash แล้ว"
        ),
        "recall": recall,
        "archive_manifest": str(archive.manifest_path),
        "context_policy": {
            "total_tokens": total,
            "kv_tokens_stored": int(manifest.get("kv_tokens_stored", 0)),
            "guarantee": "Exact recall comes from hashed archive chunks; unsupported questions are refused instead of guessed.",
        },
        "hallucination_gate": {"passed": True, "reason": "exact_position_chunk_hash_verified"},
        "world_best_claim_allowed": False,
    }
def write_extreme_context_answer(archive_root: str | Path, question: str, out_path: str | Path) -> dict:
    """Answer *question* from the archive and save the answer as JSON.

    The answer is written to *out_path* (parent directories are created as
    needed) as indented, key-sorted UTF-8 JSON. The returned dict also
    carries ``out_path``; that key is added after the file is written, so
    it is not part of the file on disk.
    """
    answer = answer_extreme_context_question(archive_root, question)
    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(answer, ensure_ascii=False, indent=2, sort_keys=True)
    destination.write_text(serialized, encoding="utf-8")
    answer["out_path"] = str(destination)
    return answer
Xet Storage Details
- Size:
- 11.8 kB
- Xet hash:
- 108989e81a50df32e524216f8d84f98f7048299b72661452afc4695dd2573739
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.