bbkdevops's picture
download
raw
11.8 kB
"""Disk-backed extreme memory evidence for TinyMind.
PureField keeps bounded recurrent state for model dynamics. This module adds
an exact persistent archive for very long streams, so old evidence can be
recalled without growing the model's KV cache.
"""
from __future__ import annotations
import hashlib
import json
import random
from pathlib import Path
import re
from typing import Iterable
# Stream length (in tokens) this module targets. It is the default
# ``token_count`` of the passkey benchmark and is copied verbatim into the
# archive manifest and the benchmark report.
TARGET_TOKENS_SUPPORTED = 10_000_000
def _sha256_json(payload: object) -> str:
    """Return the hex SHA-256 of ``payload`` in canonical JSON form.

    Canonical here means: keys sorted, no whitespace between separators,
    non-ASCII characters kept as-is, and the text encoded as UTF-8.
    """
    canonical_text = json.dumps(
        payload,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    hasher = hashlib.sha256(canonical_text.encode("utf-8"))
    return hasher.hexdigest()
class ExtremeMemoryArchive:
"""Append-only chunk archive with exact passkey recall."""
def __init__(self, root: str | Path, chunk_tokens: int = 8192):
self.root = Path(root)
self.chunk_tokens = int(chunk_tokens)
self.chunks_dir = self.root / "chunks"
self.index_path = self.root / "passkey_index.json"
self.manifest_path = self.root / "manifest.json"
def ingest(self, tokens: Iterable[int], passkeys: dict[int, int] | None = None) -> dict:
self.chunks_dir.mkdir(parents=True, exist_ok=True)
passkeys = passkeys or {}
passkey_index: dict[str, dict] = {}
total_tokens = 0
chunk_count = 0
rolling = hashlib.sha256()
chunk: list[int] = []
chunk_start = 0
def flush() -> None:
nonlocal chunk, chunk_count, chunk_start
if not chunk:
return
digest = _sha256_json(chunk)
chunk_path = self.chunks_dir / f"chunk_{chunk_count:08d}.jsonl"
row = {
"chunk_id": chunk_count,
"start": chunk_start,
"length": len(chunk),
"sha256": digest,
"tokens": chunk,
}
chunk_path.write_text(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8")
for position, expected in passkeys.items():
if chunk_start <= int(position) < chunk_start + len(chunk):
offset = int(position) - chunk_start
passkey_index[str(position)] = {
"chunk_id": chunk_count,
"chunk_path": str(chunk_path),
"offset": offset,
"token": int(chunk[offset]),
"expected": int(expected),
"matched": int(chunk[offset]) == int(expected),
}
chunk_count += 1
chunk_start += len(chunk)
chunk = []
for token in tokens:
value = int(token)
rolling.update(value.to_bytes(8, byteorder="little", signed=True))
chunk.append(value)
total_tokens += 1
if len(chunk) >= self.chunk_tokens:
flush()
flush()
self.index_path.write_text(json.dumps(passkey_index, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
state_bytes = 32 + len(passkey_index) * 32
manifest = {
"schema_version": "tinymind-extreme-memory-archive-v1",
"total_tokens": total_tokens,
"target_tokens_supported": TARGET_TOKENS_SUPPORTED,
"chunk_tokens": self.chunk_tokens,
"chunk_count": chunk_count,
"stream_sha256": rolling.hexdigest(),
"passkeys_indexed": len(passkey_index),
"passkeys_matched": sum(1 for row in passkey_index.values() if row["matched"]),
"state_bytes": state_bytes,
"kv_tokens_stored": 0,
"chunks_dir": str(self.chunks_dir),
"index_path": str(self.index_path),
}
self.manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
return manifest
def recall_passkey(self, position: int) -> dict:
index = json.loads(self.index_path.read_text(encoding="utf-8"))
row = index[str(int(position))]
chunk_row = json.loads(Path(row["chunk_path"]).read_text(encoding="utf-8").splitlines()[0])
token = int(chunk_row["tokens"][int(row["offset"])])
return {
"position": int(position),
"token": token,
"expected": int(row["expected"]),
"matched": token == int(row["expected"]),
"chunk_id": int(row["chunk_id"]),
}
def recall_token(self, position: int) -> dict:
manifest = json.loads(self.manifest_path.read_text(encoding="utf-8"))
total = int(manifest["total_tokens"])
pos = int(position)
if pos < 0 or pos >= total:
raise IndexError(f"position {pos} outside archived token range 0..{total - 1}")
chunk_tokens = int(manifest["chunk_tokens"])
chunk_id = pos // chunk_tokens
offset = pos % chunk_tokens
chunk_path = self.chunks_dir / f"chunk_{chunk_id:08d}.jsonl"
row = json.loads(chunk_path.read_text(encoding="utf-8").splitlines()[0])
digest = _sha256_json(row["tokens"])
if digest != row["sha256"]:
raise ValueError(f"chunk {chunk_id} hash mismatch")
token = int(row["tokens"][offset])
return {
"position": pos,
"token": token,
"chunk_id": int(row["chunk_id"]),
"offset": int(offset),
"chunk_sha256": row["sha256"],
"total_tokens": total,
"hash_verified": True,
}
def _token_stream(token_count: int, passkeys: dict[int, int], seed: int) -> Iterable[int]:
    """Yield ``token_count`` tokens, planting passkeys at their positions.

    Positions listed in ``passkeys`` yield the mapped value; every other
    position yields a seeded pseudo-random id in ``[4, 32000)``. The random
    generator is only advanced for non-passkey positions.
    """
    generator = random.Random(seed)
    for index in range(int(token_count)):
        planted = index in passkeys
        yield passkeys[index] if planted else generator.randrange(4, 32000)
def run_extreme_memory_passkey_benchmark(
    out_dir: str | Path,
    token_count: int = TARGET_TOKENS_SUPPORTED,
    chunk_tokens: int = 8192,
    passkey_positions: list[int] | None = None,
    seed: int = 20260523,
) -> dict:
    """Ingest a synthetic stream with planted passkeys and verify recall.

    Args:
        out_dir: Directory receiving the archive (``archive/``) and the JSON
            and Markdown reports.
        token_count: Length of the synthetic stream.
        chunk_tokens: Chunk size passed to the archive.
        passkey_positions: Stream positions to plant passkeys at. When
            omitted, the first, middle and last positions are used (only
            those that exist and are distinct for very short streams).
        seed: Seed for the filler tokens.

    Returns:
        The report dictionary, including the paths of both written reports.

    Raises:
        ValueError: If an explicitly supplied position lies outside the
            stream. This is checked before the (potentially long) ingest.
    """
    total = int(token_count)
    if passkey_positions is None:
        # For tiny streams the three defaults may coincide or fall outside
        # the stream; keep only distinct, existing positions.
        defaults = dict.fromkeys([0, total // 2, total - 1])
        positions = [pos for pos in defaults if 0 <= pos < total]
    else:
        positions = list(dict.fromkeys(int(pos) for pos in passkey_positions))
        out_of_range = [pos for pos in positions if not 0 <= pos < total]
        if out_of_range:
            raise ValueError(
                f"passkey positions outside stream range 0..{total - 1}: {out_of_range}"
            )

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    passkeys = {pos: 90_000_000 + i for i, pos in enumerate(positions)}
    archive = ExtremeMemoryArchive(out / "archive", chunk_tokens=chunk_tokens)
    manifest = archive.ingest(_token_stream(token_count, passkeys, seed), passkeys=passkeys)
    recalls = [archive.recall_passkey(position) for position in positions]
    # An empty recall list is not evidence of anything, so it must not be
    # reported as a pass.
    passed = bool(recalls) and all(row["matched"] for row in recalls)
    report = {
        "schema_version": "tinymind-extreme-memory-v1",
        "claim": "Exact passkey recall through disk-backed persistent memory without full KV growth.",
        "world_best_claim": False,
        "target_tokens_supported": TARGET_TOKENS_SUPPORTED,
        "measured_tokens": int(token_count),
        "chunk_tokens": int(chunk_tokens),
        "chunk_count": manifest["chunk_count"],
        "state_bytes": manifest["state_bytes"],
        "kv_tokens_stored": manifest["kv_tokens_stored"],
        "archive_manifest": str(archive.manifest_path),
        "passkey_recall": {
            "passed": passed,
            "count": len(recalls),
            "hits": recalls,
        },
    }
    report_path = out / "extreme_memory_report.json"
    report_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    md = out / "extreme_memory_report.md"
    md.write_text(
        "\n".join(
            [
                "# TinyMind Extreme Memory Evidence",
                "",
                f"- Measured tokens: {token_count}",
                f"- Target supported tokens: {TARGET_TOKENS_SUPPORTED}",
                f"- Passkey recall passed: {passed}",
                f"- KV tokens stored: {manifest['kv_tokens_stored']}",
                f"- Archive chunks: {manifest['chunk_count']}",
                "- World-best claim: not asserted",
                "",
                "This proves exact persisted recall for the measured stream. It does not claim external rank-1.",
                "",
            ]
        ),
        encoding="utf-8",
    )
    # The paths are added after serialization, so the JSON file on disk does
    # not contain them.
    report["report_path"] = str(report_path)
    report["markdown_path"] = str(md)
    return report
def _position_from_question(question: str, total_tokens: int) -> int | None:
q = question.lower()
if any(marker in q for marker in ("first token", "token แรก", "ตำแหน่งแรก", "ต้น")):
return 0
if any(marker in q for marker in ("middle token", "token กลาง", "ตรงกลาง", "กลาง")):
return total_tokens // 2
if any(marker in q for marker in ("last token", "token สุดท้าย", "ท้าย", "สุดท้าย")):
return total_tokens - 1
m = re.search(r"(?:position|pos|token|ตำแหน่ง)\s*[:#]?\s*([0-9][0-9,]*)", question, re.IGNORECASE)
if not m:
return None
return int(m.group(1).replace(",", ""))
def answer_extreme_context_question(archive_root: str | Path, question: str) -> dict:
    """Answer a positional question from the archive, refusing to guess.

    Args:
        archive_root: Root directory of an ingested archive.
        question: Free-text question containing a token locator.

    Returns:
        A result dictionary whose ``status`` is one of:

        * ``"insufficient_query"`` -- no locator could be extracted;
        * ``"blocked"`` -- the locator is out of range, or the chunk is
          missing, malformed or fails its hash check;
        * ``"grounded"`` -- the token was read from a hash-verified chunk.

    Raises:
        OSError, ValueError, KeyError: If the manifest itself cannot be read
            or parsed; without it there is no archive to answer from.
    """
    archive = ExtremeMemoryArchive(archive_root)
    manifest = json.loads(archive.manifest_path.read_text(encoding="utf-8"))
    total = int(manifest["total_tokens"])
    # Fields shared by every outcome.
    common = {
        "schema_version": "tinymind-10m-context-answer-v1",
        "question": question,
        "archive_manifest": str(archive.manifest_path),
        "world_best_claim_allowed": False,
    }

    position = _position_from_question(question, total)
    if position is None:
        return {
            **common,
            "status": "insufficient_query",
            "answer": "ต้องระบุตำแหน่ง token, first/middle/last, หรือ chunk hash ก่อน เพื่อกันการเดา",
            "hallucination_gate": {"passed": True, "reason": "refused_without_exact_locator"},
        }

    try:
        recall = archive.recall_token(position)
    except (OSError, KeyError, IndexError, ValueError) as exc:
        # OSError: chunk file missing or unreadable. KeyError: chunk row
        # lacks a required field. Both mean the archive cannot back an
        # answer, so they are gated like a bad locator or hash mismatch.
        return {
            **common,
            "status": "blocked",
            "answer": f"ไม่ตอบเดา: {exc}",
            "hallucination_gate": {"passed": True, "reason": "blocked_invalid_or_corrupt_locator"},
        }

    return {
        **common,
        "status": "grounded",
        "answer": (
            f"token ที่ตำแหน่ง {recall['position']} คือ {recall['token']} "
            f"จาก chunk {recall['chunk_id']} offset {recall['offset']} ตรวจ hash แล้ว"
        ),
        "recall": recall,
        "context_policy": {
            "total_tokens": total,
            "kv_tokens_stored": int(manifest.get("kv_tokens_stored", 0)),
            "guarantee": "Exact recall comes from hashed archive chunks; unsupported questions are refused instead of guessed.",
        },
        "hallucination_gate": {"passed": True, "reason": "exact_position_chunk_hash_verified"},
    }
def write_extreme_context_answer(archive_root: str | Path, question: str, out_path: str | Path) -> dict:
    """Answer ``question`` from the archive and save the result as JSON.

    Parent directories of ``out_path`` are created as needed. The returned
    dictionary additionally carries ``out_path``; that key is added after
    the file is written, so it is not part of the saved JSON.
    """
    answer = answer_extreme_context_question(archive_root, question)
    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(answer, ensure_ascii=False, indent=2, sort_keys=True)
    destination.write_text(serialized, encoding="utf-8")
    answer["out_path"] = str(destination)
    return answer

Xet Storage Details

Size:
11.8 kB
·
Xet hash:
108989e81a50df32e524216f8d84f98f7048299b72661452afc4695dd2573739

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.