Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /extreme_memory.py

bbkdevops

about 1 month ago

download

raw

11.8 kB

	"""Disk-backed extreme memory evidence for TinyMind.

	PureField keeps bounded recurrent state for model dynamics. This module adds
	an exact persistent archive for very long streams, so old evidence can be
	recalled without growing the model's KV cache.
	"""

	from __future__ import annotations

	import hashlib
	import json
	import random
	from pathlib import Path
	import re
	from typing import Iterable


	# Stream length, in tokens, that the benchmark ingests by default; it is also
	# echoed as "target_tokens_supported" in the archive manifest and the report.
	TARGET_TOKENS_SUPPORTED = 10_000_000


	def _sha256_json(payload: object) -> str:
	data = json.dumps(payload, ensure_ascii=False, sort_keys=True, separators=(",", ":")).encode("utf-8")
	return hashlib.sha256(data).hexdigest()


	class ExtremeMemoryArchive:
	    """Append-only chunk archive with exact passkey recall.

	    Files written under ``root``:

	    * ``chunks/chunk_XXXXXXXX.jsonl`` -- one JSON row per file holding the
	      chunk id, absolute start position, length, SHA-256 digest and tokens.
	    * ``passkey_index.json`` -- maps a stream position (as a string) to the
	      chunk and offset where the passkey token was stored.
	    * ``manifest.json`` -- summary of the most recent ``ingest`` call.
	    """

	    def __init__(self, root: str | Path, chunk_tokens: int = 8192):
	        """Bind the archive to *root*; nothing is read or written yet.

	        Args:
	            root: Directory that holds (or will hold) the archive files.
	            chunk_tokens: Maximum number of tokens stored per chunk file.

	        Raises:
	            ValueError: If *chunk_tokens* is not a positive integer. A
	                non-positive size would be written to the manifest and make
	                position -> chunk arithmetic in ``recall_token`` fail.
	        """
	        self.root = Path(root)
	        self.chunk_tokens = int(chunk_tokens)
	        if self.chunk_tokens <= 0:
	            raise ValueError(f"chunk_tokens must be a positive integer, got {chunk_tokens!r}")
	        self.chunks_dir = self.root / "chunks"
	        self.index_path = self.root / "passkey_index.json"
	        self.manifest_path = self.root / "manifest.json"

	    def _chunk_path(self, chunk_id: int) -> Path:
	        """Return the file path used for chunk number *chunk_id*."""
	        return self.chunks_dir / f"chunk_{int(chunk_id):08d}.jsonl"

	    def ingest(self, tokens: Iterable[int], passkeys: dict[int, int] | None = None) -> dict:
	        """Stream *tokens* to disk in fixed-size chunks and index passkeys.

	        Chunk numbering starts at 0 on every call, and the passkey index and
	        manifest are rewritten. Chunk files left by an earlier, longer
	        ingest into the same root are not removed.

	        Args:
	            tokens: Token ids; each is converted with ``int`` and must fit in
	                a signed 64-bit integer for the rolling stream hash.
	            passkeys: Optional ``{position: expected_token}`` mapping.
	                Positions that fall outside the ingested stream are ignored.

	        Returns:
	            The manifest dict that was written to ``manifest.json``.
	        """
	        self.chunks_dir.mkdir(parents=True, exist_ok=True)
	        # Normalise positions to int once: avoids re-casting on every flush and
	        # guarantees the index key matches what recall_passkey looks up.
	        wanted = {int(position): expected for position, expected in (passkeys or {}).items()}
	        passkey_index: dict[str, dict] = {}
	        total_tokens = 0
	        chunk_count = 0
	        rolling = hashlib.sha256()
	        chunk: list[int] = []
	        chunk_start = 0

	        def flush() -> None:
	            """Write the buffered chunk to disk and index passkeys inside it."""
	            nonlocal chunk, chunk_count, chunk_start
	            if not chunk:
	                return
	            chunk_path = self._chunk_path(chunk_count)
	            row = {
	                "chunk_id": chunk_count,
	                "start": chunk_start,
	                "length": len(chunk),
	                "sha256": _sha256_json(chunk),
	                "tokens": chunk,
	            }
	            chunk_path.write_text(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8")
	            chunk_end = chunk_start + len(chunk)
	            for position, expected in wanted.items():
	                if chunk_start <= position < chunk_end:
	                    offset = position - chunk_start
	                    stored = int(chunk[offset])
	                    passkey_index[str(position)] = {
	                        "chunk_id": chunk_count,
	                        "chunk_path": str(chunk_path),
	                        "offset": offset,
	                        "token": stored,
	                        "expected": int(expected),
	                        "matched": stored == int(expected),
	                    }
	            chunk_count += 1
	            chunk_start = chunk_end
	            chunk = []

	        for token in tokens:
	            value = int(token)
	            # The stream hash covers every token as 8 little-endian signed bytes.
	            rolling.update(value.to_bytes(8, byteorder="little", signed=True))
	            chunk.append(value)
	            total_tokens += 1
	            if len(chunk) >= self.chunk_tokens:
	                flush()
	        flush()  # write the final, possibly short, chunk

	        self.index_path.write_text(
	            json.dumps(passkey_index, ensure_ascii=False, indent=2, sort_keys=True),
	            encoding="utf-8",
	        )
	        state_bytes = 32 + len(passkey_index) * 32
	        manifest = {
	            "schema_version": "tinymind-extreme-memory-archive-v1",
	            "total_tokens": total_tokens,
	            "target_tokens_supported": TARGET_TOKENS_SUPPORTED,
	            "chunk_tokens": self.chunk_tokens,
	            "chunk_count": chunk_count,
	            "stream_sha256": rolling.hexdigest(),
	            "passkeys_indexed": len(passkey_index),
	            "passkeys_matched": sum(1 for row in passkey_index.values() if row["matched"]),
	            "state_bytes": state_bytes,
	            "kv_tokens_stored": 0,
	            "chunks_dir": str(self.chunks_dir),
	            "index_path": str(self.index_path),
	        }
	        self.manifest_path.write_text(
	            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
	            encoding="utf-8",
	        )
	        return manifest

	    def recall_passkey(self, position: int) -> dict:
	        """Re-read the token stored at an indexed passkey *position*.

	        Args:
	            position: Absolute stream position that was passed to ``ingest``
	                as a passkey position.

	        Returns:
	            Dict with ``position``, ``token``, ``expected``, ``matched`` and
	            ``chunk_id``.

	        Raises:
	            KeyError: If *position* is not present in the passkey index.
	        """
	        index = json.loads(self.index_path.read_text(encoding="utf-8"))
	        row = index[str(int(position))]
	        # Prefer the chunk inside this archive's own directory so an archive
	        # that was moved or copied after ingest still resolves; fall back to
	        # the path recorded at ingest time otherwise.
	        chunk_path = self._chunk_path(row["chunk_id"])
	        if not chunk_path.exists():
	            chunk_path = Path(row["chunk_path"])
	        chunk_row = json.loads(chunk_path.read_text(encoding="utf-8").splitlines()[0])
	        token = int(chunk_row["tokens"][int(row["offset"])])
	        expected = int(row["expected"])
	        return {
	            "position": int(position),
	            "token": token,
	            "expected": expected,
	            "matched": token == expected,
	            "chunk_id": int(row["chunk_id"]),
	        }

	    def recall_token(self, position: int) -> dict:
	        """Return the token at any archived *position* after a hash check.

	        The chunk is located from the chunk size recorded in the manifest,
	        and its token list is re-hashed and compared to the stored digest.

	        Raises:
	            IndexError: If *position* is outside ``0..total_tokens - 1``.
	            ValueError: If the manifest records a non-positive chunk size or
	                the chunk's tokens no longer match its stored SHA-256.
	        """
	        manifest = json.loads(self.manifest_path.read_text(encoding="utf-8"))
	        total = int(manifest["total_tokens"])
	        pos = int(position)
	        if pos < 0 or pos >= total:
	            raise IndexError(f"position {pos} outside archived token range 0..{total - 1}")
	        chunk_tokens = int(manifest["chunk_tokens"])
	        if chunk_tokens <= 0:
	            # Guard the division below against a damaged or legacy manifest.
	            raise ValueError(f"manifest records invalid chunk_tokens {chunk_tokens}")
	        chunk_id, offset = divmod(pos, chunk_tokens)
	        row = json.loads(self._chunk_path(chunk_id).read_text(encoding="utf-8").splitlines()[0])
	        digest = _sha256_json(row["tokens"])
	        if digest != row["sha256"]:
	            raise ValueError(f"chunk {chunk_id} hash mismatch")
	        token = int(row["tokens"][offset])
	        return {
	            "position": pos,
	            "token": token,
	            "chunk_id": int(row["chunk_id"]),
	            "offset": int(offset),
	            "chunk_sha256": row["sha256"],
	            "total_tokens": total,
	            "hash_verified": True,
	        }


	def _token_stream(token_count: int, passkeys: dict[int, int], seed: int) -> Iterable[int]:
	rng = random.Random(seed)
	for position in range(int(token_count)):
	if position in passkeys:
	yield passkeys[position]
	else:
	yield rng.randrange(4, 32000)


	def run_extreme_memory_passkey_benchmark(
	    out_dir: str | Path,
	    token_count: int = TARGET_TOKENS_SUPPORTED,
	    chunk_tokens: int = 8192,
	    passkey_positions: list[int] | None = None,
	    seed: int = 20260523,
	) -> dict:
	    """Ingest a synthetic stream with planted passkeys and verify recall.

	    Writes the archive to ``<out_dir>/archive`` plus a JSON report and a
	    Markdown summary next to it.

	    Args:
	        out_dir: Output directory; created if missing.
	        token_count: Number of tokens to generate and archive.
	        chunk_tokens: Chunk size handed to the archive.
	        passkey_positions: Stream positions that receive a passkey token.
	            Defaults to the first, middle and last position.
	        seed: Seed for the pseudo-random filler tokens.

	    Returns:
	        The report dict, extended with ``report_path`` and
	        ``markdown_path``.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    if passkey_positions is None:
	        passkey_positions = [0, token_count // 2, token_count - 1]
	    passkeys = {int(pos): 90_000_000 + i for i, pos in enumerate(passkey_positions)}
	    archive = ExtremeMemoryArchive(out / "archive", chunk_tokens=chunk_tokens)
	    manifest = archive.ingest(_token_stream(token_count, passkeys, seed), passkeys=passkeys)

	    # A passkey position outside the archived stream was never written, so it
	    # has no index entry. Record it as a miss instead of letting the index
	    # lookup raise KeyError (this also covers the default positions when the
	    # stream is empty).
	    archived = int(manifest["total_tokens"])
	    recalls = []
	    for position in passkey_positions:
	        pos = int(position)
	        if 0 <= pos < archived:
	            recalls.append(archive.recall_passkey(pos))
	        else:
	            recalls.append(
	                {
	                    "position": pos,
	                    "token": None,
	                    "expected": passkeys[pos],
	                    "matched": False,
	                    "chunk_id": None,
	                }
	            )
	    passed = all(row["matched"] for row in recalls)

	    report = {
	        "schema_version": "tinymind-extreme-memory-v1",
	        "claim": "Exact passkey recall through disk-backed persistent memory without full KV growth.",
	        "world_best_claim": False,
	        "target_tokens_supported": TARGET_TOKENS_SUPPORTED,
	        "measured_tokens": int(token_count),
	        "chunk_tokens": int(chunk_tokens),
	        "chunk_count": manifest["chunk_count"],
	        "state_bytes": manifest["state_bytes"],
	        "kv_tokens_stored": manifest["kv_tokens_stored"],
	        "archive_manifest": str(archive.manifest_path),
	        "passkey_recall": {
	            "passed": passed,
	            "count": len(recalls),
	            "hits": recalls,
	        },
	    }
	    report_path = out / "extreme_memory_report.json"
	    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md = out / "extreme_memory_report.md"
	    md.write_text(
	        "\n".join(
	            [
	                "# TinyMind Extreme Memory Evidence",
	                "",
	                f"- Measured tokens: {token_count}",
	                f"- Target supported tokens: {TARGET_TOKENS_SUPPORTED}",
	                f"- Passkey recall passed: {passed}",
	                f"- KV tokens stored: {manifest['kv_tokens_stored']}",
	                f"- Archive chunks: {manifest['chunk_count']}",
	                "- World-best claim: not asserted",
	                "",
	                "This proves exact persisted recall for the measured stream. It does not claim external rank-1.",
	                "",
	            ]
	        ),
	        encoding="utf-8",
	    )
	    # The path entries are added after the JSON report is written, so they
	    # appear only in the returned dict.
	    report["report_path"] = str(report_path)
	    report["markdown_path"] = str(md)
	    return report


	def _position_from_question(question: str, total_tokens: int) -> int \| None:
	q = question.lower()
	if any(marker in q for marker in ("first token", "token แรก", "ตำแหน่งแรก", "ต้น")):
	return 0
	if any(marker in q for marker in ("middle token", "token กลาง", "ตรงกลาง", "กลาง")):
	return total_tokens // 2
	if any(marker in q for marker in ("last token", "token สุดท้าย", "ท้าย", "สุดท้าย")):
	return total_tokens - 1
	m = re.search(r"(?:position\|pos\|token\|ตำแหน่ง)\s[:#]?\s([0-9][0-9,]*)", question, re.IGNORECASE)
	if not m:
	return None
	return int(m.group(1).replace(",", ""))


	def answer_extreme_context_question(archive_root: str | Path, question: str) -> dict:
	    """Answer a token-position question strictly from the on-disk archive.

	    The result's ``status`` is one of:

	    * ``insufficient_query`` -- the question names no position.
	    * ``blocked`` -- the position is out of range, or the chunk backing it
	      is missing, unreadable, malformed or fails its hash check.
	    * ``grounded`` -- the token was read from a hash-verified chunk.

	    Args:
	        archive_root: Root directory of an ingested archive.
	        question: Free-text question naming a token position.

	    Returns:
	        A ``tinymind-10m-context-answer-v1`` dict.

	    Raises:
	        OSError, ValueError, KeyError: If the manifest itself cannot be read
	            or parsed; that read happens before any gate applies.
	    """
	    archive = ExtremeMemoryArchive(archive_root)
	    manifest = json.loads(archive.manifest_path.read_text(encoding="utf-8"))
	    total = int(manifest["total_tokens"])
	    manifest_ref = str(archive.manifest_path)

	    position = _position_from_question(question, total)
	    if position is None:
	        return {
	            "schema_version": "tinymind-10m-context-answer-v1",
	            "question": question,
	            "status": "insufficient_query",
	            "answer": "ต้องระบุตำแหน่ง token, first/middle/last, หรือ chunk hash ก่อน เพื่อกันการเดา",
	            "archive_manifest": manifest_ref,
	            "hallucination_gate": {"passed": True, "reason": "refused_without_exact_locator"},
	            "world_best_claim_allowed": False,
	        }

	    try:
	        recall = archive.recall_token(position)
	    except (IndexError, ValueError, KeyError, OSError) as exc:
	        # KeyError (chunk row or manifest missing a field) and OSError (chunk
	        # file missing or unreadable) are corrupt-archive cases too; they must
	        # produce a blocked answer rather than escape the gate as a crash.
	        return {
	            "schema_version": "tinymind-10m-context-answer-v1",
	            "question": question,
	            "status": "blocked",
	            "answer": f"ไม่ตอบเดา: {exc}",
	            "archive_manifest": manifest_ref,
	            "hallucination_gate": {"passed": True, "reason": "blocked_invalid_or_corrupt_locator"},
	            "world_best_claim_allowed": False,
	        }

	    return {
	        "schema_version": "tinymind-10m-context-answer-v1",
	        "question": question,
	        "status": "grounded",
	        "answer": (
	            f"token ที่ตำแหน่ง {recall['position']} คือ {recall['token']} "
	            f"จาก chunk {recall['chunk_id']} offset {recall['offset']} ตรวจ hash แล้ว"
	        ),
	        "recall": recall,
	        "archive_manifest": manifest_ref,
	        "context_policy": {
	            "total_tokens": total,
	            "kv_tokens_stored": int(manifest.get("kv_tokens_stored", 0)),
	            "guarantee": "Exact recall comes from hashed archive chunks; unsupported questions are refused instead of guessed.",
	        },
	        "hallucination_gate": {"passed": True, "reason": "exact_position_chunk_hash_verified"},
	        "world_best_claim_allowed": False,
	    }


	def write_extreme_context_answer(archive_root: str | Path, question: str, out_path: str | Path) -> dict:
	    """Answer *question* from the archive and persist the answer as JSON.

	    The answer is written to *out_path* (parent directories are created)
	    as pretty-printed, key-sorted UTF-8 JSON. The returned dict is the same
	    answer with an extra ``out_path`` entry, which is added after the file
	    is written and so is not part of the file contents.
	    """
	    answer = answer_extreme_context_question(archive_root, question)
	    destination = Path(out_path)
	    destination.parent.mkdir(parents=True, exist_ok=True)
	    serialized = json.dumps(answer, ensure_ascii=False, indent=2, sort_keys=True)
	    destination.write_text(serialized, encoding="utf-8")
	    answer["out_path"] = str(destination)
	    return answer

Xet Storage Details

Size: 11.8 kB
Xet hash: 108989e81a50df32e524216f8d84f98f7048299b72661452afc4695dd2573739

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.