Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket/tinymind-native-8b-remote-handoff/bundle/model/pure_signal_decoder.py

bbkdevops

about 1 month ago

download

raw

8.75 kB

	"""TinyMind PureSignal decoder.

	PureSignal is a deterministic, evidence-first decoder that turns any byte
	artifact plus extracted text/metadata into a compact semantic signal frame.
	It is not a replacement for specialist OCR/ASR/vision models; it is the shared
	decoder substrate that makes every file contribute useful, auditable evidence.
	"""

	from __future__ import annotations

	from dataclasses import dataclass, asdict
	from datetime import datetime, timezone
	import hashlib
	import json
	import math
	from pathlib import Path
	import re
	from typing import Any


	# Token pattern shared by the language-signal and fusion-vector stages: a run
	# of word characters and/or code points in U+0E00-U+0E7F (the same range the
	# language stage counts as Thai characters).
	# NOTE(review): the explicit range presumably keeps Thai combining vowel/tone
	# marks, which ``\w`` alone may not match, inside a single token -- confirm.
	TOKEN_RE = re.compile(r"[\w\u0E00-\u0E7F]+", re.UNICODE)


	def _sha256_bytes(data: bytes) -> str:
	    """Return the lowercase hexadecimal SHA-256 digest of ``data``."""
	    return hashlib.sha256(data).hexdigest()


	def _sha256_text(text: str) -> str:
	    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8.

	    Characters that cannot be encoded (for example lone surrogates) are
	    substituted by the codec's replacement byte instead of raising, so any
	    ``str`` can be hashed.
	    """
	    encoded = text.encode("utf-8", errors="replace")
	    return hashlib.sha256(encoded).hexdigest()


	def _entropy(data: bytes) -> float:
	    """Return the Shannon entropy of ``data`` in bits per byte.

	    Empty input yields ``0.0``. The value is computed from the frequency of
	    each of the 256 possible byte values, so it ranges from ``0.0`` (one
	    repeated value) to ``8.0`` (all values equally frequent).
	    """
	    if not data:
	        return 0.0
	    counts = [0] * 256
	    for b in data:
	        counts[b] += 1
	    total = len(data)
	    entropy = -sum((c / total) * math.log2(c / total) for c in counts if c)
	    # When every byte is identical the sum is 0.0 and its negation is -0.0,
	    # which would be rounded and serialised as "-0.0". Adding 0.0 normalises
	    # the sign without changing any other value.
	    return entropy + 0.0


	def _histogram16(data: bytes) -> list[float]:
	    """Return a 16-bin relative-frequency histogram of ``data``.

	    Each byte is assigned to a bin by its high nibble (``byte >> 4``). Every
	    entry is the fraction of bytes in that bin, rounded to six decimals.
	    Empty input yields sixteen zeros.
	    """
	    if not data:
	        return [0.0] * 16
	    buckets = [0] * 16
	    for b in data:
	        buckets[b >> 4] += 1
	    total = float(len(data))
	    return [round(v / total, 6) for v in buckets]


	def _printable_ratio(data: bytes) -> float:
	    """Return the fraction of bytes in ``data`` that count as printable.

	    A byte counts as printable when it is TAB (9), LF (10), CR (13), in the
	    ASCII range 32-126, or has its high bit set (>= 128). Empty input yields
	    ``0.0``.
	    """
	    if not data:
	        return 0.0
	    printable = sum(
	        1 for b in data if b in (9, 10, 13) or 32 <= b <= 126 or b >= 128
	    )
	    return printable / len(data)


	@dataclass(frozen=True)
	class PureSignalPolicy:
	    """Immutable limits and switches that control a ``PureSignalDecoder``."""

	    # Upper bound on the number of leading file bytes that are analysed.
	    max_bytes: int = 2_000_000
	    # Upper bound on the number of characters kept from extracted text.
	    max_text_chars: int = 200_000
	    # Requested length of the fusion vector (the decoder uses at least 8).
	    vector_bins: int = 64
	    # When True the zero-waste audit passes only if every stage is productive.
	    require_all_stages_useful: bool = True


	class PureSignalDecoder:
	    """Decode bytes + optional extracted artifacts into one auditable signal frame."""

	    # Modalities whose content needs a specialist (OCR/ASR/vision) decoder.
	    _SPECIALIST_MODALITIES = frozenset({"image", "audio", "video"})

	    def __init__(self, policy: PureSignalPolicy | None = None):
	        """Store ``policy``, falling back to a default ``PureSignalPolicy``."""
	        self.policy = policy or PureSignalPolicy()

	    def decode_file(
	        self,
	        path: str | Path,
	        *,
	        modality: str,
	        extracted_outputs: list[dict[str, Any]] | None = None,
	        out_dir: str | Path | None = None,
	    ) -> dict[str, Any]:
	        """Build the signal report for one file and optionally persist it.

	        Args:
	            path: File to analyse. Only the first ``policy.max_bytes`` bytes
	                feed the stages; the full content feeds ``source_sha256``.
	            modality: Caller-supplied modality label (e.g. ``"image"``).
	            extracted_outputs: Optional descriptors of already-extracted
	                artifacts. Entries with ``type == "text"`` and a readable
	                ``path`` contribute extracted text.
	            out_dir: When given, the report and the fusion vector are written
	                there as JSON and their paths are added to the returned dict.

	        Returns:
	            The report dictionary (schema ``tinymind-pure-signal-decoder-v1``).

	        Raises:
	            OSError: If ``path`` cannot be read or the outputs cannot be written.
	        """
	        p = Path(path)
	        # Read the file exactly once so the sampled bytes and the full-file
	        # hash always describe the same content.
	        raw = p.read_bytes()
	        data = raw[: self.policy.max_bytes]
	        outputs = extracted_outputs or []
	        extracted_text = self._read_extracted_text(outputs)
	        stages = {
	            "byte_fingerprint": self._byte_fingerprint(data),
	            "structure_signal": self._structure_signal(data, modality),
	            "language_signal": self._language_signal(extracted_text),
	            "modality_signal": self._modality_signal(modality, outputs),
	            "fusion_vector": self._fusion_vector(data, extracted_text, modality),
	        }
	        zero_waste = self._zero_waste(stages)
	        report = {
	            "schema_version": "tinymind-pure-signal-decoder-v1",
	            "created_at": datetime.now(timezone.utc).isoformat(),
	            "source_path": str(p),
	            "source_sha256": _sha256_bytes(raw),
	            "policy": asdict(self.policy),
	            "modality": modality,
	            "extracted_text_sha256": _sha256_text(extracted_text) if extracted_text else None,
	            "stages": stages,
	            "zero_waste_audit": zero_waste,
	            "claim_gate": {
	                "pure_signal_decoder_ready": zero_waste["passed"],
	                "all_stages_productive": zero_waste["all_stages_productive"],
	                "specialist_semantic_model_required_for_full_understanding": modality in self._SPECIALIST_MODALITIES,
	                "raw_bytes_magically_understood": False,
	                "world_all_formats_perfect_claim_allowed": False,
	                "reason": "PureSignal produces useful shared evidence for every file, then specialist decoders may add OCR/ASR/vision semantics.",
	            },
	        }
	        if out_dir is not None:
	            out = Path(out_dir)
	            out.mkdir(parents=True, exist_ok=True)
	            path_out = out / "pure_signal_decoder_report.json"
	            vector_out = out / "pure_signal_vector.json"
	            path_out.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
	            vector_out.write_text(json.dumps(stages["fusion_vector"], ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
	            # The output paths are added after writing, so they appear in the
	            # returned dict but not in the JSON file on disk.
	            report["json_path"] = str(path_out)
	            report["vector_path"] = str(vector_out)
	        return report

	    def _read_extracted_text(self, outputs: list[dict[str, Any]]) -> str:
	        """Concatenate the text of every readable ``type == "text"`` output.

	        Entries without a path, or whose path is not a regular file, are
	        skipped. The parts are joined with newlines and the result is cut to
	        ``policy.max_text_chars`` characters.
	        """
	        parts: list[str] = []
	        for output in outputs:
	            if output.get("type") != "text":
	                continue
	            raw_path = output.get("path")
	            if not raw_path:
	                # A missing/empty path would otherwise become Path("") == ".",
	                # which exists but cannot be read as text.
	                continue
	            path = Path(str(raw_path))
	            # is_file() rather than exists(): directories must not be read.
	            if path.is_file():
	                parts.append(path.read_text(encoding="utf-8", errors="replace"))
	        return "\n".join(parts)[: self.policy.max_text_chars]

	    def _byte_fingerprint(self, data: bytes) -> dict[str, Any]:
	        """Summarise ``data``: hash prefix, size, entropy, histogram, printability."""
	        return {
	            "use": "identity_integrity_and_binary_texture",
	            "sha256_prefix": _sha256_bytes(data)[:24],
	            "sample_bytes": len(data),
	            "entropy_bits_per_byte": round(_entropy(data), 6),
	            "histogram16": _histogram16(data),
	            "printable_ratio": round(_printable_ratio(data), 6),
	        }

	    def _structure_signal(self, data: bytes, modality: str) -> dict[str, Any]:
	        """Report leading magic bytes, NUL ratio, line count and a binary guess.

	        ``likely_binary`` is true when more than 2% of the bytes are NUL or
	        fewer than 70% of them count as printable.
	        """
	        magic = data[:16]
	        # max(1, ...) avoids dividing by zero for empty input.
	        null_ratio = data.count(0) / max(1, len(data))
	        line_count = data.count(b"\n")
	        return {
	            "use": "format_routing_and_corruption_detection",
	            "modality_hint": modality,
	            "magic_hex": magic.hex(),
	            "null_ratio": round(null_ratio, 6),
	            "line_count": line_count,
	            "likely_binary": bool(null_ratio > 0.02 or _printable_ratio(data) < 0.70),
	        }

	    def _language_signal(self, text: str) -> dict[str, Any]:
	        """Compute token, Thai/ASCII letter and code-marker statistics of ``text``."""
	        tokens = TOKEN_RE.findall(text)
	        thai_chars = sum(1 for ch in text if "\u0e00" <= ch <= "\u0e7f")
	        ascii_letters = sum(1 for ch in text if ch.isascii() and ch.isalpha())
	        code_markers = sum(text.count(marker) for marker in ("def ", "function ", "class ", "import ", "{", "};", "=>"))
	        unique = len({tok.lower() for tok in tokens})
	        return {
	            "use": "text_code_language_grounding",
	            "chars": len(text),
	            "tokens": len(tokens),
	            "unique_token_ratio": round(unique / max(1, len(tokens)), 6),
	            "thai_char_ratio": round(thai_chars / max(1, len(text)), 6),
	            "ascii_letter_ratio": round(ascii_letters / max(1, len(text)), 6),
	            "code_marker_count": code_markers,
	        }

	    def _modality_signal(self, modality: str, outputs: list[dict[str, Any]]) -> dict[str, Any]:
	        """Describe the extracted outputs and whether a specialist decoder is needed."""
	        output_types = sorted({str(item.get("type")) for item in outputs if item.get("type")})
	        return {
	            "use": "specialist_decoder_handoff",
	            "modality": modality,
	            "output_types": output_types,
	            "output_count": len(outputs),
	            "needs_specialist_decoder": modality in self._SPECIALIST_MODALITIES,
	            "ready_for_language_model_context": bool(outputs),
	        }

	    def _fusion_vector(self, data: bytes, text: str, modality: str) -> dict[str, Any]:
	        """Fold bytes, text density and modality into a fixed-length vector.

	        Byte ``i`` adds ``byte / 255`` to slot ``i % bins``; each slot is then
	        divided by ``ceil(len(data) / bins)``. Slot 0 is averaged with a code
	        derived from the modality string and slot 1 with a capped text-density
	        value. At least 8 slots are always used.
	        """
	        bins = max(8, int(self.policy.vector_bins))
	        values = [0.0] * bins
	        if data:
	            for i, b in enumerate(data):
	                values[i % bins] += b / 255.0
	            denom = max(1, math.ceil(len(data) / bins))
	            values = [v / denom for v in values]
	        modality_code = (sum(ord(ch) for ch in modality) % 997) / 997.0
	        text_density = len(TOKEN_RE.findall(text)) / max(1, len(text))
	        values[0] = (values[0] + modality_code) / 2.0
	        values[1] = (values[1] + min(1.0, text_density * 10.0)) / 2.0
	        return {
	            "use": "shared_multimodal_retrieval_embedding_seed",
	            "dims": bins,
	            "values": [round(v, 6) for v in values],
	            "l2_norm": round(math.sqrt(sum(v * v for v in values)), 6),
	        }

	    def _zero_waste(self, stages: dict[str, dict[str, Any]]) -> dict[str, Any]:
	        """Audit that each stage declares a ``use`` and emits at least one more field.

	        ``passed`` mirrors ``all_stages_productive`` unless the policy disables
	        ``require_all_stages_useful``, in which case it is always true.
	        """
	        productivity = {
	            name: bool(stage.get("use")) and len(stage) > 1
	            for name, stage in stages.items()
	        }
	        all_productive = all(productivity.values())
	        return {
	            "stage_productivity": productivity,
	            "all_stages_productive": all_productive,
	            "passed": all_productive if self.policy.require_all_stages_useful else True,
	            "principle": "Every decoder stage must emit reusable evidence for integrity, routing, language grounding, specialist handoff, or retrieval.",
	        }

Xet Storage Details

Size: 8.75 kB
Xet hash: 558c606ce76456a073b0b05e3d9ea7d138e89e274540693b62325d777089f294

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.