bbkdevops's picture
download
raw
8.75 kB
"""TinyMind PureSignal decoder.
PureSignal is a deterministic, evidence-first decoder that turns any byte
artifact plus extracted text/metadata into a compact semantic signal frame.
It is not a replacement for specialist OCR/ASR/vision models; it is the shared
decoder substrate that makes every file contribute useful, auditable evidence.
"""
from __future__ import annotations
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
import hashlib
import json
import math
from pathlib import Path
import re
from typing import Any
# Token pattern shared by the language and fusion stages: a token is one or more
# consecutive characters that are either regex word characters (``\w``) or lie in
# the explicit range U+0E00-U+0E7F (the same range the decoder counts as Thai).
# NOTE(review): the explicit range presumably keeps Thai combining marks, which
# ``\w`` may not cover, inside a token -- confirm before simplifying the pattern.
TOKEN_RE = re.compile(r"[\w\u0E00-\u0E7F]+", re.UNICODE)
def _sha256_bytes(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
def _sha256_text(text: str) -> str:
return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
def _entropy(data: bytes) -> float:
if not data:
return 0.0
counts = [0] * 256
for b in data:
counts[b] += 1
total = len(data)
return -sum((c / total) * math.log2(c / total) for c in counts if c)
def _histogram16(data: bytes) -> list[float]:
buckets = [0] * 16
if not data:
return [0.0] * 16
for b in data:
buckets[b >> 4] += 1
total = float(len(data))
return [round(v / total, 6) for v in buckets]
def _printable_ratio(data: bytes) -> float:
if not data:
return 0.0
printable = sum(1 for b in data if b in (9, 10, 13) or 32 <= b <= 126 or b >= 128)
return printable / len(data)
@dataclass(frozen=True)
class PureSignalPolicy:
    """Immutable limits and switches consumed by ``PureSignalDecoder``."""

    # Upper bound on the number of leading source bytes the decoder samples.
    max_bytes: int = 2_000_000
    # Upper bound on the number of characters kept from the joined extracted text.
    max_text_chars: int = 200_000
    # Requested fusion-vector dimensionality (the decoder enforces a floor of 8).
    vector_bins: int = 64
    # When False, the zero-waste audit reports "passed" regardless of stage output.
    require_all_stages_useful: bool = True
class PureSignalDecoder:
    """Decode bytes + optional extracted artifacts into one auditable signal frame.

    The decoder runs five stages (byte fingerprint, structure, language,
    modality, fusion vector) over a bounded sample of the source file and the
    text of any extracted ``"text"`` outputs, then audits that every stage
    produced evidence.
    """

    # Modalities flagged as needing a specialist decoder. Kept in one place so
    # the claim gate and the modality stage cannot drift apart.
    _SPECIALIST_MODALITIES = frozenset({"image", "audio", "video"})

    def __init__(self, policy: PureSignalPolicy | None = None):
        """Store *policy*, falling back to a default ``PureSignalPolicy``."""
        self.policy = policy or PureSignalPolicy()

    def decode_file(
        self,
        path: str | Path,
        *,
        modality: str,
        extracted_outputs: list[dict[str, Any]] | None = None,
        out_dir: str | Path | None = None,
    ) -> dict[str, Any]:
        """Build the signal report for the file at *path*.

        Args:
            path: Source file to read.
            modality: Caller-supplied modality label recorded in the report.
            extracted_outputs: Optional list of output descriptors; entries
                with ``type == "text"`` and a readable ``path`` contribute to
                the extracted text.
            out_dir: When given, the report and the fusion vector are written
                there as ``pure_signal_decoder_report.json`` and
                ``pure_signal_vector.json`` (directory is created if needed).

        Returns:
            The report dict. When *out_dir* is given it additionally carries
            ``json_path`` and ``vector_path``; those two keys are added after
            the files are written, so they are not part of the JSON on disk.

        Raises:
            OSError: If the source file cannot be read or outputs cannot be written.
        """
        p = Path(path)
        # Read the file exactly once so the full-file hash and the sampled
        # prefix are guaranteed to describe the same snapshot of the file.
        raw = p.read_bytes()
        data = raw[: self.policy.max_bytes]
        outputs = extracted_outputs or []
        extracted_text = self._read_extracted_text(outputs)
        stages = {
            "byte_fingerprint": self._byte_fingerprint(data),
            "structure_signal": self._structure_signal(data, modality),
            "language_signal": self._language_signal(extracted_text),
            "modality_signal": self._modality_signal(modality, outputs),
            "fusion_vector": self._fusion_vector(data, extracted_text, modality),
        }
        zero_waste = self._zero_waste(stages)
        report = {
            "schema_version": "tinymind-pure-signal-decoder-v1",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "source_path": str(p),
            # Hash covers the whole file, not just the sampled prefix.
            "source_sha256": _sha256_bytes(raw),
            "policy": asdict(self.policy),
            "modality": modality,
            "extracted_text_sha256": _sha256_text(extracted_text) if extracted_text else None,
            "stages": stages,
            "zero_waste_audit": zero_waste,
            "claim_gate": {
                "pure_signal_decoder_ready": zero_waste["passed"],
                "all_stages_productive": zero_waste["all_stages_productive"],
                "specialist_semantic_model_required_for_full_understanding": modality in self._SPECIALIST_MODALITIES,
                "raw_bytes_magically_understood": False,
                "world_all_formats_perfect_claim_allowed": False,
                "reason": "PureSignal produces useful shared evidence for every file, then specialist decoders may add OCR/ASR/vision semantics.",
            },
        }
        if out_dir is not None:
            out = Path(out_dir)
            out.mkdir(parents=True, exist_ok=True)
            path_out = out / "pure_signal_decoder_report.json"
            vector_out = out / "pure_signal_vector.json"
            path_out.write_text(
                json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
                encoding="utf-8",
            )
            vector_out.write_text(
                json.dumps(stages["fusion_vector"], ensure_ascii=False, indent=2, sort_keys=True) + "\n",
                encoding="utf-8",
            )
            report["json_path"] = str(path_out)
            report["vector_path"] = str(vector_out)
        return report

    def _read_extracted_text(self, outputs: list[dict[str, Any]]) -> str:
        """Join the contents of all readable ``"text"`` outputs.

        Entries whose ``type`` is not ``"text"``, whose ``path`` is missing or
        empty, or whose path is not a regular file are skipped. The joined
        text is truncated to ``policy.max_text_chars`` characters.
        """
        parts: list[str] = []
        for output in outputs:
            if output.get("type") != "text":
                continue
            raw_path = output.get("path")
            if not raw_path:
                # Path("") is the current directory: it "exists" but reading
                # it as text raises, so a path-less entry must be skipped.
                continue
            path = Path(str(raw_path))
            # is_file() (not exists()) so directories are skipped, not read.
            if path.is_file():
                parts.append(path.read_text(encoding="utf-8", errors="replace"))
        return "\n".join(parts)[: self.policy.max_text_chars]

    def _byte_fingerprint(self, data: bytes) -> dict[str, Any]:
        """Summarize *data*: hash prefix, size, entropy, nibble histogram, printable ratio."""
        return {
            "use": "identity_integrity_and_binary_texture",
            "sha256_prefix": _sha256_bytes(data)[:24],
            "sample_bytes": len(data),
            "entropy_bits_per_byte": round(_entropy(data), 6),
            "histogram16": _histogram16(data),
            "printable_ratio": round(_printable_ratio(data), 6),
        }

    def _structure_signal(self, data: bytes, modality: str) -> dict[str, Any]:
        """Report leading magic bytes, NUL ratio, newline count and a binary guess.

        ``likely_binary`` is true when more than 2% of the bytes are NUL or
        fewer than 70% are printable.
        """
        magic = data[:16]
        # max(1, ...) keeps the ratio defined for empty input.
        null_ratio = data.count(0) / max(1, len(data))
        line_count = data.count(b"\n")
        return {
            "use": "format_routing_and_corruption_detection",
            "modality_hint": modality,
            "magic_hex": magic.hex(),
            "null_ratio": round(null_ratio, 6),
            "line_count": line_count,
            "likely_binary": bool(null_ratio > 0.02 or _printable_ratio(data) < 0.70),
        }

    def _language_signal(self, text: str) -> dict[str, Any]:
        """Compute token, script and code-marker statistics for *text*.

        Ratios use ``max(1, ...)`` denominators so empty text yields zeros.
        """
        tokens = TOKEN_RE.findall(text)
        thai_chars = sum(1 for ch in text if "\u0e00" <= ch <= "\u0e7f")
        ascii_letters = sum(1 for ch in text if ch.isascii() and ch.isalpha())
        code_markers = sum(
            text.count(marker)
            for marker in ("def ", "function ", "class ", "import ", "{", "};", "=>")
        )
        # Uniqueness is case-insensitive.
        unique = len({tok.lower() for tok in tokens})
        return {
            "use": "text_code_language_grounding",
            "chars": len(text),
            "tokens": len(tokens),
            "unique_token_ratio": round(unique / max(1, len(tokens)), 6),
            "thai_char_ratio": round(thai_chars / max(1, len(text)), 6),
            "ascii_letter_ratio": round(ascii_letters / max(1, len(text)), 6),
            "code_marker_count": code_markers,
        }

    def _modality_signal(self, modality: str, outputs: list[dict[str, Any]]) -> dict[str, Any]:
        """Describe the extracted outputs and whether a specialist decoder is needed.

        ``output_types`` is the sorted set of truthy ``type`` values;
        ``output_count`` counts every entry, typed or not.
        """
        output_types = sorted({str(item.get("type")) for item in outputs if item.get("type")})
        return {
            "use": "specialist_decoder_handoff",
            "modality": modality,
            "output_types": output_types,
            "output_count": len(outputs),
            "needs_specialist_decoder": modality in self._SPECIALIST_MODALITIES,
            "ready_for_language_model_context": bool(outputs),
        }

    def _fusion_vector(self, data: bytes, text: str, modality: str) -> dict[str, Any]:
        """Fold bytes, text density and modality into a fixed-size vector.

        Each byte is scaled to ``[0, 1]`` and accumulated into the slot given
        by its position modulo the vector size; slots are then divided by the
        maximum number of bytes any slot can receive. Slot 0 is averaged with
        a code derived from the modality string and slot 1 with a capped text
        density. The vector has at least 8 slots regardless of policy.
        """
        bins = max(8, int(self.policy.vector_bins))
        values = [0.0] * bins
        if data:
            for i, b in enumerate(data):
                values[i % bins] += b / 255.0
            denom = max(1, math.ceil(len(data) / bins))
            values = [v / denom for v in values]
        modality_code = (sum(ord(ch) for ch in modality) % 997) / 997.0
        text_density = len(TOKEN_RE.findall(text)) / max(1, len(text))
        values[0] = (values[0] + modality_code) / 2.0
        values[1] = (values[1] + min(1.0, text_density * 10.0)) / 2.0
        return {
            "use": "shared_multimodal_retrieval_embedding_seed",
            "dims": bins,
            "values": [round(v, 6) for v in values],
            # Norm is computed from the unrounded values.
            "l2_norm": round(math.sqrt(sum(v * v for v in values)), 6),
        }

    def _zero_waste(self, stages: dict[str, dict[str, Any]]) -> dict[str, Any]:
        """Audit that every stage declares a ``use`` and emits at least one other field.

        ``passed`` mirrors ``all_stages_productive`` unless the policy sets
        ``require_all_stages_useful`` to False, in which case it is always True.
        """
        productivity = {
            name: bool(stage.get("use")) and len(stage) > 1
            for name, stage in stages.items()
        }
        all_productive = all(productivity.values())
        return {
            "stage_productivity": productivity,
            "all_stages_productive": all_productive,
            "passed": all_productive if self.policy.require_all_stages_useful else True,
            "principle": "Every decoder stage must emit reusable evidence for integrity, routing, language grounding, specialist handoff, or retrieval.",
        }

Xet Storage Details

Size:
8.75 kB
·
Xet hash:
558c606ce76456a073b0b05e3d9ea7d138e89e274540693b62325d777089f294

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.