Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /model /pure_signal_decoder.py
| """TinyMind PureSignal decoder. | |
| PureSignal is a deterministic, evidence-first decoder that turns any byte | |
| artifact plus extracted text/metadata into a compact semantic signal frame. | |
| It is not a replacement for specialist OCR/ASR/vision models; it is the shared | |
| decoder substrate that makes every file contribute useful, auditable evidence. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, asdict | |
| from datetime import datetime, timezone | |
| import hashlib | |
| import json | |
| import math | |
| from pathlib import Path | |
| import re | |
| from typing import Any | |
# Word-like tokens: runs of Unicode word characters plus the Thai block
# (U+0E00-U+0E7F). The explicit range also covers Thai combining marks,
# which are not alphanumeric and so are not matched by ``\w`` alone.
TOKEN_RE = re.compile(
    r"[\w\u0E00-\u0E7F]+",
    flags=re.UNICODE,
)
| def _sha256_bytes(data: bytes) -> str: | |
| return hashlib.sha256(data).hexdigest() | |
| def _sha256_text(text: str) -> str: | |
| return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() | |
| def _entropy(data: bytes) -> float: | |
| if not data: | |
| return 0.0 | |
| counts = [0] * 256 | |
| for b in data: | |
| counts[b] += 1 | |
| total = len(data) | |
| return -sum((c / total) * math.log2(c / total) for c in counts if c) | |
| def _histogram16(data: bytes) -> list[float]: | |
| buckets = [0] * 16 | |
| if not data: | |
| return [0.0] * 16 | |
| for b in data: | |
| buckets[b >> 4] += 1 | |
| total = float(len(data)) | |
| return [round(v / total, 6) for v in buckets] | |
| def _printable_ratio(data: bytes) -> float: | |
| if not data: | |
| return 0.0 | |
| printable = sum(1 for b in data if b in (9, 10, 13) or 32 <= b <= 126 or b >= 128) | |
| return printable / len(data) | |
@dataclass
class PureSignalPolicy:
    """Tunable limits and gates used by the PureSignal decoder.

    Declared as a dataclass so that instances accept keyword overrides
    (``PureSignalPolicy(max_bytes=...)``) and can be serialised with
    ``dataclasses.asdict`` when the policy is embedded in a report.
    """

    # Maximum number of leading bytes of the source file that are analysed.
    max_bytes: int = 2_000_000
    # Maximum number of characters of extracted text that are analysed.
    max_text_chars: int = 200_000
    # Requested length of the fusion vector.
    vector_bins: int = 64
    # When True, the zero-waste audit passes only if every stage is productive.
    require_all_stages_useful: bool = True
class PureSignalDecoder:
    """Decode bytes + optional extracted artifacts into one auditable signal frame.

    A decode produces five stage dictionaries (byte fingerprint, structure,
    language, modality and fusion vector) plus a "zero waste" audit that
    checks every stage emitted evidence.
    """

    def __init__(self, policy: PureSignalPolicy | None = None):
        """Store ``policy``, or a default ``PureSignalPolicy`` when omitted."""
        self.policy = policy or PureSignalPolicy()

    def decode_file(
        self,
        path: str | Path,
        *,
        modality: str,
        extracted_outputs: list[dict[str, Any]] | None = None,
        out_dir: str | Path | None = None,
    ) -> dict[str, Any]:
        """Decode the file at ``path`` into a signal report.

        Args:
            path: Source file to analyse.
            modality: Caller-supplied modality label (e.g. ``"image"``).
            extracted_outputs: Optional descriptors of already-extracted
                artifacts; entries with ``type == "text"`` and a ``path`` are
                read as extracted text.
            out_dir: When given, the report and the fusion vector are written
                there as JSON and their paths are added to the returned dict.

        Returns:
            The report dictionary (stages, audit and claim gate).

        Raises:
            OSError: If the source file cannot be read or the output files
                cannot be written.
        """
        p = Path(path)
        # Read the file exactly once so the full-file hash and the analysed
        # sample are guaranteed to come from the same contents.
        raw = p.read_bytes()
        source_sha256 = _sha256_bytes(raw)
        data = raw[: self.policy.max_bytes]
        del raw  # only the bounded sample is needed from here on

        outputs = extracted_outputs or []
        extracted_text = self._read_extracted_text(outputs)
        stages = {
            "byte_fingerprint": self._byte_fingerprint(data),
            "structure_signal": self._structure_signal(data, modality),
            "language_signal": self._language_signal(extracted_text),
            "modality_signal": self._modality_signal(modality, outputs),
            "fusion_vector": self._fusion_vector(data, extracted_text, modality),
        }
        zero_waste = self._zero_waste(stages)
        report = {
            "schema_version": "tinymind-pure-signal-decoder-v1",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "source_path": str(p),
            "source_sha256": source_sha256,
            "policy": self._policy_dict(),
            "modality": modality,
            "extracted_text_sha256": _sha256_text(extracted_text) if extracted_text else None,
            "stages": stages,
            "zero_waste_audit": zero_waste,
            "claim_gate": {
                "pure_signal_decoder_ready": zero_waste["passed"],
                "all_stages_productive": zero_waste["all_stages_productive"],
                "specialist_semantic_model_required_for_full_understanding": modality in {"image", "audio", "video"},
                "raw_bytes_magically_understood": False,
                "world_all_formats_perfect_claim_allowed": False,
                "reason": "PureSignal produces useful shared evidence for every file, then specialist decoders may add OCR/ASR/vision semantics.",
            },
        }
        if out_dir is not None:
            out = Path(out_dir)
            out.mkdir(parents=True, exist_ok=True)
            path_out = out / "pure_signal_decoder_report.json"
            vector_out = out / "pure_signal_vector.json"
            path_out.write_text(
                json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
                encoding="utf-8",
            )
            vector_out.write_text(
                json.dumps(stages["fusion_vector"], ensure_ascii=False, indent=2, sort_keys=True) + "\n",
                encoding="utf-8",
            )
            # The output paths are added after writing, so they appear only in
            # the returned dict and not in the JSON file on disk.
            report["json_path"] = str(path_out)
            report["vector_path"] = str(vector_out)
        return report

    def _policy_dict(self) -> dict[str, Any]:
        """Return the active policy's fields as a plain dict."""
        try:
            return asdict(self.policy)
        except TypeError:
            # asdict() rejects objects that are not dataclass instances; fall
            # back to the annotated attributes so such a policy still works.
            names = getattr(type(self.policy), "__annotations__", {})
            return {name: getattr(self.policy, name) for name in names}

    def _read_extracted_text(self, outputs: list[dict[str, Any]]) -> str:
        """Concatenate the text artifacts listed in ``outputs``.

        Only entries whose ``type`` is ``"text"`` and whose ``path`` points to
        an existing regular file are read. The parts are joined with newlines
        and truncated to ``policy.max_text_chars`` characters.
        """
        parts: list[str] = []
        for output in outputs:
            if output.get("type") != "text":
                continue
            raw_path = output.get("path")
            if not raw_path:
                # A missing/empty path would become Path(""), i.e. the current
                # directory, which exists but cannot be read as text.
                continue
            path = Path(str(raw_path))
            if path.is_file():
                parts.append(path.read_text(encoding="utf-8", errors="replace"))
        return "\n".join(parts)[: self.policy.max_text_chars]

    def _byte_fingerprint(self, data: bytes) -> dict[str, Any]:
        """Summarise the sampled bytes: hash prefix, size, entropy, histogram."""
        return {
            "use": "identity_integrity_and_binary_texture",
            "sha256_prefix": _sha256_bytes(data)[:24],
            "sample_bytes": len(data),
            "entropy_bits_per_byte": round(_entropy(data), 6),
            "histogram16": _histogram16(data),
            "printable_ratio": round(_printable_ratio(data), 6),
        }

    def _structure_signal(self, data: bytes, modality: str) -> dict[str, Any]:
        """Report coarse structural hints: magic bytes, NUL ratio, line count."""
        magic = data[:16]
        null_ratio = data.count(0) / max(1, len(data))
        line_count = data.count(b"\n")
        return {
            "use": "format_routing_and_corruption_detection",
            "modality_hint": modality,
            "magic_hex": magic.hex(),
            "null_ratio": round(null_ratio, 6),
            "line_count": line_count,
            # Heuristic: more than 2% NUL bytes or under 70% printable bytes.
            "likely_binary": bool(null_ratio > 0.02 or _printable_ratio(data) < 0.70),
        }

    def _language_signal(self, text: str) -> dict[str, Any]:
        """Compute token, script and code-marker statistics for ``text``."""
        tokens = TOKEN_RE.findall(text)
        thai_chars = sum(1 for ch in text if "\u0e00" <= ch <= "\u0e7f")
        ascii_letters = sum(1 for ch in text if ch.isascii() and ch.isalpha())
        code_markers = sum(
            text.count(marker)
            for marker in ("def ", "function ", "class ", "import ", "{", "};", "=>")
        )
        unique = len({tok.lower() for tok in tokens})
        # max(1, ...) keeps every ratio defined (as 0.0) for empty text.
        return {
            "use": "text_code_language_grounding",
            "chars": len(text),
            "tokens": len(tokens),
            "unique_token_ratio": round(unique / max(1, len(tokens)), 6),
            "thai_char_ratio": round(thai_chars / max(1, len(text)), 6),
            "ascii_letter_ratio": round(ascii_letters / max(1, len(text)), 6),
            "code_marker_count": code_markers,
        }

    def _modality_signal(self, modality: str, outputs: list[dict[str, Any]]) -> dict[str, Any]:
        """Describe the modality and which extracted output types are present."""
        output_types = sorted({str(item.get("type")) for item in outputs if item.get("type")})
        return {
            "use": "specialist_decoder_handoff",
            "modality": modality,
            "output_types": output_types,
            "output_count": len(outputs),
            "needs_specialist_decoder": modality in {"image", "audio", "video"},
            "ready_for_language_model_context": bool(outputs),
        }

    def _fusion_vector(self, data: bytes, text: str, modality: str) -> dict[str, Any]:
        """Fold bytes, text density and modality into a fixed-length vector.

        Bytes are accumulated round-robin into ``bins`` slots (at least 8) and
        scaled by the largest per-slot sample count. Slot 0 is then averaged
        with a modality code and slot 1 with a capped text-density score.
        """
        bins = max(8, int(self.policy.vector_bins))
        values = [0.0] * bins
        if data:
            for i, b in enumerate(data):
                values[i % bins] += (b / 255.0)
            denom = max(1, math.ceil(len(data) / bins))
            values = [v / denom for v in values]
        # Deterministic code in [0, 1) derived from the modality string.
        modality_code = (sum(ord(ch) for ch in modality) % 997) / 997.0
        text_density = len(TOKEN_RE.findall(text)) / max(1, len(text))
        values[0] = (values[0] + modality_code) / 2.0
        values[1] = (values[1] + min(1.0, text_density * 10.0)) / 2.0
        return {
            "use": "shared_multimodal_retrieval_embedding_seed",
            "dims": bins,
            "values": [round(v, 6) for v in values],
            "l2_norm": round(math.sqrt(sum(v * v for v in values)), 6),
        }

    def _zero_waste(self, stages: dict[str, dict[str, Any]]) -> dict[str, Any]:
        """Audit that each stage declares a ``use`` and carries other fields.

        ``passed`` mirrors ``all_stages_productive`` unless the policy sets
        ``require_all_stages_useful`` to False, in which case it is always True.
        """
        productivity = {
            name: bool(stage.get("use")) and len(stage) > 1
            for name, stage in stages.items()
        }
        all_productive = all(productivity.values())
        return {
            "stage_productivity": productivity,
            "all_stages_productive": all_productive,
            "passed": all_productive if self.policy.require_all_stages_useful else True,
            "principle": "Every decoder stage must emit reusable evidence for integrity, routing, language grounding, specialist handoff, or retrieval.",
        }
Xet Storage Details
- Size:
- 8.75 kB
- Xet hash:
- 558c606ce76456a073b0b05e3d9ea7d138e89e274540693b62325d777089f294
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.