Buckets:
bbkdevops/unicosys-hypergraph-bucket/tinymind-native-8b-remote-handoff/bundle/data/continuous_update_governor.py
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import hashlib | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
# Identifier stored under "schema_version" in every manifest this module writes.
SCHEMA_VERSION = "tinymind-continuous-update-governor-v1"

# File suffixes (compared lower-cased) that make a file a candidate source.
DEFAULT_SUFFIXES = {
    # structured data and prose
    ".jsonl",
    ".json",
    ".md",
    ".txt",
    # source code
    ".py",
    ".js",
    ".ts",
    ".rs",
    ".go",
    ".c",
    ".cpp",
    ".h",
    ".java",
    ".kt",
    ".lua",
}
def _sha_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in 1 MiB pieces, so the
    whole file is never held in memory at once.
    """
    piece_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:  # b"" signals end of file
                break
            digest.update(piece)
    return digest.hexdigest()
def _candidate_files(
    source_roots: list[str | Path],
    suffixes: set[str] | None = None,
) -> list[Path]:
    """Collect the files under *source_roots* whose suffix is whitelisted.

    Args:
        source_roots: Files or directories to inspect. Relative entries are
            anchored at the directory two levels above this module
            (``Path(__file__).resolve().parents[1]``). Directories are
            searched recursively; entries that are neither an existing file
            nor a directory are ignored.
        suffixes: Optional suffix whitelist (e.g. ``{".json", ".md"}``),
            matched case-insensitively. ``None`` means ``DEFAULT_SUFFIXES``.

    Returns:
        The de-duplicated matching paths, sorted case-insensitively by their
        string form. Paths that differ only by letter case are further
        ordered by their exact string so the result is fully deterministic.
    """
    if suffixes is None:
        allowed = DEFAULT_SUFFIXES
    else:
        # File suffixes are lower-cased before comparison, so normalise the
        # caller-supplied whitelist the same way.
        allowed = {suffix.lower() for suffix in suffixes}

    project_root = Path(__file__).resolve().parents[1]
    found: set[Path] = set()
    for root in source_roots:
        candidate = Path(root)
        if not candidate.is_absolute():
            candidate = project_root / candidate
        if candidate.is_file():
            if candidate.suffix.lower() in allowed:
                found.add(candidate)
        elif candidate.is_dir():
            found.update(
                entry
                for entry in candidate.rglob("*")
                if entry.is_file() and entry.suffix.lower() in allowed
            )

    # The exact string is the tie-breaker: without it, paths equal under
    # lower() would keep the (hash-dependent) iteration order of the set.
    return sorted(found, key=lambda entry: (str(entry).lower(), str(entry)))
def _load_json(path: str | Path | None) -> dict[str, Any]:
    """Load a JSON object from *path*, treating an absent file as empty.

    Args:
        path: Location of the JSON file. A falsy value (``None`` or ``""``)
            or a path that does not exist yields an empty dict.

    Returns:
        The decoded top-level JSON object, or ``{}`` when there is no file.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        ValueError: If the file decodes to something other than a JSON
            object (e.g. a list), since callers rely on dict access.
    """
    if not path:
        return {}
    manifest = Path(path)
    if not manifest.exists():
        return {}
    # errors="replace" keeps undecodable bytes from aborting the read.
    payload = json.loads(manifest.read_text(encoding="utf-8", errors="replace"))
    if not isinstance(payload, dict):
        raise ValueError(
            f"expected a JSON object in {manifest}, got {type(payload).__name__}"
        )
    return payload
def _update_pipeline_stages() -> list[dict[str, str]]:
    """Return the fixed, ordered stage descriptions embedded in each manifest."""
    return [
        {
            "stage": "discover",
            "rule": "collect only explicit source roots; no blind public-internet ingestion",
            "output": "candidate file inventory with sha256",
        },
        {
            "stage": "normalize",
            "rule": "decode text/json/jsonl, chunk large artifacts, preserve source_path/source_sha256",
            "output": "source-grounded training candidates",
        },
        {
            "stage": "purity_filter",
            "rule": "reject secrets, encoded blobs, repetition loops, symbol noise, malformed records, and unsafe cyber payloads",
            "output": "quality-governed JSONL",
        },
        {
            "stage": "holdout",
            "rule": "reserve deterministic eval split before training",
            "output": "train/eval evidence pair",
        },
        {
            "stage": "train_gate",
            "rule": "continued training may use only governed data manifests, not raw scrape dumps",
            "output": "adapter/update candidate",
        },
        {
            "stage": "eval_gate",
            "rule": "measure loss, PPL, Thai/English, coding, tool reliability, grounding, long context, drift, and regression",
            "output": "promotion or rollback decision",
        },
    ]


def build_continuous_update_manifest(
    out_dir: str | Path,
    *,
    source_roots: list[str | Path] | None = None,
    dataset_manifest: str | Path | None = "reports/dataset_quality_governor/dataset_quality_governor_manifest.json",
    cadence_hours: int = 24,
    sample_limit: int | None = 200,
) -> dict[str, Any]:
    """Build the continuous-update manifest, write it to disk and return it.

    Args:
        out_dir: Directory that receives
            ``continuous_update_governor_manifest.json``; it is created,
            including parents, when missing.
        source_roots: Files or directories to inventory. ``None`` or an
            empty list falls back to ``["data/jsonl", "third_party"]``.
        dataset_manifest: JSON file whose ``kept_records``,
            ``rejected_records``, ``domain_counts`` and ``reject_counts``
            entries are copied into the report. The path is opened as given,
            and a missing file or a falsy value yields zero/empty defaults.
        cadence_hours: Value recorded verbatim under ``cadence_hours``.
        sample_limit: Maximum number of candidate files that are stat'ed and
            hashed for ``inventory_sample`` (taken from the start of the
            sorted candidate list). ``None`` inventories every candidate.

    Returns:
        The report dict, including ``manifest_path`` pointing at the file
        that was written.

    Raises:
        ValueError: If *sample_limit* is negative.
    """
    # Validate before touching the filesystem so a bad call has no side effects.
    if sample_limit is not None and sample_limit < 0:
        raise ValueError(f"sample_limit must be >= 0 or None, got {sample_limit}")

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    roots = list(source_roots or ["data/jsonl", "third_party"])
    files = _candidate_files(roots)

    # Only the sampled prefix is hashed; "candidate_files" below still
    # reports the full count so readers can tell the sample was truncated.
    inventory_sample = [
        {
            "path": str(entry),
            "bytes": entry.stat().st_size,
            "sha256": _sha_file(entry),
        }
        for entry in files[:sample_limit]
    ]

    dataset = _load_json(dataset_manifest)

    report: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "cadence_hours": cadence_hours,
        "sources": {
            "roots": [str(root) for root in roots],
            "candidate_files": len(files),
            "inventory_sample": inventory_sample,
        },
        "current_dataset": {
            "manifest": str(dataset_manifest) if dataset_manifest else None,
            "kept_records": dataset.get("kept_records", 0),
            "rejected_records": dataset.get("rejected_records", 0),
            "domain_counts": dataset.get("domain_counts", {}),
            "reject_counts": dataset.get("reject_counts", {}),
        },
        "pipeline": _update_pipeline_stages(),
        "claim_gate": {
            "continuous_update_plan_ready": True,
            "auto_absorb_without_filter_allowed": False,
            "always_up_to_date_claim_allowed": False,
            "zero_junk_claim_allowed": False,
            "reason": "The loop can keep sources refreshed and aggressively filtered, but every update still needs evidence, filtering, eval, and rollback gates.",
        },
    }

    manifest_file = out / "continuous_update_governor_manifest.json"
    # Record the destination inside the report before serialising it, so the
    # written file and the returned dict carry the same content.
    report["manifest_path"] = str(manifest_file)
    manifest_file.write_text(
        json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return report
Xet Storage Details
- Size:
- 4.68 kB
- Xet hash:
- 232e61540fd0d0d24d6be8f52572a251bea6bd6be7f611b95400600b33b57eb0
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.