bbkdevops's picture
download
raw
4.68 kB
from __future__ import annotations
from datetime import datetime, timezone
import hashlib
import json
from pathlib import Path
from typing import Any
# Identifier stored under the "schema_version" key of every manifest written
# by this module.
SCHEMA_VERSION = "tinymind-continuous-update-governor-v1"

# File extensions (compared against the lower-cased suffix) that make a file
# eligible for the candidate inventory.
DEFAULT_SUFFIXES = {
    # structured data and prose
    ".jsonl",
    ".json",
    ".md",
    ".txt",
    # source code
    ".py",
    ".js",
    ".ts",
    ".rs",
    ".go",
    ".c",
    ".cpp",
    ".h",
    ".java",
    ".kt",
    ".lua",
}
def _sha_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in 1 MiB pieces, so the
    whole content is never held in memory at once.
    """
    block_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(block_size)
            if not piece:
                # An empty read marks end of file.
                break
            digest.update(piece)
    return digest.hexdigest()
def _candidate_files(source_roots: list[str | Path]) -> list[Path]:
    """Collect the inventory of eligible files found under *source_roots*.

    Each root may name a single file or a directory. Relative roots are
    resolved against the parent of the directory that contains this module.
    Directories are searched recursively. Only regular files whose
    lower-cased suffix is in ``DEFAULT_SUFFIXES`` are kept; roots that are
    neither an existing file nor an existing directory contribute nothing.

    Args:
        source_roots: Files and/or directories to scan.

    Returns:
        The de-duplicated paths, sorted case-insensitively by their string
        form, with the exact string as a tie-breaker.
    """
    base_dir = Path(__file__).resolve().parents[1]
    found: set[Path] = set()
    for root in source_roots:
        location = Path(root)
        if not location.is_absolute():
            location = base_dir / location
        if location.is_dir():
            found.update(
                entry
                for entry in location.rglob("*")
                if entry.is_file() and entry.suffix.lower() in DEFAULT_SUFFIXES
            )
        elif location.is_file() and location.suffix.lower() in DEFAULT_SUFFIXES:
            found.add(location)
    # Sorting on the lower-cased string alone leaves paths that differ only in
    # letter case tied, and ties keep set-iteration order, which varies from
    # run to run with string hash randomisation. The exact string breaks the
    # tie so the inventory order is reproducible.
    return sorted(found, key=lambda item: (str(item).lower(), str(item)))
def _load_json(path: str | Path | None) -> dict[str, Any]:
    """Load a JSON object from *path*, or return ``{}`` when there is no file.

    Args:
        path: Location of the JSON file. A falsy value (``None`` or ``""``)
            or a path that does not exist yields an empty dict.

    Returns:
        The decoded top-level JSON object.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
        ValueError: If the content is valid JSON but its top-level value is
            not an object (for example a list or a number).
    """
    if not path:
        return {}
    p = Path(path)
    if not p.exists():
        return {}
    # errors="replace" swaps undecodable bytes for U+FFFD instead of raising.
    loaded = json.loads(p.read_text(encoding="utf-8", errors="replace"))
    if not isinstance(loaded, dict):
        # Callers use the result as a mapping; fail here with a clear message
        # rather than handing back a list/str/number that breaks on `.get`.
        raise ValueError(
            f"expected a JSON object in {p}, got {type(loaded).__name__}"
        )
    return loaded
def _pipeline_stages() -> list[dict[str, str]]:
    """Return the fixed, ordered stage descriptions embedded in every manifest."""
    return [
        {
            "stage": "discover",
            "rule": "collect only explicit source roots; no blind public-internet ingestion",
            "output": "candidate file inventory with sha256",
        },
        {
            "stage": "normalize",
            "rule": "decode text/json/jsonl, chunk large artifacts, preserve source_path/source_sha256",
            "output": "source-grounded training candidates",
        },
        {
            "stage": "purity_filter",
            "rule": "reject secrets, encoded blobs, repetition loops, symbol noise, malformed records, and unsafe cyber payloads",
            "output": "quality-governed JSONL",
        },
        {
            "stage": "holdout",
            "rule": "reserve deterministic eval split before training",
            "output": "train/eval evidence pair",
        },
        {
            "stage": "train_gate",
            "rule": "continued training may use only governed data manifests, not raw scrape dumps",
            "output": "adapter/update candidate",
        },
        {
            "stage": "eval_gate",
            "rule": "measure loss, PPL, Thai/English, coding, tool reliability, grounding, long context, drift, and regression",
            "output": "promotion or rollback decision",
        },
    ]


def build_continuous_update_manifest(
    out_dir: str | Path,
    *,
    source_roots: list[str | Path] | None = None,
    dataset_manifest: str | Path | None = "reports/dataset_quality_governor/dataset_quality_governor_manifest.json",
    cadence_hours: int = 24,
    sample_limit: int = 200,
) -> dict[str, Any]:
    """Build the continuous-update governor manifest and write it to disk.

    The manifest records the source roots, how many candidate files were
    found under them, a size/SHA-256 sample of those files, summary counts
    read from the dataset manifest, the fixed pipeline stage table, and the
    claim-gate flags.

    Args:
        out_dir: Directory that receives
            ``continuous_update_governor_manifest.json``; created (with
            parents) if it does not exist.
        source_roots: Files/directories to inventory. ``None`` or an empty
            list falls back to ``["data/jsonl", "third_party"]``.
        dataset_manifest: Path handed to ``_load_json`` as given; a falsy
            value or a missing file makes every ``current_dataset`` count
            take its empty default.
        cadence_hours: Stored verbatim under ``"cadence_hours"``.
        sample_limit: Maximum number of inventoried files (taken from the
            start of the sorted candidate list) that are stat'ed and hashed
            for ``"inventory_sample"``.

    Returns:
        The manifest as a dict, including ``"manifest_path"`` with the
        location of the file that was written.

    Raises:
        ValueError: If ``sample_limit`` is negative.
    """
    # Reject a bad limit before touching the filesystem; a negative slice
    # bound would silently drop files from the end instead of capping.
    if sample_limit < 0:
        raise ValueError("sample_limit must be >= 0")

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    roots = list(source_roots or ["data/jsonl", "third_party"])
    files = _candidate_files(roots)
    # Only the sample is hashed; "candidate_files" still reports the full count.
    sample = [
        {
            "path": str(path),
            "bytes": path.stat().st_size,
            "sha256": _sha_file(path),
        }
        for path in files[:sample_limit]
    ]
    data = _load_json(dataset_manifest)

    report: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        # Timezone-aware UTC timestamp in ISO 8601 form.
        "created_at": datetime.now(timezone.utc).isoformat(),
        "cadence_hours": cadence_hours,
        "sources": {
            "roots": [str(r) for r in roots],
            "candidate_files": len(files),
            "inventory_sample": sample,
        },
        "current_dataset": {
            "manifest": str(dataset_manifest) if dataset_manifest else None,
            "kept_records": data.get("kept_records", 0),
            "rejected_records": data.get("rejected_records", 0),
            "domain_counts": data.get("domain_counts", {}),
            "reject_counts": data.get("reject_counts", {}),
        },
        "pipeline": _pipeline_stages(),
        "claim_gate": {
            "continuous_update_plan_ready": True,
            "auto_absorb_without_filter_allowed": False,
            "always_up_to_date_claim_allowed": False,
            "zero_junk_claim_allowed": False,
            "reason": "The loop can keep sources refreshed and aggressively filtered, but every update still needs evidence, filtering, eval, and rollback gates.",
        },
    }

    path = out / "continuous_update_governor_manifest.json"
    # Set before serialising so the file on disk also carries its own path.
    report["manifest_path"] = str(path)
    path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    return report

Xet Storage Details

Size:
4.68 kB
·
Xet hash:
232e61540fd0d0d24d6be8f52572a251bea6bd6be7f611b95400600b33b57eb0

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.