Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /continuous_update_governor.py

bbkdevops

about 1 month ago

download

raw

4.68 kB

	from __future__ import annotations

	from datetime import datetime, timezone
	import hashlib
	import json
	from pathlib import Path
	from typing import Any


	# Identifier written into the "schema_version" field of the manifest.
	SCHEMA_VERSION = "tinymind-continuous-update-governor-v1"

	# File extensions (lower-case, with the leading dot) that make a file an
	# update candidate; suffixes are lower-cased before being checked against it.
	DEFAULT_SUFFIXES = {
	    # data and prose
	    ".jsonl",
	    ".json",
	    ".md",
	    ".txt",
	    # source code
	    ".py",
	    ".js",
	    ".ts",
	    ".rs",
	    ".go",
	    ".c",
	    ".cpp",
	    ".h",
	    ".java",
	    ".kt",
	    ".lua",
	}


	def _sha_file(path: Path) -> str:
	    """Return the hexadecimal SHA-256 digest of the file at *path*.

	    The file is opened in binary mode and consumed in 1 MiB pieces, so the
	    whole content is never held in memory at once.
	    """
	    piece_size = 1024 * 1024
	    digest = hashlib.sha256()
	    with path.open("rb") as stream:
	        while True:
	            piece = stream.read(piece_size)
	            if piece == b"":
	                # An empty read marks end of file.
	                break
	            digest.update(piece)
	    return digest.hexdigest()


	def _candidate_files(
	    source_roots: list[str | Path],
	    suffixes: set[str] | None = None,
	) -> list[Path]:
	    """Collect the files under *source_roots* that have an allowed suffix.

	    Args:
	        source_roots: Files or directories to inspect. A relative entry is
	            resolved against the directory one level above the directory
	            that holds this module, not against the working directory.
	            Directories are searched recursively; entries that are neither
	            an existing file nor a directory are skipped.
	        suffixes: Allowed file extensions (with leading dot), matched
	            case-insensitively. ``None`` selects ``DEFAULT_SUFFIXES``.

	    Returns:
	        The de-duplicated matches, ordered case-insensitively by path. Paths
	        that differ only in letter case are ordered by exact spelling.
	    """
	    if suffixes is None:
	        allowed = DEFAULT_SUFFIXES
	    else:
	        allowed = {suffix.lower() for suffix in suffixes}
	    base_dir = Path(__file__).resolve().parents[1]

	    found: set[Path] = set()
	    for root in source_roots:
	        location = Path(root)
	        if not location.is_absolute():
	            location = base_dir / location
	        if location.is_file():
	            if location.suffix.lower() in allowed:
	                found.add(location)
	        elif location.is_dir():
	            found.update(
	                entry
	                for entry in location.rglob("*")
	                if entry.is_file() and entry.suffix.lower() in allowed
	            )

	    # The lower-cased path alone is not a total order: two paths differing
	    # only by case would compare equal and keep whatever order the set
	    # yielded, which changes between runs. The exact path breaks the tie.
	    return sorted(found, key=lambda item: (str(item).lower(), str(item)))


	def _load_json(path: str | Path | None) -> dict[str, Any]:
	    """Read a JSON object from *path*, or return ``{}`` when none is usable.

	    Args:
	        path: Location of the JSON file; a relative path is looked up from
	            the current working directory. ``None`` or an empty value means
	            there is no file to read.

	    Returns:
	        The decoded top-level JSON object. An empty dict is returned when
	        *path* is falsy, does not name a regular file, or contains JSON
	        whose top level is not an object.

	    Raises:
	        json.JSONDecodeError: If the file exists but is not valid JSON.
	    """
	    if not path:
	        return {}
	    source = Path(path)
	    # is_file() instead of exists(): a directory at this location is treated
	    # like a missing file rather than failing inside read_text().
	    if not source.is_file():
	        return {}
	    # Bytes that are not valid UTF-8 are replaced instead of raising.
	    payload = json.loads(source.read_text(encoding="utf-8", errors="replace"))
	    # The declared return type is a dict; a list or scalar at the top level
	    # carries no usable fields, so it is reported as "nothing loaded".
	    if not isinstance(payload, dict):
	        return {}
	    return payload


	def build_continuous_update_manifest(
	    out_dir: str | Path,
	    *,
	    source_roots: list[str | Path] | None = None,
	    dataset_manifest: str | Path | None = "reports/dataset_quality_governor/dataset_quality_governor_manifest.json",
	    cadence_hours: int = 24,
	) -> dict[str, Any]:
	    """Assemble the continuous-update manifest, write it to disk, return it.

	    Args:
	        out_dir: Directory that receives the manifest file; created
	            (including parents) when it does not exist.
	        source_roots: Files or directories to inventory. ``None`` or an
	            empty list selects ``data/jsonl`` and ``third_party``.
	        dataset_manifest: JSON file whose record and domain counts are
	            copied into the report; missing fields become ``0`` or ``{}``.
	        cadence_hours: Value stored verbatim under ``cadence_hours``.

	    Returns:
	        The report dict, which is also serialized (UTF-8, sorted keys,
	        two-space indent) to ``continuous_update_governor_manifest.json``
	        inside *out_dir*. Its ``manifest_path`` entry names that file.
	    """
	    target_dir = Path(out_dir)
	    target_dir.mkdir(parents=True, exist_ok=True)

	    if source_roots:
	        root_entries = list(source_roots)
	    else:
	        root_entries = ["data/jsonl", "third_party"]
	    candidates = _candidate_files(root_entries)

	    # Only the first 200 candidates are sized and hashed; the full count is
	    # still reported separately under "candidate_files".
	    inventory = []
	    for candidate in candidates[:200]:
	        entry = {"path": str(candidate)}
	        entry["bytes"] = candidate.stat().st_size
	        entry["sha256"] = _sha_file(candidate)
	        inventory.append(entry)

	    dataset_info = _load_json(dataset_manifest)

	    # (stage, rule, output) triples, expanded into one dict per stage below.
	    stage_specs = (
	        (
	            "discover",
	            "collect only explicit source roots; no blind public-internet ingestion",
	            "candidate file inventory with sha256",
	        ),
	        (
	            "normalize",
	            "decode text/json/jsonl, chunk large artifacts, preserve source_path/source_sha256",
	            "source-grounded training candidates",
	        ),
	        (
	            "purity_filter",
	            "reject secrets, encoded blobs, repetition loops, symbol noise, malformed records, and unsafe cyber payloads",
	            "quality-governed JSONL",
	        ),
	        (
	            "holdout",
	            "reserve deterministic eval split before training",
	            "train/eval evidence pair",
	        ),
	        (
	            "train_gate",
	            "continued training may use only governed data manifests, not raw scrape dumps",
	            "adapter/update candidate",
	        ),
	        (
	            "eval_gate",
	            "measure loss, PPL, Thai/English, coding, tool reliability, grounding, long context, drift, and regression",
	            "promotion or rollback decision",
	        ),
	    )
	    stages = [
	        {"stage": stage, "rule": rule, "output": output}
	        for stage, rule, output in stage_specs
	    ]

	    manifest: dict[str, Any] = {}
	    manifest["schema_version"] = SCHEMA_VERSION
	    manifest["created_at"] = datetime.now(timezone.utc).isoformat()
	    manifest["cadence_hours"] = cadence_hours
	    manifest["sources"] = {
	        "roots": [str(entry) for entry in root_entries],
	        "candidate_files": len(candidates),
	        "inventory_sample": inventory,
	    }
	    manifest["current_dataset"] = {
	        "manifest": str(dataset_manifest) if dataset_manifest else None,
	        "kept_records": dataset_info.get("kept_records", 0),
	        "rejected_records": dataset_info.get("rejected_records", 0),
	        "domain_counts": dataset_info.get("domain_counts", {}),
	        "reject_counts": dataset_info.get("reject_counts", {}),
	    }
	    manifest["pipeline"] = stages
	    manifest["claim_gate"] = {
	        "continuous_update_plan_ready": True,
	        "auto_absorb_without_filter_allowed": False,
	        "always_up_to_date_claim_allowed": False,
	        "zero_junk_claim_allowed": False,
	        "reason": "The loop can keep sources refreshed and aggressively filtered, but every update still needs evidence, filtering, eval, and rollback gates.",
	    }

	    manifest_file = target_dir / "continuous_update_governor_manifest.json"
	    manifest["manifest_path"] = str(manifest_file)
	    serialized = json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True)
	    manifest_file.write_text(serialized, encoding="utf-8")
	    return manifest

Xet Storage Details

Size: 4.68 kB
Xet hash: 232e61540fd0d0d24d6be8f52572a251bea6bd6be7f611b95400600b33b57eb0

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.