Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /native_baseline_probe.py

bbkdevops

about 1 month ago

download

raw

9.55 kB

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path
	import re
	import time
	from typing import Any

	import torch

	from model.native_axiom_regenesis import AxiomReGenesisConfig, TinyMindAxiomReGenesis, config_from_dict
	from runtime.constrained_decode import apply_constrained_repair
	from train.native_axiom_regenesis_train import _decode, _encode, _retrieved_from_ids


	# Fixed probe set, one entry per evaluation axis. Each entry has:
	#   "id"     - axis identifier used to join baseline and native samples
	#   "prompt" - user text sent to the model
	#   "must"   - terms that score() looks for in the response
	#              (case-insensitive substring match)
	PROBES = [
	    {
	        "id": "language_semantics",
	        "prompt": "อธิบายความต่างระหว่าง ambiguity, vagueness และ uncertainty เป็นภาษาไทย พร้อมตัวอย่างสั้น ๆ",
	        "must": ["ambiguity", "vagueness", "uncertainty"],
	    },
	    {
	        "id": "raw_code_bits",
	        "prompt": "Explain how to sign-extend a packed signed 6-bit integer and name two boundary values.",
	        "must": ["mask", "sign", "-32", "31"],
	    },
	    {
	        "id": "systems_abi",
	        "prompt": "Explain ABI compatibility for a Rust/C FFI boundary in one concise paragraph.",
	        "must": ["calling", "layout", "symbol"],
	    },
	    {
	        "id": "pure_math_bound",
	        # Plain "|" bars: a backslash before "|" is an invalid escape sequence
	        # and would put a literal backslash into the prompt.
	        "prompt": "พิสูจน์สั้น ๆ ว่า m_t = c m_{t-1} + x_t มีขอบเขตเมื่อ 0<c<1 และ |x_t|<=B",
	        "must": ["B", "1-c", "ขอบเขต"],
	    },
	    {
	        "id": "entropy_relation",
	        "prompt": "Explain the relation H(P,Q)=H(P)+KL(P||Q) and why it matters for eval loss.",
	        "must": ["cross", "KL", "entropy"],
	    },
	]


	def _repeated_ngrams(text: str) -> bool:
	words = re.findall(r"\w+", text.lower(), flags=re.UNICODE)
	grams = [" ".join(words[i : i + 5]) for i in range(max(0, len(words) - 4))]
	return len(grams) != len(set(grams))


	def score(sample: dict[str, Any], response: str) -> tuple[int, list[str]]:
	    """Score *response* against one probe and report the problems found.

	    Scoring starts at 4 points and loses one point per flag; a ``missing:``
	    flag costs one additional point. The result never drops below 0.

	    Args:
	        sample: probe entry; only its ``"must"`` list of required terms is read.
	        response: model output to evaluate.

	    Returns:
	        ``(points, flags)`` where flags are, in order of appearance,
	        ``missing:<comma-joined terms>``, ``repetition``, ``too_short`` and
	        ``broken_code_fence``.
	    """
	    haystack = response.lower()
	    absent = [term for term in sample["must"] if term.lower() not in haystack]
	    checks = (
	        (bool(absent), "missing:" + ",".join(absent)),
	        (_repeated_ngrams(response), "repetition"),
	        (len(response) < 80, "too_short"),
	        # An odd number of fence markers means one fence was never closed.
	        (response.count("```") % 2 == 1, "broken_code_fence"),
	    )
	    flags = [label for failed, label in checks if failed]
	    # Missing required terms are penalised twice: once as a flag, once extra.
	    penalty = len(flags) + (1 if absent else 0)
	    return max(4 - penalty, 0), flags


	def _load_native(checkpoint: str \| Path, device: torch.device) -> TinyMindAxiomReGenesis:
	payload = torch.load(checkpoint, map_location=device)
	cfg_payload = payload.get("config")
	if not isinstance(cfg_payload, dict):
	raise ValueError(f"native checkpoint missing config: {checkpoint}")
	cfg = config_from_dict(cfg_payload)
	model = TinyMindAxiomReGenesis(cfg).to(device)
	state_dict = payload.get("state_dict")
	if not isinstance(state_dict, dict):
	raise ValueError(f"native checkpoint missing state_dict: {checkpoint}")
	model.load_state_dict(state_dict)
	model.eval()
	return model


	@torch.no_grad()
	def _generate_native(model: TinyMindAxiomReGenesis, prompt: str, device: torch.device, *, max_new_tokens: int = 96) -> str:
	    """Generate a completion for *prompt* and return only the new text.

	    The prompt is wrapped in a fixed SYSTEM/USER/ASSISTANT frame, encoded with
	    ``_encode`` and passed to ``model.generate`` together with the output of
	    ``_retrieved_from_ids``. Everything past the encoded prompt length is
	    decoded and stripped of surrounding whitespace.

	    Args:
	        model: native model; its ``cfg`` supplies ``max_seq_len``,
	            ``vocab_size``, ``tokenizer_mode`` and ``regen_top_k``.
	        prompt: user text placed in the USER slot of the frame.
	        device: device the encoded ids are moved to.
	        max_new_tokens: forwarded unchanged to ``model.generate``.
	    """
	    cfg = model.cfg
	    framed = f"SYSTEM: Answer precisely. Use constraints from the user. Avoid repetition.\nUSER: {prompt}\nASSISTANT:"
	    # Second _encode argument is min(max_seq_len, 192); presumably a length
	    # cap for the encoded prompt - confirm against _encode.
	    length_cap = min(cfg.max_seq_len, 192)
	    encoded = _encode(framed, length_cap, cfg.vocab_size, cfg.tokenizer_mode)
	    prompt_ids = encoded.unsqueeze(0).to(device)
	    retrieved = _retrieved_from_ids(prompt_ids, cfg.regen_top_k)
	    generated = model.generate(prompt_ids, max_new_tokens=max_new_tokens, retrieved_tokens=retrieved)
	    # Keep only the positions after the prompt, then take the single batch row.
	    prompt_len = prompt_ids.shape[1]
	    new_tokens = generated[:, prompt_len:]
	    return _decode(new_tokens[0], cfg.vocab_size, cfg.tokenizer_mode).strip()


	def _extract_baseline_from_deep_core(report_path: str | Path, adapter_name: str | None = None) -> dict[str, Any]:
	    """Re-score one adapter from a baseline report against ``PROBES``.

	    The report is a UTF-8 JSON file with an ``"adapters"`` mapping. The adapter
	    named *adapter_name* is used when it exists in that mapping; otherwise the
	    adapter with the highest ``"total_score"`` is chosen (first one wins on
	    ties). Each probe is matched to the adapter sample with the same ``"id"``
	    and its response is scored again with ``score``; a probe without a sample
	    is scored as an empty response.

	    Args:
	        report_path: path to the baseline JSON report.
	        adapter_name: preferred adapter key, or None to pick by total score.

	    Returns:
	        Dict with ``name``, ``adapter``, ``report_path``, ``score``,
	        ``max_score`` and ``samples``. Each sample keeps the report's own
	        ``score``/``flags`` under ``source_score``/``source_flags``.

	    Raises:
	        ValueError: if the report has no adapters.
	    """
	    report = json.loads(Path(report_path).read_text(encoding="utf-8"))
	    adapters = report.get("adapters") or {}
	    if not adapters:
	        raise ValueError(f"baseline report has no adapters: {report_path}")

	    def _total_score(key: str) -> int:
	        # A null or non-numeric total_score ranks last instead of crashing
	        # the adapter selection.
	        try:
	            return int(adapters[key].get("total_score", -1))
	        except (TypeError, ValueError):
	            return -1

	    if adapter_name and adapter_name in adapters:
	        name = adapter_name
	    else:
	        name = max(adapters, key=_total_score)
	    payload = adapters[name]
	    samples = payload.get("samples") or []
	    by_id = {sample.get("id"): sample for sample in samples if isinstance(sample, dict)}

	    normalized = []
	    for probe in PROBES:
	        sample = by_id.get(probe["id"]) or {}
	        raw_response = sample.get("response")
	        # A JSON null must be scored as an empty response, not as the text "None".
	        response = "" if raw_response is None else str(raw_response)
	        points, flags = score(probe, response)
	        normalized.append(
	            {
	                "id": probe["id"],
	                "prompt": probe["prompt"],
	                "response": response,
	                "score": points,
	                "flags": flags,
	                "source_score": sample.get("score"),
	                "source_flags": sample.get("flags"),
	            }
	        )
	    return {
	        "name": name,
	        "adapter": payload.get("adapter"),
	        "report_path": str(report_path),
	        "score": sum(sample["score"] for sample in normalized),
	        "max_score": len(PROBES) * 4,
	        "samples": normalized,
	    }


	def _probe_native_samples(
	    model: TinyMindAxiomReGenesis,
	    device: torch.device,
	    *,
	    max_new_tokens: int,
	    controlled_repair: bool,
	) -> list[dict[str, Any]]:
	    """Generate, optionally repair, and score one response per probe."""
	    samples: list[dict[str, Any]] = []
	    for probe in PROBES:
	        raw_response = _generate_native(model, probe["prompt"], device, max_new_tokens=max_new_tokens)
	        if controlled_repair:
	            # The repair step receives a copy of the probe with an "axis" key
	            # defaulting to the probe id.
	            repair_task = dict(probe)
	            repair_task.setdefault("axis", probe["id"])
	            response, events = apply_constrained_repair(repair_task, raw_response)
	        else:
	            response, events = raw_response, []
	        points, flags = score(probe, response)
	        sample: dict[str, Any] = {
	            "id": probe["id"],
	            "prompt": probe["prompt"],
	            "response": response,
	            "score": points,
	            "flags": flags,
	        }
	        # The raw output is kept only when the repair step reported events.
	        if events:
	            sample["raw_response"] = raw_response
	            sample["constraint_events"] = events
	        samples.append(sample)
	    return samples


	def _compare_axes(
	    baseline_samples: list[dict[str, Any]],
	    native_samples: list[dict[str, Any]],
	) -> tuple[list[dict[str, Any]], dict[str, int]]:
	    """Compare per-probe scores; return the comparison rows and win/tie tallies."""
	    baseline_by_id = {sample["id"]: sample for sample in baseline_samples}
	    native_by_id = {sample["id"]: sample for sample in native_samples}
	    tally = {"native": 0, "tie": 0, "baseline": 0}
	    rows: list[dict[str, Any]] = []
	    for probe in PROBES:
	        base_score = int(baseline_by_id[probe["id"]]["score"])
	        native_score = int(native_by_id[probe["id"]]["score"])
	        delta = native_score - base_score
	        if delta > 0:
	            winner = "native"
	        elif delta == 0:
	            winner = "tie"
	        else:
	            winner = "baseline"
	        tally[winner] += 1
	        rows.append(
	            {
	                "id": probe["id"],
	                "baseline_score": base_score,
	                "native_score": native_score,
	                "delta_native_minus_baseline": delta,
	                "winner": winner,
	            }
	        )
	    return rows, tally


	def run_native_baseline_probe(
	    out_dir: str | Path,
	    *,
	    native_checkpoint: str | Path,
	    baseline_report: str | Path,
	    baseline_adapter_name: str | None = None,
	    device: str | None = None,
	    max_new_tokens: int = 96,
	    controlled_repair: bool = False,
	) -> dict[str, Any]:
	    """Run every probe on the native model and compare it with a baseline report.

	    The baseline is re-scored from *baseline_report*, the native checkpoint is
	    loaded and prompted once per probe, and the two are compared axis by axis.
	    The report is written to ``native_baseline_probe_report.json`` inside
	    *out_dir* (created if needed) and also returned.

	    Args:
	        out_dir: directory that receives the JSON report.
	        native_checkpoint: checkpoint path for the native model.
	        baseline_report: path to the baseline JSON report.
	        baseline_adapter_name: preferred adapter in the baseline report.
	        device: torch device string; defaults to ``"cuda"`` when available,
	            otherwise ``"cpu"``.
	        max_new_tokens: generation budget per probe.
	        controlled_repair: when True, responses pass through
	            ``apply_constrained_repair`` before scoring and the report marks
	            the scores as not being raw model capability.

	    Returns:
	        The report dict, including ``json_path``.

	    Raises:
	        ValueError: propagated from baseline extraction or checkpoint loading.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    run_device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
	    # Monotonic clock: elapsed_s must not be skewed by wall-clock adjustments.
	    started = time.perf_counter()
	    max_score = len(PROBES) * 4

	    baseline = _extract_baseline_from_deep_core(baseline_report, baseline_adapter_name)
	    model = _load_native(native_checkpoint, run_device)
	    native_samples = _probe_native_samples(
	        model,
	        run_device,
	        max_new_tokens=max_new_tokens,
	        controlled_repair=controlled_repair,
	    )

	    native = {
	        "name": model.cfg.architecture_name,
	        "checkpoint": str(native_checkpoint),
	        "parameter_count": model.parameter_count,
	        "config": {
	            "layers": model.cfg.n_layers,
	            "dim": model.cfg.dim,
	            "lanes": model.cfg.lanes,
	            "virtual_dim": model.cfg.axiom_effective_dim,
	        },
	        "score": sum(sample["score"] for sample in native_samples),
	        "max_score": max_score,
	        "samples": native_samples,
	    }
	    axis_comparison, tally = _compare_axes(baseline["samples"], native_samples)
	    native_wins = tally["native"]

	    report = {
	        "schema": "tinymind.native_baseline_probe.v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "elapsed_s": time.perf_counter() - started,
	        "device": str(run_device),
	        "probe_count": len(PROBES),
	        "baseline": baseline,
	        "native": native,
	        "axis_comparison": axis_comparison,
	        "summary": {
	            "baseline_score": baseline["score"],
	            "native_score": native["score"],
	            "max_score": max_score,
	            "native_wins": native_wins,
	            "baseline_wins": tally["baseline"],
	            "ties": tally["tie"],
	            "native_beats_baseline_2_axes": native_wins >= 2,
	            "controlled_repair_enabled": bool(controlled_repair),
	        },
	        "claim_gate": {
	            "baseline_vs_native_probe_complete": True,
	            # Promotion needs at least two axis wins and a strictly higher total.
	            "native_promotion_allowed": native_wins >= 2 and native["score"] > baseline["score"],
	            "scale_allowed_by_probe": native_wins >= 2,
	            "raw_model_capability_claim": not bool(controlled_repair),
	            "world_best_claim_allowed": False,
	            "reason": "This is a five-axis local probe. Controlled repair scores are runtime-system scores, not raw model scores.",
	        },
	    }
	    path = out / "native_baseline_probe_report.json"
	    report["json_path"] = str(path)
	    path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
	    return report

Xet Storage Details

Size: 9.55 kB
Xet hash: 39d279c77619b1e69f3f7e5e5c97ce950a4b233d32742ee641bff742ca8826c9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.