Buckets:
bbkdevops/unicosys-hypergraph-bucket/tinymind-native-8b-remote-handoff/bundle/evaluation/native_baseline_probe.py
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| import re | |
| import time | |
| from typing import Any | |
| import torch | |
| from model.native_axiom_regenesis import AxiomReGenesisConfig, TinyMindAxiomReGenesis, config_from_dict | |
| from runtime.constrained_decode import apply_constrained_repair | |
| from train.native_axiom_regenesis_train import _decode, _encode, _retrieved_from_ids | |
# Fixed probe set, one entry per evaluated axis. Each probe carries:
#   "id"     - stable identifier used to match baseline and native samples,
#   "prompt" - the user prompt sent to the model,
#   "must"   - terms that score() searches for (case-insensitively) in a response.
PROBES: list[dict[str, Any]] = [
    {
        "id": "language_semantics",
        "prompt": "อธิบายความต่างระหว่าง ambiguity, vagueness และ uncertainty เป็นภาษาไทย พร้อมตัวอย่างสั้น ๆ",
        "must": ["ambiguity", "vagueness", "uncertainty"],
    },
    {
        "id": "raw_code_bits",
        "prompt": "Explain how to sign-extend a packed signed 6-bit integer and name two boundary values.",
        "must": ["mask", "sign", "-32", "31"],
    },
    {
        "id": "systems_abi",
        "prompt": "Explain ABI compatibility for a Rust/C FFI boundary in one concise paragraph.",
        "must": ["calling", "layout", "symbol"],
    },
    {
        "id": "pure_math_bound",
        "prompt": "พิสูจน์สั้น ๆ ว่า m_t = c m_{t-1} + x_t มีขอบเขตเมื่อ 0<c<1 และ |x_t|<=B",
        "must": ["B", "1-c", "ขอบเขต"],
    },
    {
        "id": "entropy_relation",
        "prompt": "Explain the relation H(P,Q)=H(P)+KL(P||Q) and why it matters for eval loss.",
        "must": ["cross", "KL", "entropy"],
    },
]
| def _repeated_ngrams(text: str) -> bool: | |
| words = re.findall(r"\w+", text.lower(), flags=re.UNICODE) | |
| grams = [" ".join(words[i : i + 5]) for i in range(max(0, len(words) - 4))] | |
| return len(grams) != len(set(grams)) | |
def score(sample: dict[str, Any], response: str) -> tuple[int, list[str]]:
    """Score *response* against one probe and list the problems found.

    Args:
        sample: Probe dict; only its ``"must"`` list of required terms is read.
        response: Model output to evaluate.

    Returns:
        ``(points, flags)``. Scoring starts at 4, loses one point per flag and
        one additional point when required terms are missing, and is floored
        at 0. Flags appear in a fixed order: ``missing:<terms>``,
        ``repetition``, ``too_short``, ``broken_code_fence``.
    """
    haystack = response.lower()
    absent = [term for term in sample["must"] if term.lower() not in haystack]

    # (condition, label) pairs, in the order the flags are reported.
    checks = (
        (bool(absent), "missing:" + ",".join(absent)),
        (_repeated_ngrams(response), "repetition"),
        (len(response) < 80, "too_short"),
        # An odd number of fence markers means a code block was left open.
        (response.count("```") % 2 == 1, "broken_code_fence"),
    )
    flags: list[str] = [label for failed, label in checks if failed]

    # Missing terms are penalised twice: once as a flag, once on top of it.
    penalty = len(flags) + (1 if absent else 0)
    return max(4 - penalty, 0), flags
| def _load_native(checkpoint: str | Path, device: torch.device) -> TinyMindAxiomReGenesis: | |
| payload = torch.load(checkpoint, map_location=device) | |
| cfg_payload = payload.get("config") | |
| if not isinstance(cfg_payload, dict): | |
| raise ValueError(f"native checkpoint missing config: {checkpoint}") | |
| cfg = config_from_dict(cfg_payload) | |
| model = TinyMindAxiomReGenesis(cfg).to(device) | |
| state_dict = payload.get("state_dict") | |
| if not isinstance(state_dict, dict): | |
| raise ValueError(f"native checkpoint missing state_dict: {checkpoint}") | |
| model.load_state_dict(state_dict) | |
| model.eval() | |
| return model | |
def _generate_native(model: TinyMindAxiomReGenesis, prompt: str, device: torch.device, *, max_new_tokens: int = 96) -> str:
    """Generate the model's answer to *prompt* and return it as stripped text.

    The prompt is wrapped in a fixed SYSTEM/USER/ASSISTANT template, encoded
    with a length budget of ``min(model.cfg.max_seq_len, 192)``, and passed to
    ``model.generate`` together with the output of ``_retrieved_from_ids``.
    Only the positions after the prompt length are decoded.

    Args:
        model: Loaded native model; its ``cfg`` supplies the tokenizer settings.
        prompt: User prompt text.
        device: Device the encoded ids are moved to.
        max_new_tokens: Forwarded unchanged to ``model.generate``.

    Returns:
        The decoded continuation with surrounding whitespace removed.
    """
    cfg = model.cfg
    text = f"SYSTEM: Answer precisely. Use constraints from the user. Avoid repetition.\nUSER: {prompt}\nASSISTANT:"
    prompt_budget = min(cfg.max_seq_len, 192)
    # unsqueeze(0) adds the leading dimension that generate() is called with.
    ids = _encode(text, prompt_budget, cfg.vocab_size, cfg.tokenizer_mode).unsqueeze(0).to(device)

    # This is inference only, so gradient tracking is switched off. Harmless
    # if model.generate already disables it internally.
    with torch.no_grad():
        retrieved = _retrieved_from_ids(ids, cfg.regen_top_k)
        out = model.generate(ids, max_new_tokens=max_new_tokens, retrieved_tokens=retrieved)

    # Drop the prompt positions; keep only what was generated after them.
    tail = out[:, ids.shape[1] :]
    return _decode(tail[0], cfg.vocab_size, cfg.tokenizer_mode).strip()
def _extract_baseline_from_deep_core(report_path: str | Path, adapter_name: str | None = None) -> dict[str, Any]:
    """Read a baseline report and re-score one adapter's responses on PROBES.

    Args:
        report_path: Path of a UTF-8 JSON report with an ``"adapters"`` mapping.
        adapter_name: Adapter to use. When it is falsy or not present in the
            report, the adapter with the highest ``total_score`` is used.

    Returns:
        A dict with the chosen adapter ``name``, its ``adapter`` field, the
        ``report_path``, the summed ``score``, the ``max_score``
        (4 per probe) and one normalized entry per probe under ``samples``.
        Each entry keeps the report's own ``score``/``flags`` as
        ``source_score``/``source_flags`` next to the freshly computed ones.

    Raises:
        ValueError: If the report has no adapters.
    """
    report = json.loads(Path(report_path).read_text(encoding="utf-8"))
    adapters = report.get("adapters") or {}
    if not adapters:
        raise ValueError(f"baseline report has no adapters: {report_path}")

    def _total_score(key: str) -> int:
        # A missing, null or non-numeric total_score ranks last instead of
        # aborting the whole selection.
        try:
            return int(adapters[key].get("total_score", -1))
        except (TypeError, ValueError, OverflowError):
            return -1

    if adapter_name and adapter_name in adapters:
        name = adapter_name
    else:
        name = max(adapters, key=_total_score)

    payload = adapters[name]
    samples = payload.get("samples") or []
    by_id = {sample.get("id"): sample for sample in samples if isinstance(sample, dict)}

    normalized = []
    for probe in PROBES:
        # Probes absent from the report are scored as an empty response.
        sample = by_id.get(probe["id"]) or {}
        raw_response = sample.get("response")
        # JSON null must become an empty response, not the text "None".
        response = "" if raw_response is None else str(raw_response)
        points, flags = score(probe, response)
        normalized.append(
            {
                "id": probe["id"],
                "prompt": probe["prompt"],
                "response": response,
                "score": points,
                "flags": flags,
                "source_score": sample.get("score"),
                "source_flags": sample.get("flags"),
            }
        )

    return {
        "name": name,
        "adapter": payload.get("adapter"),
        "report_path": str(report_path),
        "score": sum(sample["score"] for sample in normalized),
        "max_score": len(PROBES) * 4,
        "samples": normalized,
    }
def _compare_axes(
    baseline_samples: list[dict[str, Any]],
    native_samples: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], int, int, int]:
    """Compare per-probe scores; return (rows, native_wins, baseline_wins, ties)."""
    baseline_by_id = {sample["id"]: sample for sample in baseline_samples}
    native_by_id = {sample["id"]: sample for sample in native_samples}
    tally = {"native": 0, "baseline": 0, "tie": 0}
    rows: list[dict[str, Any]] = []
    for probe in PROBES:
        base_score = int(baseline_by_id[probe["id"]]["score"])
        native_score = int(native_by_id[probe["id"]]["score"])
        if native_score > base_score:
            winner = "native"
        elif native_score == base_score:
            winner = "tie"
        else:
            winner = "baseline"
        tally[winner] += 1
        rows.append(
            {
                "id": probe["id"],
                "baseline_score": base_score,
                "native_score": native_score,
                "delta_native_minus_baseline": native_score - base_score,
                "winner": winner,
            }
        )
    return rows, tally["native"], tally["baseline"], tally["tie"]


def run_native_baseline_probe(
    out_dir: str | Path,
    *,
    native_checkpoint: str | Path,
    baseline_report: str | Path,
    baseline_adapter_name: str | None = None,
    device: str | None = None,
    max_new_tokens: int = 96,
    controlled_repair: bool = False,
) -> dict[str, Any]:
    """Run every probe on the native model and compare it with a baseline.

    The baseline is re-scored from *baseline_report* before the native
    checkpoint is loaded. Each probe is then generated, optionally passed
    through ``apply_constrained_repair``, and scored. The resulting report is
    written to ``<out_dir>/native_baseline_probe_report.json`` and returned.

    Args:
        out_dir: Output directory; created (with parents) when missing.
        native_checkpoint: Checkpoint path handed to ``_load_native``.
        baseline_report: Report path handed to
            ``_extract_baseline_from_deep_core``.
        baseline_adapter_name: Optional adapter to select from the report.
        device: Torch device string; defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.
        max_new_tokens: Generation budget per probe.
        controlled_repair: When true, responses are post-processed by
            ``apply_constrained_repair`` and the report marks the scores as
            not being raw model capability.

    Returns:
        The report dict, including ``json_path`` of the written file.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    run_device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
    # perf_counter is monotonic, so elapsed_s is immune to wall-clock changes.
    started = time.perf_counter()
    max_score = len(PROBES) * 4

    baseline = _extract_baseline_from_deep_core(baseline_report, baseline_adapter_name)
    model = _load_native(native_checkpoint, run_device)

    native_samples = []
    for probe in PROBES:
        raw_response = _generate_native(model, probe["prompt"], run_device, max_new_tokens=max_new_tokens)
        if controlled_repair:
            # The repair step gets a copy of the probe with an "axis" key,
            # defaulting to the probe id.
            repair_task = dict(probe)
            repair_task.setdefault("axis", probe["id"])
            response, events = apply_constrained_repair(repair_task, raw_response)
        else:
            response, events = raw_response, []
        points, flags = score(probe, response)
        sample = {
            "id": probe["id"],
            "prompt": probe["prompt"],
            "response": response,
            "score": points,
            "flags": flags,
        }
        if events:
            # Keep the unrepaired text only when the repair reported events.
            sample["raw_response"] = raw_response
            sample["constraint_events"] = events
        native_samples.append(sample)

    native = {
        "name": model.cfg.architecture_name,
        "checkpoint": str(native_checkpoint),
        "parameter_count": model.parameter_count,
        "config": {
            "layers": model.cfg.n_layers,
            "dim": model.cfg.dim,
            "lanes": model.cfg.lanes,
            "virtual_dim": model.cfg.axiom_effective_dim,
        },
        "score": sum(sample["score"] for sample in native_samples),
        "max_score": max_score,
        "samples": native_samples,
    }

    axis_comparison, native_wins, baseline_wins, ties = _compare_axes(baseline["samples"], native["samples"])

    report = {
        "schema": "tinymind.native_baseline_probe.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "elapsed_s": time.perf_counter() - started,
        "device": str(run_device),
        "probe_count": len(PROBES),
        "baseline": baseline,
        "native": native,
        "axis_comparison": axis_comparison,
        "summary": {
            "baseline_score": baseline["score"],
            "native_score": native["score"],
            "max_score": max_score,
            "native_wins": native_wins,
            "baseline_wins": baseline_wins,
            "ties": ties,
            "native_beats_baseline_2_axes": native_wins >= 2,
            "controlled_repair_enabled": bool(controlled_repair),
        },
        "claim_gate": {
            "baseline_vs_native_probe_complete": True,
            "native_promotion_allowed": native_wins >= 2 and native["score"] > baseline["score"],
            "scale_allowed_by_probe": native_wins >= 2,
            "raw_model_capability_claim": not bool(controlled_repair),
            "world_best_claim_allowed": False,
            "reason": "This is a five-axis local probe. Controlled repair scores are runtime-system scores, not raw model scores.",
        },
    }

    path = out / "native_baseline_probe_report.json"
    report["json_path"] = str(path)
    path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return report
Xet Storage Details
- Size: 9.55 kB
- Xet hash: 39d279c77619b1e69f3f7e5e5c97ce950a4b233d32742ee641bff742ca8826c9
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.