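# NOTE: the four lines below are file-viewer page residue captured with the source, not program text.
# bbkdevops's picture
# download
# raw
# 9.55 kB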
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
import re
import time
from typing import Any
import torch
from model.native_axiom_regenesis import AxiomReGenesisConfig, TinyMindAxiomReGenesis, config_from_dict
from runtime.constrained_decode import apply_constrained_repair
from train.native_axiom_regenesis_train import _decode, _encode, _retrieved_from_ids
# Fixed probe set that is scored for both the baseline report and the native model.
# Each entry carries:
#   "id"     - axis identifier; baseline samples are matched to probes by this id.
#   "prompt" - text sent to the model (the set mixes Thai and English prompts).
#   "must"   - terms that score() looks for, case-insensitively, in the response.
# The per-probe maximum is 4 points, so the overall maximum is len(PROBES) * 4.
PROBES = [
    {
        "id": "language_semantics",
        "prompt": "อธิบายความต่างระหว่าง ambiguity, vagueness และ uncertainty เป็นภาษาไทย พร้อมตัวอย่างสั้น ๆ",
        "must": ["ambiguity", "vagueness", "uncertainty"],
    },
    {
        "id": "raw_code_bits",
        "prompt": "Explain how to sign-extend a packed signed 6-bit integer and name two boundary values.",
        "must": ["mask", "sign", "-32", "31"],
    },
    {
        "id": "systems_abi",
        "prompt": "Explain ABI compatibility for a Rust/C FFI boundary in one concise paragraph.",
        "must": ["calling", "layout", "symbol"],
    },
    {
        "id": "pure_math_bound",
        "prompt": "พิสูจน์สั้น ๆ ว่า m_t = c m_{t-1} + x_t มีขอบเขตเมื่อ 0<c<1 และ |x_t|<=B",
        "must": ["B", "1-c", "ขอบเขต"],
    },
    {
        "id": "entropy_relation",
        "prompt": "Explain the relation H(P,Q)=H(P)+KL(P||Q) and why it matters for eval loss.",
        "must": ["cross", "KL", "entropy"],
    },
]
def _repeated_ngrams(text: str) -> bool:
    """Return True when any window of five consecutive words occurs twice.

    The text is lower-cased and split into runs of word characters, so the
    comparison ignores case and punctuation. Texts with fewer than five
    words have no windows and therefore never count as repetitive.
    """
    tokens = re.findall(r"\w+", text.lower(), flags=re.UNICODE)
    seen: set[str] = set()
    # There are len(tokens) - 4 five-word windows (none when the text is shorter).
    for start in range(max(0, len(tokens) - 4)):
        window = " ".join(tokens[start : start + 5])
        if window in seen:
            return True
        seen.add(window)
    return False
def score(sample: dict[str, Any], response: str) -> tuple[int, list[str]]:
    """Score *response* against a probe and report why points were lost.

    Starting from 4 points, one point is removed per raised flag, and a
    further point is removed when any required term is missing. The result
    is clamped at zero.

    Args:
        sample: Probe dict; only its ``"must"`` list of required terms is read.
        response: Model output to evaluate.

    Returns:
        ``(points, flags)`` where flags appear in the fixed order
        ``missing:<terms>``, ``repetition``, ``too_short``, ``broken_code_fence``.
    """
    haystack = response.lower()
    absent = [term for term in sample["must"] if term.lower() not in haystack]
    # An odd number of fence markers means a code block was opened but never closed.
    unbalanced_fence = response.count("```") % 2 == 1
    checks = (
        (bool(absent), "missing:" + ",".join(absent)),
        (_repeated_ngrams(response), "repetition"),
        (len(response) < 80, "too_short"),
        (unbalanced_fence, "broken_code_fence"),
    )
    flags: list[str] = [label for triggered, label in checks if triggered]
    # Missing required terms cost double: once as a flag, once as an extra point.
    penalty = len(flags) + (1 if absent else 0)
    return max(4 - penalty, 0), flags
def _load_native(checkpoint: str | Path, device: torch.device) -> TinyMindAxiomReGenesis:
    """Load a native checkpoint and return the model on *device* in eval mode.

    The checkpoint must deserialize to a dict holding a ``"config"`` dict
    (passed to ``config_from_dict``) and a ``"state_dict"`` dict (passed to
    ``load_state_dict``). The whole payload is validated before the model is
    built, so a malformed checkpoint fails without allocating the model.

    Args:
        checkpoint: Path to the checkpoint file.
        device: Device the tensors are mapped to and the model is moved to.

    Returns:
        The constructed model with weights loaded, switched to eval mode.

    Raises:
        ValueError: If the payload is not a dict, or lacks a dict-valued
            ``"config"`` or ``"state_dict"`` entry.
    """
    # SECURITY NOTE(review): torch.load unpickles the file. Only load
    # checkpoints from trusted sources, or confirm that this checkpoint format
    # is compatible with ``weights_only=True`` and pass it explicitly.
    payload = torch.load(checkpoint, map_location=device)
    if not isinstance(payload, dict):
        # Without this check a bare tensor/list checkpoint fails with an
        # opaque AttributeError on ``.get`` below.
        raise ValueError(f"native checkpoint is not a dict payload: {checkpoint}")
    cfg_payload = payload.get("config")
    if not isinstance(cfg_payload, dict):
        raise ValueError(f"native checkpoint missing config: {checkpoint}")
    state_dict = payload.get("state_dict")
    if not isinstance(state_dict, dict):
        raise ValueError(f"native checkpoint missing state_dict: {checkpoint}")
    cfg = config_from_dict(cfg_payload)
    model = TinyMindAxiomReGenesis(cfg).to(device)
    model.load_state_dict(state_dict)
    model.eval()
    return model
@torch.no_grad()
def _generate_native(model: TinyMindAxiomReGenesis, prompt: str, device: torch.device, *, max_new_tokens: int = 96) -> str:
    """Generate a reply to *prompt* with the native model and return it as text.

    The prompt is wrapped in a fixed SYSTEM/USER/ASSISTANT frame, encoded with
    the model's own tokenizer settings, and passed to ``model.generate``
    together with the tokens from ``_retrieved_from_ids``. Runs with gradient
    tracking disabled.

    Args:
        model: Loaded native model; its ``cfg`` supplies sequence length,
            vocabulary size, tokenizer mode and retrieval top-k.
        prompt: User prompt text.
        device: Device the encoded prompt is moved to.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        The decoded continuation with surrounding whitespace stripped.
    """
    cfg = model.cfg
    framed_prompt = f"SYSTEM: Answer precisely. Use constraints from the user. Avoid repetition.\nUSER: {prompt}\nASSISTANT:"
    # The encode length is capped at 192 even when the model allows more.
    context_len = min(cfg.max_seq_len, 192)
    encoded = _encode(framed_prompt, context_len, cfg.vocab_size, cfg.tokenizer_mode)
    prompt_ids = encoded.unsqueeze(0).to(device)
    retrieved = _retrieved_from_ids(prompt_ids, cfg.regen_top_k)
    generated = model.generate(prompt_ids, max_new_tokens=max_new_tokens, retrieved_tokens=retrieved)
    # Presumably generate() returns prompt + continuation; drop the prompt part.
    prompt_len = prompt_ids.shape[1]
    continuation = generated[:, prompt_len:]
    decoded = _decode(continuation[0], cfg.vocab_size, cfg.tokenizer_mode)
    return decoded.strip()
def _extract_baseline_from_deep_core(report_path: str | Path, adapter_name: str | None = None) -> dict[str, Any]:
    """Re-score one adapter from a baseline report against the local probes.

    The report is a JSON object with an ``"adapters"`` mapping. The adapter
    named by *adapter_name* is used when present; otherwise (including when
    the name is unknown) the adapter with the highest ``"total_score"`` is
    chosen. Its samples are matched to ``PROBES`` by id and re-scored with
    ``score``, so baseline and native results share one rubric. The report's
    own per-sample score and flags are kept as ``source_score`` and
    ``source_flags``.

    Args:
        report_path: Path to the baseline JSON report (UTF-8).
        adapter_name: Preferred adapter key, or ``None`` to pick the best one.

    Returns:
        Dict with ``name``, ``adapter``, ``report_path``, ``score``,
        ``max_score`` and the normalized ``samples`` list (one per probe).

    Raises:
        ValueError: If the report has no usable ``"adapters"`` mapping or the
            selected adapter entry is not a JSON object.
    """
    report = json.loads(Path(report_path).read_text(encoding="utf-8"))
    adapters = report.get("adapters") if isinstance(report, dict) else None
    if not isinstance(adapters, dict) or not adapters:
        raise ValueError(f"baseline report has no adapters: {report_path}")

    def _total_score(key: str) -> int:
        # Rank malformed entries (non-dict, null or non-numeric total) last
        # instead of crashing the whole selection.
        entry = adapters[key]
        raw_total = entry.get("total_score", -1) if isinstance(entry, dict) else -1
        try:
            return int(raw_total)
        except (TypeError, ValueError, OverflowError):
            return -1

    if adapter_name and adapter_name in adapters:
        name = adapter_name
    else:
        name = max(adapters, key=_total_score)
    payload = adapters[name]
    if not isinstance(payload, dict):
        raise ValueError(f"baseline adapter entry is not an object: {name}")

    samples = payload.get("samples") or []
    if not isinstance(samples, list):
        samples = []
    by_id = {sample.get("id"): sample for sample in samples if isinstance(sample, dict)}

    normalized = []
    for probe in PROBES:
        # A probe absent from the report is scored as an empty response.
        sample = by_id.get(probe["id"]) or {}
        raw_response = sample.get("response")
        # A JSON null must count as "no response", not as the text "None".
        response = "" if raw_response is None else str(raw_response)
        points, flags = score(probe, response)
        normalized.append(
            {
                "id": probe["id"],
                "prompt": probe["prompt"],
                "response": response,
                "score": points,
                "flags": flags,
                "source_score": sample.get("score"),
                "source_flags": sample.get("flags"),
            }
        )
    return {
        "name": name,
        "adapter": payload.get("adapter"),
        "report_path": str(report_path),
        "score": sum(sample["score"] for sample in normalized),
        "max_score": len(PROBES) * 4,
        "samples": normalized,
    }
def run_native_baseline_probe(
    out_dir: str | Path,
    *,
    native_checkpoint: str | Path,
    baseline_report: str | Path,
    baseline_adapter_name: str | None = None,
    device: str | None = None,
    max_new_tokens: int = 96,
    controlled_repair: bool = False,
) -> dict[str, Any]:
    """Run every probe on the native model and compare it with the baseline.

    The baseline is re-scored from *baseline_report*, the native model is
    loaded from *native_checkpoint*, and each probe is generated, optionally
    passed through ``apply_constrained_repair``, and scored. The comparison
    report is written to ``native_baseline_probe_report.json`` inside
    *out_dir* (created if needed) and also returned.

    Args:
        out_dir: Directory that receives the JSON report.
        native_checkpoint: Checkpoint path for the native model.
        baseline_report: Path to the baseline JSON report.
        baseline_adapter_name: Preferred baseline adapter, if any.
        device: Torch device string; when falsy, CUDA is used if available,
            otherwise CPU.
        max_new_tokens: Generation budget per probe.
        controlled_repair: When true, responses are post-processed by
            ``apply_constrained_repair`` before scoring.

    Returns:
        The report dict, including the ``json_path`` it was written to.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    if device:
        run_device = torch.device(device)
    else:
        run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    started = time.time()

    baseline = _extract_baseline_from_deep_core(baseline_report, baseline_adapter_name)
    model = _load_native(native_checkpoint, run_device)

    # --- Generate and score one native sample per probe. ---
    native_samples = []
    for probe in PROBES:
        raw_response = _generate_native(model, probe["prompt"], run_device, max_new_tokens=max_new_tokens)
        if controlled_repair:
            # The repair step gets a copy of the probe with an "axis" key,
            # defaulting to the probe id when the probe does not define one.
            repair_task = {**probe}
            if "axis" not in repair_task:
                repair_task["axis"] = probe["id"]
            response, events = apply_constrained_repair(repair_task, raw_response)
        else:
            response, events = raw_response, []
        points, flags = score(probe, response)
        entry = {
            "id": probe["id"],
            "prompt": probe["prompt"],
            "response": response,
            "score": points,
            "flags": flags,
        }
        if events:
            # Keep the untouched output only when repair actually changed something.
            entry["raw_response"] = raw_response
            entry["constraint_events"] = events
        native_samples.append(entry)

    cfg = model.cfg
    native = {
        "name": cfg.architecture_name,
        "checkpoint": str(native_checkpoint),
        "parameter_count": model.parameter_count,
        "config": {
            "layers": cfg.n_layers,
            "dim": cfg.dim,
            "lanes": cfg.lanes,
            "virtual_dim": cfg.axiom_effective_dim,
        },
        "score": sum(item["score"] for item in native_samples),
        "max_score": len(PROBES) * 4,
        "samples": native_samples,
    }

    # --- Per-axis comparison and win/tie tally. ---
    baseline_by_id = {item["id"]: item for item in baseline["samples"]}
    native_by_id = {item["id"]: item for item in native["samples"]}
    tally = {"native": 0, "tie": 0, "baseline": 0}
    axis_comparison = []
    for probe in PROBES:
        axis_id = probe["id"]
        base_score = int(baseline_by_id[axis_id]["score"])
        native_score = int(native_by_id[axis_id]["score"])
        delta = native_score - base_score
        if delta > 0:
            winner = "native"
        elif delta == 0:
            winner = "tie"
        else:
            winner = "baseline"
        tally[winner] += 1
        axis_comparison.append(
            {
                "id": axis_id,
                "baseline_score": base_score,
                "native_score": native_score,
                "delta_native_minus_baseline": delta,
                "winner": winner,
            }
        )
    native_wins = tally["native"]
    baseline_wins = tally["baseline"]
    ties = tally["tie"]
    wins_two_axes = native_wins >= 2
    repair_enabled = bool(controlled_repair)

    summary = {
        "baseline_score": baseline["score"],
        "native_score": native["score"],
        "max_score": len(PROBES) * 4,
        "native_wins": native_wins,
        "baseline_wins": baseline_wins,
        "ties": ties,
        "native_beats_baseline_2_axes": wins_two_axes,
        "controlled_repair_enabled": repair_enabled,
    }
    claim_gate = {
        "baseline_vs_native_probe_complete": True,
        "native_promotion_allowed": wins_two_axes and native["score"] > baseline["score"],
        "scale_allowed_by_probe": wins_two_axes,
        # Scores obtained with repair enabled are not raw-model scores.
        "raw_model_capability_claim": not repair_enabled,
        "world_best_claim_allowed": False,
        "reason": "This is a five-axis local probe. Controlled repair scores are runtime-system scores, not raw model scores.",
    }
    report = {
        "schema": "tinymind.native_baseline_probe.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "elapsed_s": time.time() - started,
        "device": str(run_device),
        "probe_count": len(PROBES),
        "baseline": baseline,
        "native": native,
        "axis_comparison": axis_comparison,
        "summary": summary,
        "claim_gate": claim_gate,
    }

    # The path is recorded inside the report before serializing, so the
    # written JSON also contains it.
    report_path = target_dir / "native_baseline_probe_report.json"
    report["json_path"] = str(report_path)
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    report_path.write_text(serialized + "\n", encoding="utf-8")
    return report

# NOTE: the lines below are file-viewer page residue captured with the source, not program text.
# Xet Storage Details
#
# Size: 9.55 kB · Xet hash: 39d279c77619b1e69f3f7e5e5c97ce950a4b233d32742ee641bff742ca8826c9
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.