# Hugging Face file-view header captured with the source (not part of the module):
# bbkdevops's picture
# download
# raw
# 7.86 kB
"""External Hugging Face model checks on TinyMind self-dialogue data."""
from __future__ import annotations
import json
import math
import time
from pathlib import Path
from typing import Iterable
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from evaluation.local_evidence import _encode
from evaluation.self_dialogue_evidence import SYSTEM_PROMPT
from model.architecture import OmegaModel
# Hugging Face Hub model ids scored by run_external_model_eval when the caller
# does not supply its own ``model_ids``.
DEFAULT_EXTERNAL_MODELS = ("sshleifer/tiny-gpt2", "distilgpt2", "gpt2")
def _load_jsonl(path: str | Path, limit: int | None = None) -> list[dict]:
rows = [json.loads(line) for line in Path(path).read_text(encoding="utf-8").splitlines() if line.strip()]
return rows[:limit] if limit else rows
def _dialogue_text(row: dict) -> str:
    """Render one evaluation row as a tagged system/user/assistant transcript.

    Args:
        row: Mapping that provides the ``"prompt"`` and ``"target"`` entries.

    Returns:
        Three newline-separated segments: the module-level ``SYSTEM_PROMPT``
        wrapped in ``<bos><system>...</system>``, the prompt wrapped in
        ``<user>...</user>``, and the target after ``<assistant>`` followed by
        ``<eos>``.

    Raises:
        KeyError: If ``row`` lacks ``"prompt"`` or ``"target"``.
    """
    system_part = f"<bos><system>{SYSTEM_PROMPT}</system>\n"
    user_part = f"<user>{row['prompt']}</user>\n"
    assistant_part = f"<assistant>{row['target']}<eos>"
    return system_part + user_part + assistant_part
def _corrupt(row: dict) -> dict:
corrupt = dict(row)
oracle = str(row.get("oracle", ""))
if oracle.isdigit():
corrupt["target"] = (
"<plan>Trust the first draft without checking.</plan>\n"
"<act>Skip recomputation and keep the inconsistent value.</act>\n"
"<verify>failed_check=true; oracle_mismatch=true</verify>\n"
f"<final>{int(oracle) + 7}</final>"
)
else:
corrupt["target"] = row["target"] + "\n<verify>corrupted</verify>"
return corrupt
@torch.no_grad()
def _causal_lm_loss(model, tokenizer, text: str, device: torch.device) -> float:
encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
input_ids = encoded["input_ids"].to(device)
if input_ids.shape[1] < 2:
return float("inf")
logits = model(input_ids).logits[:, :-1, :].contiguous()
labels = input_ids[:, 1:].contiguous()
loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), labels.view(-1))
return float(loss.item())
def evaluate_hf_model(model_id: str, rows: list[dict], device: torch.device) -> dict:
    """Score one Hugging Face causal LM on oracle versus corrupted dialogues.

    The tokenizer and model are loaded with ``from_pretrained(model_id)``.
    Every row is rendered twice with ``_dialogue_text`` (once as given, once
    after ``_corrupt``) and both renderings are scored with
    ``_causal_lm_loss``. A row counts as a win when the oracle loss is less
    than or equal to the corrupted loss.

    Args:
        model_id: Identifier handed to ``from_pretrained``.
        rows: Evaluation rows to score.
        device: Device the model is moved to.

    Returns:
        A summary dict with the source tag, model id, sample count, average
        oracle loss and its perplexity (exponent capped at 20.0), average
        corrupted loss, win rate, wall-clock seconds, and per-row details for
        at most the first three rows.
    """
    t_start = time.time()

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # No pad token defined: reuse the EOS token for padding.
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.to(device)
    model.eval()

    # One (oracle_loss, corrupt_loss) pair per row, in input order.
    loss_pairs: list[tuple[float, float]] = []
    examples: list[dict] = []
    for row in rows:
        clean_loss = _causal_lm_loss(model, tokenizer, _dialogue_text(row), device)
        broken_loss = _causal_lm_loss(model, tokenizer, _dialogue_text(_corrupt(row)), device)
        loss_pairs.append((clean_loss, broken_loss))
        if len(examples) >= 3:
            continue
        examples.append(
            {
                "prompt": row["prompt"],
                "oracle_loss": clean_loss,
                "corrupt_loss": broken_loss,
                "preferred_oracle": clean_loss <= broken_loss,
            }
        )

    # Guard the denominators so an empty ``rows`` yields zeros, not a crash.
    scored = max(len(loss_pairs), 1)
    oracle_mean = sum(clean for clean, _ in loss_pairs) / scored
    corrupt_mean = sum(broken for _, broken in loss_pairs) / scored
    win_count = sum(1 for clean, broken in loss_pairs if clean <= broken)

    return {
        "source": "huggingface_hub",
        "model_id": model_id,
        "samples": len(rows),
        "native_avg_oracle_loss": oracle_mean,
        "native_avg_oracle_perplexity": float(math.exp(min(oracle_mean, 20.0))),
        "native_avg_corrupt_loss": corrupt_mean,
        "oracle_preference_accuracy": win_count / max(len(rows), 1),
        "elapsed_s": time.time() - t_start,
        "examples": examples,
    }
@torch.no_grad()
def _tinymind_loss(model: OmegaModel, text: str, device: torch.device) -> float:
    """Return the loss ``model`` reports for ``text`` used as input and labels.

    Args:
        model: Model exposing ``cfg.max_seq_len`` and ``cfg.vocab_size`` and
            returning a mapping with a ``"loss"`` entry when called with
            ``labels``.
        text: Text to encode with ``_encode``.
        device: Device the encoded ids are moved to.

    Returns:
        The ``"loss"`` value converted to a Python float.
    """
    cfg = model.cfg
    # NOTE(review): presumably _encode returns an unbatched tensor of token
    # ids; unsqueeze(0) then adds the batch dimension. Confirm against
    # evaluation.local_evidence.
    encoded = _encode(text, cfg.max_seq_len, cfg.vocab_size)
    batch = encoded.unsqueeze(0).to(device)
    result = model(batch, labels=batch)
    return float(result["loss"].item())
def evaluate_tinymind_checkpoint(checkpoint_path: str | Path, rows: list[dict], device: torch.device) -> dict:
    """Score a local TinyMind checkpoint on oracle versus corrupted dialogues.

    The checkpoint must contain ``"model_cfg"`` (passed to ``OmegaModel``) and
    ``"model_state"`` (passed to ``load_state_dict``). Every row is rendered
    twice with ``_dialogue_text`` (once as given, once after ``_corrupt``)
    and both renderings are scored with ``_tinymind_loss``. A row counts as a
    win when the oracle loss is less than or equal to the corrupted loss.

    Args:
        checkpoint_path: File handed to ``torch.load``.
        rows: Evaluation rows to score.
        device: Device used for ``map_location`` and for the model.

    Returns:
        A summary dict with the source tag, fixed model id, checkpoint path,
        sample count, average oracle loss and its perplexity (exponent capped
        at 20.0), average corrupted loss, win rate, wall-clock seconds, and
        per-row details for at most the first three rows.
    """
    t_start = time.time()

    # NOTE(review): weights_only=False allows torch.load to unpickle arbitrary
    # Python objects; only point this at checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    model = OmegaModel(checkpoint["model_cfg"]).to(device)
    model.load_state_dict(checkpoint["model_state"])
    model.eval()

    # One (oracle_loss, corrupt_loss) pair per row, in input order.
    loss_pairs: list[tuple[float, float]] = []
    examples: list[dict] = []
    for row in rows:
        clean_loss = _tinymind_loss(model, _dialogue_text(row), device)
        broken_loss = _tinymind_loss(model, _dialogue_text(_corrupt(row)), device)
        loss_pairs.append((clean_loss, broken_loss))
        if len(examples) >= 3:
            continue
        examples.append(
            {
                "prompt": row["prompt"],
                "oracle_loss": clean_loss,
                "corrupt_loss": broken_loss,
                "preferred_oracle": clean_loss <= broken_loss,
            }
        )

    # Guard the denominators so an empty ``rows`` yields zeros, not a crash.
    scored = max(len(loss_pairs), 1)
    oracle_mean = sum(clean for clean, _ in loss_pairs) / scored
    corrupt_mean = sum(broken for _, broken in loss_pairs) / scored
    win_count = sum(1 for clean, broken in loss_pairs if clean <= broken)

    return {
        "source": "local_tinymind_checkpoint",
        "model_id": "TinyMind-PureField-ReGenesis-OracleMax",
        "checkpoint": str(checkpoint_path),
        "samples": len(rows),
        "native_avg_oracle_loss": oracle_mean,
        "native_avg_oracle_perplexity": float(math.exp(min(oracle_mean, 20.0))),
        "native_avg_corrupt_loss": corrupt_mean,
        "oracle_preference_accuracy": win_count / max(len(rows), 1),
        "elapsed_s": time.time() - t_start,
        "examples": examples,
    }
def run_external_model_eval(
    eval_path: str | Path,
    out_dir: str | Path,
    model_ids: Iterable[str] = DEFAULT_EXTERNAL_MODELS,
    tinymind_checkpoint: str | Path | None = None,
    limit: int | None = 6,
) -> dict:
    """Run the oracle-preference smoke eval and write JSON and Markdown reports.

    Rows are loaded from ``eval_path`` with ``_load_jsonl``. The optional
    TinyMind checkpoint is scored first, then every id in ``model_ids``. CUDA
    is used when ``torch.cuda.is_available()`` reports it, otherwise the CPU.

    Args:
        eval_path: JSON Lines file with the evaluation rows.
        out_dir: Directory for the reports; created if missing.
        model_ids: Hugging Face model ids to evaluate.
        tinymind_checkpoint: Optional local checkpoint to evaluate as well.
        limit: Row limit forwarded to ``_load_jsonl``.

    Returns:
        The report dict that was serialized to
        ``external_hf_self_dialogue_eval.json``.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    rows = _load_jsonl(eval_path, limit=limit)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    results = []
    if tinymind_checkpoint:
        results.append(evaluate_tinymind_checkpoint(tinymind_checkpoint, rows, device))
    for model_id in model_ids:
        results.append(evaluate_hf_model(model_id, rows, device))

    comparison_note = (
        "Loss/perplexity is model-native and not directly comparable across tokenizers; "
        "oracle_preference_accuracy is only a smoke metric and must be paired with generation-quality checks."
    )
    report = {
        "schema_version": "tinymind-external-hf-self-dialogue-eval-v1",
        "eval_path": str(eval_path),
        "device": str(device),
        "samples": len(rows),
        "models": results,
        "comparison_note": comparison_note,
        "world_best_claim_allowed": False,
    }
    # The JSON report is written before the Markdown summary is built.
    json_text = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    (target_dir / "external_hf_self_dialogue_eval.json").write_text(json_text, encoding="utf-8")

    # One Markdown table line per evaluated model.
    table_lines = [
        f"| {entry['model_id']} | "
        f"{entry['oracle_preference_accuracy']:.3f} | "
        f"{entry['native_avg_oracle_loss']:.4f} | "
        f"{entry['native_avg_oracle_perplexity']:.2f} |"
        for entry in results
    ]
    md_lines = [
        "# TinyMind External HF Self-Dialogue Eval",
        "",
        f"- Samples: {len(rows)}",
        f"- Device: {device}",
        "",
        "| Model | Oracle preference | Native loss | Native ppl |",
        "|---|---:|---:|---:|",
        *table_lines,
        "",
        "World-best claim: blocked; this is a local external-model smoke eval, not an official leaderboard.",
        "Oracle preference can saturate and is not accepted without generation-quality evidence.",
    ]
    md_text = "\n".join(md_lines) + "\n"
    (target_dir / "external_hf_self_dialogue_eval.md").write_text(md_text, encoding="utf-8")
    return report

# Hugging Face file-view footer captured with the source (not part of the module):
# Xet Storage Details
#
# Size: 7.86 kB · Xet hash: 817069e0f3ded0f8c692c82bf4195b31245ceed7b34b51582c02f3cd75087ff5
#
# Xet efficiently stores files, intelligently splitting them into unique chunks
# and accelerating uploads and downloads. More info.