Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /external_model_eval.py
| """External Hugging Face model checks on TinyMind self-dialogue data.""" | |
| from __future__ import annotations | |
| import json | |
| import math | |
| import time | |
| from pathlib import Path | |
| from typing import Iterable | |
| import torch | |
| import torch.nn.functional as F | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from evaluation.local_evidence import _encode | |
| from evaluation.self_dialogue_evidence import SYSTEM_PROMPT | |
| from model.architecture import OmegaModel | |
# Model ids scored by run_external_model_eval when the caller does not pass
# its own ``model_ids``; each one is handed to ``from_pretrained``.
DEFAULT_EXTERNAL_MODELS = ("sshleifer/tiny-gpt2", "distilgpt2", "gpt2")
| def _load_jsonl(path: str | Path, limit: int | None = None) -> list[dict]: | |
| rows = [json.loads(line) for line in Path(path).read_text(encoding="utf-8").splitlines() if line.strip()] | |
| return rows[:limit] if limit else rows | |
def _dialogue_text(row: dict) -> str:
    """Render one evaluation row as a tagged transcript string.

    The result has three newline-separated segments: the module-level
    ``SYSTEM_PROMPT`` inside ``<system>`` tags, ``row["prompt"]`` inside
    ``<user>`` tags, and ``row["target"]`` after an ``<assistant>`` tag. The
    text opens with ``<bos>`` and closes with ``<eos>``.
    """
    segments = [
        f"<bos><system>{SYSTEM_PROMPT}</system>",
        f"<user>{row['prompt']}</user>",
        f"<assistant>{row['target']}<eos>",
    ]
    return "\n".join(segments)
| def _corrupt(row: dict) -> dict: | |
| corrupt = dict(row) | |
| oracle = str(row.get("oracle", "")) | |
| if oracle.isdigit(): | |
| corrupt["target"] = ( | |
| "<plan>Trust the first draft without checking.</plan>\n" | |
| "<act>Skip recomputation and keep the inconsistent value.</act>\n" | |
| "<verify>failed_check=true; oracle_mismatch=true</verify>\n" | |
| f"<final>{int(oracle) + 7}</final>" | |
| ) | |
| else: | |
| corrupt["target"] = row["target"] + "\n<verify>corrupted</verify>" | |
| return corrupt | |
| def _causal_lm_loss(model, tokenizer, text: str, device: torch.device) -> float: | |
| encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512) | |
| input_ids = encoded["input_ids"].to(device) | |
| if input_ids.shape[1] < 2: | |
| return float("inf") | |
| logits = model(input_ids).logits[:, :-1, :].contiguous() | |
| labels = input_ids[:, 1:].contiguous() | |
| loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), labels.view(-1)) | |
| return float(loss.item()) | |
def evaluate_hf_model(model_id: str, rows: list[dict], device: torch.device) -> dict:
    """Score ``rows`` with a causal LM loaded through ``from_pretrained``.

    For every row the oracle transcript and its corrupted counterpart are
    scored with ``_causal_lm_loss``; the row counts as a win when the oracle
    loss is less than or equal to the corrupt loss.

    Args:
        model_id: Identifier passed to ``AutoTokenizer`` and
            ``AutoModelForCausalLM``.
        rows: Evaluation records (each needs ``"prompt"`` and ``"target"``).
        device: Device the model is moved to and evaluated on.

    Returns:
        A summary dict with average oracle/corrupt losses, a perplexity
        computed from the average oracle loss clamped at 20, the win rate,
        wall-clock time, and the first three rows as examples.
    """
    t0 = time.time()

    tok = AutoTokenizer.from_pretrained(model_id)
    if tok.pad_token is None:
        # Fall back to the end-of-sequence token when no pad token is set.
        tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(model_id)
    lm.to(device)
    lm.eval()

    # One (row, oracle loss, corrupt loss, oracle preferred?) entry per row.
    scored: list[tuple[dict, float, float, bool]] = []
    for row in rows:
        good = _causal_lm_loss(lm, tok, _dialogue_text(row), device)
        bad = _causal_lm_loss(lm, tok, _dialogue_text(_corrupt(row)), device)
        scored.append((row, good, bad, good <= bad))

    examples = [
        {
            "prompt": row["prompt"],
            "oracle_loss": good,
            "corrupt_loss": bad,
            "preferred_oracle": pref,
        }
        for row, good, bad, pref in scored[:3]
    ]

    n_rows = len(rows)
    denom = max(n_rows, 1)  # avoids division by zero on an empty eval set
    wins = sum(int(pref) for _, _, _, pref in scored)
    avg_loss = sum(good for _, good, _, _ in scored) / denom
    avg_corrupt = sum(bad for _, _, bad, _ in scored) / denom

    summary = {
        "source": "huggingface_hub",
        "model_id": model_id,
        "samples": n_rows,
        "native_avg_oracle_loss": avg_loss,
        "native_avg_oracle_perplexity": float(math.exp(min(avg_loss, 20.0))),
        "native_avg_corrupt_loss": avg_corrupt,
        "oracle_preference_accuracy": wins / denom,
        "elapsed_s": time.time() - t0,
        "examples": examples,
    }
    return summary
def _tinymind_loss(model: OmegaModel, text: str, device: torch.device) -> float:
    """Return the loss ``model`` reports for ``text`` used as its own labels.

    Args:
        model: Model exposing ``cfg.max_seq_len`` and ``cfg.vocab_size`` and
            returning a mapping with a ``"loss"`` entry when called with
            ``labels``.
        text: Transcript to score.
        device: Device the encoded ids are moved to.

    Returns:
        The scalar loss as a Python float.
    """
    # ``_encode`` comes from evaluation.local_evidence; presumably it returns a
    # 1-D id tensor (a batch axis is added here) — verify against that module.
    ids = _encode(text, model.cfg.max_seq_len, model.cfg.vocab_size).unsqueeze(0).to(device)
    # Scoring only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        out = model(ids, labels=ids)
    return float(out["loss"].item())
def evaluate_tinymind_checkpoint(checkpoint_path: str | Path, rows: list[dict], device: torch.device) -> dict:
    """Score ``rows`` with an ``OmegaModel`` restored from a local checkpoint.

    The checkpoint must provide ``"model_cfg"`` and ``"model_state"``. For
    every row the oracle transcript and its corrupted counterpart are scored
    with ``_tinymind_loss``; the row counts as a win when the oracle loss is
    less than or equal to the corrupt loss.

    Args:
        checkpoint_path: File handed to ``torch.load``.
        rows: Evaluation records (each needs ``"prompt"`` and ``"target"``).
        device: Device used for loading and evaluation.

    Returns:
        A summary dict with the same metric keys as ``evaluate_hf_model``
        plus the ``"checkpoint"`` path.
    """
    t0 = time.time()

    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects — only point this at checkpoints from a trusted source.
    state = torch.load(checkpoint_path, map_location=device, weights_only=False)
    net = OmegaModel(state["model_cfg"]).to(device)
    net.load_state_dict(state["model_state"])
    net.eval()

    oracle_scores: list[float] = []
    corrupt_scores: list[float] = []
    verdicts: list[bool] = []
    for row in rows:
        good = _tinymind_loss(net, _dialogue_text(row), device)
        bad = _tinymind_loss(net, _dialogue_text(_corrupt(row)), device)
        oracle_scores.append(good)
        corrupt_scores.append(bad)
        verdicts.append(good <= bad)

    # Only the first three rows are kept as human-readable examples.
    examples = [
        {
            "prompt": row["prompt"],
            "oracle_loss": good,
            "corrupt_loss": bad,
            "preferred_oracle": verdict,
        }
        for row, good, bad, verdict in zip(rows[:3], oracle_scores, corrupt_scores, verdicts)
    ]

    wins = sum(int(verdict) for verdict in verdicts)
    avg_loss = sum(oracle_scores) / max(len(oracle_scores), 1)
    avg_corrupt = sum(corrupt_scores) / max(len(corrupt_scores), 1)

    summary = {
        "source": "local_tinymind_checkpoint",
        "model_id": "TinyMind-PureField-ReGenesis-OracleMax",
        "checkpoint": str(checkpoint_path),
        "samples": len(rows),
        "native_avg_oracle_loss": avg_loss,
        "native_avg_oracle_perplexity": float(math.exp(min(avg_loss, 20.0))),
        "native_avg_corrupt_loss": avg_corrupt,
        "oracle_preference_accuracy": wins / max(len(rows), 1),
        "elapsed_s": time.time() - t0,
        "examples": examples,
    }
    return summary
def run_external_model_eval(
    eval_path: str | Path,
    out_dir: str | Path,
    model_ids: Iterable[str] = DEFAULT_EXTERNAL_MODELS,
    tinymind_checkpoint: str | Path | None = None,
    limit: int | None = 6,
) -> dict:
    """Run the self-dialogue smoke eval and write JSON and Markdown reports.

    Args:
        eval_path: JSON Lines file with the evaluation rows.
        out_dir: Directory for the reports; created if missing.
        model_ids: Model ids evaluated via ``evaluate_hf_model``.
        tinymind_checkpoint: Optional local checkpoint; when given it is
            evaluated first via ``evaluate_tinymind_checkpoint``.
        limit: Row cap forwarded to ``_load_jsonl``.

    Returns:
        The report dict that was serialized to
        ``external_hf_self_dialogue_eval.json``.
    """
    report_dir = Path(out_dir)
    report_dir.mkdir(parents=True, exist_ok=True)

    rows = _load_jsonl(eval_path, limit=limit)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The local checkpoint (if any) goes first so it leads the report table.
    results = []
    if tinymind_checkpoint:
        results.append(evaluate_tinymind_checkpoint(tinymind_checkpoint, rows, device))
    for model_id in model_ids:
        results.append(evaluate_hf_model(model_id, rows, device))

    comparison_note = (
        "Loss/perplexity is model-native and not directly comparable across tokenizers; "
        "oracle_preference_accuracy is only a smoke metric and must be paired with generation-quality checks."
    )
    report = {
        "schema_version": "tinymind-external-hf-self-dialogue-eval-v1",
        "eval_path": str(eval_path),
        "device": str(device),
        "samples": len(rows),
        "models": results,
        "comparison_note": comparison_note,
        "world_best_claim_allowed": False,
    }
    json_text = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    (report_dir / "external_hf_self_dialogue_eval.json").write_text(json_text, encoding="utf-8")

    # Markdown companion: header, one table row per model, then caveats.
    header = [
        "# TinyMind External HF Self-Dialogue Eval",
        "",
        f"- Samples: {len(rows)}",
        f"- Device: {device}",
        "",
        "| Model | Oracle preference | Native loss | Native ppl |",
        "|---|---:|---:|---:|",
    ]
    table = [
        f"| {entry['model_id']} | "
        f"{entry['oracle_preference_accuracy']:.3f} | "
        f"{entry['native_avg_oracle_loss']:.4f} | "
        f"{entry['native_avg_oracle_perplexity']:.2f} |"
        for entry in results
    ]
    footer = [
        "",
        "World-best claim: blocked; this is a local external-model smoke eval, not an official leaderboard.",
        "Oracle preference can saturate and is not accepted without generation-quality evidence.",
    ]
    markdown = "\n".join(header + table + footer) + "\n"
    (report_dir / "external_hf_self_dialogue_eval.md").write_text(markdown, encoding="utf-8")

    return report
Xet Storage Details
- Size: 7.86 kB
- Xet hash: 817069e0f3ded0f8c692c82bf4195b31245ceed7b34b51582c02f3cd75087ff5
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.