ultron / benchmark_ultron.py
trojan0x's picture
Update benchmark_ultron.py
91fdd42 verified
#!/usr/bin/env python3
"""
Ultron Benchmarking — Post-Training Evaluation
Downloads trained checkpoints from HF Hub and evaluates on standard benchmarks
using lm-evaluation-harness.
Benchmarks (0-shot, matching Parcae/FineWeb paper suite):
- HellaSwag
- ARC-Easy / ARC-Challenge
- PIQA
- WinoGrande
- BoolQ
Also tests depth extrapolation: same model evaluated at different loop counts.
Usage:
python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline
python benchmark_ultron.py --model_id trojan0x/ultron-small-moe
python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline --depth_extrapolation
"""
import os
import sys
import json
import argparse
import types
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import asdict
from huggingface_hub import hf_hub_download, snapshot_download, HfApi
from transformers import AutoTokenizer
# Setup Ultron
def setup_ultron():
    """Download the Ultron sources from the Hub and put them on ``sys.path``.

    Only files matching ``ultron/*.py`` in the ``trojan0x/ultron`` repo are
    fetched. The local snapshot directory is inserted at the front of
    ``sys.path`` and its location is printed.
    """
    from huggingface_hub import snapshot_download as fetch_snapshot

    local_snapshot = fetch_snapshot(
        "trojan0x/ultron",
        allow_patterns=["ultron/*.py"],
    )
    # Index 0: this directory is searched before the existing sys.path entries.
    sys.path.insert(0, local_snapshot)
    print(f"Ultron loaded from: {local_snapshot}")
# Runs at import time: setup_ultron() downloads the sources and inserts the
# snapshot directory into sys.path, so it has to execute before the import below.
setup_ultron()
# Deliberately placed after the call above rather than with the other imports.
from ultron.model import Ultron, UltronConfig
def load_model(model_id, device="cuda"):
    """Fetch ``ultron_final.pt`` from *model_id* and rebuild the trained model.

    Args:
        model_id: HF Hub repo ID that holds the checkpoint file.
        device: Device the model is moved to once its weights are loaded.

    Returns:
        A ``(model, cfg)`` tuple: the ``Ultron`` instance switched to eval
        mode, and the ``UltronConfig`` rebuilt from the checkpoint.
    """
    print(f"Loading model from {model_id}...")

    # NOTE(review): weights_only=False makes torch.load unpickle arbitrary
    # objects, so this should only be pointed at checkpoints from trusted repos.
    checkpoint = torch.load(
        hf_hub_download(model_id, "ultron_final.pt"),
        map_location="cpu",
        weights_only=False,
    )

    # The checkpoint's "config" entry is expanded as UltronConfig keyword args.
    cfg = UltronConfig(**checkpoint["config"])

    # Weights are loaded on CPU first, then the whole model is moved.
    model = Ultron(cfg)
    model.load_state_dict(checkpoint["model_state_dict"])
    model = model.to(device)
    model.eval()

    print(f" Params: {model.get_num_params(False):,}")
    print(f" Trained for {checkpoint['step']:,} steps, {checkpoint['tokens_seen']:,} tokens")
    print(f" ρ(A): {model.get_spectral_radius():.6f}")
    return model, cfg
class UltronLMWrapper(nn.Module):
    """Adapter that lets an Ultron model be passed to lm-evaluation-harness.

    The wrapper fixes the number of loop iterations used for every forward
    pass and exposes a small HF-style surface: a ``config`` namespace, a
    ``device`` attribute, and a ``forward`` whose result has ``.logits``.

    Args:
        model: The Ultron model to evaluate. It is called as
            ``model(input_ids, n_loops=...)``.
        cfg: Config object providing ``max_loop_iters``, ``max_seq_len``,
            ``vocab_size`` and ``dim``.
        n_loops: Loop count used in ``forward``. A falsy value (``None`` or
            ``0``) falls back to ``cfg.max_loop_iters``.
    """

    def __init__(self, model, cfg, n_loops=None):
        super().__init__()
        self.model = model
        self.n_loops = n_loops or cfg.max_loop_iters
        # Minimal stand-in for a HF config; evaluate() reads
        # max_position_embeddings from it.
        self.config = types.SimpleNamespace(
            max_position_embeddings=cfg.max_seq_len,
            vocab_size=cfg.vocab_size,
            model_type="ultron",
            hidden_size=cfg.dim,
        )

    @property
    def device(self):
        """Device of the wrapped model's first parameter.

        Computed on access instead of cached in ``__init__`` so it stays
        correct after the model is moved with ``to()``.
        """
        return next(self.model.parameters()).device

    def forward(self, input_ids, **kwargs):
        """Run the wrapped model at ``self.n_loops`` and wrap the logits.

        Extra keyword arguments are accepted and ignored.
        """
        logits = self.model(input_ids, n_loops=self.n_loops)
        # lm-eval expects output.logits
        return types.SimpleNamespace(logits=logits)

    def parameters(self, recurse=True):
        """Return the wrapped model's parameters.

        Accepts ``recurse`` like ``nn.Module.parameters`` so callers using
        the standard signature do not fail with a ``TypeError``.
        """
        return self.model.parameters(recurse=recurse)

    def to(self, *args, **kwargs):
        """Move/cast the wrapped model and return this wrapper."""
        self.model = self.model.to(*args, **kwargs)
        return self

    def tie_weights(self):
        """No-op hook kept for HF-style callers.

        NOTE(review): lm-evaluation-harness's HFLM is believed to call
        ``tie_weights()`` on ``nn.Module`` models in some releases — confirm
        against the installed version. The wrapper has nothing to tie.
        """
        return None
def evaluate(model_wrapper, tokenizer, tasks, limit=None, batch_size=8):
    """Evaluate a wrapped model on *tasks* with lm-evaluation-harness, 0-shot.

    Args:
        model_wrapper: Model object handed to ``HFLM`` as ``pretrained``; its
            ``config.max_position_embeddings`` sets the harness ``max_length``.
        tokenizer: Tokenizer handed to ``HFLM``.
        tasks: Task names passed to ``lm_eval.simple_evaluate``.
        limit: Optional per-task sample limit; omitted from the call when
            ``None``.
        batch_size: Batch size handed to ``HFLM``.

    Returns:
        The ``"results"`` entry of the dict returned by ``simple_evaluate``.
    """
    import lm_eval
    from lm_eval.models.huggingface import HFLM

    harness_model = HFLM(
        pretrained=model_wrapper,
        tokenizer=tokenizer,
        max_length=model_wrapper.config.max_position_embeddings,
        batch_size=batch_size,
        backend="causal",
    )

    # Pass `limit` through only when the caller set one.
    optional_args = {} if limit is None else {"limit": limit}
    outcome = lm_eval.simple_evaluate(
        model=harness_model,
        tasks=tasks,
        num_fewshot=0,
        log_samples=False,
        **optional_args,
    )
    return outcome["results"]
def print_results(results, label=""):
    """Print a table with one row per task and its preferred accuracy metric.

    Args:
        results: Mapping of task name to that task's metric dict.
        label: Optional title; when non-empty it is printed inside a banner
            of ``=`` characters above the table.

    For each task ``acc_norm,none`` is shown if present, otherwise
    ``acc,none``. A task that reports neither still gets a row, marked
    ``N/A``, so it does not silently disappear from the table.
    """
    preferred_metrics = ("acc_norm,none", "acc,none")

    if label:
        banner = "=" * 60
        print(f"\n{banner}")
        print(f" {label}")
        print(banner)

    print(f"\n{'Task':<20} {'Metric':<20} {'Score':>8}")
    print("-" * 50)
    for task, scores in results.items():
        # First preferred metric the task actually reports, if any.
        metric = next((m for m in preferred_metrics if m in scores), None)
        if metric is None:
            print(f"{task:<20} {'-':<20} {'N/A':>8}")
        else:
            print(f"{task:<20} {metric:<20} {scores[metric]:>8.4f}")
    print()
def _depth_loop_counts(max_loop_iters):
    """Return the loop counts to sweep, ascending and without duplicates."""
    # A set keeps the trained depth from being evaluated twice when it
    # coincides with one of the fixed probe depths.
    return sorted({1, 2, 4, max_loop_iters, 12, 16})


def _print_depth_summary(all_results, tasks):
    """Print one row per loop count with each task's preferred accuracy metric."""
    print("\n" + "=" * 80)
    print("DEPTH EXTRAPOLATION SUMMARY")
    print("=" * 80)
    print(f"{'n_loops':<10}" + "".join(f"{task:<15}" for task in tasks))
    print("-" * (10 + 15 * len(tasks)))
    for n_loops, results in all_results.items():
        cells = []
        for task in tasks:
            scores = results.get(task, {})
            metric = next(
                (m for m in ("acc_norm,none", "acc,none") if m in scores), None
            )
            if metric is None:
                cells.append(f"{'N/A':<15}")
            else:
                cells.append(f"{scores[metric]:<15.4f}")
        print(f"{n_loops:<10}" + "".join(cells))


def main():
    """CLI entry point: load a checkpoint, run the benchmarks, save the results.

    Without ``--depth_extrapolation`` the model is evaluated once at
    ``cfg.max_loop_iters``. With it, the model is evaluated at several loop
    counts and a summary table is printed. Results are always written to
    ``benchmark_results.json`` and, with ``--upload_results``, uploaded to
    the model repo.
    """
    parser = argparse.ArgumentParser(description="Ultron Benchmarking")
    parser.add_argument("--model_id", type=str, required=True,
                        help="HF Hub model ID (e.g., trojan0x/ultron-small-baseline)")
    parser.add_argument("--tasks", type=str, nargs="+",
                        default=["hellaswag", "arc_easy", "arc_challenge", "piqa", "winogrande", "boolq"])
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit eval samples per task (for quick testing)")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--depth_extrapolation", action="store_true",
                        help="Test at multiple loop counts")
    parser.add_argument("--upload_results", action="store_true",
                        help="Upload results to the model repo")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Use the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    model, cfg = load_model(args.model_id, device)

    if args.depth_extrapolation:
        # The same weights are re-wrapped with a different loop count each time.
        all_results = {}
        for n_loops in _depth_loop_counts(cfg.max_loop_iters):
            print(f"\n--- Evaluating at {n_loops} loops ---")
            wrapper = UltronLMWrapper(model, cfg, n_loops=n_loops)
            # The sweep is capped at 200 samples per task unless --limit is given.
            results = evaluate(wrapper, tokenizer, args.tasks,
                               limit=args.limit or 200, batch_size=args.batch_size)
            all_results[n_loops] = results
            print_results(results, f"n_loops = {n_loops}")

        _print_depth_summary(all_results, args.tasks)

        summary = {
            "model_id": args.model_id,
            "type": "depth_extrapolation",
            # JSON object keys must be strings.
            "results": {str(k): v for k, v in all_results.items()},
        }
    else:
        wrapper = UltronLMWrapper(model, cfg)
        results = evaluate(wrapper, tokenizer, args.tasks,
                           limit=args.limit, batch_size=args.batch_size)
        print_results(results, f"Benchmark Results: {args.model_id}")
        summary = {
            "model_id": args.model_id,
            "type": "standard",
            "n_loops": cfg.max_loop_iters,
            "results": results,
        }

    # Save locally; default=str stringifies any value json cannot encode.
    results_path = "benchmark_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, default=str)
    print(f"\nResults saved to {results_path}")

    if args.upload_results:
        # Best effort: a failed upload is reported but does not abort the run,
        # since the results are already saved locally.
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=results_path,
                path_in_repo="benchmark_results.json",
                repo_id=args.model_id,
            )
            print(f"Results uploaded to {args.model_id}")
        except Exception as e:
            print(f"Upload failed: {e}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()