#!/usr/bin/env python3
"""
Ultron Benchmarking — Post-Training Evaluation

Downloads trained checkpoints from HF Hub and evaluates on standard benchmarks
using lm-evaluation-harness.

Benchmarks (0-shot, matching Parcae/FineWeb paper suite):
  - HellaSwag
  - ARC-Easy / ARC-Challenge
  - PIQA
  - WinoGrande
  - BoolQ

Also tests depth extrapolation: same model evaluated at different loop counts.

Usage:
    python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline
    python benchmark_ultron.py --model_id trojan0x/ultron-small-moe
    python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline --depth_extrapolation
"""

import os
import sys
import json
import argparse
import types
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import asdict
from huggingface_hub import hf_hub_download, snapshot_download, HfApi
from transformers import AutoTokenizer

# Metrics reported per task, in order of preference (first one present wins).
METRIC_PREFERENCE = ("acc_norm,none", "acc,none")

# Fixed loop counts probed by --depth_extrapolation, in addition to the
# model's own cfg.max_loop_iters.
BASE_LOOP_COUNTS = (1, 2, 4, 12, 16)

# Per-task sample cap used by --depth_extrapolation when --limit is not given.
DEPTH_EXTRAPOLATION_LIMIT = 200

# Local file name (and path inside the Hub repo) for the saved summary.
RESULTS_FILENAME = "benchmark_results.json"


# Setup Ultron
def setup_ultron():
    """Download the Ultron sources from the Hub and put them on ``sys.path``.

    Only the ``ultron/*.py`` files of the ``trojan0x/ultron`` repo are
    fetched. Must run before ``ultron.model`` is imported.
    """
    repo_path = snapshot_download("trojan0x/ultron", allow_patterns=["ultron/*.py"])
    sys.path.insert(0, repo_path)
    print(f"Ultron loaded from: {repo_path}")


setup_ultron()

from ultron.model import Ultron, UltronConfig  # noqa: E402  (needs setup_ultron() first)


def load_model(model_id, device="cuda"):
    """Load trained Ultron model from HF Hub.

    Args:
        model_id: Hub repo id that contains ``ultron_final.pt``.
        device: Device (string or ``torch.device``) the model is moved to.

    Returns:
        A ``(model, cfg)`` tuple: the model in eval mode on ``device`` and the
        ``UltronConfig`` rebuilt from the checkpoint's ``"config"`` entry.

    Raises:
        KeyError: If the checkpoint lacks one of the keys read here
            (``config``, ``model_state_dict``, ``step``, ``tokens_seen``).
    """
    print(f"Loading model from {model_id}...")

    # Download checkpoint
    ckpt_path = hf_hub_download(model_id, "ultron_final.pt")
    # SECURITY NOTE(review): weights_only=False unpickles arbitrary Python
    # objects, so a malicious checkpoint can execute code. Only point
    # --model_id at repos you trust.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)

    # Reconstruct config
    cfg = UltronConfig(**ckpt["config"])

    # Build and load model
    model = Ultron(cfg)
    model.load_state_dict(ckpt["model_state_dict"])
    model = model.to(device)
    model.eval()

    print(f"  Params: {model.get_num_params(False):,}")
    print(f"  Trained for {ckpt['step']:,} steps, {ckpt['tokens_seen']:,} tokens")
    print(f"  ρ(A): {model.get_spectral_radius():.6f}")

    return model, cfg


class UltronLMWrapper(nn.Module):
    """Wraps Ultron for lm-evaluation-harness compatibility.

    Exposes the attributes this script relies on when handing a pre-built
    model to ``HFLM``: a ``config`` namespace, a ``device`` attribute and a
    ``forward`` whose result has a ``.logits`` field.
    """

    def __init__(self, model, cfg, n_loops=None):
        """
        Args:
            model: The Ultron model to wrap.
            cfg: Its config; ``max_seq_len``, ``vocab_size``, ``dim`` and
                ``max_loop_iters`` are read.
            n_loops: Loop count used for every forward pass. A falsy value
                (``None`` or ``0``) falls back to ``cfg.max_loop_iters``.
        """
        super().__init__()
        self.model = model
        self.n_loops = n_loops or cfg.max_loop_iters
        self.config = types.SimpleNamespace(
            max_position_embeddings=cfg.max_seq_len,
            vocab_size=cfg.vocab_size,
            model_type="ultron",
            hidden_size=cfg.dim,
        )
        self.device = next(model.parameters()).device

    def forward(self, input_ids, **kwargs):
        """Run the model at ``self.n_loops``; extra keyword args are ignored."""
        logits = self.model(input_ids, n_loops=self.n_loops)
        # lm-eval expects output.logits
        return types.SimpleNamespace(logits=logits)

    def parameters(self, recurse=True):
        """Return the wrapped model's parameters."""
        return self.model.parameters(recurse=recurse)

    def to(self, *args, **kwargs):
        """Move the wrapped model and keep ``self.device`` in sync."""
        self.model = self.model.to(*args, **kwargs)
        self.device = next(self.model.parameters()).device
        return self

    def tie_weights(self):
        """No-op kept for Hugging Face API compatibility.

        NOTE(review): lm-eval's HFLM appears to call ``tie_weights()`` on a
        pre-initialized ``nn.Module``; a plain module has no such method.
        Confirm against the installed lm-eval version.
        """
        return None


def evaluate(model_wrapper, tokenizer, tasks, limit=None, batch_size=8):
    """Run lm-evaluation-harness benchmarks.

    Args:
        model_wrapper: Model object handed to ``HFLM`` as ``pretrained``; its
            ``config.max_position_embeddings`` sets ``max_length``.
        tokenizer: Tokenizer handed to ``HFLM``.
        tasks: Task names to evaluate, 0-shot.
        limit: Optional per-task sample limit; omitted from the call if None.
        batch_size: Evaluation batch size.

    Returns:
        The ``"results"`` entry of ``lm_eval.simple_evaluate``'s output.
    """
    # Imported lazily so the rest of the module works without lm-eval.
    import lm_eval
    from lm_eval.models.huggingface import HFLM

    lm = HFLM(
        pretrained=model_wrapper,
        tokenizer=tokenizer,
        max_length=model_wrapper.config.max_position_embeddings,
        batch_size=batch_size,
        backend="causal",
    )

    kwargs = {
        "model": lm,
        "tasks": tasks,
        "num_fewshot": 0,
        "log_samples": False,
    }
    if limit is not None:
        kwargs["limit"] = limit

    results = lm_eval.simple_evaluate(**kwargs)
    return results["results"]


def _primary_metric(scores):
    """Return ``(metric_name, value)`` for the preferred metric, or None."""
    for metric in METRIC_PREFERENCE:
        if metric in scores:
            return metric, scores[metric]
    return None


def _depth_loop_counts(max_loop_iters):
    """Return the sorted, de-duplicated loop counts for depth extrapolation.

    De-duplication matters when ``max_loop_iters`` equals one of the fixed
    counts: otherwise that depth would be evaluated twice and its first
    result overwritten.
    """
    return sorted({*BASE_LOOP_COUNTS, max_loop_iters})


def print_results(results, label=""):
    """Pretty-print benchmark results.

    Args:
        results: Mapping of task name to that task's metric dict.
        label: Optional banner printed above the table.
    """
    if label:
        print(f"\n{'='*60}")
        print(f"  {label}")
        print(f"{'='*60}")

    print(f"\n{'Task':<20} {'Metric':<20} {'Score':>8}")
    print("-" * 50)

    for task, scores in results.items():
        # Pick best metric; tasks exposing neither metric are skipped.
        picked = _primary_metric(scores)
        if picked is not None:
            metric, val = picked
            print(f"{task:<20} {metric:<20} {val:>8.4f}")
    print()


def _print_depth_summary(all_results, tasks):
    """Print one row per loop count and one column per task."""
    print("\n" + "="*80)
    print("DEPTH EXTRAPOLATION SUMMARY")
    print("="*80)
    print(f"{'n_loops':<10}", end="")
    for task in tasks:
        print(f"{task:<15}", end="")
    print()
    print("-" * (10 + 15 * len(tasks)))

    for n_loops, results in all_results.items():
        print(f"{n_loops:<10}", end="")
        for task in tasks:
            picked = _primary_metric(results[task]) if task in results else None
            if picked is None:
                print(f"{'N/A':<15}", end="")
            else:
                print(f"{picked[1]:<15.4f}", end="")
        print()


def main():
    """Parse CLI arguments, evaluate the model, save and optionally upload."""
    parser = argparse.ArgumentParser(description="Ultron Benchmarking")
    parser.add_argument("--model_id", type=str, required=True,
                        help="HF Hub model ID (e.g., trojan0x/ultron-small-baseline)")
    parser.add_argument("--tasks", type=str, nargs="+",
                        default=["hellaswag", "arc_easy", "arc_challenge",
                                 "piqa", "winogrande", "boolq"])
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit eval samples per task (for quick testing)")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--depth_extrapolation", action="store_true",
                        help="Test at multiple loop counts")
    parser.add_argument("--upload_results", action="store_true",
                        help="Upload results to the model repo")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # The GPT-2 tokenizer has no pad token; reuse EOS so batching can pad.
    tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model, cfg = load_model(args.model_id, device)

    if args.depth_extrapolation:
        # Test at multiple loop depths
        all_results = {}
        for n_loops in _depth_loop_counts(cfg.max_loop_iters):
            print(f"\n--- Evaluating at {n_loops} loops ---")
            wrapper = UltronLMWrapper(model, cfg, n_loops=n_loops)
            # Several depths are swept, so samples are capped by default.
            results = evaluate(wrapper, tokenizer, args.tasks,
                               limit=args.limit or DEPTH_EXTRAPOLATION_LIMIT,
                               batch_size=args.batch_size)
            all_results[n_loops] = results
            print_results(results, f"n_loops = {n_loops}")

        # Summary table
        _print_depth_summary(all_results, args.tasks)

        # Save results
        summary = {
            "model_id": args.model_id,
            "type": "depth_extrapolation",
            "results": {str(k): v for k, v in all_results.items()},
        }
    else:
        # Standard evaluation
        wrapper = UltronLMWrapper(model, cfg)
        results = evaluate(wrapper, tokenizer, args.tasks,
                           limit=args.limit, batch_size=args.batch_size)
        print_results(results, f"Benchmark Results: {args.model_id}")

        summary = {
            "model_id": args.model_id,
            "type": "standard",
            "n_loops": cfg.max_loop_iters,
            "results": results,
        }

    # Save locally
    results_path = RESULTS_FILENAME
    with open(results_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(summary, f, indent=2, default=str)
    print(f"\nResults saved to {results_path}")

    # Upload to Hub
    if args.upload_results:
        # Best-effort: results are already on disk, so a failed upload is
        # reported but does not abort the run.
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=results_path,
                path_in_repo=RESULTS_FILENAME,
                repo_id=args.model_id,
            )
            print(f"Results uploaded to {args.model_id}")
        except Exception as e:
            print(f"Upload failed: {e}")


if __name__ == "__main__":
    main()