| |
| """ |
| Ultron Benchmarking — Post-Training Evaluation |
| |
| Downloads trained checkpoints from HF Hub and evaluates on standard benchmarks |
| using lm-evaluation-harness. |
| |
| Benchmarks (0-shot, matching Parcae/FineWeb paper suite): |
| - HellaSwag |
| - ARC-Easy / ARC-Challenge |
| - PIQA |
| - WinoGrande |
| - BoolQ |
| |
| Also tests depth extrapolation: same model evaluated at different loop counts. |
| |
| Usage: |
| python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline |
| python benchmark_ultron.py --model_id trojan0x/ultron-small-moe |
| python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline --depth_extrapolation |
| """ |
|
|
| import os |
| import sys |
| import json |
| import argparse |
| import types |
| import math |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from dataclasses import asdict |
|
|
| from huggingface_hub import hf_hub_download, snapshot_download, HfApi |
| from transformers import AutoTokenizer |
|
|
| |
def setup_ultron():
    """Fetch the Ultron sources from the HF Hub and make them importable.

    Downloads only the ``ultron/*.py`` files of the ``trojan0x/ultron``
    repository, puts the returned path at the front of ``sys.path`` and
    prints where the sources were loaded from.
    """
    # Imported locally so the function carries its own dependency.
    from huggingface_hub import snapshot_download

    snapshot_dir = snapshot_download(
        "trojan0x/ultron",
        allow_patterns=["ultron/*.py"],
    )
    # Front of sys.path so this copy wins over any other `ultron` package.
    sys.path.insert(0, snapshot_dir)
    print(f"Ultron loaded from: {snapshot_dir}")
|
|
# Order matters: setup_ultron() puts the downloaded sources on sys.path,
# and only after that can the `ultron` package below be imported.
setup_ultron()
from ultron.model import Ultron, UltronConfig
|
|
|
|
def load_model(model_id, device="cuda"):
    """Load a trained Ultron model from the HF Hub.

    Args:
        model_id: Hub repository that contains ``ultron_final.pt``.
        device: Device (string or ``torch.device``) the model is moved to.

    Returns:
        A ``(model, cfg)`` tuple: the model in eval mode on ``device`` and
        the ``UltronConfig`` rebuilt from the checkpoint's ``config`` entry.

    Raises:
        KeyError: If the checkpoint has no ``config`` or
            ``model_state_dict`` entry.
    """
    print(f"Loading model from {model_id}...")

    ckpt_path = hf_hub_download(model_id, "ultron_final.pt")
    # SECURITY NOTE: weights_only=False uses the full pickle loader, which can
    # execute arbitrary code embedded in the file. Only point this script at
    # checkpoints from repositories you trust.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)

    # Rebuild the architecture from the config stored next to the weights.
    cfg = UltronConfig(**ckpt["config"])

    model = Ultron(cfg)
    model.load_state_dict(ckpt["model_state_dict"])
    model = model.to(device)
    model.eval()

    def _count(value):
        # Training metadata is informational only; a checkpoint without it
        # must still load, so a missing counter is reported, not raised.
        return f"{value:,}" if isinstance(value, (int, float)) else "unknown"

    print(f" Params: {model.get_num_params(False):,}")
    print(
        f" Trained for {_count(ckpt.get('step'))} steps, "
        f"{_count(ckpt.get('tokens_seen'))} tokens"
    )
    print(f" ρ(A): {model.get_spectral_radius():.6f}")

    return model, cfg
|
|
|
|
class UltronLMWrapper(nn.Module):
    """Wraps Ultron for lm-evaluation-harness compatibility.

    Exposes the attributes this script relies on when handing the model to
    the harness: a ``config`` namespace, a ``device`` attribute, and a
    ``forward`` whose return value carries ``.logits``.

    Args:
        model: The Ultron model to wrap; it is called as
            ``model(input_ids, n_loops=...)``.
        cfg: Config providing ``max_seq_len``, ``vocab_size``, ``dim`` and
            ``max_loop_iters``.
        n_loops: Loop count passed to the model on every forward call. A
            falsy value (``None`` or ``0``) selects ``cfg.max_loop_iters``.
    """

    def __init__(self, model, cfg, n_loops=None):
        super().__init__()
        self.model = model
        self.n_loops = n_loops or cfg.max_loop_iters
        self.config = types.SimpleNamespace(
            max_position_embeddings=cfg.max_seq_len,
            vocab_size=cfg.vocab_size,
            model_type="ultron",
            hidden_size=cfg.dim,
        )
        self.device = next(model.parameters()).device

    def forward(self, input_ids, **kwargs):
        """Run the wrapped model and return an object with ``.logits``.

        Extra keyword arguments are accepted and ignored; only
        ``input_ids`` and the configured loop count reach the model.
        """
        logits = self.model(input_ids, n_loops=self.n_loops)
        return types.SimpleNamespace(logits=logits)

    def parameters(self, recurse=True):
        """Return the wrapped model's parameters.

        Accepts ``recurse`` like ``nn.Module.parameters`` so callers that
        pass it do not fail with ``TypeError``.
        """
        return self.model.parameters(recurse=recurse)

    def to(self, *args, **kwargs):
        """Move or cast the wrapped model and return ``self``."""
        self.model = self.model.to(*args, **kwargs)
        # Keep the cached device in sync with where the weights now live.
        self.device = next(self.model.parameters()).device
        return self
|
|
|
|
def evaluate(model_wrapper, tokenizer, tasks, limit=None, batch_size=8):
    """Run lm-evaluation-harness benchmarks in the 0-shot setting.

    Args:
        model_wrapper: Wrapped model; its
            ``config.max_position_embeddings`` is used as the max length.
        tokenizer: Tokenizer handed to the harness.
        tasks: Task names to evaluate.
        limit: Optional per-task sample cap; omitted from the harness call
            when ``None``.
        batch_size: Evaluation batch size.

    Returns:
        The ``"results"`` entry of the harness output.
    """
    import lm_eval
    from lm_eval.models.huggingface import HFLM

    harness_model = HFLM(
        pretrained=model_wrapper,
        tokenizer=tokenizer,
        max_length=model_wrapper.config.max_position_embeddings,
        batch_size=batch_size,
        backend="causal",
    )

    # Only pass `limit` through when the caller actually set one.
    optional_args = {} if limit is None else {"limit": limit}
    outcome = lm_eval.simple_evaluate(
        model=harness_model,
        tasks=tasks,
        num_fewshot=0,
        log_samples=False,
        **optional_args,
    )
    return outcome["results"]
|
|
|
|
def print_results(results, label=""):
    """Pretty-print benchmark results as a small table.

    Args:
        results: Mapping of task name to a mapping of metric name to score.
        label: Optional heading printed between two rules above the table.

    For each task the first available metric of ``acc_norm,none`` and
    ``acc,none`` is shown; tasks with neither are left out.
    """
    if label:
        rule = "=" * 60
        print(f"\n{rule}")
        print(f" {label}")
        print(rule)

    print(f"\n{'Task':<20} {'Metric':<20} {'Score':>8}")
    print("-" * 50)

    preferred_metrics = ("acc_norm,none", "acc,none")
    for task, scores in results.items():
        chosen = next(
            (name for name in preferred_metrics if name in scores),
            None,
        )
        if chosen is None:
            continue
        print(f"{task:<20} {chosen:<20} {scores[chosen]:>8.4f}")
    print()
|
|
|
|
def main():
    """Command-line entry point: evaluate one Ultron checkpoint.

    Parses the arguments, loads the model, then runs either a single
    benchmark pass at ``cfg.max_loop_iters`` or, with
    ``--depth_extrapolation``, one pass per loop count. The summary is
    written to ``benchmark_results.json`` and, with ``--upload_results``,
    uploaded to the model repository on a best-effort basis.
    """
    parser = argparse.ArgumentParser(description="Ultron Benchmarking")
    parser.add_argument("--model_id", type=str, required=True,
                        help="HF Hub model ID (e.g., trojan0x/ultron-small-baseline)")
    parser.add_argument("--tasks", type=str, nargs="+",
                        default=["hellaswag", "arc_easy", "arc_challenge",
                                 "piqa", "winogrande", "boolq"])
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit eval samples per task (for quick testing)")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--depth_extrapolation", action="store_true",
                        help="Test at multiple loop counts")
    parser.add_argument("--upload_results", action="store_true",
                        help="Upload results to the model repo")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # The GPT-2 tokenizer has no pad token; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token

    model, cfg = load_model(args.model_id, device)

    if args.depth_extrapolation:
        # Deduplicate and sort: when cfg.max_loop_iters equals one of the
        # fixed counts the same evaluation would otherwise run twice, and a
        # value above 12 would appear out of order in the summary.
        loop_counts = sorted({1, 2, 4, cfg.max_loop_iters, 12, 16})
        all_results = {}

        for n_loops in loop_counts:
            print(f"\n--- Evaluating at {n_loops} loops ---")
            wrapper = UltronLMWrapper(model, cfg, n_loops=n_loops)
            # Without --limit the sweep uses 200 samples per task.
            results = evaluate(wrapper, tokenizer, args.tasks,
                               limit=args.limit or 200, batch_size=args.batch_size)
            all_results[n_loops] = results
            print_results(results, f"n_loops = {n_loops}")

        print("\n" + "=" * 80)
        print("DEPTH EXTRAPOLATION SUMMARY")
        print("=" * 80)
        print(f"{'n_loops':<10}", end="")
        for task in args.tasks:
            print(f"{task:<15}", end="")
        print()
        print("-" * (10 + 15 * len(args.tasks)))

        for n_loops, results in all_results.items():
            print(f"{n_loops:<10}", end="")
            for task in args.tasks:
                scores = results.get(task, {})
                # Same metric preference as the per-run table.
                metric = next(
                    (m for m in ("acc_norm,none", "acc,none") if m in scores),
                    None,
                )
                cell = "N/A" if metric is None else f"{scores[metric]:.4f}"
                print(f"{cell:<15}", end="")
            print()

        summary = {
            "model_id": args.model_id,
            "type": "depth_extrapolation",
            # JSON object keys must be strings.
            "results": {str(k): v for k, v in all_results.items()},
        }
    else:
        wrapper = UltronLMWrapper(model, cfg)
        results = evaluate(wrapper, tokenizer, args.tasks,
                           limit=args.limit, batch_size=args.batch_size)
        print_results(results, f"Benchmark Results: {args.model_id}")

        summary = {
            "model_id": args.model_id,
            "type": "standard",
            "n_loops": cfg.max_loop_iters,
            "results": results,
        }

    results_path = "benchmark_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(summary, f, indent=2, default=str)
    print(f"\nResults saved to {results_path}")

    if args.upload_results:
        # Best-effort: a failed upload must not discard the local results.
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=results_path,
                path_in_repo="benchmark_results.json",
                repo_id=args.model_id,
            )
            print(f"Results uploaded to {args.model_id}")
        except Exception as e:
            print(f"Upload failed: {e}")
|
|
|
|
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|