trojan0x
/

ultron

Model card Files Files and versions

xet

Community

trojan0x committed on Apr 21

Commit

91fdd42

verified ·

1 Parent(s): a8346e3

Update benchmark_ultron.py

Browse files

Files changed (1) hide show

benchmark_ultron.py +248 -0

benchmark_ultron.py ADDED Viewed

	@@ -0,0 +1,248 @@

+#!/usr/bin/env python3
+"""
+Ultron Benchmarking — Post-Training Evaluation
+Downloads trained checkpoints from HF Hub and evaluates on standard benchmarks
+using lm-evaluation-harness.
+Benchmarks (0-shot, matching Parcae/FineWeb paper suite):
+  - HellaSwag
+  - ARC-Easy / ARC-Challenge
+  - PIQA
+  - WinoGrande
+  - BoolQ
+Also tests depth extrapolation: same model evaluated at different loop counts.
+Usage:
+  python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline
+  python benchmark_ultron.py --model_id trojan0x/ultron-small-moe
+  python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline --depth_extrapolation
+"""
+import os
+import sys
+import json
+import argparse
+import types
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from dataclasses import asdict
+from huggingface_hub import hf_hub_download, snapshot_download, HfApi
+from transformers import AutoTokenizer
+# Setup Ultron
+def setup_ultron():
+    from huggingface_hub import snapshot_download
+    repo_path = snapshot_download("trojan0x/ultron", allow_patterns=["ultron/*.py"])
+    sys.path.insert(0, repo_path)
+    print(f"Ultron loaded from: {repo_path}")
+# Order matters: setup_ultron() prepends the downloaded snapshot directory to
+# sys.path, which is what allows the `ultron` import on the next line.
+setup_ultron()
+from ultron.model import Ultron, UltronConfig
+def load_model(model_id, device="cuda"):
+    """Load trained Ultron model from HF Hub."""
+    print(f"Loading model from {model_id}...")
+    # Download checkpoint
+    ckpt_path = hf_hub_download(model_id, "ultron_final.pt")
+    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
+    # Reconstruct config
+    cfg_dict = ckpt["config"]
+    cfg = UltronConfig(**cfg_dict)
+    # Build and load model
+    model = Ultron(cfg)
+    model.load_state_dict(ckpt["model_state_dict"])
+    model = model.to(device)
+    model.eval()
+    print(f"  Params: {model.get_num_params(False):,}")
+    print(f"  Trained for {ckpt['step']:,} steps, {ckpt['tokens_seen']:,} tokens")
+    print(f"  ρ(A): {model.get_spectral_radius():.6f}")
+    return model, cfg
+class UltronLMWrapper(nn.Module):
+    """Wraps Ultron for lm-evaluation-harness compatibility."""
+    def __init__(self, model, cfg, n_loops=None):
+        super().__init__()
+        self.model = model
+        self.n_loops = n_loops or cfg.max_loop_iters
+        self.config = types.SimpleNamespace(
+            max_position_embeddings=cfg.max_seq_len,
+            vocab_size=cfg.vocab_size,
+            model_type="ultron",
+            hidden_size=cfg.dim,
+        )
+        self.device = next(model.parameters()).device
+    def forward(self, input_ids, **kwargs):
+        logits = self.model(input_ids, n_loops=self.n_loops)
+        # lm-eval expects output.logits
+        return types.SimpleNamespace(logits=logits)
+    def parameters(self):
+        return self.model.parameters()
+    def to(self, *args, **kwargs):
+        self.model = self.model.to(*args, **kwargs)
+        return self
+def evaluate(model_wrapper, tokenizer, tasks, limit=None, batch_size=8):
+    """Run lm-evaluation-harness benchmarks."""
+    import lm_eval
+    from lm_eval.models.huggingface import HFLM
+    lm = HFLM(
+        pretrained=model_wrapper,
+        tokenizer=tokenizer,
+        max_length=model_wrapper.config.max_position_embeddings,
+        batch_size=batch_size,
+        backend="causal",
+    )
+    kwargs = {
+        "model": lm,
+        "tasks": tasks,
+        "num_fewshot": 0,
+        "log_samples": False,
+    }
+    if limit is not None:
+        kwargs["limit"] = limit
+    results = lm_eval.simple_evaluate(**kwargs)
+    return results["results"]
+def print_results(results, label=""):
+    """Pretty-print benchmark results."""
+    if label:
+        print(f"\n{'='*60}")
+        print(f"  {label}")
+        print(f"{'='*60}")
+    print(f"\n{'Task':<20} {'Metric':<20} {'Score':>8}")
+    print("-" * 50)
+    for task, scores in results.items():
+        # Pick best metric
+        for metric in ["acc_norm,none", "acc,none"]:
+            if metric in scores:
+                val = scores[metric]
+                print(f"{task:<20} {metric:<20} {val:>8.4f}")
+                break
+    print()
+def main():
+    parser = argparse.ArgumentParser(description="Ultron Benchmarking")
+    parser.add_argument("--model_id", type=str, required=True,
+                        help="HF Hub model ID (e.g., trojan0x/ultron-small-baseline)")
+    parser.add_argument("--tasks", type=str, nargs="+",
+                        default=["hellaswag", "arc_easy", "arc_challenge", "piqa", "winogrande", "boolq"])
+    parser.add_argument("--limit", type=int, default=None,
+                        help="Limit eval samples per task (for quick testing)")
+    parser.add_argument("--batch_size", type=int, default=8)
+    parser.add_argument("--depth_extrapolation", action="store_true",
+                        help="Test at multiple loop counts")
+    parser.add_argument("--upload_results", action="store_true",
+                        help="Upload results to the model repo")
+    args = parser.parse_args()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    tokenizer.pad_token = tokenizer.eos_token
+    # Load model
+    model, cfg = load_model(args.model_id, device)
+    if args.depth_extrapolation:
+        # Test at multiple loop depths
+        loop_counts = [1, 2, 4, cfg.max_loop_iters, 12, 16]
+        all_results = {}
+        for n_loops in loop_counts:
+            print(f"\n--- Evaluating at {n_loops} loops ---")
+            wrapper = UltronLMWrapper(model, cfg, n_loops=n_loops)
+            results = evaluate(wrapper, tokenizer, args.tasks,
+                             limit=args.limit or 200, batch_size=args.batch_size)
+            all_results[n_loops] = results
+            print_results(results, f"n_loops = {n_loops}")
+        # Summary table
+        print("\n" + "="*80)
+        print("DEPTH EXTRAPOLATION SUMMARY")
+        print("="*80)
+        print(f"{'n_loops':<10}", end="")
+        for task in args.tasks:
+            print(f"{task:<15}", end="")
+        print()
+        print("-" * (10 + 15 * len(args.tasks)))
+        for n_loops, results in all_results.items():
+            print(f"{n_loops:<10}", end="")
+            for task in args.tasks:
+                if task in results:
+                    for m in ["acc_norm,none", "acc,none"]:
+                        if m in results[task]:
+                            print(f"{results[task][m]:<15.4f}", end="")
+                            break
+                    else:
+                        print(f"{'N/A':<15}", end="")
+                else:
+                    print(f"{'N/A':<15}", end="")
+            print()
+        # Save results
+        summary = {
+            "model_id": args.model_id,
+            "type": "depth_extrapolation",
+            "results": {str(k): v for k, v in all_results.items()},
+        }
+    else:
+        # Standard evaluation
+        wrapper = UltronLMWrapper(model, cfg)
+        results = evaluate(wrapper, tokenizer, args.tasks,
+                         limit=args.limit, batch_size=args.batch_size)
+        print_results(results, f"Benchmark Results: {args.model_id}")
+        summary = {
+            "model_id": args.model_id,
+            "type": "standard",
+            "n_loops": cfg.max_loop_iters,
+            "results": results,
+        }
+    # Save locally
+    results_path = "benchmark_results.json"
+    with open(results_path, "w") as f:
+        json.dump(summary, f, indent=2, default=str)
+    print(f"\nResults saved to {results_path}")
+    # Upload to Hub
+    if args.upload_results:
+        try:
+            api = HfApi()
+            api.upload_file(
+                path_or_fileobj=results_path,
+                path_in_repo="benchmark_results.json",
+                repo_id=args.model_id,
+            )
+            print(f"Results uploaded to {args.model_id}")
+        except Exception as e:
+            print(f"Upload failed: {e}")
+# Run the benchmark CLI only when this file is executed as a script.
+if __name__ == "__main__":
+    main()