trojan0x committed on
Commit
91fdd42
·
verified ·
1 Parent(s): a8346e3

Update benchmark_ultron.py

Browse files
Files changed (1) hide show
  1. benchmark_ultron.py +248 -0
benchmark_ultron.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Ultron Benchmarking — Post-Training Evaluation
4
+
5
+ Downloads trained checkpoints from HF Hub and evaluates on standard benchmarks
6
+ using lm-evaluation-harness.
7
+
8
+ Benchmarks (0-shot, matching Parcae/FineWeb paper suite):
9
+ - HellaSwag
10
+ - ARC-Easy / ARC-Challenge
11
+ - PIQA
12
+ - WinoGrande
13
+ - BoolQ
14
+
15
+ Also tests depth extrapolation: same model evaluated at different loop counts.
16
+
17
+ Usage:
18
+ python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline
19
+ python benchmark_ultron.py --model_id trojan0x/ultron-small-moe
20
+ python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline --depth_extrapolation
21
+ """
22
+
23
+ import os
24
+ import sys
25
+ import json
26
+ import argparse
27
+ import types
28
+ import math
29
+
30
+ import torch
31
+ import torch.nn as nn
32
+ import torch.nn.functional as F
33
+ from dataclasses import asdict
34
+
35
+ from huggingface_hub import hf_hub_download, snapshot_download, HfApi
36
+ from transformers import AutoTokenizer
37
+
38
+ # Setup Ultron
39
def setup_ultron():
    """Fetch the Ultron sources from the Hub and make them importable.

    Downloads the files matching ``ultron/*.py`` from the ``trojan0x/ultron``
    repo snapshot, prepends the snapshot directory to ``sys.path`` and prints
    the directory it was loaded from.
    """
    from huggingface_hub import snapshot_download

    source_repo = "trojan0x/ultron"
    wanted_files = ["ultron/*.py"]

    repo_path = snapshot_download(source_repo, allow_patterns=wanted_files)
    # Front of sys.path so the downloaded `ultron` package is found first.
    sys.path.insert(0, repo_path)
    print(f"Ultron loaded from: {repo_path}")
44
+
45
+ setup_ultron()
46
+ from ultron.model import Ultron, UltronConfig
47
+
48
+
49
def load_model(model_id, device="cuda"):
    """Load a trained Ultron model from the Hugging Face Hub.

    Args:
        model_id: Hub repo id that contains the ``ultron_final.pt`` checkpoint.
        device: Device the model is moved to after its weights are loaded.

    Returns:
        A ``(model, cfg)`` tuple: the model in eval mode on ``device`` and the
        ``UltronConfig`` rebuilt from the checkpoint's ``config`` entry.

    Raises:
        KeyError: If the checkpoint lacks ``config`` or ``model_state_dict``.
    """
    print(f"Loading model from {model_id}...")

    # Download checkpoint
    ckpt_path = hf_hub_download(model_id, "ultron_final.pt")
    # SECURITY: weights_only=False uses the full pickle loader, which can
    # execute arbitrary code embedded in the file. Only point --model_id at
    # repos you trust. NOTE(review): consider weights_only=True if the
    # checkpoint holds only tensors and plain containers — confirm.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)

    # Reconstruct config
    cfg = UltronConfig(**ckpt["config"])

    # Build on CPU, load weights, then move to the target device.
    model = Ultron(cfg)
    model.load_state_dict(ckpt["model_state_dict"])
    model = model.to(device)
    model.eval()

    print(f" Params: {model.get_num_params(False):,}")
    # Training metadata is informational only; a checkpoint without it should
    # not abort the benchmark after the model has already been loaded.
    step = ckpt.get("step")
    tokens_seen = ckpt.get("tokens_seen")
    if step is not None and tokens_seen is not None:
        print(f" Trained for {step:,} steps, {tokens_seen:,} tokens")
    print(f" ρ(A): {model.get_spectral_radius():.6f}")

    return model, cfg
72
+
73
+
74
class UltronLMWrapper(nn.Module):
    """Wraps Ultron for lm-evaluation-harness compatibility.

    Exposes the attributes the harness reads from a model object
    (``config``, ``device``) and returns an object with a ``logits``
    attribute from ``forward``.

    Args:
        model: The Ultron model to wrap; it is called as
            ``model(input_ids, n_loops=...)``.
        cfg: Config providing ``max_loop_iters``, ``max_seq_len``,
            ``vocab_size`` and ``dim``.
        n_loops: Loop count passed to the model on every forward call.
            A falsy value (``None`` or ``0``) falls back to
            ``cfg.max_loop_iters``.
    """

    def __init__(self, model, cfg, n_loops=None):
        super().__init__()
        self.model = model
        self.n_loops = n_loops or cfg.max_loop_iters
        # Minimal stand-in for a Hugging Face config object.
        self.config = types.SimpleNamespace(
            max_position_embeddings=cfg.max_seq_len,
            vocab_size=cfg.vocab_size,
            model_type="ultron",
            hidden_size=cfg.dim,
        )
        self.device = next(model.parameters()).device

    def forward(self, input_ids, **kwargs):
        """Run the model at ``self.n_loops`` loops.

        Extra keyword arguments are accepted and ignored. The model's return
        value is passed through unchanged as ``.logits``.
        """
        logits = self.model(input_ids, n_loops=self.n_loops)
        # lm-eval expects output.logits
        return types.SimpleNamespace(logits=logits)

    def parameters(self, recurse=True):
        """Return the wrapped model's parameters.

        Keeps the ``recurse`` argument of ``nn.Module.parameters`` so callers
        using the standard signature keep working.
        """
        return self.model.parameters(recurse=recurse)

    def to(self, *args, **kwargs):
        """Move/cast the wrapped model and refresh ``self.device``."""
        self.model = self.model.to(*args, **kwargs)
        # Keep the advertised device in sync with where the weights now live.
        self.device = next(self.model.parameters()).device
        return self

    def tie_weights(self):
        """No-op hook.

        NOTE(review): some lm-evaluation-harness versions appear to call
        ``tie_weights()`` on a pre-built model passed to HFLM — confirm
        against the installed version. Nothing needs tying here.
        """
        return None
100
+
101
+
102
def evaluate(model_wrapper, tokenizer, tasks, limit=None, batch_size=8):
    """Evaluate a wrapped model on the given tasks with lm-evaluation-harness.

    Args:
        model_wrapper: Model object handed to ``HFLM`` as ``pretrained``; its
            ``config.max_position_embeddings`` sets the maximum length.
        tokenizer: Tokenizer handed to ``HFLM``.
        tasks: Task names forwarded to ``lm_eval.simple_evaluate``.
        limit: Optional per-task sample limit; only forwarded when not None.
        batch_size: Batch size handed to ``HFLM``.

    Returns:
        The ``"results"`` entry of the harness output.
    """
    import lm_eval
    from lm_eval.models.huggingface import HFLM

    context_len = model_wrapper.config.max_position_embeddings
    harness_lm = HFLM(
        pretrained=model_wrapper,
        tokenizer=tokenizer,
        max_length=context_len,
        batch_size=batch_size,
        backend="causal",
    )

    # Zero-shot run, without per-sample logging.
    eval_args = dict(
        model=harness_lm,
        tasks=tasks,
        num_fewshot=0,
        log_samples=False,
    )
    if limit is not None:
        eval_args["limit"] = limit

    report = lm_eval.simple_evaluate(**eval_args)
    return report["results"]
126
+
127
+
128
def print_results(results, label=""):
    """Print one row per task showing its preferred accuracy metric.

    Args:
        results: Mapping of task name to a mapping of metric name to score.
        label: Optional title printed in a banner above the table.

    For each task the first available of ``acc_norm,none`` and ``acc,none``
    is shown; tasks with neither metric produce no row.
    """
    if label:
        banner = "=" * 60
        print(f"\n{banner}")
        print(f" {label}")
        print(f"{banner}")

    print(f"\n{'Task':<20} {'Metric':<20} {'Score':>8}")
    print("-" * 50)

    preferred = ("acc_norm,none", "acc,none")
    for task_name, task_scores in results.items():
        # First preferred metric reported for this task, if any.
        chosen = next((name for name in preferred if name in task_scores), None)
        if chosen is None:
            continue
        score = task_scores[chosen]
        print(f"{task_name:<20} {chosen:<20} {score:>8.4f}")
    print()
145
+
146
+
147
def main():
    """CLI entry point: load a model, benchmark it, save and optionally upload.

    Without ``--depth_extrapolation`` the model is evaluated once at
    ``cfg.max_loop_iters`` loops. With it, the same model is evaluated at
    several loop counts and a summary table is printed. Results are written
    to ``benchmark_results.json`` and, with ``--upload_results``, uploaded to
    the model repo (upload failures are reported but do not abort).
    """
    parser = argparse.ArgumentParser(description="Ultron Benchmarking")
    parser.add_argument("--model_id", type=str, required=True,
                        help="HF Hub model ID (e.g., trojan0x/ultron-small-baseline)")
    parser.add_argument("--tasks", type=str, nargs="+",
                        default=["hellaswag", "arc_easy", "arc_challenge", "piqa", "winogrande", "boolq"])
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit eval samples per task (for quick testing)")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--depth_extrapolation", action="store_true",
                        help="Test at multiple loop counts")
    parser.add_argument("--upload_results", action="store_true",
                        help="Upload results to the model repo")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # The tokenizer is given a pad token by reusing EOS.
    tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model, cfg = load_model(args.model_id, device)

    # Metric preference shared by the summary table.
    metric_preference = ("acc_norm,none", "acc,none")

    if args.depth_extrapolation:
        # Test at multiple loop depths. De-duplicate (order preserved) so a
        # max_loop_iters equal to one of the fixed depths is not evaluated
        # twice and then overwritten in all_results.
        loop_counts = list(dict.fromkeys([1, 2, 4, cfg.max_loop_iters, 12, 16]))
        # The sweep subsamples to 200 examples per task unless --limit is set.
        sweep_limit = args.limit or 200
        all_results = {}

        for n_loops in loop_counts:
            print(f"\n--- Evaluating at {n_loops} loops ---")
            wrapper = UltronLMWrapper(model, cfg, n_loops=n_loops)
            results = evaluate(wrapper, tokenizer, args.tasks,
                               limit=sweep_limit, batch_size=args.batch_size)
            all_results[n_loops] = results
            print_results(results, f"n_loops = {n_loops}")

        # Summary table
        print("\n" + "=" * 80)
        print("DEPTH EXTRAPOLATION SUMMARY")
        print("=" * 80)
        print(f"{'n_loops':<10}", end="")
        for task in args.tasks:
            print(f"{task:<15}", end="")
        print()
        print("-" * (10 + 15 * len(args.tasks)))

        for n_loops, results in all_results.items():
            print(f"{n_loops:<10}", end="")
            for task in args.tasks:
                task_scores = results.get(task, {})
                # First available preferred metric, or N/A when the task or
                # both metrics are missing.
                cell = next(
                    (f"{task_scores[m]:<15.4f}" for m in metric_preference if m in task_scores),
                    f"{'N/A':<15}",
                )
                print(cell, end="")
            print()

        # Save results
        summary = {
            "model_id": args.model_id,
            "type": "depth_extrapolation",
            # Recorded so the saved numbers can be interpreted later.
            "limit": sweep_limit,
            "results": {str(k): v for k, v in all_results.items()},
        }

    else:
        # Standard evaluation
        wrapper = UltronLMWrapper(model, cfg)
        results = evaluate(wrapper, tokenizer, args.tasks,
                           limit=args.limit, batch_size=args.batch_size)
        print_results(results, f"Benchmark Results: {args.model_id}")

        summary = {
            "model_id": args.model_id,
            "type": "standard",
            "n_loops": cfg.max_loop_iters,
            "results": results,
        }

    # Save locally
    results_path = "benchmark_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(summary, f, indent=2, default=str)
    print(f"\nResults saved to {results_path}")

    # Upload to Hub
    if args.upload_results:
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=results_path,
                path_in_repo="benchmark_results.json",
                repo_id=args.model_id,
            )
            print(f"Results uploaded to {args.model_id}")
        except Exception as e:
            # Best-effort: the local results file is already written.
            print(f"Upload failed: {e}")
245
+
246
+
247
+ if __name__ == "__main__":
248
+ main()