elevow committed on
Commit
3824ea2
·
verified ·
1 Parent(s): 1d58c43

Update update_data.py

Browse files
Files changed (1) hide show
  1. update_data.py +143 -7
update_data.py CHANGED
@@ -1,4 +1,4 @@
1
- # /// script
2
  # requires-python = ">=3.11"
3
  # dependencies = [
4
  # "httpx",
@@ -7,18 +7,30 @@
7
  # ///
8
  """
9
  Regenerate data.json and upload to the elevow/benchmarks Space.
 
10
  Source template: duplicated from davanstrien/benchmark-race
11
  https://huggingface.co/spaces/elevow/benchmarks
 
 
 
 
 
 
 
12
  Run locally (from repo root or this folder):
13
  export HF_TOKEN=hf_...
14
  uv run scripts/elevow-benchmarks/update_data.py
 
15
  Or copy this file to your Space repo root on Hugging Face and run there.
 
16
  Schedule on HF Jobs (example — point to YOUR raw file):
17
  hf jobs scheduled uv run "0 8,20 * * *" \\
18
  --secrets HF_TOKEN \\
19
  https://huggingface.co/spaces/elevow/benchmarks/resolve/main/update_data.py
20
  """
 
21
  from __future__ import annotations
 
22
  import json
23
  import os
24
  import re
@@ -27,16 +39,20 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
27
  from datetime import datetime, timezone
28
  from pathlib import Path
29
  from typing import Any
 
30
  import httpx
31
  from huggingface_hub import HfApi
 
32
  # Upload target: your fork (was davanstrien/benchmark-race in upstream).
33
  SPACE_REPO = os.environ.get("BENCHMARK_SPACE_REPO", "elevow/benchmarks")
 
34
  ALIGNED_LOGO_URL = (
35
  "https://www.google.com/s2/favicons?sz=128&domain_url="
36
  "https%3A%2F%2Ftryaligned.ai"
37
  )
38
  ALIGNED_LOGOS_KEY = "AlignedAI"
39
  ALIGNED_COLOR = "#059669"
 
40
  # Full HF model_id strings from leaderboard APIs — add any row that should show Aligned branding.
41
  MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset(
42
  {
@@ -44,6 +60,61 @@ MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset(
44
  # "Qwen/Qwen2.5-Coder-32B-Instruct",
45
  }
46
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  BENCHMARK_CONFIGS = [
48
  {"dataset": "SWE-bench/SWE-bench_Verified", "key": "sweVerified", "name": "SWE-bench Verified", "gated": False},
49
  {"dataset": "ScaleAI/SWE-bench_Pro", "key": "swePro", "name": "SWE-bench Pro", "gated": False},
@@ -56,25 +127,58 @@ BENCHMARK_CONFIGS = [
56
  {"dataset": "harborframework/terminal-bench-2.0", "key": "terminalBench", "name": "Terminal-Bench 2.0", "gated": False},
57
  {"dataset": "FutureMa/EvasionBench", "key": "evasionBench", "name": "EvasionBench", "gated": False},
58
  ]
 
59
  PALETTE = [
60
  "#6366f1", "#0d9488", "#d97706", "#e11d48", "#7c3aed",
61
  "#16a34a", "#2563eb", "#ea580c", "#8b5cf6", "#0891b2",
62
  "#c026d3", "#65a30d", "#dc2626", "#0284c7", "#a21caf",
63
  "#059669", "#9333ea", "#ca8a04", "#be185d", "#0369a1",
64
  ]
 
 
def inject_aligned_race_branding(
    benchmarks: dict[str, Any],
    logos: dict[str, str],
    color_map: dict[str, str],
) -> None:
    """Register the Aligned logo and bar color, then tag allow-listed models.

    All three arguments are mutated in place:

    * ``logos`` and ``color_map`` each gain an ``ALIGNED_LOGOS_KEY`` entry.
    * Every model row whose ``model_id`` appears in
      ``MODEL_IDS_USE_ALIGNED_LOGO`` gets ``race_logo_key`` set to
      ``ALIGNED_LOGOS_KEY``.

    Benchmarks with a missing or empty ``models`` list are skipped.
    """
    logos[ALIGNED_LOGOS_KEY] = ALIGNED_LOGO_URL
    color_map[ALIGNED_LOGOS_KEY] = ALIGNED_COLOR

    for benchmark in benchmarks.values():
        for model in benchmark.get("models") or []:
            model_id = model.get("model_id") or ""
            if model_id not in MODEL_IDS_USE_ALIGNED_LOGO:
                continue
            model["race_logo_key"] = ALIGNED_LOGOS_KEY
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
79
  url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
80
  headers = {}
@@ -83,6 +187,7 @@ def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
83
  elif config["gated"]:
84
  print(f" {config['name']}: skipped (gated, no token)")
85
  return []
 
86
  print(f" {config['name']}: fetching scores...")
87
  try:
88
  resp = httpx.get(url, headers=headers, timeout=30)
@@ -95,6 +200,7 @@ def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
95
  except Exception as e:
96
  print(f" error: {e}")
97
  return []
 
98
  seen: dict[str, float] = {}
99
  for entry in data:
100
  model_id = entry.get("modelId")
@@ -103,11 +209,15 @@ def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
103
  score = float(score)
104
  if model_id not in seen or score > seen[model_id]:
105
  seen[model_id] = score
 
106
  print(f" {len(seen)} models")
107
  return [{"model_id": mid, "score": s} for mid, s in seen.items()]
 
 
108
  def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, dict]:
109
  api = HfApi()
110
  results: dict[str, dict] = {}
 
111
  def _get_info(mid: str):
112
  try:
113
  info = api.model_info(mid, token=hf_token)
@@ -121,13 +231,17 @@ def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, d
121
  return mid, info.created_at.strftime("%Y-%m-%d"), params_b
122
  except Exception:
123
  return mid, None, None
 
124
  with ThreadPoolExecutor(max_workers=8) as pool:
125
  futures = {pool.submit(_get_info, mid): mid for mid in model_ids}
126
  for f in as_completed(futures):
127
  mid, date, params = f.result()
128
  if date:
129
  results[mid] = {"date": date, "parameters_b": params}
 
130
  return results
 
 
131
  def fetch_logo(provider: str) -> str | None:
132
  try:
133
  resp = httpx.get(
@@ -139,6 +253,8 @@ def fetch_logo(provider: str) -> str | None:
139
  except Exception:
140
  pass
141
  return None
 
 
142
  def fetch_all_logos(providers: set[str]) -> dict[str, str]:
143
  logos: dict[str, str] = {}
144
  with ThreadPoolExecutor(max_workers=8) as pool:
@@ -149,22 +265,29 @@ def fetch_all_logos(providers: set[str]) -> dict[str, str]:
149
  if url:
150
  logos[p] = url
151
  return logos
 
 
152
  def main() -> None:
153
  hf_token = os.environ.get("HF_TOKEN")
154
  print(f"Generating data.json → upload to {SPACE_REPO}\n")
 
155
  all_scores: dict[str, dict] = {}
156
  all_model_ids: set[str] = set()
 
157
  for config in BENCHMARK_CONFIGS:
158
  rows = fetch_leaderboard(config, hf_token)
159
  if rows:
160
  all_scores[config["key"]] = {"name": config["name"], "rows": rows}
161
  all_model_ids.update(r["model_id"] for r in rows)
 
162
  print(f"\n{len(all_model_ids)} unique models across {len(all_scores)} benchmarks")
163
  print("Fetching model dates...")
164
  model_dates = fetch_model_dates(list(all_model_ids), hf_token)
165
  print(f" got dates for {len(model_dates)}/{len(all_model_ids)} models")
 
166
  all_providers: set[str] = set()
167
  benchmarks: dict[str, Any] = {}
 
168
  for key, info in all_scores.items():
169
  models: list[dict] = []
170
  for row in info["rows"]:
@@ -183,29 +306,40 @@ def main() -> None:
183
  })
184
  if models:
185
  benchmarks[key] = {"name": info["name"], "models": models}
 
186
  print(f"\nFetching logos for {len(all_providers)} providers...")
187
  logos = fetch_all_logos(all_providers)
188
  print(f" got {len(logos)} logos")
 
189
  color_map: dict[str, str] = {}
190
  for i, provider in enumerate(sorted(all_providers)):
191
  color_map[provider] = PALETTE[i % len(PALETTE)]
192
- inject_aligned_race_branding(benchmarks, logos, color_map)
193
- print(f" injected {ALIGNED_LOGOS_KEY} logo + color; race_logo_key on {len(MODEL_IDS_USE_ALIGNED_LOGO)} id(s) configured")
 
 
 
 
 
 
194
  output = {
195
  "benchmarks": benchmarks,
196
  "logos": logos,
197
  "colors": color_map,
198
  "generated_at": datetime.now(timezone.utc).isoformat(),
199
  }
 
200
  data_json = json.dumps(output, indent=2)
201
  print(f"\nGenerated {len(data_json) / 1024:.1f} KB")
202
  for key, bm in benchmarks.items():
203
  print(f" {bm['name']}: {len(bm['models'])} models")
 
204
  print(f"\nUploading data.json to {SPACE_REPO}...")
205
  api = HfApi()
206
  with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
207
  f.write(data_json)
208
  tmp_path = f.name
 
209
  try:
210
  api.upload_file(
211
  path_or_fileobj=tmp_path,
@@ -217,5 +351,7 @@ def main() -> None:
217
  print("Done!")
218
  finally:
219
  Path(tmp_path).unlink(missing_ok=True)
 
 
220
  if __name__ == "__main__":
221
- main()
 
1
+ # /// script
2
  # requires-python = ">=3.11"
3
  # dependencies = [
4
  # "httpx",
 
7
  # ///
8
  """
9
  Regenerate data.json and upload to the elevow/benchmarks Space.
10
+
11
  Source template: duplicated from davanstrien/benchmark-race
12
  https://huggingface.co/spaces/elevow/benchmarks
13
+
14
+ **Single file:** All Aligned race branding, axis relabeling, optional org-groq tagging, and
15
+ offline ``patch_output_dict`` live here (no separate inject script).
16
+
17
+ Populate ``MODEL_IDS_ALIGNED_AXIS_LABEL`` with full HF ``model_id`` strings (as leaderboards
18
+ return them) to show **Aligned AI — {lane} · …** on race bar labels via rewritten ``short_name``.
19
+
20
  Run locally (from repo root or this folder):
21
  export HF_TOKEN=hf_...
22
  uv run scripts/elevow-benchmarks/update_data.py
23
+
24
  Or copy this file to your Space repo root on Hugging Face and run there.
25
+
26
  Schedule on HF Jobs (example — point to YOUR raw file):
27
  hf jobs scheduled uv run "0 8,20 * * *" \\
28
  --secrets HF_TOKEN \\
29
  https://huggingface.co/spaces/elevow/benchmarks/resolve/main/update_data.py
30
  """
31
+
32
  from __future__ import annotations
33
+
34
  import json
35
  import os
36
  import re
 
39
  from datetime import datetime, timezone
40
  from pathlib import Path
41
  from typing import Any
42
+
43
  import httpx
44
  from huggingface_hub import HfApi
45
+
46
  # Upload target: your fork (was davanstrien/benchmark-race in upstream).
47
  SPACE_REPO = os.environ.get("BENCHMARK_SPACE_REPO", "elevow/benchmarks")
48
+
49
  ALIGNED_LOGO_URL = (
50
  "https://www.google.com/s2/favicons?sz=128&domain_url="
51
  "https%3A%2F%2Ftryaligned.ai"
52
  )
53
  ALIGNED_LOGOS_KEY = "AlignedAI"
54
  ALIGNED_COLOR = "#059669"
55
+
56
  # Full HF model_id strings from leaderboard APIs — add any row that should show Aligned branding.
57
  MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset(
58
  {
 
60
  # "Qwen/Qwen2.5-Coder-32B-Instruct",
61
  }
62
  )
63
+
64
+ # HF benchmark-race charts label bars with `short_name`. For models you treat as Groq-hosted
65
+ # Aligned references, rewrite that field to "Aligned AI — {lane} · {checkpoint}" (same lanes as
66
+ # client GMCQ charts). Stock Space UI ignores `race_logo_key` unless you fork index.html; it
67
+ # always uses `short_name` for the bar text.
68
+ MODEL_IDS_ALIGNED_AXIS_LABEL: frozenset[str] = frozenset(
69
+ {
70
+ # Same strings as leaderboards return, e.g.:
71
+ # "meta-llama/Llama-3.3-70B-Instruct",
72
+ # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
73
+ }
74
+ )
75
+
76
+ # If True, tag every row whose HF org is literally "groq" with race_logo_key (rare on leaderboards).
77
+ USE_ALIGNED_FOR_ORG_GROQ = False
78
+
79
+ # Copy-paste example if you add a synthetic Aligned row by hand (ensure logos/colors cover provider).
80
+ SYNTHETIC_ALIGNED_ROW_EXAMPLE = r"""
81
+ # After building `models` for one benchmark, you may append:
82
+ # models.append({
83
+ # "model_id": "tryaligned/Aligned-AI",
84
+ # "short_name": "Aligned-AI",
85
+ # "provider": "tryaligned",
86
+ # "score": 0.0,
87
+ # "date": "2026-01-01",
88
+ # "race_logo_key": "AlignedAI",
89
+ # })
90
+ # Then ensure logos["AlignedAI"] is set and colors include "tryaligned".
91
+ """
92
+
93
+
def aligned_groq_lane_for_model_id(model_id: str) -> str:
    """Classify an HF ``model_id`` into an Aligned lane name.

    Matching is case-insensitive substring search, evaluated in priority
    order; the first rule whose needles are all present wins. Anything
    that matches no rule falls into the "Reasoning" lane.
    """
    lowered = model_id.lower()
    # (required substrings, lane) — order matters: earlier rules take priority.
    lane_rules = (
        (("scout",), "Vision"),
        (("coder",), "Code"),
        (("llama-3.1", "8b"), "Fast"),
    )
    for needles, lane in lane_rules:
        if all(needle in lowered for needle in needles):
            return lane
    return "Reasoning"
104
+
105
+
def aligned_axis_label_from_model_id(model_id: str) -> str:
    """Build the race-bar label ``Aligned AI — {lane} · {checkpoint}``.

    The checkpoint part is the last ``/``-separated segment of ``model_id``
    with ``-``, ``_`` and whitespace runs collapsed to single spaces. It is
    shortened to 18 characters plus an ellipsis when longer than 20, and the
    whole label is capped at 45 characters the same way.

    Args:
        model_id: Full HF model id, e.g. ``"org/Model-Name"``.

    Returns:
        The label to store in a model row's ``short_name``. When the
        checkpoint part is empty (empty id or trailing ``/``) the
        `` · `` separator is omitted.
    """
    slug = re.sub(r"[-_\s]+", " ", model_id.split("/")[-1]).strip()
    if len(slug) > 20:
        # rstrip so a cut landing on a word boundary does not render "17B …".
        slug = f"{slug[:18].rstrip()}…"
    lane = aligned_groq_lane_for_model_id(model_id)
    label = f"Aligned AI — {lane} · {slug}" if slug else f"Aligned AI — {lane}"
    if len(label) > 45:
        label = f"{label[:43].rstrip()}…"
    return label
117
+
118
  BENCHMARK_CONFIGS = [
119
  {"dataset": "SWE-bench/SWE-bench_Verified", "key": "sweVerified", "name": "SWE-bench Verified", "gated": False},
120
  {"dataset": "ScaleAI/SWE-bench_Pro", "key": "swePro", "name": "SWE-bench Pro", "gated": False},
 
127
  {"dataset": "harborframework/terminal-bench-2.0", "key": "terminalBench", "name": "Terminal-Bench 2.0", "gated": False},
128
  {"dataset": "FutureMa/EvasionBench", "key": "evasionBench", "name": "EvasionBench", "gated": False},
129
  ]
130
+
131
  PALETTE = [
132
  "#6366f1", "#0d9488", "#d97706", "#e11d48", "#7c3aed",
133
  "#16a34a", "#2563eb", "#ea580c", "#8b5cf6", "#0891b2",
134
  "#c026d3", "#65a30d", "#dc2626", "#0284c7", "#a21caf",
135
  "#059669", "#9333ea", "#ca8a04", "#be185d", "#0369a1",
136
  ]
137
+
138
+
def inject_aligned_race_branding(
    benchmarks: dict[str, Any],
    logos: dict[str, str],
    color_map: dict[str, str],
) -> tuple[int, int]:
    """Add the Aligned logo URL, bar color, per-model logo tags, and axis labels.

    All three arguments are mutated in place:

    * ``logos`` and ``color_map`` each gain an ``ALIGNED_LOGOS_KEY`` entry.
    * A model row gets ``race_logo_key = ALIGNED_LOGOS_KEY`` when its
      ``model_id`` is in ``MODEL_IDS_USE_ALIGNED_LOGO`` or
      ``MODEL_IDS_ALIGNED_AXIS_LABEL``, or when ``USE_ALIGNED_FOR_ORG_GROQ``
      is true and the id's org prefix is "groq" (case-insensitive).
    * Rows in ``MODEL_IDS_ALIGNED_AXIS_LABEL`` also have ``short_name``
      replaced by the Aligned axis label, with the previous name preserved
      in ``chart_full_name``.

    The function is safe to run again on already-branded data: a row whose
    ``short_name`` already equals the Aligned label keeps its existing
    ``chart_full_name`` instead of having it overwritten with that label.

    Returns:
        ``(logo_tag_count, axis_relabel_count)`` for logging. Both counts
        include rows that were already branded by an earlier run.
    """
    logos[ALIGNED_LOGOS_KEY] = ALIGNED_LOGO_URL
    color_map[ALIGNED_LOGOS_KEY] = ALIGNED_COLOR

    logo_n = 0
    axis_n = 0
    for bm in benchmarks.values():
        for m in bm.get("models") or []:
            mid = m.get("model_id") or ""
            # Org prefix; partition yields the whole id when there is no "/".
            provider = mid.partition("/")[0]
            use_logo = mid in MODEL_IDS_USE_ALIGNED_LOGO
            use_axis = mid in MODEL_IDS_ALIGNED_AXIS_LABEL
            use_groq_org = USE_ALIGNED_FOR_ORG_GROQ and provider.lower() == "groq"

            if use_logo or use_axis or use_groq_org:
                m["race_logo_key"] = ALIGNED_LOGOS_KEY
                logo_n += 1

            if use_axis:
                label = aligned_axis_label_from_model_id(mid)
                current = m.get("short_name")
                already_relabeled = current == label
                # When the row was relabelled by a previous run, the original
                # short name is gone; fall back to the model id's last segment.
                if current and not already_relabeled:
                    orig_sn = current
                else:
                    orig_sn = mid.split("/")[-1]
                if not already_relabeled or "chart_full_name" not in m:
                    m["chart_full_name"] = (
                        f"Published HF model: {orig_sn.replace('-', ' ')}"
                    )
                m["short_name"] = label
                axis_n += 1

    return logo_n, axis_n
170
+
171
+
def patch_output_dict(output: dict[str, Any]) -> dict[str, Any]:
    """Return an Aligned-branded copy of a loaded data.json dict.

    ``output`` itself is not modified: it is deep-copied through a JSON
    round-trip and the branding is applied to that copy.

    Missing or ``null`` ``logos`` / ``colors`` sections are replaced with
    empty dicts in the copy, and a missing or ``null`` ``benchmarks``
    section is treated as empty.

    Args:
        output: Parsed data.json content (must be JSON-serializable).

    Returns:
        The branded copy.
    """
    out = json.loads(json.dumps(output))
    benchmarks = out.get("benchmarks") or {}
    # setdefault alone would hand back None for an explicit JSON null.
    for section in ("logos", "colors"):
        if out.get(section) is None:
            out[section] = {}
    inject_aligned_race_branding(benchmarks, out["logos"], out["colors"])
    return out
180
+
181
+
182
  def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
183
  url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
184
  headers = {}
 
187
  elif config["gated"]:
188
  print(f" {config['name']}: skipped (gated, no token)")
189
  return []
190
+
191
  print(f" {config['name']}: fetching scores...")
192
  try:
193
  resp = httpx.get(url, headers=headers, timeout=30)
 
200
  except Exception as e:
201
  print(f" error: {e}")
202
  return []
203
+
204
  seen: dict[str, float] = {}
205
  for entry in data:
206
  model_id = entry.get("modelId")
 
209
  score = float(score)
210
  if model_id not in seen or score > seen[model_id]:
211
  seen[model_id] = score
212
+
213
  print(f" {len(seen)} models")
214
  return [{"model_id": mid, "score": s} for mid, s in seen.items()]
215
+
216
+
217
  def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, dict]:
218
  api = HfApi()
219
  results: dict[str, dict] = {}
220
+
221
  def _get_info(mid: str):
222
  try:
223
  info = api.model_info(mid, token=hf_token)
 
231
  return mid, info.created_at.strftime("%Y-%m-%d"), params_b
232
  except Exception:
233
  return mid, None, None
234
+
235
  with ThreadPoolExecutor(max_workers=8) as pool:
236
  futures = {pool.submit(_get_info, mid): mid for mid in model_ids}
237
  for f in as_completed(futures):
238
  mid, date, params = f.result()
239
  if date:
240
  results[mid] = {"date": date, "parameters_b": params}
241
+
242
  return results
243
+
244
+
245
  def fetch_logo(provider: str) -> str | None:
246
  try:
247
  resp = httpx.get(
 
253
  except Exception:
254
  pass
255
  return None
256
+
257
+
258
  def fetch_all_logos(providers: set[str]) -> dict[str, str]:
259
  logos: dict[str, str] = {}
260
  with ThreadPoolExecutor(max_workers=8) as pool:
 
265
  if url:
266
  logos[p] = url
267
  return logos
268
+
269
+
270
  def main() -> None:
271
  hf_token = os.environ.get("HF_TOKEN")
272
  print(f"Generating data.json → upload to {SPACE_REPO}\n")
273
+
274
  all_scores: dict[str, dict] = {}
275
  all_model_ids: set[str] = set()
276
+
277
  for config in BENCHMARK_CONFIGS:
278
  rows = fetch_leaderboard(config, hf_token)
279
  if rows:
280
  all_scores[config["key"]] = {"name": config["name"], "rows": rows}
281
  all_model_ids.update(r["model_id"] for r in rows)
282
+
283
  print(f"\n{len(all_model_ids)} unique models across {len(all_scores)} benchmarks")
284
  print("Fetching model dates...")
285
  model_dates = fetch_model_dates(list(all_model_ids), hf_token)
286
  print(f" got dates for {len(model_dates)}/{len(all_model_ids)} models")
287
+
288
  all_providers: set[str] = set()
289
  benchmarks: dict[str, Any] = {}
290
+
291
  for key, info in all_scores.items():
292
  models: list[dict] = []
293
  for row in info["rows"]:
 
306
  })
307
  if models:
308
  benchmarks[key] = {"name": info["name"], "models": models}
309
+
310
  print(f"\nFetching logos for {len(all_providers)} providers...")
311
  logos = fetch_all_logos(all_providers)
312
  print(f" got {len(logos)} logos")
313
+
314
  color_map: dict[str, str] = {}
315
  for i, provider in enumerate(sorted(all_providers)):
316
  color_map[provider] = PALETTE[i % len(PALETTE)]
317
+
318
+ tagged, relabeled = inject_aligned_race_branding(benchmarks, logos, color_map)
319
+ print(
320
+ f" injected {ALIGNED_LOGOS_KEY} logo + color; "
321
+ f"race_logo_key on {tagged} row(s); "
322
+ f"Aligned axis short_name on {relabeled} row(s)"
323
+ )
324
+
325
  output = {
326
  "benchmarks": benchmarks,
327
  "logos": logos,
328
  "colors": color_map,
329
  "generated_at": datetime.now(timezone.utc).isoformat(),
330
  }
331
+
332
  data_json = json.dumps(output, indent=2)
333
  print(f"\nGenerated {len(data_json) / 1024:.1f} KB")
334
  for key, bm in benchmarks.items():
335
  print(f" {bm['name']}: {len(bm['models'])} models")
336
+
337
  print(f"\nUploading data.json to {SPACE_REPO}...")
338
  api = HfApi()
339
  with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
340
  f.write(data_json)
341
  tmp_path = f.name
342
+
343
  try:
344
  api.upload_file(
345
  path_or_fileobj=tmp_path,
 
351
  print("Done!")
352
  finally:
353
  Path(tmp_path).unlink(missing_ok=True)
354
+
355
+
356
  if __name__ == "__main__":
357
+ main()