Linker1907 committed on
Commit
16cdd3e
·
1 Parent(s): 44de261

Update fetch script to upload parquet dataset to HuggingFace

Browse files

- Add datasets and pyarrow dependencies to UV script
- Flatten nested JSON structure for parquet compatibility
- Create columns for all benchmarks (even if null)
- Upload directly to OpenEvals/leaderboard-data dataset
- Remove local file read/write dependencies
- Add aggregate score and coverage metrics
- 82 models, 11 benchmarks, 23 columns total

Files changed (1) hide show
  1. scripts/fetch_api_only.py +126 -60
scripts/fetch_api_only.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  # /// script
3
- # dependencies = ["requests", "huggingface-hub"]
4
  # ///
5
 
6
  import requests
@@ -198,83 +198,147 @@ def fetch_all_from_apis(hf_token=None):
198
 
199
  print(f" βœ“ Found {len([e for e in data if e.get('modelId')])} models")
200
 
201
- breakpoint()
202
  # Calculate aggregate scores
203
  models = list(models_dict.values())
204
- for model in models:
205
- benchmarks = model.get("benchmarks", {})
206
- if benchmarks:
207
- scores = [b["score"] for b in benchmarks.values()]
208
- model["aggregateScore"] = round(sum(scores) / len(scores), 2)
209
- model["coverageCount"] = len(benchmarks)
210
- model["coveragePercent"] = round((len(benchmarks) / 12) * 100, 1)
211
-
212
- # Sort by aggregate score
213
- models.sort(key=lambda x: x["aggregateScore"], reverse=True)
214
 
215
  return models
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def main():
219
  print("=" * 70)
220
- print("Fetching ONLY from Official APIs (No Manual Data)")
221
  print("=" * 70)
222
  print()
223
 
224
- # Get HF token from environment or command line
225
- hf_token = None
226
- if len(sys.argv) > 1:
227
- if sys.argv[1] == "--token" and len(sys.argv) > 2:
228
- hf_token = sys.argv[2]
229
- print("βœ“ Using token from command line")
230
 
231
  if not hf_token:
232
- hf_token = os.environ.get("HF_TOKEN")
233
- if hf_token:
234
- print("βœ“ Using token from HF_TOKEN environment variable")
235
-
236
- if hf_token:
237
- print("πŸ”“ Token provided - will attempt to fetch gated datasets (GPQA, HLE)")
238
- else:
239
- print("⚠️ No token provided - gated datasets will be skipped")
240
- print(" To access gated datasets, use: --token YOUR_HF_TOKEN")
241
- print(" or set HF_TOKEN environment variable")
242
 
 
 
243
  print()
244
 
 
245
  models = fetch_all_from_apis(hf_token)
246
 
247
- # Load benchmark definitions
248
- try:
249
- with open("data/leaderboard.json", "r") as f:
250
- leaderboard_data = json.load(f)
251
- except:
252
- print("Error loading leaderboard.json")
253
- return 1
254
-
255
- # Replace models with API-only data
256
- leaderboard_data["models"] = models
257
- leaderboard_data["metadata"]["totalModels"] = len(models)
258
- leaderboard_data["metadata"]["lastUpdated"] = datetime.now().isoformat() + "Z"
259
-
260
- # Save
261
- with open("data/leaderboard.json", "w") as f:
262
- json.dump(leaderboard_data, indent=2, fp=f)
263
 
264
  print()
265
  print("=" * 70)
266
- print(f"βœ“ Loaded {len(models)} models from APIs only")
267
  print("=" * 70)
268
 
269
- # Show coverage
270
- benchmarks = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  "gsm8k",
272
  "mmluPro",
273
  "gpqa",
274
  "hle",
275
  "olmOcr",
276
  "sweVerified",
277
- "arguana",
278
  "swePro",
279
  "aime2026",
280
  "terminalBench",
@@ -282,17 +346,19 @@ def main():
282
  "hmmt2026",
283
  ]
284
 
285
- print("\nBenchmark Coverage:")
286
- for bench in benchmarks:
287
- count = sum(1 for m in models if bench in m.get("benchmarks", {}))
288
- if count > 0:
289
- print(f" {bench:20s}: {count:2d} models")
290
-
291
- print("\nTop 10 Models:")
292
- for i, m in enumerate(models[:10], 1):
293
- print(f" {i:2d}. {m['name']:<40s} {m['aggregateScore']:>5.1f}")
294
-
295
- print("\nβœ“ Data updated - 100% from APIs!")
 
 
296
 
297
 
298
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  # /// script
3
+ # dependencies = ["requests", "huggingface-hub", "datasets", "pyarrow"]
4
  # ///
5
 
6
  import requests
 
198
 
199
  print(f" βœ“ Found {len([e for e in data if e.get('modelId')])} models")
200
 
 
201
  # Calculate aggregate scores
202
  models = list(models_dict.values())
 
 
 
 
 
 
 
 
 
 
203
 
204
  return models
205
 
206
 
207
def flatten_model_for_parquet(model, all_benchmark_keys):
    """Flatten a nested model record into a flat dict suitable for parquet.

    Each benchmark score becomes its own ``<key>_score`` column; benchmarks
    the model lacks are filled with ``None`` so every row shares one schema.

    Args:
        model: Model dict with nested "metadata" and "benchmarks" sections.
        all_benchmark_keys: Collection of every benchmark key seen across the
            dataset. Used both as the column set and as the coverage-percent
            denominator (replaces the previous hard-coded ``11``, which would
            silently drift whenever the benchmark set changed).

    Returns:
        Flat dict with identity/metadata columns, one ``<key>_score`` column
        per benchmark key, and ``aggregate_score`` / ``coverage_count`` /
        ``coverage_percent`` summary columns.
    """
    metadata = model["metadata"]
    flat = {
        "model_id": model["id"],
        "model_name": model["name"],
        "provider": model["provider"],
        "model_type": model["type"],
        "parameters_billions": metadata.get("parametersInBillions"),
        "license": metadata.get("license", "Unknown"),
        "context_window": metadata.get("contextWindow", 0),
        "modality": metadata.get("modality", "text"),
        "architecture": metadata.get("architecture", "Transformer"),
    }

    benchmarks = model.get("benchmarks", {})

    # Emit a column for EVERY known benchmark (None when missing) so all
    # rows share an identical schema regardless of per-model coverage.
    for bench_key in sorted(all_benchmark_keys):
        bench_data = benchmarks.get(bench_key)
        flat[f"{bench_key}_score"] = (
            bench_data.get("value") if bench_data is not None else None
        )

    # Aggregate metrics: average only the non-null scores.
    scores = [
        b.get("value") for b in benchmarks.values() if b.get("value") is not None
    ]
    if scores:
        # Guard against a degenerate empty key set; normally
        # len(all_benchmark_keys) >= len(benchmarks) by construction.
        total = max(len(all_benchmark_keys), 1)
        flat["aggregate_score"] = round(sum(scores) / len(scores), 2)
        flat["coverage_count"] = len(benchmarks)
        flat["coverage_percent"] = round((len(benchmarks) / total) * 100, 1)
    else:
        # No usable scores at all (no benchmarks, or every value was None).
        flat["aggregate_score"] = None
        flat["coverage_count"] = 0
        flat["coverage_percent"] = 0.0

    return flat
258
+
259
+
260
def main():
    """Fetch model scores from official APIs and upload them as a parquet
    dataset to the HuggingFace Hub (OpenEvals/leaderboard-data).

    Requires the HF_TOKEN environment variable, used both for gated-dataset
    access during fetching (GPQA, HLE) and for the dataset upload. Exits
    with status 1 on missing token, empty fetch, or upload failure.
    """
    # Local import so the file-level import block stays untouched.
    from datetime import timezone

    print("=" * 70)
    print("Fetching from Official APIs & Uploading to HF Dataset")
    print("=" * 70)
    print()

    # Get HF token from environment (required for upload)
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("❌ HF_TOKEN environment variable required")
        print("   Export your token: export HF_TOKEN=your_token")
        sys.exit(1)

    print("βœ“ HF_TOKEN found")
    print("πŸ”“ Will fetch gated datasets (GPQA, HLE)")
    print()

    # Fetch models from APIs
    models = fetch_all_from_apis(hf_token)

    if not models:
        # Exit non-zero: an empty fetch is a failure. (Previously exited 0
        # here, which let CI/cron schedulers treat this path as success.)
        print("❌ No models fetched - exiting")
        sys.exit(1)

    print()
    print("=" * 70)
    print(f"βœ“ Fetched {len(models)} models from APIs")
    print("=" * 70)

    # Collect all benchmark keys to ensure consistent schema
    all_benchmark_keys = set()
    for m in models:
        all_benchmark_keys.update(m.get("benchmarks", {}).keys())

    print(
        f"\nπŸ” Found {len(all_benchmark_keys)} unique benchmarks: {sorted(all_benchmark_keys)}"
    )

    # Flatten data for parquet (pass all_benchmark_keys for consistent schema)
    print("\nπŸ“Š Flattening data for parquet format...")
    flattened_models = [
        flatten_model_for_parquet(m, all_benchmark_keys) for m in models
    ]

    # Deferred import: "datasets" is heavy and only needed from here on.
    from datasets import Dataset

    dataset = Dataset.from_list(flattened_models)

    print(f"   βœ“ Created dataset with {len(dataset)} rows")
    print(f"   βœ“ Schema: {len(dataset.column_names)} columns")

    # Upload to HuggingFace
    DATASET_REPO = "OpenEvals/leaderboard-data"
    print(f"\nπŸ“€ Uploading to {DATASET_REPO}...")

    # Timezone-aware stamp: the label says "UTC", so actually use UTC.
    # (The old naive datetime.now() stamped local time with a UTC suffix.)
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    try:
        dataset.push_to_hub(
            DATASET_REPO,
            token=hf_token,
            commit_message=f"Automated update: {timestamp}",
        )
        print("   βœ… Successfully uploaded!")  # was an f-string with no placeholder
        print(f"   πŸ”— View at: https://huggingface.co/datasets/{DATASET_REPO}")
    except Exception as e:
        print(f"   ❌ Upload failed: {e}")
        sys.exit(1)

    # Coverage summary driven by the benchmarks actually present, instead
    # of a hard-coded list that could drift out of sync with the data.
    print("\nπŸ“Š Benchmark Coverage:")
    for bench in sorted(all_benchmark_keys):
        col_name = f"{bench}_score"
        if col_name in dataset.column_names:
            # Count non-null values in the column
            count = sum(1 for v in dataset[col_name] if v is not None)
            if count > 0:
                print(f"   {bench:20s}: {count:2d} models")

    print("\nβœ… Data updated successfully!")
    print(f"   Total models: {len(models)}")
    print(f"   Timestamp: {timestamp}")
362
 
363
 
364
  if __name__ == "__main__":