Spaces:
Running
Running
Commit ·
16cdd3e
1
Parent(s): 44de261
Update fetch script to upload parquet dataset to HuggingFace
Browse files
- Add datasets and pyarrow dependencies to UV script
- Flatten nested JSON structure for parquet compatibility
- Create columns for all benchmarks (even if null)
- Upload directly to OpenEvals/leaderboard-data dataset
- Remove local file read/write dependencies
- Add aggregate score and coverage metrics
- 82 models, 11 benchmarks, 23 columns total
- scripts/fetch_api_only.py +126 -60
scripts/fetch_api_only.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# /// script
|
| 3 |
-
# dependencies = ["requests", "huggingface-hub"]
|
| 4 |
# ///
|
| 5 |
|
| 6 |
import requests
|
|
@@ -198,83 +198,147 @@ def fetch_all_from_apis(hf_token=None):
|
|
| 198 |
|
| 199 |
print(f" β Found {len([e for e in data if e.get('modelId')])} models")
|
| 200 |
|
| 201 |
-
breakpoint()
|
| 202 |
# Calculate aggregate scores
|
| 203 |
models = list(models_dict.values())
|
| 204 |
-
for model in models:
|
| 205 |
-
benchmarks = model.get("benchmarks", {})
|
| 206 |
-
if benchmarks:
|
| 207 |
-
scores = [b["score"] for b in benchmarks.values()]
|
| 208 |
-
model["aggregateScore"] = round(sum(scores) / len(scores), 2)
|
| 209 |
-
model["coverageCount"] = len(benchmarks)
|
| 210 |
-
model["coveragePercent"] = round((len(benchmarks) / 12) * 100, 1)
|
| 211 |
-
|
| 212 |
-
# Sort by aggregate score
|
| 213 |
-
models.sort(key=lambda x: x["aggregateScore"], reverse=True)
|
| 214 |
|
| 215 |
return models
|
| 216 |
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
def main():
|
| 219 |
print("=" * 70)
|
| 220 |
-
print("Fetching
|
| 221 |
print("=" * 70)
|
| 222 |
print()
|
| 223 |
|
| 224 |
-
# Get HF token from environment
|
| 225 |
-
hf_token =
|
| 226 |
-
if len(sys.argv) > 1:
|
| 227 |
-
if sys.argv[1] == "--token" and len(sys.argv) > 2:
|
| 228 |
-
hf_token = sys.argv[2]
|
| 229 |
-
print("β Using token from command line")
|
| 230 |
|
| 231 |
if not hf_token:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
if hf_token:
|
| 237 |
-
print("π Token provided - will attempt to fetch gated datasets (GPQA, HLE)")
|
| 238 |
-
else:
|
| 239 |
-
print("β οΈ No token provided - gated datasets will be skipped")
|
| 240 |
-
print(" To access gated datasets, use: --token YOUR_HF_TOKEN")
|
| 241 |
-
print(" or set HF_TOKEN environment variable")
|
| 242 |
|
|
|
|
|
|
|
| 243 |
print()
|
| 244 |
|
|
|
|
| 245 |
models = fetch_all_from_apis(hf_token)
|
| 246 |
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
leaderboard_data = json.load(f)
|
| 251 |
-
except:
|
| 252 |
-
print("Error loading leaderboard.json")
|
| 253 |
-
return 1
|
| 254 |
-
|
| 255 |
-
# Replace models with API-only data
|
| 256 |
-
leaderboard_data["models"] = models
|
| 257 |
-
leaderboard_data["metadata"]["totalModels"] = len(models)
|
| 258 |
-
leaderboard_data["metadata"]["lastUpdated"] = datetime.now().isoformat() + "Z"
|
| 259 |
-
|
| 260 |
-
# Save
|
| 261 |
-
with open("data/leaderboard.json", "w") as f:
|
| 262 |
-
json.dump(leaderboard_data, indent=2, fp=f)
|
| 263 |
|
| 264 |
print()
|
| 265 |
print("=" * 70)
|
| 266 |
-
print(f"β
|
| 267 |
print("=" * 70)
|
| 268 |
|
| 269 |
-
#
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
"gsm8k",
|
| 272 |
"mmluPro",
|
| 273 |
"gpqa",
|
| 274 |
"hle",
|
| 275 |
"olmOcr",
|
| 276 |
"sweVerified",
|
| 277 |
-
"arguana",
|
| 278 |
"swePro",
|
| 279 |
"aime2026",
|
| 280 |
"terminalBench",
|
|
@@ -282,17 +346,19 @@ def main():
|
|
| 282 |
"hmmt2026",
|
| 283 |
]
|
| 284 |
|
| 285 |
-
print("\
|
| 286 |
-
for bench in
|
| 287 |
-
|
| 288 |
-
if
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
print("\n
|
|
|
|
|
|
|
| 296 |
|
| 297 |
|
| 298 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# /// script
|
| 3 |
+
# dependencies = ["requests", "huggingface-hub", "datasets", "pyarrow"]
|
| 4 |
# ///
|
| 5 |
|
| 6 |
import requests
|
|
|
|
| 198 |
|
| 199 |
print(f" β Found {len([e for e in data if e.get('modelId')])} models")
|
| 200 |
|
|
|
|
| 201 |
# Calculate aggregate scores
|
| 202 |
models = list(models_dict.values())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
return models
|
| 205 |
|
| 206 |
|
| 207 |
+
def flatten_model_for_parquet(model, all_benchmark_keys, total_benchmarks=11):
    """Flatten a nested model record into one flat row for parquet.

    Each benchmark in ``all_benchmark_keys`` becomes its own
    ``<key>_score`` column, set to ``None`` when the model has no value for
    it, so every row produced with the same key collection shares one schema.

    Args:
        model: Model dict. Must contain ``id``, ``name``, ``provider`` and
            ``type``. ``metadata`` and ``benchmarks`` are optional; each
            benchmark entry is a dict whose ``value`` key holds the score.
        all_benchmark_keys: Iterable of every benchmark key that should get
            a column. Columns are emitted in sorted key order.
        total_benchmarks: Denominator used for ``coverage_percent``.
            Defaults to 11, the benchmark count this script targets.

    Returns:
        A flat dict containing the identity/metadata columns, one
        ``<key>_score`` column per benchmark key, and the summary columns
        ``aggregate_score`` (mean of the non-null scores rounded to 2
        places, or ``None`` when there are none), ``coverage_count`` (number
        of non-null scores) and ``coverage_percent``.

    Raises:
        KeyError: If ``id``, ``name``, ``provider`` or ``type`` is missing.
    """
    # Tolerate a missing or null metadata block instead of crashing the run.
    metadata = model.get("metadata") or {}
    flat = {
        "model_id": model["id"],
        "model_name": model["name"],
        "provider": model["provider"],
        "model_type": model["type"],
        "parameters_billions": metadata.get("parametersInBillions"),
        "license": metadata.get("license", "Unknown"),
        "context_window": metadata.get("contextWindow", 0),
        "modality": metadata.get("modality", "text"),
        "architecture": metadata.get("architecture", "Transformer"),
    }

    # Emit a column for EVERY known benchmark (None when absent) so the
    # schema stays identical across rows.
    benchmarks = model.get("benchmarks") or {}
    for bench_key in sorted(all_benchmark_keys):
        entry = benchmarks.get(bench_key) or {}
        flat[f"{bench_key}_score"] = entry.get("value")

    # Only benchmarks that actually carry a score count toward the summary.
    # Counting null-valued entries would overstate coverage and disagree
    # with the aggregate, which already ignores them.
    scores = [
        entry["value"]
        for entry in benchmarks.values()
        if entry and entry.get("value") is not None
    ]

    if scores:
        flat["aggregate_score"] = round(sum(scores) / len(scores), 2)
    else:
        flat["aggregate_score"] = None
    flat["coverage_count"] = len(scores)
    if total_benchmarks:
        flat["coverage_percent"] = round(
            (len(scores) / total_benchmarks) * 100, 1
        )
    else:
        # Guard against a zero denominator supplied by the caller.
        flat["coverage_percent"] = 0.0

    return flat
|
| 258 |
+
|
| 259 |
+
|
| 260 |
def main():
|
| 261 |
print("=" * 70)
|
| 262 |
+
print("Fetching from Official APIs & Uploading to HF Dataset")
|
| 263 |
print("=" * 70)
|
| 264 |
print()
|
| 265 |
|
| 266 |
+
# Get HF token from environment (required for upload)
|
| 267 |
+
hf_token = os.environ.get("HF_TOKEN")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
if not hf_token:
|
| 270 |
+
print("β HF_TOKEN environment variable required")
|
| 271 |
+
print(" Export your token: export HF_TOKEN=your_token")
|
| 272 |
+
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
+
print("β HF_TOKEN found")
|
| 275 |
+
print("π Will fetch gated datasets (GPQA, HLE)")
|
| 276 |
print()
|
| 277 |
|
| 278 |
+
# Fetch models from APIs
|
| 279 |
models = fetch_all_from_apis(hf_token)
|
| 280 |
|
| 281 |
+
if not models:
|
| 282 |
+
print("β No models fetched - exiting")
|
| 283 |
+
sys.exit(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
print()
|
| 286 |
print("=" * 70)
|
| 287 |
+
print(f"β Fetched {len(models)} models from APIs")
|
| 288 |
print("=" * 70)
|
| 289 |
|
| 290 |
+
# Collect all benchmark keys to ensure consistent schema
|
| 291 |
+
all_benchmark_keys = set()
|
| 292 |
+
for m in models:
|
| 293 |
+
all_benchmark_keys.update(m.get("benchmarks", {}).keys())
|
| 294 |
+
|
| 295 |
+
print(
|
| 296 |
+
f"\nπ Found {len(all_benchmark_keys)} unique benchmarks: {sorted(all_benchmark_keys)}"
|
| 297 |
+
)
|
| 298 |
+
|
| 299 |
+
# Flatten data for parquet (pass all_benchmark_keys for consistent schema)
|
| 300 |
+
print("\nπ Flattening data for parquet format...")
|
| 301 |
+
flattened_models = [
|
| 302 |
+
flatten_model_for_parquet(m, all_benchmark_keys) for m in models
|
| 303 |
+
]
|
| 304 |
+
|
| 305 |
+
# Create HF Dataset
|
| 306 |
+
from datasets import Dataset
|
| 307 |
+
|
| 308 |
+
dataset = Dataset.from_list(flattened_models)
|
| 309 |
+
|
| 310 |
+
print(f" β Created dataset with {len(dataset)} rows")
|
| 311 |
+
print(f" β Schema: {len(dataset.column_names)} columns")
|
| 312 |
+
|
| 313 |
+
# Upload to HuggingFace
|
| 314 |
+
DATASET_REPO = "OpenEvals/leaderboard-data"
|
| 315 |
+
|
| 316 |
+
print(f"\nπ€ Uploading to {DATASET_REPO}...")
|
| 317 |
+
|
| 318 |
+
try:
|
| 319 |
+
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
|
| 320 |
+
|
| 321 |
+
dataset.push_to_hub(
|
| 322 |
+
DATASET_REPO,
|
| 323 |
+
token=hf_token,
|
| 324 |
+
commit_message=f"Automated update: {timestamp}",
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
print(f" β
Successfully uploaded!")
|
| 328 |
+
print(f" π View at: https://huggingface.co/datasets/{DATASET_REPO}")
|
| 329 |
+
|
| 330 |
+
except Exception as e:
|
| 331 |
+
print(f" β Upload failed: {e}")
|
| 332 |
+
sys.exit(1)
|
| 333 |
+
|
| 334 |
+
# Show summary
|
| 335 |
+
benchmark_keys = [
|
| 336 |
"gsm8k",
|
| 337 |
"mmluPro",
|
| 338 |
"gpqa",
|
| 339 |
"hle",
|
| 340 |
"olmOcr",
|
| 341 |
"sweVerified",
|
|
|
|
| 342 |
"swePro",
|
| 343 |
"aime2026",
|
| 344 |
"terminalBench",
|
|
|
|
| 346 |
"hmmt2026",
|
| 347 |
]
|
| 348 |
|
| 349 |
+
print("\nπ Benchmark Coverage:")
|
| 350 |
+
for bench in benchmark_keys:
|
| 351 |
+
col_name = f"{bench}_score"
|
| 352 |
+
if col_name in dataset.column_names:
|
| 353 |
+
# Count non-null values in the column
|
| 354 |
+
values = dataset[col_name]
|
| 355 |
+
count = sum(1 for v in values if v is not None)
|
| 356 |
+
if count > 0:
|
| 357 |
+
print(f" {bench:20s}: {count:2d} models")
|
| 358 |
+
|
| 359 |
+
print("\nβ
Data updated successfully!")
|
| 360 |
+
print(f" Total models: {len(models)}")
|
| 361 |
+
print(f" Timestamp: {timestamp}")
|
| 362 |
|
| 363 |
|
| 364 |
if __name__ == "__main__":
|