every-leaderboards / scripts /fetch_api_only.py
Linker1907's picture
Update fetch script to upload parquet dataset to HuggingFace
16cdd3e
#!/usr/bin/env python3
# /// script
# dependencies = ["requests", "huggingface-hub", "datasets", "pyarrow"]
# ///
import requests
import json
import os
import sys
from datetime import datetime
# Leaderboards to pull from the HuggingFace datasets API.
# Each row below is (dataset repo, result key, display name, gated); rows are
# expanded into dicts with the keys "dataset", "key", "name" and "gated".
BENCHMARK_CONFIGS = [
    {"dataset": dataset, "key": key, "name": name, "gated": gated}
    for dataset, key, name, gated in (
        ("openai/gsm8k", "gsm8k", "GSM8K", False),
        ("TIGER-Lab/MMLU-Pro", "mmluPro", "MMLU-Pro", False),
        ("Idavidrein/gpqa", "gpqa", "GPQA Diamond", True),
        ("cais/hle", "hle", "HLE", True),
        (
            "SWE-bench/SWE-bench_Verified",
            "sweVerified",
            "SWE-bench Verified",
            False,
        ),
        ("MathArena/aime_2026", "aime2026", "AIME 2026", False),
        ("MathArena/hmmt_feb_2026", "hmmt2026", "HMMT Feb 2026", False),
        ("allenai/olmOCR-bench", "olmOcr", "olmOCR-bench", False),
        (
            "harborframework/terminal-bench-2.0",
            "terminalBench",
            "Terminal-Bench 2.0",
            False,
        ),
        ("ScaleAI/SWE-bench_Pro", "swePro", "SWE-bench Pro", False),
        ("FutureMa/EvasionBench", "evasionBench", "EvasionBench", False),
    )
]
def fetch_model_parameters(model_id, hf_token=None):
    """Fetch parameter count for a model from HuggingFace API.

    The count is read from the model's ``safetensors`` metadata: the
    ``total`` field when present, otherwise the ``BF16`` entry of
    ``parameters``. Whenever the API cannot be reached, answers with a
    non-200 status, returns a body that is not valid JSON, or carries no
    usable safetensors metadata, the size is parsed from the model name
    instead.

    Args:
        model_id: Model ID (e.g., "meta-llama/Llama-3-70B")
        hf_token: Optional HuggingFace token for private models

    Returns:
        Parameter count in billions, or None if not available
    """
    url = f"https://huggingface.co/api/models/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}

    try:
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code != 200:
            return parse_params_from_name(model_id)
        data = response.json()
    except (requests.RequestException, ValueError):
        # A timeout, connection failure or malformed body for a single model
        # must not abort the whole run; use the name-based estimate instead.
        return parse_params_from_name(model_id)

    # The metadata may be missing or null, so only read it when it is a dict.
    safetensors = data.get("safetensors") if isinstance(data, dict) else None
    if isinstance(safetensors, dict):
        total_params = safetensors.get("total")
        if total_params:
            return round(total_params / 1_000_000_000, 1)
        bf16_params = (safetensors.get("parameters") or {}).get("BF16")
        if bf16_params:
            return round(bf16_params / 1_000_000_000, 1)

    return parse_params_from_name(model_id)
def parse_params_from_name(model_id):
    """Parse parameter count from model name/ID.

    A size token is a number (optionally with a decimal part) followed by
    ``B``/``b``, preceded by ``-``, ``_`` or ``/`` and followed by one of
    those delimiters or the end of the string. When several tokens are
    present the largest one is returned.

    Examples:
        - "meta-llama/Llama-3-70B" -> 70.0
        - "Qwen/Qwen2-72B" -> 72.0
        - "Qwen/Qwen3.5-397B-A17B" -> 397.0 ("A17B" is not a size token
          because the digits are not directly preceded by a delimiter)
        - "org/merge-7B-13B" -> 13.0
        - "microsoft/Phi-3.5-mini-instruct" -> None (no clear size)

    Args:
        model_id: Model ID or name to inspect; empty or None yields None.

    Returns:
        Parameter count in billions, or None
    """
    import re

    if not model_id:
        return None

    # The trailing delimiter is a lookahead so it is NOT consumed: it must
    # remain available as the leading delimiter of an adjacent size token
    # (e.g. the "-" shared by "7B" and "13B" in "-7B-13B").
    matches = re.findall(r"[-_/](\d+\.?\d*)[Bb](?=[-_/]|$)", model_id)
    if not matches:
        return None

    # Take the maximum (assumes full model size is larger than active params).
    return max(float(m) for m in matches)
def _fetch_leaderboard(url, headers):
    """Download one leaderboard; return (entries, None) or (None, reason)."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None, f"status {response.status_code}"
        entries = response.json()
    except (requests.RequestException, ValueError) as exc:
        return None, f"request failed: {exc}"
    if not isinstance(entries, list):
        return None, "unexpected response format"
    return entries, None


def _new_model_record(model_id, param_count):
    """Build the skeleton record for a model seen for the first time."""
    return {
        "id": model_id.lower().replace("/", "-"),
        "name": model_id,
        "provider": model_id.split("/")[0] if "/" in model_id else "Unknown",
        "type": "open",
        "metadata": {
            "license": "Unknown",
            "parametersInBillions": param_count,
            "contextWindow": 0,
            "modality": "text",
            "architecture": "Transformer",
        },
        "benchmarks": {},
    }


def fetch_all_from_apis(hf_token=None):
    """Fetch ALL models from APIs only - no manual data.

    Walks every entry of BENCHMARK_CONFIGS, downloads its leaderboard and
    merges the entries into one record per model. A leaderboard that cannot
    be downloaded or decoded is reported and skipped; the remaining
    leaderboards are still processed. Entries without a ``modelId`` are
    ignored because there is nothing to key the record on.

    Args:
        hf_token: Optional HuggingFace token for accessing gated datasets

    Returns:
        List of model dicts. Each has "id", "name", "provider", "type",
        "metadata" and a "benchmarks" dict mapping the config key to the raw
        leaderboard entry.
    """
    models_dict = {}

    for config in BENCHMARK_CONFIGS:
        url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
        is_gated = config.get("gated", False)

        # Skip gated datasets if no token provided
        if is_gated and not hf_token:
            print(f"Skipping {config['name']} (gated, requires HF token)")
            continue

        print(f"Fetching {config['name']}...")

        # The authorization header is only sent for gated datasets
        headers = {}
        if is_gated:
            headers["Authorization"] = f"Bearer {hf_token}"
            print(" πŸ”’ Using auth token for gated dataset")

        data, failure = _fetch_leaderboard(url, headers)
        if failure is not None:
            print(f" ⚠️ Skip ({failure})")
            continue

        found = 0
        for entry in data:
            model_id = entry.get("modelId") if isinstance(entry, dict) else None
            if not model_id:
                continue
            found += 1

            if model_id not in models_dict:
                # Parameter count is looked up once, on first sight of a model
                param_count = fetch_model_parameters(model_id, hf_token)
                models_dict[model_id] = _new_model_record(model_id, param_count)

            # Keep the raw entry; consumers read its "value" field
            models_dict[model_id]["benchmarks"][config["key"]] = entry

        print(f" βœ“ Found {found} models")

    return list(models_dict.values())
def flatten_model_for_parquet(model, all_benchmark_keys, total_benchmarks=11):
    """Flatten nested model structure for parquet compatibility.

    Converts nested JSON structure into flat columns suitable for parquet
    format. Each benchmark score becomes its own ``<key>_score`` column, and
    every key in ``all_benchmark_keys`` gets a column (None when the model has
    no entry) so that all rows share one schema.

    Args:
        model: Model dict with nested structure ("id", "name", "provider",
            "type", "metadata" and optionally "benchmarks", whose values are
            dicts carrying a "value" field).
        all_benchmark_keys: All possible benchmark keys to ensure consistent
            schema.
        total_benchmarks: Denominator for ``coverage_percent``. Defaults to
            11, the number of entries in BENCHMARK_CONFIGS.

    Returns:
        Flat dict with the model columns, one score column per benchmark key
        (in sorted key order), then "aggregate_score" (mean of the non-null
        scores rounded to 2 places, or None), "coverage_count" (number of
        non-null scores) and "coverage_percent".
    """
    metadata = model["metadata"]
    flat = {
        "model_id": model["id"],
        "model_name": model["name"],
        "provider": model["provider"],
        "model_type": model["type"],
        "parameters_billions": metadata.get("parametersInBillions"),
        "license": metadata.get("license", "Unknown"),
        "context_window": metadata.get("contextWindow", 0),
        "modality": metadata.get("modality", "text"),
        "architecture": metadata.get("architecture", "Transformer"),
    }

    # Add ALL benchmark columns (with None for missing values)
    benchmarks = model.get("benchmarks", {})
    for bench_key in sorted(all_benchmark_keys):
        bench_data = benchmarks.get(bench_key)
        flat[f"{bench_key}_score"] = (
            bench_data.get("value") if bench_data is not None else None
        )

    # Only entries that actually carry a score count towards the aggregate
    # and the coverage, so both metrics are based on the same set.
    scores = [
        value
        for value in (b.get("value") for b in benchmarks.values())
        if value is not None
    ]

    if scores:
        flat["aggregate_score"] = round(sum(scores) / len(scores), 2)
    else:
        flat["aggregate_score"] = None
    flat["coverage_count"] = len(scores)
    if scores and total_benchmarks:
        flat["coverage_percent"] = round((len(scores) / total_benchmarks) * 100, 1)
    else:
        # No scores, or a zero denominator: report no coverage
        flat["coverage_percent"] = 0.0

    return flat
def main():
    """Fetch all leaderboards, flatten them and push the dataset to the Hub.

    Requires the ``HF_TOKEN`` environment variable; it is used both for the
    gated leaderboards and for the upload. Exits with status 1 when the token
    is missing or the upload fails, and with status 0 when no models could be
    fetched (nothing is uploaded in that case).
    """
    from datetime import timezone

    banner = "=" * 70
    print(banner)
    print("Fetching from Official APIs & Uploading to HF Dataset")
    print(banner)
    print()

    # Get HF token from environment (required for upload)
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("❌ HF_TOKEN environment variable required")
        print(" Export your token: export HF_TOKEN=your_token")
        sys.exit(1)
    print("βœ“ HF_TOKEN found")
    print("πŸ”“ Will fetch gated datasets (GPQA, HLE)")
    print()

    # Fetch models from APIs
    models = fetch_all_from_apis(hf_token)
    if not models:
        print("❌ No models fetched - exiting")
        # NOTE(review): exits 0 despite the failure message - confirm that a
        # scheduled run is meant to count this as success.
        sys.exit(0)

    print()
    print(banner)
    print(f"βœ“ Fetched {len(models)} models from APIs")
    print(banner)

    # Collect all benchmark keys to ensure consistent schema
    all_benchmark_keys = set()
    for m in models:
        all_benchmark_keys.update(m.get("benchmarks", {}))
    print(
        f"\nπŸ” Found {len(all_benchmark_keys)} unique benchmarks: {sorted(all_benchmark_keys)}"
    )

    # Flatten data for parquet (pass all_benchmark_keys for consistent schema)
    print("\nπŸ“Š Flattening data for parquet format...")
    flattened_models = [
        flatten_model_for_parquet(m, all_benchmark_keys) for m in models
    ]

    # Create HF Dataset (imported here so the dependency is only needed once
    # there is data to upload)
    from datasets import Dataset

    dataset = Dataset.from_list(flattened_models)
    print(f" βœ“ Created dataset with {len(dataset)} rows")
    print(f" βœ“ Schema: {len(dataset.column_names)} columns")

    # Upload to HuggingFace
    DATASET_REPO = "OpenEvals/leaderboard-data"
    print(f"\nπŸ“€ Uploading to {DATASET_REPO}...")

    # The label says "UTC", so the clock must be read in UTC rather than in
    # the machine's local timezone.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    try:
        dataset.push_to_hub(
            DATASET_REPO,
            token=hf_token,
            commit_message=f"Automated update: {timestamp}",
        )
    except Exception as e:
        # Top-level boundary: report any upload failure and exit non-zero
        print(f" ❌ Upload failed: {e}")
        sys.exit(1)
    print(" βœ… Successfully uploaded!")
    print(f" πŸ”— View at: https://huggingface.co/datasets/{DATASET_REPO}")

    # Show summary; rows follow BENCHMARK_CONFIGS so newly configured
    # benchmarks are reported without touching this function.
    print("\nπŸ“Š Benchmark Coverage:")
    column_names = set(dataset.column_names)
    for config in BENCHMARK_CONFIGS:
        bench = config["key"]
        col_name = f"{bench}_score"
        if col_name not in column_names:
            continue
        # Count non-null values in the column
        count = sum(1 for v in dataset[col_name] if v is not None)
        if count > 0:
            print(f" {bench:20s}: {count:2d} models")

    print("\nβœ… Data updated successfully!")
    print(f" Total models: {len(models)}")
    print(f" Timestamp: {timestamp}")
# Run the fetch-and-upload pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()