Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| # /// script | |
| # dependencies = ["requests", "huggingface-hub", "datasets", "pyarrow"] | |
| # /// | |
| import requests | |
| import json | |
| import os | |
| import sys | |
| from datetime import datetime | |
# Benchmark leaderboard sources. Each spec row is:
#   (HF dataset id, short key used in the output schema, display name, gated?)
# "gated" datasets require an HF token to read their leaderboard.
_BENCHMARK_SPECS = [
    ("openai/gsm8k", "gsm8k", "GSM8K", False),
    ("TIGER-Lab/MMLU-Pro", "mmluPro", "MMLU-Pro", False),
    ("Idavidrein/gpqa", "gpqa", "GPQA Diamond", True),
    ("cais/hle", "hle", "HLE", True),
    ("SWE-bench/SWE-bench_Verified", "sweVerified", "SWE-bench Verified", False),
    ("MathArena/aime_2026", "aime2026", "AIME 2026", False),
    ("MathArena/hmmt_feb_2026", "hmmt2026", "HMMT Feb 2026", False),
    ("allenai/olmOCR-bench", "olmOcr", "olmOCR-bench", False),
    ("harborframework/terminal-bench-2.0", "terminalBench", "Terminal-Bench 2.0", False),
    ("ScaleAI/SWE-bench_Pro", "swePro", "SWE-bench Pro", False),
    ("FutureMa/EvasionBench", "evasionBench", "EvasionBench", False),
]

BENCHMARK_CONFIGS = [
    {"dataset": dataset, "key": key, "name": name, "gated": gated}
    for dataset, key, name, gated in _BENCHMARK_SPECS
]
def fetch_model_parameters(model_id, hf_token=None):
    """Fetch parameter count for a model from HuggingFace API.

    Falls back to parsing the size out of the model name whenever the API
    call fails (non-200, network error, timeout, malformed JSON) or the
    response carries no usable parameter information. This function is
    called once per model inside the leaderboard loop, so a single flaky
    request must not abort the whole run.

    Args:
        model_id: Model ID (e.g., "meta-llama/Llama-3-70B")
        hf_token: Optional HuggingFace token for private models

    Returns:
        Parameter count in billions (float), or None if not available
    """
    url = f"https://huggingface.co/api/models/{model_id}"
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    try:
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code != 200:
            return parse_params_from_name(model_id)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or non-JSON body: best-effort fallback.
        return parse_params_from_name(model_id)
    # The API reports exact weight counts under "safetensors" when available.
    safetensors = data.get("safetensors") or {}
    total_params = safetensors.get("total")
    if total_params:
        return round(total_params / 1_000_000_000, 1)
    # Some repos only report per-dtype counts; BF16 is the common case here.
    bf16_params = (safetensors.get("parameters") or {}).get("BF16")
    if bf16_params:
        return round(bf16_params / 1_000_000_000, 1)
    return parse_params_from_name(model_id)
def parse_params_from_name(model_id):
    """Parse parameter count from model name/ID.

    Examples:
        - "meta-llama/Llama-3-70B" -> 70.0
        - "Qwen/Qwen2-72B" -> 72.0
        - "Qwen/Qwen3.5-397B-A17B" -> 397.0 (full model, not active params)
        - "microsoft/Phi-3.5-mini-instruct" -> None (no clear size)

    Returns:
        Parameter count in billions, or None
    """
    import re

    # A size token is a number (optionally decimal, e.g. "1.5") followed by
    # 'B'/'b', delimited by -, _ or / on the left and -, _, / or end-of-string
    # on the right: -70B, _8B, -1.5B-, ...
    size_token = re.compile(r"[-_/](\d+\.?\d*)[Bb](?:[-_/]|$)")
    sizes = [float(token) for token in size_token.findall(model_id)]
    if not sizes:
        return None
    # MoE names carry both total and active sizes (e.g. 397B-A17B); the full
    # model size is assumed to be the largest number present.
    return max(sizes)
def fetch_all_from_apis(hf_token=None):
    """Fetch ALL models from APIs only - no manual data.

    Iterates every entry in BENCHMARK_CONFIGS, pulls its leaderboard from the
    HuggingFace dataset API, and merges the results into one model record per
    model_id. A failing or gated-without-token benchmark is skipped rather
    than aborting the whole run.

    Args:
        hf_token: Optional HuggingFace token for accessing gated datasets

    Returns:
        List of model dicts, each with metadata and a "benchmarks" mapping
        keyed by benchmark key.
    """
    models_dict = {}
    for config in BENCHMARK_CONFIGS:
        url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
        # Skip gated datasets if no token provided
        if config.get("gated", False) and not hf_token:
            print(f"Skipping {config['name']} (gated, requires HF token)")
            continue
        print(f"Fetching {config['name']}...")
        # Add authorization header for gated datasets
        headers = {}
        if config.get("gated", False) and hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"
            print(f" π Using auth token for gated dataset")
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # One unreachable leaderboard should not kill the other fetches.
            print(f" β οΈ Skip (request failed: {exc})")
            continue
        if response.status_code != 200:
            print(f" β οΈ Skip (status {response.status_code})")
            continue
        data = response.json()
        for entry in data:
            model_id = entry.get("modelId")
            # Rows without a model id would crash .lower() below; skip them.
            if not model_id:
                continue
            # Create or update model
            if model_id not in models_dict:
                # Fetch parameter count from HuggingFace API
                param_count = fetch_model_parameters(model_id, hf_token)
                models_dict[model_id] = {
                    "id": model_id.lower().replace("/", "-"),
                    "name": model_id,
                    "provider": model_id.split("/")[0]
                    if "/" in model_id
                    else "Unknown",
                    "type": "open",
                    "metadata": {
                        # Only the parameter count is known from the API; the
                        # rest are placeholder defaults.
                        "license": "Unknown",
                        "parametersInBillions": param_count,
                        "contextWindow": 0,
                        "modality": "text",
                        "architecture": "Transformer",
                    },
                    "benchmarks": {},
                }
            # Add benchmark score (the whole leaderboard entry is kept)
            models_dict[model_id]["benchmarks"][config["key"]] = entry
        print(f" β Found {len([e for e in data if e.get('modelId')])} models")
    return list(models_dict.values())
def flatten_model_for_parquet(model, all_benchmark_keys, total_benchmarks=11):
    """Flatten nested model structure for parquet compatibility.

    Converts nested JSON structure into flat columns suitable for parquet
    format. Each benchmark score becomes its own column.

    Args:
        model: Model dict with nested structure
        all_benchmark_keys: List of all possible benchmark keys to ensure
            consistent schema
        total_benchmarks: Total number of tracked benchmarks, used as the
            denominator for coverage_percent. Defaults to 11 (the current
            size of BENCHMARK_CONFIGS); previously this was hard-coded and
            could silently drift when benchmarks were added.

    Returns:
        Flat dict with one column per metadata field, one "<key>_score"
        column per benchmark, plus aggregate_score / coverage columns.
    """
    metadata = model["metadata"]
    flat = {
        "model_id": model["id"],
        "model_name": model["name"],
        "provider": model["provider"],
        "model_type": model["type"],
        "parameters_billions": metadata.get("parametersInBillions"),
        "license": metadata.get("license", "Unknown"),
        "context_window": metadata.get("contextWindow", 0),
        "modality": metadata.get("modality", "text"),
        "architecture": metadata.get("architecture", "Transformer"),
    }
    # Add ALL benchmark columns (with None for missing values) so every row
    # shares one schema; sorted() keeps the column order deterministic.
    benchmarks = model.get("benchmarks", {})
    for bench_key in sorted(all_benchmark_keys):
        bench_data = benchmarks.get(bench_key) or {}
        flat[f"{bench_key}_score"] = bench_data.get("value")
    # Aggregate metrics: mean of the scores that are present. When no entry
    # has a usable value, coverage is reported as zero as well.
    scores = [
        b.get("value") for b in benchmarks.values() if b.get("value") is not None
    ]
    if scores:
        flat["aggregate_score"] = round(sum(scores) / len(scores), 2)
        flat["coverage_count"] = len(benchmarks)
        flat["coverage_percent"] = round(
            (len(benchmarks) / total_benchmarks) * 100, 1
        )
    else:
        flat["aggregate_score"] = None
        flat["coverage_count"] = 0
        flat["coverage_percent"] = 0.0
    return flat
def main():
    """Entry point: fetch leaderboard data, flatten it, upload to the Hub.

    Requires the HF_TOKEN environment variable (used both to read gated
    datasets and to push the result). Exits 1 when the token is missing or
    the upload fails, 0 when no models could be fetched.
    """
    print("=" * 70)
    print("Fetching from Official APIs & Uploading to HF Dataset")
    print("=" * 70)
    print()
    # Get HF token from environment (required for upload)
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("β HF_TOKEN environment variable required")
        print(" Export your token: export HF_TOKEN=your_token")
        sys.exit(1)
    print("β HF_TOKEN found")
    print("π Will fetch gated datasets (GPQA, HLE)")
    print()
    # Fetch models from APIs
    models = fetch_all_from_apis(hf_token)
    if not models:
        print("β No models fetched - exiting")
        sys.exit(0)
    print()
    print("=" * 70)
    print(f"β Fetched {len(models)} models from APIs")
    print("=" * 70)
    # Collect all benchmark keys to ensure consistent schema
    all_benchmark_keys = set()
    for m in models:
        all_benchmark_keys.update(m.get("benchmarks", {}).keys())
    print(
        f"\nπ Found {len(all_benchmark_keys)} unique benchmarks: {sorted(all_benchmark_keys)}"
    )
    # Flatten data for parquet (pass all_benchmark_keys for consistent schema)
    print("\nπ Flattening data for parquet format...")
    flattened_models = [
        flatten_model_for_parquet(m, all_benchmark_keys) for m in models
    ]
    # Create HF Dataset
    from datasets import Dataset

    dataset = Dataset.from_list(flattened_models)
    print(f" β Created dataset with {len(dataset)} rows")
    print(f" β Schema: {len(dataset.column_names)} columns")
    # Upload to HuggingFace
    DATASET_REPO = "OpenEvals/leaderboard-data"
    print(f"\nπ€ Uploading to {DATASET_REPO}...")
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
        dataset.push_to_hub(
            DATASET_REPO,
            token=hf_token,
            commit_message=f"Automated update: {timestamp}",
        )
        print(" β Successfully uploaded!")
        print(f" π View at: https://huggingface.co/datasets/{DATASET_REPO}")
    except Exception as e:
        print(f" β Upload failed: {e}")
        sys.exit(1)
    # Show per-benchmark coverage. Keys are derived from BENCHMARK_CONFIGS so
    # this summary cannot drift out of sync with the fetch configuration
    # (previously a second hard-coded copy of the key list lived here).
    print("\nπ Benchmark Coverage:")
    for bench in (config["key"] for config in BENCHMARK_CONFIGS):
        col_name = f"{bench}_score"
        if col_name in dataset.column_names:
            # Count non-null values in the column
            count = sum(1 for v in dataset[col_name] if v is not None)
            if count > 0:
                print(f" {bench:20s}: {count:2d} models")
    print("\nβ Data updated successfully!")
    print(f" Total models: {len(models)}")
    print(f" Timestamp: {timestamp}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()