#!/usr/bin/env python3
# /// script
# dependencies = ["requests", "huggingface-hub", "datasets", "pyarrow"]
# ///
"""Fetch model benchmark leaderboards from HuggingFace APIs and publish them.

Pulls per-benchmark leaderboard entries for every dataset in
``BENCHMARK_CONFIGS``, enriches each model with a parameter count, flattens
the result into a parquet-friendly schema, and pushes it to the
``OpenEvals/leaderboard-data`` dataset repo. Requires ``HF_TOKEN`` in the
environment (for gated datasets and for the upload).
"""
import json
import os
import re
import sys
from datetime import datetime, timezone

# Benchmarks to aggregate. "key" is the column prefix in the flattened output;
# "gated" marks datasets that require an HF token to read.
BENCHMARK_CONFIGS = [
    {"dataset": "openai/gsm8k", "key": "gsm8k", "name": "GSM8K", "gated": False},
    {
        "dataset": "TIGER-Lab/MMLU-Pro",
        "key": "mmluPro",
        "name": "MMLU-Pro",
        "gated": False,
    },
    {
        "dataset": "Idavidrein/gpqa",
        "key": "gpqa",
        "name": "GPQA Diamond",
        "gated": True,
    },
    {"dataset": "cais/hle", "key": "hle", "name": "HLE", "gated": True},
    {
        "dataset": "SWE-bench/SWE-bench_Verified",
        "key": "sweVerified",
        "name": "SWE-bench Verified",
        "gated": False,
    },
    {
        "dataset": "MathArena/aime_2026",
        "key": "aime2026",
        "name": "AIME 2026",
        "gated": False,
    },
    {
        "dataset": "MathArena/hmmt_feb_2026",
        "key": "hmmt2026",
        "name": "HMMT Feb 2026",
        "gated": False,
    },
    {
        "dataset": "allenai/olmOCR-bench",
        "key": "olmOcr",
        "name": "olmOCR-bench",
        "gated": False,
    },
    {
        "dataset": "harborframework/terminal-bench-2.0",
        "key": "terminalBench",
        "name": "Terminal-Bench 2.0",
        "gated": False,
    },
    {
        "dataset": "ScaleAI/SWE-bench_Pro",
        "key": "swePro",
        "name": "SWE-bench Pro",
        "gated": False,
    },
    {
        "dataset": "FutureMa/EvasionBench",
        "key": "evasionBench",
        "name": "EvasionBench",
        "gated": False,
    },
]

# Matches a size token like "-70B", "_8B", "/1.5B" bounded by -, _, / or
# end-of-string, so e.g. the "17B" in "A17B" (active params) is NOT matched
# on its own unless delimited.
_PARAM_SIZE_RE = re.compile(r"[-_/](\d+\.?\d*)[Bb](?:[-_/]|$)")


def fetch_model_parameters(model_id, hf_token=None):
    """Fetch parameter count for a model from the HuggingFace API.

    Args:
        model_id: Model ID (e.g., "meta-llama/Llama-3-70B")
        hf_token: Optional HuggingFace token for private models

    Returns:
        Parameter count in billions, or None if not available
    """
    # Deferred third-party import: keeps the pure parsing/flattening helpers
    # importable in environments without requests installed.
    import requests

    url = f"https://huggingface.co/api/models/{model_id}"
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException:
        # BUGFIX: a timeout/DNS failure previously crashed the whole run;
        # degrade to parsing the size out of the model name instead.
        return parse_params_from_name(model_id)
    if response.status_code != 200:
        return parse_params_from_name(model_id)

    data = response.json()
    if "safetensors" in data:
        # Prefer the exact total reported by the safetensors metadata.
        total_params = data["safetensors"].get("total")
        if total_params:
            return round(total_params / 1_000_000_000, 1)
        # Fall back to the BF16 tensor count when no total is reported.
        if "parameters" in data["safetensors"]:
            bf16_params = data["safetensors"]["parameters"].get("BF16")
            if bf16_params:
                return round(bf16_params / 1_000_000_000, 1)
    return parse_params_from_name(model_id)


def parse_params_from_name(model_id):
    """Parse parameter count from model name/ID.

    Examples:
        - "meta-llama/Llama-3-70B" -> 70.0
        - "Qwen/Qwen2-72B" -> 72.0
        - "Qwen/Qwen3.5-397B-A17B" -> 397.0 (full model, not active params)
        - "microsoft/Phi-3.5-mini-instruct" -> None (no clear size)

    Returns:
        Parameter count in billions, or None
    """
    matches = _PARAM_SIZE_RE.findall(model_id)
    if matches:
        # Take the maximum of all size tokens: the full model size is assumed
        # to be larger than any active-parameter count (e.g. MoE "A17B").
        return max(float(m) for m in matches)
    return None


def fetch_all_from_apis(hf_token=None):
    """Fetch ALL models from the leaderboard APIs only - no manual data.

    Args:
        hf_token: Optional HuggingFace token for accessing gated datasets

    Returns:
        List of model dicts with nested "metadata" and "benchmarks" keys.
    """
    # Deferred third-party import (see fetch_model_parameters).
    import requests

    models_dict = {}
    for config in BENCHMARK_CONFIGS:
        url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"

        # Gated datasets are unreadable without a token; skip them outright.
        if config.get("gated", False) and not hf_token:
            print(f"Skipping {config['name']} (gated, requires HF token)")
            continue

        print(f"Fetching {config['name']}...")
        headers = {}
        if config.get("gated", False) and hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"
            print(" šŸ”’ Using auth token for gated dataset")

        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Best-effort per benchmark: one unreachable endpoint should not
            # abort the entire aggregation run.
            print(f" āš ļø Skip (request failed: {e})")
            continue
        if response.status_code != 200:
            print(f" āš ļø Skip (status {response.status_code})")
            continue

        data = response.json()
        for entry in data:
            model_id = entry.get("modelId")
            if not model_id:
                # BUGFIX: entries without a modelId previously crashed on
                # model_id.lower(); skip them instead.
                continue
            if model_id not in models_dict:
                # First sighting of this model: build its skeleton record,
                # including a parameter count looked up from the HF API.
                param_count = fetch_model_parameters(model_id, hf_token)
                models_dict[model_id] = {
                    "id": model_id.lower().replace("/", "-"),
                    "name": model_id,
                    "provider": model_id.split("/")[0] if "/" in model_id else "Unknown",
                    "type": "open",
                    "metadata": {
                        "license": "Unknown",
                        "parametersInBillions": param_count,
                        "contextWindow": 0,
                        "modality": "text",
                        "architecture": "Transformer",
                    },
                    "benchmarks": {},
                }
            # Store the whole leaderboard entry under this benchmark's key.
            models_dict[model_id]["benchmarks"][config["key"]] = entry

        print(f" āœ“ Found {sum(1 for e in data if e.get('modelId'))} models")

    return list(models_dict.values())


def flatten_model_for_parquet(model, all_benchmark_keys):
    """Flatten nested model structure for parquet compatibility.

    Converts the nested JSON structure into flat columns suitable for parquet
    format. Each benchmark score becomes its own column; missing benchmarks
    become None so every row shares one schema.

    Args:
        model: Model dict with nested structure
        all_benchmark_keys: List of all possible benchmark keys to ensure
            consistent schema

    Returns:
        Flat dict with scalar-valued columns only.
    """
    metadata = model["metadata"]
    flat = {
        "model_id": model["id"],
        "model_name": model["name"],
        "provider": model["provider"],
        "model_type": model["type"],
        "parameters_billions": metadata.get("parametersInBillions"),
        "license": metadata.get("license", "Unknown"),
        "context_window": metadata.get("contextWindow", 0),
        "modality": metadata.get("modality", "text"),
        "architecture": metadata.get("architecture", "Transformer"),
    }

    # One column per known benchmark, None where this model has no entry.
    benchmarks = model.get("benchmarks", {})
    for bench_key in sorted(all_benchmark_keys):
        entry = benchmarks.get(bench_key)
        flat[f"{bench_key}_score"] = entry.get("value") if entry else None

    # Aggregate metrics: unweighted mean of present scores, plus coverage
    # relative to the full benchmark roster (was a hard-coded 11).
    scores = [
        b.get("value") for b in benchmarks.values() if b.get("value") is not None
    ]
    if scores:
        flat["aggregate_score"] = round(sum(scores) / len(scores), 2)
        flat["coverage_count"] = len(benchmarks)
        flat["coverage_percent"] = round(
            (len(benchmarks) / len(BENCHMARK_CONFIGS)) * 100, 1
        )
    else:
        flat["aggregate_score"] = None
        flat["coverage_count"] = 0
        flat["coverage_percent"] = 0.0

    return flat


def main():
    """Entry point: fetch, flatten, and upload the leaderboard dataset."""
    print("=" * 70)
    print("Fetching from Official APIs & Uploading to HF Dataset")
    print("=" * 70)
    print()

    # The token is mandatory: needed for gated datasets AND for the upload.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("āŒ HF_TOKEN environment variable required")
        print(" Export your token: export HF_TOKEN=your_token")
        sys.exit(1)

    print("āœ“ HF_TOKEN found")
    print("šŸ”“ Will fetch gated datasets (GPQA, HLE)")
    print()

    models = fetch_all_from_apis(hf_token)
    if not models:
        print("āŒ No models fetched - exiting")
        sys.exit(0)

    print()
    print("=" * 70)
    print(f"āœ“ Fetched {len(models)} models from APIs")
    print("=" * 70)

    # Union of benchmark keys actually seen -> one consistent parquet schema.
    all_benchmark_keys = set()
    for m in models:
        all_benchmark_keys.update(m.get("benchmarks", {}).keys())
    print(
        f"\nšŸ” Found {len(all_benchmark_keys)} unique benchmarks: {sorted(all_benchmark_keys)}"
    )

    print("\nšŸ“Š Flattening data for parquet format...")
    flattened_models = [
        flatten_model_for_parquet(m, all_benchmark_keys) for m in models
    ]

    # Heavy third-party import kept local to upload time.
    from datasets import Dataset

    dataset = Dataset.from_list(flattened_models)
    print(f" āœ“ Created dataset with {len(dataset)} rows")
    print(f" āœ“ Schema: {len(dataset.column_names)} columns")

    DATASET_REPO = "OpenEvals/leaderboard-data"
    print(f"\nšŸ“¤ Uploading to {DATASET_REPO}...")
    # BUGFIX: datetime.now() returned *local* time but was labelled "UTC";
    # use a timezone-aware UTC timestamp so the label is truthful.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    try:
        dataset.push_to_hub(
            DATASET_REPO,
            token=hf_token,
            commit_message=f"Automated update: {timestamp}",
        )
        print(" āœ… Successfully uploaded!")
        print(f" šŸ”— View at: https://huggingface.co/datasets/{DATASET_REPO}")
    except Exception as e:
        print(f" āŒ Upload failed: {e}")
        sys.exit(1)

    # Coverage summary, driven directly by BENCHMARK_CONFIGS so it can never
    # drift out of sync with the fetch list (was a hand-copied key list).
    print("\nšŸ“Š Benchmark Coverage:")
    for bench in (c["key"] for c in BENCHMARK_CONFIGS):
        col_name = f"{bench}_score"
        if col_name in dataset.column_names:
            # Count non-null values in the column.
            count = sum(1 for v in dataset[col_name] if v is not None)
            if count > 0:
                print(f" {bench:20s}: {count:2d} models")

    print("\nāœ… Data updated successfully!")
    print(f" Total models: {len(models)}")
    print(f" Timestamp: {timestamp}")


if __name__ == "__main__":
    main()