Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| # /// script | |
| # dependencies = ["requests", "huggingface-hub", "datasets", "pyarrow"] | |
| # /// | |
| import requests | |
| import json | |
| import os | |
| import sys | |
| from datetime import datetime | |
# Benchmark leaderboard sources. Each spec row is:
#   (HF dataset id, short key used in the output schema, display name, gated?)
# "gated" datasets require an HF token to read their leaderboard.
_BENCHMARK_SPECS = [
    ("openai/gsm8k", "gsm8k", "GSM8K", False),
    ("TIGER-Lab/MMLU-Pro", "mmluPro", "MMLU-Pro", False),
    ("Idavidrein/gpqa", "gpqa", "GPQA Diamond", True),
    ("cais/hle", "hle", "HLE", True),
    ("SWE-bench/SWE-bench_Verified", "sweVerified", "SWE-bench Verified", False),
    ("MathArena/aime_2026", "aime2026", "AIME 2026", False),
    ("MathArena/hmmt_feb_2026", "hmmt2026", "HMMT Feb 2026", False),
    ("allenai/olmOCR-bench", "olmOcr", "olmOCR-bench", False),
    ("harborframework/terminal-bench-2.0", "terminalBench", "Terminal-Bench 2.0", False),
    ("ScaleAI/SWE-bench_Pro", "swePro", "SWE-bench Pro", False),
    ("FutureMa/EvasionBench", "evasionBench", "EvasionBench", False),
]

BENCHMARK_CONFIGS = [
    {"dataset": dataset, "key": key, "name": name, "gated": gated}
    for dataset, key, name, gated in _BENCHMARK_SPECS
]
def fetch_model_parameters(model_id, hf_token=None):
    """Fetch parameter count for a model from HuggingFace API.

    Falls back to parsing the size out of the model name whenever the API
    call fails (non-200, network error, timeout, malformed JSON) or the
    response carries no usable parameter information. This function is
    called once per model inside the leaderboard loop, so a single flaky
    request must not abort the whole run.

    Args:
        model_id: Model ID (e.g., "meta-llama/Llama-3-70B")
        hf_token: Optional HuggingFace token for private models

    Returns:
        Parameter count in billions (float), or None if not available
    """
    url = f"https://huggingface.co/api/models/{model_id}"
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    try:
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code != 200:
            return parse_params_from_name(model_id)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or non-JSON body: best-effort fallback.
        return parse_params_from_name(model_id)
    # The API reports exact weight counts under "safetensors" when available.
    safetensors = data.get("safetensors") or {}
    total_params = safetensors.get("total")
    if total_params:
        return round(total_params / 1_000_000_000, 1)
    # Some repos only report per-dtype counts; BF16 is the common case here.
    bf16_params = (safetensors.get("parameters") or {}).get("BF16")
    if bf16_params:
        return round(bf16_params / 1_000_000_000, 1)
    return parse_params_from_name(model_id)
def parse_params_from_name(model_id):
    """Parse parameter count from model name/ID.

    Examples:
        - "meta-llama/Llama-3-70B" -> 70.0
        - "Qwen/Qwen2-72B" -> 72.0
        - "Qwen/Qwen3.5-397B-A17B" -> 397.0 (full model, not active params)
        - "microsoft/Phi-3.5-mini-instruct" -> None (no clear size)

    Returns:
        Parameter count in billions, or None
    """
    import re

    # A size token is a number (optionally decimal, e.g. "1.5") followed by
    # 'B'/'b', delimited by -, _ or / on the left and -, _, / or end-of-string
    # on the right: -70B, _8B, -1.5B-, ...
    size_token = re.compile(r"[-_/](\d+\.?\d*)[Bb](?:[-_/]|$)")
    sizes = [float(token) for token in size_token.findall(model_id)]
    if not sizes:
        return None
    # MoE names carry both total and active sizes (e.g. 397B-A17B); the full
    # model size is assumed to be the largest number present.
    return max(sizes)
def fetch_all_from_apis(hf_token=None):
    """Fetch ALL models from APIs only - no manual data.

    Iterates every entry in BENCHMARK_CONFIGS, pulls its leaderboard from the
    HuggingFace dataset API, and merges the results into one model record per
    model_id. A failing or gated-without-token benchmark is skipped rather
    than aborting the whole run.

    Args:
        hf_token: Optional HuggingFace token for accessing gated datasets

    Returns:
        List of model dicts, each with metadata and a "benchmarks" mapping
        keyed by benchmark key.
    """
    models_dict = {}
    for config in BENCHMARK_CONFIGS:
        url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
        # Skip gated datasets if no token provided
        if config.get("gated", False) and not hf_token:
            print(f"Skipping {config['name']} (gated, requires HF token)")
            continue
        print(f"Fetching {config['name']}...")
        # Add authorization header for gated datasets
        headers = {}
        if config.get("gated", False) and hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"
            print(f" π Using auth token for gated dataset")
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # One unreachable leaderboard should not kill the other fetches.
            print(f" β οΈ Skip (request failed: {exc})")
            continue
        if response.status_code != 200:
            print(f" β οΈ Skip (status {response.status_code})")
            continue
        data = response.json()
        for entry in data:
            model_id = entry.get("modelId")
            # Rows without a model id would crash .lower() below; skip them.
            if not model_id:
                continue
            # Create or update model
            if model_id not in models_dict:
                # Fetch parameter count from HuggingFace API
                param_count = fetch_model_parameters(model_id, hf_token)
                models_dict[model_id] = {
                    "id": model_id.lower().replace("/", "-"),
                    "name": model_id,
                    "provider": model_id.split("/")[0]
                    if "/" in model_id
                    else "Unknown",
                    "type": "open",
                    "metadata": {
                        # Only the parameter count is known from the API; the
                        # rest are placeholder defaults.
                        "license": "Unknown",
                        "parametersInBillions": param_count,
                        "contextWindow": 0,
                        "modality": "text",
                        "architecture": "Transformer",
                    },
                    "benchmarks": {},
                }
            # Add benchmark score (the whole leaderboard entry is kept)
            models_dict[model_id]["benchmarks"][config["key"]] = entry
        print(f" β Found {len([e for e in data if e.get('modelId')])} models")
    return list(models_dict.values())
def flatten_model_for_parquet(model, all_benchmark_keys, total_benchmarks=11):
    """Flatten nested model structure for parquet compatibility.

    Converts nested JSON structure into flat columns suitable for parquet
    format. Each benchmark score becomes its own column.

    Args:
        model: Model dict with nested structure
        all_benchmark_keys: List of all possible benchmark keys to ensure
            consistent schema
        total_benchmarks: Total number of tracked benchmarks, used as the
            denominator for coverage_percent. Defaults to 11 (the current
            size of BENCHMARK_CONFIGS); previously this was hard-coded and
            could silently drift when benchmarks were added.

    Returns:
        Flat dict with one column per metadata field, one "<key>_score"
        column per benchmark, plus aggregate_score / coverage columns.
    """
    metadata = model["metadata"]
    flat = {
        "model_id": model["id"],
        "model_name": model["name"],
        "provider": model["provider"],
        "model_type": model["type"],
        "parameters_billions": metadata.get("parametersInBillions"),
        "license": metadata.get("license", "Unknown"),
        "context_window": metadata.get("contextWindow", 0),
        "modality": metadata.get("modality", "text"),
        "architecture": metadata.get("architecture", "Transformer"),
    }
    # Add ALL benchmark columns (with None for missing values) so every row
    # shares one schema; sorted() keeps the column order deterministic.
    benchmarks = model.get("benchmarks", {})
    for bench_key in sorted(all_benchmark_keys):
        bench_data = benchmarks.get(bench_key) or {}
        flat[f"{bench_key}_score"] = bench_data.get("value")
    # Aggregate metrics: mean of the scores that are present. When no entry
    # has a usable value, coverage is reported as zero as well.
    scores = [
        b.get("value") for b in benchmarks.values() if b.get("value") is not None
    ]
    if scores:
        flat["aggregate_score"] = round(sum(scores) / len(scores), 2)
        flat["coverage_count"] = len(benchmarks)
        flat["coverage_percent"] = round(
            (len(benchmarks) / total_benchmarks) * 100, 1
        )
    else:
        flat["aggregate_score"] = None
        flat["coverage_count"] = 0
        flat["coverage_percent"] = 0.0
    return flat
def main():
    """Entry point: fetch leaderboard data, flatten it, upload to the Hub.

    Requires the HF_TOKEN environment variable (used both to read gated
    datasets and to push the result). Exits 1 when the token is missing or
    the upload fails, 0 when no models could be fetched.
    """
    print("=" * 70)
    print("Fetching from Official APIs & Uploading to HF Dataset")
    print("=" * 70)
    print()
    # Get HF token from environment (required for upload)
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("β HF_TOKEN environment variable required")
        print(" Export your token: export HF_TOKEN=your_token")
        sys.exit(1)
    print("β HF_TOKEN found")
    print("π Will fetch gated datasets (GPQA, HLE)")
    print()
    # Fetch models from APIs
    models = fetch_all_from_apis(hf_token)
    if not models:
        print("β No models fetched - exiting")
        sys.exit(0)
    print()
    print("=" * 70)
    print(f"β Fetched {len(models)} models from APIs")
    print("=" * 70)
    # Collect all benchmark keys to ensure consistent schema
    all_benchmark_keys = set()
    for m in models:
        all_benchmark_keys.update(m.get("benchmarks", {}).keys())
    print(
        f"\nπ Found {len(all_benchmark_keys)} unique benchmarks: {sorted(all_benchmark_keys)}"
    )
    # Flatten data for parquet (pass all_benchmark_keys for consistent schema)
    print("\nπ Flattening data for parquet format...")
    flattened_models = [
        flatten_model_for_parquet(m, all_benchmark_keys) for m in models
    ]
    # Create HF Dataset
    from datasets import Dataset

    dataset = Dataset.from_list(flattened_models)
    print(f" β Created dataset with {len(dataset)} rows")
    print(f" β Schema: {len(dataset.column_names)} columns")
    # Upload to HuggingFace
    DATASET_REPO = "OpenEvals/leaderboard-data"
    print(f"\nπ€ Uploading to {DATASET_REPO}...")
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
        dataset.push_to_hub(
            DATASET_REPO,
            token=hf_token,
            commit_message=f"Automated update: {timestamp}",
        )
        print(" β Successfully uploaded!")
        print(f" π View at: https://huggingface.co/datasets/{DATASET_REPO}")
    except Exception as e:
        print(f" β Upload failed: {e}")
        sys.exit(1)
    # Show per-benchmark coverage. Keys are derived from BENCHMARK_CONFIGS so
    # this summary cannot drift out of sync with the fetch configuration
    # (previously a second hard-coded copy of the key list lived here).
    print("\nπ Benchmark Coverage:")
    for bench in (config["key"] for config in BENCHMARK_CONFIGS):
        col_name = f"{bench}_score"
        if col_name in dataset.column_names:
            # Count non-null values in the column
            count = sum(1 for v in dataset[col_name] if v is not None)
            if count > 0:
                print(f" {bench:20s}: {count:2d} models")
    print("\nβ Data updated successfully!")
    print(f" Total models: {len(models)}")
    print(f" Timestamp: {timestamp}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()