# Source: Hugging Face Space "OpenEvals/every-leaderboards" (status: Running)
# File size: 11,018 bytes

#!/usr/bin/env python3
# /// script
# dependencies = ["requests", "huggingface-hub", "datasets", "pyarrow"]
# ///

import requests
import json
import os
import sys
from datetime import datetime

# Leaderboards aggregated by this script, one row per benchmark:
#   (dataset repo on the Hub, key used for benchmark dicts / score columns,
#    human-readable name for log output, whether the dataset is gated).
# Gated datasets are skipped when no HF token is available and are fetched
# with an Authorization header otherwise.
BENCHMARK_CONFIGS = [
    {"dataset": dataset, "key": key, "name": name, "gated": gated}
    for dataset, key, name, gated in (
        ("openai/gsm8k", "gsm8k", "GSM8K", False),
        ("TIGER-Lab/MMLU-Pro", "mmluPro", "MMLU-Pro", False),
        ("Idavidrein/gpqa", "gpqa", "GPQA Diamond", True),
        ("cais/hle", "hle", "HLE", True),
        ("SWE-bench/SWE-bench_Verified", "sweVerified", "SWE-bench Verified", False),
        ("MathArena/aime_2026", "aime2026", "AIME 2026", False),
        ("MathArena/hmmt_feb_2026", "hmmt2026", "HMMT Feb 2026", False),
        ("allenai/olmOCR-bench", "olmOcr", "olmOCR-bench", False),
        ("harborframework/terminal-bench-2.0", "terminalBench", "Terminal-Bench 2.0", False),
        ("ScaleAI/SWE-bench_Pro", "swePro", "SWE-bench Pro", False),
        ("FutureMa/EvasionBench", "evasionBench", "EvasionBench", False),
    )
]


def fetch_model_parameters(model_id, hf_token=None):
    """Fetch parameter count for a model from HuggingFace API.

    The count is read from the ``safetensors`` metadata of the model-info
    response (``total`` first, then the ``BF16`` entry of ``parameters``).
    Whenever the API cannot supply a count -- the request fails, the status
    is not 200, the body is not valid JSON, or there is no usable
    ``safetensors`` metadata -- the size is parsed from the model name
    instead.

    Args:
        model_id: Model ID (e.g., "meta-llama/Llama-3-70B")
        hf_token: Optional HuggingFace token for private models

    Returns:
        Parameter count in billions, or None if not available
    """
    if not model_id:
        return None

    url = f"https://huggingface.co/api/models/{model_id}"
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"

    # A timeout or connection error for one model must not abort the whole
    # run; the name-based estimate is the best remaining source.
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException:
        return parse_params_from_name(model_id)

    if response.status_code != 200:
        return parse_params_from_name(model_id)

    try:
        data = response.json()
    except ValueError:
        return parse_params_from_name(model_id)

    safetensors = data.get("safetensors") if isinstance(data, dict) else None
    if isinstance(safetensors, dict):
        total_params = safetensors.get("total")
        if total_params:
            return round(total_params / 1_000_000_000, 1)

        per_dtype = safetensors.get("parameters")
        if isinstance(per_dtype, dict):
            bf16_params = per_dtype.get("BF16")
            if bf16_params:
                return round(bf16_params / 1_000_000_000, 1)

    # Reached both when safetensors metadata is absent and when it is present
    # but holds no usable count.
    return parse_params_from_name(model_id)


def parse_params_from_name(model_id):
    """Parse parameter count from model name/ID.

    A size token is a number (optionally with a decimal part) followed by
    'B' or 'b', preceded by '-', '_' or '/', and followed by one of those
    delimiters or the end of the string. When several tokens are present the
    largest is returned, on the assumption that the full model size is
    larger than the active-parameter count.

    Examples:
        - "meta-llama/Llama-3-70B" -> 70.0
        - "Qwen/Qwen2-72B" -> 72.0
        - "Qwen/Qwen3.5-397B-A17B" -> 397.0 (full model, not active params)
        - "org/model-7B-70B" -> 70.0 (adjacent tokens are both considered)
        - "microsoft/Phi-3.5-mini-instruct" -> None (no clear size)

    Args:
        model_id: Model ID string; None or empty yields None.

    Returns:
        Parameter count in billions, or None
    """
    import re

    if not model_id:
        return None

    # The delimiters are matched with lookbehind/lookahead so they are not
    # consumed: a delimiter shared by two adjacent tokens ("-7B-70B") must be
    # available as the leading delimiter of the second token.
    matches = re.findall(r"(?<=[-_/])(\d+\.?\d*)[Bb](?=[-_/]|$)", model_id)

    if not matches:
        return None

    return max(float(m) for m in matches)


def fetch_all_from_apis(hf_token=None):
    """Fetch ALL models from APIs only - no manual data.

    Queries the leaderboard endpoint of every dataset in BENCHMARK_CONFIGS
    and merges the entries into one record per model ID. A benchmark whose
    request fails, returns a non-200 status, or returns something other than
    a JSON list is reported and skipped; the remaining benchmarks are still
    processed. Entries without a usable "modelId" are ignored.

    Args:
        hf_token: Optional HuggingFace token for accessing gated datasets

    Returns:
        List of model dicts (id, name, provider, type, metadata, benchmarks),
        where "benchmarks" maps each benchmark key to the raw leaderboard
        entry for that model.
    """
    models_dict = {}

    for config in BENCHMARK_CONFIGS:
        url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
        gated = config.get("gated", False)

        # Skip gated datasets if no token provided
        if gated and not hf_token:
            print(f"Skipping {config['name']} (gated, requires HF token)")
            continue

        print(f"Fetching {config['name']}...")

        # Add authorization header for gated datasets
        headers = {}
        if gated:
            headers["Authorization"] = f"Bearer {hf_token}"
            print("  🔒 Using auth token for gated dataset")

        # One unreachable leaderboard should not lose the others.
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print(f"  ⚠️  Skip (request failed: {exc})")
            continue

        if response.status_code != 200:
            print(f"  ⚠️  Skip (status {response.status_code})")
            continue

        try:
            data = response.json()
        except ValueError:
            print("  ⚠️  Skip (invalid JSON response)")
            continue

        if not isinstance(data, list):
            print("  ⚠️  Skip (unexpected response format)")
            continue

        found = 0
        for entry in data:
            if not isinstance(entry, dict):
                continue

            model_id = entry.get("modelId")
            if not model_id or not isinstance(model_id, str):
                # Without a model ID there is nothing to key the record on.
                continue
            found += 1

            # Create the model record the first time this ID is seen
            if model_id not in models_dict:
                # Fetch parameter count from HuggingFace API
                param_count = fetch_model_parameters(model_id, hf_token)

                models_dict[model_id] = {
                    "id": model_id.lower().replace("/", "-"),
                    "name": model_id,
                    "provider": model_id.split("/")[0]
                    if "/" in model_id
                    else "Unknown",
                    "type": "open",
                    "metadata": {
                        "license": "Unknown",
                        "parametersInBillions": param_count,
                        "contextWindow": 0,
                        "modality": "text",
                        "architecture": "Transformer",
                    },
                    "benchmarks": {},
                }

            # Store the raw entry; a later entry for the same model and
            # benchmark replaces an earlier one.
            models_dict[model_id]["benchmarks"][config["key"]] = entry

        print(f"  ✓ Found {found} models")

    return list(models_dict.values())


def flatten_model_for_parquet(model, all_benchmark_keys, total_benchmarks=11):
    """Flatten nested model structure for parquet compatibility.

    Converts nested JSON structure into flat columns suitable for parquet format.
    Each benchmark score becomes its own ``<key>_score`` column, followed by
    three summary columns: ``aggregate_score`` (mean of the non-null scores,
    rounded to 2 decimals), ``coverage_count`` (number of benchmarks with a
    non-null score) and ``coverage_percent`` (coverage_count as a percentage
    of ``total_benchmarks``, rounded to 1 decimal).

    Args:
        model: Model dict with nested structure
        all_benchmark_keys: List of all possible benchmark keys to ensure consistent schema
        total_benchmarks: Denominator for ``coverage_percent``. Defaults to
            11, the value previously hard-coded here.

    Returns:
        Flat dict with one entry per output column.
    """
    metadata = model["metadata"]
    flat = {
        "model_id": model["id"],
        "model_name": model["name"],
        "provider": model["provider"],
        "model_type": model["type"],
        "parameters_billions": metadata.get("parametersInBillions"),
        "license": metadata.get("license", "Unknown"),
        "context_window": metadata.get("contextWindow", 0),
        "modality": metadata.get("modality", "text"),
        "architecture": metadata.get("architecture", "Transformer"),
    }

    # Add ALL benchmark columns (with None for missing values)
    # This ensures consistent schema across all rows
    benchmarks = model.get("benchmarks", {})
    for bench_key in sorted(all_benchmark_keys):
        bench_data = benchmarks.get(bench_key)
        flat[f"{bench_key}_score"] = bench_data.get("value") if bench_data else None

    # Only benchmarks that actually carry a score contribute to the summary,
    # so the count and the mean are always computed over the same set.
    scores = [
        entry.get("value")
        for entry in benchmarks.values()
        if entry and entry.get("value") is not None
    ]

    if scores:
        flat["aggregate_score"] = round(sum(scores) / len(scores), 2)
        flat["coverage_count"] = len(scores)
        flat["coverage_percent"] = (
            round((len(scores) / total_benchmarks) * 100, 1)
            if total_benchmarks > 0
            else 0.0
        )
    else:
        flat["aggregate_score"] = None
        flat["coverage_count"] = 0
        flat["coverage_percent"] = 0.0

    return flat


def main():
    """Fetch all leaderboards, flatten them, and push the result to the Hub.

    Steps: read HF_TOKEN from the environment (exit 1 if missing), fetch the
    models via fetch_all_from_apis, flatten each one with a shared set of
    benchmark columns, build a ``datasets.Dataset``, push it to
    OpenEvals/leaderboard-data, then print a per-benchmark coverage summary.
    Exits with status 1 if the upload fails.
    """
    # Local imports: `datasets` is only needed on the upload path, and
    # `timezone` is needed to produce a timestamp that really is UTC.
    from datetime import timezone

    from datasets import Dataset

    print("=" * 70)
    print("Fetching from Official APIs & Uploading to HF Dataset")
    print("=" * 70)
    print()

    # Get HF token from environment (required for upload)
    hf_token = os.environ.get("HF_TOKEN")

    if not hf_token:
        print("❌ HF_TOKEN environment variable required")
        print("   Export your token: export HF_TOKEN=your_token")
        sys.exit(1)

    print("✓ HF_TOKEN found")
    print("🔓 Will fetch gated datasets (GPQA, HLE)")
    print()

    # Fetch models from APIs
    models = fetch_all_from_apis(hf_token)

    if not models:
        print("❌ No models fetched - exiting")
        # NOTE(review): exit status 0 is preserved from the original even
        # though nothing was uploaded - confirm this is what the scheduler
        # running this script expects.
        sys.exit(0)

    print()
    print("=" * 70)
    print(f"✓ Fetched {len(models)} models from APIs")
    print("=" * 70)

    # Collect all benchmark keys to ensure consistent schema
    all_benchmark_keys = set()
    for m in models:
        all_benchmark_keys.update(m.get("benchmarks", {}))

    print(
        f"\n🔍 Found {len(all_benchmark_keys)} unique benchmarks: {sorted(all_benchmark_keys)}"
    )

    # Flatten data for parquet (pass all_benchmark_keys for consistent schema)
    print("\n📊 Flattening data for parquet format...")
    flattened_models = [
        flatten_model_for_parquet(m, all_benchmark_keys) for m in models
    ]

    # Create HF Dataset
    dataset = Dataset.from_list(flattened_models)

    print(f"   ✓ Created dataset with {len(dataset)} rows")
    print(f"   ✓ Schema: {len(dataset.column_names)} columns")

    # Upload to HuggingFace
    DATASET_REPO = "OpenEvals/leaderboard-data"

    print(f"\n📤 Uploading to {DATASET_REPO}...")

    # The label says UTC, so take the time in UTC rather than local time.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    try:
        dataset.push_to_hub(
            DATASET_REPO,
            token=hf_token,
            commit_message=f"Automated update: {timestamp}",
        )
    except Exception as e:
        # Top-level boundary: report any upload error and fail the run.
        print(f"   ❌ Upload failed: {e}")
        sys.exit(1)

    print("   ✅ Successfully uploaded!")
    print(f"   🔗 View at: https://huggingface.co/datasets/{DATASET_REPO}")

    # Show summary, driven by the configured benchmarks so the list cannot
    # drift from BENCHMARK_CONFIGS.
    benchmark_keys = [config["key"] for config in BENCHMARK_CONFIGS]
    column_names = set(dataset.column_names)

    print("\n📊 Benchmark Coverage:")
    for bench in benchmark_keys:
        col_name = f"{bench}_score"
        if col_name not in column_names:
            continue
        # Count non-null values in the column
        count = sum(1 for v in dataset[col_name] if v is not None)
        if count > 0:
            print(f"   {bench:20s}: {count:2d} models")

    print("\n✅ Data updated successfully!")
    print(f"   Total models: {len(models)}")
    print(f"   Timestamp: {timestamp}")

if __name__ == "__main__":
    main()