Spaces:

CatsCanWrite
/

MewsicBench

Running

File size: 7,382 Bytes

import json
import os
import time
from datetime import datetime, timezone

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

# ---------------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------------
# Hugging Face dataset repos backing this Space. Scores are read from
# SCORES_REPO; evaluation requests are read from and uploaded to REQUESTS_REPO.
SCORES_REPO = "CatsCanWrite/mewsicbench-scores"      # Public dataset
REQUESTS_REPO = "CatsCanWrite/mewsicbench-requests"  # Public dataset


# Hub access token taken from the environment. When it is unset,
# load_leaderboard() returns the empty leaderboard and request_model()
# answers with a configuration error instead of contacting the Hub.
HF_TOKEN = os.environ.get("HF_TOKEN")
API = HfApi()  # Shared Hub client; used to upload new request files.

# Rate limiting: minimum seconds between submissions from the same IP
RATE_LIMIT_SECONDS = 10

# In-memory rate limit tracker: {client_ip: last_request_timestamp}
# Timestamps are time.time() values. The dict lives in this process only,
# so its contents are lost whenever the process restarts.
_rate_limit_store: dict[str, float] = {}


def _get_client_ip(request) -> str:
    """Extract the client IP from a Gradio request object.

    Two attribute paths are probed in order: ``request.client.host`` and
    ``request.request.client.host``. Each path is tried independently, so a
    missing or ``None`` link in the first path no longer prevents the second
    from being checked.

    Args:
        request: The request object handed to the event handler. May be
            ``None`` or lack any of the probed attributes.

    Returns:
        The first non-empty host found, as a string, or ``"unknown"`` when
        neither path yields one.

    NOTE(review): when the app runs behind a reverse proxy this is presumably
    the proxy's address rather than the end user's -- confirm for the target
    deployment.
    """
    attribute_paths = (
        ("client", "host"),
        ("request", "client", "host"),
    )
    for path in attribute_paths:
        node = request
        try:
            for attr in path:
                node = getattr(node, attr)
        except Exception:
            # Best effort: a broken path must never crash the handler, it
            # just means this candidate is unavailable. Try the next one.
            continue
        if node:
            return str(node)
    return "unknown"


def _is_rate_limited(client_ip: str) -> bool:
    """Check whether ``client_ip`` has submitted too recently.

    A request is rejected when the same key was accepted less than
    ``RATE_LIMIT_SECONDS`` ago. An accepted request records the current time
    for its key and also evicts every entry that has already expired, so the
    tracker only ever holds keys seen within the last window instead of
    growing with every distinct client.

    Args:
        client_ip: Key identifying the caller (as produced by
            ``_get_client_ip``).

    Returns:
        ``True`` if the caller must wait, ``False`` if the request is allowed
        (in which case the caller's timestamp has been updated).
    """
    now = time.time()
    last = _rate_limit_store.get(client_ip)
    if last is not None and now - last < RATE_LIMIT_SECONDS:
        return True

    # Evict expired entries. Iterate over a snapshot and use pop(..., None)
    # so a handler running concurrently cannot invalidate the iteration or
    # cause a KeyError by removing the same key first.
    for ip, seen_at in list(_rate_limit_store.items()):
        if now - seen_at >= RATE_LIMIT_SECONDS:
            _rate_limit_store.pop(ip, None)

    _rate_limit_store[client_ip] = now
    return False


def load_leaderboard() -> "pd.DataFrame | pd.io.formats.style.Styler":
    """Load the latest scores from the scores dataset.

    The ``train`` split of ``SCORES_REPO`` is reduced to the known columns,
    ranked by overall score (descending), given human-readable headers and
    wrapped in a pandas ``Styler`` that colours and formats the score columns.

    Returns:
        A ``Styler`` over the leaderboard frame on success. The plain frame
        from ``empty_leaderboard()`` when ``HF_TOKEN`` is unset or when
        loading/formatting raises (the error is printed, not re-raised).
    """
    if not HF_TOKEN:
        return empty_leaderboard()

    # Raw dataset column -> header shown in the UI.
    col_map = {
        "model_id": "Model ID",
        "overall_score": "Overall",
        "meter_score": "Meter",
        "verse_score": "Verse",
        "focus_score": "Focus",
        "avg_think_tokens": "Thinking",
        "evaluated_at": "Evaluated At",
    }
    score_headers = ("Overall", "Meter", "Verse", "Focus")

    def color_score(val):
        """Return the CSS colour rule for a score given as a 0-1 fraction."""
        if pd.isna(val):
            return ""  # Missing score: leave the cell unstyled.
        if val >= 0.9:
            return "color: #008000;"   # green
        elif val >= 0.75:
            return "color: #606e00;"   # yellow-green
        elif val >= 0.5:
            return "color: #ff6f00;"   # orange
        else:
            return "color: #9f0000;"   # red

    try:
        from datasets import load_dataset

        ds = load_dataset(SCORES_REPO, token=HF_TOKEN)
        df = ds["train"].to_pandas()

        # Keep only the known columns this dataset revision actually provides.
        display_cols = [c for c in col_map if c in df.columns]
        df = df[display_cols].copy()

        # Sort by overall score descending before formatting
        if "overall_score" in df.columns:
            df = df.sort_values("overall_score", ascending=False).reset_index(drop=True)
            df.insert(0, "Rank", range(1, len(df) + 1))

        # Format dates; unparseable values are coerced to missing.
        if "evaluated_at" in df.columns:
            df["evaluated_at"] = pd.to_datetime(df["evaluated_at"], errors="coerce")
            df["evaluated_at"] = df["evaluated_at"].dt.strftime("%Y-%m-%d %H:%M")

        # Rename to human-readable names
        df.rename(columns=col_map, inplace=True)

        # Ensure column order matches empty leaderboard
        final_cols = [c for c in empty_leaderboard().columns if c in df.columns]
        df = df[final_cols]

        # Style only the columns that are present. Styler.map resolves its
        # subset lazily, so naming a missing column would fail at render
        # time, outside this try block.
        present_scores = [c for c in score_headers if c in df.columns]
        formats = {c: "{:.1%}" for c in present_scores}
        if "Thinking" in df.columns:
            formats["Thinking"] = "{:0.0f}"

        styled = df.style
        if present_scores:
            styled = styled.map(color_score, subset=present_scores)
        # na_rep keeps missing values from rendering as "nan%" / "nan".
        styled = styled.format(formats, na_rep="N/A").hide(axis="index")

        return styled
    except Exception as e:
        print(f"Could not load scores dataset: {e}")
        return empty_leaderboard()



def empty_leaderboard() -> pd.DataFrame:
    """Return a zero-row frame carrying the leaderboard's display headers.

    The header order defined here is the order in which leaderboard columns
    are displayed.
    """
    headers = ["Rank", "Model ID"]
    headers += ["Overall", "Meter", "Verse", "Focus"]
    headers += ["Thinking", "Evaluated At"]
    return pd.DataFrame(columns=headers)

def _format_fraction(value) -> str:
    """Render a 0-1 score as a whole-number percentage, or "N/A" if unusable."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return "N/A"
    if value != value:  # NaN is the only value that differs from itself.
        return "N/A"
    return f"{value:.0%}"


def _is_safe_model_id(model_id: str) -> bool:
    """Tell whether ``model_id`` can be embedded in a repo file path safely."""
    if "\\" in model_id or not model_id.isprintable():
        return False
    # Empty, "." and ".." segments could point outside the intended folder.
    return all(segment not in ("", ".", "..") for segment in model_id.split("/"))


def _fetch_json_record(repo_id: str, filename: str):
    """Return the JSON object stored at ``filename`` in a dataset repo, else None."""
    try:
        path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        with open(path, "r", encoding="utf-8") as f:
            record = json.load(f)
    except Exception:
        # Deliberately broad: a missing file is the normal case for a new
        # model, and any other failure is likewise treated as "no record".
        return None
    return record if isinstance(record, dict) else None


def request_model(model_id: str, request: gr.Request) -> str:
    """
    Handle a model evaluation request.

    1. If the model already has a score, report it.
    2. If the model is already in the requests dataset, report that.
    3. Otherwise, add it to the requests dataset.

    Args:
        model_id: Identifier typed by the user. It is stripped and
            lower-cased, then used as the file stem under ``scores/`` and
            ``requests/`` in the respective dataset repos.
        request: Request object for this call; only used to derive the
            rate-limit key.

    Returns:
        A Markdown message describing the outcome: a configuration, input or
        rate-limit error, the existing score, the existing request, the
        confirmation of a new request, or an upload error.
    """
    if not HF_TOKEN:
        return "**Error:** `HF_TOKEN` is not configured in this Space."

    model_id = (model_id or "").strip().lower()
    if not model_id:
        return "**Error:** Please enter a Model ID."

    # The ID comes straight from the user and is interpolated into repo
    # paths below, so refuse anything that could address another location.
    if not _is_safe_model_id(model_id):
        return "**Error:** Invalid Model ID."

    # Rate limiting
    client_ip = _get_client_ip(request)
    if _is_rate_limited(client_ip):
        return (
            f"**Rate limit:** Please wait at least {RATE_LIMIT_SECONDS} seconds "
            "between requests."
        )

    # Case 1: Already evaluated. The message is built outside the download's
    # error handling so that an incomplete score file is still reported
    # instead of being mistaken for "not evaluated" and queued again.
    score = _fetch_json_record(SCORES_REPO, f"scores/{model_id}.json")
    if score is not None:
        return (
            f"**{model_id}** has already been evaluated!\n\n"
            f"- **Overall Score:** {_format_fraction(score.get('overall_score'))}\n"
            f"- **Meter:** {_format_fraction(score.get('meter_score'))}\n"
            f"- **Verse:** {_format_fraction(score.get('verse_score'))}\n"
            f"- **Focus:** {_format_fraction(score.get('focus_score'))}\n"
            f"- **Evaluated At:** {score.get('evaluated_at', 'N/A')}"
        )

    # Case 2: Already requested
    req = _fetch_json_record(REQUESTS_REPO, f"requests/{model_id}.json")
    if req is not None:
        return (
            f"**{model_id}** has already been requested for evaluation.\n\n"
            f"- **Requested At:** {req.get('requested_at', 'N/A')}\n\n"
            "Please check back later for results."
        )

    # Case 3: New request
    timestamp = datetime.now(timezone.utc).isoformat()
    record = {
        "model_id": model_id,
        "requested_at": timestamp,
    }
    data = json.dumps(record, indent=2).encode("utf-8")

    try:
        API.upload_file(
            path_or_fileobj=data,
            path_in_repo=f"requests/{model_id}.json",
            repo_id=REQUESTS_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception as e:
        return f"**Error:** Could not submit request: {e}"

    return (
        f"**Request submitted!** {model_id} has been added to the evaluation queue.\n\n"
        f"- **Requested At:** {timestamp}\n\n"
        "Results will appear on the leaderboard once evaluation is complete."
    )