# MewsicBench / utils.py
# CatsCanWrite's picture
# all right, fix'd
# 2d61c75
import json
import os
import time
from datetime import datetime, timezone
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
# ---------------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------------

# Hub dataset repositories (both public): the published scores and the queue
# of pending evaluation requests.
SCORES_REPO = "CatsCanWrite/mewsicbench-scores"
REQUESTS_REPO = "CatsCanWrite/mewsicbench-requests"

# Hub access token, read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Shared Hub API client, created once at import time.
API = HfApi()

# Rate limiting: minimum number of seconds between two submissions coming
# from the same client IP.
RATE_LIMIT_SECONDS = 10

# In-memory rate limit tracker: {client_ip: timestamp of last accepted request}
_rate_limit_store: dict[str, float] = {}
def _get_client_ip(request) -> str:
    """Extract the client IP from a Gradio request object.

    Two locations are probed in order: ``request.client.host`` and then
    ``request.request.client.host``. Each location is probed independently,
    so a ``client`` that is missing or ``None`` (or a ``host`` that is empty)
    in the first location no longer prevents the second one from being tried.

    Args:
        request: The request object handed to the handler; ``None`` is
            tolerated.

    Returns:
        The host as a string, or ``"unknown"`` when neither location yields
        a non-empty host.
    """
    attribute_paths = (
        ("client", "host"),
        ("request", "client", "host"),
    )
    for path in attribute_paths:
        node = request
        try:
            for attr in path:
                # getattr with a default turns a missing attribute (or a None
                # intermediate object) into None instead of raising.
                node = getattr(node, attr, None)
            if node:
                return str(node)
        except Exception:
            # Best effort: an unexpected failure while probing one location
            # must not break the request handler; try the next location.
            continue
    return "unknown"
def _is_rate_limited(client_ip: str) -> bool:
    """Check whether ``client_ip`` has submitted too recently.

    A request is accepted when the IP has no accepted request within the
    last ``RATE_LIMIT_SECONDS`` seconds; accepting it records the current
    time for that IP. Rejected requests do not refresh the recorded time.

    Entries whose window has elapsed are removed on every call, so the
    in-memory store only holds IPs seen within the current window instead of
    growing with every distinct IP ever seen.

    Args:
        client_ip: Key identifying the client (an IP string or "unknown").

    Returns:
        True if the request must be rejected, False if it was accepted.
    """
    # Monotonic clock: unaffected by wall-clock adjustments, which could
    # otherwise shorten or extend the rate-limit window.
    now = time.monotonic()

    # Iterate over a snapshot and use pop() with a default so that handlers
    # running concurrently cannot trip "dict changed size during iteration"
    # or a double delete.
    for ip, seen_at in list(_rate_limit_store.items()):
        if now - seen_at >= RATE_LIMIT_SECONDS:
            _rate_limit_store.pop(ip, None)

    # Everything still stored is inside its window.
    if client_ip in _rate_limit_store:
        return True

    _rate_limit_store[client_ip] = now
    return False
def load_leaderboard() -> pd.DataFrame:
    """Load the latest scores from the scores dataset, ready for display.

    Returns:
        On success, a pandas ``Styler`` wrapping the ranked leaderboard
        (score cells colored and shown as percentages, missing values shown
        as "N/A", index hidden). Note that this is a ``Styler`` rather than
        a plain ``DataFrame``; the annotation is left unchanged.
        When ``HF_TOKEN`` is not set, or when loading or processing fails,
        the empty frame from ``empty_leaderboard()`` is returned instead.
    """
    if not HF_TOKEN:
        return empty_leaderboard()

    # Dataset column -> human-readable header.
    col_map = {
        "model_id": "Model ID",
        "overall_score": "Overall",
        "meter_score": "Meter",
        "verse_score": "Verse",
        "focus_score": "Focus",
        "avg_think_tokens": "Thinking",
        "evaluated_at": "Evaluated At",
    }
    # Display format per score header (fractions rendered as percentages).
    score_formats = {
        "Overall": "{:.1%}",
        "Meter": "{:.1%}",
        "Verse": "{:.1%}",
        "Focus": "{:.1%}",
    }
    numeric_cols = [
        "overall_score",
        "meter_score",
        "verse_score",
        "focus_score",
        "avg_think_tokens",
    ]

    def color_score(val):
        """Return the CSS color rule for one score cell."""
        if pd.isna(val):
            return ""  # missing score: leave the cell unstyled
        if val >= 0.9:
            return "color: #008000;"  # green
        if val >= 0.75:
            return "color: #606e00;"  # yellow-green
        if val >= 0.5:
            return "color: #ff6f00;"  # orange
        return "color: #9f0000;"  # red

    try:
        # Imported inside the try so that a missing `datasets` package is
        # handled like any other load failure (empty leaderboard).
        from datasets import load_dataset

        ds = load_dataset(SCORES_REPO, token=HF_TOKEN)
        df = ds["train"].to_pandas()

        # Keep only the known columns that are actually present.
        display_cols = [c for c in col_map if c in df.columns]
        df = df[display_cols].copy()

        # Coerce numeric columns so that None / non-numeric cells become NaN
        # and cannot break sorting, coloring or number formatting later.
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Sort by overall score descending before ranking.
        if "overall_score" in df.columns:
            df = df.sort_values("overall_score", ascending=False).reset_index(drop=True)
        df.insert(0, "Rank", range(1, len(df) + 1))

        # Format dates; unparseable values become missing.
        if "evaluated_at" in df.columns:
            evaluated = pd.to_datetime(df["evaluated_at"], errors="coerce")
            df["evaluated_at"] = evaluated.dt.strftime("%Y-%m-%d %H:%M")

        # Rename to human-readable names.
        df = df.rename(columns=col_map)

        # Ensure column order matches the empty leaderboard.
        final_cols = [c for c in empty_leaderboard().columns if c in df.columns]
        df = df[final_cols]

        # Only style columns that exist: Styler operations are evaluated
        # lazily at render time, so a missing label in `subset` would raise
        # outside this try block.
        present_scores = [c for c in score_formats if c in df.columns]
        formats = {c: score_formats[c] for c in present_scores}
        if "Thinking" in df.columns:
            formats["Thinking"] = "{:0.0f}"

        styled = df.style
        if present_scores:
            styled = styled.map(color_score, subset=present_scores)
        return styled.format(formats, na_rep="N/A").hide(axis="index")
    except Exception as e:
        print(f"Could not load scores dataset: {e}")
        return empty_leaderboard()
def empty_leaderboard() -> pd.DataFrame:
    """Build a zero-row leaderboard frame with the display columns in order."""
    headers = (
        "Rank",
        "Model ID",
        "Overall",
        "Meter",
        "Verse",
        "Focus",
        "Thinking",
        "Evaluated At",
    )
    frame = pd.DataFrame(columns=list(headers))
    return frame
def _format_percent(value) -> str:
    """Format a numeric score as a whole percentage, or "N/A" if not numeric."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return "N/A"
    return f"{value:.0%}"


def _is_safe_model_id(model_id: str) -> bool:
    """Reject IDs that cannot be used safely as a path inside the datasets."""
    if "\\" in model_id:
        return False
    if any(ch.isspace() or not ch.isprintable() for ch in model_id):
        return False
    # No empty, "." or ".." path segments (blocks "../" style traversal and
    # leading/trailing/double slashes).
    return all(segment not in ("", ".", "..") for segment in model_id.split("/"))


def _fetch_json_record(repo_id: str, filename: str):
    """Download and parse a JSON object from a dataset repo; None if unavailable."""
    try:
        path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        with open(path, "r", encoding="utf-8") as f:
            record = json.load(f)
    except Exception:
        # Deliberately best-effort: a file that cannot be downloaded or
        # parsed is treated as "not present", so the caller moves on to the
        # next case.
        return None
    return record if isinstance(record, dict) else None


def request_model(model_id: str, request: gr.Request) -> str:
    """
    Handle a model evaluation request.

    1. If the model already has a score, report it.
    2. If the model is already in the requests dataset, report that.
    3. Otherwise, add it to the requests dataset.

    The model ID is stripped and lower-cased before use. Requests are
    rate-limited per client IP.

    Args:
        model_id: Model identifier typed by the user.
        request: Gradio request object, used to determine the client IP.

    Returns:
        A Markdown message describing the outcome; errors are reported in
        the message rather than raised.
    """
    if not HF_TOKEN:
        return "**Error:** `HF_TOKEN` is not configured in this Space."

    model_id = (model_id or "").strip().lower()
    if not model_id:
        return "**Error:** Please enter a Model ID."
    # The ID is interpolated into repository paths below, so refuse anything
    # that could point outside the intended folder.
    if not _is_safe_model_id(model_id):
        return "**Error:** Invalid Model ID."

    # Rate limiting
    client_ip = _get_client_ip(request)
    if _is_rate_limited(client_ip):
        return (
            f"**Rate limit:** Please wait at least {RATE_LIMIT_SECONDS} seconds "
            "between requests."
        )

    # Case 1: Already evaluated
    score = _fetch_json_record(SCORES_REPO, f"scores/{model_id}.json")
    if score is not None:
        # Formatting happens outside the lookup's try block: a missing or
        # non-numeric field is shown as "N/A" instead of raising and letting
        # an already-evaluated model fall through to be queued again.
        return (
            f"**{model_id}** has already been evaluated!\n\n"
            f"- **Overall Score:** {_format_percent(score.get('overall_score'))}\n"
            f"- **Meter:** {_format_percent(score.get('meter_score'))}\n"
            f"- **Verse:** {_format_percent(score.get('verse_score'))}\n"
            f"- **Focus:** {_format_percent(score.get('focus_score'))}\n"
            f"- **Evaluated At:** {score.get('evaluated_at', 'N/A')}"
        )

    # Case 2: Already requested
    req = _fetch_json_record(REQUESTS_REPO, f"requests/{model_id}.json")
    if req is not None:
        return (
            f"**{model_id}** has already been requested for evaluation.\n\n"
            f"- **Requested At:** {req.get('requested_at', 'N/A')}\n\n"
            "Please check back later for results."
        )

    # Case 3: New request
    timestamp = datetime.now(timezone.utc).isoformat()
    record = {
        "model_id": model_id,
        "requested_at": timestamp,
    }
    data = json.dumps(record, indent=2).encode("utf-8")
    try:
        API.upload_file(
            path_or_fileobj=data,
            path_in_repo=f"requests/{model_id}.json",
            repo_id=REQUESTS_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception as e:
        return f"**Error:** Could not submit request: {e}"
    return (
        f"**Request submitted!** {model_id} has been added to the evaluation queue.\n\n"
        f"- **Requested At:** {timestamp}\n\n"
        "Results will appear on the leaderboard once evaluation is complete."
    )