File size: 7,382 Bytes
2ac66eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c787817
2ac66eb
 
c787817
2ac66eb
 
 
 
 
 
 
2d42fc1
 
 
 
 
b234c92
2d42fc1
bc83a77
2d42fc1
 
 
 
 
 
2ac66eb
 
2d42fc1
 
8b7b4a8
1b375e5
 
6c6a1f3
1b375e5
6c6a1f3
1b375e5
6c6a1f3
1b375e5
6c6a1f3
1b375e5
8b7b4a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d42fc1
 
677500d
2d42fc1
8b7b4a8
 
 
 
 
 
 
 
bc83a77
8b7b4a8
 
 
 
c787817
2ac66eb
 
c787817
2ac66eb
 
2d42fc1
677500d
2ac66eb
 
2d42fc1
 
 
 
 
 
2d61c75
2d42fc1
2ac66eb
 
 
 
 
 
 
 
 
 
 
 
 
 
4c539fc
2ac66eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c539fc
 
 
 
2ac66eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import json
import os
import time
from datetime import datetime, timezone

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

# ---------------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------------
# Hugging Face dataset repos used by this app: SCORES_REPO is read for
# leaderboard scores, REQUESTS_REPO receives evaluation requests.
SCORES_REPO = "CatsCanWrite/mewsicbench-scores"      # Public dataset
REQUESTS_REPO = "CatsCanWrite/mewsicbench-requests"  # Public dataset


# Hub access token taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Shared Hub API client instance, created once at import time.
API = HfApi()

# Rate limiting: minimum seconds between submissions from the same IP
RATE_LIMIT_SECONDS = 10

# In-memory rate limit tracker: {client_ip: last_request_timestamp}
# Lives only in this process, so it starts empty whenever the module is loaded.
_rate_limit_store: dict[str, float] = {}


def _get_client_ip(request) -> str:
    """Extract client IP from Gradio request object."""
    try:
        if hasattr(request, "client"):
            host = request.client.host
            if host:
                return str(host)
        if hasattr(request, "request") and hasattr(request.request, "client"):
            host = request.request.client.host
            if host:
                return str(host)
    except Exception:
        pass
    return "unknown"


def _is_rate_limited(client_ip: str) -> bool:
    """Check whether *client_ip* has submitted too recently.

    A caller that is not limited is recorded as having just submitted; a
    caller that is limited does not have its timestamp refreshed, so the
    window always counts from the last accepted submission.

    Expired entries are removed on every call so the in-memory store holds
    only IPs seen within the last ``RATE_LIMIT_SECONDS`` instead of growing
    without bound. Timestamps come from ``time.monotonic()`` so wall-clock
    adjustments cannot shorten or extend the window.

    Args:
        client_ip: Key identifying the caller.

    Returns:
        ``True`` if the caller must wait, ``False`` if the submission may
        proceed.
    """
    now = time.monotonic()

    # Iterate over a snapshot: handlers may run concurrently, and mutating a
    # dict while iterating it directly raises RuntimeError.
    for ip, seen_at in list(_rate_limit_store.items()):
        if now - seen_at >= RATE_LIMIT_SECONDS:
            _rate_limit_store.pop(ip, None)

    # Anything still stored is, by construction, inside the window.
    if client_ip in _rate_limit_store:
        return True

    _rate_limit_store[client_ip] = now
    return False


def load_leaderboard() -> "pd.DataFrame | pd.io.formats.style.Styler":
    """Load the latest scores from the scores dataset.

    The ``train`` split of ``SCORES_REPO`` is reduced to the known display
    columns, ranked by overall score (descending), given human-readable
    headers and wrapped in a pandas ``Styler`` that colours the score
    columns and formats them as percentages.

    Returns:
        A ``Styler`` for the ranked leaderboard on success. When
        ``HF_TOKEN`` is not set, or loading/processing the dataset fails,
        the empty leaderboard ``DataFrame`` is returned instead.
    """
    if not HF_TOKEN:
        return empty_leaderboard()

    # Raw dataset column -> display header, listed in display order.
    col_map = {
        "model_id": "Model ID",
        "overall_score": "Overall",
        "meter_score": "Meter",
        "verse_score": "Verse",
        "focus_score": "Focus",
        "avg_think_tokens": "Thinking",
        "evaluated_at": "Evaluated At",
    }
    # Display header -> cell format string.
    cell_formats = {
        "Overall": "{:.1%}",
        "Meter": "{:.1%}",
        "Verse": "{:.1%}",
        "Focus": "{:.1%}",
        "Thinking": "{:0.0f}",
    }
    score_headers = ["Overall", "Meter", "Verse", "Focus"]

    def color_score(val) -> str:
        """Return the CSS colour rule for one score cell."""
        if pd.isna(val):
            return ""  # missing score: leave the cell unstyled
        if val >= 0.9:
            return "color: #008000;"   # green
        if val >= 0.75:
            return "color: #606e00;"   # yellow-green
        if val >= 0.5:
            return "color: #ff6f00;"   # orange
        return "color: #9f0000;"       # red

    try:
        from datasets import load_dataset

        ds = load_dataset(SCORES_REPO, token=HF_TOKEN)
        df = ds["train"].to_pandas()

        # Keep only the known columns that the dataset actually provides.
        display_cols = [c for c in col_map if c in df.columns]
        df = df[display_cols].copy()

        # Sort by overall score descending before formatting
        if "overall_score" in df.columns:
            df = df.sort_values("overall_score", ascending=False).reset_index(drop=True)
            df.insert(0, "Rank", range(1, len(df) + 1))

        # Format dates
        if "evaluated_at" in df.columns:
            df["evaluated_at"] = pd.to_datetime(df["evaluated_at"], errors="coerce")
            df["evaluated_at"] = df["evaluated_at"].dt.strftime("%Y-%m-%d %H:%M")

        # Rename to human-readable names
        df.rename(columns=col_map, inplace=True)

        # Ensure column order matches empty leaderboard
        final_cols = [c for c in empty_leaderboard().columns if c in df.columns]
        df = df[final_cols]

        # Styler operations are evaluated lazily at render time, i.e. outside
        # this try block. Restrict them to columns that exist so a dataset
        # lacking one score column cannot fail later during rendering.
        present_scores = [h for h in score_headers if h in df.columns]
        present_formats = {h: f for h, f in cell_formats.items() if h in df.columns}

        styled = df.style
        if present_scores:
            styled = styled.map(color_score, subset=present_scores)
        # na_rep keeps missing values from rendering as "nan%" / "nan".
        styled = styled.format(present_formats, na_rep="N/A").hide(axis="index")

        return styled
    except Exception as e:
        print(f"Could not load scores dataset: {e}")
        return empty_leaderboard()



def empty_leaderboard() -> pd.DataFrame:
    """Return a zero-row DataFrame carrying the leaderboard's display headers."""
    headers = (
        "Rank",
        "Model ID",
        "Overall",
        "Meter",
        "Verse",
        "Focus",
        "Thinking",
        "Evaluated At",
    )
    placeholder = pd.DataFrame(columns=list(headers))
    return placeholder

# Characters permitted in a lower-cased model ID: ASCII letters, digits,
# ".", "_", "-", plus the "/" that separates namespace from name.
_MODEL_ID_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789._-/")


def _is_valid_model_id(model_id: str) -> bool:
    """Return True if *model_id* is safe to embed in a repo file path."""
    if not model_id or not set(model_id) <= _MODEL_ID_CHARS:
        return False
    segments = model_id.split("/")
    if len(segments) > 2:
        return False
    # Reject empty segments, hidden/relative segments and ".." sequences so
    # the ID can never climb out of the scores/ or requests/ folder.
    return all(
        seg and not seg.startswith(".") and ".." not in seg for seg in segments
    )


def _format_percent(value) -> str:
    """Format *value* as a whole-number percentage, or "N/A" if it is not numeric."""
    try:
        return f"{value:.0%}"
    except (TypeError, ValueError):
        return "N/A"


def _load_json_record(repo_id: str, filename: str) -> "dict | None":
    """Fetch *filename* from dataset *repo_id* and return its JSON object, or None."""
    try:
        path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        with open(path, "r", encoding="utf-8") as f:
            record = json.load(f)
    except Exception:
        # Best-effort lookup: any failure to fetch or parse the file is
        # treated as "no such record", as the caller only needs presence.
        return None
    return record if isinstance(record, dict) else None


def request_model(model_id: str, request: gr.Request) -> str:
    """
    Handle a model evaluation request.

    1. If the model already has a score, report it.
    2. If the model is already in the requests dataset, report that.
    3. Otherwise, add it to the requests dataset.

    Args:
        model_id: Model identifier entered by the user. It is stripped and
            lower-cased, and must look like ``name`` or ``namespace/name``
            because it is used to build file paths inside the dataset repos.
        request: Request object for the current call; used only to derive
            the client IP for rate limiting.

    Returns:
        A Markdown-formatted status or error message. Errors are reported in
        the returned string rather than raised.
    """
    if not HF_TOKEN:
        return "**Error:** `HF_TOKEN` is not configured in this Space."

    model_id = model_id.strip().lower()
    if not model_id:
        return "**Error:** Please enter a Model ID."

    # The ID comes straight from user input and ends up in repo file paths.
    if not _is_valid_model_id(model_id):
        return (
            "**Error:** Invalid Model ID. "
            "Use the form `model-name` or `organization/model-name`."
        )

    # Rate limiting
    client_ip = _get_client_ip(request)
    if _is_rate_limited(client_ip):
        return (
            f"**Rate limit:** Please wait at least {RATE_LIMIT_SECONDS} seconds "
            "between requests."
        )

    # Case 1: Already evaluated
    # The message is built outside the lookup's try block so that a score
    # file with a missing field is still reported as evaluated instead of
    # silently falling through and re-queueing the model.
    score = _load_json_record(SCORES_REPO, f"scores/{model_id}.json")
    if score is not None:
        return (
            f"**{model_id}** has already been evaluated!\n\n"
            f"- **Overall Score:** {_format_percent(score.get('overall_score'))}\n"
            f"- **Meter:** {_format_percent(score.get('meter_score'))}\n"
            f"- **Verse:** {_format_percent(score.get('verse_score'))}\n"
            f"- **Focus:** {_format_percent(score.get('focus_score'))}\n"
            f"- **Evaluated At:** {score.get('evaluated_at', 'N/A')}"
        )

    # Case 2: Already requested
    req = _load_json_record(REQUESTS_REPO, f"requests/{model_id}.json")
    if req is not None:
        return (
            f"**{model_id}** has already been requested for evaluation.\n\n"
            f"- **Requested At:** {req.get('requested_at', 'N/A')}\n\n"
            "Please check back later for results."
        )

    # Case 3: New request
    timestamp = datetime.now(timezone.utc).isoformat()
    record = {
        "model_id": model_id,
        "requested_at": timestamp,
    }
    data = json.dumps(record, indent=2).encode("utf-8")

    try:
        API.upload_file(
            path_or_fileobj=data,
            path_in_repo=f"requests/{model_id}.json",
            repo_id=REQUESTS_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception as e:
        return f"**Error:** Could not submit request: {e}"

    return (
        f"**Request submitted!** {model_id} has been added to the evaluation queue.\n\n"
        f"- **Requested At:** {timestamp}\n\n"
        "Results will appear on the leaderboard once evaluation is complete."
    )