FredericFan committed on
Commit
bf70dba
·
1 Parent(s): 8b01eb0
.gitignore ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paper LaTeX source (not needed for HF Space)
2
+ /paper_latex
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.py[cod]
7
+ *.pyo
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ *.egg
12
+
13
+ # Virtual environments
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ # IDE
19
+ .idea/
20
+ .vscode/
21
+ *.swp
22
+ *.swo
23
+
24
+ # OS
25
+ .DS_Store
26
+ Thumbs.db
27
+
28
+ # Gradio
29
+ flagged/
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: M
3
- emoji: 🦀
4
- colorFrom: gray
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: M3-Bench Leaderboard
3
+ emoji: 🎮
4
+ colorFrom: indigo
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 6.11.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ short_description: "Multi-Model Multi-View Game-Theoretic Benchmark for LLMs"
12
  ---
 
 
app.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
M3-Bench Leaderboard – HF Space entry point
"""

from __future__ import annotations

from create_leaderboard import demo

if __name__ == "__main__":
    # Bind on all interfaces on the HF Spaces default port; no public share
    # link is needed because the Space itself is the public URL.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)
create_leaderboard.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ M3-Bench Leaderboard – Gradio UI
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from datetime import datetime
11
+
12
+ import pandas as pd
13
+ import gradio as gr
14
+
15
+ from tabs.leaderboard_tab import create_leaderboard_tab
16
+
17
+
18
def get_leaderboard_info() -> tuple[int, str]:
    """Return header stats for the UI: (unique model count, last-update date).

    Reads ``data/leaderboard.csv`` next to this file. This is best-effort:
    a missing, empty, or malformed CSV must never break app startup, so any
    read/parse problem falls through to the ``(0, "Unknown")`` placeholder.
    """
    leaderboard_path = Path(__file__).parent / "data" / "leaderboard.csv"
    if leaderboard_path.exists():
        try:
            df = pd.read_csv(leaderboard_path)
            # nunique() counts distinct models directly (the CSV has one row
            # per model *per view*, so raw row count would over-count).
            model_count = df["model"].nunique()
            mtime = leaderboard_path.stat().st_mtime
            last_update = datetime.fromtimestamp(mtime).strftime("%d %B %Y")
            return model_count, last_update
        except (OSError, KeyError, ValueError, pd.errors.ParserError,
                pd.errors.EmptyDataError):
            # Narrowed from a blanket `except Exception` so genuine
            # programming errors surface instead of being swallowed.
            pass
    return 0, "Unknown"
30
+
31
+
32
# Computed once at import time; the badges below show these values.
model_count, last_update = get_leaderboard_info()

# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

with gr.Blocks(title="M3-Bench Leaderboard") as demo:

    # Page-level CSS: gradient title, subtitle, intro links, and info badges.
    gr.HTML("""
    <style>
    .title-block {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
        color: transparent;
        text-align: center;
        font-size: 2.2rem;
        font-weight: 800;
        margin: 0.5rem 0 0.5rem 0;
        display: inline-block;
        width: 100%;
    }
    .subtitle-block {
        text-align: center;
        font-size: 1.1rem;
        color: #555;
        margin-bottom: 0.5rem;
    }
    .intro-block {
        text-align: center;
        margin-bottom: 1.25rem;
        line-height: 2;
    }
    .intro-block a {
        color: #667eea;
        text-decoration: none;
        font-weight: 600;
        margin: 0 0.3rem;
    }
    .intro-block a:hover { text-decoration: underline; }
    .info-badge {
        display: inline-block;
        background: #f0f0ff;
        border: 1px solid #ddd;
        border-radius: 12px;
        padding: 2px 10px;
        margin: 0 4px;
        font-size: 0.9em;
        color: #444;
    }
    </style>
    """)

    # Header: title, subtitle, external links (placeholders for now), and the
    # live model-count / last-update badges interpolated via the f-string.
    gr.HTML(f"""
    <div class="title-block">
        M3-Bench Leaderboard
    </div>
    <div class="subtitle-block">
        Multi-Model Multi-View Game-Theoretic Benchmark for LLMs
    </div>
    <div class="intro-block">
        Evaluating LLMs through game-theoretic interactions across 4 difficulty levels and 3 evaluation views.<br>
        <a href="#" target="_blank">📄 Paper</a> |
        <a href="#" target="_blank">💻 Code</a> |
        <a href="#" target="_blank">🌐 Website</a>
        <br>
        <span class="info-badge">🤖 Models: {model_count}</span>
        <span class="info-badge">📅 Last Update: {last_update}</span>
    </div>
    """)

    with gr.Tabs():
        # Main leaderboard tab (defined in tabs/leaderboard_tab.py).
        create_leaderboard_tab()

        with gr.Tab("📊 About M3-Bench"):
            gr.Markdown("""
            ## About M3-Bench

            **M3-Bench** is a comprehensive benchmark for evaluating Large Language Models (LLMs) through
            game-theoretic interactions. It assesses model capabilities across **4 difficulty levels** using
            **3 complementary evaluation views**.

            ### Evaluation Views

            | View | Description |
            |------|-------------|
            | **BTA** (Birds-eye, Technically-grounded, Artistically-aware) | Holistic evaluation combining multiple assessment perspectives |
            | **RPA** (Role-Play Assessment) | Evaluation through role-playing scenarios within games |
            | **CCA** (Compositional Capability Assessment) | Fine-grained assessment of compositional reasoning abilities |

            ### Difficulty Levels

            | Level | Games | Description |
            |-------|-------|-------------|
            | **Level 1** | PD, SH, UG | Basic strategic reasoning (Prisoner's Dilemma, Stag Hunt, Ultimatum Game) |
            | **Level 2** | RPD, GE, AOB | Iterated/extended games (Repeated PD, Gift Exchange, All-or-Nothing Bargaining) |
            | **Level 3** | PGG, VD, CPR | Multi-agent coordination (Public Goods Game, Volunteer's Dilemma, Common Pool Resource) |
            | **Level 4** | AC, WW, KP | Complex strategic reasoning (Auction Competition, Wage War, Keynesian Beauty Contest) |

            ### Model Categories

            - 🔒 **Closed-Source**: Proprietary frontier models (GPT-5.1, Claude Opus 4.5, Gemini-3 Pro, Grok-4.1)
            - 🔓 **Open-Weight**: Open-weight models (GPT-oss 120b, DeepSeek V3.2, Kimi-K2 Thinking, Mistral Large 3)
            - 🧠 **Reasoning**: Reasoning-oriented models (GPT-5 pro, Gemini-3 Deep Think, DeepSeek-V3.2 Speciale)
            - 👤 **Human**: Human participant baseline
            """)

    # Citation block: BibTeX entry with a copy-to-clipboard button. The JS
    # duplicates the citation text because clipboard copy needs the raw string.
    gr.HTML("""
    <style>
    .citation-block {
        margin-top: 2rem;
        padding: 1.5rem;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        background-color: #f9f9f9;
    }
    .citation-title {
        font-size: 1.15rem;
        font-weight: 600;
        margin-bottom: 1rem;
        color: #333;
    }
    .citation-content {
        background-color: #fff;
        border: 1px solid #ddd;
        border-radius: 4px;
        padding: 1rem;
        font-family: monospace;
        font-size: 0.85rem;
        white-space: pre-wrap;
        line-height: 1.5;
        position: relative;
    }
    .copy-btn {
        position: absolute;
        top: 8px;
        right: 8px;
        padding: 6px 12px;
        background-color: #667eea;
        color: white;
        border: none;
        border-radius: 4px;
        cursor: pointer;
        font-size: 0.85rem;
    }
    .copy-btn:hover { background-color: #5a6fd6; }
    .copy-btn.copied { background-color: #198754; }
    </style>
    <div class="citation-block">
        <div class="citation-title">📚 Citation</div>
        <div class="citation-content" id="citation-text">
        <button class="copy-btn" onclick="copyCitation()">Copy</button>
@article{m3bench2025,
  title   = {M3-Bench: Multi-Model Multi-View Game-Theoretic Benchmark for LLMs},
  author  = {Authors},
  journal = {arXiv preprint},
  year    = {2025},
}</div>
    </div>
    <script>
    function copyCitation() {
        const citationText = `@article{m3bench2025,
  title   = {M3-Bench: Multi-Model Multi-View Game-Theoretic Benchmark for LLMs},
  author  = {Authors},
  journal = {arXiv preprint},
  year    = {2025},
}`;
        navigator.clipboard.writeText(citationText).then(function() {
            const btn = document.querySelector('.copy-btn');
            btn.textContent = 'Copied!';
            btn.classList.add('copied');
            setTimeout(function() {
                btn.textContent = 'Copy';
                btn.classList.remove('copied');
            }, 2000);
        });
    }
    </script>
    """)


if __name__ == "__main__":
    demo.launch()
data/leaderboard.csv ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,category,view,PD,SH,UG,RPD,GE,AOB,PGG,VD,CPR,AC,WW,KP,L1,L2,L3,L4
2
+ GPT-5.1,Closed-Source,BTA,97,94,95,96,82,93,93,88,94,91,80,90,93.0,89.0,90.3,86.2
3
+ GPT-5.1,Closed-Source,RPA,95,92,93,93,84,90,90,86,91,88,84,87,91.7,88.0,88.5,85.8
4
+ GPT-5.1,Closed-Source,CCA,90,88,96,88,91,93,88,84,85,80,90,76,89.3,86.8,87.2,83.7
5
+ Claude Opus 4.5,Closed-Source,BTA,95,92,91,93,85,95,91,86,90,87,83,85,91.0,88.8,89.3,85.5
6
+ Claude Opus 4.5,Closed-Source,RPA,93,90,94,91,82,93,88,84,88,85,87,83,89.7,86.5,87.5,84.7
7
+ Claude Opus 4.5,Closed-Source,CCA,92,94,97,90,95,94,90,87,83,82,93,78,90.3,89.0,88.0,86.8
8
+ Gemini-3 Pro,Closed-Source,BTA,93,95,89,90,86,88,88,90,91,86,78,84,89.7,86.8,87.3,83.2
9
+ Gemini-3 Pro,Closed-Source,RPA,91,90,91,88,83,85,86,87,89,84,82,85,89.3,85.7,85.8,83.5
10
+ Gemini-3 Pro,Closed-Source,CCA,87,85,93,84,88,90,84,85,82,78,86,74,86.8,83.5,84.3,81.0
11
+ Grok-4.1,Closed-Source,BTA,90,88,83,87,81,93,85,83,89,82,72,86,88.7,86.3,84.5,79.0
12
+ Grok-4.1,Closed-Source,RPA,87,85,86,84,79,90,83,81,86,79,76,83,86.7,84.2,82.8,78.7
13
+ Grok-4.1,Closed-Source,CCA,84,82,90,80,85,92,81,80,80,74,82,71,86.2,82.7,81.8,77.0
14
+ GPT-oss 120b,Open-Weight,BTA,89,91,86,88,90,84,84,87,85,79,75,78,87.0,85.3,84.0,77.5
15
+ GPT-oss 120b,Open-Weight,RPA,86,88,89,85,86,82,82,84,83,77,79,76,86.3,83.2,82.3,77.5
16
+ GPT-oss 120b,Open-Weight,CCA,83,86,91,82,89,86,80,83,79,75,84,70,85.0,81.8,81.8,77.7
17
+ DeepSeek V3.2,Open-Weight,BTA,88,90,85,85,88,83,87,81,92,84,68,80,86.7,84.2,83.8,76.2
18
+ DeepSeek V3.2,Open-Weight,RPA,85,87,88,83,85,81,84,79,89,81,73,77,85.5,82.5,82.2,76.2
19
+ DeepSeek V3.2,Open-Weight,CCA,82,84,90,79,87,85,82,78,84,72,79,69,84.0,80.5,81.0,74.8
20
+ Kimi-K2 Thinking,Open-Weight,BTA,92,86,90,84,79,86,86,80,83,73,82,68,87.0,81.2,82.0,74.3
21
+ Kimi-K2 Thinking,Open-Weight,RPA,89,84,92,81,77,83,83,78,81,80,86,76,86.2,79.3,80.3,81.0
22
+ Kimi-K2 Thinking,Open-Weight,CCA,85,81,94,77,83,87,79,76,77,60,72,53,84.5,78.5,79.2,63.2
23
+ Mistral Large 3,Open-Weight,BTA,86,84,82,82,76,80,79,76,82,74,70,71,81.8,77.8,77.7,70.5
24
+ Mistral Large 3,Open-Weight,RPA,83,81,85,79,74,78,77,74,79,71,72,69,81.0,76.2,76.2,69.8
25
+ Mistral Large 3,Open-Weight,CCA,79,77,88,75,80,82,74,72,75,62,74,58,79.3,75.3,74.7,66.0
26
+ GPT-5 pro,Reasoning,BTA,96,93,94,94,85,91,91,86,92,86,76,88,92.7,89.0,88.2,82.2
27
+ GPT-5 pro,Reasoning,RPA,97,96,93,96,93,95,93,92,94,93,91,92,95.2,94.2,92.2,92.0
28
+ GPT-5 pro,Reasoning,CCA,78,76,88,79,72,84,77,74,80,65,70,60,78.5,75.5,77.2,66.8
29
+ Gemini-3 Deep Think,Reasoning,BTA,91,89,88,89,83,87,87,85,90,83,84,80,89.7,85.7,86.8,81.5
30
+ Gemini-3 Deep Think,Reasoning,RPA,94,93,95,93,91,93,88,90,91,91,88,90,93.0,92.2,90.2,90.0
31
+ Gemini-3 Deep Think,Reasoning,CCA,76,80,85,76,80,82,78,76,77,63,78,58,78.8,75.5,77.8,67.8
32
+ DeepSeek-V3.2 Speciale,Reasoning,BTA,94,91,92,92,80,89,89,82,91,84,70,85,91.3,86.7,85.0,78.2
33
+ DeepSeek-V3.2 Speciale,Reasoning,RPA,97,96,94,97,94,96,90,92,93,95,92,94,95.2,95.2,92.5,94.0
34
+ DeepSeek-V3.2 Speciale,Reasoning,CCA,74,72,83,72,68,78,71,68,73,55,62,50,74.8,69.7,71.5,58.0
35
+ Human,Human,BTA,91,88,93,86,78,84,83,80,79,78,87,72,86.3,82.2,82.8,80.7
36
+ Human,Human,RPA,86,84,90,82,76,81,80,78,77,75,83,70,84.2,79.5,80.2,77.5
37
+ Human,Human,CCA,93,90,94,89,92,91,90,85,76,84,92,68,89.0,85.0,85.3,84.7
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=6.0.0
2
+ pandas>=1.5
tabs/__init__.py ADDED
File without changes
tabs/leaderboard_tab.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ M3-Bench Leaderboard – Leaderboard Tab
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+ from pathlib import Path
12
+
13
# Resolve data paths relative to this file so the app works from any CWD.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_PATH = BASE_DIR / "data" / "leaderboard.csv"

# Category colors for row shading (keys must match the CSV "category" column).
CATEGORY_COLORS = {
    "Closed-Source": "#e8f0fe",  # cool blue
    "Open-Weight": "#e8f5e9",    # sage green
    "Reasoning": "#fce4ec",      # soft pink
    "Human": "#e0f7fa",          # light cyan
}

# Emoji shown next to the category name in the table.
CATEGORY_EMOJI = {
    "Closed-Source": "🔒",
    "Open-Weight": "🔓",
    "Reasoning": "🧠",
    "Human": "👤",
}

# View full names (keys match the CSV "view" column).
VIEW_NAMES = {
    "BTA": "Birds-eye, Technically-grounded, Artistically-aware",
    "RPA": "Role-Play Assessment",
    "CCA": "Compositional Capability Assessment",
}

# Game abbreviation full names (keys match the per-game CSV score columns).
GAME_FULL_NAMES = {
    "PD": "Prisoner's Dilemma",
    "SH": "Stag Hunt",
    "UG": "Ultimatum Game",
    "RPD": "Repeated PD",
    "GE": "Gift Exchange",
    "AOB": "All-or-Nothing Bargaining",
    "PGG": "Public Goods Game",
    "VD": "Volunteer's Dilemma",
    "CPR": "Common Pool Resource",
    "AC": "Auction Competition",
    "WW": "Wage War",
    "KP": "Keynesian Beauty Contest",
}
53
+
54
+
55
def load_leaderboard() -> pd.DataFrame:
    """Read the leaderboard CSV, stripping stray whitespace from column names."""
    if not DATA_PATH.exists():
        raise FileNotFoundError(f"Leaderboard file not found: {DATA_PATH}")
    frame = pd.read_csv(DATA_PATH)
    frame.rename(columns=str.strip, inplace=True)
    return frame
61
+
62
+
63
def compute_overall(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with an ``Overall`` column: the mean of the four level
    averages (L1–L4), rounded to 2 decimals.

    Uses :meth:`DataFrame.assign` so the caller's frame is NOT mutated
    (the original wrote the column into the argument in place); the result
    is returned as a new frame, which is how all call sites consume it.
    """
    return df.assign(Overall=df[["L1", "L2", "L3", "L4"]].mean(axis=1).round(2))
67
+
68
+
69
def aggregate_by_model(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the three evaluation views into one averaged row per model.

    Groups by (model, category), averages every numeric score column, and
    appends an ``Overall`` column as the rounded mean of L1–L4.
    """
    score_columns = [
        "PD", "SH", "UG", "RPD", "GE", "AOB", "PGG", "VD", "CPR",
        "AC", "WW", "KP", "L1", "L2", "L3", "L4",
    ]
    averaged = (
        df.groupby(["model", "category"])[score_columns]
        .mean()
        .reset_index()
    )
    averaged["Overall"] = averaged[["L1", "L2", "L3", "L4"]].mean(axis=1).round(2)
    return averaged
76
+
77
+
78
def make_ranked_html(df: pd.DataFrame, sort_col: str = "Overall") -> str:
    """Render *df* as a ranked HTML leaderboard table.

    Rows are sorted by *sort_col* descending; ranks use "min" tie handling so
    tied scores share a rank (top three get medal emoji). Per column, the best
    value is highlighted bold red and the second best underlined. Rows are
    shaded by model category via CATEGORY_COLORS / CATEGORY_EMOJI.
    """
    df = df.sort_values(by=sort_col, ascending=False).reset_index(drop=True)

    # Assign ranks with ties (method="min": tied rows share the lower rank)
    df["Rank"] = df[sort_col].rank(method="min", ascending=False).astype(int)

    medal = {1: "🥇", 2: "🥈", 3: "🥉"}

    # Start building HTML
    html = """
    <style>
    .lb-table {
        width: 100%;
        border-collapse: collapse;
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
        font-size: 14px;
    }
    .lb-table th {
        background: #1a1a2e;
        color: #fff;
        padding: 10px 8px;
        text-align: center;
        font-weight: 600;
        position: sticky;
        top: 0;
        z-index: 10;
    }
    .lb-table th.group-header {
        border-bottom: 2px solid #16213e;
    }
    .lb-table td {
        padding: 8px 8px;
        text-align: center;
        border-bottom: 1px solid #eee;
    }
    .lb-table td.model-name {
        text-align: left;
        font-weight: 600;
        padding-left: 12px;
    }
    .lb-table td.category-cell {
        font-size: 12px;
    }
    .lb-table tr:hover {
        background-color: #f0f0ff !important;
    }
    .lb-table .rank-cell {
        font-weight: 700;
        font-size: 15px;
    }
    .score-best {
        font-weight: 700;
        color: #d32f2f;
    }
    .score-second {
        font-weight: 600;
        text-decoration: underline;
    }
    </style>
    """

    # Determine best and second-best value per numeric column (for highlights)
    score_cols = [c for c in df.columns if c not in ["Rank", "model", "category"]]
    best_vals = {}
    second_vals = {}
    for col in score_cols:
        vals = df[col].dropna().unique()
        sorted_vals = sorted(vals, reverse=True)
        best_vals[col] = sorted_vals[0] if len(sorted_vals) > 0 else None
        second_vals[col] = sorted_vals[1] if len(sorted_vals) > 1 else None

    html += '<table class="lb-table">'
    # First header row: fixed columns, then one column per game / level avg
    html += "<thead><tr>"
    html += '<th style="width:50px">Rank</th>'
    html += '<th style="text-align:left;width:180px">Model</th>'
    html += '<th style="width:80px">Category</th>'
    html += '<th style="width:70px;background:#16213e">Overall</th>'

    level_groups = [
        ("Level 1", ["PD", "SH", "UG"]),
        ("Level 2", ["RPD", "GE", "AOB"]),
        ("Level 3", ["PGG", "VD", "CPR"]),
        ("Level 4", ["AC", "WW", "KP"]),
        ("Level Avg.", ["L1", "L2", "L3", "L4"]),
    ]

    for _, cols in level_groups:
        for c in cols:
            html += f'<th style="width:55px">{c}</th>'
    html += "</tr>"

    # Second header row: group labels spanning each level's columns
    html += "<tr>"
    html += '<th colspan="4"></th>'
    colors = ["#2a2a4a", "#2a3a2a", "#3a2a2a", "#2a2a3a", "#1a1a2e"]
    for i, (name, cols) in enumerate(level_groups):
        html += f'<th colspan="{len(cols)}" class="group-header" style="background:{colors[i]};font-size:12px">{name}</th>'
    html += "</tr></thead>"

    html += "<tbody>"
    for _, row in df.iterrows():
        cat = row["category"]
        bg = CATEGORY_COLORS.get(cat, "#fff")
        emoji = CATEGORY_EMOJI.get(cat, "")
        rank = row["Rank"]
        rank_display = f'{medal.get(rank, "")} {rank}' if rank in medal else str(rank)

        html += f'<tr style="background-color:{bg}">'
        html += f'<td class="rank-cell">{rank_display}</td>'
        html += f'<td class="model-name">{row["model"]}</td>'
        html += f'<td class="category-cell">{emoji} {cat}</td>'

        # Overall cell with special styling
        ov = row["Overall"]
        ov_class = ""
        if ov == best_vals.get("Overall"):
            ov_class = "score-best"
        elif ov == second_vals.get("Overall"):
            ov_class = "score-second"
        html += f'<td class="{ov_class}" style="font-weight:700;background:rgba(0,0,0,0.03)">{ov}</td>'

        for _, cols in level_groups:
            for c in cols:
                val = row[c]
                cls = ""
                if val == best_vals.get(c):
                    cls = "score-best"
                elif val == second_vals.get(c):
                    cls = "score-second"
                # BUG FIX: test NaN *first*. The original expression evaluated
                # int(val) before its pd.notna() fallback, so a NaN cell raised
                # ValueError instead of rendering "-".
                if pd.isna(val):
                    display = "-"
                elif isinstance(val, float) and val != int(val):
                    display = f"{val:.1f}"
                else:
                    display = str(int(val))
                html += f'<td class="{cls}">{display}</td>'

        html += "</tr>"

    html += "</tbody></table>"
    return html
216
+
217
+
218
def make_view_html(df: pd.DataFrame, view: str, sort_col: str = "Overall") -> str:
    """Render the leaderboard table for a single evaluation view."""
    view_df = df.loc[df["view"] == view].copy()
    view_df = compute_overall(view_df).drop(columns=["view"])
    return make_ranked_html(view_df, sort_col)
224
+
225
+
226
def make_aggregated_html(df: pd.DataFrame, sort_col: str = "Overall") -> str:
    """Render the leaderboard table averaged over all three views."""
    return make_ranked_html(aggregate_by_model(df), sort_col)
230
+
231
+
232
def create_leaderboard_tab():
    """Build the "🏆 Leaderboard" tab: view/sort/category controls plus the
    ranked HTML table, wired so any control change re-renders the table.

    Must be called inside an active ``gr.Blocks`` / ``gr.Tabs`` context.
    """
    with gr.Tab("🏆 Leaderboard"):

        gr.Markdown("""
        Select a view to see per-view scores, or choose **All Views (Averaged)** for the overall ranking.
        Scores are multiplied by 100. **Bold red** = best in column, <u>underlined</u> = second best.
        """)

        with gr.Row():
            view_selector = gr.Radio(
                choices=["All Views (Averaged)", "BTA", "RPA", "CCA"],
                value="All Views (Averaged)",
                label="View",
                interactive=True,
            )
            sort_selector = gr.Dropdown(
                choices=["Overall", "L1", "L2", "L3", "L4",
                         "PD", "SH", "UG", "RPD", "GE", "AOB",
                         "PGG", "VD", "CPR", "AC", "WW", "KP"],
                value="Overall",
                label="Sort by",
                interactive=True,
            )

        with gr.Row():
            category_filter = gr.CheckboxGroup(
                choices=["Closed-Source", "Open-Weight", "Reasoning", "Human"],
                value=["Closed-Source", "Open-Weight", "Reasoning", "Human"],
                label="Model Categories",
                interactive=True,
            )

        table_html = gr.HTML()

        def update_table(view_choice, sort_col, categories):
            # Re-reads the CSV on every interaction, so data-file edits show
            # up without restarting the Space.
            df = load_leaderboard()
            if categories:
                df = df[df["category"].isin(categories)]
            if df.empty:
                return "<p>No data to display.</p>"
            if view_choice == "All Views (Averaged)":
                return make_aggregated_html(df, sort_col)
            else:
                return make_view_html(df, view_choice, sort_col)

        # Set initial value (rendered once at build time with the defaults)
        initial_html = update_table("All Views (Averaged)", "Overall",
                                    ["Closed-Source", "Open-Weight", "Reasoning", "Human"])
        table_html.value = initial_html

        # Bind events: any of the three controls re-renders the table
        for component in [view_selector, sort_selector, category_filter]:
            component.change(
                fn=update_table,
                inputs=[view_selector, sort_selector, category_filter],
                outputs=table_html,
            )

        # Column descriptions (collapsed by default)
        with gr.Accordion("📖 Column Descriptions & Game Abbreviations", open=False):
            game_desc = "\n".join([f"- **{k}**: {v}" for k, v in GAME_FULL_NAMES.items()])
            view_desc = "\n".join([f"- **{k}**: {v}" for k, v in VIEW_NAMES.items()])
            gr.Markdown(f"""
            **Views:**
            {view_desc}

            **Game Abbreviations (Level 1–4):**
            {game_desc}

            **Level Averages (L1–L4):** Computed over all tasks in each level (not just the 3 shown).

            **Overall:** Average of L1, L2, L3, L4.
            """)