initial leaderboard
Browse files
Large Language Model Scientific Capability.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Type,Parameters,Knowl. Und.,Code Gen.,Symbolic Reason.,Hypoth. Gen.,Overall
|
| 2 |
+
Claude 4.5 Sonnet,Close,,60.67 ,21.73 ,40.36 ,56.10 ,44.72
|
| 3 |
+
Claude4-1-Opus,Close,,60.87 ,25.32 ,38.69 ,29.47 ,38.58
|
| 4 |
+
GPT-4o,Close,,60.84 ,17.67 ,32.09 ,33.04 ,35.91
|
| 5 |
+
GPT-5,Close,,74.05 ,29.21 ,39.91 ,45.67 ,47.21
|
| 6 |
+
GPT-o3,Close,,76.05 ,25.26 ,38.14 ,34.14 ,43.40
|
| 7 |
+
Gemini-2.5-Flash,Close,,50.46 ,18.28 ,32.07 ,40.86 ,35.42
|
| 8 |
+
Gemini-2.5-Pro,Close,,59.34 ,24.77 ,34.96 ,50.73 ,42.45
|
| 9 |
+
Grok-2-vision-1212,Close,,50.14 ,20.60 ,28.21 ,49.63 ,37.14
|
| 10 |
+
Ling-flash-2.0,Open,100B,53.39 ,25.60 ,37.98 ,50.29 ,41.81
|
| 11 |
+
Seed1.6-vision,Close,,65.78 ,21.49 ,39.24 ,45.00 ,42.88
|
| 12 |
+
DeepSeek-R1,Open,685B,45.17 ,0.06 ,20.00 ,49.73 ,28.74
|
| 13 |
+
GLM-4.5V,Open,106B,52.78 ,3.24 ,13.43 ,42.23 ,27.92
|
| 14 |
+
InternS1,Open,241B,66.14 ,17.08 ,31.62 ,37.45 ,38.07
|
| 15 |
+
Kimi-k2,Open,1040B,62.49 ,20.86 ,38.59 ,42.28 ,41.06
|
| 16 |
+
Llama 4 Maverick,Open,400B,57.22 ,18.26 ,38.97 ,38.31 ,38.19
|
| 17 |
+
Qwen3-VL-235B-A22B,Open,235B,65.98 ,18.00 ,49.93 ,40.62 ,43.63
|
| 18 |
+
Qwen3-Max,Open,1000B,63.14 ,43.97 ,41.04 ,42.12 ,47.57
|
| 19 |
+
GPT-5.1,Close,,69.23 ,25.63 ,32.44 ,41.45 ,42.19
|
| 20 |
+
Gemini-3-Pro,Close,,66.06 ,29.57 ,45.19 ,61.51 ,50.58
|
Multimodal Model Disciplinary Leaderboard.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Type,Parameters,Life Sci. ,Astronomy,Earth Sci. ,Chemistry,Mat. Sci. ,Physics,Overall
|
| 2 |
+
Claude 4.5 Sonnet,Close,,43.86 ,34.23 ,64.66 ,71.27 ,83.23 ,40.36 ,56.27
|
| 3 |
+
Claude4-1-Opus,Close,,42.49 ,39.87 ,67.47 ,71.94 ,83.23 ,38.69 ,57.28
|
| 4 |
+
GPT-4o,Close,,54.46 ,30.73 ,63.08 ,72.37 ,61.54 ,32.09 ,52.38
|
| 5 |
+
GPT-5,Close,,59.49 ,44.57 ,74.43 ,81.62 ,93.54 ,39.91 ,65.59
|
| 6 |
+
GPT-o3,Close,,61.57 ,42.82 ,74.15 ,81.77 ,93.85 ,38.14 ,65.38
|
| 7 |
+
Gemini-2.5-Flash,Close,,35.61 ,31.94 ,68.21 ,75.12 ,62.15 ,32.07 ,50.85
|
| 8 |
+
Gemini-2.5-Pro,Close,,34.85 ,43.40 ,70.52 ,78.29 ,83.23 ,34.96 ,57.54
|
| 9 |
+
Grok-2-vision-1212,Close,,43.44 ,33.51 ,58.80 ,63.04 ,49.08 ,28.21 ,46.01
|
| 10 |
+
Ling-flash-2.0,Open,100B,28.83 ,48.12 ,69.55 ,66.80 ,63.69 ,37.98 ,52.49
|
| 11 |
+
Seed1.6-vision,Close,,58.53 ,32.21 ,69.46 ,75.48 ,68.92 ,39.24 ,57.31
|
| 12 |
+
DeepSeek-R1,Open,685B,27.15 ,0.11 ,63.14 ,77.28 ,44.46 ,20.00 ,38.69
|
| 13 |
+
GLM-4.5V,Open,106B,54.11 ,0.32 ,52.68 ,71.38 ,57.08 ,13.43 ,41.50
|
| 14 |
+
InternS1,Open,241B,53.70 ,29.55 ,68.52 ,75.28 ,81.69 ,31.62 ,56.73
|
| 15 |
+
Kimi-k2,Open,1040B,41.10 ,35.57 ,77.77 ,77.04 ,71.38 ,38.59 ,56.91
|
| 16 |
+
Llama 4 Maverick,Open,400B,42.92 ,31.91 ,59.82 ,68.06 ,80.77 ,38.97 ,53.74
|
| 17 |
+
Qwen3-VL-235B-A22B,Open,235B,56.69 ,29.84 ,67.81 ,77.39 ,81.85 ,49.93 ,60.58
|
| 18 |
+
Qwen3-Max,Open,1000B,38.95 ,75.64 ,67.38 ,76.38 ,69.54 ,41.04 ,61.49
|
| 19 |
+
GPT-5.1,Close,,60.54 ,42.02 ,73.88 ,76.45 ,61.08 ,32.44 ,57.73
|
| 20 |
+
Gemini-3-Pro,Close,,49.20 ,42.23 ,72.49 ,83.08 ,91.23 ,45.19 ,63.90
|
Multimodal Model Scientific Capability.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Type,Parameters,Sci.MM-Percep.,Sci.Img-Und.,Sci.MM-Reason.,Overall,
|
| 2 |
+
Claude 4.5 Sonnet,Close,,57.87 ,43.64 ,56.11 ,52.54 ,
|
| 3 |
+
Claude4-1-Opus,Close,,58.25 ,45.19 ,58.66 ,54.03 ,
|
| 4 |
+
GPT-4o,Close,,52.78 ,25.93 ,57.97 ,45.56 ,
|
| 5 |
+
GPT-5,Close,,59.94 ,42.44 ,61.46 ,54.61 ,
|
| 6 |
+
GPT-o3,Close,,55.23 ,32.84 ,59.27 ,49.11 ,
|
| 7 |
+
Gemini-2.5-Flash,Close,,55.98 ,38.20 ,57.22 ,50.47 ,
|
| 8 |
+
Gemini-2.5-Pro,Close,,52.12 ,43.76 ,61.28 ,52.39 ,
|
| 9 |
+
Grok-2-vision-1212,Close,,64.00 ,25.04 ,51.76 ,46.93 ,
|
| 10 |
+
Seed1.6-vision,Close,,65.79 ,44.75 ,57.11 ,55.88 ,
|
| 11 |
+
GLM-4.5V,Open,106B,59.10 ,38.57 ,51.04 ,49.57 ,
|
| 12 |
+
InternS1,Open,241B,60.89 ,45.73 ,56.47 ,54.36 ,
|
| 13 |
+
Llama 4 Maverick,Open,400B,56.74 ,36.83 ,55.39 ,49.65 ,
|
| 14 |
+
Qwen3-VL-235B-A22B,Open,235B,72.29 ,38.35 ,50.83 ,53.82 ,
|
| 15 |
+
Qwen3-Max,Open,1000B,24.51 ,20.40 ,49.86 ,31.59 ,
|
| 16 |
+
GPT-5.1,Close,,54.10 ,33.05 ,58.73 ,48.63 ,
|
| 17 |
+
Gemini-3-Pro,Close,,66.54 ,55.62 ,66.49 ,62.88 ,
|
app.py
CHANGED
|
@@ -1,204 +1,463 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from gradio_leaderboard import
|
| 3 |
import pandas as pd
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
with demo:
|
| 94 |
gr.HTML(TITLE)
|
| 95 |
-
gr.
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
running_eval_table = gr.components.Dataframe(
|
| 127 |
-
value=running_eval_queue_df,
|
| 128 |
-
headers=EVAL_COLS,
|
| 129 |
-
datatype=EVAL_TYPES,
|
| 130 |
-
row_count=5,
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
with gr.Accordion(
|
| 134 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
| 135 |
-
open=False,
|
| 136 |
-
):
|
| 137 |
-
with gr.Row():
|
| 138 |
-
pending_eval_table = gr.components.Dataframe(
|
| 139 |
-
value=pending_eval_queue_df,
|
| 140 |
-
headers=EVAL_COLS,
|
| 141 |
-
datatype=EVAL_TYPES,
|
| 142 |
-
row_count=5,
|
| 143 |
-
)
|
| 144 |
-
with gr.Row():
|
| 145 |
-
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
| 146 |
-
|
| 147 |
-
with gr.Row():
|
| 148 |
-
with gr.Column():
|
| 149 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
| 150 |
-
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
| 151 |
-
model_type = gr.Dropdown(
|
| 152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
| 153 |
-
label="Model type",
|
| 154 |
-
multiselect=False,
|
| 155 |
-
value=None,
|
| 156 |
-
interactive=True,
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
with gr.Column():
|
| 160 |
-
precision = gr.Dropdown(
|
| 161 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 162 |
-
label="Precision",
|
| 163 |
-
multiselect=False,
|
| 164 |
-
value="float16",
|
| 165 |
-
interactive=True,
|
| 166 |
-
)
|
| 167 |
-
weight_type = gr.Dropdown(
|
| 168 |
-
choices=[i.value.name for i in WeightType],
|
| 169 |
-
label="Weights type",
|
| 170 |
-
multiselect=False,
|
| 171 |
-
value="Original",
|
| 172 |
-
interactive=True,
|
| 173 |
-
)
|
| 174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 175 |
-
|
| 176 |
-
submit_button = gr.Button("Submit Eval")
|
| 177 |
-
submission_result = gr.Markdown()
|
| 178 |
-
submit_button.click(
|
| 179 |
-
add_new_eval,
|
| 180 |
-
[
|
| 181 |
-
model_name_textbox,
|
| 182 |
-
base_model_name_textbox,
|
| 183 |
-
revision_name_textbox,
|
| 184 |
-
precision,
|
| 185 |
-
weight_type,
|
| 186 |
-
model_type,
|
| 187 |
-
],
|
| 188 |
-
submission_result,
|
| 189 |
)
|
|
|
|
| 190 |
|
| 191 |
with gr.Row():
|
| 192 |
-
with gr.
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
|
| 202 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 203 |
-
scheduler.start()
|
| 204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
import gradio as gr
|
| 5 |
+
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
|
| 6 |
import pandas as pd
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
def _slugify(title: str) -> str:
|
| 10 |
+
return re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')
|
| 11 |
+
|
| 12 |
+
# 🎨 Enhanced custom CSS
|
| 13 |
+
custom_css = """
|
| 14 |
+
/* 全局设置:简洁、高级的字体和背景 */
|
| 15 |
+
:root {
|
| 16 |
+
--color-background-primary: #f8f8f8; /* 浅米白色背景 */
|
| 17 |
+
--color-background-secondary: #ffffff; /* 卡片背景 */
|
| 18 |
+
--color-text-primary: #333333;
|
| 19 |
+
--color-accent: #8e80ff; /* 浅紫色强调色 (Primary) */
|
| 20 |
+
--color-accent-light: #a99dff; /* 浅紫色悬停色 */
|
| 21 |
+
--shadow-medium: 0 4px 12px rgba(0, 0, 0, 0.08);
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
/* 全局字体:强制使用 Arial */
|
| 25 |
+
html, body, .gradio-container, .gradio-container * {
|
| 26 |
+
font-family: Arial, "Helvetica Neue", Helvetica, "Noto Sans", "PingFang SC", "Microsoft YaHei", sans-serif !important;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
body {
|
| 30 |
+
background-color: var(--color-background-primary) !important;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
/* 增加容器最大宽度以展示完整表格 */
|
| 34 |
+
.gradio-container {
|
| 35 |
+
max-width: 1400px; /* 宽度从 1800px 调窄到 1400px */
|
| 36 |
+
margin: 0 auto;
|
| 37 |
+
padding: 20px;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
/* 标题样式 */
|
| 41 |
+
#space-title {
|
| 42 |
+
color: var(--color-text-primary);
|
| 43 |
+
font-size: 3em;
|
| 44 |
+
font-weight: 700;
|
| 45 |
+
margin-bottom: 0.5em;
|
| 46 |
+
padding-top: 20px;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
/* Group/Block 组件的卡片样式 */
|
| 50 |
+
.gr-group, .gr-block {
|
| 51 |
+
background-color: var(--color-background-secondary);
|
| 52 |
+
border-radius: 12px;
|
| 53 |
+
box-shadow: var(--shadow-medium);
|
| 54 |
+
transition: box-shadow 0.3s ease;
|
| 55 |
+
padding: 15px;
|
| 56 |
+
margin-bottom: 20px;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
.gr-group:hover, .gr-block:hover {
|
| 60 |
+
box-shadow: 0 6px 18px rgba(0, 0, 0, 0.12);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
/* Leaderboard 容器:调整内部布局的关键 */
|
| 64 |
+
[id^="leaderboard-"] {
|
| 65 |
+
padding: 0 !important;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
/* 搜索栏布局调整 (第一行) */
|
| 69 |
+
.leaderboard_root > div:nth-child(1) {
|
| 70 |
+
padding: 0 15px 15px 15px;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
/* 过滤器和列选择布局调整 (第二行) */
|
| 74 |
+
.leaderboard_root > div:nth-child(2) {
|
| 75 |
+
display: flex;
|
| 76 |
+
padding: 0 15px 15px 15px;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.leaderboard_root .gr-form {
|
| 80 |
+
border: none;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
/* Search Bar */
|
| 84 |
+
#search-bar-table-box {
|
| 85 |
+
width: 100%;
|
| 86 |
+
margin-bottom: 10px;
|
| 87 |
+
}
|
| 88 |
+
#search-bar-table-box > div:first-child {
|
| 89 |
+
background: none;
|
| 90 |
+
border: none;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
/* === Select Columns to Display: 强制单行展示 === */
|
| 94 |
+
/* 定位 SelectColumns 的内部复选框容器 */
|
| 95 |
+
.leaderboard-filter-column:first-child .gr-form-checkbox-group {
|
| 96 |
+
/* 使用 flex 容器 */
|
| 97 |
+
display: flex !important;
|
| 98 |
+
flex-wrap: nowrap !important; /* 强制不换行 */
|
| 99 |
+
overflow-x: auto !important; /* 允许水平滚动 */
|
| 100 |
+
gap: 10px;
|
| 101 |
+
padding-bottom: 5px;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
/* 确保每个复选框标签保持内联块级元素 */
|
| 105 |
+
.leaderboard-filter-column:first-child .gr-form-checkbox-group label {
|
| 106 |
+
flex-shrink: 0 !important; /* 防止选项被压缩 */
|
| 107 |
+
display: inline-block !important; /* 确保每个选项占据其自然宽度 */
|
| 108 |
+
margin: 0;
|
| 109 |
+
white-space: nowrap; /* 确保文字也不换行 */
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
#leaderboard-table, #leaderboard-table-lite {
|
| 113 |
+
margin-top: 15px;
|
| 114 |
+
border-radius: 8px;
|
| 115 |
+
overflow: hidden;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
#leaderboard-table th {
|
| 119 |
+
background-color: var(--color-accent);
|
| 120 |
+
color: white;
|
| 121 |
+
font-weight: 600;
|
| 122 |
+
text-transform: uppercase;
|
| 123 |
+
border-bottom: 2px solid var(--color-accent-light);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
#leaderboard-table tr:hover {
|
| 127 |
+
background-color: #f0f0f0;
|
| 128 |
+
cursor: pointer;
|
| 129 |
+
transition: background-color 0.2s ease;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
#leaderboard-table td:nth-child(2),
|
| 133 |
+
#leaderboard-table th:nth-child(2) {
|
| 134 |
+
max-width: 400px;
|
| 135 |
+
overflow: auto;
|
| 136 |
+
white-space: nowrap;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
#leaderboard-table td:nth-child(3) {
|
| 140 |
+
font-weight: bold;
|
| 141 |
+
color: var(--color-accent);
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
/* Citation 区域 */
|
| 145 |
+
#citation-group {
|
| 146 |
+
padding: 20px;
|
| 147 |
+
margin-top: 10px;
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
#citation-button {
|
| 151 |
+
margin-top: 0;
|
| 152 |
+
padding: 0;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
/* 修复 Citation 复制图标重叠问题 */
|
| 156 |
+
#citation-button label {
|
| 157 |
+
display: block;
|
| 158 |
+
position: relative;
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
#citation-button textarea {
|
| 162 |
+
font-family: Arial, "Helvetica Neue", Helvetica, "Noto Sans", "PingFang SC", "Microsoft YaHei", sans-serif !important;
|
| 163 |
+
background-color: #f1f1f1;
|
| 164 |
+
border: 1px solid #cccccc;
|
| 165 |
+
border-radius: 6px;
|
| 166 |
+
padding: 10px;
|
| 167 |
+
padding-right: 40px !important; /* 为复制按钮腾出空间 */
|
| 168 |
+
font-size: 14px !important;
|
| 169 |
+
width: 100% !important;
|
| 170 |
+
box-sizing: border-box;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
/* 调整复制按钮的位置 */
|
| 174 |
+
#citation-button > label > button {
|
| 175 |
+
position: absolute;
|
| 176 |
+
top: 10px;
|
| 177 |
+
right: 10px;
|
| 178 |
+
margin: 0;
|
| 179 |
+
transform: scale(1.1);
|
| 180 |
+
transition: transform 0.2s ease;
|
| 181 |
+
background-color: var(--color-accent) !important;
|
| 182 |
+
color: white !important;
|
| 183 |
+
border: none !important;
|
| 184 |
+
border-radius: 6px;
|
| 185 |
+
z-index: 10;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
#citation-button > label > button:hover {
|
| 189 |
+
transform: scale(1.2);
|
| 190 |
+
background-color: var(--color-accent-light) !important;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
/* Leaderboard 内部过滤/选择组件微调 */
|
| 194 |
+
.leaderboard_root .leaderboard-filter-column:last-child {
|
| 195 |
+
flex-grow: 1;
|
| 196 |
+
max-width: 50%;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.leaderboard_root .leaderboard-filter-column:first-child {
|
| 200 |
+
max-width: 50%;
|
| 201 |
+
padding-right: 20px;
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
/* 其他 Gradio 元素的简洁化 */
|
| 205 |
+
.wrap-inner input[type="text"], .wrap-inner input[type="number"] {
|
| 206 |
+
border-radius: 6px;
|
| 207 |
+
border: 1px solid #cccccc;
|
| 208 |
+
padding: 8px 12px;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
/* ==== Score bar cells ==== */
|
| 212 |
+
.leaderboard-cell-bar {
|
| 213 |
+
position: relative;
|
| 214 |
+
display: block;
|
| 215 |
+
width: 100%;
|
| 216 |
+
height: 28px;
|
| 217 |
+
line-height: 28px;
|
| 218 |
+
background: #f5f3ff; /* light purple background */
|
| 219 |
+
border-radius: 8px;
|
| 220 |
+
overflow: hidden;
|
| 221 |
+
padding-left: 38px; /* leave room for dot */
|
| 222 |
+
color: #1d1b84; /* dark purple text */
|
| 223 |
+
font-weight: 600;
|
| 224 |
+
}
|
| 225 |
+
.leaderboard-cell-bar .bar-fill {
|
| 226 |
+
position: absolute;
|
| 227 |
+
left: 0;
|
| 228 |
+
top: 0;
|
| 229 |
+
height: 100%;
|
| 230 |
+
width: var(--w, 0%);
|
| 231 |
+
background: linear-gradient(90deg, #6c5ce7 0%, #a29bfe 100%);
|
| 232 |
+
opacity: 0.25;
|
| 233 |
+
}
|
| 234 |
+
.leaderboard-cell-bar .bar-dot {
|
| 235 |
+
position: absolute;
|
| 236 |
+
left: 10px;
|
| 237 |
+
top: 50%;
|
| 238 |
+
transform: translateY(-50%);
|
| 239 |
+
width: 12px;
|
| 240 |
+
height: 12px;
|
| 241 |
+
border-radius: 50%;
|
| 242 |
+
background: #3c1be3;
|
| 243 |
+
box-shadow: 0 0 0 4px rgba(60, 27, 227, 0.08);
|
| 244 |
+
}
|
| 245 |
+
.leaderboard-cell-bar .bar-text {
|
| 246 |
+
position: relative;
|
| 247 |
+
z-index: 1;
|
| 248 |
+
padding-right: 10px;
|
| 249 |
+
}
|
| 250 |
+
"""
|
| 251 |
+
|
| 252 |
+
TITLE = """<h1 align="center" id="space-title">SciEval Leaderboards 🏆</h1>"""
|
| 253 |
+
INFO = """<p align="center">
|
| 254 |
+
<a href="https://huggingface.co/datasets/InternScience/SciEval"><b>HuggingFace</b></a> ·
|
| 255 |
+
<a href="https://github.com/InternScience/SciEvalKit"><b>GitHub</b></a>
|
| 256 |
+
</p>"""
|
| 257 |
+
|
| 258 |
+
CITATION_BUTTON_LABEL = "📖 Citation"
|
| 259 |
+
CITATION_BUTTON_TEXT = r"""
|
| 260 |
+
@article{scieval2025,
|
| 261 |
+
title={SciEvalKit: An Open-source Evaluation Toolkit for Scientific General Intelligence},
|
| 262 |
+
author={SciPrismaX Team},
|
| 263 |
+
journal={arXiv preprint},
|
| 264 |
+
year={2025}
|
| 265 |
+
}
|
| 266 |
+
"""
|
| 267 |
+
|
| 268 |
+
LEADERBOARD_FILES = [
|
| 269 |
+
("Large Language Model Scientific Capability", "Large Language Model Scientific Capability.csv"),
|
| 270 |
+
("Multimodal Model Scientific Capability", "Multimodal Model Scientific Capability.csv"),
|
| 271 |
+
("Multimodal Model Disciplinary Leaderboard", "Multimodal Model Disciplinary Leaderboard.csv"),
|
| 272 |
+
]
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def strip_auxiliary_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` restricted to columns whose name does not start with ``Unnamed``.

    pandas assigns ``Unnamed: N`` names to header-less CSV columns (for
    example an exported index column or a trailing comma), so those are the
    ones dropped here.
    """
    is_auxiliary = df.columns.str.contains("^Unnamed")
    keep_mask = ~is_auxiliary
    return df.loc[:, keep_mask]
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def find_sort_column(df: pd.DataFrame) -> Optional[str]:
    """Choose the column used for the default ordering of a leaderboard.

    The first numeric column (in DataFrame order) whose lowercased name is
    one of ``overall`` / ``score`` / ``avg`` / ``average`` wins.  Failing
    that, the first numeric column of any name is used.  Returns ``None``
    when the frame has no numeric column at all.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    wanted = ["overall", "score", "avg", "average"]

    # First pass: a numeric column carrying one of the conventional names.
    named_match = next(
        (name for name in df.columns if name.lower() in wanted and is_numeric(df[name])),
        None,
    )
    if named_match is not None:
        return named_match

    # Second pass: fall back to whichever numeric column comes first.
    return next((name for name in df.columns if is_numeric(df[name])), None)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def _percent_widths(series: pd.Series) -> pd.Series:
|
| 291 |
+
"""Compute a 0-100 width for a numeric series."""
|
| 292 |
+
s = series.astype(float)
|
| 293 |
+
# If values look like percentages already
|
| 294 |
+
if s.min() >= 0 and s.max() <= 100:
|
| 295 |
+
return s
|
| 296 |
+
# If values look like 0-1
|
| 297 |
+
if s.min() >= 0 and s.max() <= 1.0:
|
| 298 |
+
return s * 100.0
|
| 299 |
+
# General min-max scaling
|
| 300 |
+
rng = s.max() - s.min()
|
| 301 |
+
if rng == 0:
|
| 302 |
+
return pd.Series([50.0] * len(s), index=s.index)
|
| 303 |
+
return (s - s.min()) / rng * 100.0
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def add_bar_cells(df: pd.DataFrame, exclude: Optional[list[str]] = None) -> tuple[pd.DataFrame, set[str]]:
    """Replace numeric score columns with HTML "bar" cells.

    Each value in a converted column becomes a
    ``<div class="leaderboard-cell-bar">`` whose ``--w`` CSS variable holds
    the bar width (0-100 %, taken from ``_percent_widths``) and whose text is
    the value formatted with two decimals.

    Args:
        df: Source table; it is copied, never modified in place.
        exclude: Extra column names to leave untouched.  ``Model``, ``Type``
            and ``Parameters`` are always left untouched.

    Returns:
        A tuple ``(converted_frame, converted_columns)`` where
        ``converted_columns`` is the set of column names that now hold HTML.
    """
    skipped = set(exclude or []) | {"Model", "Type", "Parameters"}
    out = df.copy()
    converted: set[str] = set()

    for col in out.columns:
        if col in skipped or not pd.api.types.is_numeric_dtype(out[col]):
            continue

        widths = _percent_widths(out[col])
        cells = []
        for val, w in zip(out[col], widths):
            try:
                disp = f"{float(val):.2f}"
            except (TypeError, ValueError):
                # Values float() cannot convert are shown verbatim.
                disp = str(val)

            # NaN compares false against everything, so clamping it with
            # min()/max() would yield 100 and draw a full bar for a missing
            # score.  Give missing values an empty bar instead.
            pct = 0.0 if pd.isna(w) else max(0.0, min(100.0, float(w)))

            cells.append(
                f'<div class="leaderboard-cell-bar" style="--w:{pct:.2f}%">'
                f'<span class="bar-fill"></span>'
                f'<span class="bar-dot"></span>'
                f'<span class="bar-text">{disp}</span>'
                f"</div>"
            )

        out[col] = cells
        converted.add(col)

    return out, converted
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def load_leaderboard_csv(path: Path) -> pd.DataFrame:
    """Read one leaderboard CSV and return it cleaned and sorted.

    Steps, in order: drop ``Unnamed`` columns, strip whitespace from the
    header names, round numeric columns to two decimals, sort descending by
    the column chosen by ``find_sort_column`` (when there is one), and reset
    the index.
    """
    table = strip_auxiliary_columns(pd.read_csv(path))
    table.columns = [name.strip() for name in table.columns]

    to_round = [name for name in table.columns if pd.api.types.is_numeric_dtype(table[name])]
    if to_round:
        table[to_round] = table[to_round].round(2)

    sort_col = find_sort_column(table)
    ordered = table.sort_values(by=sort_col, ascending=False) if sort_col else table
    return ordered.reset_index(drop=True)
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def safe_load(title: str, path: Path) -> tuple[str, pd.DataFrame]:
    """Load a leaderboard without letting a bad or missing CSV stop the app.

    On success returns ``(title, table)``.  If loading raises, the error is
    printed and a one-row placeholder table with a single ``Status`` column
    describing the problem is returned in its place.
    """
    try:
        return title, load_leaderboard_csv(path)
    except Exception as exc:
        # Deliberately broad: any failure for one file should degrade to a
        # placeholder rather than abort start-up.
        print(f"[leaderboard] Failed to load {path}: {exc}")
        message = (
            f"Upload a CSV named '{path.name}' to populate the '{title}' leaderboard. "
            f"Error: {exc}"
        )
        return title, pd.DataFrame({"Status": [message]})
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def build_datatypes(df: pd.DataFrame, html_cols: Optional[set[str]] = None) -> list[str]:
    """Return one datatype label per column of ``df``, in column order.

    Columns listed in ``html_cols`` are labelled ``"markdown"`` so that the
    inline HTML they contain gets rendered.  Every other column is labelled
    ``"number"`` when its dtype is numeric and ``"str"`` otherwise.
    """
    rendered = html_cols or set()

    def _label(name) -> str:
        if name in rendered:
            return "markdown"
        if pd.api.types.is_numeric_dtype(df[name]):
            return "number"
        return "str"

    return [_label(name) for name in df.columns]
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def discover_leaderboards(config: list[tuple[str, str]]) -> list[tuple[str, pd.DataFrame]]:
    """Load the configured leaderboards plus any other CSVs in the working directory.

    ``config`` is a list of ``(title, filename)`` pairs; they are loaded
    first, in the given order.  After that, every ``*.csv`` in the current
    directory whose file name is not among the configured ones is appended
    in sorted order, titled with its file stem.  All loading goes through
    ``safe_load``.
    """
    targets = [(title, Path(filename)) for title, filename in config]
    known_names = {csv_path.name for _, csv_path in targets}

    # Explicitly configured files come first.
    boards: list[tuple[str, pd.DataFrame]] = [
        safe_load(title, csv_path) for title, csv_path in targets
    ]

    # Then any unconfigured CSV sitting next to the app.
    boards.extend(
        safe_load(csv_path.stem, csv_path)
        for csv_path in sorted(Path(".").glob("*.csv"))
        if csv_path.name not in known_names
    )
    return boards
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
# Load every leaderboard table once, at import time.
leaderboards = discover_leaderboards(LEADERBOARD_FILES)

# Markdown bullet list of the configured CSV file names.
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
required_filenames_md = "\n".join([f" - `{filename}`" for _, filename in LEADERBOARD_FILES])

# NOTE(review): block nesting below was reconstructed from a flattened diff
# view — confirm the indentation against the real file.
demo = gr.Blocks(css=custom_css, theme=gr.themes.Soft())
with demo:
    gr.HTML(TITLE)
    gr.HTML(INFO)

    # Render independent leaderboards (no tabs)
    for lb_title, df in leaderboards:
        with gr.Group():
            # The three configured boards get a centered HTML heading; any
            # other title is rendered as a plain Markdown heading.
            centered_titles = {
                "Large Language Model Scientific Capability",
                "Multimodal Model Scientific Capability",
                "Multimodal Model Disciplinary Leaderboard",
            }
            if lb_title.strip() in centered_titles:
                gr.HTML(f'<h2 style="text-align:center; font-weight:700; margin: 0.2em 0;">{lb_title}</h2>')
            else:
                gr.Markdown(f"## {lb_title}")
            # Apply bar-style rendering to numeric score columns
            df_render, html_cols = add_bar_cells(df)
            Leaderboard(
                value=df_render,
                # Id starts with "leaderboard-", which the [id^="leaderboard-"] CSS rule targets.
                elem_id=f"leaderboard-{_slugify(lb_title)}",
                datatype=build_datatypes(df_render, html_cols),
                select_columns=SelectColumns(
                    default_selection=list(df_render.columns),
                    cant_deselect=[c for c in ("Model", "Type") if c in df_render.columns],
                    label="Select columns to display:",
                ),
                # Search on "Model" when present, otherwise on the first column.
                search_columns=["Model"] if "Model" in df_render.columns else [df_render.columns[0]],
                # The type filter is only offered when the table has a "Type" column.
                filter_columns=(
                    [ColumnFilter("Type", type="checkboxgroup", label="Model Types:")]
                    if "Type" in df_render.columns else []
                ),
                interactive=False,
            )
        gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            with gr.Group(elem_id="citation-group"):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    # One more visible line than the citation text has newlines.
                    lines=CITATION_BUTTON_TEXT.count("\n") + 1,
                    elem_id="citation-button",
                    show_copy_button=True,
                    interactive=False,
                )

demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
|
|
|
|
|