Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
en-version
#1
by
harheem
- opened
- .DS_Store +0 -0
- app.py +8 -31
- banner_wide.png +0 -3
- combined_evaluation_summary.csv +7 -16
- components/leaderboard_components.py +10 -13
- styles/leaderboard_styles.py +30 -205
- tabs/{leaderboard_v1_kr.py → leaderboard_v1.py} +484 -690
- tabs/leaderboard_v1_en.py +0 -0
- utils.py +10 -13
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
app.py
CHANGED
|
@@ -2,10 +2,9 @@ import warnings
|
|
| 2 |
warnings.filterwarnings("ignore")
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
-
from tabs.
|
| 6 |
-
from tabs.leaderboard_v1_en import create_leaderboard_v2_interface as leaderboard_en
|
| 7 |
|
| 8 |
-
# 다크
|
| 9 |
FIX_DARK_TEXT_CSS = """
|
| 10 |
html.dark .gr-prose,
|
| 11 |
html.dark .gr-prose p,
|
|
@@ -18,36 +17,14 @@ html.dark .gr-markdown * {
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
def create_app():
|
| 21 |
-
|
|
|
|
| 22 |
|
| 23 |
with gr.Blocks(theme=theme, css=FIX_DARK_TEXT_CSS) as app:
|
| 24 |
-
|
| 25 |
-
with gr.Row():
|
| 26 |
-
lang_btn = gr.Button("🌍 English", scale=0, elem_id="lang-toggle-btn")
|
| 27 |
-
|
| 28 |
-
# 🔹 기본은 한국어 UI
|
| 29 |
-
with gr.Column(visible=True) as kr_view:
|
| 30 |
-
leaderboard_kr()
|
| 31 |
-
|
| 32 |
-
# 🔹 영어 UI는 숨김
|
| 33 |
-
with gr.Column(visible=False) as en_view:
|
| 34 |
-
leaderboard_en()
|
| 35 |
-
|
| 36 |
-
# 🔹 버튼 클릭 시 토글
|
| 37 |
-
def toggle_language(current_label):
|
| 38 |
-
if "English" in current_label:
|
| 39 |
-
return "🇰🇷 Korean", gr.update(visible=False), gr.update(visible=True)
|
| 40 |
-
else:
|
| 41 |
-
return "🌍 English", gr.update(visible=True), gr.update(visible=False)
|
| 42 |
-
|
| 43 |
-
lang_btn.click(
|
| 44 |
-
toggle_language,
|
| 45 |
-
inputs=[lang_btn],
|
| 46 |
-
outputs=[lang_btn, kr_view, en_view],
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
return app
|
| 50 |
|
| 51 |
-
|
| 52 |
demo = create_app()
|
| 53 |
-
|
|
|
|
|
|
|
|
|
| 2 |
warnings.filterwarnings("ignore")
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
+
from tabs.leaderboard_v1 import create_leaderboard_v2_interface
|
|
|
|
| 6 |
|
| 7 |
+
# 다크 모드에서 프로즈/마크다운 텍스트를 확실히 밝게 고정하는 CSS 보정
|
| 8 |
FIX_DARK_TEXT_CSS = """
|
| 9 |
html.dark .gr-prose,
|
| 10 |
html.dark .gr-prose p,
|
|
|
|
| 17 |
"""
|
| 18 |
|
| 19 |
def create_app():
|
| 20 |
+
# 권장: 명시적인 테마 객체 사용 (Default, Soft, Origin 등)
|
| 21 |
+
theme = gr.themes.Default() # 필요 시 gr.themes.Origin() 등으로 변경
|
| 22 |
|
| 23 |
with gr.Blocks(theme=theme, css=FIX_DARK_TEXT_CSS) as app:
|
| 24 |
+
create_leaderboard_v2_interface()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
return app
|
| 26 |
|
|
|
|
| 27 |
demo = create_app()
|
| 28 |
+
|
| 29 |
+
# Spaces/Gradio5에서 SSR이 꼬이면 일단 꺼서 확인
|
| 30 |
+
demo.launch(ssr_mode=False)
|
banner_wide.png
DELETED
Git LFS Details
|
combined_evaluation_summary.csv
CHANGED
|
@@ -1,16 +1,7 @@
|
|
| 1 |
-
Model,Vendor,Model Type,L1_Total_Tasks,L2_Total_Tasks,L3_Total_Tasks,L4_Total_Tasks,L5_Total_Tasks,L6_Total_Tasks,L7_Total_Tasks,L1_Evaluated_Tasks,L2_Evaluated_Tasks,L3_Evaluated_Tasks,L4_Evaluated_Tasks,L5_Evaluated_Tasks,L6_Evaluated_Tasks,L7_Evaluated_Tasks,L1_Avg_Exec_Time,L2_Avg_Exec_Time,L3_Avg_Exec_Time,L4_Avg_Exec_Time,L5_Avg_Exec_Time,L6_Avg_Exec_Time,L7_Avg_Exec_Time,L1_Avg_Tokens,L2_Avg_Tokens,L3_Avg_Tokens,L4_Avg_Tokens,L5_Avg_Tokens,L6_Avg_Tokens,L7_Avg_Tokens,L1_Avg_TPS,L2_Avg_TPS,L3_Avg_TPS,L4_Avg_TPS,L5_Avg_TPS,L6_Avg_TPS,L7_Avg_TPS,L1_Avg_TTFT,L2_Avg_TTFT,L3_Avg_TTFT,L4_Avg_TTFT,L5_Avg_TTFT,L6_Avg_TTFT,L7_Avg_TTFT,L1_RRR,L2_RRR,L3_RRR,L4_RRR,L5_RRR,L6_RRR,L7_RRR,L1_SR,L2_SR,L3_SR,L4_SR,L5_SR,L6_SR,L7_SR,L1_EPR_CVR,L2_EPR_CVR,L3_EPR_CVR,L4_EPR_CVR,L5_EPR_CVR,L6_EPR_CVR,L7_EPR_CVR,L1_pass@k,L2_pass@k,L3_pass@k,L4_pass@k,L5_pass@k,L6_pass@k,L7_pass@k,L1_TooAcc,L1_ArgAcc,L1_CallEM,L1_RespOK,L2_SelectAcc,L3_FSM,L3_PSM,L3_ΔSteps_norm,L4_Coverage,L4_SourceEPR,L5_AdaptiveRoutingScore,L5_FallbackSR,L6_RedundantCallRate,L6_EffScore,L7_ContextRetention,L7_RefRecall
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
gemini-2.5-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
claude-haiku-4-5,Anthropic,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,5.18,9.9,14.65,21.61,18.33,3.69,4.22,4504.64,11367.93,23333.9,42628.5,13977.65,2732.53,7153.3,869.59,1148.23,1593.07,1972.65,762.46,741.38,1697.01,2.4328,3.2797,4.1784,5.2912,2.2585,3.6851,3.3065,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,1.0,1.0,0.9,0.65,0.8,0.7,1.0,1.0,1.0,1.0,0.2358,0.0,0.3,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,0.6136,0.2727,1.0,1.0,0.6,1.0,0.5,0.75,0.7389,0.2283,0.75,1.0,0.0,1.0,0.925
|
| 9 |
-
gemini-2.5-flash-lite,Google,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,1.62,2.83,1.55,5.72,3.74,1.66,2.97,1930.09,3337.87,5892.0,15236.2,1795.9,1572.73,2577.8,1188.63,1179.12,3797.73,2664.96,480.67,944.86,868.65,0.6444,0.9106,0.6729,1.1369,0.5226,0.7943,0.6945,1.0,1.0,1.0,1.0,0.9,1.0,0.4,1.0,0.8667,0.2,0.7,0.25,0.6,0.4,1.0,0.8667,0.275,0.6,0.1167,0.2,0.2,1.0,1.0,1.0,1.0,0.9,1.0,0.4,1.0,0.6364,0.2727,1.0,0.8667,0.1,0.2,0.1,0.35,0.35,0.125,0.25,1.0,0.1333,0.975,0.825
|
| 10 |
-
claude-sonnet-4-5,Anthropic,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,6.77,11.69,19.86,34.08,19.1,5.45,7.18,3215.09,5874.0,19958.4,60071.8,10702.45,2710.47,10297.8,474.96,502.51,1004.85,1762.73,560.27,497.52,1434.99,3.1551,5.243,5.9522,8.9693,3.4574,5.4468,4.6806,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,0.55,0.8,0.6,1.0,1.0,1.0,1.0,0.1742,0.0,0.4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6591,0.2727,1.0,1.0,0.9,1.0,0.75,0.75,0.75,0.1892,0.6,1.0,0.0,1.0,0.975
|
| 11 |
-
gpt-4o-mini,OpenAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,2.79,5.61,8.13,25.46,7.19,2.63,2.9,1389.55,4236.13,11772.4,11700.1,5203.7,1561.93,3940.3,498.7,755.34,1448.9,459.62,724.0,594.06,1357.18,1.2394,1.9904,2.5526,9.1994,0.9279,2.6286,2.1975,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,1.0,1.0,1.0,0.6,0.6667,0.5,1.0,0.8667,1.0,1.0,0.1946,0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6591,0.2727,1.0,1.0,0.5,0.9167,0.5,0.5833,0.5833,0.2171,0.75,1.0,0.0,0.925,0.975
|
| 12 |
-
gpt-5,OpenAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,5.64,11.23,14.48,24.59,19.64,9.31,10.15,2306.18,16867.2,19321.9,29718.7,10773.2,6753.07,9451.3,409.06,1501.34,1334.6,1208.62,548.57,725.02,931.01,2.4414,3.442,5.8573,7.5822,3.1615,5.978,5.431,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,0.9333,1.0,0.9,0.85,0.8667,0.8,1.0,1.0,0.7,0.7,0.2728,0.2,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7273,0.3636,1.0,1.0,0.1,0.5667,0.4,0.55,0.5333,0.3,0.85,1.0,0.1444,1.0,0.975
|
| 13 |
-
qwen3-next-80b-a3b,Alibaba,OSS,11,15,10,10,20,15,10,11,15,10,10,20,15,10,4.13,12.63,17.18,28.84,10.59,9.59,7.92,1937.82,4725.0,15345.8,22067.0,6512.1,2198.27,5761.5,469.0,374.15,893.49,765.08,615.2,229.2,727.4,1.907,5.8972,5.5666,10.0412,1.985,9.5896,5.561,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6,0.9333,0.7,1.0,1.0,1.0,1.0,0.2375,0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7727,0.4545,1.0,1.0,0.8,1.0,0.65,0.7,0.7,0.2542,0.7,1.0,0.0,0.975,0.95
|
| 14 |
-
gpt-5-mini,OpenAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,7.14,7.36,12.37,13.11,11.67,7.22,8.02,2963.73,4288.47,9704.4,8528.4,3510.45,2465.07,5810.8,414.91,582.29,784.64,650.71,300.9,341.21,724.39,3.4248,3.2995,5.2383,6.41,2.7195,6.5991,6.5065,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,0.9333,0.9,0.8,0.2,0.8667,1.0,1.0,0.8667,0.6,0.6,0.0917,0.0667,0.3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7045,0.3636,1.0,1.0,0.3,0.55,0.25,0.3667,0.3667,0.0917,0.2,1.0,0.0667,1.0,0.95
|
| 15 |
-
nova-lite,Amazon,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,3.29,7.72,12.08,18.88,11.81,5.05,3.2,2760.64,7563.27,17904.5,43855.6,12621.5,23029.87,6711.7,839.35,979.15,1482.74,2323.41,1068.7,4562.8,2094.59,1.4877,2.958,2.4853,4.0705,1.4959,2.0742,2.2498,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,1.0,0.5,0.9,0.3,0.8,0.4,1.0,1.0,1.0,1.0,0.1373,0.4667,0.4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5909,0.1818,1.0,1.0,0.5,0.85,0.45,0.5667,0.5667,0.1376,0.6,0.3,0.3133,0.725,0.675
|
| 16 |
-
gemini-2.5-pro,Google,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,10.88,11.9,23.24,19.5,23.03,7.52,9.7,2524.45,4880.93,3022.7,15671.5,4011.9,5005.8,9071.0,232.11,410.31,130.06,803.81,174.17,665.86,935.55,5.2265,5.6138,9.9988,8.3578,5.6094,4.8197,5.9149,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9333,0.1,0.6,0.3,0.7333,0.5,1.0,1.0,0.5,0.7,0.125,0.4,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7045,0.3636,1.0,1.0,0.0,0.2667,0.2,0.4667,0.4667,0.125,0.3,1.0,0.1333,0.875,0.85
|
|
|
|
| 1 |
+
Model,Vendor,Model Type,L1_Total_Tasks,L2_Total_Tasks,L3_Total_Tasks,L4_Total_Tasks,L5_Total_Tasks,L6_Total_Tasks,L7_Total_Tasks,L1_Evaluated_Tasks,L2_Evaluated_Tasks,L3_Evaluated_Tasks,L4_Evaluated_Tasks,L5_Evaluated_Tasks,L6_Evaluated_Tasks,L7_Evaluated_Tasks,L1_Avg_Exec_Time,L2_Avg_Exec_Time,L3_Avg_Exec_Time,L4_Avg_Exec_Time,L5_Avg_Exec_Time,L6_Avg_Exec_Time,L7_Avg_Exec_Time,L1_Avg_Tokens,L2_Avg_Tokens,L3_Avg_Tokens,L4_Avg_Tokens,L5_Avg_Tokens,L6_Avg_Tokens,L7_Avg_Tokens,L1_Avg_TPS,L2_Avg_TPS,L3_Avg_TPS,L4_Avg_TPS,L5_Avg_TPS,L6_Avg_TPS,L7_Avg_TPS,L1_Avg_TTFT,L2_Avg_TTFT,L3_Avg_TTFT,L4_Avg_TTFT,L5_Avg_TTFT,L6_Avg_TTFT,L7_Avg_TTFT,L1_RRR,L2_RRR,L3_RRR,L4_RRR,L5_RRR,L6_RRR,L7_RRR,L1_SR,L2_SR,L3_SR,L4_SR,L5_SR,L6_SR,L7_SR,L1_EPR_CVR,L2_EPR_CVR,L3_EPR_CVR,L4_EPR_CVR,L5_EPR_CVR,L6_EPR_CVR,L7_EPR_CVR,L1_pass@k,L2_pass@k,L3_pass@k,L4_pass@k,L5_pass@k,L6_pass@k,L7_pass@k,L1_TooAcc,L1_ArgAcc,L1_CallEM,L1_RespOK,L2_SelectAcc,L3_FSM,L3_PSM,L3_ΔSteps_norm,L3_ProvAcc,L4_Coverage,L4_SourceEPR,L5_AdaptiveRoutingScore,L5_FallbackSR,L6_ReuseRage,L6_RedundantCallRate,L6_EffScore,L7_ContextRetention,L7_RefRecall
|
| 2 |
+
kanana-1.5-8b-instruct-2505,Kakao,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,5.53,17.22,14.51,23.78,9.44,52.98,47.39,4556.36,6107.6,5723.4,7188.3,5665.9,28502.33,28738.1,823.46,354.62,394.38,302.24,599.94,538.01,606.41,1.5236,6.7827,5.9015,7.4927,1.4163,7.764,5.1605,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.8409,0.925,0.55,0.55,0.45,0.7167,0.4,1.0,1.0,1.0,0.9,0.225,1.0,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6364,0.2727,1.0,1.0,0.0,0.5333,0.0,0.0,0.2667,0.2667,0.225,0.45,0.4,1.0,0.6,0.825,0.75
|
| 3 |
+
skt_A.X-4.0-Light,SKT,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,5.15,17.37,21.51,9.06,9.23,38.97,33.94,4286.73,7456.1,13579.8,2284.9,6500.85,27744.0,25032.0,833.07,429.13,631.27,252.27,704.42,711.88,737.55,1.3615,5.8379,6.0725,6.2881,1.3627,5.3648,3.902,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5455,0.7417,0.525,0.35,0.2875,0.55,0.45,1.0,1.0,1.0,0.3,0.2583,0.8667,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.8182,0.4545,1.0,1.0,0.2,0.7833,0.65,0.1,0.05,0.05,0.25,0.55,0.4,1.0,0.4667,0.8,0.775
|
| 4 |
+
qwen3-8B,알리바바,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,24.54,33.11,38.89,61.09,46.28,102.03,92.19,5798.0,7600.07,8380.0,14758.8,9789.4,45946.13,55163.2,236.28,229.53,215.5,241.58,211.54,450.34,598.37,11.0876,13.3456,23.3045,16.4015,8.5784,16.7883,11.2336,1.0,1.0,0.9,0.9,1.0,1.0,1.0,0.5909,0.8083,0.175,0.35,0.45,0.7833,0.525,1.0,1.0,0.4,0.9,0.2258,1.0,0.95,1.0,1.0,0.9,0.8,0.9667,1.0,1.0,1.0,0.7955,0.4545,1.0,1.0,0.2,0.3,0.2,0.1,0.4667,0.4667,0.2333,0.55,0.2,1.0,0.5667,0.85,0.775
|
| 5 |
+
gemini-2.5-pro,Google,API,11,30,10,10,20,15,10,11,30,10,10,20,15,10,9.01,10.45,11.43,29.65,15.91,43.0,33.16,5257.45,5761.23,6384.2,22304.6,7592.2,54436.6,50150.6,583.2,551.49,558.73,752.35,477.25,1266.0,1512.44,4.6263,5.4812,7.9657,8.8433,4.9659,7.1894,5.2974,0.9091,0.8,0.8,1.0,0.8,0.8667,0.9,0.8409,0.6583,0.2,0.425,0.4,0.4,0.35,0.9091,0.7667,0.2,0.7,0.1583,0.8667,0.9,0.9091,0.8,0.8,1.0,0.8,0.8667,0.9,0.9091,0.6364,0.2727,0.9091,0.7667,0.1,0.1667,0.1,0.0,0.4833,0.4833,0.1583,0.35,0.5333,1.0,0.1222,0.825,0.7
|
| 6 |
+
Qwen3-4B-Instruct-2507,알리바바,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,6.66,22.89,14.8,51.19,11.71,86.63,60.09,5273.09,6447.9,9087.8,17502.5,5363.85,36058.4,37068.1,791.39,281.66,613.83,341.91,458.02,416.23,616.84,2.093,9.1244,4.4172,13.7638,1.8319,14.8681,8.245,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6364,0.6583,0.15,0.375,0.3,0.6167,0.425,1.0,1.0,1.0,0.9,0.15,1.0,1.0,1.0,1.0,1.0,0.9333,1.0,1.0,1.0,1.0,0.75,0.3636,1.0,1.0,0.2,0.6333,0.7,0.0,0.5167,0.5167,0.15,0.3,0.1333,1.0,0.4,0.875,0.8
|
| 7 |
+
Midm-2.0-Base-Instruct,KT,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,5.39,3.9,3.06,3.75,8.13,28.66,16.08,4185.82,2514.93,3418.3,2388.8,3084.5,22909.13,14079.1,775.89,644.46,1117.59,636.3,379.51,799.33,875.38,1.4775,1.8563,1.8855,1.6781,1.0824,1.6794,1.1356,1.0,1.0,1.0,1.0,0.95,1.0,1.0,0.5909,0.5167,0.25,0.325,0.275,0.4833,0.35,0.9091,0.5667,0.2,0.3,0.0667,0.9333,0.6,1.0,1.0,1.0,0.8667,0.9833,1.0,1.0,0.9091,0.6364,0.2727,1.0,0.5667,0.0,0.1,0.0,0.0,0.0,0.0,0.0667,0.15,0.0,0.9333,0.3,0.55,0.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
components/leaderboard_components.py
CHANGED
|
@@ -5,8 +5,8 @@ These are stable components that don't change frequently
|
|
| 5 |
|
| 6 |
def get_chart_colors():
|
| 7 |
return {
|
| 8 |
-
"Private": "#
|
| 9 |
-
"Open source": "#
|
| 10 |
"performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
|
| 11 |
"text": "white",
|
| 12 |
"background": "#01091A",
|
|
@@ -16,12 +16,10 @@ def get_chart_colors():
|
|
| 16 |
|
| 17 |
def get_rank_badge(rank):
|
| 18 |
"""Generate HTML for rank badge with appropriate styling"""
|
| 19 |
-
tag_background = "#593B1D"
|
| 20 |
-
tag_text_color = "#FFFFFF"
|
| 21 |
badge_styles = {
|
| 22 |
-
1: ("1st",
|
| 23 |
-
2: ("2nd",
|
| 24 |
-
3: ("3rd",
|
| 25 |
}
|
| 26 |
|
| 27 |
if rank in badge_styles:
|
|
@@ -61,25 +59,24 @@ def get_type_badge(model_type):
|
|
| 61 |
"""Generate HTML for model type badge"""
|
| 62 |
colors = get_chart_colors()
|
| 63 |
color_map = {
|
| 64 |
-
"Open source": colors.get("Open source", "#
|
| 65 |
-
"Proprietary": colors.get("Private", "#
|
| 66 |
-
"Private": colors.get("Private", "#
|
| 67 |
}
|
| 68 |
label_map = {
|
| 69 |
"Open source": "OSS",
|
| 70 |
"Proprietary": "API",
|
| 71 |
"Private": "API",
|
| 72 |
}
|
| 73 |
-
bg_color = color_map.get(model_type, "#
|
| 74 |
display_label = label_map.get(model_type, model_type)
|
| 75 |
-
text_color = "#111827" if display_label == "OSS" else "#FFFFFF"
|
| 76 |
return f"""
|
| 77 |
<div style="
|
| 78 |
display: inline-flex;
|
| 79 |
align-items: center;
|
| 80 |
padding: 4px 8px;
|
| 81 |
background: {bg_color};
|
| 82 |
-
color:
|
| 83 |
border-radius: 4px;
|
| 84 |
font-size: 0.85em;
|
| 85 |
font-weight: 500;
|
|
|
|
| 5 |
|
| 6 |
def get_chart_colors():
|
| 7 |
return {
|
| 8 |
+
"Private": "#1098F7", # Airglow Blue for Proprietary
|
| 9 |
+
"Open source": "#58BC82", # Green for Open source
|
| 10 |
"performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
|
| 11 |
"text": "white",
|
| 12 |
"background": "#01091A",
|
|
|
|
| 16 |
|
| 17 |
def get_rank_badge(rank):
|
| 18 |
"""Generate HTML for rank badge with appropriate styling"""
|
|
|
|
|
|
|
| 19 |
badge_styles = {
|
| 20 |
+
1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
|
| 21 |
+
2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
|
| 22 |
+
3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
|
| 23 |
}
|
| 24 |
|
| 25 |
if rank in badge_styles:
|
|
|
|
| 59 |
"""Generate HTML for model type badge"""
|
| 60 |
colors = get_chart_colors()
|
| 61 |
color_map = {
|
| 62 |
+
"Open source": colors.get("Open source", "#58BC82"),
|
| 63 |
+
"Proprietary": colors.get("Private", "#1098F7"),
|
| 64 |
+
"Private": colors.get("Private", "#1098F7"),
|
| 65 |
}
|
| 66 |
label_map = {
|
| 67 |
"Open source": "OSS",
|
| 68 |
"Proprietary": "API",
|
| 69 |
"Private": "API",
|
| 70 |
}
|
| 71 |
+
bg_color = color_map.get(model_type, "#4F46E5")
|
| 72 |
display_label = label_map.get(model_type, model_type)
|
|
|
|
| 73 |
return f"""
|
| 74 |
<div style="
|
| 75 |
display: inline-flex;
|
| 76 |
align-items: center;
|
| 77 |
padding: 4px 8px;
|
| 78 |
background: {bg_color};
|
| 79 |
+
color: white;
|
| 80 |
border-radius: 4px;
|
| 81 |
font-size: 0.85em;
|
| 82 |
font-weight: 500;
|
styles/leaderboard_styles.py
CHANGED
|
@@ -34,9 +34,9 @@ def get_leaderboard_css():
|
|
| 34 |
--border-subtle: rgba(245, 246, 247, 0.08);
|
| 35 |
--border-default: rgba(245, 246, 247, 0.12);
|
| 36 |
--border-strong: rgba(245, 246, 247, 0.2);
|
| 37 |
-
--text-primary: #
|
| 38 |
-
--text-secondary: #
|
| 39 |
-
--text-muted: #
|
| 40 |
--accent-primary: #ffd21e;
|
| 41 |
--accent-secondary: #1098F7;
|
| 42 |
--accent-tertiary: #F5F6F7;
|
|
@@ -44,38 +44,12 @@ def get_leaderboard_css():
|
|
| 44 |
--glow-secondary: rgba(16, 152, 247, 0.4);
|
| 45 |
--glow-tertiary: rgba(245, 246, 247, 0.3);
|
| 46 |
}
|
| 47 |
-
|
| 48 |
-
html.light,
|
| 49 |
-
html.light body,
|
| 50 |
-
html.light .gradio-container {
|
| 51 |
-
--bg-primary: #F8FAFC;
|
| 52 |
-
--bg-secondary: rgba(15, 23, 42, 0.06);
|
| 53 |
-
--bg-card: rgba(255, 255, 255, 0.92);
|
| 54 |
-
--border-subtle: rgba(15, 23, 42, 0.08);
|
| 55 |
-
--border-default: rgba(15, 23, 42, 0.12);
|
| 56 |
-
--border-strong: rgba(15, 23, 42, 0.18);
|
| 57 |
-
--text-primary: #0B1120;
|
| 58 |
-
--text-secondary: #1E293B;
|
| 59 |
-
--text-muted: #475569;
|
| 60 |
-
--accent-primary: #F59E0B;
|
| 61 |
-
--accent-secondary: #2563EB;
|
| 62 |
-
--accent-tertiary: #111827;
|
| 63 |
-
--glow-primary: rgba(245, 158, 11, 0.25);
|
| 64 |
-
--glow-secondary: rgba(37, 99, 235, 0.2);
|
| 65 |
-
--glow-tertiary: rgba(15, 23, 42, 0.18);
|
| 66 |
-
}
|
| 67 |
-
|
| 68 |
-
html.light [style*="color: white"],
|
| 69 |
-
html.light [style*="color:white"],
|
| 70 |
-
html.light [style*="#FFFFFF"],
|
| 71 |
-
html.light [style*="#ffffff"] {
|
| 72 |
-
color: var(--text-primary) !important;
|
| 73 |
-
}
|
| 74 |
|
| 75 |
/* Global font and background */
|
| 76 |
-
|
| 77 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important;
|
| 78 |
background: var(--bg-primary) !important;
|
|
|
|
| 79 |
}
|
| 80 |
|
| 81 |
/* Headers and text */
|
|
@@ -86,15 +60,18 @@ def get_leaderboard_css():
|
|
| 86 |
}
|
| 87 |
|
| 88 |
p, span, div, li, ul li {
|
|
|
|
| 89 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 90 |
}
|
| 91 |
|
| 92 |
/* Labels and info text */
|
| 93 |
label {
|
|
|
|
| 94 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 95 |
}
|
| 96 |
|
| 97 |
.gr-box label {
|
|
|
|
| 98 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 99 |
}
|
| 100 |
|
|
@@ -181,7 +158,7 @@ def get_leaderboard_css():
|
|
| 181 |
|
| 182 |
/* Radio button labels */
|
| 183 |
input[type="radio"] + label {
|
| 184 |
-
color:
|
| 185 |
}
|
| 186 |
|
| 187 |
input[type="radio"]:checked {
|
|
@@ -194,22 +171,26 @@ def get_leaderboard_css():
|
|
| 194 |
.dropdown {
|
| 195 |
border-color: var(--border-default) !important;
|
| 196 |
background: var(--bg-card) !important;
|
|
|
|
| 197 |
transition: all 0.2s ease !important;
|
| 198 |
}
|
| 199 |
|
| 200 |
/* Dropdown option styling */
|
| 201 |
.dropdown option {
|
| 202 |
background: var(--bg-card) !important;
|
|
|
|
| 203 |
}
|
| 204 |
|
| 205 |
/* Gradio dropdown specific styling */
|
| 206 |
.gradio-dropdown select,
|
| 207 |
.gradio-dropdown [role="combobox"],
|
| 208 |
.gradio-dropdown input {
|
|
|
|
| 209 |
background: var(--bg-card) !important;
|
| 210 |
}
|
| 211 |
|
| 212 |
.gradio-dropdown option {
|
|
|
|
| 213 |
background: var(--bg-card) !important;
|
| 214 |
}
|
| 215 |
|
|
@@ -229,16 +210,19 @@ def get_leaderboard_css():
|
|
| 229 |
overflow-y: auto !important;
|
| 230 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 231 |
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important;
|
|
|
|
| 232 |
}
|
| 233 |
|
| 234 |
/* Table cells and headers */
|
| 235 |
.dataframe td,
|
| 236 |
.dataframe th {
|
|
|
|
| 237 |
}
|
| 238 |
|
| 239 |
/* Button styling */
|
| 240 |
button {
|
| 241 |
background: var(--bg-card) !important;
|
|
|
|
| 242 |
border: 1px solid var(--border-default) !important;
|
| 243 |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
|
| 244 |
}
|
|
@@ -379,7 +363,7 @@ def get_leaderboard_css():
|
|
| 379 |
display: inline-block !important;
|
| 380 |
padding: 14px 28px !important;
|
| 381 |
background: #ffd21e !important;
|
| 382 |
-
color:
|
| 383 |
text-decoration: none !important;
|
| 384 |
border-radius: 16px !important;
|
| 385 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
|
@@ -398,7 +382,7 @@ def get_leaderboard_css():
|
|
| 398 |
transform: translateY(-3px) !important;
|
| 399 |
box-shadow: 0 12px 32px rgba(255, 210, 30, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important;
|
| 400 |
background: #ffd21e !important;
|
| 401 |
-
color:
|
| 402 |
text-decoration: none !important;
|
| 403 |
text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important;
|
| 404 |
}
|
|
@@ -440,176 +424,24 @@ def get_leaderboard_css():
|
|
| 440 |
border-color: #ffd21e !important;
|
| 441 |
box-shadow: 0 8px 24px rgba(255, 210, 30, 0.3), 0 4px 12px rgba(0, 0, 0, 0.4) !important;
|
| 442 |
text-decoration: none !important;
|
| 443 |
-
color: var(--text-primary) !important;
|
| 444 |
-
}
|
| 445 |
-
|
| 446 |
-
/* Ensure key hero/body text stays bright */
|
| 447 |
-
.hero-subtitle,
|
| 448 |
-
.section-lead,
|
| 449 |
-
.section-subtitle,
|
| 450 |
-
.criteria-card li,
|
| 451 |
-
.scenario-body,
|
| 452 |
-
.hero-action-button,
|
| 453 |
-
.hero-action-button span,
|
| 454 |
-
#lang-toggle-btn,
|
| 455 |
-
#lang-toggle-btn button {
|
| 456 |
-
color: #FFFFFF !important;
|
| 457 |
-
}
|
| 458 |
-
|
| 459 |
-
.secondary.svelte-1ixn6qd {
|
| 460 |
-
color: #FFFFFF !important;
|
| 461 |
-
}
|
| 462 |
-
|
| 463 |
-
/* Responsive adjustments */
|
| 464 |
-
@media (max-width: 1024px) {
|
| 465 |
-
.hero-title {
|
| 466 |
-
font-size: 4.5rem !important;
|
| 467 |
-
}
|
| 468 |
-
.hero-subtitle {
|
| 469 |
-
font-size: 1.6rem !important;
|
| 470 |
-
}
|
| 471 |
-
.hero-actions {
|
| 472 |
-
flex-wrap: wrap !important;
|
| 473 |
-
gap: 12px !important;
|
| 474 |
-
}
|
| 475 |
-
.performance-card {
|
| 476 |
-
padding: 24px !important;
|
| 477 |
-
}
|
| 478 |
-
.domain-selector-container {
|
| 479 |
-
padding: 24px !important;
|
| 480 |
-
}
|
| 481 |
-
.dashboard-section {
|
| 482 |
-
padding: 28px !important;
|
| 483 |
-
}
|
| 484 |
-
}
|
| 485 |
-
|
| 486 |
-
@media (max-width: 768px) {
|
| 487 |
-
.hero-banner-wrapper {
|
| 488 |
-
width: 100% !important;
|
| 489 |
-
margin: 0 0 16px 0 !important;
|
| 490 |
-
}
|
| 491 |
-
.hero-title {
|
| 492 |
-
font-size: 3.2rem !important;
|
| 493 |
-
}
|
| 494 |
-
.hero-subtitle {
|
| 495 |
-
font-size: 1.3rem !important;
|
| 496 |
-
}
|
| 497 |
-
.hero-actions {
|
| 498 |
-
flex-direction: column !important;
|
| 499 |
-
align-items: stretch !important;
|
| 500 |
-
}
|
| 501 |
-
.hero-action-button {
|
| 502 |
-
width: 100% !important;
|
| 503 |
-
justify-content: center !important;
|
| 504 |
-
}
|
| 505 |
-
.dashboard-section,
|
| 506 |
-
.domain-selector-container,
|
| 507 |
-
.performance-card {
|
| 508 |
-
margin: 20px 12px !important;
|
| 509 |
-
padding: 20px !important;
|
| 510 |
-
}
|
| 511 |
-
.performance-card .card-body {
|
| 512 |
-
grid-template-columns: 1fr !important;
|
| 513 |
-
gap: 20px !important;
|
| 514 |
-
}
|
| 515 |
-
.radar-slot {
|
| 516 |
-
width: 100% !important;
|
| 517 |
-
max-width: 260px !important;
|
| 518 |
-
margin: 0 auto !important;
|
| 519 |
-
}
|
| 520 |
-
.v2-table-container {
|
| 521 |
-
overflow-x: auto !important;
|
| 522 |
-
}
|
| 523 |
-
.v2-styled-table {
|
| 524 |
-
min-width: 720px !important;
|
| 525 |
-
}
|
| 526 |
-
.hero-actions svg {
|
| 527 |
-
width: 18px !important;
|
| 528 |
-
height: 18px !important;
|
| 529 |
-
}
|
| 530 |
-
.section-title {
|
| 531 |
-
font-size: 1.8rem !important;
|
| 532 |
-
}
|
| 533 |
-
.section-lead,
|
| 534 |
-
.section-subtitle {
|
| 535 |
-
font-size: 1rem !important;
|
| 536 |
-
}
|
| 537 |
-
.criteria-card {
|
| 538 |
-
padding: 16px !important;
|
| 539 |
-
}
|
| 540 |
-
.criteria-grid {
|
| 541 |
-
grid-template-columns: 1fr !important;
|
| 542 |
-
gap: 16px !important;
|
| 543 |
-
}
|
| 544 |
-
.phase-grid {
|
| 545 |
-
grid-template-columns: 1fr !important;
|
| 546 |
-
}
|
| 547 |
-
.hero-subtitle,
|
| 548 |
-
.section-lead,
|
| 549 |
-
.section-subtitle,
|
| 550 |
-
.criteria-card li,
|
| 551 |
-
.scenario-body {
|
| 552 |
-
text-align: left !important;
|
| 553 |
-
}
|
| 554 |
-
}
|
| 555 |
-
|
| 556 |
-
@media (max-width: 480px) {
|
| 557 |
-
.hero-title {
|
| 558 |
-
font-size: 2.4rem !important;
|
| 559 |
-
}
|
| 560 |
-
.hero-subtitle {
|
| 561 |
-
font-size: 1.1rem !important;
|
| 562 |
-
}
|
| 563 |
-
.hero-action-button {
|
| 564 |
-
font-size: 0.95rem !important;
|
| 565 |
-
padding: 10px 16px !important;
|
| 566 |
-
}
|
| 567 |
-
.performance-card {
|
| 568 |
-
padding: 18px !important;
|
| 569 |
-
}
|
| 570 |
-
.card-top-row {
|
| 571 |
-
flex-direction: column !important;
|
| 572 |
-
gap: 12px !important;
|
| 573 |
-
}
|
| 574 |
-
.rank-panel {
|
| 575 |
-
align-self: flex-start !important;
|
| 576 |
-
}
|
| 577 |
-
.model-selector-container,
|
| 578 |
-
.level-selector-container {
|
| 579 |
-
margin: 0 !important;
|
| 580 |
-
}
|
| 581 |
-
.hero-banner-wrapper {
|
| 582 |
-
margin-bottom: 12px !important;
|
| 583 |
-
}
|
| 584 |
-
}
|
| 585 |
-
}
|
| 586 |
-
|
| 587 |
-
/* Language toggle button */
|
| 588 |
-
#lang-toggle-btn button,
|
| 589 |
-
#lang-toggle-btn {
|
| 590 |
color: #FFFFFF !important;
|
| 591 |
-
border-color: #ffd21e !important;
|
| 592 |
-
}
|
| 593 |
-
|
| 594 |
-
.hero-action-button {
|
| 595 |
-
border-color: #ffd21e !important;
|
| 596 |
}
|
| 597 |
|
| 598 |
/* Numeric content styling */
|
| 599 |
.numeric-cell, .metric-value, .rank-value,
|
| 600 |
.level-tile-score, .core-metric-card .metric-value {
|
| 601 |
-
color:
|
| 602 |
font-family: 'Geist Mono', monospace !important;
|
| 603 |
}
|
| 604 |
|
| 605 |
/* Table content */
|
| 606 |
td, th, table * {
|
| 607 |
-
color:
|
| 608 |
}
|
| 609 |
|
| 610 |
/* All numeric and data elements */
|
| 611 |
.performance-card *, .v2-styled-table *, .dataframe * {
|
| 612 |
-
color:
|
| 613 |
}
|
| 614 |
|
| 615 |
/* Enhanced dropdown styling - more specific selectors
|
|
@@ -622,18 +454,20 @@ def get_leaderboard_css():
|
|
| 622 |
.model-dropdown [role="combobox"],
|
| 623 |
.model-dropdown button {
|
| 624 |
background: rgba(1, 9, 26, 0.95) !important;
|
|
|
|
| 625 |
border: 1px solid var(--border-default) !important;
|
| 626 |
border-radius: 8px !important;
|
| 627 |
}
|
| 628 |
-
|
| 629 |
.gradio-dropdown option,
|
| 630 |
.model-dropdown option {
|
| 631 |
background: rgba(1, 9, 26, 0.95) !important;
|
|
|
|
| 632 |
}
|
| 633 |
|
| 634 |
/* Force dropdown text color */
|
| 635 |
/* .gradio-dropdown *, .model-dropdown * {
|
| 636 |
-
color:
|
| 637 |
} */
|
| 638 |
|
| 639 |
/* Gradio 5.x compatible dropdown styling */
|
|
@@ -641,31 +475,22 @@ def get_leaderboard_css():
|
|
| 641 |
.gradio-container [data-testid="dropdown"],
|
| 642 |
.gradio-container select {
|
| 643 |
background-color: rgba(1, 9, 26, 0.95) !important;
|
|
|
|
| 644 |
border: 1px solid rgba(245, 246, 247, 0.12) !important;
|
| 645 |
}
|
| 646 |
-
|
| 647 |
.gradio-container .gradio-dropdown option,
|
| 648 |
.gradio-container select option {
|
| 649 |
background-color: rgba(1, 9, 26, 0.95) !important;
|
|
|
|
| 650 |
}
|
| 651 |
-
|
| 652 |
/* Target the actual visible text in dropdown */
|
| 653 |
.gradio-container [role="combobox"],
|
| 654 |
.gradio-container .gradio-dropdown .wrap > div {
|
|
|
|
| 655 |
background-color: rgba(1, 9, 26, 0.95) !important;
|
| 656 |
}
|
| 657 |
|
| 658 |
-
html.light .model-dropdown .gradio-dropdown,
|
| 659 |
-
html.light .model-dropdown [role="combobox"],
|
| 660 |
-
html.light .model-dropdown button,
|
| 661 |
-
html.light .gradio-container [data-testid="dropdown"],
|
| 662 |
-
html.light .gradio-container select,
|
| 663 |
-
html.light .gradio-container [role="combobox"],
|
| 664 |
-
html.light .gradio-container .gradio-dropdown .wrap > div {
|
| 665 |
-
background-color: rgba(255, 255, 255, 0.95) !important;
|
| 666 |
-
border-color: rgba(15, 23, 42, 0.12) !important;
|
| 667 |
-
box-shadow: 0 8px 20px rgba(15, 23, 42, 0.08) !important;
|
| 668 |
-
}
|
| 669 |
-
|
| 670 |
</style>
|
| 671 |
"""
|
|
|
|
| 34 |
--border-subtle: rgba(245, 246, 247, 0.08);
|
| 35 |
--border-default: rgba(245, 246, 247, 0.12);
|
| 36 |
--border-strong: rgba(245, 246, 247, 0.2);
|
| 37 |
+
--text-primary: #F5F6F7;
|
| 38 |
+
--text-secondary: #94A3B8;
|
| 39 |
+
--text-muted: #64748B;
|
| 40 |
--accent-primary: #ffd21e;
|
| 41 |
--accent-secondary: #1098F7;
|
| 42 |
--accent-tertiary: #F5F6F7;
|
|
|
|
| 44 |
--glow-secondary: rgba(16, 152, 247, 0.4);
|
| 45 |
--glow-tertiary: rgba(245, 246, 247, 0.3);
|
| 46 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
/* Global font and background */
|
| 49 |
+
.gradio-container {
|
| 50 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important;
|
| 51 |
background: var(--bg-primary) !important;
|
| 52 |
+
color: var(--text-primary) !important;
|
| 53 |
}
|
| 54 |
|
| 55 |
/* Headers and text */
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
p, span, div, li, ul li {
|
| 63 |
+
color: white !important;
|
| 64 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 65 |
}
|
| 66 |
|
| 67 |
/* Labels and info text */
|
| 68 |
label {
|
| 69 |
+
color: white !important;
|
| 70 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 71 |
}
|
| 72 |
|
| 73 |
.gr-box label {
|
| 74 |
+
color: white !important;
|
| 75 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 76 |
}
|
| 77 |
|
|
|
|
| 158 |
|
| 159 |
/* Radio button labels */
|
| 160 |
input[type="radio"] + label {
|
| 161 |
+
color: white !important;
|
| 162 |
}
|
| 163 |
|
| 164 |
input[type="radio"]:checked {
|
|
|
|
| 171 |
.dropdown {
|
| 172 |
border-color: var(--border-default) !important;
|
| 173 |
background: var(--bg-card) !important;
|
| 174 |
+
color: white !important;
|
| 175 |
transition: all 0.2s ease !important;
|
| 176 |
}
|
| 177 |
|
| 178 |
/* Dropdown option styling */
|
| 179 |
.dropdown option {
|
| 180 |
background: var(--bg-card) !important;
|
| 181 |
+
color: white !important;
|
| 182 |
}
|
| 183 |
|
| 184 |
/* Gradio dropdown specific styling */
|
| 185 |
.gradio-dropdown select,
|
| 186 |
.gradio-dropdown [role="combobox"],
|
| 187 |
.gradio-dropdown input {
|
| 188 |
+
color: white !important;
|
| 189 |
background: var(--bg-card) !important;
|
| 190 |
}
|
| 191 |
|
| 192 |
.gradio-dropdown option {
|
| 193 |
+
color: white !important;
|
| 194 |
background: var(--bg-card) !important;
|
| 195 |
}
|
| 196 |
|
|
|
|
| 210 |
overflow-y: auto !important;
|
| 211 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 212 |
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important;
|
| 213 |
+
color: white !important;
|
| 214 |
}
|
| 215 |
|
| 216 |
/* Table cells and headers */
|
| 217 |
.dataframe td,
|
| 218 |
.dataframe th {
|
| 219 |
+
color: white !important;
|
| 220 |
}
|
| 221 |
|
| 222 |
/* Button styling */
|
| 223 |
button {
|
| 224 |
background: var(--bg-card) !important;
|
| 225 |
+
color: white !important;
|
| 226 |
border: 1px solid var(--border-default) !important;
|
| 227 |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
|
| 228 |
}
|
|
|
|
| 363 |
display: inline-block !important;
|
| 364 |
padding: 14px 28px !important;
|
| 365 |
background: #ffd21e !important;
|
| 366 |
+
color: #FFFFFF !important;
|
| 367 |
text-decoration: none !important;
|
| 368 |
border-radius: 16px !important;
|
| 369 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
|
|
|
| 382 |
transform: translateY(-3px) !important;
|
| 383 |
box-shadow: 0 12px 32px rgba(255, 210, 30, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important;
|
| 384 |
background: #ffd21e !important;
|
| 385 |
+
color: #FFFFFF !important;
|
| 386 |
text-decoration: none !important;
|
| 387 |
text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important;
|
| 388 |
}
|
|
|
|
| 424 |
border-color: #ffd21e !important;
|
| 425 |
box-shadow: 0 8px 24px rgba(255, 210, 30, 0.3), 0 4px 12px rgba(0, 0, 0, 0.4) !important;
|
| 426 |
text-decoration: none !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
color: #FFFFFF !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
}
|
| 429 |
|
| 430 |
/* Numeric content styling */
|
| 431 |
.numeric-cell, .metric-value, .rank-value,
|
| 432 |
.level-tile-score, .core-metric-card .metric-value {
|
| 433 |
+
color: white !important;
|
| 434 |
font-family: 'Geist Mono', monospace !important;
|
| 435 |
}
|
| 436 |
|
| 437 |
/* Table content */
|
| 438 |
td, th, table * {
|
| 439 |
+
color: white !important;
|
| 440 |
}
|
| 441 |
|
| 442 |
/* All numeric and data elements */
|
| 443 |
.performance-card *, .v2-styled-table *, .dataframe * {
|
| 444 |
+
color: white !important;
|
| 445 |
}
|
| 446 |
|
| 447 |
/* Enhanced dropdown styling - more specific selectors
|
|
|
|
| 454 |
.model-dropdown [role="combobox"],
|
| 455 |
.model-dropdown button {
|
| 456 |
background: rgba(1, 9, 26, 0.95) !important;
|
| 457 |
+
color: white !important;
|
| 458 |
border: 1px solid var(--border-default) !important;
|
| 459 |
border-radius: 8px !important;
|
| 460 |
}
|
| 461 |
+
|
| 462 |
.gradio-dropdown option,
|
| 463 |
.model-dropdown option {
|
| 464 |
background: rgba(1, 9, 26, 0.95) !important;
|
| 465 |
+
color: white !important;
|
| 466 |
}
|
| 467 |
|
| 468 |
/* Force dropdown text color */
|
| 469 |
/* .gradio-dropdown *, .model-dropdown * {
|
| 470 |
+
color: white !important;
|
| 471 |
} */
|
| 472 |
|
| 473 |
/* Gradio 5.x compatible dropdown styling */
|
|
|
|
| 475 |
.gradio-container [data-testid="dropdown"],
|
| 476 |
.gradio-container select {
|
| 477 |
background-color: rgba(1, 9, 26, 0.95) !important;
|
| 478 |
+
color: white !important;
|
| 479 |
border: 1px solid rgba(245, 246, 247, 0.12) !important;
|
| 480 |
}
|
| 481 |
+
|
| 482 |
.gradio-container .gradio-dropdown option,
|
| 483 |
.gradio-container select option {
|
| 484 |
background-color: rgba(1, 9, 26, 0.95) !important;
|
| 485 |
+
color: white !important;
|
| 486 |
}
|
| 487 |
+
|
| 488 |
/* Target the actual visible text in dropdown */
|
| 489 |
.gradio-container [role="combobox"],
|
| 490 |
.gradio-container .gradio-dropdown .wrap > div {
|
| 491 |
+
color: white !important;
|
| 492 |
background-color: rgba(1, 9, 26, 0.95) !important;
|
| 493 |
}
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
</style>
|
| 496 |
"""
|
tabs/{leaderboard_v1_kr.py → leaderboard_v1.py}
RENAMED
|
@@ -53,8 +53,7 @@ def create_leaderboard_v2_tab():
|
|
| 53 |
|
| 54 |
# Clean and prepare data
|
| 55 |
df = df.copy()
|
| 56 |
-
|
| 57 |
-
numeric_candidate_cols = [col for col in df.columns if col not in exclude_cols]
|
| 58 |
for col in numeric_candidate_cols:
|
| 59 |
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 60 |
|
|
@@ -119,45 +118,36 @@ def create_leaderboard_v2_tab():
|
|
| 119 |
df['Call Validity'] = df[epr_cols].mean(axis=1)
|
| 120 |
|
| 121 |
# Use LLM Type from CSV directly, with mapping to display names
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
return None
|
| 135 |
-
|
| 136 |
-
# Prefer explicit type columns from the CSV, then fall back to vendor mapping
|
| 137 |
-
if 'Model Type' in df.columns:
|
| 138 |
-
df['Model Type'] = df['Model Type'].apply(normalize_model_type)
|
| 139 |
-
elif 'LLM Type' in df.columns:
|
| 140 |
-
df['Model Type'] = df['LLM Type'].apply(normalize_model_type)
|
| 141 |
else:
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
df['Model Type'] = df['Model Type'].fillna(df['Vendor'].map(vendor_model_type_map))
|
| 160 |
-
df['Model Type'] = df['Model Type'].fillna('Proprietary')
|
| 161 |
|
| 162 |
# Round numeric columns for better display
|
| 163 |
round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
|
|
@@ -234,36 +224,36 @@ def create_leaderboard_v2_tab():
|
|
| 234 |
# Level metadata for the 7-stage task framework
|
| 235 |
level_details = {
|
| 236 |
"ALL": {
|
| 237 |
-
"title": "
|
| 238 |
-
"description": "
|
| 239 |
},
|
| 240 |
"L1": {
|
| 241 |
-
"title": "<span style='color:
|
| 242 |
-
"description": "<span style='color:
|
| 243 |
},
|
| 244 |
"L2": {
|
| 245 |
-
"title": "<span style='color:
|
| 246 |
-
"description": "<span style='color:
|
| 247 |
},
|
| 248 |
"L3": {
|
| 249 |
-
"title": "<span style='color:
|
| 250 |
-
"description": "<span style='color:
|
| 251 |
},
|
| 252 |
"L4": {
|
| 253 |
-
"title": "<span style='color:
|
| 254 |
-
"description": "<span style='color:
|
| 255 |
},
|
| 256 |
"L5": {
|
| 257 |
-
"title": "<span style='color:
|
| 258 |
-
"description": "<span style='color:
|
| 259 |
},
|
| 260 |
"L6": {
|
| 261 |
-
"title": "<span style='color:
|
| 262 |
-
"description": "<span style='color:
|
| 263 |
},
|
| 264 |
"L7": {
|
| 265 |
-
"title": "<span style='color:
|
| 266 |
-
"description": "<span style='color:
|
| 267 |
}
|
| 268 |
}
|
| 269 |
default_level = "ALL"
|
|
@@ -301,7 +291,7 @@ def create_leaderboard_v2_tab():
|
|
| 301 |
border-collapse: collapse;
|
| 302 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 303 |
background: var(--bg-card);
|
| 304 |
-
color:
|
| 305 |
}
|
| 306 |
|
| 307 |
.v2-styled-table thead {
|
|
@@ -315,7 +305,7 @@ def create_leaderboard_v2_tab():
|
|
| 315 |
padding: 14px 12px;
|
| 316 |
text-align: left;
|
| 317 |
font-weight: 600;
|
| 318 |
-
color:
|
| 319 |
border-bottom: 2px solid var(--accent-primary);
|
| 320 |
font-size: 13px;
|
| 321 |
text-transform: uppercase;
|
|
@@ -329,7 +319,7 @@ def create_leaderboard_v2_tab():
|
|
| 329 |
.v2-styled-table td {
|
| 330 |
padding: 12px;
|
| 331 |
border-bottom: 1px solid var(--border-subtle);
|
| 332 |
-
color:
|
| 333 |
transition: all 0.2s ease;
|
| 334 |
}
|
| 335 |
|
|
@@ -349,30 +339,30 @@ def create_leaderboard_v2_tab():
|
|
| 349 |
|
| 350 |
.model-name {
|
| 351 |
font-weight: 500;
|
| 352 |
-
color:
|
| 353 |
transition: color 0.2s ease;
|
| 354 |
}
|
| 355 |
|
| 356 |
/* Keep model name color consistent on hover to emphasize row highlight */
|
| 357 |
.v2-styled-table tr:hover .model-name {
|
| 358 |
-
color:
|
| 359 |
}
|
| 360 |
|
| 361 |
.numeric-cell {
|
| 362 |
font-family: 'Geist Mono', monospace;
|
| 363 |
font-size: 13px;
|
| 364 |
text-align: center;
|
| 365 |
-
color:
|
| 366 |
}
|
| 367 |
|
| 368 |
.highlight-header {
|
| 369 |
background: rgba(255, 210, 30, 0.14);
|
| 370 |
-
color:
|
| 371 |
}
|
| 372 |
|
| 373 |
.highlight-cell {
|
| 374 |
background: rgba(255, 210, 30, 0.08);
|
| 375 |
-
color:
|
| 376 |
font-weight: 600;
|
| 377 |
}
|
| 378 |
</style>
|
|
@@ -470,8 +460,8 @@ def create_leaderboard_v2_tab():
|
|
| 470 |
return f"""
|
| 471 |
<div class="domain-selector-container leaderboard-intro">
|
| 472 |
<div class="domain-header">
|
| 473 |
-
<h2 class="domain-title" style="color:
|
| 474 |
-
<p class="domain-subtitle" style="color:
|
| 475 |
</div>
|
| 476 |
<div class="dataframe-container">
|
| 477 |
"""
|
|
@@ -521,14 +511,6 @@ def create_leaderboard_v2_tab():
|
|
| 521 |
# Load initial data
|
| 522 |
initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
|
| 523 |
initial_df = load_leaderboard_data() # Load raw data for model selector
|
| 524 |
-
if not initial_df.empty:
|
| 525 |
-
overall_success_numeric = pd.to_numeric(initial_df.get('Overall Success'), errors='coerce')
|
| 526 |
-
if overall_success_numeric.notna().any():
|
| 527 |
-
initial_df = initial_df.assign(**{'Overall Success': overall_success_numeric}).sort_values(
|
| 528 |
-
'Overall Success', ascending=False, na_position='last'
|
| 529 |
-
)
|
| 530 |
-
else:
|
| 531 |
-
initial_df = initial_df.sort_values('Model')
|
| 532 |
initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else []
|
| 533 |
initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else []
|
| 534 |
initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)
|
|
@@ -754,14 +736,12 @@ def create_leaderboard_v2_tab():
|
|
| 754 |
# Header styles and navigation
|
| 755 |
gr.HTML("""
|
| 756 |
<style>
|
| 757 |
-
@import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
|
| 758 |
-
|
| 759 |
/* Enhanced button styling with better gradio compatibility */
|
| 760 |
.header-action-button {
|
| 761 |
display: inline-block !important;
|
| 762 |
padding: 14px 28px !important;
|
| 763 |
background: #ffd21e !important;
|
| 764 |
-
color:
|
| 765 |
text-decoration: none !important;
|
| 766 |
border-radius: 16px !important;
|
| 767 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
|
@@ -795,7 +775,7 @@ def create_leaderboard_v2_tab():
|
|
| 795 |
transform: translateY(-3px) !important;
|
| 796 |
box-shadow: 0 12px 32px rgba(255, 210, 30, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important;
|
| 797 |
background: #ffd21e !important;
|
| 798 |
-
color:
|
| 799 |
text-decoration: none !important;
|
| 800 |
text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important;
|
| 801 |
}
|
|
@@ -810,52 +790,34 @@ def create_leaderboard_v2_tab():
|
|
| 810 |
filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3));
|
| 811 |
}
|
| 812 |
|
| 813 |
-
.hero-banner-wrapper {
|
| 814 |
-
position: relative;
|
| 815 |
-
width: 100vw;
|
| 816 |
-
margin: 0 calc(-50vw + 50%) 20px calc(-50vw + 50%);
|
| 817 |
-
border-radius: 0 !important;
|
| 818 |
-
overflow: hidden !important;
|
| 819 |
-
box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25) !important;
|
| 820 |
-
}
|
| 821 |
-
|
| 822 |
-
.hero-banner-wrapper::before {
|
| 823 |
-
content: "";
|
| 824 |
-
position: absolute;
|
| 825 |
-
inset: 0;
|
| 826 |
-
background: #01091A;
|
| 827 |
-
z-index: 0;
|
| 828 |
-
}
|
| 829 |
-
|
| 830 |
#hero-banner {
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
|
|
|
| 835 |
}
|
| 836 |
-
|
| 837 |
#hero-banner img {
|
| 838 |
-
width: 100
|
| 839 |
-
height: auto
|
| 840 |
-
display: block
|
| 841 |
-
object-fit: cover !important;
|
| 842 |
}
|
| 843 |
|
| 844 |
.hero-title {
|
| 845 |
-
font-size:
|
| 846 |
font-weight: 800;
|
| 847 |
line-height: 1.1;
|
| 848 |
background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%);
|
| 849 |
-webkit-background-clip: text;
|
| 850 |
-webkit-text-fill-color: transparent;
|
| 851 |
margin-bottom: 1rem;
|
| 852 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 853 |
}
|
| 854 |
|
| 855 |
.hero-subtitle {
|
| 856 |
color: var(--text-secondary);
|
| 857 |
-
font-size:
|
| 858 |
-
font-family: '
|
| 859 |
margin-top: 0;
|
| 860 |
}
|
| 861 |
|
|
@@ -914,7 +876,6 @@ def create_leaderboard_v2_tab():
|
|
| 914 |
box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25);
|
| 915 |
backdrop-filter: blur(12px);
|
| 916 |
-webkit-backdrop-filter: blur(12px);
|
| 917 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 918 |
}
|
| 919 |
|
| 920 |
.dashboard-section.emphasized {
|
|
@@ -934,16 +895,15 @@ def create_leaderboard_v2_tab():
|
|
| 934 |
}
|
| 935 |
|
| 936 |
.section-title {
|
| 937 |
-
font-size:
|
| 938 |
font-weight: 700;
|
| 939 |
color: var(--text-primary);
|
| 940 |
margin-bottom: 12px;
|
| 941 |
text-align: center !important;
|
| 942 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 943 |
}
|
| 944 |
|
| 945 |
.section-lead, .section-subtitle {
|
| 946 |
-
font-size: 1.
|
| 947 |
color: var(--text-secondary);
|
| 948 |
max-width: 720px;
|
| 949 |
margin: 0 auto 24px auto;
|
|
@@ -952,7 +912,6 @@ def create_leaderboard_v2_tab():
|
|
| 952 |
word-break: keep-all;
|
| 953 |
white-space: normal;
|
| 954 |
display: block;
|
| 955 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 956 |
}
|
| 957 |
|
| 958 |
.phase-grid {
|
|
@@ -970,11 +929,10 @@ def create_leaderboard_v2_tab():
|
|
| 970 |
}
|
| 971 |
|
| 972 |
.phase-card h3 {
|
| 973 |
-
font-size: 1.
|
| 974 |
color: var(--text-primary);
|
| 975 |
margin-bottom: 20px;
|
| 976 |
font-weight: 700;
|
| 977 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 978 |
}
|
| 979 |
|
| 980 |
.phase-chart {
|
|
@@ -1002,26 +960,11 @@ def create_leaderboard_v2_tab():
|
|
| 1002 |
|
| 1003 |
.phase-chart span {
|
| 1004 |
position: relative;
|
| 1005 |
-
font-size: 1.
|
| 1006 |
font-weight: 700;
|
| 1007 |
-
color: var(--text-primary)
|
| 1008 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 1009 |
-
}
|
| 1010 |
-
|
| 1011 |
-
/* 추가적인 구체적 선택자 */
|
| 1012 |
-
.phase-card .phase-chart span {
|
| 1013 |
-
color: var(--text-primary) !important;
|
| 1014 |
-
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8) !important;
|
| 1015 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 1016 |
-
}
|
| 1017 |
-
|
| 1018 |
-
.phase-grid .phase-chart span {
|
| 1019 |
-
color: var(--text-primary) !important;
|
| 1020 |
-
z-index: 10 !important;
|
| 1021 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 1022 |
}
|
| 1023 |
|
| 1024 |
-
|
| 1025 |
.phase-list {
|
| 1026 |
list-style: none;
|
| 1027 |
padding: 0;
|
|
@@ -1036,8 +979,7 @@ def create_leaderboard_v2_tab():
|
|
| 1036 |
background: rgba(245, 246, 247, 0.05);
|
| 1037 |
border: 1px solid rgba(245, 246, 247, 0.08);
|
| 1038 |
color: var(--text-secondary);
|
| 1039 |
-
font-size:
|
| 1040 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 1041 |
}
|
| 1042 |
|
| 1043 |
.scenario-body {
|
|
@@ -1100,7 +1042,7 @@ def create_leaderboard_v2_tab():
|
|
| 1100 |
/* Responsive design */
|
| 1101 |
@media (max-width: 768px) {
|
| 1102 |
.hero-title {
|
| 1103 |
-
font-size:
|
| 1104 |
}
|
| 1105 |
.hero-action-button {
|
| 1106 |
width: 100% !important;
|
|
@@ -1124,7 +1066,7 @@ def create_leaderboard_v2_tab():
|
|
| 1124 |
gap: 8px;
|
| 1125 |
}
|
| 1126 |
.section-title {
|
| 1127 |
-
font-size:
|
| 1128 |
}
|
| 1129 |
.phase-chart {
|
| 1130 |
width: 100px;
|
|
@@ -1138,15 +1080,13 @@ def create_leaderboard_v2_tab():
|
|
| 1138 |
</style>
|
| 1139 |
""")
|
| 1140 |
|
| 1141 |
-
gr.HTML("<div class='hero-banner-wrapper'>")
|
| 1142 |
gr.Image(
|
| 1143 |
-
value="
|
| 1144 |
show_label=False,
|
| 1145 |
interactive=False,
|
| 1146 |
type="filepath",
|
| 1147 |
elem_id="hero-banner"
|
| 1148 |
)
|
| 1149 |
-
gr.HTML("</div>")
|
| 1150 |
|
| 1151 |
gr.HTML("""
|
| 1152 |
<div style="text-align: center; padding: 20px 0;">
|
|
@@ -1159,35 +1099,35 @@ def create_leaderboard_v2_tab():
|
|
| 1159 |
gr.HTML("""
|
| 1160 |
<div class="hero-actions">
|
| 1161 |
<a href="https://hugging-face-krew.github.io/" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1162 |
-
<svg viewBox="0 0 24 24" fill="none" stroke="
|
| 1163 |
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
| 1164 |
<line x1="8" y1="12" x2="16" y2="12"/>
|
| 1165 |
</svg>
|
| 1166 |
-
<span
|
| 1167 |
</a>
|
| 1168 |
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1169 |
-
<svg viewBox="0 0 24 24" fill="none" stroke="
|
| 1170 |
<path d="M9 19c-5 1.5-5-2.5-7-3"/>
|
| 1171 |
<path d="M20 21v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
| 1172 |
</svg>
|
| 1173 |
<span>GitHub</span>
|
| 1174 |
</a>
|
| 1175 |
<a href="https://huggingface.co/datasets/huggingface-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1176 |
-
<svg viewBox="0 0 24 24" fill="none" stroke="
|
| 1177 |
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
|
| 1178 |
<polyline points="7 10 12 15 17 10"/>
|
| 1179 |
<line x1="12" y1="15" x2="12" y2="3"/>
|
| 1180 |
</svg>
|
| 1181 |
-
<span
|
| 1182 |
</a>
|
| 1183 |
-
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench
|
| 1184 |
-
<svg viewBox="0 0 24 24" fill="none" stroke="
|
| 1185 |
<path d="M3 3v18h18"/>
|
| 1186 |
<path d="M7 17v-6"/>
|
| 1187 |
<path d="M12 17V7"/>
|
| 1188 |
<path d="M17 17v-3"/>
|
| 1189 |
</svg>
|
| 1190 |
-
<span
|
| 1191 |
</a>
|
| 1192 |
</div>
|
| 1193 |
""")
|
|
@@ -1196,31 +1136,31 @@ def create_leaderboard_v2_tab():
|
|
| 1196 |
gr.HTML("""
|
| 1197 |
<div class="dashboard-section">
|
| 1198 |
<div class="section-header">
|
| 1199 |
-
<h2 class="section-title"
|
| 1200 |
</div>
|
| 1201 |
<p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 720px; line-height: 1.7; word-break: keep-all;">단순 도구 호출부터 장기적 맥락 능력, 강건성 처리 능력까지 에이전트의 능력을 7단계로 입체적으로 분석하였습니다.</p>
|
| 1202 |
<div class="phase-grid">
|
| 1203 |
<div class="phase-card">
|
| 1204 |
-
<h3
|
| 1205 |
<div class="phase-chart" style="--progress:80%;">
|
| 1206 |
-
<span
|
| 1207 |
</div>
|
| 1208 |
<ul class="phase-list">
|
| 1209 |
-
<li style="color:
|
| 1210 |
-
<li style="color:
|
| 1211 |
-
<li style="color:
|
| 1212 |
-
<li style="color:
|
| 1213 |
-
<li style="color:
|
| 1214 |
</ul>
|
| 1215 |
</div>
|
| 1216 |
<div class="phase-card">
|
| 1217 |
-
<h3
|
| 1218 |
<div class="phase-chart" style="--progress:20%;">
|
| 1219 |
-
<span
|
| 1220 |
</div>
|
| 1221 |
<ul class="phase-list">
|
| 1222 |
-
<li style="color:
|
| 1223 |
-
<li style="color:
|
| 1224 |
</ul>
|
| 1225 |
</div>
|
| 1226 |
</div>
|
|
@@ -1231,21 +1171,20 @@ def create_leaderboard_v2_tab():
|
|
| 1231 |
gr.HTML("""
|
| 1232 |
<div class="dashboard-section emphasized">
|
| 1233 |
<div class="section-header">
|
| 1234 |
-
<h2 class="section-title"
|
| 1235 |
</div>
|
| 1236 |
<div class="scenario-body">
|
| 1237 |
-
<p
|
| 1238 |
</div>
|
| 1239 |
-
|
| 1240 |
-
</div>
|
| 1241 |
<div class="section-flow">⌄</div>
|
|
|
|
| 1242 |
""")
|
| 1243 |
|
| 1244 |
# Section 3: 핵심 평가 기준
|
| 1245 |
gr.HTML("""
|
| 1246 |
<div class="dashboard-section">
|
| 1247 |
<div class="section-header">
|
| 1248 |
-
<h2 class="section-title"
|
| 1249 |
</div>
|
| 1250 |
<div class="criteria-grid">
|
| 1251 |
<div class="criteria-card">
|
|
@@ -1279,8 +1218,6 @@ def create_leaderboard_v2_tab():
|
|
| 1279 |
# Domain filter section with enhanced styling
|
| 1280 |
gr.HTML("""
|
| 1281 |
<style>
|
| 1282 |
-
@import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
|
| 1283 |
-
|
| 1284 |
/* Enhanced domain selector styling */
|
| 1285 |
.domain-selector-container {
|
| 1286 |
background: #ffd21e0d;
|
|
@@ -1383,11 +1320,10 @@ def create_leaderboard_v2_tab():
|
|
| 1383 |
-webkit-background-clip: text;
|
| 1384 |
background-clip: text;
|
| 1385 |
-webkit-text-fill-color: transparent;
|
| 1386 |
-
text-shadow: 0 0
|
| 1387 |
-
filter: drop-shadow(0 0
|
| 1388 |
letter-spacing: 0.02em;
|
| 1389 |
-
animation: title-shimmer
|
| 1390 |
-
font-family: 'Nanum Gothic', sans-serif !important;
|
| 1391 |
}
|
| 1392 |
|
| 1393 |
@keyframes title-shimmer {
|
|
@@ -1675,8 +1611,8 @@ def create_leaderboard_v2_tab():
|
|
| 1675 |
|
| 1676 |
.model-dropdown select,
|
| 1677 |
.model-dropdown [role="combobox"] {
|
| 1678 |
-
background:
|
| 1679 |
-
border: 1px solid
|
| 1680 |
border-radius: 999px !important;
|
| 1681 |
padding: 12px 24px !important;
|
| 1682 |
color: var(--text-primary) !important;
|
|
@@ -1707,8 +1643,8 @@ def create_leaderboard_v2_tab():
|
|
| 1707 |
gap: 8px !important;
|
| 1708 |
width: 100% !important;
|
| 1709 |
padding: 12px 24px !important;
|
| 1710 |
-
background:
|
| 1711 |
-
border: 1px solid
|
| 1712 |
border-radius: 999px !important;
|
| 1713 |
color: var(--text-primary) !important;
|
| 1714 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
|
@@ -1765,7 +1701,7 @@ def create_leaderboard_v2_tab():
|
|
| 1765 |
background: #ffd21e !important;
|
| 1766 |
border: 1px solid rgba(255, 210, 30, 0.6) !important;
|
| 1767 |
border-radius: 999px !important;
|
| 1768 |
-
color:
|
| 1769 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 1770 |
font-weight: 600 !important;
|
| 1771 |
font-size: 0.95rem !important;
|
|
@@ -1812,7 +1748,7 @@ def create_leaderboard_v2_tab():
|
|
| 1812 |
font-size: 1.5rem;
|
| 1813 |
margin-bottom: 4px;
|
| 1814 |
display: block;
|
| 1815 |
-
filter: drop-shadow(0 0 10px
|
| 1816 |
}
|
| 1817 |
|
| 1818 |
.domain-name {
|
|
@@ -1827,7 +1763,7 @@ def create_leaderboard_v2_tab():
|
|
| 1827 |
top: 8px;
|
| 1828 |
right: 8px;
|
| 1829 |
background: var(--accent-primary);
|
| 1830 |
-
color:
|
| 1831 |
font-size: 0.75rem;
|
| 1832 |
padding: 2px 8px;
|
| 1833 |
border-radius: 12px;
|
|
@@ -1999,147 +1935,92 @@ def create_leaderboard_v2_tab():
|
|
| 1999 |
padding: 12px 20px !important;
|
| 2000 |
font-size: 0.95rem !important;
|
| 2001 |
}
|
| 2002 |
-
|
| 2003 |
-
/* Leaderboard controls row styling */
|
| 2004 |
-
.leaderboard-controls-row {
|
| 2005 |
-
margin: 20px 0 !important;
|
| 2006 |
-
padding: 20px !important;
|
| 2007 |
-
background: transparent !important;
|
| 2008 |
-
border: none !important;
|
| 2009 |
-
gap: 40px !important;
|
| 2010 |
-
}
|
| 2011 |
-
|
| 2012 |
-
.leaderboard-controls-row .gr-column,
|
| 2013 |
-
.leaderboard-controls-row .gr-row,
|
| 2014 |
-
.leaderboard-controls-row .gr-box,
|
| 2015 |
-
.leaderboard-controls-row .gradio-column,
|
| 2016 |
-
.leaderboard-controls-row .gradio-row,
|
| 2017 |
-
.leaderboard-controls-row .gradio-group {
|
| 2018 |
-
background: transparent !important;
|
| 2019 |
-
border: none !important;
|
| 2020 |
-
box-shadow: none !important;
|
| 2021 |
-
padding: 0 !important;
|
| 2022 |
-
}
|
| 2023 |
-
|
| 2024 |
-
/* Remove all container backgrounds for leaderboard controls */
|
| 2025 |
-
.leaderboard-controls-row * {
|
| 2026 |
-
background-color: transparent !important;
|
| 2027 |
-
background-image: none !important;
|
| 2028 |
-
border: none !important;
|
| 2029 |
-
box-shadow: none !important;
|
| 2030 |
-
}
|
| 2031 |
-
|
| 2032 |
-
.leaderboard-controls-row .inline-radio,
|
| 2033 |
-
.leaderboard-controls-row .domain-radio {
|
| 2034 |
-
background: transparent !important;
|
| 2035 |
-
border: none !important;
|
| 2036 |
-
box-shadow: none !important;
|
| 2037 |
-
}
|
| 2038 |
-
|
| 2039 |
-
/* Inline radio styling for integrated controls */
|
| 2040 |
-
.inline-radio {
|
| 2041 |
-
background: transparent !important;
|
| 2042 |
-
border: none !important;
|
| 2043 |
-
box-shadow: none !important;
|
| 2044 |
-
padding: 0 !important;
|
| 2045 |
-
}
|
| 2046 |
-
|
| 2047 |
-
.inline-radio .wrap {
|
| 2048 |
-
display: flex !important;
|
| 2049 |
-
gap: 8px !important;
|
| 2050 |
-
flex-wrap: wrap !important;
|
| 2051 |
-
justify-content: flex-start !important;
|
| 2052 |
-
background: transparent !important;
|
| 2053 |
-
border: none !important;
|
| 2054 |
-
box-shadow: none !important;
|
| 2055 |
-
padding: 0 !important;
|
| 2056 |
-
}
|
| 2057 |
-
|
| 2058 |
-
.inline-radio label {
|
| 2059 |
-
padding: 8px 16px !important;
|
| 2060 |
-
background: rgba(245, 246, 247, 0.06) !important;
|
| 2061 |
-
border: 1px solid var(--border-subtle) !important;
|
| 2062 |
-
border-radius: 20px !important;
|
| 2063 |
-
font-size: 0.85rem !important;
|
| 2064 |
-
color: var(--text-primary) !important;
|
| 2065 |
-
transition: all 0.2s ease !important;
|
| 2066 |
-
cursor: pointer !important;
|
| 2067 |
-
}
|
| 2068 |
-
|
| 2069 |
-
.inline-radio label:hover {
|
| 2070 |
-
background: rgba(255, 210, 30, 0.12) !important;
|
| 2071 |
-
border-color: var(--accent-primary) !important;
|
| 2072 |
-
}
|
| 2073 |
-
|
| 2074 |
-
.inline-radio input[type="radio"]:checked + label,
|
| 2075 |
-
.inline-radio label[aria-checked="true"] {
|
| 2076 |
-
background: rgba(255, 210, 30, 0.2) !important;
|
| 2077 |
-
border-color: var(--accent-primary) !important;
|
| 2078 |
-
color: var(--text-primary) !important;
|
| 2079 |
-
font-weight: 600 !important;
|
| 2080 |
-
}
|
| 2081 |
</style>
|
| 2082 |
|
| 2083 |
""")
|
| 2084 |
|
| 2085 |
level_options = list(level_details.keys())
|
| 2086 |
|
| 2087 |
-
|
| 2088 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2089 |
|
| 2090 |
-
#
|
| 2091 |
-
gr.
|
| 2092 |
-
|
| 2093 |
-
|
| 2094 |
-
|
| 2095 |
-
|
| 2096 |
-
|
| 2097 |
-
|
| 2098 |
-
elem_classes=["
|
| 2099 |
-
|
| 2100 |
-
|
| 2101 |
-
|
| 2102 |
-
|
| 2103 |
-
|
| 2104 |
-
|
| 2105 |
-
|
| 2106 |
-
|
| 2107 |
-
|
| 2108 |
-
|
| 2109 |
-
|
| 2110 |
-
|
| 2111 |
-
|
| 2112 |
-
|
| 2113 |
-
|
| 2114 |
-
|
| 2115 |
-
|
| 2116 |
-
|
| 2117 |
-
|
| 2118 |
-
|
| 2119 |
-
|
| 2120 |
-
|
|
|
|
| 2121 |
|
| 2122 |
leaderboard_table = gr.HTML(initial_table)
|
| 2123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2124 |
# Radar Chart Section
|
| 2125 |
gr.HTML("""
|
| 2126 |
<div class="domain-selector-container domain-performance-container">
|
| 2127 |
<div class="domain-header">
|
| 2128 |
-
<h2 class="domain-title" style="color:
|
| 2129 |
-
<p class="domain-subtitle" style="color:
|
| 2130 |
</div>
|
| 2131 |
""")
|
| 2132 |
-
|
| 2133 |
-
gr.
|
| 2134 |
-
|
| 2135 |
-
|
| 2136 |
-
|
| 2137 |
-
|
| 2138 |
-
|
| 2139 |
-
|
| 2140 |
-
|
| 2141 |
-
|
| 2142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2143 |
|
| 2144 |
# Radar chart plot - wrapped in centered container
|
| 2145 |
gr.HTML('<div class="chart-container radar-chart-container">')
|
|
@@ -2155,29 +2036,292 @@ def create_leaderboard_v2_tab():
|
|
| 2155 |
|
| 2156 |
gr.HTML("</div>")
|
| 2157 |
|
| 2158 |
-
|
| 2159 |
-
|
| 2160 |
-
|
| 2161 |
-
|
| 2162 |
-
|
| 2163 |
-
|
| 2164 |
-
|
| 2165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2166 |
|
| 2167 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2168 |
df = load_leaderboard_data()
|
| 2169 |
-
|
|
|
|
| 2170 |
|
| 2171 |
-
|
| 2172 |
-
|
| 2173 |
-
|
| 2174 |
-
</div>"""
|
| 2175 |
|
| 2176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2177 |
|
| 2178 |
-
#
|
| 2179 |
-
|
| 2180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2181 |
df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
|
| 2182 |
df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
|
| 2183 |
try:
|
|
@@ -2308,22 +2452,18 @@ def create_leaderboard_v2_tab():
|
|
| 2308 |
gr.HTML("""
|
| 2309 |
<div class="domain-selector-container performance-card-container">
|
| 2310 |
<div class="domain-header">
|
| 2311 |
-
<h2 class="domain-title" style="color:
|
| 2312 |
-
<p class="domain-subtitle" style="color:
|
| 2313 |
-
모델의 성능 스펙트럼을 6대 핵심 지표와 L1~L7 단계별 종합 성공률(SR)로 시각화한 정밀 분석 카드를 확인해보세요.
|
| 2314 |
-
</p>
|
| 2315 |
-
<p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
|
| 2316 |
-
※ Rank는 L1~L7 단계별 SR의 평균값을 기준으로 선정되었습니다.
|
| 2317 |
-
</p>
|
| 2318 |
</div>
|
| 2319 |
-
|
| 2320 |
<div class="performance-card-content">
|
| 2321 |
""")
|
| 2322 |
-
|
| 2323 |
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
|
| 2324 |
gr.HTML("""
|
| 2325 |
-
<
|
| 2326 |
-
|
|
|
|
|
|
|
| 2327 |
""")
|
| 2328 |
card_model_selector = gr.Dropdown(
|
| 2329 |
choices=initial_df['Model'].tolist(),
|
|
@@ -2331,10 +2471,10 @@ def create_leaderboard_v2_tab():
|
|
| 2331 |
label="",
|
| 2332 |
info=None,
|
| 2333 |
container=False,
|
| 2334 |
-
|
| 2335 |
)
|
| 2336 |
download_card_btn = gr.Button(
|
| 2337 |
-
"PNG
|
| 2338 |
elem_id="download-card-btn",
|
| 2339 |
elem_classes=["pill-button"]
|
| 2340 |
)
|
|
@@ -2353,275 +2493,6 @@ def create_leaderboard_v2_tab():
|
|
| 2353 |
</div>
|
| 2354 |
</div>
|
| 2355 |
""")
|
| 2356 |
-
|
| 2357 |
-
|
| 2358 |
-
# Level metric breakdown section
|
| 2359 |
-
gr.HTML("""
|
| 2360 |
-
<div class="domain-selector-container domain-performance-container level-metrics-wrapper">
|
| 2361 |
-
<div class="domain-header">
|
| 2362 |
-
<h2 class="domain-title" style="color: var(--text-primary);">레벨별 상세 지표</h2>
|
| 2363 |
-
<p class="domain-subtitle" style="color: var(--text-primary);">각 Ko-AgentBench 단계별 고유 평가 지표를 통해 모델 점수를 비교하고 더 자세히 살펴보세요.</p>
|
| 2364 |
-
</div>
|
| 2365 |
-
""")
|
| 2366 |
-
|
| 2367 |
-
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
|
| 2368 |
-
level_metric_selector = gr.Dropdown(
|
| 2369 |
-
choices=level_ids,
|
| 2370 |
-
value=level_ids[0] if level_ids else None,
|
| 2371 |
-
multiselect=False,
|
| 2372 |
-
label="",
|
| 2373 |
-
info=None,
|
| 2374 |
-
container=False,
|
| 2375 |
-
elem_classes=["level-dropdown"]
|
| 2376 |
-
)
|
| 2377 |
-
level_model_selector = gr.Dropdown(
|
| 2378 |
-
choices=initial_level_model_choices,
|
| 2379 |
-
value=initial_level_model_values,
|
| 2380 |
-
multiselect=True,
|
| 2381 |
-
label="",
|
| 2382 |
-
info=None,
|
| 2383 |
-
container=False,
|
| 2384 |
-
elem_classes=["model-dropdown", "level-model-dropdown"]
|
| 2385 |
-
)
|
| 2386 |
-
|
| 2387 |
-
gr.HTML('<div class="chart-container level-metric-chart-container">')
|
| 2388 |
-
level_metric_chart = gr.Plot(
|
| 2389 |
-
label="",
|
| 2390 |
-
value=initial_level_metric_chart,
|
| 2391 |
-
elem_classes=["level-metric-plot", "plot-container"]
|
| 2392 |
-
)
|
| 2393 |
-
gr.HTML("""
|
| 2394 |
-
</div>
|
| 2395 |
-
</div>
|
| 2396 |
-
""")
|
| 2397 |
-
|
| 2398 |
-
# # Heatmap section
|
| 2399 |
-
# gr.HTML("""
|
| 2400 |
-
# <div class="domain-selector-container domain-performance-container heatmap-wrapper">
|
| 2401 |
-
# <div class="domain-header">
|
| 2402 |
-
# <h2 class="domain-title" style="color: var(--text-primary);">종합 성능 히트맵</h2>
|
| 2403 |
-
# <p class="domain-subtitle" style="color: var(--text-primary);">각 모델의 L1~L7 Ko-AgentBench SR(성공률) 점수를 한눈에 보세요.</p>
|
| 2404 |
-
# </div>
|
| 2405 |
-
# <div class="chart-container heatmap-chart-container">
|
| 2406 |
-
# """)
|
| 2407 |
-
# heatmap_chart = gr.Plot(
|
| 2408 |
-
# label="",
|
| 2409 |
-
# value=initial_heatmap,
|
| 2410 |
-
# elem_classes=["heatmap-plot", "plot-container"]
|
| 2411 |
-
# )
|
| 2412 |
-
# gr.HTML("""
|
| 2413 |
-
# </div>
|
| 2414 |
-
# </div>
|
| 2415 |
-
# """)
|
| 2416 |
-
|
| 2417 |
-
# Update functions
|
| 2418 |
-
def get_optimal_sort_order(sort_by_value):
|
| 2419 |
-
"""Return the optimal sort order for a given metric"""
|
| 2420 |
-
# Metrics where higher is better (descending)
|
| 2421 |
-
descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
|
| 2422 |
-
|
| 2423 |
-
# Metrics where lower is better (ascending)
|
| 2424 |
-
ascending_metrics = []
|
| 2425 |
-
|
| 2426 |
-
if sort_by_value in descending_metrics:
|
| 2427 |
-
return "Descending"
|
| 2428 |
-
elif sort_by_value in ascending_metrics:
|
| 2429 |
-
return "Ascending"
|
| 2430 |
-
else:
|
| 2431 |
-
return "Descending" # Default fallback
|
| 2432 |
-
|
| 2433 |
-
|
| 2434 |
-
|
| 2435 |
-
def update_table(level_filter, model_type_filter, sort_order):
|
| 2436 |
-
title_html = update_leaderboard_title(level_filter)
|
| 2437 |
-
sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
|
| 2438 |
-
table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
|
| 2439 |
-
return title_html, table_html
|
| 2440 |
-
|
| 2441 |
-
def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
|
| 2442 |
-
# Get filtered dataframe
|
| 2443 |
-
df = load_leaderboard_data()
|
| 2444 |
-
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
|
| 2445 |
-
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
|
| 2446 |
-
|
| 2447 |
-
# Update model selector choices based on filtered data
|
| 2448 |
-
available_models_all = filtered_df['Model'].tolist()
|
| 2449 |
-
available_models = available_models_all[:15] # Top 15 from filtered results
|
| 2450 |
-
|
| 2451 |
-
# If selected models are not in available models, reset to top 5
|
| 2452 |
-
if selected_models:
|
| 2453 |
-
valid_selected = [m for m in selected_models if m in available_models]
|
| 2454 |
-
# Check if more than 5 models are selected and show alert
|
| 2455 |
-
if len(valid_selected) > 5:
|
| 2456 |
-
gr.Warning("최대 5개 까지만 선택 가능합니다")
|
| 2457 |
-
# Remove the last selected item (6th item) instead of keeping first 5
|
| 2458 |
-
valid_selected = valid_selected[:-1]
|
| 2459 |
-
if not valid_selected:
|
| 2460 |
-
valid_selected = available_models[:5]
|
| 2461 |
-
else:
|
| 2462 |
-
valid_selected = available_models[:5]
|
| 2463 |
-
|
| 2464 |
-
# Create radar chart
|
| 2465 |
-
chart = create_domain_radar_chart(filtered_df, valid_selected)
|
| 2466 |
-
|
| 2467 |
-
# Prepare heatmap order prioritizing selected models
|
| 2468 |
-
|
| 2469 |
-
|
| 2470 |
-
# Level metric chart
|
| 2471 |
-
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
|
| 2472 |
-
available_level_models = available_models_all
|
| 2473 |
-
if level_selected_models:
|
| 2474 |
-
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
|
| 2475 |
-
if not valid_level_models:
|
| 2476 |
-
valid_level_models = available_level_models[:5]
|
| 2477 |
-
else:
|
| 2478 |
-
valid_level_models = available_level_models[:5]
|
| 2479 |
-
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
|
| 2480 |
-
|
| 2481 |
-
return (
|
| 2482 |
-
gr.Dropdown(
|
| 2483 |
-
choices=available_models,
|
| 2484 |
-
value=valid_selected,
|
| 2485 |
-
multiselect=True,
|
| 2486 |
-
label="",
|
| 2487 |
-
info=None,
|
| 2488 |
-
container=False,
|
| 2489 |
-
# elem_classes=["model-dropdown"]
|
| 2490 |
-
),
|
| 2491 |
-
chart,
|
| 2492 |
-
gr.Dropdown(
|
| 2493 |
-
choices=available_level_models,
|
| 2494 |
-
value=valid_level_models,
|
| 2495 |
-
multiselect=True,
|
| 2496 |
-
label="",
|
| 2497 |
-
info=None,
|
| 2498 |
-
container=False,
|
| 2499 |
-
elem_classes=["model-dropdown", "level-model-dropdown"]
|
| 2500 |
-
),
|
| 2501 |
-
level_metric_fig,
|
| 2502 |
-
)
|
| 2503 |
-
|
| 2504 |
-
def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
|
| 2505 |
-
# Get filtered dataframe
|
| 2506 |
-
df = load_leaderboard_data()
|
| 2507 |
-
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
|
| 2508 |
-
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
|
| 2509 |
-
|
| 2510 |
-
available_models_all = filtered_df['Model'].tolist()
|
| 2511 |
-
if selected_models:
|
| 2512 |
-
valid_selected = [m for m in selected_models if m in available_models_all]
|
| 2513 |
-
# Check if more than 5 models are selected and show alert
|
| 2514 |
-
if len(valid_selected) > 5:
|
| 2515 |
-
# JavaScript alert for exceeding 5 models
|
| 2516 |
-
gr.Warning("최대 5개 까지만 선택 가능합니다")
|
| 2517 |
-
# Remove the last selected item (6th item) instead of keeping first 5
|
| 2518 |
-
valid_selected = valid_selected[:-1]
|
| 2519 |
-
if not valid_selected:
|
| 2520 |
-
valid_selected = available_models_all[:5]
|
| 2521 |
-
else:
|
| 2522 |
-
valid_selected = available_models_all[:5]
|
| 2523 |
-
|
| 2524 |
-
|
| 2525 |
-
|
| 2526 |
-
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
|
| 2527 |
-
available_level_models = available_models_all
|
| 2528 |
-
if level_selected_models:
|
| 2529 |
-
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
|
| 2530 |
-
if not valid_level_models:
|
| 2531 |
-
valid_level_models = available_level_models[:5]
|
| 2532 |
-
else:
|
| 2533 |
-
valid_level_models = available_level_models[:5]
|
| 2534 |
-
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
|
| 2535 |
-
|
| 2536 |
-
return (
|
| 2537 |
-
gr.Dropdown(
|
| 2538 |
-
choices=available_models_all[:15],
|
| 2539 |
-
value=valid_selected,
|
| 2540 |
-
multiselect=True,
|
| 2541 |
-
label="",
|
| 2542 |
-
info=None,
|
| 2543 |
-
container=False,
|
| 2544 |
-
),
|
| 2545 |
-
create_domain_radar_chart(filtered_df, valid_selected),
|
| 2546 |
-
gr.Dropdown(
|
| 2547 |
-
choices=available_level_models,
|
| 2548 |
-
value=valid_level_models,
|
| 2549 |
-
multiselect=True,
|
| 2550 |
-
label="",
|
| 2551 |
-
info=None,
|
| 2552 |
-
container=False,
|
| 2553 |
-
elem_classes=["model-dropdown", "level-model-dropdown"]
|
| 2554 |
-
),
|
| 2555 |
-
level_metric_fig,
|
| 2556 |
-
)
|
| 2557 |
-
|
| 2558 |
-
def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
|
| 2559 |
-
df = load_leaderboard_data()
|
| 2560 |
-
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
|
| 2561 |
-
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
|
| 2562 |
-
available_models = filtered_df['Model'].tolist()
|
| 2563 |
-
if level_selected_models:
|
| 2564 |
-
valid_level_models = [m for m in level_selected_models if m in available_models]
|
| 2565 |
-
# Check if more than 5 models are selected and show alert
|
| 2566 |
-
if len(valid_level_models) > 5:
|
| 2567 |
-
gr.Warning("최대 5개 까지만 선택 가능합니다")
|
| 2568 |
-
# Remove the last selected item (6th item) instead of keeping first 5
|
| 2569 |
-
valid_level_models = valid_level_models[:-1]
|
| 2570 |
-
if not valid_level_models:
|
| 2571 |
-
valid_level_models = available_models[:5]
|
| 2572 |
-
else:
|
| 2573 |
-
valid_level_models = available_models[:5]
|
| 2574 |
-
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
|
| 2575 |
-
level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
|
| 2576 |
-
return (
|
| 2577 |
-
gr.Dropdown(
|
| 2578 |
-
choices=available_models,
|
| 2579 |
-
value=valid_level_models,
|
| 2580 |
-
multiselect=True,
|
| 2581 |
-
label="",
|
| 2582 |
-
info=None,
|
| 2583 |
-
container=False,
|
| 2584 |
-
elem_classes=["model-dropdown", "level-model-dropdown"]
|
| 2585 |
-
),
|
| 2586 |
-
level_chart,
|
| 2587 |
-
)
|
| 2588 |
-
|
| 2589 |
-
# Update table when filters change
|
| 2590 |
-
filter_inputs = [domain_filter, model_type_filter, sort_order]
|
| 2591 |
-
|
| 2592 |
-
for input_component in filter_inputs:
|
| 2593 |
-
input_component.change(
|
| 2594 |
-
fn=update_table,
|
| 2595 |
-
inputs=filter_inputs,
|
| 2596 |
-
outputs=[leaderboard_title, leaderboard_table]
|
| 2597 |
-
)
|
| 2598 |
-
|
| 2599 |
-
# Also update radar chart when filters change
|
| 2600 |
-
input_component.change(
|
| 2601 |
-
fn=update_radar_chart,
|
| 2602 |
-
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
|
| 2603 |
-
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
|
| 2604 |
-
)
|
| 2605 |
-
|
| 2606 |
-
# Update radar chart when model selection changes
|
| 2607 |
-
model_selector.change(
|
| 2608 |
-
fn=update_radar_only,
|
| 2609 |
-
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
|
| 2610 |
-
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
|
| 2611 |
-
)
|
| 2612 |
-
|
| 2613 |
-
level_metric_selector.change(
|
| 2614 |
-
fn=update_level_metric_only,
|
| 2615 |
-
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
|
| 2616 |
-
outputs=[level_model_selector, level_metric_chart]
|
| 2617 |
-
)
|
| 2618 |
-
|
| 2619 |
-
level_model_selector.change(
|
| 2620 |
-
fn=update_level_metric_only,
|
| 2621 |
-
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
|
| 2622 |
-
outputs=[level_model_selector, level_metric_chart]
|
| 2623 |
-
)
|
| 2624 |
-
|
| 2625 |
|
| 2626 |
# Add custom CSS for the performance card
|
| 2627 |
gr.HTML("""
|
|
@@ -2844,8 +2715,8 @@ def create_leaderboard_v2_tab():
|
|
| 2844 |
.level-dropdown select,
|
| 2845 |
.level-dropdown [role="combobox"],
|
| 2846 |
.level-dropdown button {
|
| 2847 |
-
background:
|
| 2848 |
-
border: 1px solid
|
| 2849 |
border-radius: 999px !important;
|
| 2850 |
padding: 12px 20px !important;
|
| 2851 |
color: var(--text-primary) !important;
|
|
@@ -2855,7 +2726,7 @@ def create_leaderboard_v2_tab():
|
|
| 2855 |
text-align: center !important;
|
| 2856 |
min-height: 46px !important;
|
| 2857 |
transition: all 0.3s ease !important;
|
| 2858 |
-
box-shadow: 0 10px 24px rgba(
|
| 2859 |
}
|
| 2860 |
|
| 2861 |
.level-dropdown select:hover,
|
|
@@ -2872,14 +2743,6 @@ def create_leaderboard_v2_tab():
|
|
| 2872 |
margin: 12px auto 0 !important;
|
| 2873 |
}
|
| 2874 |
|
| 2875 |
-
.level-model-dropdown select,
|
| 2876 |
-
.level-model-dropdown [role="combobox"],
|
| 2877 |
-
.level-model-dropdown button {
|
| 2878 |
-
background: #000000 !important;
|
| 2879 |
-
border: 1px solid #333333 !important;
|
| 2880 |
-
color: var(--text-primary) !important;
|
| 2881 |
-
}
|
| 2882 |
-
|
| 2883 |
.radar-placeholder {
|
| 2884 |
display: flex;
|
| 2885 |
flex-direction: column;
|
|
@@ -3032,74 +2895,6 @@ def create_leaderboard_v2_tab():
|
|
| 3032 |
}
|
| 3033 |
}
|
| 3034 |
|
| 3035 |
-
/* 폰트 강제 적용 - 최종 우선순위 */
|
| 3036 |
-
.dashboard-section,
|
| 3037 |
-
.dashboard-section *,
|
| 3038 |
-
.dashboard-section h2,
|
| 3039 |
-
.dashboard-section h3,
|
| 3040 |
-
.dashboard-section p,
|
| 3041 |
-
.dashboard-section li,
|
| 3042 |
-
.section-lead,
|
| 3043 |
-
.section-subtitle,
|
| 3044 |
-
.phase-card h3,
|
| 3045 |
-
.phase-list li,
|
| 3046 |
-
.scenario-body p,
|
| 3047 |
-
.criteria-card h3,
|
| 3048 |
-
.criteria-card ul,
|
| 3049 |
-
.criteria-card li {
|
| 3050 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3051 |
-
}
|
| 3052 |
-
|
| 3053 |
-
/* section-title 강제 적용 */
|
| 3054 |
-
.section-title,
|
| 3055 |
-
h2.section-title,
|
| 3056 |
-
.dashboard-section .section-title,
|
| 3057 |
-
.section-header .section-title {
|
| 3058 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3059 |
-
}
|
| 3060 |
-
|
| 3061 |
-
.domain-title,
|
| 3062 |
-
h2.domain-title,
|
| 3063 |
-
.domain-header .domain-title {
|
| 3064 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3065 |
-
}
|
| 3066 |
-
|
| 3067 |
-
.hero-title,
|
| 3068 |
-
.hero-subtitle,
|
| 3069 |
-
h1.hero-title,
|
| 3070 |
-
p.hero-subtitle {
|
| 3071 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3072 |
-
font-size: 2rem; !important;
|
| 3073 |
-
}
|
| 3074 |
-
|
| 3075 |
-
/* hero-title 크기 강제 적용 */
|
| 3076 |
-
.hero-title,
|
| 3077 |
-
h1.hero-title {
|
| 3078 |
-
font-size: 4rem !important;
|
| 3079 |
-
}
|
| 3080 |
-
|
| 3081 |
-
.phase-chart span,
|
| 3082 |
-
.phase-card .phase-chart span,
|
| 3083 |
-
.phase-grid .phase-chart span {
|
| 3084 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3085 |
-
font-size: 1.2rem !important;
|
| 3086 |
-
}
|
| 3087 |
-
|
| 3088 |
-
.section-lead, .section-subtitle {
|
| 3089 |
-
font-size: 1.32rem !important;
|
| 3090 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3091 |
-
}
|
| 3092 |
-
|
| 3093 |
-
.phase-card h3 {
|
| 3094 |
-
font-size: 1.44rem !important;
|
| 3095 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3096 |
-
}
|
| 3097 |
-
|
| 3098 |
-
.phase-list li {
|
| 3099 |
-
font-size: 1.08rem !important;
|
| 3100 |
-
font-family: "Nanum Gothic", sans-serif !important;
|
| 3101 |
-
}
|
| 3102 |
-
|
| 3103 |
</style>
|
| 3104 |
|
| 3105 |
""")
|
|
@@ -3207,7 +3002,7 @@ def create_leaderboard_v2_tab():
|
|
| 3207 |
label="",
|
| 3208 |
info=None,
|
| 3209 |
container=False,
|
| 3210 |
-
|
| 3211 |
)
|
| 3212 |
|
| 3213 |
input_component.change(
|
|
@@ -3262,8 +3057,8 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
|
|
| 3262 |
palette = [
|
| 3263 |
{'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
|
| 3264 |
{'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
|
| 3265 |
-
{'fill': 'rgba(
|
| 3266 |
-
{'fill': 'rgba(
|
| 3267 |
{'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
|
| 3268 |
]
|
| 3269 |
|
|
@@ -3387,8 +3182,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
|
|
| 3387 |
height=800,
|
| 3388 |
width=900,
|
| 3389 |
margin=dict(t=30, b=50, l=10, r=10),
|
| 3390 |
-
autosize=True
|
| 3391 |
-
annotations=[]
|
| 3392 |
)
|
| 3393 |
|
| 3394 |
return fig
|
|
@@ -3647,8 +3441,8 @@ def create_level_metric_chart(df, level, selected_models=None, max_models=5):
|
|
| 3647 |
model_palette = [
|
| 3648 |
'#ffd21e',
|
| 3649 |
'#FF8A3C',
|
| 3650 |
-
'#
|
| 3651 |
-
'#
|
| 3652 |
'#F8FAFC',
|
| 3653 |
'#38BDF8',
|
| 3654 |
]
|
|
@@ -3686,7 +3480,7 @@ def create_level_metric_chart(df, level, selected_models=None, max_models=5):
|
|
| 3686 |
paper_bgcolor="#01091A",
|
| 3687 |
plot_bgcolor="rgba(245, 246, 247, 0.02)",
|
| 3688 |
height=plot_height,
|
| 3689 |
-
|
| 3690 |
margin=dict(t=90, b=80, l=220, r=160),
|
| 3691 |
legend=dict(
|
| 3692 |
orientation="h",
|
|
@@ -3738,7 +3532,7 @@ def create_empty_level_metric_chart(message):
|
|
| 3738 |
paper_bgcolor="#01091A",
|
| 3739 |
plot_bgcolor="rgba(245, 246, 247, 0.02)",
|
| 3740 |
height=420,
|
| 3741 |
-
|
| 3742 |
margin=dict(t=80, b=60, l=80, r=120),
|
| 3743 |
title=dict(
|
| 3744 |
text="<b>Level Metric Breakdown</b>",
|
|
|
|
| 53 |
|
| 54 |
# Clean and prepare data
|
| 55 |
df = df.copy()
|
| 56 |
+
numeric_candidate_cols = [col for col in df.columns if col not in ('Model', 'Vendor')]
|
|
|
|
| 57 |
for col in numeric_candidate_cols:
|
| 58 |
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 59 |
|
|
|
|
| 118 |
df['Call Validity'] = df[epr_cols].mean(axis=1)
|
| 119 |
|
| 120 |
# Use LLM Type from CSV directly, with mapping to display names
|
| 121 |
+
if 'LLM Type' in df.columns:
|
| 122 |
+
# Clean the LLM Type column to remove any whitespace
|
| 123 |
+
df['LLM Type'] = df['LLM Type'].astype(str).str.strip()
|
| 124 |
+
|
| 125 |
+
# Map LLM Type to Model Type
|
| 126 |
+
def map_llm_type(llm_type):
|
| 127 |
+
if llm_type.upper() == "OSS":
|
| 128 |
+
return "Open source"
|
| 129 |
+
else:
|
| 130 |
+
return "Proprietary"
|
| 131 |
+
|
| 132 |
+
df['Model Type'] = df['LLM Type'].apply(map_llm_type)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
else:
|
| 134 |
+
# Fallback to vendor mapping if LLM Type column doesn't exist
|
| 135 |
+
vendor_model_type_map = {
|
| 136 |
+
"OpenAI": "Proprietary",
|
| 137 |
+
"Anthropic": "Proprietary",
|
| 138 |
+
"Google": "Proprietary",
|
| 139 |
+
"Microsoft": "Proprietary",
|
| 140 |
+
"Mistral": "Proprietary",
|
| 141 |
+
"Databricks": "Open source",
|
| 142 |
+
"Meta": "Open source",
|
| 143 |
+
"Alibaba": "Open source",
|
| 144 |
+
"알리바바": "Open source", # Korean name for Alibaba
|
| 145 |
+
"Kakao": "Open source",
|
| 146 |
+
"SKT": "Open source",
|
| 147 |
+
"KT": "Open source",
|
| 148 |
+
"xAI": "Proprietary",
|
| 149 |
+
}
|
| 150 |
+
df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary')
|
|
|
|
|
|
|
| 151 |
|
| 152 |
# Round numeric columns for better display
|
| 153 |
round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
|
|
|
|
| 224 |
# Level metadata for the 7-stage task framework
|
| 225 |
level_details = {
|
| 226 |
"ALL": {
|
| 227 |
+
"title": "ALL · 전체 태스크",
|
| 228 |
+
"description": "7개의 태스크 전반의 평균 성능을 한눈에 살펴보고 각 레벨 비교를 위한 기준점을 제공합니다."
|
| 229 |
},
|
| 230 |
"L1": {
|
| 231 |
+
"title": "<span style='color: white;'>L1 · 단일 도구 실행</span>",
|
| 232 |
+
"description": "<span style='color: white;'>단일 도구 실행 능력과 기본적인 명령 수행 정확도를 평가합니다.</span>"
|
| 233 |
},
|
| 234 |
"L2": {
|
| 235 |
+
"title": "<span style='color: white;'>L2 · 도구 선택 능력</span>",
|
| 236 |
+
"description": "<span style='color: white;'>요구 사항에 맞는 도구를 고르고 적절한 파라미터로 호출하는 능력을 측정합니다.</span>"
|
| 237 |
},
|
| 238 |
"L3": {
|
| 239 |
+
"title": "<span style='color: white;'>L3 · 순차적 추론 (Chaining)</span>",
|
| 240 |
+
"description": "<span style='color: white;'>복수 단계의 순차적 reasoning을 통해 문제를 해결하는 과정을 검증합니다.</span>"
|
| 241 |
},
|
| 242 |
"L4": {
|
| 243 |
+
"title": "<span style='color: white;'>L4 · 병렬적 추론 (Aggregation)</span>",
|
| 244 |
+
"description": "<span style='color: white;'>여러 소스의 정보를 병렬적으로 통합하고 요약하는 능력을 평가합니다.</span>"
|
| 245 |
},
|
| 246 |
"L5": {
|
| 247 |
+
"title": "<span style='color: white;'>L5 · 강건성 (Robustness / Fallback)</span>",
|
| 248 |
+
"description": "<span style='color: white;'>예상치 못한 오류나 실패 상황에 대한 인지와 대응 전략을 확인합니다.</span>"
|
| 249 |
},
|
| 250 |
"L6": {
|
| 251 |
+
"title": "<span style='color: white;'>L6 · 효율성 (Efficiency)</span>",
|
| 252 |
+
"description": "<span style='color: white;'>최소한의 호출과 비용으로 목표를 달성하는 운영 효율을 살펴봅니다.</span>"
|
| 253 |
},
|
| 254 |
"L7": {
|
| 255 |
+
"title": "<span style='color: white;'>L7 · 장기 컨텍스트 기억 (Contextual Memory)</span>",
|
| 256 |
+
"description": "<span style='color: white;'>장기 대화 맥락을 유지하고 적절히 활용하는 능력을 집중적으로 분석합니다.</span>"
|
| 257 |
}
|
| 258 |
}
|
| 259 |
default_level = "ALL"
|
|
|
|
| 291 |
border-collapse: collapse;
|
| 292 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 293 |
background: var(--bg-card);
|
| 294 |
+
color: white;
|
| 295 |
}
|
| 296 |
|
| 297 |
.v2-styled-table thead {
|
|
|
|
| 305 |
padding: 14px 12px;
|
| 306 |
text-align: left;
|
| 307 |
font-weight: 600;
|
| 308 |
+
color: white;
|
| 309 |
border-bottom: 2px solid var(--accent-primary);
|
| 310 |
font-size: 13px;
|
| 311 |
text-transform: uppercase;
|
|
|
|
| 319 |
.v2-styled-table td {
|
| 320 |
padding: 12px;
|
| 321 |
border-bottom: 1px solid var(--border-subtle);
|
| 322 |
+
color: white;
|
| 323 |
transition: all 0.2s ease;
|
| 324 |
}
|
| 325 |
|
|
|
|
| 339 |
|
| 340 |
.model-name {
|
| 341 |
font-weight: 500;
|
| 342 |
+
color: white;
|
| 343 |
transition: color 0.2s ease;
|
| 344 |
}
|
| 345 |
|
| 346 |
/* Keep model name color consistent on hover to emphasize row highlight */
|
| 347 |
.v2-styled-table tr:hover .model-name {
|
| 348 |
+
color: white;
|
| 349 |
}
|
| 350 |
|
| 351 |
.numeric-cell {
|
| 352 |
font-family: 'Geist Mono', monospace;
|
| 353 |
font-size: 13px;
|
| 354 |
text-align: center;
|
| 355 |
+
color: white;
|
| 356 |
}
|
| 357 |
|
| 358 |
.highlight-header {
|
| 359 |
background: rgba(255, 210, 30, 0.14);
|
| 360 |
+
color: white;
|
| 361 |
}
|
| 362 |
|
| 363 |
.highlight-cell {
|
| 364 |
background: rgba(255, 210, 30, 0.08);
|
| 365 |
+
color: white;
|
| 366 |
font-weight: 600;
|
| 367 |
}
|
| 368 |
</style>
|
|
|
|
| 460 |
return f"""
|
| 461 |
<div class="domain-selector-container leaderboard-intro">
|
| 462 |
<div class="domain-header">
|
| 463 |
+
<h2 class="domain-title" style="color: white;">Agent Leaderboard · {level_title}</h2>
|
| 464 |
+
<p class="domain-subtitle" style="color: white;">{level_description}</p>
|
| 465 |
</div>
|
| 466 |
<div class="dataframe-container">
|
| 467 |
"""
|
|
|
|
| 511 |
# Load initial data
|
| 512 |
initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
|
| 513 |
initial_df = load_leaderboard_data() # Load raw data for model selector
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else []
|
| 515 |
initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else []
|
| 516 |
initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)
|
|
|
|
| 736 |
# Header styles and navigation
|
| 737 |
gr.HTML("""
|
| 738 |
<style>
|
|
|
|
|
|
|
| 739 |
/* Enhanced button styling with better gradio compatibility */
|
| 740 |
.header-action-button {
|
| 741 |
display: inline-block !important;
|
| 742 |
padding: 14px 28px !important;
|
| 743 |
background: #ffd21e !important;
|
| 744 |
+
color: #FFFFFF !important;
|
| 745 |
text-decoration: none !important;
|
| 746 |
border-radius: 16px !important;
|
| 747 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
|
|
|
| 775 |
transform: translateY(-3px) !important;
|
| 776 |
box-shadow: 0 12px 32px rgba(255, 210, 30, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important;
|
| 777 |
background: #ffd21e !important;
|
| 778 |
+
color: #FFFFFF !important;
|
| 779 |
text-decoration: none !important;
|
| 780 |
text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important;
|
| 781 |
}
|
|
|
|
| 790 |
filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3));
|
| 791 |
}
|
| 792 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 793 |
#hero-banner {
|
| 794 |
+
max-width: 960px;
|
| 795 |
+
margin: 0 auto 20px auto;
|
| 796 |
+
border-radius: 16px;
|
| 797 |
+
overflow: hidden;
|
| 798 |
+
box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25);
|
| 799 |
}
|
| 800 |
+
|
| 801 |
#hero-banner img {
|
| 802 |
+
width: 100%;
|
| 803 |
+
height: auto;
|
| 804 |
+
display: block;
|
|
|
|
| 805 |
}
|
| 806 |
|
| 807 |
.hero-title {
|
| 808 |
+
font-size: 5rem;
|
| 809 |
font-weight: 800;
|
| 810 |
line-height: 1.1;
|
| 811 |
background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%);
|
| 812 |
-webkit-background-clip: text;
|
| 813 |
-webkit-text-fill-color: transparent;
|
| 814 |
margin-bottom: 1rem;
|
|
|
|
| 815 |
}
|
| 816 |
|
| 817 |
.hero-subtitle {
|
| 818 |
color: var(--text-secondary);
|
| 819 |
+
font-size: 1.25rem;
|
| 820 |
+
font-family: 'Geist', sans-serif;
|
| 821 |
margin-top: 0;
|
| 822 |
}
|
| 823 |
|
|
|
|
| 876 |
box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25);
|
| 877 |
backdrop-filter: blur(12px);
|
| 878 |
-webkit-backdrop-filter: blur(12px);
|
|
|
|
| 879 |
}
|
| 880 |
|
| 881 |
.dashboard-section.emphasized {
|
|
|
|
| 895 |
}
|
| 896 |
|
| 897 |
.section-title {
|
| 898 |
+
font-size: 2.2rem;
|
| 899 |
font-weight: 700;
|
| 900 |
color: var(--text-primary);
|
| 901 |
margin-bottom: 12px;
|
| 902 |
text-align: center !important;
|
|
|
|
| 903 |
}
|
| 904 |
|
| 905 |
.section-lead, .section-subtitle {
|
| 906 |
+
font-size: 1.1rem;
|
| 907 |
color: var(--text-secondary);
|
| 908 |
max-width: 720px;
|
| 909 |
margin: 0 auto 24px auto;
|
|
|
|
| 912 |
word-break: keep-all;
|
| 913 |
white-space: normal;
|
| 914 |
display: block;
|
|
|
|
| 915 |
}
|
| 916 |
|
| 917 |
.phase-grid {
|
|
|
|
| 929 |
}
|
| 930 |
|
| 931 |
.phase-card h3 {
|
| 932 |
+
font-size: 1.5rem;
|
| 933 |
color: var(--text-primary);
|
| 934 |
margin-bottom: 20px;
|
| 935 |
font-weight: 700;
|
|
|
|
| 936 |
}
|
| 937 |
|
| 938 |
.phase-chart {
|
|
|
|
| 960 |
|
| 961 |
.phase-chart span {
|
| 962 |
position: relative;
|
| 963 |
+
font-size: 1.5rem;
|
| 964 |
font-weight: 700;
|
| 965 |
+
color: var(--text-primary);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
}
|
| 967 |
|
|
|
|
| 968 |
.phase-list {
|
| 969 |
list-style: none;
|
| 970 |
padding: 0;
|
|
|
|
| 979 |
background: rgba(245, 246, 247, 0.05);
|
| 980 |
border: 1px solid rgba(245, 246, 247, 0.08);
|
| 981 |
color: var(--text-secondary);
|
| 982 |
+
font-size: 0.95rem;
|
|
|
|
| 983 |
}
|
| 984 |
|
| 985 |
.scenario-body {
|
|
|
|
| 1042 |
/* Responsive design */
|
| 1043 |
@media (max-width: 768px) {
|
| 1044 |
.hero-title {
|
| 1045 |
+
font-size: 3rem;
|
| 1046 |
}
|
| 1047 |
.hero-action-button {
|
| 1048 |
width: 100% !important;
|
|
|
|
| 1066 |
gap: 8px;
|
| 1067 |
}
|
| 1068 |
.section-title {
|
| 1069 |
+
font-size: 1.8rem;
|
| 1070 |
}
|
| 1071 |
.phase-chart {
|
| 1072 |
width: 100px;
|
|
|
|
| 1080 |
</style>
|
| 1081 |
""")
|
| 1082 |
|
|
|
|
| 1083 |
gr.Image(
|
| 1084 |
+
value="banner.png",
|
| 1085 |
show_label=False,
|
| 1086 |
interactive=False,
|
| 1087 |
type="filepath",
|
| 1088 |
elem_id="hero-banner"
|
| 1089 |
)
|
|
|
|
| 1090 |
|
| 1091 |
gr.HTML("""
|
| 1092 |
<div style="text-align: center; padding: 20px 0;">
|
|
|
|
| 1099 |
gr.HTML("""
|
| 1100 |
<div class="hero-actions">
|
| 1101 |
<a href="https://hugging-face-krew.github.io/" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1102 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1103 |
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
| 1104 |
<line x1="8" y1="12" x2="16" y2="12"/>
|
| 1105 |
</svg>
|
| 1106 |
+
<span>Blog</span>
|
| 1107 |
</a>
|
| 1108 |
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1109 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1110 |
<path d="M9 19c-5 1.5-5-2.5-7-3"/>
|
| 1111 |
<path d="M20 21v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
| 1112 |
</svg>
|
| 1113 |
<span>GitHub</span>
|
| 1114 |
</a>
|
| 1115 |
<a href="https://huggingface.co/datasets/huggingface-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1116 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1117 |
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
|
| 1118 |
<polyline points="7 10 12 15 17 10"/>
|
| 1119 |
<line x1="12" y1="15" x2="12" y2="3"/>
|
| 1120 |
</svg>
|
| 1121 |
+
<span>Dataset</span>
|
| 1122 |
</a>
|
| 1123 |
+
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench/blob/main/evaluate_model_run.py#L55" target="_blank" rel="noopener noreferrer" class="hero-action-button">
|
| 1124 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 1125 |
<path d="M3 3v18h18"/>
|
| 1126 |
<path d="M7 17v-6"/>
|
| 1127 |
<path d="M12 17V7"/>
|
| 1128 |
<path d="M17 17v-3"/>
|
| 1129 |
</svg>
|
| 1130 |
+
<span>Metrics</span>
|
| 1131 |
</a>
|
| 1132 |
</div>
|
| 1133 |
""")
|
|
|
|
| 1136 |
gr.HTML("""
|
| 1137 |
<div class="dashboard-section">
|
| 1138 |
<div class="section-header">
|
| 1139 |
+
<h2 class="section-title">단계별 태스크 설계</h2>
|
| 1140 |
</div>
|
| 1141 |
<p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 720px; line-height: 1.7; word-break: keep-all;">단순 도구 호출부터 장기적 맥락 능력, 강건성 처리 능력까지 에이전트의 능력을 7단계로 입체적으로 분석하였습니다.</p>
|
| 1142 |
<div class="phase-grid">
|
| 1143 |
<div class="phase-card">
|
| 1144 |
+
<h3>Single-Turn</h3>
|
| 1145 |
<div class="phase-chart" style="--progress:80%;">
|
| 1146 |
+
<span>80%</span>
|
| 1147 |
</div>
|
| 1148 |
<ul class="phase-list">
|
| 1149 |
+
<li style="color: white;">L1: 단일 도구 실행</li>
|
| 1150 |
+
<li style="color: white;">L2: 도구 선택 능력</li>
|
| 1151 |
+
<li style="color: white;">L3: 순차적 reasoning (Chaining)</li>
|
| 1152 |
+
<li style="color: white;">L4: 병렬적 reasoning (Aggregation)</li>
|
| 1153 |
+
<li style="color: white;">L5: 강건성 (Robustness / Fallback)</li>
|
| 1154 |
</ul>
|
| 1155 |
</div>
|
| 1156 |
<div class="phase-card">
|
| 1157 |
+
<h3>Multi-Turn</h3>
|
| 1158 |
<div class="phase-chart" style="--progress:20%;">
|
| 1159 |
+
<span>20%</span>
|
| 1160 |
</div>
|
| 1161 |
<ul class="phase-list">
|
| 1162 |
+
<li style="color: white;">L6: 효율성 (Efficiency)</li>
|
| 1163 |
+
<li style="color: white;">L7: 장기 컨텍스트 기억 (Contextual Memory)</li>
|
| 1164 |
</ul>
|
| 1165 |
</div>
|
| 1166 |
</div>
|
|
|
|
| 1171 |
gr.HTML("""
|
| 1172 |
<div class="dashboard-section emphasized">
|
| 1173 |
<div class="section-header">
|
| 1174 |
+
<h2 class="section-title">18가지 한국형 API 사용 및 실생활 환경에 특화된 고품질 시나리오 구성</h2>
|
| 1175 |
</div>
|
| 1176 |
<div class="scenario-body">
|
| 1177 |
+
<p>네이버, 지도, 카카오, 웹사이트 등 한국 실사용 환경 기반의 API를 기반으로 국내 사용자의 일상과 밀접한 '약속 예약', '블로그 후기 검색'과 같은 현실적인 문제 해결 시나리오를 구현했습니다.</p>
|
| 1178 |
</div>
|
|
|
|
|
|
|
| 1179 |
<div class="section-flow">⌄</div>
|
| 1180 |
+
</div>
|
| 1181 |
""")
|
| 1182 |
|
| 1183 |
# Section 3: 핵심 평가 기준
|
| 1184 |
gr.HTML("""
|
| 1185 |
<div class="dashboard-section">
|
| 1186 |
<div class="section-header">
|
| 1187 |
+
<h2 class="section-title">핵심 평가 기준</h2>
|
| 1188 |
</div>
|
| 1189 |
<div class="criteria-grid">
|
| 1190 |
<div class="criteria-card">
|
|
|
|
| 1218 |
# Domain filter section with enhanced styling
|
| 1219 |
gr.HTML("""
|
| 1220 |
<style>
|
|
|
|
|
|
|
| 1221 |
/* Enhanced domain selector styling */
|
| 1222 |
.domain-selector-container {
|
| 1223 |
background: #ffd21e0d;
|
|
|
|
| 1320 |
-webkit-background-clip: text;
|
| 1321 |
background-clip: text;
|
| 1322 |
-webkit-text-fill-color: transparent;
|
| 1323 |
+
text-shadow: 0 0 22px rgba(255, 210, 30, 0.65), 0 0 45px rgba(255, 210, 30, 0.4);
|
| 1324 |
+
filter: drop-shadow(0 0 16px rgba(255, 210, 30, 0.35));
|
| 1325 |
letter-spacing: 0.02em;
|
| 1326 |
+
animation: title-shimmer 5s ease-in-out infinite;
|
|
|
|
| 1327 |
}
|
| 1328 |
|
| 1329 |
@keyframes title-shimmer {
|
|
|
|
| 1611 |
|
| 1612 |
.model-dropdown select,
|
| 1613 |
.model-dropdown [role="combobox"] {
|
| 1614 |
+
background: rgba(245, 246, 247, 0.06) !important;
|
| 1615 |
+
border: 1px solid var(--border-subtle) !important;
|
| 1616 |
border-radius: 999px !important;
|
| 1617 |
padding: 12px 24px !important;
|
| 1618 |
color: var(--text-primary) !important;
|
|
|
|
| 1643 |
gap: 8px !important;
|
| 1644 |
width: 100% !important;
|
| 1645 |
padding: 12px 24px !important;
|
| 1646 |
+
background: rgba(245, 246, 247, 0.06) !important;
|
| 1647 |
+
border: 1px solid var(--border-subtle) !important;
|
| 1648 |
border-radius: 999px !important;
|
| 1649 |
color: var(--text-primary) !important;
|
| 1650 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
|
|
|
| 1701 |
background: #ffd21e !important;
|
| 1702 |
border: 1px solid rgba(255, 210, 30, 0.6) !important;
|
| 1703 |
border-radius: 999px !important;
|
| 1704 |
+
color: #FFFFFF !important;
|
| 1705 |
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 1706 |
font-weight: 600 !important;
|
| 1707 |
font-size: 0.95rem !important;
|
|
|
|
| 1748 |
font-size: 1.5rem;
|
| 1749 |
margin-bottom: 4px;
|
| 1750 |
display: block;
|
| 1751 |
+
filter: drop-shadow(0 0 10px currentColor);
|
| 1752 |
}
|
| 1753 |
|
| 1754 |
.domain-name {
|
|
|
|
| 1763 |
top: 8px;
|
| 1764 |
right: 8px;
|
| 1765 |
background: var(--accent-primary);
|
| 1766 |
+
color: white;
|
| 1767 |
font-size: 0.75rem;
|
| 1768 |
padding: 2px 8px;
|
| 1769 |
border-radius: 12px;
|
|
|
|
| 1935 |
padding: 12px 20px !important;
|
| 1936 |
font-size: 0.95rem !important;
|
| 1937 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1938 |
</style>
|
| 1939 |
|
| 1940 |
""")
|
| 1941 |
|
| 1942 |
level_options = list(level_details.keys())
|
| 1943 |
|
| 1944 |
+
with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"):
|
| 1945 |
+
gr.HTML("""
|
| 1946 |
+
<div class="domain-header">
|
| 1947 |
+
<h2 class="domain-title" style="color: white;">🧠 Select Task Level</h2>
|
| 1948 |
+
<p class="domain-subtitle" style="color: white;">Ko-AgentBench의 ALL · L1~L7 단계별 에이전트 성능을 손쉽게 비교하세요.</p>
|
| 1949 |
+
</div>
|
| 1950 |
+
""")
|
| 1951 |
+
domain_filter = gr.Radio(
|
| 1952 |
+
choices=level_options,
|
| 1953 |
+
value=default_level,
|
| 1954 |
+
label="",
|
| 1955 |
+
interactive=True,
|
| 1956 |
+
container=False,
|
| 1957 |
+
elem_classes=["domain-radio"]
|
| 1958 |
+
)
|
| 1959 |
|
| 1960 |
+
# Filter controls with domain styling
|
| 1961 |
+
with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"):
|
| 1962 |
+
gr.HTML("""
|
| 1963 |
+
<div class="domain-header">
|
| 1964 |
+
<h2 class="domain-title" style="color: white;">🔍 Filters & Sorting</h2>
|
| 1965 |
+
<p class="domain-subtitle" style="color: white;">모델 접근 방식과 정렬 순서를 선택해 맞춤 뷰를 구성하세요.</p>
|
| 1966 |
+
</div>
|
| 1967 |
+
""")
|
| 1968 |
+
with gr.Row(elem_classes=["filters-sorting-row"]):
|
| 1969 |
+
with gr.Column(scale=1, elem_classes=["filter-group"]):
|
| 1970 |
+
with gr.Row(elem_classes=["filter-group-row"]):
|
| 1971 |
+
gr.HTML("<span class='filter-group-label' style='color: white;'>Model Access</span>")
|
| 1972 |
+
model_type_filter = gr.Radio(
|
| 1973 |
+
choices=["All", "OSS", "API"],
|
| 1974 |
+
value="All",
|
| 1975 |
+
label="",
|
| 1976 |
+
elem_classes=["domain-radio"],
|
| 1977 |
+
container=False
|
| 1978 |
+
)
|
| 1979 |
+
with gr.Column(scale=1, elem_classes=["filter-group"]):
|
| 1980 |
+
with gr.Row(elem_classes=["filter-group-row"]):
|
| 1981 |
+
gr.HTML("<span class='filter-group-label' style='color: white;'>Sort Order</span>")
|
| 1982 |
+
sort_order = gr.Radio(
|
| 1983 |
+
choices=["Descending", "Ascending"],
|
| 1984 |
+
value="Descending",
|
| 1985 |
+
label="",
|
| 1986 |
+
elem_classes=["domain-radio"],
|
| 1987 |
+
container=False
|
| 1988 |
+
)
|
| 1989 |
+
|
| 1990 |
+
# Main leaderboard table with dynamic title
|
| 1991 |
+
leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
|
| 1992 |
|
| 1993 |
leaderboard_table = gr.HTML(initial_table)
|
| 1994 |
|
| 1995 |
+
gr.HTML("""
|
| 1996 |
+
</div>
|
| 1997 |
+
</div>""")
|
| 1998 |
+
|
| 1999 |
# Radar Chart Section
|
| 2000 |
gr.HTML("""
|
| 2001 |
<div class="domain-selector-container domain-performance-container">
|
| 2002 |
<div class="domain-header">
|
| 2003 |
+
<h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
|
| 2004 |
+
<p class="domain-subtitle" style="color: white;">Track six essential pillars: Success, Execution, Reasoning, Robustness, Efficiency, and Call Validity.</p>
|
| 2005 |
</div>
|
| 2006 |
""")
|
| 2007 |
+
|
| 2008 |
+
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"):
|
| 2009 |
+
gr.HTML("""
|
| 2010 |
+
<div class="domain-header">
|
| 2011 |
+
<h2 class="domain-title" style="color: white;">🎯 Select Models for Comparison</h2>
|
| 2012 |
+
<p class="domain-subtitle" style="color: white;">Choose up to 5 models to map on the capability radar.</p>
|
| 2013 |
+
</div>
|
| 2014 |
+
""")
|
| 2015 |
+
model_selector = gr.Dropdown(
|
| 2016 |
+
choices=initial_df['Model'].tolist()[:10],
|
| 2017 |
+
value=initial_df['Model'].tolist()[:5],
|
| 2018 |
+
multiselect=True,
|
| 2019 |
+
label="",
|
| 2020 |
+
info=None,
|
| 2021 |
+
container=False,
|
| 2022 |
+
elem_classes=["model-dropdown"]
|
| 2023 |
+
)
|
| 2024 |
|
| 2025 |
# Radar chart plot - wrapped in centered container
|
| 2026 |
gr.HTML('<div class="chart-container radar-chart-container">')
|
|
|
|
| 2036 |
|
| 2037 |
gr.HTML("</div>")
|
| 2038 |
|
| 2039 |
+
# Level metric breakdown section
|
| 2040 |
+
gr.HTML("""
|
| 2041 |
+
<div class="domain-selector-container domain-performance-container level-metrics-wrapper">
|
| 2042 |
+
<div class="domain-header">
|
| 2043 |
+
<h2 class="domain-title" style="color: white;">Level-Specific Metric Spotlight</h2>
|
| 2044 |
+
<p class="domain-subtitle" style="color: white;">Dive deeper into each Ko-AgentBench stage and compare model scores across its unique evaluation metrics.</p>
|
| 2045 |
+
</div>
|
| 2046 |
+
""")
|
| 2047 |
+
|
| 2048 |
+
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
|
| 2049 |
+
gr.HTML("""
|
| 2050 |
+
<div class="domain-header">
|
| 2051 |
+
<h2 class="domain-title" style="color: white;">🧭 Select Task Level and Models</h2>
|
| 2052 |
+
<p class="domain-subtitle" style="color: white;">Choose a level and up to 5 models to explore their detailed SR-driven metrics.</p>
|
| 2053 |
+
</div>
|
| 2054 |
+
""")
|
| 2055 |
+
level_metric_selector = gr.Dropdown(
|
| 2056 |
+
choices=level_ids,
|
| 2057 |
+
value=level_ids[0] if level_ids else None,
|
| 2058 |
+
multiselect=False,
|
| 2059 |
+
label="",
|
| 2060 |
+
info=None,
|
| 2061 |
+
container=False,
|
| 2062 |
+
elem_classes=["level-dropdown"]
|
| 2063 |
+
)
|
| 2064 |
+
level_model_selector = gr.Dropdown(
|
| 2065 |
+
choices=initial_level_model_choices,
|
| 2066 |
+
value=initial_level_model_values,
|
| 2067 |
+
multiselect=True,
|
| 2068 |
+
label="",
|
| 2069 |
+
info=None,
|
| 2070 |
+
container=False,
|
| 2071 |
+
elem_classes=["model-dropdown", "level-model-dropdown"]
|
| 2072 |
+
)
|
| 2073 |
+
|
| 2074 |
+
gr.HTML('<div class="chart-container level-metric-chart-container">')
|
| 2075 |
+
level_metric_chart = gr.Plot(
|
| 2076 |
+
label="",
|
| 2077 |
+
value=initial_level_metric_chart,
|
| 2078 |
+
elem_classes=["level-metric-plot", "plot-container"]
|
| 2079 |
+
)
|
| 2080 |
+
gr.HTML("""
|
| 2081 |
+
</div>
|
| 2082 |
+
</div>
|
| 2083 |
+
""")
|
| 2084 |
+
|
| 2085 |
+
# Heatmap section
|
| 2086 |
+
gr.HTML("""
|
| 2087 |
+
<div class="domain-selector-container domain-performance-container heatmap-wrapper">
|
| 2088 |
+
<div class="domain-header">
|
| 2089 |
+
<h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
|
| 2090 |
+
<p class="domain-subtitle" style="color: white;">View Ko-AgentBench SR scores across L1~L7 for each model in a single glance.</p>
|
| 2091 |
+
</div>
|
| 2092 |
+
<div class="chart-container heatmap-chart-container">
|
| 2093 |
+
""")
|
| 2094 |
+
heatmap_chart = gr.Plot(
|
| 2095 |
+
label="",
|
| 2096 |
+
value=initial_heatmap,
|
| 2097 |
+
elem_classes=["heatmap-plot", "plot-container"]
|
| 2098 |
+
)
|
| 2099 |
+
gr.HTML("""
|
| 2100 |
+
</div>
|
| 2101 |
+
</div>
|
| 2102 |
+
""")
|
| 2103 |
+
|
| 2104 |
+
# Update functions
|
| 2105 |
+
def get_optimal_sort_order(sort_by_value):
    """Choose the sort direction that suits the metric being sorted on.

    Args:
        sort_by_value: Name of the metric column the table is sorted by.

    Returns:
        "Ascending" only for metrics listed as lower-is-better; otherwise
        "Descending". The lower-is-better list is currently empty, so every
        metric resolves to "Descending".
    """
    # Higher-is-better metrics: the overall score plus each level's SR column.
    higher_is_better = ["Overall Success", *(sr_column_map[lvl] for lvl in level_ids)]
    # Lower-is-better metrics: none at the moment.
    lower_is_better = []

    if sort_by_value not in higher_is_better and sort_by_value in lower_is_better:
        return "Ascending"
    # Covers both the known higher-is-better metrics and the default fallback.
    return "Descending"
|
| 2119 |
+
|
| 2120 |
+
def update_table(level_filter, model_type_filter, sort_order):
    """Rebuild the leaderboard heading and table for the current filters.

    Args:
        level_filter: Selected task level; "ALL" sorts by the overall score.
        model_type_filter: Selected model access type.
        sort_order: Selected sort direction.

    Returns:
        A ``(title_html, table_html)`` pair.
    """
    heading = update_leaderboard_title(level_filter)

    if level_filter == "ALL":
        metric = "Overall Success"
    else:
        # Fall back to the overall score when the level has no SR column.
        metric = sr_column_map.get(resolve_level(level_filter), "Overall Success")

    return heading, filter_and_sort_data(level_filter, model_type_filter, metric, sort_order)
|
| 2125 |
+
|
| 2126 |
+
def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Refresh every chart-side widget after a leaderboard filter changes.

    Args:
        domain_filter: Selected task level; "ALL" sorts by the overall score.
        model_type_filter: Selected model access type.
        sort_order: Selected sort direction.
        selected_models: Models currently picked for the radar chart.
        selected_level: Level currently picked for the level metric chart.
        level_selected_models: Models currently picked for the level metric chart.

    Returns:
        A 5-tuple: radar model dropdown, radar figure, heatmap figure,
        level-model dropdown, level metric figure.
    """
    df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    # NOTE(review): sort_order is passed before sort_metric here — confirm this
    # matches the parameter order apply_filters expects.
    filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)

    every_model = filtered_df['Model'].tolist()
    # The radar dropdown only offers the first 15 rows of the filtered frame.
    radar_choices = every_model[:15]

    # Keep the picks that are still offered; with none left, use the first five.
    valid_selected = [name for name in (selected_models or []) if name in radar_choices]
    if not valid_selected:
        valid_selected = radar_choices[:5]

    chart = create_domain_radar_chart(filtered_df, valid_selected)

    # Heatmap rows: selected models first, then the rest, de-duplicated, max 12.
    heatmap_order = []
    for name in valid_selected + every_model:
        if name not in heatmap_order:
            heatmap_order.append(name)
    heatmap_order = heatmap_order[:12]
    heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order)

    # Level metric chart: use the requested level when known, else the first one.
    if selected_level in level_ids:
        effective_level = selected_level
    elif level_ids:
        effective_level = level_ids[0]
    else:
        effective_level = None

    # At most five level models; with no valid pick, use the first five available.
    valid_level_models = [name for name in (level_selected_models or []) if name in every_model][:5]
    if not valid_level_models:
        valid_level_models = every_model[:5]

    if effective_level:
        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models)
    else:
        level_metric_fig = create_empty_level_metric_chart("Select a level to view its metrics")

    radar_dropdown = gr.Dropdown(
        choices=radar_choices,
        value=valid_selected,
        multiselect=True,
        label="",
        info=None,
        container=False,
        elem_classes=["model-dropdown"],
    )
    level_dropdown = gr.Dropdown(
        choices=every_model,
        value=valid_level_models,
        multiselect=True,
        label="",
        info=None,
        container=False,
        elem_classes=["model-dropdown", "level-model-dropdown"],
    )
    return radar_dropdown, chart, heatmap_fig, level_dropdown, level_metric_fig
|
| 2192 |
+
|
| 2193 |
+
def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Redraw the charts after the radar model selection changes.

    Unlike the filter-change handler, this does not return a new radar model
    dropdown, and radar picks are checked against every filtered model.

    Args:
        domain_filter: Selected task level; "ALL" sorts by the overall score.
        model_type_filter: Selected model access type.
        sort_order: Selected sort direction.
        selected_models: Models currently picked for the radar chart.
        selected_level: Level currently picked for the level metric chart.
        level_selected_models: Models currently picked for the level metric chart.

    Returns:
        A 4-tuple: radar figure, heatmap figure, level-model dropdown,
        level metric figure.
    """
    df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)

    pool = filtered_df['Model'].tolist()

    # Keep the picks that survive filtering; with none left, use the first five.
    valid_selected = [name for name in (selected_models or []) if name in pool] or pool[:5]

    # Heatmap rows: selected models first, then the rest; dict keys preserve
    # first-seen order while dropping duplicates. Capped at 12 rows.
    heatmap_order = list(dict.fromkeys(valid_selected + pool))[:12]

    if selected_level in level_ids:
        effective_level = selected_level
    elif level_ids:
        effective_level = level_ids[0]
    else:
        effective_level = None

    # At most five level models; with no valid pick, use the first five available.
    valid_level_models = [name for name in (level_selected_models or []) if name in pool][:5] or pool[:5]

    if effective_level:
        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models)
    else:
        level_metric_fig = create_empty_level_metric_chart("Select a level to view its metrics")

    radar_fig = create_domain_radar_chart(filtered_df, valid_selected)
    heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order)
    level_dropdown = gr.Dropdown(
        choices=pool,
        value=valid_level_models,
        multiselect=True,
        label="",
        info=None,
        container=False,
        elem_classes=["model-dropdown", "level-model-dropdown"],
    )
    return radar_fig, heatmap_fig, level_dropdown, level_metric_fig
|
| 2240 |
+
|
| 2241 |
+
def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Redraw only the level metric chart and its model dropdown.

    Args:
        domain_filter: Selected task level; "ALL" sorts by the overall score.
        model_type_filter: Selected model access type.
        sort_order: Selected sort direction.
        selected_models: Radar model selection; accepted but not read here.
        selected_level: Level currently picked for the level metric chart.
        level_selected_models: Models currently picked for the level metric chart.

    Returns:
        A ``(level-model dropdown, level metric figure)`` pair.
    """
    df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)

    pool = filtered_df['Model'].tolist()

    # At most five level models; with no valid pick, use the first five available.
    valid_level_models = [name for name in (level_selected_models or []) if name in pool][:5] or pool[:5]

    # Use the requested level when it is known, otherwise the first level.
    if selected_level in level_ids:
        effective_level = selected_level
    elif level_ids:
        effective_level = level_ids[0]
    else:
        effective_level = None

    if effective_level:
        level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models)
    else:
        level_chart = create_empty_level_metric_chart("Select a level to view its metrics")

    level_dropdown = gr.Dropdown(
        choices=pool,
        value=valid_level_models,
        multiselect=True,
        label="",
        info=None,
        container=False,
        elem_classes=["model-dropdown", "level-model-dropdown"],
    )
    return level_dropdown, level_chart
|
| 2266 |
+
|
| 2267 |
+
# Event wiring: filter controls drive the table and every chart.
filter_inputs = [domain_filter, model_type_filter, sort_order]
# Inputs shared by all chart handlers: the filters plus the three chart selectors.
radar_event_inputs = filter_inputs + [model_selector, level_metric_selector, level_model_selector]

for input_component in filter_inputs:
    # A filter change first re-renders the leaderboard title and table ...
    input_component.change(
        fn=update_table,
        inputs=filter_inputs,
        outputs=[leaderboard_title, leaderboard_table],
    )
    # ... and then refreshes both model dropdowns and all three charts.
    input_component.change(
        fn=update_radar_chart,
        inputs=radar_event_inputs,
        outputs=[model_selector, radar_chart, heatmap_chart, level_model_selector, level_metric_chart],
    )

# Changing the radar model selection redraws the charts but leaves that dropdown alone.
model_selector.change(
    fn=update_radar_only,
    inputs=radar_event_inputs,
    outputs=[radar_chart, heatmap_chart, level_model_selector, level_metric_chart],
)

# Both level-metric controls share one handler.
# NOTE(review): level_model_selector is both the trigger and an output of this
# handler — confirm the installed Gradio version does not re-fire it needlessly.
for level_control in (level_metric_selector, level_model_selector):
    level_control.change(
        fn=update_level_metric_only,
        inputs=radar_event_inputs,
        outputs=[level_model_selector, level_metric_chart],
    )
|
| 2302 |
+
|
| 2303 |
+
# Define generate_performance_card function before using it
|
| 2304 |
+
def generate_performance_card(model_name):
|
| 2305 |
+
"""Generate HTML for the model performance card"""
|
| 2306 |
+
if not model_name:
|
| 2307 |
+
return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
|
| 2308 |
+
Please select a model to generate its performance card
|
| 2309 |
+
</div>"""
|
| 2310 |
+
|
| 2311 |
+
# Get model data
|
| 2312 |
+
df = load_leaderboard_data()
|
| 2313 |
+
model_data = df[df['Model'] == model_name]
|
| 2314 |
+
|
| 2315 |
+
if model_data.empty:
|
| 2316 |
+
return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
|
| 2317 |
+
Model not found in the database
|
| 2318 |
+
</div>"""
|
| 2319 |
+
|
| 2320 |
+
row = model_data.iloc[0]
|
| 2321 |
+
|
| 2322 |
+
# Get overall rank based on overall success
|
| 2323 |
+
df_with_success = df.copy()
|
| 2324 |
+
df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
|
| 2325 |
df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
|
| 2326 |
df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
|
| 2327 |
try:
|
|
|
|
| 2452 |
gr.HTML("""
|
| 2453 |
<div class="domain-selector-container performance-card-container">
|
| 2454 |
<div class="domain-header">
|
| 2455 |
+
<h2 class="domain-title" style="color: white;">Model Performance Card</h2>
|
| 2456 |
+
<p class="domain-subtitle" style="color: white;">Comprehensive performance card for any model - perfect for presentations and reports</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2457 |
</div>
|
|
|
|
| 2458 |
<div class="performance-card-content">
|
| 2459 |
""")
|
| 2460 |
+
|
| 2461 |
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
|
| 2462 |
gr.HTML("""
|
| 2463 |
+
<div class="domain-header">
|
| 2464 |
+
<h2 class="domain-title" style="color: white;">🤖 Select Model</h2>
|
| 2465 |
+
<p class="domain-subtitle" style="color: white;">비교할 모델을 선택하세요.</p>
|
| 2466 |
+
</div>
|
| 2467 |
""")
|
| 2468 |
card_model_selector = gr.Dropdown(
|
| 2469 |
choices=initial_df['Model'].tolist(),
|
|
|
|
| 2471 |
label="",
|
| 2472 |
info=None,
|
| 2473 |
container=False,
|
| 2474 |
+
elem_classes=["model-dropdown"]
|
| 2475 |
)
|
| 2476 |
download_card_btn = gr.Button(
|
| 2477 |
+
"Download Card as PNG",
|
| 2478 |
elem_id="download-card-btn",
|
| 2479 |
elem_classes=["pill-button"]
|
| 2480 |
)
|
|
|
|
| 2493 |
</div>
|
| 2494 |
</div>
|
| 2495 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2496 |
|
| 2497 |
# Add custom CSS for the performance card
|
| 2498 |
gr.HTML("""
|
|
|
|
| 2715 |
.level-dropdown select,
|
| 2716 |
.level-dropdown [role="combobox"],
|
| 2717 |
.level-dropdown button {
|
| 2718 |
+
background: rgba(245, 246, 247, 0.06) !important;
|
| 2719 |
+
border: 1px solid var(--border-subtle) !important;
|
| 2720 |
border-radius: 999px !important;
|
| 2721 |
padding: 12px 20px !important;
|
| 2722 |
color: var(--text-primary) !important;
|
|
|
|
| 2726 |
text-align: center !important;
|
| 2727 |
min-height: 46px !important;
|
| 2728 |
transition: all 0.3s ease !important;
|
| 2729 |
+
box-shadow: 0 10px 24px rgba(255, 210, 30, 0.15) !important;
|
| 2730 |
}
|
| 2731 |
|
| 2732 |
.level-dropdown select:hover,
|
|
|
|
| 2743 |
margin: 12px auto 0 !important;
|
| 2744 |
}
|
| 2745 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2746 |
.radar-placeholder {
|
| 2747 |
display: flex;
|
| 2748 |
flex-direction: column;
|
|
|
|
| 2895 |
}
|
| 2896 |
}
|
| 2897 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2898 |
</style>
|
| 2899 |
|
| 2900 |
""")
|
|
|
|
| 3002 |
label="",
|
| 3003 |
info=None,
|
| 3004 |
container=False,
|
| 3005 |
+
elem_classes=["model-dropdown"]
|
| 3006 |
)
|
| 3007 |
|
| 3008 |
input_component.change(
|
|
|
|
| 3057 |
palette = [
|
| 3058 |
{'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
|
| 3059 |
{'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
|
| 3060 |
+
{'fill': 'rgba(249, 112, 185, 0.22)', 'line': '#F970B9'},
|
| 3061 |
+
{'fill': 'rgba(139, 92, 246, 0.20)', 'line': '#8B5CF6'},
|
| 3062 |
{'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
|
| 3063 |
]
|
| 3064 |
|
|
|
|
| 3182 |
height=800,
|
| 3183 |
width=900,
|
| 3184 |
margin=dict(t=30, b=50, l=10, r=10),
|
| 3185 |
+
autosize=True
|
|
|
|
| 3186 |
)
|
| 3187 |
|
| 3188 |
return fig
|
|
|
|
| 3441 |
model_palette = [
|
| 3442 |
'#ffd21e',
|
| 3443 |
'#FF8A3C',
|
| 3444 |
+
'#F970B9',
|
| 3445 |
+
'#8B5CF6',
|
| 3446 |
'#F8FAFC',
|
| 3447 |
'#38BDF8',
|
| 3448 |
]
|
|
|
|
| 3480 |
paper_bgcolor="#01091A",
|
| 3481 |
plot_bgcolor="rgba(245, 246, 247, 0.02)",
|
| 3482 |
height=plot_height,
|
| 3483 |
+
width=1450,
|
| 3484 |
margin=dict(t=90, b=80, l=220, r=160),
|
| 3485 |
legend=dict(
|
| 3486 |
orientation="h",
|
|
|
|
| 3532 |
paper_bgcolor="#01091A",
|
| 3533 |
plot_bgcolor="rgba(245, 246, 247, 0.02)",
|
| 3534 |
height=420,
|
| 3535 |
+
width=1450,
|
| 3536 |
margin=dict(t=80, b=60, l=80, r=120),
|
| 3537 |
title=dict(
|
| 3538 |
text="<b>Level Metric Breakdown</b>",
|
tabs/leaderboard_v1_en.py
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
utils.py
CHANGED
|
@@ -9,8 +9,8 @@ def get_chart_colors():
|
|
| 9 |
# "grid": (1, 1, 1, 0.1), # RGBA tuple for grid
|
| 10 |
# }
|
| 11 |
return {
|
| 12 |
-
"Private": "#
|
| 13 |
-
"Open source": "#
|
| 14 |
"performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
|
| 15 |
"text": "#111827",
|
| 16 |
"background": "#FFFFFF",
|
|
@@ -20,12 +20,10 @@ def get_chart_colors():
|
|
| 20 |
|
| 21 |
def get_rank_badge(rank):
|
| 22 |
"""Generate HTML for rank badge with appropriate styling"""
|
| 23 |
-
tag_background = "#593B1D"
|
| 24 |
-
tag_text_color = "#FFFFFF"
|
| 25 |
badge_styles = {
|
| 26 |
-
1: ("1st",
|
| 27 |
-
2: ("2nd",
|
| 28 |
-
3: ("3rd",
|
| 29 |
}
|
| 30 |
|
| 31 |
if rank in badge_styles:
|
|
@@ -65,25 +63,24 @@ def get_type_badge(model_type):
|
|
| 65 |
"""Generate HTML for model type badge"""
|
| 66 |
colors = get_chart_colors()
|
| 67 |
color_map = {
|
| 68 |
-
"Open source": colors.get("Open source", "#
|
| 69 |
-
"Proprietary": colors.get("Private", "#
|
| 70 |
-
"Private": colors.get("Private", "#
|
| 71 |
}
|
| 72 |
label_map = {
|
| 73 |
"Open source": "OSS",
|
| 74 |
"Proprietary": "API",
|
| 75 |
"Private": "API",
|
| 76 |
}
|
| 77 |
-
bg_color = color_map.get(model_type, "#
|
| 78 |
display_label = label_map.get(model_type, model_type)
|
| 79 |
-
text_color = "#111827" if display_label == "OSS" else "#FFFFFF"
|
| 80 |
return f"""
|
| 81 |
<div style="
|
| 82 |
display: inline-flex;
|
| 83 |
align-items: center;
|
| 84 |
padding: 4px 8px;
|
| 85 |
background: {bg_color};
|
| 86 |
-
color:
|
| 87 |
border-radius: 4px;
|
| 88 |
font-size: 0.85em;
|
| 89 |
font-weight: 500;
|
|
|
|
| 9 |
# "grid": (1, 1, 1, 0.1), # RGBA tuple for grid
|
| 10 |
# }
|
| 11 |
return {
|
| 12 |
+
"Private": "#3F78FA", # accent-blue light
|
| 13 |
+
"Open source": "#A13AE2", # accent-purple light
|
| 14 |
"performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
|
| 15 |
"text": "#111827",
|
| 16 |
"background": "#FFFFFF",
|
|
|
|
| 20 |
|
| 21 |
def get_rank_badge(rank):
|
| 22 |
"""Generate HTML for rank badge with appropriate styling"""
|
|
|
|
|
|
|
| 23 |
badge_styles = {
|
| 24 |
+
1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
|
| 25 |
+
2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
|
| 26 |
+
3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
|
| 27 |
}
|
| 28 |
|
| 29 |
if rank in badge_styles:
|
|
|
|
| 63 |
"""Generate HTML for model type badge"""
|
| 64 |
colors = get_chart_colors()
|
| 65 |
color_map = {
|
| 66 |
+
"Open source": colors.get("Open source", "#A13AE2"),
|
| 67 |
+
"Proprietary": colors.get("Private", "#3F78FA"),
|
| 68 |
+
"Private": colors.get("Private", "#3F78FA"),
|
| 69 |
}
|
| 70 |
label_map = {
|
| 71 |
"Open source": "OSS",
|
| 72 |
"Proprietary": "API",
|
| 73 |
"Private": "API",
|
| 74 |
}
|
| 75 |
+
bg_color = color_map.get(model_type, "#4F46E5")
|
| 76 |
display_label = label_map.get(model_type, model_type)
|
|
|
|
| 77 |
return f"""
|
| 78 |
<div style="
|
| 79 |
display: inline-flex;
|
| 80 |
align-items: center;
|
| 81 |
padding: 4px 8px;
|
| 82 |
background: {bg_color};
|
| 83 |
+
color: white;
|
| 84 |
border-radius: 4px;
|
| 85 |
font-size: 0.85em;
|
| 86 |
font-weight: 500;
|