feat: Add structured output support and refactor comments
- Integrate structured output benchmark functionality
- Clean up Turkish comments and improve code quality
- app.py +31 -29
- result/structured_output/avg_str001.json +5 -0
- result/structured_output/avg_str002.json +5 -0
- result/structured_output/avg_str003.json +5 -0
- result/structured_output/avg_str004.json +5 -0
- result/structured_output/avg_str005.json +5 -0
- result/structured_output/avg_str006.json +5 -0
- result/structured_output/avg_str007.json +5 -0
- result/structured_output/avg_str008.json +5 -0
- result/structured_output/avg_str009.json +5 -0
- result/structured_output/avg_str010.json +5 -0
- result/structured_output/avg_str011.json +5 -0
- result/structured_output/avg_str012.json +5 -0
- result/structured_output/avg_str013.json +5 -0
- result/structured_output/avg_str014.json +6 -0
- result/structured_output/detail_str001.json +14 -0
- result/structured_output/detail_str002.json +14 -0
- result/structured_output/detail_str003.json +14 -0
- result/structured_output/detail_str004.json +14 -0
- result/structured_output/detail_str005.json +14 -0
- result/structured_output/detail_str006.json +14 -0
- result/structured_output/detail_str007.json +14 -0
- result/structured_output/detail_str008.json +14 -0
- result/structured_output/detail_str009.json +14 -0
- result/structured_output/detail_str010.json +14 -0
- result/structured_output/detail_str011.json +14 -0
- result/structured_output/detail_str012.json +14 -0
- result/structured_output/detail_str013.json +14 -0
- result/structured_output/detail_str014.json +24 -0
- src/display/about.py +32 -1
- src/utils.py +197 -18
app.py
CHANGED

@@ -43,10 +43,10 @@ from src.utils import (
     create_light_eval_table,
     create_raw_details_table,
     create_human_arena_table,
+    create_structured_outputs_table,
     update_supported_base_models
 )
 
-# Pipelines utils fonksiyonlarını import et
 from pipelines.utils.common import search_and_filter
 from pipelines.unified_benchmark import submit_unified_benchmark
 
@@ -72,7 +72,6 @@ def format_dataframe(df, is_light_eval_detail=False):
     if df.empty:
         return df
 
-    # 'file' sütununu kaldır
     if 'file' in df.columns:
         df = df.drop(columns=['file'])
 
@@ -83,16 +82,24 @@ def format_dataframe(df, is_light_eval_detail=False):
         if col in df.columns:
             df = df.drop(columns=[col])
 
-    # Float değerleri yuvarlama
-
+    # Float değerleri yuvarlama
+    # Varsayılan: 2 hane. Light eval detail veya structured_output_score kolonları varsa: 4 hane.
+    # Leaderboard için özel durum: "Structured Outputs" ve "Retrieval" kolonlarını 4 hane tut.
+    if is_light_eval_detail or "structured_output_score" in df.columns:
+        default_decimal_places = 4
+    else:
+        default_decimal_places = 2
+    four_decimal_cols = {"Structured Outputs"}
     for column in df.columns:
         try:
             if pd.api.types.is_float_dtype(df[column]):
-
+                if column in four_decimal_cols:
+                    df[column] = df[column].round(4)
+                else:
+                    df[column] = df[column].round(default_decimal_places)
         except:
             continue
 
-    # Kolon isimlerini düzgün formata getir
     column_mapping = {}
     for col in df.columns:
         # Skip run_id and user_id fields
@@ -162,15 +169,12 @@ def create_demo():
         gr.Markdown(TITLE)
         gr.Markdown(INTRODUCTION_TEXT)
 
-        # Hidden session state to track login expiration
         session_expiry = gr.State(None)
 
         try:
-            # Benchmark sonuçlarını yükle
             benchmark_results = load_benchmark_results()
             default_plots = create_benchmark_plots(benchmark_results, "avg")
 
-            # State variable to track login state across page refreshes
             login_state = gr.State(value=False)
 
             with gr.Tabs() as tabs:
@@ -178,8 +182,6 @@ def create_demo():
                     gr.Markdown("## Model Evaluation Results")
                     gr.Markdown("This screen shows model performance across different evaluation categories.")
 
-                    # Remove the separate refresh button row
-                    # Instead, combine search and refresh in one row
                     with gr.Row():
                         search_input = gr.Textbox(
                             label="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
@@ -192,10 +194,8 @@ def create_demo():
                     # # Status display for refresh results
                     # refresh_status = gr.Markdown("", visible=False)
 
-                    # Benchmark tablarını semboller içeren tab grubuyla göster
                    with gr.Tabs() as benchmark_tabs:
                        with gr.TabItem("👥 Human Arena"):
-                            # Human Arena sonuçları - detail dosyalarını kullan
                            human_arena_data = benchmark_results["raw"]["human_arena"]
 
                            # Store human arena data in a state component for filtering
@@ -220,7 +220,6 @@ def create_demo():
                                if filtered_df.empty:
                                    filtered_df = pd.DataFrame({"Model Name": ["No data available"]})
 
-                                # Return updated buttons with new variants
                                if category == "general":
                                    return (
                                        filtered_df,
@@ -236,7 +235,6 @@ def create_demo():
                                        gr.Button("Reasoning", variant="primary", elem_id="human_arena_reasoning_btn", elem_classes=["active-btn"])
                                    )
 
-                            # Initial table load
                            if human_arena_data:
                                human_arena_df = create_human_arena_table(human_arena_data, category="general")
                            else:
@@ -257,7 +255,6 @@ def create_demo():
                                column_widths=["300px", "150px", "110px", "110px", "110px", "156px", "169px", "100px", "120px"]
                            )
 
-                            # Button click handlers
                            general_btn.click(
                                fn=lambda data: filter_human_arena_table("general", data),
                                inputs=[human_arena_state],
@@ -295,7 +292,6 @@ def create_demo():
                            )
 
                        with gr.TabItem("🏟️ Auto Arena"):
-                            # Arena sonuçları - detail dosyalarını kullan
                            arena_details_df = create_raw_details_table(benchmark_results, "arena")
                            arena_details_df = format_dataframe(arena_details_df)
 
@@ -311,7 +307,6 @@ def create_demo():
                            )
 
                        with gr.TabItem("📚 Retrieval"):
-                            # RAG Judge sonuçları - detail dosyalarını kullan
                            rag_details_df = create_raw_details_table(benchmark_results, "retrieval")
                            rag_details_df = format_dataframe(rag_details_df)
 
@@ -326,8 +321,21 @@ def create_demo():
 
                            )
 
+                        with gr.TabItem("🔧 Structured Outputs"):
+                            structured_details_df = create_structured_outputs_table(benchmark_results["raw"]["structured_output"], is_detail=True)
+
+                            if structured_details_df.empty:
+                                structured_details_df = pd.DataFrame({"Model": ["No data available"]})
+
+                            structured_table = gr.DataFrame(
+                                value=structured_details_df,
+                                label="Structured Outputs Detailed Results",
+                                interactive=False,
+                                column_widths=["300px", "250px", "110px", "150px", "100px", "150px", "150px", "100px", "100px", "100px", "120px"]
+
+                            )
+
                        with gr.TabItem("⚡ Light Eval"):
-                            # Light Eval sonuçları - detail dosyalarını kullan
                            light_details_data = benchmark_results["raw"]["light_eval"]
                            if light_details_data:
                                light_details_df = create_light_eval_table(light_details_data, is_detail=True)
@@ -348,7 +356,6 @@ def create_demo():
                            )
 
                        with gr.TabItem("📋 EvalMix"):
-                            # Hybrid Benchmark sonuçları - detail dosyalarını kullan
                            hybrid_details_df = create_raw_details_table(benchmark_results, "evalmix")
                            hybrid_details_df = format_dataframe(hybrid_details_df)
 
@@ -364,7 +371,6 @@ def create_demo():
                            )
 
                        with gr.TabItem("🐍 𝐒𝐧𝐚𝐤𝐞 𝐁𝐞𝐧𝐜𝐡"):
-                            # Snake Benchmark sonuçları - detail dosyalarını kullan
                            snake_details_df = create_raw_details_table(benchmark_results, "snake")
                            snake_details_df = format_dataframe(snake_details_df)
 
@@ -410,32 +416,29 @@ def create_demo():
                    # ]
                    # )
 
-                    # Tüm sekmeler için ortak arama fonksiyonu
                    def search_all_tabs(query, original_data):
                        """
-
+                        Search across all tabs
                        """
                        if not query or query.strip() == "":
-                            # Boş arama - orijinal veriyi döndür
                            return (original_data, arena_details_df, human_arena_df,
-                                    rag_details_df, light_details_df, hybrid_details_df, snake_details_df)
+                                    rag_details_df, structured_details_df, light_details_df, hybrid_details_df, snake_details_df)
 
-                        # Arama var - tüm sekmeleri filtrele
                        return (
                            search_and_filter(query, original_data, "All"),
                            search_and_filter(query, arena_details_df, "All"),
                            search_and_filter(query, human_arena_df, "All"),
                            search_and_filter(query, rag_details_df, "All"),
+                            search_and_filter(query, structured_details_df, "All"),
                            search_and_filter(query, light_details_df, "All"),
                            search_and_filter(query, hybrid_details_df, "All"),
                            search_and_filter(query, snake_details_df, "All")
                        )
 
-                    # Arama fonksiyonu - tüm sekmeleri güncelle
                    search_input.change(
                        search_all_tabs,
                        inputs=[search_input, original_leaderboard_data],
-                        outputs=[combined_table, arena_table, human_arena_table, rag_table, light_table, hybrid_table, snake_table]
+                        outputs=[combined_table, arena_table, human_arena_table, rag_table, structured_table, light_table, hybrid_table, snake_table]
                    )
 
                with gr.TabItem("ℹ️ About", elem_id="about-tab"):
@@ -692,7 +695,6 @@ def create_demo():
                logging.warning(f"Error checking model type: {str(e)}")
 
            # Call the benchmark function with profile information
-            # base_model validasyonunu kaldırdık ama parametre olarak yine de gönderiyoruz
            result_message, _ = submit_unified_benchmark(model, base_model, reasoning, email, profile)
            logging.info(f"Submission processed for model: {model}")
            return result_message
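The app.py change follows the same pattern as the existing benchmark tabs: build the detail DataFrame, show it in a gr.DataFrame inside its own gr.TabItem, and append that component to the outputs of the shared search handler. A minimal sketch of that wiring with toy data and made-up component names (not the leaderboard's real components), assuming current Gradio Blocks APIs:

```python
import gradio as gr
import pandas as pd

# Toy stand-ins for the per-benchmark detail tables.
arena_df = pd.DataFrame({"Model": ["model-a", "model-b"], "Elo": [1020, 980]})
structured_df = pd.DataFrame({"Model": ["model-a", "model-b"], "Structured Outputs": [0.7635, 0.7309]})

def search_all(query, *tables):
    # Empty query: return every table unchanged; otherwise filter each by model name.
    if not query or not query.strip():
        return tables
    q = query.strip()
    return tuple(t[t["Model"].str.contains(q, case=False, regex=False, na=False)] for t in tables)

with gr.Blocks() as demo:
    search_box = gr.Textbox(label="Search")
    with gr.Tabs():
        with gr.TabItem("Auto Arena"):
            arena_table = gr.DataFrame(value=arena_df, interactive=False)
        with gr.TabItem("Structured Outputs"):
            structured_table = gr.DataFrame(value=structured_df, interactive=False)
    # Adding a new tab means adding its DataFrame component to this outputs list,
    # which is exactly what the diff does with structured_table.
    search_box.change(lambda q: search_all(q, arena_df, structured_df),
                      inputs=[search_box],
                      outputs=[arena_table, structured_table])

if __name__ == "__main__":
    demo.launch()
```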
result/structured_output/avg_str001.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "meta-llama/Llama-3.3-70b-Instruct",
+    "structured_output_score": 0.7635,
+    "run_id": "str001"
+}

result/structured_output/avg_str002.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "grok-3",
+    "structured_output_score": 0.7628,
+    "run_id": "str002"
+}

result/structured_output/avg_str003.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "newmindai/Llama-3.3-70b-Instruct",
+    "structured_output_score": 0.7622,
+    "run_id": "str003"
+}

result/structured_output/avg_str004.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "deepseek-ai/DeepSeek-R1",
+    "structured_output_score": 0.76,
+    "run_id": "str004"
+}

result/structured_output/avg_str005.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "google/gemma-3-27b-it",
+    "structured_output_score": 0.7478,
+    "run_id": "str005"
+}

result/structured_output/avg_str006.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "grok-3-mini-fast-beta",
+    "structured_output_score": 0.7471,
+    "run_id": "str006"
+}

result/structured_output/avg_str007.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+    "structured_output_score": 0.7424,
+    "run_id": "str007"
+}

result/structured_output/avg_str008.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "Qwen/Qwen3-32B",
+    "structured_output_score": 0.735,
+    "run_id": "str008"
+}

result/structured_output/avg_str009.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "structured_output_score": 0.7309,
+    "run_id": "str009"
+}

result/structured_output/avg_str010.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "newmindai/QwQ-32B-r1",
+    "structured_output_score": 0.7252,
+    "run_id": "str010"
+}

result/structured_output/avg_str011.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "Qwen/QwQ-32B",
+    "structured_output_score": 0.7205,
+    "run_id": "str011"
+}

result/structured_output/avg_str012.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "microsoft/phi-4",
+    "structured_output_score": 0.6906,
+    "run_id": "str012"
+}

result/structured_output/avg_str013.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "model_name": "Qwen/Qwen3-14B",
+    "structured_output_score": 0.6153,
+    "run_id": "str013"
+}

result/structured_output/avg_str014.json
ADDED
@@ -0,0 +1,6 @@
+{
+    "model_name": "newmindai/Qwen2.5-72b-Instruct",
+    "structured_output_score": 0.761,
+    "run_id": "ec6bf42a-4482-4f8c-9fbd-2ab5f1eed6bb"
+}
+
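Each avg_str*.json added above is a flat record: model name, aggregate structured_output_score, and run id. A hedged sketch of how files in this layout can be collected into a ranking table (the actual loader is load_benchmark_results in src/utils.py and may differ in detail):

```python
import glob
import json
import pandas as pd

rows = []
for path in sorted(glob.glob("result/structured_output/avg_*.json")):
    with open(path, encoding="utf-8") as f:
        rows.append(json.load(f))

# One row per run, ranked by the aggregate structured-output score.
df = pd.DataFrame(rows).sort_values("structured_output_score", ascending=False)
print(df[["model_name", "structured_output_score", "run_id"]].to_string(index=False))
```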
result/structured_output/detail_str001.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "meta-llama/Llama-3.3-70b-Instruct",
+    "structured_output_score": 0.7635,
+    "semantic": 0.5271,
+    "response_format": "506/506",
+    "name": 0.6364,
+    "document_note": 0.2194,
+    "document_date": 0.6561,
+    "from": 0.6319,
+    "to": 0.4919,
+    "dtype": "bfloat16",
+    "licence": "Llama-3.3",
+    "run_id": "str001"
+}

result/structured_output/detail_str002.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "grok-3",
+    "structured_output_score": 0.7628,
+    "semantic": 0.5256,
+    "response_format": "506/506",
+    "name": 0.6344,
+    "document_note": 0.166,
+    "document_date": 0.6482,
+    "from": 0.6493,
+    "to": 0.5299,
+    "dtype": "Unknown",
+    "licence": "Proprietary",
+    "run_id": "str002"
+}

result/structured_output/detail_str003.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "newmindai/Llama-3.3-70b-Instruct",
+    "structured_output_score": 0.7622,
+    "semantic": 0.5245,
+    "response_format": "506/506",
+    "name": 0.6423,
+    "document_note": 0.2016,
+    "document_date": 0.6561,
+    "from": 0.6259,
+    "to": 0.4966,
+    "dtype": "bfloat16",
+    "licence": "Llama-3.3",
+    "run_id": "str003"
+}

result/structured_output/detail_str004.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "deepseek-ai/DeepSeek-R1",
+    "structured_output_score": 0.76,
+    "semantic": 0.5199,
+    "response_format": "506/506",
+    "name": 0.6601,
+    "document_note": 0.1917,
+    "document_date": 0.6542,
+    "from": 0.6223,
+    "to": 0.4713,
+    "dtype": "bfloat16",
+    "licence": "MIT",
+    "run_id": "str004"
+}

result/structured_output/detail_str005.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "google/gemma-3-27b-it",
+    "structured_output_score": 0.7478,
+    "semantic": 0.4955,
+    "response_format": "506/506",
+    "name": 0.5909,
+    "document_note": 0.2055,
+    "document_date": 0.6502,
+    "from": 0.6044,
+    "to": 0.4264,
+    "dtype": "bfloat16",
+    "licence": "Gemma",
+    "run_id": "str005"
+}

result/structured_output/detail_str006.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "grok-3-mini-fast-beta",
+    "structured_output_score": 0.7471,
+    "semantic": 0.4943,
+    "response_format": "506/506",
+    "name": 0.6403,
+    "document_note": 0.1957,
+    "document_date": 0.6324,
+    "from": 0.567,
+    "to": 0.4363,
+    "dtype": "Unknown",
+    "licence": "Proprietary",
+    "run_id": "str006"
+}

result/structured_output/detail_str007.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+    "structured_output_score": 0.7424,
+    "semantic": 0.4847,
+    "response_format": "506/506",
+    "name": 0.581,
+    "document_note": 0.2134,
+    "document_date": 0.6561,
+    "from": 0.5248,
+    "to": 0.4482,
+    "dtype": "bfloat16",
+    "licence": "Llama 3.1",
+    "run_id": "str007"
+}

result/structured_output/detail_str008.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "Qwen/Qwen3-32B",
+    "structured_output_score": 0.735,
+    "semantic": 0.482,
+    "response_format": "500/506",
+    "name": 0.566,
+    "document_note": 0.21,
+    "document_date": 0.636,
+    "from": 0.5614,
+    "to": 0.4367,
+    "dtype": "bfloat16",
+    "licence": "Qwen",
+    "run_id": "str008"
+}

result/structured_output/detail_str009.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "structured_output_score": 0.7309,
+    "semantic": 0.4618,
+    "response_format": "506/506",
+    "name": 0.502,
+    "document_note": 0.1957,
+    "document_date": 0.6383,
+    "from": 0.5927,
+    "to": 0.3801,
+    "dtype": "bfloat16",
+    "licence": "Qwen",
+    "run_id": "str009"
+}

result/structured_output/detail_str010.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "newmindai/QwQ-32B-r1",
+    "structured_output_score": 0.7252,
+    "semantic": 0.4564,
+    "response_format": "503/506",
+    "name": 0.507,
+    "document_note": 0.1272,
+    "document_date": 0.6243,
+    "from": 0.5816,
+    "to": 0.4419,
+    "dtype": "bfloat16",
+    "licence": "Apache 2.0",
+    "run_id": "str010"
+}

result/structured_output/detail_str011.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "Qwen/QwQ-32B",
+    "structured_output_score": 0.7205,
+    "semantic": 0.4468,
+    "response_format": "503/506",
+    "name": 0.4791,
+    "document_note": 0.1352,
+    "document_date": 0.6243,
+    "from": 0.573,
+    "to": 0.4224,
+    "dtype": "bfloat16",
+    "licence": "Apache 2.0",
+    "run_id": "str011"
+}

result/structured_output/detail_str012.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "microsoft/phi-4",
+    "structured_output_score": 0.6906,
+    "semantic": 0.3912,
+    "response_format": "503/506",
+    "name": 0.3752,
+    "document_note": 0.2275,
+    "document_date": 0.5768,
+    "from": 0.4542,
+    "to": 0.3222,
+    "dtype": "bfloat16",
+    "licence": "MIT",
+    "run_id": "str012"
+}

result/structured_output/detail_str013.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "model_name": "Qwen/Qwen3-14B",
+    "structured_output_score": 0.6153,
+    "semantic": 0.2426,
+    "response_format": "501/506",
+    "name": 0.31,
+    "document_note": 0.156,
+    "document_date": 0.538,
+    "from": 0.1095,
+    "to": 0.0998,
+    "dtype": "bfloat16",
+    "licence": "Apache 2.0",
+    "run_id": "str013"
+}

result/structured_output/detail_str014.json
ADDED
@@ -0,0 +1,24 @@
+{
+    "model_name": "newmindai/Qwen2.5-72b-Instruct",
+    "structured_output_score": 0.761,
+    "semantic": 0.5219,
+    "response_format": "506/506",
+    "name": 0.5632,
+    "document_note": 0.2905,
+    "document_date": 0.6403,
+    "from": 0.6136,
+    "to": 0.5018,
+    "dtype": "bfloat16",
+    "licence": "Qwen",
+    "run_id": "ec6bf42a-4482-4f8c-9fbd-2ab5f1eed6bb",
+    "ISL": 1712575,
+    "OSL": 183946,
+    "cost": null,
+    "e2e_benchmark_time": 114.4683,
+    "model_generation_time": 104.2865,
+    "scoring_duration_minutes": 10.1755,
+    "provider": "nebius",
+    "sample_count": 506,
+    "success_response": 506
+}
+
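The detail files break the aggregate score into a semantic score, a response-format success ratio, and five per-field scores. The stored structured_output_score is consistent with averaging the semantic score and the format ratio: for str001, (0.5271 + 506/506) / 2 ≈ 0.7636 against the stored 0.7635, and every other row agrees to within about 0.002 (the intermediate fields are themselves rounded). A quick check over the files in this commit:

```python
import glob
import json

for path in sorted(glob.glob("result/structured_output/detail_str*.json")):
    with open(path, encoding="utf-8") as f:
        item = json.load(f)
    # response_format is stored as "successes/total", e.g. "506/506".
    ok, total = (int(x) for x in item["response_format"].split("/"))
    recomputed = (item["semantic"] + ok / total) / 2
    print(f'{item["model_name"]:<42} stored={item["structured_output_score"]:.4f} '
          f'recomputed={recomputed:.4f}')
```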
src/display/about.py
CHANGED

@@ -57,7 +57,7 @@ Evaluate your model's performance in the following categories:
 
 6. 🐍 **Snake Bench** - Specialized evaluation measuring step-by-step problem solving and complex reasoning abilities.
 
-7. 🧩 **Structured Outputs** -
+7. 🧩 **Structured Outputs** - Evaluation of models' ability to generate properly formatted, structured responses with accurate field extraction and semantic understanding.
 
 Evaluate your model in any or all of these categories to discover its capabilities and areas of excellence.
 
@@ -244,6 +244,37 @@ Human evaluators consider multiple factors when comparing model responses:
 
 Human Arena provides a complementary perspective to automated benchmarks, capturing nuanced human preferences that traditional metrics might miss. This evaluation is particularly valuable for understanding how models perform in real-world conversational scenarios.
 
+### 7. 🧩 Structured Outputs
+Structured Outputs evaluation assesses models' ability to generate properly formatted, structured responses with accurate field extraction and semantic understanding. This benchmark tests how well language models can parse, understand, and extract specific information from documents while maintaining semantic coherence.
+
+**Evaluation Methodology:**
+Models are evaluated on their ability to extract structured information from Turkish legal documents. The evaluation uses advanced semantic similarity scoring with Turkish-specific embedding models for accurate assessment.
+
+**Technical Implementation:**
+- **Embedding Model**: Primary evaluation uses [`newmindai/TurkEmbed4Retrieval`](https://huggingface.co/newmindai/TurkEmbed4Retrieval) for Turkish-specific semantic understanding
+- **Similarity Threshold**: 0.75 cosine similarity threshold for field matching
+- **Ground Truth Comparison**: MongoDB-stored ground truth data with pre-computed embeddings
+
+**Evaluation Metrics:**
+
+- **Overall**: Combined overall performance metric that averages Semantic understanding and Response Format success ratio
+- **Semantic**: Measures semantic understanding and coherence of extracted information using cosine similarity (corresponds to `overall_score` in scoring)
+- **Response Format**: Success ratio showing successful JSON extractions vs total attempts (success_response/sample_count)
+- **Name**: Accuracy in extracting and identifying name fields from legal documents (20% weight)
+- **Document Date**: Accuracy in date field extraction with multiple format support (20% weight)
+- **Document Note**: Performance in extracting document annotation information using semantic similarity (20% weight)
+- **From**: Performance in extracting source/sender information as lists with semantic matching (20% weight)
+- **To**: Accuracy in extracting destination/recipient information as lists with semantic matching (20% weight)
+
+**Scoring Algorithm:**
+The evaluation uses a sophisticated multi-level scoring system:
+
+1. **String Fields** (name, document_note): Turkish embedding similarity with 0.75 threshold using `newmindai/TurkEmbed4Retrieval`
+2. **Date Fields** (document_date): Exact date matching with multiple format parsing support
+3. **List Fields** (from, to): One-way similarity from ground truth to predictions using semantic matching
+4. **Overall Score Calculation**: `Overall = (Semantic + Response Format) / 2`
+5. **Field Weights**: Each extraction field (name, document_date, document_note, from, to) contributes equally with 20% weight to the semantic score
+
 """
 
 EVALUATION_QUEUE_TEXT = """
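The scoring rules spelled out in about.py translate directly into code. The sketch below is an illustration of that description, not the benchmark's actual implementation: the sentence-transformers usage, the exact way the 0.75 threshold enters each per-field score, and the date formats are assumptions; only the model id newmindai/TurkEmbed4Retrieval, the five equally weighted fields, and Overall = (Semantic + Response Format) / 2 come from the text above.

```python
from datetime import datetime

from sentence_transformers import SentenceTransformer, util

EMBEDDER = SentenceTransformer("newmindai/TurkEmbed4Retrieval")
FIELDS = ("name", "document_date", "document_note", "from", "to")  # 20% weight each
THRESHOLD = 0.75  # cosine-similarity threshold for field matching


def string_score(pred: str, truth: str) -> float:
    # String fields: Turkish embedding cosine similarity, counted as a match above the threshold
    # (assumed binary scoring; per-sample scores are averaged over the dataset).
    sim = util.cos_sim(EMBEDDER.encode(pred), EMBEDDER.encode(truth)).item()
    return 1.0 if sim >= THRESHOLD else 0.0


def date_score(pred: str, truth: str) -> float:
    # Date fields: exact match after trying a few common formats (assumed format list).
    def parse(s):
        for fmt in ("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y"):
            try:
                return datetime.strptime(s, fmt).date()
            except (ValueError, TypeError):
                continue
        return None
    p, t = parse(pred), parse(truth)
    return 1.0 if p is not None and p == t else 0.0


def list_score(pred: list[str], truth: list[str]) -> float:
    # List fields: one-way similarity from each ground-truth entry to its best-matching prediction.
    if not truth:
        return 1.0
    if not pred:
        return 0.0
    return sum(max(string_score(p, t) for p in pred) for t in truth) / len(truth)


def overall_score(field_scores: dict[str, float], success_response: int, sample_count: int) -> float:
    semantic = sum(field_scores[f] for f in FIELDS) / len(FIELDS)  # five fields, 20% each
    response_format = success_response / sample_count              # e.g. 506/506
    return (semantic + response_format) / 2                        # Overall = (Semantic + Response Format) / 2
```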
src/utils.py
CHANGED

@@ -118,7 +118,6 @@ def filter_models(
     return filtered_df
 
 
-# Yeni fonksiyonlar
 def load_benchmark_results():
     """
     Load benchmark results from local files
@@ -130,7 +129,8 @@ def load_benchmark_results():
             "snake": [],
             "retrieval": [],
             "arena": [],
-            "human_arena": []
+            "human_arena": [],
+            "structured_output": []
         },
         "raw": {
             "evalmix": [],
@@ -138,12 +138,13 @@ def load_benchmark_results():
             "snake": [],
             "retrieval": [],
             "arena": [],
-            "human_arena": []
+            "human_arena": [],
+            "structured_output": []
         }
     }
 
     # Define benchmark types to look for
-    benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena"]  # "lm_harness" removed
+    benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena", "structured_output"]  # "lm_harness" removed
 
     # Initialize RAG Score calculator for runtime calculation
     rag_calculator = None
@@ -387,7 +388,6 @@ def create_evalmix_table(data):
     else:
         df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2)
 
-    # Float değerleri 2 ondalık basamağa yuvarla
     for column in df.columns:
         try:
             if pd.api.types.is_float_dtype(df[column]):
@@ -485,7 +485,6 @@ def create_light_eval_table(data, is_detail=False):
     if not data:
         return pd.DataFrame()
 
-    # Light eval sonuçları farklı formatta, düzenleme gerekiyor
     formatted_data = []
     for item in data:
         model_data = {"model_name": format_model_name(item.get("model_name", "Bilinmeyen Model"))}
@@ -557,7 +556,6 @@ def create_light_eval_table(data, is_detail=False):
         # Sort with NaN at the end
         df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
 
-    # Float değerleri yuvarlama - detail için 4 hane, avg için 2 hane
     decimal_places = 4 if is_detail else 2
     for column in df.columns:
         try:
@@ -609,6 +607,138 @@ def create_light_eval_table(data, is_detail=False):
 
     return df
 
+def create_structured_outputs_table(data, is_detail=False):
+    """
+    Creates a table from Structured Outputs results
+
+    Args:
+        data: Structured outputs data
+        is_detail: If True, keep 4 decimal places for detail results
+    """
+    if not data:
+        return pd.DataFrame()
+
+    formatted_data = []
+    for item in data:
+        model_data = {"model": format_model_name(item.get("model_name", "") or item.get("model", "Bilinmeyen Model"))}
+
+        # Add specific metrics we're interested in for Structured Outputs
+        metrics = [
+            "structured_output_score",
+            "semantic",
+            "response_format",
+            "name",
+            "document_note",
+            "document_date",
+            "from",
+            "to",
+            "dtype",
+            "licence"
+        ]
+
+        for metric in metrics:
+            try:
+                if metric in ["dtype", "licence"]:
+                    # Use the value from JSON directly
+                    model_data[metric] = item.get(metric, "Unknown")
+                elif metric in item:
+                    if metric == "structured_output_score" and item[metric] == "N/A":
+                        model_data[metric] = "N/A"
+                    elif isinstance(item[metric], str) and item[metric] != "N/A":
+                        try:
+                            model_data[metric] = float(item[metric])
+                        except:
+                            model_data[metric] = item[metric]  # Keep as string if can't convert
+                    else:
+                        model_data[metric] = item[metric]
+                else:
+                    model_data[metric] = "N/A"
+            except Exception as e:
+                if metric in ["dtype", "licence"]:
+                    model_data[metric] = item.get(metric, "Unknown")
+                else:
+                    model_data[metric] = item.get(metric, "N/A")
+
+        formatted_data.append(model_data)
+
+    # Create DataFrame
+    df = pd.DataFrame(formatted_data)
+
+    # Remove the file column if present
+    if 'file' in df.columns:
+        df = df.drop(columns=['file'])
+
+    # Try to convert metrics to float with error handling (only numeric columns)
+    numeric_cols = ["structured_output_score", "semantic", "name",
+                    "document_note", "document_date", "from", "to"]
+    for col in numeric_cols:
+        if col in df.columns:
+            try:
+                # Convert column to float but keep "N/A" as is
+                df[col] = df[col].apply(lambda x: float(x) if isinstance(x, (int, float)) or (isinstance(x, str) and x != "N/A") else x)
+            except Exception as e:
+                pass  # Keep original values if conversion fails
+
+    # Sort by structured_output_score if available
+    if "structured_output_score" in df.columns:
+        # For sorting, replace non-numeric values with NaN temporarily
+        sort_col = pd.to_numeric(df["structured_output_score"], errors="coerce")
+        # Sort with NaN at the end
+        df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
+
+    decimal_places = 4 if is_detail else 2
+    for column in df.columns:
+        try:
+            if pd.api.types.is_float_dtype(df[column]):
+                df[column] = df[column].round(decimal_places)
+        except:
+            continue
+
+    # Format column names according to user request
+    column_mapping = {
+        "model": "Model",
+        "structured_output_score": "Structured Output Score",
+        "semantic": "Semantic",
+        "response_format": "Response Format",
+        "name": "Name",
+        "document_note": "Document Note",
+        "document_date": "Document Date",
+        "from": "From",
+        "to": "To",
+        "dtype": "Dtype",
+        "licence": "Licence"
+    }
+
+    # Rename DataFrame columns
+    df = df.rename(columns=column_mapping)
+
+    # Define desired column order for Structured Outputs - metadata columns at the end
+    desired_cols = [
+        "Model",
+        "Structured Output Score",
+        "Semantic",
+        "Response Format",
+        "Name",
+        "Document Note",
+        "Document Date",
+        "From",
+        "To",
+        "Dtype",
+        "Licence"
+    ]
+
+    # Filter out columns that don't exist in the DataFrame
+    final_cols = [col for col in desired_cols if col in df.columns]
+
+    # Add any remaining columns that weren't in the desired list
+    remaining_cols = [col for col in df.columns if col not in final_cols]
+    final_cols.extend(remaining_cols)
+
+    # Set the new column order
+    df = df[final_cols]
+
+    return df
+
 def create_benchmark_plots(benchmark_data, data_type="avg"):
     """
     Benchmark verilerinden grafikler oluşturur
@@ -619,7 +749,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
     """
     plots = {}
 
-    # Hybrid Benchmark için çubuk grafik
     if benchmark_data[data_type]["evalmix"]:
         df = create_evalmix_table(benchmark_data[data_type]["evalmix"])
         if not df.empty and all(col in df.columns for col in ["model_name", "lexical_metric", "semantic_metric"]):
@@ -628,7 +757,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
             if "judge_metric" in df.columns:
                 metrics.append("judge_metric")
 
-            # Veriyi uzun formata dönüştür
             plot_df = pd.melt(
                 df,
                 id_vars=["model_name"],
@@ -637,7 +765,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
                 value_name="Değer"
             )
 
-            # Metrik isimlerini daha okunabilir hale getir
             plot_df["Metrik"] = plot_df["Metrik"].replace({
                 "lexical_metric": "Lexical Metric",
                 "semantic_metric": "Semantic Metric",
@@ -655,11 +782,9 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
             )
             plots["evalmix"] = fig
 
-    # Light Eval için radar grafik
     if benchmark_data[data_type]["light_eval"]:
         df = create_light_eval_table(benchmark_data[data_type]["light_eval"])
         if not df.empty:
-            # Ortalama ve total_samples sütunlarını hariç tut
             metric_cols = [col for col in df.columns if col not in ["model_name", "Ortalama", "file", "overall_average", "total_samples"]]
             if metric_cols:
                 fig = go.Figure()
@@ -691,7 +816,7 @@ def create_combined_leaderboard_table(benchmark_data):
     Creates a combined leaderboard table from avg JSON data
     """
     # Define benchmark types to include in the leaderboard
-    benchmark_types = ["evalmix", "light_eval", "retrieval", "arena"]  # "lm_harness" and "human_arena" removed
+    benchmark_types = ["evalmix", "light_eval", "retrieval", "arena", "structured_output"]  # "lm_harness" and "human_arena" removed
 
     all_models = {}
 
@@ -788,6 +913,11 @@ def create_combined_leaderboard_table(benchmark_data):
                    # Human Elo Score removed from leaderboard table (still available in Human Arena tab)
                    # Remove dtype and license from JSON - use only lookup table values
                    pass
+                elif benchmark_type == "structured_output":
+                    if "structured_output_score" in item:
+                        # Keep higher precision for Structured Outputs to align with detail view
+                        all_models[formatted_model_name]["Structured Outputs"] = round(item.get("structured_output_score", 0), 4)
+                    # Remove dtype and license from JSON - use only lookup table values
 
     # Create DataFrame from the collected data
     if all_models:
@@ -821,6 +951,7 @@ def create_combined_leaderboard_table(benchmark_data):
        display_cols = [
            "Auto Elo Score",
            "Retrieval",
+           "Structured Outputs",
            "Light Eval",
            "Turkish Semantic",
            "Multilingual Semantic",
@@ -835,7 +966,7 @@ def create_combined_leaderboard_table(benchmark_data):
            df[col] = df[col].fillna(0)
 
        # Explicitly reorder columns to match the UI display order exactly as in the screenshot
-       desired_order = ["Model Name", "Auto Elo Score", "Retrieval", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"]
+       desired_order = ["Model Name", "Auto Elo Score", "Retrieval", "Structured Outputs", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"]
 
        # Filter out columns that don't exist in the DataFrame
        actual_order = [col for col in desired_order if col in df.columns]
@@ -848,11 +979,15 @@ def create_combined_leaderboard_table(benchmark_data):
        if "Auto Elo Score" in df.columns:
            df = df.sort_values(by="Auto Elo Score", ascending=False)
 
-
+
+       four_decimal_columns = {"Structured Outputs"}
       for column in df.columns:
           try:
               if pd.api.types.is_float_dtype(df[column]):
-
+                   if column in four_decimal_columns:
+                       df[column] = df[column].round(4)
+                   else:
+                       df[column] = df[column].round(2)
           except:
               continue
 
@@ -950,7 +1085,6 @@ def create_raw_details_table(benchmark_data, benchmark_type):
     cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
     df = df[cols]
 
-    # Float değerleri 2 ondalık basamağa yuvarla
     for column in df.columns:
         try:
             if pd.api.types.is_float_dtype(df[column]):
@@ -1051,6 +1185,22 @@ def create_raw_details_table(benchmark_data, benchmark_type):
            "license": "License"
        }
        column_mapping.update(custom_columns)
+
+    elif benchmark_type == "structured_output":
+        # Structured Output benchmark column mappings
+        custom_columns = {
+            "structured_output_score": "Structured Output Score",
+            "semantic": "Semantic",
+            "response_format": "Response Format",
+            "name": "Name",
+            "document_note": "Document Note",
+            "document_date": "Document Date",
+            "from": "From",
+            "to": "To",
+            "dtype": "Dtype",
+            "license": "License"
+        }
+        column_mapping.update(custom_columns)
 
 
 
@@ -1161,6 +1311,36 @@ def create_raw_details_table(benchmark_data, benchmark_type):
 
    # elif benchmark_type == "lm_harness" and "Overall" in df.columns:
    #     df = df.sort_values(by="Overall", ascending=False)
+    elif benchmark_type == "structured_output":
+        # Sort by Structured Output Score if available
+        if "Structured Output Score" in df.columns:
+            df = df.sort_values(by="Structured Output Score", ascending=False)
+
+        # Define desired column order for Structured Output - metadata columns at the end
+        desired_cols = [
+            "Model Name",
+            "Structured Output Score",
+            "Semantic",
+            "Response Format",
+            "Name",
+            "Document Note",
+            "Document Date",
+            "From",
+            "To",
+            "Dtype",
+            "License"
+        ]
+
+        # Filter out columns that don't exist in the DataFrame
+        final_cols = [col for col in desired_cols if col in df.columns]
+
+        # Add any remaining columns that weren't in the desired list
+        remaining_cols = [col for col in df.columns if col not in final_cols]
+        final_cols.extend(remaining_cols)
+
+        # Set the new column order
+        df = df[final_cols]
+
    elif benchmark_type == "light_eval" and "Overall" in df.columns:
        df = df.sort_values(by="Overall", ascending=False)
    elif benchmark_type == "snake":
@@ -1250,7 +1430,6 @@ def _flatten_dict(d, target_dict, prefix=""):
            target_dict[new_key] = str(value)
        else:
            # Add other values directly
-            # Float değerleri yuvarla
            if isinstance(value, float):
                target_dict[new_key] = round(value, 2)
            else:
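With the result files and the new helper in place, the Structured Outputs data path can be exercised end to end. A brief usage sketch, assuming the repository root is on sys.path so that src.utils imports the same way app.py imports it:

```python
from src.utils import create_structured_outputs_table, load_benchmark_results

# Reads the result/ tree, including result/structured_output/*.json from this commit.
results = load_benchmark_results()

# Build the detail table exactly as the new "🔧 Structured Outputs" tab does.
detail_df = create_structured_outputs_table(results["raw"]["structured_output"], is_detail=True)
print(detail_df[["Model", "Structured Output Score", "Semantic", "Response Format"]].head())
```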