nmmursit committed on
Commit 7dea7c1 · 1 Parent(s): 2f272c5

feat: Add structured output support and refactor comments


- Integrate structured output benchmark functionality
- Clean up Turkish comments and improve code quality

app.py CHANGED
@@ -43,10 +43,10 @@ from src.utils import (
43
  create_light_eval_table,
44
  create_raw_details_table,
45
  create_human_arena_table,
 
46
  update_supported_base_models
47
  )
48
 
49
- # Import the pipelines utils functions
50
  from pipelines.utils.common import search_and_filter
51
  from pipelines.unified_benchmark import submit_unified_benchmark
52
 
@@ -72,7 +72,6 @@ def format_dataframe(df, is_light_eval_detail=False):
72
  if df.empty:
73
  return df
74
 
75
- # Remove the 'file' column
76
  if 'file' in df.columns:
77
  df = df.drop(columns=['file'])
78
 
@@ -83,16 +82,24 @@ def format_dataframe(df, is_light_eval_detail=False):
83
  if col in df.columns:
84
  df = df.drop(columns=[col])
85
 
86
- # Round float values - 4 decimal places for light eval detail, 2 for the rest
87
- decimal_places = 4 if is_light_eval_detail else 2
88
  for column in df.columns:
89
  try:
90
  if pd.api.types.is_float_dtype(df[column]):
91
- df[column] = df[column].round(decimal_places)
92
  except:
93
  continue
94
 
95
- # Format column names properly
96
  column_mapping = {}
97
  for col in df.columns:
98
  # Skip run_id and user_id fields
@@ -162,15 +169,12 @@ def create_demo():
162
  gr.Markdown(TITLE)
163
  gr.Markdown(INTRODUCTION_TEXT)
164
 
165
- # Hidden session state to track login expiration
166
  session_expiry = gr.State(None)
167
 
168
  try:
169
- # Load benchmark results
170
  benchmark_results = load_benchmark_results()
171
  default_plots = create_benchmark_plots(benchmark_results, "avg")
172
 
173
- # State variable to track login state across page refreshes
174
  login_state = gr.State(value=False)
175
 
176
  with gr.Tabs() as tabs:
@@ -178,8 +182,6 @@ def create_demo():
178
  gr.Markdown("## Model Evaluation Results")
179
  gr.Markdown("This screen shows model performance across different evaluation categories.")
180
 
181
- # Remove the separate refresh button row
182
- # Instead, combine search and refresh in one row
183
  with gr.Row():
184
  search_input = gr.Textbox(
185
  label="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
@@ -192,10 +194,8 @@ def create_demo():
192
  # # Status display for refresh results
193
  # refresh_status = gr.Markdown("", visible=False)
194
 
195
- # Show the benchmark tabs as a tab group with symbols
196
  with gr.Tabs() as benchmark_tabs:
197
  with gr.TabItem("👥 Human Arena"):
198
- # Human Arena results - use the detail files
199
  human_arena_data = benchmark_results["raw"]["human_arena"]
200
 
201
  # Store human arena data in a state component for filtering
@@ -220,7 +220,6 @@ def create_demo():
220
  if filtered_df.empty:
221
  filtered_df = pd.DataFrame({"Model Name": ["No data available"]})
222
 
223
- # Return updated buttons with new variants
224
  if category == "general":
225
  return (
226
  filtered_df,
@@ -236,7 +235,6 @@ def create_demo():
236
  gr.Button("Reasoning", variant="primary", elem_id="human_arena_reasoning_btn", elem_classes=["active-btn"])
237
  )
238
 
239
- # Initial table load
240
  if human_arena_data:
241
  human_arena_df = create_human_arena_table(human_arena_data, category="general")
242
  else:
@@ -257,7 +255,6 @@ def create_demo():
257
  column_widths=["300px", "150px", "110px", "110px", "110px", "156px", "169px", "100px", "120px"]
258
  )
259
 
260
- # Button click handlers
261
  general_btn.click(
262
  fn=lambda data: filter_human_arena_table("general", data),
263
  inputs=[human_arena_state],
@@ -295,7 +292,6 @@ def create_demo():
295
  )
296
 
297
  with gr.TabItem("🏟️ Auto Arena"):
298
- # Arena results - use the detail files
299
  arena_details_df = create_raw_details_table(benchmark_results, "arena")
300
  arena_details_df = format_dataframe(arena_details_df)
301
 
@@ -311,7 +307,6 @@ def create_demo():
311
  )
312
 
313
  with gr.TabItem("📚 Retrieval"):
314
- # RAG Judge sonuçları - detail dosyalarını kullan
315
  rag_details_df = create_raw_details_table(benchmark_results, "retrieval")
316
  rag_details_df = format_dataframe(rag_details_df)
317
 
@@ -326,8 +321,21 @@ def create_demo():
326
 
327
  )
328
 
329
  with gr.TabItem("⚡ Light Eval"):
330
- # Light Eval results - use the detail files
331
  light_details_data = benchmark_results["raw"]["light_eval"]
332
  if light_details_data:
333
  light_details_df = create_light_eval_table(light_details_data, is_detail=True)
@@ -348,7 +356,6 @@ def create_demo():
348
  )
349
 
350
  with gr.TabItem("📋 EvalMix"):
351
- # Hybrid Benchmark results - use the detail files
352
  hybrid_details_df = create_raw_details_table(benchmark_results, "evalmix")
353
  hybrid_details_df = format_dataframe(hybrid_details_df)
354
 
@@ -364,7 +371,6 @@ def create_demo():
364
  )
365
 
366
  with gr.TabItem("🐍 𝐒𝐧𝐚𝐤𝐞 𝐁𝐞𝐧𝐜𝐡"):
367
- # Snake Benchmark results - use the detail files
368
  snake_details_df = create_raw_details_table(benchmark_results, "snake")
369
  snake_details_df = format_dataframe(snake_details_df)
370
 
@@ -410,32 +416,29 @@ def create_demo():
410
  # ]
411
  # )
412
 
413
- # Shared search function for all tabs
414
  def search_all_tabs(query, original_data):
415
  """
416
- Searches across all tabs
417
  """
418
  if not query or query.strip() == "":
419
- # Empty search - return the original data
420
  return (original_data, arena_details_df, human_arena_df,
421
- rag_details_df, light_details_df, hybrid_details_df, snake_details_df)
422
 
423
- # Query present - filter all tabs
424
  return (
425
  search_and_filter(query, original_data, "All"),
426
  search_and_filter(query, arena_details_df, "All"),
427
  search_and_filter(query, human_arena_df, "All"),
428
  search_and_filter(query, rag_details_df, "All"),
 
429
  search_and_filter(query, light_details_df, "All"),
430
  search_and_filter(query, hybrid_details_df, "All"),
431
  search_and_filter(query, snake_details_df, "All")
432
  )
433
 
434
- # Search handler - update all tabs
435
  search_input.change(
436
  search_all_tabs,
437
  inputs=[search_input, original_leaderboard_data],
438
- outputs=[combined_table, arena_table, human_arena_table, rag_table, light_table, hybrid_table, snake_table]
439
  )
440
 
441
  with gr.TabItem("ℹ️ About", elem_id="about-tab"):
@@ -692,7 +695,6 @@ def create_demo():
692
  logging.warning(f"Error checking model type: {str(e)}")
693
 
694
  # Call the benchmark function with profile information
695
- # base_model validation was removed, but the parameter is still passed along
696
  result_message, _ = submit_unified_benchmark(model, base_model, reasoning, email, profile)
697
  logging.info(f"Submission processed for model: {model}")
698
  return result_message
 
43
  create_light_eval_table,
44
  create_raw_details_table,
45
  create_human_arena_table,
46
+ create_structured_outputs_table,
47
  update_supported_base_models
48
  )
49
 
 
50
  from pipelines.utils.common import search_and_filter
51
  from pipelines.unified_benchmark import submit_unified_benchmark
52
 
 
72
  if df.empty:
73
  return df
74
 
 
75
  if 'file' in df.columns:
76
  df = df.drop(columns=['file'])
77
 
 
82
  if col in df.columns:
83
  df = df.drop(columns=[col])
84
 
85
+ # Round float values
86
+ # Default: 2 decimal places; 4 if this is a light eval detail table or the structured_output_score column is present.
87
+ # Special case for the leaderboard: keep the "Structured Outputs" and "Retrieval" columns at 4 decimal places.
88
+ if is_light_eval_detail or "structured_output_score" in df.columns:
89
+ default_decimal_places = 4
90
+ else:
91
+ default_decimal_places = 2
92
+ four_decimal_cols = {"Structured Outputs"}
93
  for column in df.columns:
94
  try:
95
  if pd.api.types.is_float_dtype(df[column]):
96
+ if column in four_decimal_cols:
97
+ df[column] = df[column].round(4)
98
+ else:
99
+ df[column] = df[column].round(default_decimal_places)
100
  except:
101
  continue
102
 
 
103
  column_mapping = {}
104
  for col in df.columns:
105
  # Skip run_id and user_id fields
 
169
  gr.Markdown(TITLE)
170
  gr.Markdown(INTRODUCTION_TEXT)
171
 
 
172
  session_expiry = gr.State(None)
173
 
174
  try:
 
175
  benchmark_results = load_benchmark_results()
176
  default_plots = create_benchmark_plots(benchmark_results, "avg")
177
 
 
178
  login_state = gr.State(value=False)
179
 
180
  with gr.Tabs() as tabs:
 
182
  gr.Markdown("## Model Evaluation Results")
183
  gr.Markdown("This screen shows model performance across different evaluation categories.")
184
 
 
 
185
  with gr.Row():
186
  search_input = gr.Textbox(
187
  label="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
 
194
  # # Status display for refresh results
195
  # refresh_status = gr.Markdown("", visible=False)
196
 
 
197
  with gr.Tabs() as benchmark_tabs:
198
  with gr.TabItem("👥 Human Arena"):
 
199
  human_arena_data = benchmark_results["raw"]["human_arena"]
200
 
201
  # Store human arena data in a state component for filtering
 
220
  if filtered_df.empty:
221
  filtered_df = pd.DataFrame({"Model Name": ["No data available"]})
222
 
 
223
  if category == "general":
224
  return (
225
  filtered_df,
 
235
  gr.Button("Reasoning", variant="primary", elem_id="human_arena_reasoning_btn", elem_classes=["active-btn"])
236
  )
237
 
 
238
  if human_arena_data:
239
  human_arena_df = create_human_arena_table(human_arena_data, category="general")
240
  else:
 
255
  column_widths=["300px", "150px", "110px", "110px", "110px", "156px", "169px", "100px", "120px"]
256
  )
257
 
 
258
  general_btn.click(
259
  fn=lambda data: filter_human_arena_table("general", data),
260
  inputs=[human_arena_state],
 
292
  )
293
 
294
  with gr.TabItem("🏟️ Auto Arena"):
 
295
  arena_details_df = create_raw_details_table(benchmark_results, "arena")
296
  arena_details_df = format_dataframe(arena_details_df)
297
 
 
307
  )
308
 
309
  with gr.TabItem("📚 Retrieval"):
 
310
  rag_details_df = create_raw_details_table(benchmark_results, "retrieval")
311
  rag_details_df = format_dataframe(rag_details_df)
312
 
 
321
 
322
  )
323
 
324
+ with gr.TabItem("🔧 Structured Outputs"):
325
+ structured_details_df = create_structured_outputs_table(benchmark_results["raw"]["structured_output"], is_detail=True)
326
+
327
+ if structured_details_df.empty:
328
+ structured_details_df = pd.DataFrame({"Model": ["No data available"]})
329
+
330
+ structured_table = gr.DataFrame(
331
+ value=structured_details_df,
332
+ label="Structured Outputs Detailed Results",
333
+ interactive=False,
334
+ column_widths=["300px", "250px", "110px", "150px", "100px", "150px", "150px", "100px", "100px", "100px", "120px"]
335
+
336
+ )
337
+
338
  with gr.TabItem("⚡ Light Eval"):
 
339
  light_details_data = benchmark_results["raw"]["light_eval"]
340
  if light_details_data:
341
  light_details_df = create_light_eval_table(light_details_data, is_detail=True)
 
356
  )
357
 
358
  with gr.TabItem("📋 EvalMix"):
 
359
  hybrid_details_df = create_raw_details_table(benchmark_results, "evalmix")
360
  hybrid_details_df = format_dataframe(hybrid_details_df)
361
 
 
371
  )
372
 
373
  with gr.TabItem("🐍 𝐒𝐧𝐚𝐤𝐞 𝐁𝐞𝐧𝐜𝐡"):
 
374
  snake_details_df = create_raw_details_table(benchmark_results, "snake")
375
  snake_details_df = format_dataframe(snake_details_df)
376
 
 
416
  # ]
417
  # )
418
 
 
419
  def search_all_tabs(query, original_data):
420
  """
421
+ Search across all tabs
422
  """
423
  if not query or query.strip() == "":
 
424
  return (original_data, arena_details_df, human_arena_df,
425
+ rag_details_df, structured_details_df, light_details_df, hybrid_details_df, snake_details_df)
426
 
 
427
  return (
428
  search_and_filter(query, original_data, "All"),
429
  search_and_filter(query, arena_details_df, "All"),
430
  search_and_filter(query, human_arena_df, "All"),
431
  search_and_filter(query, rag_details_df, "All"),
432
+ search_and_filter(query, structured_details_df, "All"),
433
  search_and_filter(query, light_details_df, "All"),
434
  search_and_filter(query, hybrid_details_df, "All"),
435
  search_and_filter(query, snake_details_df, "All")
436
  )
437
 
 
438
  search_input.change(
439
  search_all_tabs,
440
  inputs=[search_input, original_leaderboard_data],
441
+ outputs=[combined_table, arena_table, human_arena_table, rag_table, structured_table, light_table, hybrid_table, snake_table]
442
  )
443
 
444
  with gr.TabItem("ℹ️ About", elem_id="about-tab"):
 
695
  logging.warning(f"Error checking model type: {str(e)}")
696
 
697
  # Call the benchmark function with profile information
 
698
  result_message, _ = submit_unified_benchmark(model, base_model, reasoning, email, profile)
699
  logging.info(f"Submission processed for model: {model}")
700
  return result_message
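To make the new rounding behaviour concrete, here is a small self-contained sketch. It mirrors the policy added to `format_dataframe` above but is not the app's actual function; the helper name and the toy values are invented for illustration.

```python
import pandas as pd

# Illustrative only - mirrors the rounding policy added to format_dataframe above.
def round_like_format_dataframe(df: pd.DataFrame, is_light_eval_detail: bool = False) -> pd.DataFrame:
    default_places = 4 if is_light_eval_detail or "structured_output_score" in df.columns else 2
    four_decimal_cols = {"Structured Outputs"}
    for column in df.columns:
        if pd.api.types.is_float_dtype(df[column]):
            places = 4 if column in four_decimal_cols else default_places
            df[column] = df[column].round(places)
    return df

# Toy values, chosen only to show the two precisions side by side
toy = pd.DataFrame({"Model Name": ["model-a"],
                    "Structured Outputs": [0.76348],
                    "Light Eval": [0.51237]})
print(round_like_format_dataframe(toy))
# -> "Structured Outputs" keeps 4 decimals (0.7635), "Light Eval" is rounded to 2 (0.51)
```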
result/structured_output/avg_str001.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "meta-llama/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7635,
4
+ "run_id": "str001"
5
+ }
result/structured_output/avg_str002.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "grok-3",
3
+ "structured_output_score": 0.7628,
4
+ "run_id": "str002"
5
+ }
result/structured_output/avg_str003.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "newmindai/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7622,
4
+ "run_id": "str003"
5
+ }
result/structured_output/avg_str004.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "deepseek-ai/DeepSeek-R1",
3
+ "structured_output_score": 0.76,
4
+ "run_id": "str004"
5
+ }
result/structured_output/avg_str005.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "google/gemma-3-27b-it",
3
+ "structured_output_score": 0.7478,
4
+ "run_id": "str005"
5
+ }
result/structured_output/avg_str006.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "grok-3-mini-fast-beta",
3
+ "structured_output_score": 0.7471,
4
+ "run_id": "str006"
5
+ }
result/structured_output/avg_str007.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
3
+ "structured_output_score": 0.7424,
4
+ "run_id": "str007"
5
+ }
result/structured_output/avg_str008.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-32B",
3
+ "structured_output_score": 0.735,
4
+ "run_id": "str008"
5
+ }
result/structured_output/avg_str009.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
3
+ "structured_output_score": 0.7309,
4
+ "run_id": "str009"
5
+ }
result/structured_output/avg_str010.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "newmindai/QwQ-32B-r1",
3
+ "structured_output_score": 0.7252,
4
+ "run_id": "str010"
5
+ }
result/structured_output/avg_str011.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/QwQ-32B",
3
+ "structured_output_score": 0.7205,
4
+ "run_id": "str011"
5
+ }
result/structured_output/avg_str012.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "microsoft/phi-4",
3
+ "structured_output_score": 0.6906,
4
+ "run_id": "str012"
5
+ }
result/structured_output/avg_str013.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-14B",
3
+ "structured_output_score": 0.6153,
4
+ "run_id": "str013"
5
+ }
result/structured_output/avg_str014.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "model_name": "newmindai/Qwen2.5-72b-Instruct",
3
+ "structured_output_score": 0.761,
4
+ "run_id": "ec6bf42a-4482-4f8c-9fbd-2ab5f1eed6bb"
5
+ }
6
+
result/structured_output/detail_str001.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "meta-llama/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7635,
4
+ "semantic": 0.5271,
5
+ "response_format": "506/506",
6
+ "name": 0.6364,
7
+ "document_note": 0.2194,
8
+ "document_date": 0.6561,
9
+ "from": 0.6319,
10
+ "to": 0.4919,
11
+ "dtype": "bfloat16",
12
+ "licence": "Llama-3.3",
13
+ "run_id": "str001"
14
+ }
result/structured_output/detail_str002.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "grok-3",
3
+ "structured_output_score": 0.7628,
4
+ "semantic": 0.5256,
5
+ "response_format": "506/506",
6
+ "name": 0.6344,
7
+ "document_note": 0.166,
8
+ "document_date": 0.6482,
9
+ "from": 0.6493,
10
+ "to": 0.5299,
11
+ "dtype": "Unknown",
12
+ "licence": "Proprietary",
13
+ "run_id": "str002"
14
+ }
result/structured_output/detail_str003.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "newmindai/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7622,
4
+ "semantic": 0.5245,
5
+ "response_format": "506/506",
6
+ "name": 0.6423,
7
+ "document_note": 0.2016,
8
+ "document_date": 0.6561,
9
+ "from": 0.6259,
10
+ "to": 0.4966,
11
+ "dtype": "bfloat16",
12
+ "licence": "Llama-3.3",
13
+ "run_id": "str003"
14
+ }
result/structured_output/detail_str004.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "deepseek-ai/DeepSeek-R1",
3
+ "structured_output_score": 0.76,
4
+ "semantic": 0.5199,
5
+ "response_format": "506/506",
6
+ "name": 0.6601,
7
+ "document_note": 0.1917,
8
+ "document_date": 0.6542,
9
+ "from": 0.6223,
10
+ "to": 0.4713,
11
+ "dtype": "bfloat16",
12
+ "licence": "MIT",
13
+ "run_id": "str004"
14
+ }
result/structured_output/detail_str005.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "google/gemma-3-27b-it",
3
+ "structured_output_score": 0.7478,
4
+ "semantic": 0.4955,
5
+ "response_format": "506/506",
6
+ "name": 0.5909,
7
+ "document_note": 0.2055,
8
+ "document_date": 0.6502,
9
+ "from": 0.6044,
10
+ "to": 0.4264,
11
+ "dtype": "bfloat16",
12
+ "licence": "Gemma",
13
+ "run_id": "str005"
14
+ }
result/structured_output/detail_str006.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "grok-3-mini-fast-beta",
3
+ "structured_output_score": 0.7471,
4
+ "semantic": 0.4943,
5
+ "response_format": "506/506",
6
+ "name": 0.6403,
7
+ "document_note": 0.1957,
8
+ "document_date": 0.6324,
9
+ "from": 0.567,
10
+ "to": 0.4363,
11
+ "dtype": "Unknown",
12
+ "licence": "Proprietary",
13
+ "run_id": "str006"
14
+ }
result/structured_output/detail_str007.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
3
+ "structured_output_score": 0.7424,
4
+ "semantic": 0.4847,
5
+ "response_format": "506/506",
6
+ "name": 0.581,
7
+ "document_note": 0.2134,
8
+ "document_date": 0.6561,
9
+ "from": 0.5248,
10
+ "to": 0.4482,
11
+ "dtype": "bfloat16",
12
+ "licence": "Llama 3.1",
13
+ "run_id": "str007"
14
+ }
result/structured_output/detail_str008.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-32B",
3
+ "structured_output_score": 0.735,
4
+ "semantic": 0.482,
5
+ "response_format": "500/506",
6
+ "name": 0.566,
7
+ "document_note": 0.21,
8
+ "document_date": 0.636,
9
+ "from": 0.5614,
10
+ "to": 0.4367,
11
+ "dtype": "bfloat16",
12
+ "licence": "Qwen",
13
+ "run_id": "str008"
14
+ }
result/structured_output/detail_str009.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
3
+ "structured_output_score": 0.7309,
4
+ "semantic": 0.4618,
5
+ "response_format": "506/506",
6
+ "name": 0.502,
7
+ "document_note": 0.1957,
8
+ "document_date": 0.6383,
9
+ "from": 0.5927,
10
+ "to": 0.3801,
11
+ "dtype": "bfloat16",
12
+ "licence": "Qwen",
13
+ "run_id": "str009"
14
+ }
result/structured_output/detail_str010.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "newmindai/QwQ-32B-r1",
3
+ "structured_output_score": 0.7252,
4
+ "semantic": 0.4564,
5
+ "response_format": "503/506",
6
+ "name": 0.507,
7
+ "document_note": 0.1272,
8
+ "document_date": 0.6243,
9
+ "from": 0.5816,
10
+ "to": 0.4419,
11
+ "dtype": "bfloat16",
12
+ "licence": "Apache 2.0",
13
+ "run_id": "str010"
14
+ }
result/structured_output/detail_str011.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/QwQ-32B",
3
+ "structured_output_score": 0.7205,
4
+ "semantic": 0.4468,
5
+ "response_format": "503/506",
6
+ "name": 0.4791,
7
+ "document_note": 0.1352,
8
+ "document_date": 0.6243,
9
+ "from": 0.573,
10
+ "to": 0.4224,
11
+ "dtype": "bfloat16",
12
+ "licence": "Apache 2.0",
13
+ "run_id": "str011"
14
+ }
result/structured_output/detail_str012.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "microsoft/phi-4",
3
+ "structured_output_score": 0.6906,
4
+ "semantic": 0.3912,
5
+ "response_format": "503/506",
6
+ "name": 0.3752,
7
+ "document_note": 0.2275,
8
+ "document_date": 0.5768,
9
+ "from": 0.4542,
10
+ "to": 0.3222,
11
+ "dtype": "bfloat16",
12
+ "licence": "MIT",
13
+ "run_id": "str012"
14
+ }
result/structured_output/detail_str013.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-14B",
3
+ "structured_output_score": 0.6153,
4
+ "semantic": 0.2426,
5
+ "response_format": "501/506",
6
+ "name": 0.31,
7
+ "document_note": 0.156,
8
+ "document_date": 0.538,
9
+ "from": 0.1095,
10
+ "to": 0.0998,
11
+ "dtype": "bfloat16",
12
+ "licence": "Apache 2.0",
13
+ "run_id": "str013"
14
+ }
result/structured_output/detail_str014.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "model_name": "newmindai/Qwen2.5-72b-Instruct",
3
+ "structured_output_score": 0.761,
4
+ "semantic": 0.5219,
5
+ "response_format": "506/506",
6
+ "name": 0.5632,
7
+ "document_note": 0.2905,
8
+ "document_date": 0.6403,
9
+ "from": 0.6136,
10
+ "to": 0.5018,
11
+ "dtype": "bfloat16",
12
+ "licence": "Qwen",
13
+ "run_id": "ec6bf42a-4482-4f8c-9fbd-2ab5f1eed6bb",
14
+ "ISL": 1712575,
15
+ "OSL": 183946,
16
+ "cost": null,
17
+ "e2e_benchmark_time": 114.4683,
18
+ "model_generation_time": 104.2865,
19
+ "scoring_duration_minutes": 10.1755,
20
+ "provider": "nebius",
21
+ "sample_count": 506,
22
+ "success_response": 506
23
+ }
24
+
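The files above pair one `avg_*.json` summary with one `detail_*.json` record per run. A rough sketch of how such records could be collected for `create_structured_outputs_table` follows; the helper name and glob pattern are assumptions for illustration, since the actual `load_benchmark_results` walk of the `result/` tree is not shown in this diff.

```python
import glob
import json

def load_structured_output_records(kind: str = "detail") -> list:
    """Collect structured_output result records of one kind ("avg" or "detail")."""
    records = []
    # Assumed layout, taken from the files added above:
    #   result/structured_output/{avg,detail}_<run>.json
    for path in sorted(glob.glob(f"result/structured_output/{kind}_*.json")):
        with open(path, encoding="utf-8") as fh:
            records.append(json.load(fh))
    return records

# Detail records would feed the detail tab, avg records the leaderboard column, e.g.
# create_structured_outputs_table(load_structured_output_records("detail"), is_detail=True)
```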
src/display/about.py CHANGED
@@ -57,7 +57,7 @@ Evaluate your model's performance in the following categories:
57
 
58
  6. 🐍 **Snake Bench** - Specialized evaluation measuring step-by-step problem solving and complex reasoning abilities.
59
 
60
- 7. 🧩 **Structured Outputs** - Coming soon!
61
 
62
  Evaluate your model in any or all of these categories to discover its capabilities and areas of excellence.
63
 
@@ -244,6 +244,37 @@ Human evaluators consider multiple factors when comparing model responses:
244
 
245
  Human Arena provides a complementary perspective to automated benchmarks, capturing nuanced human preferences that traditional metrics might miss. This evaluation is particularly valuable for understanding how models perform in real-world conversational scenarios.
246
 
247
  """
248
 
249
  EVALUATION_QUEUE_TEXT = """
 
57
 
58
  6. 🐍 **Snake Bench** - Specialized evaluation measuring step-by-step problem solving and complex reasoning abilities.
59
 
60
+ 7. 🧩 **Structured Outputs** - Evaluation of models' ability to generate properly formatted, structured responses with accurate field extraction and semantic understanding.
61
 
62
  Evaluate your model in any or all of these categories to discover its capabilities and areas of excellence.
63
 
 
244
 
245
  Human Arena provides a complementary perspective to automated benchmarks, capturing nuanced human preferences that traditional metrics might miss. This evaluation is particularly valuable for understanding how models perform in real-world conversational scenarios.
246
 
247
+ ### 7. 🧩 Structured Outputs
248
+ Structured Outputs evaluation assesses models' ability to generate properly formatted, structured responses with accurate field extraction and semantic understanding. This benchmark tests how well language models can parse, understand, and extract specific information from documents while maintaining semantic coherence.
249
+
250
+ **Evaluation Methodology:**
251
+ Models are evaluated on their ability to extract structured information from Turkish legal documents. The evaluation uses advanced semantic similarity scoring with Turkish-specific embedding models for accurate assessment.
252
+
253
+ **Technical Implementation:**
254
+ - **Embedding Model**: Primary evaluation uses [`newmindai/TurkEmbed4Retrieval`](https://huggingface.co/newmindai/TurkEmbed4Retrieval) for Turkish-specific semantic understanding
255
+ - **Similarity Threshold**: 0.75 cosine similarity threshold for field matching
256
+ - **Ground Truth Comparison**: MongoDB-stored ground truth data with pre-computed embeddings
257
+
258
+ **Evaluation Metrics:**
259
+
260
+ - **Overall**: Combined overall performance metric that averages Semantic understanding and Response Format success ratio
261
+ - **Semantic**: Measures semantic understanding and coherence of extracted information using cosine similarity (corresponds to `overall_score` in scoring)
262
+ - **Response Format**: Success ratio showing successful JSON extractions vs total attempts (success_response/sample_count)
263
+ - **Name**: Accuracy in extracting and identifying name fields from legal documents (20% weight)
264
+ - **Document Date**: Accuracy in date field extraction with multiple format support (20% weight)
265
+ - **Document Note**: Performance in extracting document annotation information using semantic similarity (20% weight)
266
+ - **From**: Performance in extracting source/sender information as lists with semantic matching (20% weight)
267
+ - **To**: Accuracy in extracting destination/recipient information as lists with semantic matching (20% weight)
268
+
269
+ **Scoring Algorithm:**
270
+ The evaluation uses a sophisticated multi-level scoring system:
271
+
272
+ 1. **String Fields** (name, document_note): Turkish embedding similarity with 0.75 threshold using `newmindai/TurkEmbed4Retrieval`
273
+ 2. **Date Fields** (document_date): Exact date matching with multiple format parsing support
274
+ 3. **List Fields** (from, to): One-way similarity from ground truth to predictions using semantic matching
275
+ 4. **Overall Score Calculation**: `Overall = (Semantic + Response Format) / 2`
276
+ 5. **Field Weights**: Each extraction field (name, document_date, document_note, from, to) contributes equally with 20% weight to the semantic score
277
+
278
  """
279
 
280
  EVALUATION_QUEUE_TEXT = """
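The scoring description added to about.py above reduces to a small amount of arithmetic. The sketch below illustrates it under stated assumptions: the embedding comparison is stubbed with a character-level ratio (the real pipeline compares `newmindai/TurkEmbed4Retrieval` embeddings against MongoDB-stored ground truth, neither of which appears in this diff), and the date formats are placeholders.

```python
from datetime import datetime
from difflib import SequenceMatcher

SIM_THRESHOLD = 0.75  # similarity threshold quoted in the description above

def similarity(a: str, b: str) -> float:
    # Stand-in for the Turkish embedding model: the real evaluation compares
    # sentence embeddings; a character-level ratio keeps this sketch self-contained.
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def string_field_score(pred: str, truth: str) -> float:
    # name / document_note: similarity thresholded at 0.75
    return 1.0 if similarity(pred, truth) >= SIM_THRESHOLD else 0.0

def date_field_score(pred: str, truth: str, formats=("%d.%m.%Y", "%Y-%m-%d")) -> float:
    # document_date: exact match after trying a few (placeholder) date formats
    def parse(value):
        for fmt in formats:
            try:
                return datetime.strptime(value, fmt).date()
            except ValueError:
                continue
        return None
    parsed_pred, parsed_truth = parse(pred), parse(truth)
    return 1.0 if parsed_pred is not None and parsed_pred == parsed_truth else 0.0

def list_field_score(pred, truth):
    # from / to: one-way matching from each ground-truth entry to the predictions
    if not truth:
        return 1.0
    hits = sum(any(similarity(p, t) >= SIM_THRESHOLD for p in pred) for t in truth)
    return hits / len(truth)

def structured_output_score(field_scores, success_response, sample_count):
    # Five fields at 20% weight each -> a plain mean gives the semantic score;
    # the overall score averages it with the JSON-extraction success ratio.
    semantic = sum(field_scores.values()) / len(field_scores)
    response_format = success_response / sample_count
    return (semantic + response_format) / 2
```

Plugging in the per-field averages from detail_str001.json reproduces the published numbers: the five field scores average to a semantic score of about 0.5271, and with a 506/506 format-success ratio the overall score is (0.5271 + 1.0) / 2 ≈ 0.7635.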
src/utils.py CHANGED
@@ -118,7 +118,6 @@ def filter_models(
118
  return filtered_df
119
 
120
 
121
- # New functions
122
  def load_benchmark_results():
123
  """
124
  Load benchmark results from local files
@@ -130,7 +129,8 @@ def load_benchmark_results():
130
  "snake": [],
131
  "retrieval": [],
132
  "arena": [],
133
- "human_arena": []
 
134
  },
135
  "raw": {
136
  "evalmix": [],
@@ -138,12 +138,13 @@ def load_benchmark_results():
138
  "snake": [],
139
  "retrieval": [],
140
  "arena": [],
141
- "human_arena": []
 
142
  }
143
  }
144
 
145
  # Define benchmark types to look for
146
- benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena"] # "lm_harness" removed
147
 
148
  # Initialize RAG Score calculator for runtime calculation
149
  rag_calculator = None
@@ -387,7 +388,6 @@ def create_evalmix_table(data):
387
  else:
388
  df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2)
389
 
390
- # Round float values to 2 decimal places
391
  for column in df.columns:
392
  try:
393
  if pd.api.types.is_float_dtype(df[column]):
@@ -485,7 +485,6 @@ def create_light_eval_table(data, is_detail=False):
485
  if not data:
486
  return pd.DataFrame()
487
 
488
- # Light eval results come in a different format and need reshaping
489
  formatted_data = []
490
  for item in data:
491
  model_data = {"model_name": format_model_name(item.get("model_name", "Bilinmeyen Model"))}
@@ -557,7 +556,6 @@ def create_light_eval_table(data, is_detail=False):
557
  # Sort with NaN at the end
558
  df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
559
 
560
- # Round float values - 4 decimal places for detail, 2 for avg
561
  decimal_places = 4 if is_detail else 2
562
  for column in df.columns:
563
  try:
@@ -609,6 +607,138 @@ def create_light_eval_table(data, is_detail=False):
609
 
610
  return df
611
 
612
  def create_benchmark_plots(benchmark_data, data_type="avg"):
613
  """
614
  Creates plots from the benchmark data
@@ -619,7 +749,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
619
  """
620
  plots = {}
621
 
622
- # Bar chart for the Hybrid Benchmark
623
  if benchmark_data[data_type]["evalmix"]:
624
  df = create_evalmix_table(benchmark_data[data_type]["evalmix"])
625
  if not df.empty and all(col in df.columns for col in ["model_name", "lexical_metric", "semantic_metric"]):
@@ -628,7 +757,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
628
  if "judge_metric" in df.columns:
629
  metrics.append("judge_metric")
630
 
631
- # Convert the data to long format
632
  plot_df = pd.melt(
633
  df,
634
  id_vars=["model_name"],
@@ -637,7 +765,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
637
  value_name="Değer"
638
  )
639
 
640
- # Make the metric names more readable
641
  plot_df["Metrik"] = plot_df["Metrik"].replace({
642
  "lexical_metric": "Lexical Metric",
643
  "semantic_metric": "Semantic Metric",
@@ -655,11 +782,9 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
655
  )
656
  plots["evalmix"] = fig
657
 
658
- # Radar chart for Light Eval
659
  if benchmark_data[data_type]["light_eval"]:
660
  df = create_light_eval_table(benchmark_data[data_type]["light_eval"])
661
  if not df.empty:
662
- # Exclude the Ortalama and total_samples columns
663
  metric_cols = [col for col in df.columns if col not in ["model_name", "Ortalama", "file", "overall_average", "total_samples"]]
664
  if metric_cols:
665
  fig = go.Figure()
@@ -691,7 +816,7 @@ def create_combined_leaderboard_table(benchmark_data):
691
  Creates a combined leaderboard table from avg JSON data
692
  """
693
  # Define benchmark types to include in the leaderboard
694
- benchmark_types = ["evalmix", "light_eval", "retrieval", "arena"] # "lm_harness" and "human_arena" removed
695
 
696
  all_models = {}
697
 
@@ -788,6 +913,11 @@ def create_combined_leaderboard_table(benchmark_data):
788
  # Human Elo Score removed from leaderboard table (still available in Human Arena tab)
789
  # Remove dtype and license from JSON - use only lookup table values
790
  pass
791
 
792
  # Create DataFrame from the collected data
793
  if all_models:
@@ -821,6 +951,7 @@ def create_combined_leaderboard_table(benchmark_data):
821
  display_cols = [
822
  "Auto Elo Score",
823
  "Retrieval",
 
824
  "Light Eval",
825
  "Turkish Semantic",
826
  "Multilingual Semantic",
@@ -835,7 +966,7 @@ def create_combined_leaderboard_table(benchmark_data):
835
  df[col] = df[col].fillna(0)
836
 
837
  # Explicitly reorder columns to match the UI display order exactly as in the screenshot
838
- desired_order = ["Model Name", "Auto Elo Score", "Retrieval", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"]
839
 
840
  # Filter out columns that don't exist in the DataFrame
841
  actual_order = [col for col in desired_order if col in df.columns]
@@ -848,11 +979,15 @@ def create_combined_leaderboard_table(benchmark_data):
848
  if "Auto Elo Score" in df.columns:
849
  df = df.sort_values(by="Auto Elo Score", ascending=False)
850
 
851
- # Round float values to 2 decimal places
 
852
  for column in df.columns:
853
  try:
854
  if pd.api.types.is_float_dtype(df[column]):
855
- df[column] = df[column].round(2)
856
  except:
857
  continue
858
 
@@ -950,7 +1085,6 @@ def create_raw_details_table(benchmark_data, benchmark_type):
950
  cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
951
  df = df[cols]
952
 
953
- # Round float values to 2 decimal places
954
  for column in df.columns:
955
  try:
956
  if pd.api.types.is_float_dtype(df[column]):
@@ -1051,6 +1185,22 @@ def create_raw_details_table(benchmark_data, benchmark_type):
1051
  "license": "License"
1052
  }
1053
  column_mapping.update(custom_columns)
1054
 
1055
 
1056
 
@@ -1161,6 +1311,36 @@ def create_raw_details_table(benchmark_data, benchmark_type):
1161
 
1162
  # elif benchmark_type == "lm_harness" and "Overall" in df.columns:
1163
  # df = df.sort_values(by="Overall", ascending=False)
1164
  elif benchmark_type == "light_eval" and "Overall" in df.columns:
1165
  df = df.sort_values(by="Overall", ascending=False)
1166
  elif benchmark_type == "snake":
@@ -1250,7 +1430,6 @@ def _flatten_dict(d, target_dict, prefix=""):
1250
  target_dict[new_key] = str(value)
1251
  else:
1252
  # Add other values directly
1253
- # Round float values
1254
  if isinstance(value, float):
1255
  target_dict[new_key] = round(value, 2)
1256
  else:
 
118
  return filtered_df
119
 
120
 
 
121
  def load_benchmark_results():
122
  """
123
  Load benchmark results from local files
 
129
  "snake": [],
130
  "retrieval": [],
131
  "arena": [],
132
+ "human_arena": [],
133
+ "structured_output": []
134
  },
135
  "raw": {
136
  "evalmix": [],
 
138
  "snake": [],
139
  "retrieval": [],
140
  "arena": [],
141
+ "human_arena": [],
142
+ "structured_output": []
143
  }
144
  }
145
 
146
  # Define benchmark types to look for
147
+ benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena", "structured_output"] # "lm_harness" removed
148
 
149
  # Initialize RAG Score calculator for runtime calculation
150
  rag_calculator = None
 
388
  else:
389
  df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2)
390
 
 
391
  for column in df.columns:
392
  try:
393
  if pd.api.types.is_float_dtype(df[column]):
 
485
  if not data:
486
  return pd.DataFrame()
487
 
 
488
  formatted_data = []
489
  for item in data:
490
  model_data = {"model_name": format_model_name(item.get("model_name", "Bilinmeyen Model"))}
 
556
  # Sort with NaN at the end
557
  df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
558
 
 
559
  decimal_places = 4 if is_detail else 2
560
  for column in df.columns:
561
  try:
 
607
 
608
  return df
609
 
610
+ def create_structured_outputs_table(data, is_detail=False):
611
+ """
612
+ Creates a table from Structured Outputs results
613
+
614
+ Args:
615
+ data: Structured outputs data
616
+ is_detail: If True, keep 4 decimal places for detail results
617
+ """
618
+ if not data:
619
+ return pd.DataFrame()
620
+
621
+ formatted_data = []
622
+ for item in data:
623
+ model_data = {"model": format_model_name(item.get("model_name", "") or item.get("model", "Bilinmeyen Model"))}
624
+
625
+ # Add specific metrics we're interested in for Structured Outputs
626
+ metrics = [
627
+ "structured_output_score",
628
+ "semantic",
629
+ "response_format",
630
+ "name",
631
+ "document_note",
632
+ "document_date",
633
+ "from",
634
+ "to",
635
+ "dtype",
636
+ "licence"
637
+ ]
638
+
639
+ for metric in metrics:
640
+ try:
641
+ if metric in ["dtype", "licence"]:
642
+ # Use the value from JSON directly
643
+ model_data[metric] = item.get(metric, "Unknown")
644
+ elif metric in item:
645
+ if metric == "structured_output_score" and item[metric] == "N/A":
646
+ model_data[metric] = "N/A"
647
+ elif isinstance(item[metric], str) and item[metric] != "N/A":
648
+ try:
649
+ model_data[metric] = float(item[metric])
650
+ except:
651
+ model_data[metric] = item[metric] # Keep as string if can't convert
652
+ else:
653
+ model_data[metric] = item[metric]
654
+ else:
655
+ model_data[metric] = "N/A"
656
+ except Exception as e:
657
+ if metric in ["dtype", "licence"]:
658
+ model_data[metric] = item.get(metric, "Unknown")
659
+ else:
660
+ model_data[metric] = item.get(metric, "N/A")
661
+
662
+ formatted_data.append(model_data)
663
+
664
+ # Create DataFrame
665
+ df = pd.DataFrame(formatted_data)
666
+
667
+ # Remove the file column if present
668
+ if 'file' in df.columns:
669
+ df = df.drop(columns=['file'])
670
+
671
+ # Try to convert metrics to float with error handling (only numeric columns)
672
+ numeric_cols = ["structured_output_score", "semantic", "name",
673
+ "document_note", "document_date", "from", "to"]
674
+ for col in numeric_cols:
675
+ if col in df.columns:
676
+ try:
677
+ # Convert column to float but keep "N/A" as is
678
+ df[col] = df[col].apply(lambda x: float(x) if isinstance(x, (int, float)) or (isinstance(x, str) and x != "N/A") else x)
679
+ except Exception as e:
680
+ pass # Keep original values if conversion fails
681
+
682
+ # Sort by structured_output_score if available
683
+ if "structured_output_score" in df.columns:
684
+ # For sorting, replace non-numeric values with NaN temporarily
685
+ sort_col = pd.to_numeric(df["structured_output_score"], errors="coerce")
686
+ # Sort with NaN at the end
687
+ df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
688
+
689
+ decimal_places = 4 if is_detail else 2
690
+ for column in df.columns:
691
+ try:
692
+ if pd.api.types.is_float_dtype(df[column]):
693
+ df[column] = df[column].round(decimal_places)
694
+ except:
695
+ continue
696
+
697
+ # Format column names according to user request
698
+ column_mapping = {
699
+ "model": "Model",
700
+ "structured_output_score": "Structured Output Score",
701
+ "semantic": "Semantic",
702
+ "response_format": "Response Format",
703
+ "name": "Name",
704
+ "document_note": "Document Note",
705
+ "document_date": "Document Date",
706
+ "from": "From",
707
+ "to": "To",
708
+ "dtype": "Dtype",
709
+ "licence": "Licence"
710
+ }
711
+
712
+ # Rename DataFrame columns
713
+ df = df.rename(columns=column_mapping)
714
+
715
+ # Define desired column order for Structured Outputs - metadata columns at the end
716
+ desired_cols = [
717
+ "Model",
718
+ "Structured Output Score",
719
+ "Semantic",
720
+ "Response Format",
721
+ "Name",
722
+ "Document Note",
723
+ "Document Date",
724
+ "From",
725
+ "To",
726
+ "Dtype",
727
+ "Licence"
728
+ ]
729
+
730
+ # Filter out columns that don't exist in the DataFrame
731
+ final_cols = [col for col in desired_cols if col in df.columns]
732
+
733
+ # Add any remaining columns that weren't in the desired list
734
+ remaining_cols = [col for col in df.columns if col not in final_cols]
735
+ final_cols.extend(remaining_cols)
736
+
737
+ # Set the new column order
738
+ df = df[final_cols]
739
+
740
+ return df
741
+
742
  def create_benchmark_plots(benchmark_data, data_type="avg"):
743
  """
744
  Creates plots from the benchmark data
 
749
  """
750
  plots = {}
751
 
 
752
  if benchmark_data[data_type]["evalmix"]:
753
  df = create_evalmix_table(benchmark_data[data_type]["evalmix"])
754
  if not df.empty and all(col in df.columns for col in ["model_name", "lexical_metric", "semantic_metric"]):
 
757
  if "judge_metric" in df.columns:
758
  metrics.append("judge_metric")
759
 
 
760
  plot_df = pd.melt(
761
  df,
762
  id_vars=["model_name"],
 
765
  value_name="Değer"
766
  )
767
 
 
768
  plot_df["Metrik"] = plot_df["Metrik"].replace({
769
  "lexical_metric": "Lexical Metric",
770
  "semantic_metric": "Semantic Metric",
 
782
  )
783
  plots["evalmix"] = fig
784
 
 
785
  if benchmark_data[data_type]["light_eval"]:
786
  df = create_light_eval_table(benchmark_data[data_type]["light_eval"])
787
  if not df.empty:
 
788
  metric_cols = [col for col in df.columns if col not in ["model_name", "Ortalama", "file", "overall_average", "total_samples"]]
789
  if metric_cols:
790
  fig = go.Figure()
 
816
  Creates a combined leaderboard table from avg JSON data
817
  """
818
  # Define benchmark types to include in the leaderboard
819
+ benchmark_types = ["evalmix", "light_eval", "retrieval", "arena", "structured_output"] # "lm_harness" and "human_arena" removed
820
 
821
  all_models = {}
822
 
 
913
  # Human Elo Score removed from leaderboard table (still available in Human Arena tab)
914
  # Remove dtype and license from JSON - use only lookup table values
915
  pass
916
+ elif benchmark_type == "structured_output":
917
+ if "structured_output_score" in item:
918
+ # Keep higher precision for Structured Outputs to align with detail view
919
+ all_models[formatted_model_name]["Structured Outputs"] = round(item.get("structured_output_score", 0), 4)
920
+ # Remove dtype and license from JSON - use only lookup table values
921
 
922
  # Create DataFrame from the collected data
923
  if all_models:
 
951
  display_cols = [
952
  "Auto Elo Score",
953
  "Retrieval",
954
+ "Structured Outputs",
955
  "Light Eval",
956
  "Turkish Semantic",
957
  "Multilingual Semantic",
 
966
  df[col] = df[col].fillna(0)
967
 
968
  # Explicitly reorder columns to match the UI display order exactly as in the screenshot
969
+ desired_order = ["Model Name", "Auto Elo Score", "Retrieval", "Structured Outputs", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"]
970
 
971
  # Filter out columns that don't exist in the DataFrame
972
  actual_order = [col for col in desired_order if col in df.columns]
 
979
  if "Auto Elo Score" in df.columns:
980
  df = df.sort_values(by="Auto Elo Score", ascending=False)
981
 
982
+
983
+ four_decimal_columns = {"Structured Outputs"}
984
  for column in df.columns:
985
  try:
986
  if pd.api.types.is_float_dtype(df[column]):
987
+ if column in four_decimal_columns:
988
+ df[column] = df[column].round(4)
989
+ else:
990
+ df[column] = df[column].round(2)
991
  except:
992
  continue
993
 
 
1085
  cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
1086
  df = df[cols]
1087
 
 
1088
  for column in df.columns:
1089
  try:
1090
  if pd.api.types.is_float_dtype(df[column]):
 
1185
  "license": "License"
1186
  }
1187
  column_mapping.update(custom_columns)
1188
+
1189
+ elif benchmark_type == "structured_output":
1190
+ # Structured Output benchmark column mappings
1191
+ custom_columns = {
1192
+ "structured_output_score": "Structured Output Score",
1193
+ "semantic": "Semantic",
1194
+ "response_format": "Response Format",
1195
+ "name": "Name",
1196
+ "document_note": "Document Note",
1197
+ "document_date": "Document Date",
1198
+ "from": "From",
1199
+ "to": "To",
1200
+ "dtype": "Dtype",
1201
+ "license": "License"
1202
+ }
1203
+ column_mapping.update(custom_columns)
1204
 
1205
 
1206
 
 
1311
 
1312
  # elif benchmark_type == "lm_harness" and "Overall" in df.columns:
1313
  # df = df.sort_values(by="Overall", ascending=False)
1314
+ elif benchmark_type == "structured_output":
1315
+ # Sort by Structured Output Score if available
1316
+ if "Structured Output Score" in df.columns:
1317
+ df = df.sort_values(by="Structured Output Score", ascending=False)
1318
+
1319
+ # Define desired column order for Structured Output - metadata columns at the end
1320
+ desired_cols = [
1321
+ "Model Name",
1322
+ "Structured Output Score",
1323
+ "Semantic",
1324
+ "Response Format",
1325
+ "Name",
1326
+ "Document Note",
1327
+ "Document Date",
1328
+ "From",
1329
+ "To",
1330
+ "Dtype",
1331
+ "License"
1332
+ ]
1333
+
1334
+ # Filter out columns that don't exist in the DataFrame
1335
+ final_cols = [col for col in desired_cols if col in df.columns]
1336
+
1337
+ # Add any remaining columns that weren't in the desired list
1338
+ remaining_cols = [col for col in df.columns if col not in final_cols]
1339
+ final_cols.extend(remaining_cols)
1340
+
1341
+ # Set the new column order
1342
+ df = df[final_cols]
1343
+
1344
  elif benchmark_type == "light_eval" and "Overall" in df.columns:
1345
  df = df.sort_values(by="Overall", ascending=False)
1346
  elif benchmark_type == "snake":
 
1430
  target_dict[new_key] = str(value)
1431
  else:
1432
  # Add other values directly
 
1433
  if isinstance(value, float):
1434
  target_dict[new_key] = round(value, 2)
1435
  else: