nmmursit committed on
Commit 7dea7c1 · 1 Parent(s): 2f272c5

feat: Add structured output support and refactor comments


- Integrate structured output benchmark functionality
- Clean up Turkish comments and improve code quality

app.py CHANGED
@@ -43,10 +43,10 @@ from src.utils import (
43
  create_light_eval_table,
44
  create_raw_details_table,
45
  create_human_arena_table,
 
46
  update_supported_base_models
47
  )
48
 
49
- # Import the pipelines utils functions
50
  from pipelines.utils.common import search_and_filter
51
  from pipelines.unified_benchmark import submit_unified_benchmark
52
 
@@ -72,7 +72,6 @@ def format_dataframe(df, is_light_eval_detail=False):
72
  if df.empty:
73
  return df
74
 
75
- # Remove the 'file' column
76
  if 'file' in df.columns:
77
  df = df.drop(columns=['file'])
78
 
@@ -83,16 +82,24 @@ def format_dataframe(df, is_light_eval_detail=False):
83
  if col in df.columns:
84
  df = df.drop(columns=[col])
85
 
86
- # Round float values - 4 decimal places for light eval detail, 2 for the rest
87
- decimal_places = 4 if is_light_eval_detail else 2
88
  for column in df.columns:
89
  try:
90
  if pd.api.types.is_float_dtype(df[column]):
91
- df[column] = df[column].round(decimal_places)
92
  except:
93
  continue
94
 
95
- # Format column names properly
96
  column_mapping = {}
97
  for col in df.columns:
98
  # Skip run_id and user_id fields
@@ -162,15 +169,12 @@ def create_demo():
162
  gr.Markdown(TITLE)
163
  gr.Markdown(INTRODUCTION_TEXT)
164
 
165
- # Hidden session state to track login expiration
166
  session_expiry = gr.State(None)
167
 
168
  try:
169
- # Load benchmark results
170
  benchmark_results = load_benchmark_results()
171
  default_plots = create_benchmark_plots(benchmark_results, "avg")
172
 
173
- # State variable to track login state across page refreshes
174
  login_state = gr.State(value=False)
175
 
176
  with gr.Tabs() as tabs:
@@ -178,8 +182,6 @@ def create_demo():
178
  gr.Markdown("## Model Evaluation Results")
179
  gr.Markdown("This screen shows model performance across different evaluation categories.")
180
 
181
- # Remove the separate refresh button row
182
- # Instead, combine search and refresh in one row
183
  with gr.Row():
184
  search_input = gr.Textbox(
185
  label="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
@@ -192,10 +194,8 @@ def create_demo():
192
  # # Status display for refresh results
193
  # refresh_status = gr.Markdown("", visible=False)
194
 
195
- # Show the benchmark tabs as a tab group with symbols
196
  with gr.Tabs() as benchmark_tabs:
197
  with gr.TabItem("👥 Human Arena"):
198
- # Human Arena results - use the detail files
199
  human_arena_data = benchmark_results["raw"]["human_arena"]
200
 
201
  # Store human arena data in a state component for filtering
@@ -220,7 +220,6 @@ def create_demo():
220
  if filtered_df.empty:
221
  filtered_df = pd.DataFrame({"Model Name": ["No data available"]})
222
 
223
- # Return updated buttons with new variants
224
  if category == "general":
225
  return (
226
  filtered_df,
@@ -236,7 +235,6 @@ def create_demo():
236
  gr.Button("Reasoning", variant="primary", elem_id="human_arena_reasoning_btn", elem_classes=["active-btn"])
237
  )
238
 
239
- # Initial table load
240
  if human_arena_data:
241
  human_arena_df = create_human_arena_table(human_arena_data, category="general")
242
  else:
@@ -257,7 +255,6 @@ def create_demo():
257
  column_widths=["300px", "150px", "110px", "110px", "110px", "156px", "169px", "100px", "120px"]
258
  )
259
 
260
- # Button click handlers
261
  general_btn.click(
262
  fn=lambda data: filter_human_arena_table("general", data),
263
  inputs=[human_arena_state],
@@ -295,7 +292,6 @@ def create_demo():
295
  )
296
 
297
  with gr.TabItem("🏟️ Auto Arena"):
298
- # Arena results - use the detail files
299
  arena_details_df = create_raw_details_table(benchmark_results, "arena")
300
  arena_details_df = format_dataframe(arena_details_df)
301
 
@@ -311,7 +307,6 @@ def create_demo():
311
  )
312
 
313
  with gr.TabItem("📚 Retrieval"):
314
- # RAG Judge sonuçları - detail dosyalarını kullan
315
  rag_details_df = create_raw_details_table(benchmark_results, "retrieval")
316
  rag_details_df = format_dataframe(rag_details_df)
317
 
@@ -326,8 +321,21 @@ def create_demo():
326
 
327
  )
328
 
329
  with gr.TabItem("⚡ Light Eval"):
330
- # Light Eval results - use the detail files
331
  light_details_data = benchmark_results["raw"]["light_eval"]
332
  if light_details_data:
333
  light_details_df = create_light_eval_table(light_details_data, is_detail=True)
@@ -348,7 +356,6 @@ def create_demo():
348
  )
349
 
350
  with gr.TabItem("📋 EvalMix"):
351
- # Hybrid Benchmark results - use the detail files
352
  hybrid_details_df = create_raw_details_table(benchmark_results, "evalmix")
353
  hybrid_details_df = format_dataframe(hybrid_details_df)
354
 
@@ -364,7 +371,6 @@ def create_demo():
364
  )
365
 
366
  with gr.TabItem("🐍 𝐒𝐧𝐚𝐤𝐞 𝐁𝐞𝐧𝐜𝐡"):
367
- # Snake Benchmark results - use the detail files
368
  snake_details_df = create_raw_details_table(benchmark_results, "snake")
369
  snake_details_df = format_dataframe(snake_details_df)
370
 
@@ -410,32 +416,29 @@ def create_demo():
410
  # ]
411
  # )
412
 
413
- # Shared search function for all tabs
414
  def search_all_tabs(query, original_data):
415
  """
416
- Searches across all tabs
417
  """
418
  if not query or query.strip() == "":
419
- # Empty search - return the original data
420
  return (original_data, arena_details_df, human_arena_df,
421
- rag_details_df, light_details_df, hybrid_details_df, snake_details_df)
422
 
423
- # Query present - filter all tabs
424
  return (
425
  search_and_filter(query, original_data, "All"),
426
  search_and_filter(query, arena_details_df, "All"),
427
  search_and_filter(query, human_arena_df, "All"),
428
  search_and_filter(query, rag_details_df, "All"),
 
429
  search_and_filter(query, light_details_df, "All"),
430
  search_and_filter(query, hybrid_details_df, "All"),
431
  search_and_filter(query, snake_details_df, "All")
432
  )
433
 
434
- # Search handler - update all tabs
435
  search_input.change(
436
  search_all_tabs,
437
  inputs=[search_input, original_leaderboard_data],
438
- outputs=[combined_table, arena_table, human_arena_table, rag_table, light_table, hybrid_table, snake_table]
439
  )
440
 
441
  with gr.TabItem("ℹ️ About", elem_id="about-tab"):
@@ -692,7 +695,6 @@ def create_demo():
692
  logging.warning(f"Error checking model type: {str(e)}")
693
 
694
  # Call the benchmark function with profile information
695
- # base_model validation was removed, but the parameter is still passed along
696
  result_message, _ = submit_unified_benchmark(model, base_model, reasoning, email, profile)
697
  logging.info(f"Submission processed for model: {model}")
698
  return result_message
 
43
  create_light_eval_table,
44
  create_raw_details_table,
45
  create_human_arena_table,
46
+ create_structured_outputs_table,
47
  update_supported_base_models
48
  )
49
 
 
50
  from pipelines.utils.common import search_and_filter
51
  from pipelines.unified_benchmark import submit_unified_benchmark
52
 
 
72
  if df.empty:
73
  return df
74
 
 
75
  if 'file' in df.columns:
76
  df = df.drop(columns=['file'])
77
 
 
82
  if col in df.columns:
83
  df = df.drop(columns=[col])
84
 
85
+ # Round float values
86
+ # Default: 2 decimal places; 4 if this is a light eval detail table or the structured_output_score column is present.
87
+ # Special case for the leaderboard: keep the "Structured Outputs" and "Retrieval" columns at 4 decimal places.
88
+ if is_light_eval_detail or "structured_output_score" in df.columns:
89
+ default_decimal_places = 4
90
+ else:
91
+ default_decimal_places = 2
92
+ four_decimal_cols = {"Structured Outputs"}
93
  for column in df.columns:
94
  try:
95
  if pd.api.types.is_float_dtype(df[column]):
96
+ if column in four_decimal_cols:
97
+ df[column] = df[column].round(4)
98
+ else:
99
+ df[column] = df[column].round(default_decimal_places)
100
  except:
101
  continue
102
 
 
103
  column_mapping = {}
104
  for col in df.columns:
105
  # Skip run_id and user_id fields
 
169
  gr.Markdown(TITLE)
170
  gr.Markdown(INTRODUCTION_TEXT)
171
 
 
172
  session_expiry = gr.State(None)
173
 
174
  try:
 
175
  benchmark_results = load_benchmark_results()
176
  default_plots = create_benchmark_plots(benchmark_results, "avg")
177
 
 
178
  login_state = gr.State(value=False)
179
 
180
  with gr.Tabs() as tabs:
 
182
  gr.Markdown("## Model Evaluation Results")
183
  gr.Markdown("This screen shows model performance across different evaluation categories.")
184
 
 
 
185
  with gr.Row():
186
  search_input = gr.Textbox(
187
  label="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
 
194
  # # Status display for refresh results
195
  # refresh_status = gr.Markdown("", visible=False)
196
 
 
197
  with gr.Tabs() as benchmark_tabs:
198
  with gr.TabItem("👥 Human Arena"):
 
199
  human_arena_data = benchmark_results["raw"]["human_arena"]
200
 
201
  # Store human arena data in a state component for filtering
 
220
  if filtered_df.empty:
221
  filtered_df = pd.DataFrame({"Model Name": ["No data available"]})
222
 
 
223
  if category == "general":
224
  return (
225
  filtered_df,
 
235
  gr.Button("Reasoning", variant="primary", elem_id="human_arena_reasoning_btn", elem_classes=["active-btn"])
236
  )
237
 
 
238
  if human_arena_data:
239
  human_arena_df = create_human_arena_table(human_arena_data, category="general")
240
  else:
 
255
  column_widths=["300px", "150px", "110px", "110px", "110px", "156px", "169px", "100px", "120px"]
256
  )
257
 
 
258
  general_btn.click(
259
  fn=lambda data: filter_human_arena_table("general", data),
260
  inputs=[human_arena_state],
 
292
  )
293
 
294
  with gr.TabItem("🏟️ Auto Arena"):
 
295
  arena_details_df = create_raw_details_table(benchmark_results, "arena")
296
  arena_details_df = format_dataframe(arena_details_df)
297
 
 
307
  )
308
 
309
  with gr.TabItem("📚 Retrieval"):
 
310
  rag_details_df = create_raw_details_table(benchmark_results, "retrieval")
311
  rag_details_df = format_dataframe(rag_details_df)
312
 
 
321
 
322
  )
323
 
324
+ with gr.TabItem("🔧 Structured Outputs"):
325
+ structured_details_df = create_structured_outputs_table(benchmark_results["raw"]["structured_output"], is_detail=True)
326
+
327
+ if structured_details_df.empty:
328
+ structured_details_df = pd.DataFrame({"Model": ["No data available"]})
329
+
330
+ structured_table = gr.DataFrame(
331
+ value=structured_details_df,
332
+ label="Structured Outputs Detailed Results",
333
+ interactive=False,
334
+ column_widths=["300px", "250px", "110px", "150px", "100px", "150px", "150px", "100px", "100px", "100px", "120px"]
335
+
336
+ )
337
+
338
  with gr.TabItem("⚡ Light Eval"):
 
339
  light_details_data = benchmark_results["raw"]["light_eval"]
340
  if light_details_data:
341
  light_details_df = create_light_eval_table(light_details_data, is_detail=True)
 
356
  )
357
 
358
  with gr.TabItem("📋 EvalMix"):
 
359
  hybrid_details_df = create_raw_details_table(benchmark_results, "evalmix")
360
  hybrid_details_df = format_dataframe(hybrid_details_df)
361
 
 
371
  )
372
 
373
  with gr.TabItem("🐍 𝐒𝐧𝐚𝐤𝐞 𝐁𝐞𝐧𝐜𝐡"):
 
374
  snake_details_df = create_raw_details_table(benchmark_results, "snake")
375
  snake_details_df = format_dataframe(snake_details_df)
376
 
 
416
  # ]
417
  # )
418
 
 
419
  def search_all_tabs(query, original_data):
420
  """
421
+ Search across all tabs
422
  """
423
  if not query or query.strip() == "":
 
424
  return (original_data, arena_details_df, human_arena_df,
425
+ rag_details_df, structured_details_df, light_details_df, hybrid_details_df, snake_details_df)
426
 
 
427
  return (
428
  search_and_filter(query, original_data, "All"),
429
  search_and_filter(query, arena_details_df, "All"),
430
  search_and_filter(query, human_arena_df, "All"),
431
  search_and_filter(query, rag_details_df, "All"),
432
+ search_and_filter(query, structured_details_df, "All"),
433
  search_and_filter(query, light_details_df, "All"),
434
  search_and_filter(query, hybrid_details_df, "All"),
435
  search_and_filter(query, snake_details_df, "All")
436
  )
437
 
 
438
  search_input.change(
439
  search_all_tabs,
440
  inputs=[search_input, original_leaderboard_data],
441
+ outputs=[combined_table, arena_table, human_arena_table, rag_table, structured_table, light_table, hybrid_table, snake_table]
442
  )
443
 
444
  with gr.TabItem("ℹ️ About", elem_id="about-tab"):
 
695
  logging.warning(f"Error checking model type: {str(e)}")
696
 
697
  # Call the benchmark function with profile information
 
698
  result_message, _ = submit_unified_benchmark(model, base_model, reasoning, email, profile)
699
  logging.info(f"Submission processed for model: {model}")
700
  return result_message
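To make the new rounding behaviour concrete, here is a small self-contained sketch. It mirrors the policy added to `format_dataframe` above but is not the app's actual function; the helper name and the toy values are invented for illustration.

```python
import pandas as pd

# Illustrative only - mirrors the rounding policy added to format_dataframe above.
def round_like_format_dataframe(df: pd.DataFrame, is_light_eval_detail: bool = False) -> pd.DataFrame:
    default_places = 4 if is_light_eval_detail or "structured_output_score" in df.columns else 2
    four_decimal_cols = {"Structured Outputs"}
    for column in df.columns:
        if pd.api.types.is_float_dtype(df[column]):
            places = 4 if column in four_decimal_cols else default_places
            df[column] = df[column].round(places)
    return df

# Toy values, chosen only to show the two precisions side by side
toy = pd.DataFrame({"Model Name": ["model-a"],
                    "Structured Outputs": [0.76348],
                    "Light Eval": [0.51237]})
print(round_like_format_dataframe(toy))
# -> "Structured Outputs" keeps 4 decimals (0.7635), "Light Eval" is rounded to 2 (0.51)
```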
result/structured_output/avg_str001.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "meta-llama/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7635,
4
+ "run_id": "str001"
5
+ }
result/structured_output/avg_str002.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "grok-3",
3
+ "structured_output_score": 0.7628,
4
+ "run_id": "str002"
5
+ }
result/structured_output/avg_str003.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "newmindai/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7622,
4
+ "run_id": "str003"
5
+ }
result/structured_output/avg_str004.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "deepseek-ai/DeepSeek-R1",
3
+ "structured_output_score": 0.76,
4
+ "run_id": "str004"
5
+ }
result/structured_output/avg_str005.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "google/gemma-3-27b-it",
3
+ "structured_output_score": 0.7478,
4
+ "run_id": "str005"
5
+ }
result/structured_output/avg_str006.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "grok-3-mini-fast-beta",
3
+ "structured_output_score": 0.7471,
4
+ "run_id": "str006"
5
+ }
result/structured_output/avg_str007.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
3
+ "structured_output_score": 0.7424,
4
+ "run_id": "str007"
5
+ }
result/structured_output/avg_str008.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-32B",
3
+ "structured_output_score": 0.735,
4
+ "run_id": "str008"
5
+ }
result/structured_output/avg_str009.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
3
+ "structured_output_score": 0.7309,
4
+ "run_id": "str009"
5
+ }
result/structured_output/avg_str010.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "newmindai/QwQ-32B-r1",
3
+ "structured_output_score": 0.7252,
4
+ "run_id": "str010"
5
+ }
result/structured_output/avg_str011.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/QwQ-32B",
3
+ "structured_output_score": 0.7205,
4
+ "run_id": "str011"
5
+ }
result/structured_output/avg_str012.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "microsoft/phi-4",
3
+ "structured_output_score": 0.6906,
4
+ "run_id": "str012"
5
+ }
result/structured_output/avg_str013.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-14B",
3
+ "structured_output_score": 0.6153,
4
+ "run_id": "str013"
5
+ }
result/structured_output/avg_str014.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "model_name": "newmindai/Qwen2.5-72b-Instruct",
3
+ "structured_output_score": 0.761,
4
+ "run_id": "ec6bf42a-4482-4f8c-9fbd-2ab5f1eed6bb"
5
+ }
6
+
result/structured_output/detail_str001.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "meta-llama/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7635,
4
+ "semantic": 0.5271,
5
+ "response_format": "506/506",
6
+ "name": 0.6364,
7
+ "document_note": 0.2194,
8
+ "document_date": 0.6561,
9
+ "from": 0.6319,
10
+ "to": 0.4919,
11
+ "dtype": "bfloat16",
12
+ "licence": "Llama-3.3",
13
+ "run_id": "str001"
14
+ }
result/structured_output/detail_str002.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "grok-3",
3
+ "structured_output_score": 0.7628,
4
+ "semantic": 0.5256,
5
+ "response_format": "506/506",
6
+ "name": 0.6344,
7
+ "document_note": 0.166,
8
+ "document_date": 0.6482,
9
+ "from": 0.6493,
10
+ "to": 0.5299,
11
+ "dtype": "Unknown",
12
+ "licence": "Proprietary",
13
+ "run_id": "str002"
14
+ }
result/structured_output/detail_str003.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "newmindai/Llama-3.3-70b-Instruct",
3
+ "structured_output_score": 0.7622,
4
+ "semantic": 0.5245,
5
+ "response_format": "506/506",
6
+ "name": 0.6423,
7
+ "document_note": 0.2016,
8
+ "document_date": 0.6561,
9
+ "from": 0.6259,
10
+ "to": 0.4966,
11
+ "dtype": "bfloat16",
12
+ "licence": "Llama-3.3",
13
+ "run_id": "str003"
14
+ }
result/structured_output/detail_str004.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "deepseek-ai/DeepSeek-R1",
3
+ "structured_output_score": 0.76,
4
+ "semantic": 0.5199,
5
+ "response_format": "506/506",
6
+ "name": 0.6601,
7
+ "document_note": 0.1917,
8
+ "document_date": 0.6542,
9
+ "from": 0.6223,
10
+ "to": 0.4713,
11
+ "dtype": "bfloat16",
12
+ "licence": "MIT",
13
+ "run_id": "str004"
14
+ }
result/structured_output/detail_str005.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "google/gemma-3-27b-it",
3
+ "structured_output_score": 0.7478,
4
+ "semantic": 0.4955,
5
+ "response_format": "506/506",
6
+ "name": 0.5909,
7
+ "document_note": 0.2055,
8
+ "document_date": 0.6502,
9
+ "from": 0.6044,
10
+ "to": 0.4264,
11
+ "dtype": "bfloat16",
12
+ "licence": "Gemma",
13
+ "run_id": "str005"
14
+ }
result/structured_output/detail_str006.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "grok-3-mini-fast-beta",
3
+ "structured_output_score": 0.7471,
4
+ "semantic": 0.4943,
5
+ "response_format": "506/506",
6
+ "name": 0.6403,
7
+ "document_note": 0.1957,
8
+ "document_date": 0.6324,
9
+ "from": 0.567,
10
+ "to": 0.4363,
11
+ "dtype": "Unknown",
12
+ "licence": "Proprietary",
13
+ "run_id": "str006"
14
+ }
result/structured_output/detail_str007.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
3
+ "structured_output_score": 0.7424,
4
+ "semantic": 0.4847,
5
+ "response_format": "506/506",
6
+ "name": 0.581,
7
+ "document_note": 0.2134,
8
+ "document_date": 0.6561,
9
+ "from": 0.5248,
10
+ "to": 0.4482,
11
+ "dtype": "bfloat16",
12
+ "licence": "Llama 3.1",
13
+ "run_id": "str007"
14
+ }
result/structured_output/detail_str008.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-32B",
3
+ "structured_output_score": 0.735,
4
+ "semantic": 0.482,
5
+ "response_format": "500/506",
6
+ "name": 0.566,
7
+ "document_note": 0.21,
8
+ "document_date": 0.636,
9
+ "from": 0.5614,
10
+ "to": 0.4367,
11
+ "dtype": "bfloat16",
12
+ "licence": "Qwen",
13
+ "run_id": "str008"
14
+ }
result/structured_output/detail_str009.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
3
+ "structured_output_score": 0.7309,
4
+ "semantic": 0.4618,
5
+ "response_format": "506/506",
6
+ "name": 0.502,
7
+ "document_note": 0.1957,
8
+ "document_date": 0.6383,
9
+ "from": 0.5927,
10
+ "to": 0.3801,
11
+ "dtype": "bfloat16",
12
+ "licence": "Qwen",
13
+ "run_id": "str009"
14
+ }
result/structured_output/detail_str010.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "newmindai/QwQ-32B-r1",
3
+ "structured_output_score": 0.7252,
4
+ "semantic": 0.4564,
5
+ "response_format": "503/506",
6
+ "name": 0.507,
7
+ "document_note": 0.1272,
8
+ "document_date": 0.6243,
9
+ "from": 0.5816,
10
+ "to": 0.4419,
11
+ "dtype": "bfloat16",
12
+ "licence": "Apache 2.0",
13
+ "run_id": "str010"
14
+ }
result/structured_output/detail_str011.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/QwQ-32B",
3
+ "structured_output_score": 0.7205,
4
+ "semantic": 0.4468,
5
+ "response_format": "503/506",
6
+ "name": 0.4791,
7
+ "document_note": 0.1352,
8
+ "document_date": 0.6243,
9
+ "from": 0.573,
10
+ "to": 0.4224,
11
+ "dtype": "bfloat16",
12
+ "licence": "Apache 2.0",
13
+ "run_id": "str011"
14
+ }
result/structured_output/detail_str012.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "microsoft/phi-4",
3
+ "structured_output_score": 0.6906,
4
+ "semantic": 0.3912,
5
+ "response_format": "503/506",
6
+ "name": 0.3752,
7
+ "document_note": 0.2275,
8
+ "document_date": 0.5768,
9
+ "from": 0.4542,
10
+ "to": 0.3222,
11
+ "dtype": "bfloat16",
12
+ "licence": "MIT",
13
+ "run_id": "str012"
14
+ }
result/structured_output/detail_str013.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_name": "Qwen/Qwen3-14B",
3
+ "structured_output_score": 0.6153,
4
+ "semantic": 0.2426,
5
+ "response_format": "501/506",
6
+ "name": 0.31,
7
+ "document_note": 0.156,
8
+ "document_date": 0.538,
9
+ "from": 0.1095,
10
+ "to": 0.0998,
11
+ "dtype": "bfloat16",
12
+ "licence": "Apache 2.0",
13
+ "run_id": "str013"
14
+ }
result/structured_output/detail_str014.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "model_name": "newmindai/Qwen2.5-72b-Instruct",
3
+ "structured_output_score": 0.761,
4
+ "semantic": 0.5219,
5
+ "response_format": "506/506",
6
+ "name": 0.5632,
7
+ "document_note": 0.2905,
8
+ "document_date": 0.6403,
9
+ "from": 0.6136,
10
+ "to": 0.5018,
11
+ "dtype": "bfloat16",
12
+ "licence": "Qwen",
13
+ "run_id": "ec6bf42a-4482-4f8c-9fbd-2ab5f1eed6bb",
14
+ "ISL": 1712575,
15
+ "OSL": 183946,
16
+ "cost": null,
17
+ "e2e_benchmark_time": 114.4683,
18
+ "model_generation_time": 104.2865,
19
+ "scoring_duration_minutes": 10.1755,
20
+ "provider": "nebius",
21
+ "sample_count": 506,
22
+ "success_response": 506
23
+ }
24
+
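The files above pair one `avg_*.json` summary with one `detail_*.json` record per run. A rough sketch of how such records could be collected for `create_structured_outputs_table` follows; the helper name and glob pattern are assumptions for illustration, since the actual `load_benchmark_results` walk of the `result/` tree is not shown in this diff.

```python
import glob
import json

def load_structured_output_records(kind: str = "detail") -> list:
    """Collect structured_output result records of one kind ("avg" or "detail")."""
    records = []
    # Assumed layout, taken from the files added above:
    #   result/structured_output/{avg,detail}_<run>.json
    for path in sorted(glob.glob(f"result/structured_output/{kind}_*.json")):
        with open(path, encoding="utf-8") as fh:
            records.append(json.load(fh))
    return records

# Detail records would feed the detail tab, avg records the leaderboard column, e.g.
# create_structured_outputs_table(load_structured_output_records("detail"), is_detail=True)
```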
src/display/about.py CHANGED
@@ -57,7 +57,7 @@ Evaluate your model's performance in the following categories:
57
 
58
  6. 🐍 **Snake Bench** - Specialized evaluation measuring step-by-step problem solving and complex reasoning abilities.
59
 
60
- 7. 🧩 **Structured Outputs** - Coming soon!
61
 
62
  Evaluate your model in any or all of these categories to discover its capabilities and areas of excellence.
63
 
@@ -244,6 +244,37 @@ Human evaluators consider multiple factors when comparing model responses:
244
 
245
  Human Arena provides a complementary perspective to automated benchmarks, capturing nuanced human preferences that traditional metrics might miss. This evaluation is particularly valuable for understanding how models perform in real-world conversational scenarios.
246
 
247
  """
248
 
249
  EVALUATION_QUEUE_TEXT = """
 
57
 
58
  6. 🐍 **Snake Bench** - Specialized evaluation measuring step-by-step problem solving and complex reasoning abilities.
59
 
60
+ 7. 🧩 **Structured Outputs** - Evaluation of models' ability to generate properly formatted, structured responses with accurate field extraction and semantic understanding.
61
 
62
  Evaluate your model in any or all of these categories to discover its capabilities and areas of excellence.
63
 
 
244
 
245
  Human Arena provides a complementary perspective to automated benchmarks, capturing nuanced human preferences that traditional metrics might miss. This evaluation is particularly valuable for understanding how models perform in real-world conversational scenarios.
246
 
247
+ ### 7. 🧩 Structured Outputs
248
+ Structured Outputs evaluation assesses models' ability to generate properly formatted, structured responses with accurate field extraction and semantic understanding. This benchmark tests how well language models can parse, understand, and extract specific information from documents while maintaining semantic coherence.
249
+
250
+ **Evaluation Methodology:**
251
+ Models are evaluated on their ability to extract structured information from Turkish legal documents. The evaluation uses advanced semantic similarity scoring with Turkish-specific embedding models for accurate assessment.
252
+
253
+ **Technical Implementation:**
254
+ - **Embedding Model**: Primary evaluation uses [`newmindai/TurkEmbed4Retrieval`](https://huggingface.co/newmindai/TurkEmbed4Retrieval) for Turkish-specific semantic understanding
255
+ - **Similarity Threshold**: 0.75 cosine similarity threshold for field matching
256
+ - **Ground Truth Comparison**: MongoDB-stored ground truth data with pre-computed embeddings
257
+
258
+ **Evaluation Metrics:**
259
+
260
+ - **Overall**: Combined overall performance metric that averages Semantic understanding and Response Format success ratio
261
+ - **Semantic**: Measures semantic understanding and coherence of extracted information using cosine similarity (corresponds to `overall_score` in scoring)
262
+ - **Response Format**: Success ratio showing successful JSON extractions vs total attempts (success_response/sample_count)
263
+ - **Name**: Accuracy in extracting and identifying name fields from legal documents (20% weight)
264
+ - **Document Date**: Accuracy in date field extraction with multiple format support (20% weight)
265
+ - **Document Note**: Performance in extracting document annotation information using semantic similarity (20% weight)
266
+ - **From**: Performance in extracting source/sender information as lists with semantic matching (20% weight)
267
+ - **To**: Accuracy in extracting destination/recipient information as lists with semantic matching (20% weight)
268
+
269
+ **Scoring Algorithm:**
270
+ The evaluation uses a sophisticated multi-level scoring system:
271
+
272
+ 1. **String Fields** (name, document_note): Turkish embedding similarity with 0.75 threshold using `newmindai/TurkEmbed4Retrieval`
273
+ 2. **Date Fields** (document_date): Exact date matching with multiple format parsing support
274
+ 3. **List Fields** (from, to): One-way similarity from ground truth to predictions using semantic matching
275
+ 4. **Overall Score Calculation**: `Overall = (Semantic + Response Format) / 2`
276
+ 5. **Field Weights**: Each extraction field (name, document_date, document_note, from, to) contributes equally with 20% weight to the semantic score
277
+
278
  """
279
 
280
  EVALUATION_QUEUE_TEXT = """
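The scoring description added to about.py above reduces to a small amount of arithmetic. The sketch below illustrates it under stated assumptions: the embedding comparison is stubbed with a character-level ratio (the real pipeline compares `newmindai/TurkEmbed4Retrieval` embeddings against MongoDB-stored ground truth, neither of which appears in this diff), and the date formats are placeholders.

```python
from datetime import datetime
from difflib import SequenceMatcher

SIM_THRESHOLD = 0.75  # similarity threshold quoted in the description above

def similarity(a: str, b: str) -> float:
    # Stand-in for the Turkish embedding model: the real evaluation compares
    # sentence embeddings; a character-level ratio keeps this sketch self-contained.
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def string_field_score(pred: str, truth: str) -> float:
    # name / document_note: similarity thresholded at 0.75
    return 1.0 if similarity(pred, truth) >= SIM_THRESHOLD else 0.0

def date_field_score(pred: str, truth: str, formats=("%d.%m.%Y", "%Y-%m-%d")) -> float:
    # document_date: exact match after trying a few (placeholder) date formats
    def parse(value):
        for fmt in formats:
            try:
                return datetime.strptime(value, fmt).date()
            except ValueError:
                continue
        return None
    parsed_pred, parsed_truth = parse(pred), parse(truth)
    return 1.0 if parsed_pred is not None and parsed_pred == parsed_truth else 0.0

def list_field_score(pred, truth):
    # from / to: one-way matching from each ground-truth entry to the predictions
    if not truth:
        return 1.0
    hits = sum(any(similarity(p, t) >= SIM_THRESHOLD for p in pred) for t in truth)
    return hits / len(truth)

def structured_output_score(field_scores, success_response, sample_count):
    # Five fields at 20% weight each -> a plain mean gives the semantic score;
    # the overall score averages it with the JSON-extraction success ratio.
    semantic = sum(field_scores.values()) / len(field_scores)
    response_format = success_response / sample_count
    return (semantic + response_format) / 2
```

Plugging in the per-field averages from detail_str001.json reproduces the published numbers: the five field scores average to a semantic score of about 0.5271, and with a 506/506 format-success ratio the overall score is (0.5271 + 1.0) / 2 ≈ 0.7635.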
src/utils.py CHANGED
@@ -118,7 +118,6 @@ def filter_models(
118
  return filtered_df
119
 
120
 
121
- # New functions
122
  def load_benchmark_results():
123
  """
124
  Load benchmark results from local files
@@ -130,7 +129,8 @@ def load_benchmark_results():
130
  "snake": [],
131
  "retrieval": [],
132
  "arena": [],
133
- "human_arena": []
 
134
  },
135
  "raw": {
136
  "evalmix": [],
@@ -138,12 +138,13 @@ def load_benchmark_results():
138
  "snake": [],
139
  "retrieval": [],
140
  "arena": [],
141
- "human_arena": []
 
142
  }
143
  }
144
 
145
  # Define benchmark types to look for
146
- benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena"] # "lm_harness" removed
147
 
148
  # Initialize RAG Score calculator for runtime calculation
149
  rag_calculator = None
@@ -387,7 +388,6 @@ def create_evalmix_table(data):
387
  else:
388
  df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2)
389
 
390
- # Round float values to 2 decimal places
391
  for column in df.columns:
392
  try:
393
  if pd.api.types.is_float_dtype(df[column]):
@@ -485,7 +485,6 @@ def create_light_eval_table(data, is_detail=False):
485
  if not data:
486
  return pd.DataFrame()
487
 
488
- # Light eval results come in a different format and need reshaping
489
  formatted_data = []
490
  for item in data:
491
  model_data = {"model_name": format_model_name(item.get("model_name", "Bilinmeyen Model"))}
@@ -557,7 +556,6 @@ def create_light_eval_table(data, is_detail=False):
557
  # Sort with NaN at the end
558
  df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
559
 
560
- # Round float values - 4 decimal places for detail, 2 for avg
561
  decimal_places = 4 if is_detail else 2
562
  for column in df.columns:
563
  try:
@@ -609,6 +607,138 @@ def create_light_eval_table(data, is_detail=False):
609
 
610
  return df
611
 
612
  def create_benchmark_plots(benchmark_data, data_type="avg"):
613
  """
614
  Creates plots from the benchmark data
@@ -619,7 +749,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
619
  """
620
  plots = {}
621
 
622
- # Bar chart for the Hybrid Benchmark
623
  if benchmark_data[data_type]["evalmix"]:
624
  df = create_evalmix_table(benchmark_data[data_type]["evalmix"])
625
  if not df.empty and all(col in df.columns for col in ["model_name", "lexical_metric", "semantic_metric"]):
@@ -628,7 +757,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
628
  if "judge_metric" in df.columns:
629
  metrics.append("judge_metric")
630
 
631
- # Convert the data to long format
632
  plot_df = pd.melt(
633
  df,
634
  id_vars=["model_name"],
@@ -637,7 +765,6 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
637
  value_name="Değer"
638
  )
639
 
640
- # Make the metric names more readable
641
  plot_df["Metrik"] = plot_df["Metrik"].replace({
642
  "lexical_metric": "Lexical Metric",
643
  "semantic_metric": "Semantic Metric",
@@ -655,11 +782,9 @@ def create_benchmark_plots(benchmark_data, data_type="avg"):
655
  )
656
  plots["evalmix"] = fig
657
 
658
- # Radar chart for Light Eval
659
  if benchmark_data[data_type]["light_eval"]:
660
  df = create_light_eval_table(benchmark_data[data_type]["light_eval"])
661
  if not df.empty:
662
- # Exclude the Ortalama and total_samples columns
663
  metric_cols = [col for col in df.columns if col not in ["model_name", "Ortalama", "file", "overall_average", "total_samples"]]
664
  if metric_cols:
665
  fig = go.Figure()
@@ -691,7 +816,7 @@ def create_combined_leaderboard_table(benchmark_data):
691
  Creates a combined leaderboard table from avg JSON data
692
  """
693
  # Define benchmark types to include in the leaderboard
694
- benchmark_types = ["evalmix", "light_eval", "retrieval", "arena"] # "lm_harness" and "human_arena" removed
695
 
696
  all_models = {}
697
 
@@ -788,6 +913,11 @@ def create_combined_leaderboard_table(benchmark_data):
788
  # Human Elo Score removed from leaderboard table (still available in Human Arena tab)
789
  # Remove dtype and license from JSON - use only lookup table values
790
  pass
791
 
792
  # Create DataFrame from the collected data
793
  if all_models:
@@ -821,6 +951,7 @@ def create_combined_leaderboard_table(benchmark_data):
821
  display_cols = [
822
  "Auto Elo Score",
823
  "Retrieval",
 
824
  "Light Eval",
825
  "Turkish Semantic",
826
  "Multilingual Semantic",
@@ -835,7 +966,7 @@ def create_combined_leaderboard_table(benchmark_data):
835
  df[col] = df[col].fillna(0)
836
 
837
  # Explicitly reorder columns to match the UI display order exactly as in the screenshot
838
- desired_order = ["Model Name", "Auto Elo Score", "Retrieval", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"]
839
 
840
  # Filter out columns that don't exist in the DataFrame
841
  actual_order = [col for col in desired_order if col in df.columns]
@@ -848,11 +979,15 @@ def create_combined_leaderboard_table(benchmark_data):
848
  if "Auto Elo Score" in df.columns:
849
  df = df.sort_values(by="Auto Elo Score", ascending=False)
850
 
851
- # Round float values to 2 decimal places
 
852
  for column in df.columns:
853
  try:
854
  if pd.api.types.is_float_dtype(df[column]):
855
- df[column] = df[column].round(2)
856
  except:
857
  continue
858
 
@@ -950,7 +1085,6 @@ def create_raw_details_table(benchmark_data, benchmark_type):
950
  cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
951
  df = df[cols]
952
 
953
- # Round float values to 2 decimal places
954
  for column in df.columns:
955
  try:
956
  if pd.api.types.is_float_dtype(df[column]):
@@ -1051,6 +1185,22 @@ def create_raw_details_table(benchmark_data, benchmark_type):
1051
  "license": "License"
1052
  }
1053
  column_mapping.update(custom_columns)
1054
 
1055
 
1056
 
@@ -1161,6 +1311,36 @@ def create_raw_details_table(benchmark_data, benchmark_type):
1161
 
1162
  # elif benchmark_type == "lm_harness" and "Overall" in df.columns:
1163
  # df = df.sort_values(by="Overall", ascending=False)
1164
  elif benchmark_type == "light_eval" and "Overall" in df.columns:
1165
  df = df.sort_values(by="Overall", ascending=False)
1166
  elif benchmark_type == "snake":
@@ -1250,7 +1430,6 @@ def _flatten_dict(d, target_dict, prefix=""):
1250
  target_dict[new_key] = str(value)
1251
  else:
1252
  # Add other values directly
1253
- # Round float values
1254
  if isinstance(value, float):
1255
  target_dict[new_key] = round(value, 2)
1256
  else:
 
118
  return filtered_df
119
 
120
 
 
121
  def load_benchmark_results():
122
  """
123
  Load benchmark results from local files
 
129
  "snake": [],
130
  "retrieval": [],
131
  "arena": [],
132
+ "human_arena": [],
133
+ "structured_output": []
134
  },
135
  "raw": {
136
  "evalmix": [],
 
138
  "snake": [],
139
  "retrieval": [],
140
  "arena": [],
141
+ "human_arena": [],
142
+ "structured_output": []
143
  }
144
  }
145
 
146
  # Define benchmark types to look for
147
+ benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena", "structured_output"] # "lm_harness" removed
148
 
149
  # Initialize RAG Score calculator for runtime calculation
150
  rag_calculator = None
 
388
  else:
389
  df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2)
390
 
 
391
  for column in df.columns:
392
  try:
393
  if pd.api.types.is_float_dtype(df[column]):
 
485
  if not data:
486
  return pd.DataFrame()
487
 
 
488
  formatted_data = []
489
  for item in data:
490
  model_data = {"model_name": format_model_name(item.get("model_name", "Bilinmeyen Model"))}
 
556
  # Sort with NaN at the end
557
  df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
558
 
 
559
  decimal_places = 4 if is_detail else 2
560
  for column in df.columns:
561
  try:
 
607
 
608
  return df
609
 
610
+ def create_structured_outputs_table(data, is_detail=False):
611
+ """
612
+ Creates a table from Structured Outputs results
613
+
614
+ Args:
615
+ data: Structured outputs data
616
+ is_detail: If True, keep 4 decimal places for detail results
617
+ """
618
+ if not data:
619
+ return pd.DataFrame()
620
+
621
+ formatted_data = []
622
+ for item in data:
623
+ model_data = {"model": format_model_name(item.get("model_name", "") or item.get("model", "Bilinmeyen Model"))}
624
+
625
+ # Add specific metrics we're interested in for Structured Outputs
626
+ metrics = [
627
+ "structured_output_score",
628
+ "semantic",
629
+ "response_format",
630
+ "name",
631
+ "document_note",
632
+ "document_date",
633
+ "from",
634
+ "to",
635
+ "dtype",
636
+ "licence"
637
+ ]
638
+
639
+ for metric in metrics:
640
+ try:
641
+ if metric in ["dtype", "licence"]:
642
+ # Use the value from JSON directly
643
+ model_data[metric] = item.get(metric, "Unknown")
644
+ elif metric in item:
645
+ if metric == "structured_output_score" and item[metric] == "N/A":
646
+ model_data[metric] = "N/A"
647
+ elif isinstance(item[metric], str) and item[metric] != "N/A":
648
+ try:
649
+ model_data[metric] = float(item[metric])
650
+ except:
651
+ model_data[metric] = item[metric] # Keep as string if can't convert
652
+ else:
653
+ model_data[metric] = item[metric]
654
+ else:
655
+ model_data[metric] = "N/A"
656
+ except Exception as e:
657
+ if metric in ["dtype", "licence"]:
658
+ model_data[metric] = item.get(metric, "Unknown")
659
+ else:
660
+ model_data[metric] = item.get(metric, "N/A")
661
+
662
+ formatted_data.append(model_data)
663
+
664
+ # Create DataFrame
665
+ df = pd.DataFrame(formatted_data)
666
+
667
+ # Remove the file column if present
668
+ if 'file' in df.columns:
669
+ df = df.drop(columns=['file'])
670
+
671
+ # Try to convert metrics to float with error handling (only numeric columns)
672
+ numeric_cols = ["structured_output_score", "semantic", "name",
673
+ "document_note", "document_date", "from", "to"]
674
+ for col in numeric_cols:
675
+ if col in df.columns:
676
+ try:
677
+ # Convert column to float but keep "N/A" as is
678
+ df[col] = df[col].apply(lambda x: float(x) if isinstance(x, (int, float)) or (isinstance(x, str) and x != "N/A") else x)
679
+ except Exception as e:
680
+ pass # Keep original values if conversion fails
681
+
682
+ # Sort by structured_output_score if available
683
+ if "structured_output_score" in df.columns:
684
+ # For sorting, replace non-numeric values with NaN temporarily
685
+ sort_col = pd.to_numeric(df["structured_output_score"], errors="coerce")
686
+ # Sort with NaN at the end
687
+ df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]
688
+
689
+ decimal_places = 4 if is_detail else 2
690
+ for column in df.columns:
691
+ try:
692
+ if pd.api.types.is_float_dtype(df[column]):
693
+ df[column] = df[column].round(decimal_places)
694
+ except:
695
+ continue
696
+
697
+ # Format column names according to user request
698
+ column_mapping = {
699
+ "model": "Model",
700
+ "structured_output_score": "Structured Output Score",
701
+ "semantic": "Semantic",
702
+ "response_format": "Response Format",
703
+ "name": "Name",
704
+ "document_note": "Document Note",
705
+ "document_date": "Document Date",
706
+ "from": "From",
707
+ "to": "To",
708
+ "dtype": "Dtype",
709
+ "licence": "Licence"
710
+ }
711
+
712
+ # Rename DataFrame columns
713
+ df = df.rename(columns=column_mapping)
714
+
715
+ # Define desired column order for Structured Outputs - metadata columns at the end
716
+ desired_cols = [
717
+ "Model",
718
+ "Structured Output Score",
719
+ "Semantic",
720
+ "Response Format",
721
+ "Name",
722
+ "Document Note",
723
+ "Document Date",
724
+ "From",
725
+ "To",
726
+ "Dtype",
727
+ "Licence"
728
+ ]
729
+
730
+ # Filter out columns that don't exist in the DataFrame
731
+ final_cols = [col for col in desired_cols if col in df.columns]
732
+
733
+ # Add any remaining columns that weren't in the desired list
734
+ remaining_cols = [col for col in df.columns if col not in final_cols]
735
+ final_cols.extend(remaining_cols)
736
+
737
+ # Set the new column order
738
+ df = df[final_cols]
739
+
740
+ return df
741
+
742
  def create_benchmark_plots(benchmark_data, data_type="avg"):
743
  """
744
  Creates plots from the benchmark data
 
749
  """
750
  plots = {}
751
 
 
752
  if benchmark_data[data_type]["evalmix"]:
753
  df = create_evalmix_table(benchmark_data[data_type]["evalmix"])
754
  if not df.empty and all(col in df.columns for col in ["model_name", "lexical_metric", "semantic_metric"]):
 
757
  if "judge_metric" in df.columns:
758
  metrics.append("judge_metric")
759
 
 
760
  plot_df = pd.melt(
761
  df,
762
  id_vars=["model_name"],
 
765
  value_name="Değer"
766
  )
767
 
 
768
  plot_df["Metrik"] = plot_df["Metrik"].replace({
769
  "lexical_metric": "Lexical Metric",
770
  "semantic_metric": "Semantic Metric",
 
782
  )
783
  plots["evalmix"] = fig
784
 
 
785
  if benchmark_data[data_type]["light_eval"]:
786
  df = create_light_eval_table(benchmark_data[data_type]["light_eval"])
787
  if not df.empty:
 
788
  metric_cols = [col for col in df.columns if col not in ["model_name", "Ortalama", "file", "overall_average", "total_samples"]]
789
  if metric_cols:
790
  fig = go.Figure()
 
816
  Creates a combined leaderboard table from avg JSON data
817
  """
818
  # Define benchmark types to include in the leaderboard
819
+ benchmark_types = ["evalmix", "light_eval", "retrieval", "arena", "structured_output"] # "lm_harness" and "human_arena" removed
820
 
821
  all_models = {}
822
 
 
913
  # Human Elo Score removed from leaderboard table (still available in Human Arena tab)
914
  # Remove dtype and license from JSON - use only lookup table values
915
  pass
916
+ elif benchmark_type == "structured_output":
917
+ if "structured_output_score" in item:
918
+ # Keep higher precision for Structured Outputs to align with detail view
919
+ all_models[formatted_model_name]["Structured Outputs"] = round(item.get("structured_output_score", 0), 4)
920
+ # Remove dtype and license from JSON - use only lookup table values
921
 
922
  # Create DataFrame from the collected data
923
  if all_models:
 
951
  display_cols = [
952
  "Auto Elo Score",
953
  "Retrieval",
954
+ "Structured Outputs",
955
  "Light Eval",
956
  "Turkish Semantic",
957
  "Multilingual Semantic",
 
966
  df[col] = df[col].fillna(0)
967
 
968
  # Explicitly reorder columns to match the UI display order exactly as in the screenshot
969
+ desired_order = ["Model Name", "Auto Elo Score", "Retrieval", "Structured Outputs", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"]
970
 
971
  # Filter out columns that don't exist in the DataFrame
972
  actual_order = [col for col in desired_order if col in df.columns]
 
979
  if "Auto Elo Score" in df.columns:
980
  df = df.sort_values(by="Auto Elo Score", ascending=False)
981
 
982
+
983
+ four_decimal_columns = {"Structured Outputs"}
984
  for column in df.columns:
985
  try:
986
  if pd.api.types.is_float_dtype(df[column]):
987
+ if column in four_decimal_columns:
988
+ df[column] = df[column].round(4)
989
+ else:
990
+ df[column] = df[column].round(2)
991
  except:
992
  continue
993
 
 
1085
  cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
1086
  df = df[cols]
1087
 
 
1088
  for column in df.columns:
1089
  try:
1090
  if pd.api.types.is_float_dtype(df[column]):
 
1185
  "license": "License"
1186
  }
1187
  column_mapping.update(custom_columns)
1188
+
1189
+ elif benchmark_type == "structured_output":
1190
+ # Structured Output benchmark column mappings
1191
+ custom_columns = {
1192
+ "structured_output_score": "Structured Output Score",
1193
+ "semantic": "Semantic",
1194
+ "response_format": "Response Format",
1195
+ "name": "Name",
1196
+ "document_note": "Document Note",
1197
+ "document_date": "Document Date",
1198
+ "from": "From",
1199
+ "to": "To",
1200
+ "dtype": "Dtype",
1201
+ "license": "License"
1202
+ }
1203
+ column_mapping.update(custom_columns)
1204
 
1205
 
1206
 
 
1311
 
1312
  # elif benchmark_type == "lm_harness" and "Overall" in df.columns:
1313
  # df = df.sort_values(by="Overall", ascending=False)
1314
+ elif benchmark_type == "structured_output":
1315
+ # Sort by Structured Output Score if available
1316
+ if "Structured Output Score" in df.columns:
1317
+ df = df.sort_values(by="Structured Output Score", ascending=False)
1318
+
1319
+ # Define desired column order for Structured Output - metadata columns at the end
1320
+ desired_cols = [
1321
+ "Model Name",
1322
+ "Structured Output Score",
1323
+ "Semantic",
1324
+ "Response Format",
1325
+ "Name",
1326
+ "Document Note",
1327
+ "Document Date",
1328
+ "From",
1329
+ "To",
1330
+ "Dtype",
1331
+ "License"
1332
+ ]
1333
+
1334
+ # Filter out columns that don't exist in the DataFrame
1335
+ final_cols = [col for col in desired_cols if col in df.columns]
1336
+
1337
+ # Add any remaining columns that weren't in the desired list
1338
+ remaining_cols = [col for col in df.columns if col not in final_cols]
1339
+ final_cols.extend(remaining_cols)
1340
+
1341
+ # Set the new column order
1342
+ df = df[final_cols]
1343
+
1344
  elif benchmark_type == "light_eval" and "Overall" in df.columns:
1345
  df = df.sort_values(by="Overall", ascending=False)
1346
  elif benchmark_type == "snake":
 
1430
  target_dict[new_key] = str(value)
1431
  else:
1432
  # Add other values directly
 
1433
  if isinstance(value, float):
1434
  target_dict[new_key] = round(value, 2)
1435
  else: