lmgame_bench

Running

App Files Community

Yuxuan-Zhang-Dexter committed on Jun 24, 2025

Commit

6d4c755

1 Parent(s): 865dbef

update leaderboard with new agentic leaderboard layout

Browse files

Files changed (6) hide show

app.py +162 -98
assets/model_color.json +27 -27
data_visualization.py +340 -20
generate_normalized_cache.py +57 -0
leaderboard_utils.py +81 -18
rank_data_03_25_2025.json +94 -94

app.py CHANGED Viewed

@@ -38,11 +38,11 @@ TIME_POINTS = {
 }
 # Load the initial JSON file with rank data
-with open(TIME_POINTS["03/25/2025"], "r") as f:
     rank_data = json.load(f)
 # Load the model leaderboard data
-with open("rank_single_model_03_25_2025.json", "r") as f:
     model_rank_data = json.load(f)
 # Add leaderboard state at the top level
@@ -72,17 +72,17 @@ leaderboard_state = {
 # Load video links and news data
-with open('assets/game_video_link.json', 'r') as f:
     VIDEO_LINKS = json.load(f)
-with open('assets/news.json', 'r') as f:
     NEWS_DATA = json.load(f)
 def load_rank_data(time_point):
     """Load rank data for a specific time point"""
     if time_point in TIME_POINTS:
         try:
-            with open(TIME_POINTS[time_point], "r") as f:
                 return json.load(f)
         except FileNotFoundError:
             return None
@@ -105,7 +105,7 @@ def prepare_dataframe_for_display(df, for_game=None):
     # Replace '_' with '-' for better display
     for col in display_df.columns:
-        if col.endswith(' Score'):
             display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
     # If we're in detailed view, sort by score
@@ -120,36 +120,47 @@ def prepare_dataframe_for_display(df, for_game=None):
             # Filter out models that didn't participate
             display_df = display_df[~display_df[score_col].isna()]
     else:
-        # For overall view, sort by average of game scores (implicitly used for ranking)
-        # but we won't add an explicit 'Rank' or 'Average Rank' column to the final display_df
-        # Calculate an internal sorting key based on average scores, but don't add it to the display_df
-        score_cols = [col for col in display_df.columns if col.endswith(' Score')]
-        if score_cols:
-            temp_sort_df = display_df.copy()
-            for col in score_cols:
-                temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
-            # Calculate average of the game scores (use mean of ranks from utils for actual ranking logic if different)
-            # For display sorting, let's use a simple average of available scores.
-            # The actual ranking for 'Average Rank' in leaderboard_utils uses mean of ranks, which is more robust.
-            # Here we just need a consistent sort order.
-            # Create a temporary column for sorting
-            temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
-            # Sort by this temporary average score (higher is better for scores)
-            # and then by Player name as a tie-breaker
-            display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
     # Add line breaks to column headers
     new_columns = {}
     for col in display_df.columns:
-        if col.endswith(' Score'):
             # Replace 'Game Name Score' with 'Game Name\nScore'
             game_name = col.replace(' Score', '')
             new_col = f"{game_name}\nScore"
             new_columns[col] = new_col
     # Rename columns with new line breaks
     if new_columns:
@@ -164,8 +175,14 @@ def update_df_with_height(df):
     col_widths = ["40px"]  # Row number column width
     col_widths.append("230px")  # Player column - reduced by 20px
     col_widths.append("120px")  # Organization column
     # Add game score columns
-    for _ in range(len(df.columns) - 2):
         col_widths.append("120px")
     return gr.update(value=df,
@@ -184,7 +201,7 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
                        # tetris_overall, tetris_details, # Commented out
                        tetris_plan_overall, tetris_plan_details,
                        ace_attorney_overall, ace_attorney_details,
-                       top_n=10,
                        data_source=None):
     global leaderboard_state
@@ -304,21 +321,22 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
     # Get the appropriate DataFrame and charts based on current state
     if leaderboard_state["current_game"]:
-        # For detailed view
         # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
         #     df = get_mario_leaderboard(data)
         if leaderboard_state["current_game"] == "Super Mario Bros":
-            df = get_mario_planning_leaderboard(data)
         elif leaderboard_state["current_game"] == "Sokoban":
-            df = get_sokoban_leaderboard(data)
         elif leaderboard_state["current_game"] == "2048":
-            df = get_2048_leaderboard(data)
         elif leaderboard_state["current_game"] == "Candy Crush":
-            df = get_candy_leaderboard(data)
         elif leaderboard_state["current_game"] == "Tetris":
-            df = get_tetris_planning_leaderboard(data)
         elif leaderboard_state["current_game"] == "Ace Attorney":
-            df = get_ace_attorney_leaderboard(data)
         else: # Should not happen if current_game is one of the known games
             df = pd.DataFrame() # Empty df
@@ -327,10 +345,12 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
         radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
         group_bar_chart = chart
     else:
-        # For overall view
-        df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
         display_df = prepare_dataframe_for_display(df)
-        _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
         chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
     # Return values, including all four plot placeholders
@@ -405,7 +425,7 @@ def get_initial_state():
         }
     }
-def clear_filters(top_n=10, data_source=None):
     global leaderboard_state
     # Use provided data source or default to rank_data
@@ -420,9 +440,12 @@ def clear_filters(top_n=10, data_source=None):
         "Ace Attorney": True
     }
-    df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
     display_df = prepare_dataframe_for_display(df)
-    _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
     leaderboard_state = get_initial_state()
@@ -675,9 +698,18 @@ def build_app():
             max-width: 140px !important;
         }
-        /* Game score columns */
-        .table-container th:nth-child(n+4),
-        .table-container td:nth-child(n+4) {
             width: 120px !important;
             min-width: 100px !important;
             max-width: 140px !important;
@@ -743,6 +775,27 @@ def build_app():
             width: 100% !important;
             margin-top: 40px !important;
         }
     """) as demo:
         gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
@@ -875,6 +928,14 @@ def build_app():
         with gr.Tabs():
             with gr.Tab("🏆 Agent Leaderboard"):
                 # Visualization section
                 with gr.Row():
                     gr.Markdown("### 📊 Data Visualization")
@@ -884,6 +945,19 @@ def build_app():
                     visible=False,
                     elem_classes="visualization-container"
                 )
                 with gr.Column(visible=True) as overall_visualizations:
                     with gr.Tabs():
@@ -894,45 +968,32 @@ def build_app():
                                 elem_classes="visualization-container"
                             )
                             gr.Markdown(
-                                    "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
                                     elem_classes="radar-tip"
                                 )
-                        # Comment out the Group Bar Chart tab
                         with gr.Tab("📊 Group Bar Chart"):
-                            with gr.Row():
-                                # Calculate dynamic maximum based on total models
-                                agent_max_models = get_total_model_count(rank_data)
-                                top_n_slider = gr.Slider(
-                                    minimum=1,
-                                    maximum=agent_max_models,
-                                    step=1,
-                                    value=min(10, agent_max_models),
-                                    label=f"Number of Top Models to Display (max: {agent_max_models})",
-                                    elem_classes="top-n-slider"
-                                )
                             group_bar_visualization = gr.Plot(
                                 label="Comparative Analysis (Group Bar Chart)",
                                 elem_classes="visualization-container"
                             )
                             gr.Markdown(
-                                    "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
                                     elem_classes="radar-tip"
                                 )
                 # Hidden placeholder for group bar visualization (to maintain code references)
                 # group_bar_visualization = gr.Plot(visible=False)
                 # Game selection section
                 with gr.Row():
-                    gr.Markdown("### 🎮 Game Selection")
                 with gr.Row():
                     # with gr.Column(): # Commented out Super Mario BrosUI
                     #     gr.Markdown("**🎮 Super Mario Bros**")
                     #     mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
                     #     mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
                     with gr.Column(): # Added Super Mario BrosUI
-                        gr.Markdown("**🎮 Super Mario Bros**")
                         mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
                         mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
                     with gr.Column(): # Sokoban is now after mario_plan
@@ -972,12 +1033,16 @@ def build_app():
                 # Leaderboard table
                 with gr.Row():
                     gr.Markdown("### 📋 Detailed Results")
                 # Add reference to Jupyter notebook
                 with gr.Row():
                     gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
-                # Get initial leaderboard dataframe
                 initial_df = get_combined_leaderboard(rank_data, {
                     # "Super Mario Bros": True, # Commented out
                     "Super Mario Bros": True,
@@ -987,7 +1052,7 @@ def build_app():
                     # "Tetris(complete)": True, # Commented out
                     "Tetris": True,
                     "Ace Attorney": True
-                })
                 # Format the DataFrame for display
                 initial_display_df = prepare_dataframe_for_display(initial_df)
@@ -996,8 +1061,14 @@ def build_app():
                 col_widths = ["40px"]  # Row number column width
                 col_widths.append("230px")  # Player column - reduced by 20px
                 col_widths.append("120px")  # Organization column
                 # Add game score columns
-                for _ in range(len(initial_display_df.columns) - 2):
                     col_widths.append("120px")
                 # Create a standard DataFrame component with enhanced styling
@@ -1062,8 +1133,8 @@ def build_app():
                 # Update leaderboard and visualizations when checkboxes change
                 for checkbox in checkbox_list:
                     checkbox.change(
-                        lambda *args: update_leaderboard(*args, data_source=rank_data),
-                        inputs=checkbox_list + [top_n_slider],
                         outputs=[
                             leaderboard_df,
                             detailed_visualization,
@@ -1072,22 +1143,10 @@ def build_app():
                         ] + checkbox_list
                     )
-                # Update when top_n_slider changes
-                top_n_slider.change(
-                    lambda *args: update_leaderboard(*args, data_source=rank_data),
-                    inputs=checkbox_list + [top_n_slider],
-                    outputs=[
-                        leaderboard_df,
-                        detailed_visualization,
-                        radar_visualization,
-                        group_bar_visualization
-                    ] + checkbox_list
-                )
                 # Update when clear button is clicked
                 clear_btn.click(
-                    lambda *args: clear_filters(*args, data_source=rank_data),
-                    inputs=[top_n_slider],
                     outputs=[
                         leaderboard_df,
                         detailed_visualization,
@@ -1096,7 +1155,7 @@ def build_app():
                     ] + checkbox_list
                 )
-                # Initialize the app
                 demo.load(
                     lambda: clear_filters(data_source=rank_data),
                     inputs=[],
@@ -1119,6 +1178,20 @@ def build_app():
                     visible=False,
                     elem_classes="visualization-container"
                 )
                 with gr.Column(visible=True) as model_overall_visualizations:
                     with gr.Tabs():
@@ -1132,17 +1205,6 @@ def build_app():
                                     elem_classes="radar-tip"
                                 )
                         with gr.Tab("📊 Group Bar Chart"):
-                            with gr.Row():
-                                # Calculate dynamic maximum based on total models
-                                model_max_models = get_total_model_count(model_rank_data)
-                                model_top_n_slider = gr.Slider(
-                                    minimum=1,
-                                    maximum=model_max_models,
-                                    step=1,
-                                    value=min(10, model_max_models),
-                                    label=f"Number of Top Models to Display (max: {model_max_models})",
-                                    elem_classes="top-n-slider"
-                                )
                             model_group_bar_visualization = gr.Plot(
                                 label="Comparative Analysis (Group Bar Chart)",
                                 elem_classes="visualization-container"
@@ -1154,10 +1216,10 @@ def build_app():
                 # Game selection section
                 with gr.Row():
-                    gr.Markdown("### 🎮 Game Selection")
                 with gr.Row():
                     with gr.Column():
-                        gr.Markdown("**🎮 Super Mario Bros**")
                         model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
                         model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
                     with gr.Column():
@@ -1193,8 +1255,10 @@ def build_app():
                 # Leaderboard table
                 with gr.Row():
                     gr.Markdown("### 📋 Detailed Results")
-                # Get initial leaderboard dataframe
                 model_initial_df = get_combined_leaderboard(model_rank_data, {
                     "Super Mario Bros": True,
                     "Sokoban": True,
@@ -1202,7 +1266,7 @@ def build_app():
                     "Candy Crush": True,
                     "Tetris": True,
                     "Ace Attorney": True
-                })
                 # Format the DataFrame for display
                 model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
@@ -1300,7 +1364,7 @@ def build_app():
                     ] + model_checkbox_list
                 )
-                # Initialize the model leaderboard
                 demo.load(
                     lambda: clear_filters(data_source=model_rank_data),
                     inputs=[],

 }
 # Load the initial JSON file with rank data
+with open(TIME_POINTS["03/25/2025"], "r", encoding='utf-8') as f:
     rank_data = json.load(f)
 # Load the model leaderboard data
+with open("rank_single_model_03_25_2025.json", "r", encoding='utf-8') as f:
     model_rank_data = json.load(f)
 # Add leaderboard state at the top level
 # Load video links and news data
+with open('assets/game_video_link.json', 'r', encoding='utf-8') as f:
     VIDEO_LINKS = json.load(f)
+with open('assets/news.json', 'r', encoding='utf-8') as f:
     NEWS_DATA = json.load(f)
 def load_rank_data(time_point):
     """Load rank data for a specific time point"""
     if time_point in TIME_POINTS:
         try:
+            with open(TIME_POINTS[time_point], "r", encoding='utf-8') as f:
                 return json.load(f)
         except FileNotFoundError:
             return None
     # Replace '_' with '-' for better display
     for col in display_df.columns:
+        if col.endswith(' Score') and col != 'Avg Normalized Score':
             display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
     # If we're in detailed view, sort by score
             # Filter out models that didn't participate
             display_df = display_df[~display_df[score_col].isna()]
     else:
+        # For overall view, sort by average normalized score if available, otherwise fallback to average scores
+        if 'Avg Normalized Score' in display_df.columns:
+            # Sort by average normalized score (already calculated in leaderboard_utils)
+            display_df = display_df.sort_values(by='Avg Normalized Score', ascending=False)
+        else:
+            # Calculate an internal sorting key based on average scores, but don't add it to the display_df
+            score_cols = [col for col in display_df.columns if col.endswith(' Score')]
+            if score_cols:
+                temp_sort_df = display_df.copy()
+                for col in score_cols:
+                    temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
+                # Create a temporary column for sorting
+                temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
+                # Sort by this temporary average score (higher is better for scores)
+                # and then by Player name as a tie-breaker
+                display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
+    # Add medal emojis for top 3 performers
+    if len(display_df) > 0 and 'Player' in display_df.columns:
+        # Reset index to get proper ranking after sorting
+        display_df = display_df.reset_index(drop=True)
+        # Add medal emojis to Player names for top 3
+        medal_emojis = ['🥇', '🥈', '🥉']
+        for i in range(min(3, len(display_df))):
+            original_name = display_df.loc[i, 'Player']
+            display_df.loc[i, 'Player'] = f"{medal_emojis[i]} {original_name}"
     # Add line breaks to column headers
     new_columns = {}
     for col in display_df.columns:
+        if col.endswith(' Score') and col != 'Avg Normalized Score':
             # Replace 'Game Name Score' with 'Game Name\nScore'
             game_name = col.replace(' Score', '')
             new_col = f"{game_name}\nScore"
             new_columns[col] = new_col
+        elif col == 'Avg Normalized Score':
+            # Add line break to Avg Normalized Score column
+            new_columns[col] = "Avg Normalized\nScore"
     # Rename columns with new line breaks
     if new_columns:
     col_widths = ["40px"]  # Row number column width
     col_widths.append("230px")  # Player column - reduced by 20px
     col_widths.append("120px")  # Organization column
+    # Check if there's an Avg Normalized Score column
+    if any('Avg Normalized' in col for col in df.columns):
+        col_widths.append("140px")  # Avg Normalized Score column - slightly wider
     # Add game score columns
+    remaining_cols = len(df.columns) - len(col_widths) + 1  # +1 because we subtracted row number column
+    for _ in range(remaining_cols):
         col_widths.append("120px")
     return gr.update(value=df,
                        # tetris_overall, tetris_details, # Commented out
                        tetris_plan_overall, tetris_plan_details,
                        ace_attorney_overall, ace_attorney_details,
+                       top_n=3,
                        data_source=None):
     global leaderboard_state
     # Get the appropriate DataFrame and charts based on current state
     if leaderboard_state["current_game"]:
+        # For detailed view - use slider value for both leaderboards
+        limit = top_n
         # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
         #     df = get_mario_leaderboard(data)
         if leaderboard_state["current_game"] == "Super Mario Bros":
+            df = get_mario_planning_leaderboard(data, limit)
         elif leaderboard_state["current_game"] == "Sokoban":
+            df = get_sokoban_leaderboard(data, limit)
         elif leaderboard_state["current_game"] == "2048":
+            df = get_2048_leaderboard(data, limit)
         elif leaderboard_state["current_game"] == "Candy Crush":
+            df = get_candy_leaderboard(data, limit)
         elif leaderboard_state["current_game"] == "Tetris":
+            df = get_tetris_planning_leaderboard(data, limit)
         elif leaderboard_state["current_game"] == "Ace Attorney":
+            df = get_ace_attorney_leaderboard(data, limit)
         else: # Should not happen if current_game is one of the known games
             df = pd.DataFrame() # Empty df
         radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
         group_bar_chart = chart
     else:
+        # For overall view - use slider value for both leaderboards
+        limit = top_n
+        df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
         display_df = prepare_dataframe_for_display(df)
+        # Pass appropriate title and top_n based on data source
+        _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
         chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
     # Return values, including all four plot placeholders
         }
     }
+def clear_filters(top_n=3, data_source=None):
     global leaderboard_state
     # Use provided data source or default to rank_data
         "Ace Attorney": True
     }
+    # Use slider value for both leaderboards
+    limit = top_n
+    df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
     display_df = prepare_dataframe_for_display(df)
+    # Pass top_n parameter for consistent titles
+    _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
     leaderboard_state = get_initial_state()
             max-width: 140px !important;
         }
+        /* Avg Normalized Score column (4th column) */
+        .table-container th:nth-child(4),
+        .table-container td:nth-child(4) {
+            width: 140px !important;
+            min-width: 120px !important;
+            max-width: 160px !important;
+            text-align: center !important;
+        }
+        /* Game score columns (5th column onwards) */
+        .table-container th:nth-child(n+5),
+        .table-container td:nth-child(n+5) {
             width: 120px !important;
             min-width: 100px !important;
             max-width: 140px !important;
             width: 100% !important;
             margin-top: 40px !important;
         }
+        .welcome-message {
+            background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
+            color: #333;
+            padding: 20px;
+            border-radius: 10px;
+            margin: 20px 0;
+            text-align: center;
+            box-shadow: 0 4px 15px rgba(0,0,0,0.05);
+        }
+        .welcome-message h3 {
+            margin: 0 0 10px 0;
+            font-size: 1.3em;
+        }
+        .welcome-message p {
+            margin: 0;
+            font-size: 1.1em;
+            line-height: 1.5;
+        }
     """) as demo:
         gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
         with gr.Tabs():
             with gr.Tab("🏆 Agent Leaderboard"):
                 # Visualization section
+                with gr.Row():
+                    gr.Markdown("""
+                    **🎮 Welcome to LMGame Bench!**
+                    We welcome everyone to implement their own gaming agents by replacing our baseAgent in `customer_runner.py` and test them on our benchmark. Join the competition and see how your agent performs!
+                    """, elem_classes="welcome-message")
                 with gr.Row():
                     gr.Markdown("### 📊 Data Visualization")
                     visible=False,
                     elem_classes="visualization-container"
                 )
+                # with gr.Row():
+                #     # Calculate dynamic maximum based on total models
+                #     agent_max_models = get_total_model_count(rank_data)
+                #     top_n_slider = gr.Slider(
+                #         minimum=1,
+                #         maximum=agent_max_models,
+                #         step=1,
+                #         value=min(3, agent_max_models),
+                #         label=f"Number of Top Models to Display in All Views (max: {agent_max_models})",
+                #         elem_classes="top-n-slider"
+                #     )
                 with gr.Column(visible=True) as overall_visualizations:
                     with gr.Tabs():
                                 elem_classes="visualization-container"
                             )
                             gr.Markdown(
+                                    "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
                                     elem_classes="radar-tip"
                                 )
                         with gr.Tab("📊 Group Bar Chart"):
                             group_bar_visualization = gr.Plot(
                                 label="Comparative Analysis (Group Bar Chart)",
                                 elem_classes="visualization-container"
                             )
                             gr.Markdown(
+                                    "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
                                     elem_classes="radar-tip"
                                 )
                 # Hidden placeholder for group bar visualization (to maintain code references)
                 # group_bar_visualization = gr.Plot(visible=False)
                 # Game selection section
                 with gr.Row():
+                    gr.Markdown("### 🕹️ Game Selection")
                 with gr.Row():
                     # with gr.Column(): # Commented out Super Mario BrosUI
                     #     gr.Markdown("**🎮 Super Mario Bros**")
                     #     mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
                     #     mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
                     with gr.Column(): # Added Super Mario BrosUI
+                        gr.Markdown("**🍄 Super Mario Bros**")
                         mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
                         mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
                     with gr.Column(): # Sokoban is now after mario_plan
                 # Leaderboard table
                 with gr.Row():
                     gr.Markdown("### 📋 Detailed Results")
+                with gr.Row():
+                    gr.Markdown("*⚔️ - Model with our gaming agent*", elem_classes="radar-tip")
+                # Welcome message for custom gaming agents
                 # Add reference to Jupyter notebook
                 with gr.Row():
                     gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
+                # Get initial leaderboard dataframe (limited by default slider value for agent leaderboard)
                 initial_df = get_combined_leaderboard(rank_data, {
                     # "Super Mario Bros": True, # Commented out
                     "Super Mario Bros": True,
                     # "Tetris(complete)": True, # Commented out
                     "Tetris": True,
                     "Ace Attorney": True
+                }, limit_to_top_n=min(3, get_total_model_count(rank_data)))
                 # Format the DataFrame for display
                 initial_display_df = prepare_dataframe_for_display(initial_df)
                 col_widths = ["40px"]  # Row number column width
                 col_widths.append("230px")  # Player column - reduced by 20px
                 col_widths.append("120px")  # Organization column
+                # Check if there's an Avg Normalized Score column
+                if any('Avg Normalized' in col for col in initial_display_df.columns):
+                    col_widths.append("140px")  # Avg Normalized Score column - slightly wider
                 # Add game score columns
+                remaining_cols = len(initial_display_df.columns) - len(col_widths) + 1  # +1 because we subtracted row number column
+                for _ in range(remaining_cols):
                     col_widths.append("120px")
                 # Create a standard DataFrame component with enhanced styling
                 # Update leaderboard and visualizations when checkboxes change
                 for checkbox in checkbox_list:
                     checkbox.change(
+                        lambda *args: update_leaderboard(*args, top_n=3, data_source=rank_data),
+                        inputs=checkbox_list,
                         outputs=[
                             leaderboard_df,
                             detailed_visualization,
                         ] + checkbox_list
                     )
                 # Update when clear button is clicked
                 clear_btn.click(
+                    lambda: clear_filters(top_n=3, data_source=rank_data),
+                    inputs=[],
                     outputs=[
                         leaderboard_df,
                         detailed_visualization,
                     ] + checkbox_list
                 )
+                # Initialize the agent leaderboard (clear_filters defaults to the top 3 models)
                 demo.load(
                     lambda: clear_filters(data_source=rank_data),
                     inputs=[],
                     visible=False,
                     elem_classes="visualization-container"
                 )
+                with gr.Row():
+                    # Calculate dynamic maximum based on total models
+                    model_max_models = get_total_model_count(model_rank_data)
+                    model_top_n_slider = gr.Slider(
+                        minimum=1,
+                        maximum=model_max_models,
+                        step=1,
+                        value=min(5, model_max_models),
+                        label=f"Number of Top Models to Display in All Views (max: {model_max_models})",
+                        elem_classes="top-n-slider"
+                    )
                 with gr.Column(visible=True) as model_overall_visualizations:
                     with gr.Tabs():
                                     elem_classes="radar-tip"
                                 )
                         with gr.Tab("📊 Group Bar Chart"):
                             model_group_bar_visualization = gr.Plot(
                                 label="Comparative Analysis (Group Bar Chart)",
                                 elem_classes="visualization-container"
                 # Game selection section
                 with gr.Row():
+                    gr.Markdown("### 🕹️ Game Selection")
                 with gr.Row():
                     with gr.Column():
+                        gr.Markdown("**🍄 Super Mario Bros**")
                         model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
                         model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
                     with gr.Column():
                 # Leaderboard table
                 with gr.Row():
                     gr.Markdown("### 📋 Detailed Results")
+                with gr.Row():
+                    gr.Markdown("*💡 The slider above controls how many top models are shown in the radar chart, bar chart, and data table.*", elem_classes="radar-tip")
+                # Get initial leaderboard dataframe (limited by default slider value for model leaderboard)
                 model_initial_df = get_combined_leaderboard(model_rank_data, {
                     "Super Mario Bros": True,
                     "Sokoban": True,
                     "Candy Crush": True,
                     "Tetris": True,
                     "Ace Attorney": True
+                }, limit_to_top_n=min(5, get_total_model_count(model_rank_data)))
                 # Format the DataFrame for display
                 model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
                     ] + model_checkbox_list
                 )
+                # Initialize the model leaderboard (with default slider limit)
                 demo.load(
                     lambda: clear_filters(data_source=model_rank_data),
                     inputs=[],

assets/model_color.json CHANGED Viewed

@@ -27,31 +27,31 @@
     "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
     "qwen3-235B-A22B-fp8": "#6A1B9A",
     "random (x30)": "#9E9E9E",
-    "gamingagent + claude-3-7-sonnet-20250219": "#4A90E2",
-    "gamingagent + claude-3-5-haiku-20241022": "#7FB5E6",
-    "gamingagent + claude-3-5-sonnet-20241022": "#1A4C7C",
-    "gamingagent + claude-opus-4-20250514": "#3A80D2",
-    "gamingagent + claude-sonnet-4-20250514": "#5A9FE2",
-    "gamingagent + gemini-2.0-flash": "#FF4081",
-    "gamingagent + gemini-2.0-flash-thinking-exp-1219": "#C2185B",
-    "gamingagent + gemini-2.5-pro-exp-03-25": "#FF80AB",
-    "gamingagent + gemini-2.5-flash-preview-04-17": "#F06292",
-    "gamingagent + gemini-2.5-flash-preview-05-20": "#F8BBD9",
-    "gamingagent + gemini-2.5-pro-preview-05-06": "#AD1457",
-    "gamingagent + gemini-2.5-pro-preview-06-05": "#EC407A",
-    "gamingagent + gpt-4o-2024-11-20": "#00BFA5",
-    "gamingagent + gpt-4.5-preview-2025-02-27": "#00796B",
-    "gamingagent + gpt-4.1-2025-04-14": "#00897B",
-    "gamingagent + o1-2024-12-17": "#4DB6AC",
-    "gamingagent + o1-mini-2024-09-12": "#26A69A",
-    "gamingagent + o3-mini-2025-01-31(medium)": "#80CBC4",
-    "gamingagent + o3-2025-04-16": "#26C6DA",
-    "gamingagent + o4-mini-2025-04-16": "#00ACC1",
-    "gamingagent + grok-3-beta": "#FF7043",
-    "gamingagent + grok-3-mini-beta": "#FF8A65",
-    "gamingagent + deepseek-v3": "#FFC107",
-    "gamingagent + deepseek-r1-0120": "#FFA000",
-    "gamingagent + deepseek-r1-0528": "#FFB300",
-    "gamingagent + llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
-    "gamingagent + qwen3-235B-A22B-fp8": "#6A1B9A"
 }

     "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
     "qwen3-235B-A22B-fp8": "#6A1B9A",
     "random (x30)": "#9E9E9E",
+    "claude-3-7-sonnet-20250219 (⚔️)": "#4A90E2",
+    "claude-3-5-haiku-20241022 (⚔️)": "#7FB5E6",
+    "claude-3-5-sonnet-20241022 (⚔️)": "#1A4C7C",
+    "claude-opus-4-20250514 (⚔️)": "#3A80D2",
+    "claude-sonnet-4-20250514 (⚔️)": "#5A9FE2",
+    "gemini-2.0-flash (⚔️)": "#FF4081",
+    "gemini-2.0-flash-thinking-exp-1219 (⚔️)": "#C2185B",
+    "gemini-2.5-pro-exp-03-25 (⚔️)": "#FF80AB",
+    "gemini-2.5-flash-preview-04-17 (⚔️)": "#F06292",
+    "gemini-2.5-flash-preview-05-20 (⚔️)": "#F8BBD9",
+    "gemini-2.5-pro-preview-05-06 (⚔️)": "#AD1457",
+    "gemini-2.5-pro-preview-06-05 (⚔️)": "#EC407A",
+    "gpt-4o-2024-11-20 (⚔️)": "#00BFA5",
+    "gpt-4.5-preview-2025-02-27 (⚔️)": "#00796B",
+    "gpt-4.1-2025-04-14 (⚔️)": "#00897B",
+    "o1-2024-12-17 (⚔️)": "#4DB6AC",
+    "o1-mini-2024-09-12 (⚔️)": "#26A69A",
+    "o3-mini-2025-01-31(medium) (⚔️)": "#80CBC4",
+    "o3-2025-04-16 (⚔️)": "#26C6DA",
+    "o4-mini-2025-04-16 (⚔️)": "#00ACC1",
+    "grok-3-beta (⚔️)": "#FF7043",
+    "grok-3-mini-beta (⚔️)": "#FF8A65",
+    "deepseek-v3 (⚔️)": "#FFC107",
+    "deepseek-r1-0120 (⚔️)": "#FFA000",
+    "deepseek-r1-0528 (⚔️)": "#FFB300",
+    "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)": "#8E24AA",
+    "qwen3-235B-A22B-fp8 (⚔️)": "#6A1B9A"
 }

data_visualization.py CHANGED Viewed

@@ -2,13 +2,15 @@ import plotly.graph_objects as go
 import numpy as np
 import pandas as pd
 import json
 from leaderboard_utils import (
     get_combined_leaderboard,
     GAME_ORDER
 )
 # Load model colors
-with open('assets/model_color.json', 'r') as f:
     MODEL_COLORS = json.load(f)
 GAME_SCORE_COLUMNS = {
@@ -126,7 +128,7 @@ def create_radar_charts(df):
     categories = [c.replace(" Score", "") for c in game_cols]
     for col in game_cols:
-        vals = df[col].replace("n/a", 0).astype(float)
         mean, std = vals.mean(), vals.std()
         df[f"norm_{col}"] = normalize_values(vals, mean, std)
@@ -179,7 +181,7 @@ def get_combined_leaderboard_with_radar(rank_data, selected_games):
     df_viz = df.copy()
     return df, create_radar_charts(df_viz)
-def create_group_bar_chart(df, top_n=10):
     game_cols = {}
     for game in GAME_ORDER:
         col = f"{game} Score"
@@ -330,8 +332,8 @@ def create_group_bar_chart(df, top_n=10):
-def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=10):
-    df = get_combined_leaderboard(rank_data, selected_games)
     # Create a copy for visualization to avoid modifying the original
     df_viz = df.copy()
     return df, create_group_bar_chart(df_viz, top_n)
@@ -344,7 +346,7 @@ def hex_to_rgba(hex_color, alpha=0.2):
     return f'rgba({r}, {g}, {b}, {alpha})'
-def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     if selected_games is None:
         selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
@@ -359,11 +361,25 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     game_cols = [f"{game} Score" for game in selected_games]
     categories = formatted_games
-    # Normalize
     for col in game_cols:
-        vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
-        mean, std = vals.mean(), vals.std()
-        df[f"norm_{col}"] = normalize_values(vals, mean, std)
     # Group players by prefix and sort alphabetically
     model_groups = {}
@@ -411,12 +427,23 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
             hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
         ))
     fig.update_layout(
         autosize=True,
         height=550,  # Reduced height for better proportion with legend
         margin=dict(l=400, r=100, t=20, b=20),
         title=dict(
-            text="AI Normalized Performance Across Games",
             x=0.5,
             xanchor='center',
             yanchor='top',
@@ -462,12 +489,20 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     return fig
-def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
-    df = get_combined_leaderboard(rank_data, selected_games)
     selected_game_names = [g for g, sel in selected_games.items() if sel]
-    # Create a copy for visualization to avoid modifying the original
     df_viz = df.copy()
-    return df, create_single_radar_chart(df_viz, selected_game_names, highlight_models)
 def create_organization_radar_chart(rank_data):
     df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
@@ -477,7 +512,7 @@ def create_organization_radar_chart(rank_data):
     avg_df = pd.DataFrame([
         {
-            **{col: df[df["Organization"] == org][col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean() for col in game_cols},
             "Organization": org
         }
         for org in orgs
@@ -533,7 +568,10 @@ def create_top_players_radar_chart(rank_data, n=5):
     for col in game_cols:
         # Replace "n/a" with 0 and handle downcasting properly
-        vals = top_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
         mean, std = vals.mean(), vals.std()
         top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
@@ -589,8 +627,15 @@ def create_player_radar_chart(rank_data, player_name):
     for col in game_cols:
         # Replace "n/a" with 0 and handle downcasting properly
-        vals = player_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
-        mean, std = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean(), df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).std()
         player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
     fig = go.Figure()
@@ -628,6 +673,281 @@ def create_player_radar_chart(rank_data, player_name):
     )
     return fig
 def save_visualization(fig, filename):
-    fig.write_image(filename)

 import numpy as np
 import pandas as pd
 import json
+import os
+from datetime import datetime
 from leaderboard_utils import (
     get_combined_leaderboard,
     GAME_ORDER
 )
 # Load model colors
+with open('assets/model_color.json', 'r', encoding='utf-8') as f:
     MODEL_COLORS = json.load(f)
 GAME_SCORE_COLUMNS = {
     categories = [c.replace(" Score", "") for c in game_cols]
     for col in game_cols:
+        vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
         mean, std = vals.mean(), vals.std()
         df[f"norm_{col}"] = normalize_values(vals, mean, std)
     df_viz = df.copy()
     return df, create_radar_charts(df_viz)
+def create_group_bar_chart(df, top_n=5):
     game_cols = {}
     for game in GAME_ORDER:
         col = f"{game} Score"
def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=5, limit_to_top_n=None):
    """
    Build the combined leaderboard together with its grouped bar chart
    Args:
        rank_data (dict): Raw rank data, forwarded to get_combined_leaderboard
        selected_games (dict): Game selection, forwarded to get_combined_leaderboard
        top_n (int): Forwarded to create_group_bar_chart
        limit_to_top_n (int, optional): Forwarded to get_combined_leaderboard
    Returns:
        tuple: (leaderboard DataFrame, figure returned by create_group_bar_chart)
    """
    leaderboard = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)
    # The chart is built from a copy so the returned table is never modified
    chart = create_group_bar_chart(leaderboard.copy(), top_n)
    return leaderboard, chart
     return f'rgba({r}, {g}, {b}, {alpha})'
+def create_single_radar_chart(df, selected_games=None, highlight_models=None, chart_title=None, top_n=None, full_df=None):
     if selected_games is None:
         selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
     game_cols = [f"{game} Score" for game in selected_games]
     categories = formatted_games
+    # Use full dataset for normalization to keep consistent scale
+    # If full_df is not provided, use the current df (fallback for backward compatibility)
+    normalization_df = full_df if full_df is not None else df
+    # Normalize using the full dataset but apply to the limited df
     for col in game_cols:
+        # Get normalization parameters from full dataset
+        # Use where() to avoid FutureWarning about downcasting in replace()
+        full_series = normalization_df[col].copy()
+        full_series = full_series.where(full_series != "n/a", 0)
+        full_vals = full_series.astype(float)
+        mean, std = full_vals.mean(), full_vals.std()
+        # Apply normalization to the limited df
+        # Use where() to avoid FutureWarning about downcasting in replace()
+        limited_series = df[col].copy()
+        limited_series = limited_series.where(limited_series != "n/a", 0)
+        limited_vals = limited_series.astype(float)
+        df[f"norm_{col}"] = normalize_values(limited_vals, mean, std)
     # Group players by prefix and sort alphabetically
     model_groups = {}
             hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
         ))
+    # Dynamic title based on the data source and top_n
+    if chart_title is None:
+        if top_n is not None:
+            chart_title = f"Radar Chart - Top {top_n} Performers by Game"
+        else:
+            # Fallback title
+            if len(df) <= 10:
+                chart_title = "🎮 Agent Performance Across Games"
+            else:
+                chart_title = "🤖 Model Performance Across Games"
     fig.update_layout(
         autosize=True,
         height=550,  # Reduced height for better proportion with legend
         margin=dict(l=400, r=100, t=20, b=20),
         title=dict(
+            text=chart_title,
             x=0.5,
             xanchor='center',
             yanchor='top',
     return fig
def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None, limit_to_top_n=None, chart_title=None, top_n=None):
    """
    Build the combined leaderboard together with a single radar chart
    The radar chart is normalized against the full (unlimited) leaderboard so
    its scale does not change with the number of rows displayed.
    Args:
        rank_data (dict): Raw rank data, forwarded to get_combined_leaderboard
        selected_games (dict): Dictionary of game names and their selection status
        highlight_models: Forwarded to create_single_radar_chart
        limit_to_top_n (int, optional): Row limit for the displayed leaderboard. None means no limit.
        chart_title (str, optional): Forwarded to create_single_radar_chart
        top_n (int, optional): Forwarded to create_single_radar_chart
    Returns:
        tuple: (displayed leaderboard DataFrame, radar chart figure)
    """
    # Get full dataset for normalization
    full_df = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None)
    if limit_to_top_n is None:
        # Without a limit the displayed data IS the full data; do not build it twice
        df = full_df
    else:
        # Get limited dataset for display
        df = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)
    selected_game_names = [g for g, sel in selected_games.items() if sel]
    # The chart works on copies so the returned DataFrame is never modified
    fig = create_single_radar_chart(
        df.copy(), selected_game_names, highlight_models, chart_title, top_n, full_df.copy()
    )
    return df, fig
 def create_organization_radar_chart(rank_data):
     df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
     avg_df = pd.DataFrame([
         {
+            **{col: df[df["Organization"] == org][col].where(df[df["Organization"] == org][col] != "n/a", 0).astype(float).mean() for col in game_cols},
             "Organization": org
         }
         for org in orgs
     for col in game_cols:
         # Replace "n/a" with 0 and handle downcasting properly
+        # Use where() to avoid FutureWarning about downcasting in replace()
+        series = top_df[col].copy()
+        series = series.where(series != "n/a", 0)
+        vals = series.astype(float)
         mean, std = vals.mean(), vals.std()
         top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
     for col in game_cols:
         # Replace "n/a" with 0 and handle downcasting properly
+        # Use where() to avoid FutureWarning about downcasting in replace()
+        player_series = player_df[col].copy()
+        player_series = player_series.where(player_series != "n/a", 0)
+        vals = player_series.astype(float)
+        df_series = df[col].copy()
+        df_series = df_series.where(df_series != "n/a", 0)
+        df_vals = df_series.astype(float)
+        mean, std = df_vals.mean(), df_vals.std()
         player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
     fig = go.Figure()
     )
     return fig
def save_normalized_data(df, selected_games, filename="normalized_data.json"):
    """
    Save normalized data to a JSON file for caching
    For every "<game> Score" column of df (games taken from GAME_ORDER) the
    mean/std over all rows and the raw scores are stored; "n/a" entries count
    as 0 in those statistics. Each player additionally gets a raw and a
    normalized value per game; an "n/a" or missing score is stored as raw
    "n/a" with a normalized value of 0.
    Args:
        df (pd.DataFrame): DataFrame with raw scores
        selected_games (dict): Dictionary of selected games
        filename (str): Output filename
    Returns:
        str: Path of the written file inside the "cache" directory
    """
    game_cols = [f"{game} Score" for game in GAME_ORDER if f"{game} Score" in df.columns]
    # Calculate normalization parameters and normalized values
    normalization_data = {
        "timestamp": datetime.now().isoformat(),
        "selected_games": selected_games,
        "games": {},
        "players": {}
    }
    games = normalization_data["games"]
    # Store normalization parameters per game
    for col in game_cols:
        game_name = col.replace(" Score", "")
        # Use where() to avoid FutureWarning about downcasting in replace()
        vals = df[col].where(df[col] != "n/a", 0).astype(float)
        games[game_name] = {
            "mean": vals.mean(),
            "std": vals.std(),
            "raw_scores": vals.to_dict()
        }
    # Store normalized scores per player
    for _, row in df.iterrows():
        player_data = {"organization": row.get("Organization", "unknown")}
        for col in game_cols:
            game_name = col.replace(" Score", "")
            raw_score = row[col]
            # Same missing-score rule as get_combined_leaderboard: 'n/a' or NaN -> 0
            if raw_score == "n/a" or pd.isna(raw_score):
                raw_score = "n/a"
                normalized = 0
            else:
                raw_score = float(raw_score)
                stats = games[game_name]
                normalized = normalize_values([raw_score], stats["mean"], stats["std"])[0]
            player_data[f"{game_name}_raw"] = raw_score
            player_data[f"{game_name}_normalized"] = normalized
        normalization_data["players"][row["Player"]] = player_data
    # Save to file
    os.makedirs("cache", exist_ok=True)
    filepath = os.path.join("cache", filename)
    # Explicit encoding so the cache does not depend on the platform default
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(normalization_data, f, indent=2)
    print(f"Normalized data saved to {filepath}")
    return filepath
def load_normalized_data(filename="normalized_data.json"):
    """
    Load normalized data from a JSON file
    The file is looked up inside the "cache" directory. A missing file is not
    treated as an error; an unreadable or malformed file is reported on
    stdout.
    Args:
        filename (str): Input filename
    Returns:
        dict: Normalized data or None if file doesn't exist or cannot be read
    """
    filepath = os.path.join("cache", filename)
    # Open directly instead of checking os.path.exists() first: the file can
    # disappear between the check and the open
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        print(f"Error loading normalized data: {e}")
        return None
    print(f"Normalized data loaded from {filepath}")
    return data
def get_normalized_scores_from_cache(players, games, cache_data):
    """
    Extract normalized scores from cached data
    Players that are not present in the cache are left out of the result.
    The result always has the columns "Player", "<game> Score" and
    "norm_<game> Score" (for each requested game), even when no player
    matches, so callers can merge on "Player" safely.
    Args:
        players (list): List of player names
        games (list): List of game names
        cache_data (dict): Cached normalization data
    Returns:
        pd.DataFrame: DataFrame with normalized scores
    """
    columns = ["Player"]
    for game in games:
        columns.append(f"{game} Score")
        columns.append(f"norm_{game} Score")
    # A game listed twice must not produce duplicate columns
    columns = list(dict.fromkeys(columns))

    cached_players = cache_data["players"]
    rows = []
    for player in players:
        if player not in cached_players:
            continue
        player_cache = cached_players[player]
        player_data = {"Player": player}
        for game in games:
            raw_key = f"{game}_raw"
            if raw_key in player_cache:
                player_data[f"{game} Score"] = player_cache[raw_key]
                # Tolerate a cache entry that has a raw score but no normalized one
                player_data[f"norm_{game} Score"] = player_cache.get(f"{game}_normalized", 0)
            else:
                player_data[f"{game} Score"] = "n/a"
                player_data[f"norm_{game} Score"] = 0
        rows.append(player_data)
    return pd.DataFrame(rows, columns=columns)
 def save_visualization(fig, filename):
+    fig.write_image(filename)
def generate_and_save_normalized_data(rank_data, filename="normalized_data.json"):
    """
    Normalize the scores of every game in GAME_ORDER and cache them on disk
    Args:
        rank_data (dict): Raw rank data
        filename (str): Output filename
    Returns:
        str: Path to saved file
    """
    # Every known game is marked as selected
    every_game = dict.fromkeys(GAME_ORDER, True)
    leaderboard = get_combined_leaderboard(rank_data, every_game)
    return save_normalized_data(leaderboard, every_game, filename)
def create_single_radar_chart_with_cache(df, selected_games=None, highlight_models=None, use_cache=True, cache_filename="normalized_data.json"):
    """
    Create radar chart with optional caching support
    Normalized scores are taken from the cache file when it can be loaded and
    contains at least one of the players in df; otherwise they are computed
    on the fly from df.
    Args:
        df (pd.DataFrame): Leaderboard with a "Player" column, an "Organization"
            column (used on the cached path) and "<game> Score" columns
            (used on the fallback path)
        selected_games (list, optional): Game names to plot. Defaults to
            Super Mario Bros, 2048, Candy Crush, Sokoban and Ace Attorney.
        highlight_models (optional): Container of player names drawn in red
        use_cache (bool): Whether to try the cache file first
        cache_filename (str): Cache filename passed to load_normalized_data
    Returns:
        go.Figure: The radar chart
    """
    if selected_games is None:
        selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
    # Try to load from cache first
    cached_data = load_normalized_data(cache_filename) if use_cache else None
    df_normalized = None
    if cached_data:
        # Use cached normalized data
        players = df["Player"].tolist()
        cached_df = get_normalized_scores_from_cache(players, selected_games, cached_data)
        # NOTE: players missing from the cache are not plotted on this path.
        # If the cache knows none of them, fall through to the on-the-fly path
        # instead of merging an empty frame.
        if not cached_df.empty:
            # Merge with original df to get Organization info
            df_normalized = cached_df.merge(df[["Player", "Organization"]], on="Player", how="left")
    if df_normalized is None:
        # Fall back to on-the-fly normalization
        df_normalized = df.copy()
        for game in selected_games:
            col = f"{game} Score"
            # Use where() to avoid FutureWarning about downcasting in replace()
            series = df_normalized[col]
            vals = series.where(series != "n/a", 0).astype(float)
            mean, std = vals.mean(), vals.std()
            df_normalized[f"norm_{col}"] = normalize_values(vals, mean, std)
    # Format game names
    categories = ['SMB' if game == 'Super Mario Bros' else game for game in selected_games]
    # Group players by prefix and sort alphabetically
    model_groups = {}
    for player in df_normalized["Player"]:
        model_groups.setdefault(get_model_prefix(player), []).append(player)
    # Prefixes in case-insensitive order, players sorted the same way inside each group
    grouped_players = []
    for prefix in sorted(model_groups, key=str.lower):
        grouped_players.extend(sorted(model_groups[prefix], key=str.lower))
    fig = go.Figure()
    for player in grouped_players:
        row = df_normalized[df_normalized["Player"] == player]
        if row.empty:
            continue
        row = row.iloc[0]
        is_highlighted = bool(highlight_models) and player in highlight_models
        color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
        fillcolor = 'rgba(255, 0, 0, 0.4)' if is_highlighted else hex_to_rgba(color, 0.2)
        # Both the cached and the on-the-fly path use the same norm_ columns
        r = [row[f"norm_{game} Score"] for game in selected_games]
        display_name = player.lower()
        fig.add_trace(go.Scatterpolar(
            # Repeat the first point to close the polygon
            r=r + [r[0]],
            theta=categories + [categories[0]],
            mode='lines+markers',
            fill='toself',
            name=display_name,
            line=dict(color=color, width=6 if is_highlighted else 2),
            marker=dict(color=color, size=10 if is_highlighted else 6),
            fillcolor=fillcolor,
            opacity=1.0 if is_highlighted else 0.7,
            hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
        ))
    fig.update_layout(
        autosize=True,
        height=550,
        margin=dict(l=400, r=100, t=20, b=20),
        title=dict(
            text="AI Normalized Performance Across Games",
            x=0.5,
            xanchor='center',
            yanchor='top',
            y=0.95,
            font=dict(size=20),
            pad=dict(b=20)
        ),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                tickangle=45,
                tickfont=dict(size=12),
                gridcolor='lightgray',
                gridwidth=1,
                angle=45
            ),
            angularaxis=dict(
                tickfont=dict(size=14, weight='bold'),
                tickangle=0
            )
        ),
        legend=dict(
            font=dict(size=12),
            title="Choose your model 💡 (click / double-click)",
            itemsizing='trace',
            x=-1.4,
            y=0.8,
            yanchor='top',
            xanchor='left',
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='gray',
            borderwidth=1,
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        )
    )
    return fig

generate_normalized_cache.py ADDED Viewed

	@@ -0,0 +1,57 @@

+#!/usr/bin/env python3
+"""
+Script to generate normalized data cache for faster visualization loading.
+Usage:
+    python generate_normalized_cache.py [input_file] [output_file]
+Example:
+    python generate_normalized_cache.py data/rank_data.json normalized_data.json
+"""
+import sys
+import json
+from data_visualization import generate_and_save_normalized_data, load_normalized_data
def main():
    """
    Generate the normalized data cache from a rank data JSON file
    The input and output filenames are taken from the first and second command
    line argument when given. Progress and errors are printed to stdout.
    Returns:
        int: 0 when the cache was written and could be loaded back, 1 otherwise
    """
    # Default files
    input_file = "data/rank_data.json"  # Update this path as needed
    output_file = "normalized_data.json"
    # Handle command line arguments
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    try:
        # Load rank data
        print(f"Loading rank data from {input_file}...")
        with open(input_file, 'r', encoding='utf-8') as f:
            rank_data = json.load(f)
        # Generate and save normalized data
        print("Generating normalized data...")
        saved_path = generate_and_save_normalized_data(rank_data, output_file)
        # Verify the saved data
        print("Verifying saved data...")
        cached_data = load_normalized_data(output_file)
        if cached_data:
            print("✅ Successfully generated normalized data cache!")
            print(f"📁 Saved to: {saved_path}")
            print(f"🎮 Games included: {list(cached_data['games'].keys())}")
            print(f"👥 Players included: {len(cached_data['players'])}")
            print(f"📅 Generated at: {cached_data['timestamp']}")
            return 0
        print("❌ Failed to verify cached data")
    except FileNotFoundError:
        print(f"❌ Error: Could not find input file '{input_file}'")
        print("Please check the file path and try again.")
    except Exception as e:
        # Top-level boundary of the script: report any failure instead of a traceback
        print(f"❌ Error: {str(e)}")
    # Every path that reaches this point failed; signal it to the calling shell
    return 1


if __name__ == "__main__":
    sys.exit(main())

leaderboard_utils.py CHANGED Viewed

@@ -32,7 +32,7 @@ def get_organization(model_name):
         return "unknown"
-def get_sokoban_leaderboard(rank_data):
     data = rank_data.get("Sokoban", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
@@ -53,9 +53,12 @@ def get_sokoban_leaderboard(rank_data):
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
     return df
-def get_2048_leaderboard(rank_data):
     data = rank_data.get("2048", {}).get("results", [])
     # --- Diagnostic Print Removed ---
     # if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
@@ -108,9 +111,12 @@ def get_2048_leaderboard(rank_data):
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
     return df
-def get_candy_leaderboard(rank_data):
     data = rank_data.get("Candy Crush", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
@@ -127,9 +133,12 @@ def get_candy_leaderboard(rank_data):
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
     return df
-def get_tetris_planning_leaderboard(rank_data):
     data = rank_data.get("Tetris", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
@@ -147,9 +156,12 @@ def get_tetris_planning_leaderboard(rank_data):
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
     return df
-def get_ace_attorney_leaderboard(rank_data):
     data = rank_data.get("Ace Attorney", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
@@ -168,9 +180,12 @@ def get_ace_attorney_leaderboard(rank_data):
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)  # Higher score is better
     return df
-def get_mario_planning_leaderboard(rank_data):
     data = rank_data.get("Super Mario Bros", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
@@ -188,6 +203,9 @@ def get_mario_planning_leaderboard(rank_data):
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
     return df
 def calculate_rank_and_completeness(rank_data, selected_games):
@@ -285,13 +303,14 @@ def calculate_rank_and_completeness(rank_data, selected_games):
     return df_results
-def get_combined_leaderboard(rank_data, selected_games):
     """
     Get combined leaderboard for selected games
     Args:
         rank_data (dict): Dictionary containing rank data
         selected_games (dict): Dictionary of game names and their selection status
     Returns:
         pd.DataFrame: Combined leaderboard DataFrame
@@ -358,20 +377,64 @@ def get_combined_leaderboard(rank_data, selected_games):
     # Create DataFrame
     df_results = pd.DataFrame(results)
-    # Sort by total score across all games
     if not df_results.empty:
-        # Calculate total score for each player
-        df_results["Total Score"] = 0
         for game in GAME_ORDER:
-            if f"{game} Score" in df_results.columns:
-                df_results["Total Score"] += df_results[f"{game} Score"].apply(
-                    lambda x: float(x) if x != 'n/a' else 0
-                )
-        # Sort by total score in descending order
-        df_results = df_results.sort_values("Total Score", ascending=False)
-        # Drop the temporary total score column
-        df_results = df_results.drop("Total Score", axis=1)
     return df_results

         return "unknown"
+def get_sokoban_leaderboard(rank_data, limit_to_top_n=None):
     data = rank_data.get("Sokoban", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
+        # Apply limit if specified
+        if limit_to_top_n is not None:
+            df = df.head(limit_to_top_n)
     return df
+def get_2048_leaderboard(rank_data, limit_to_top_n=None):
     data = rank_data.get("2048", {}).get("results", [])
     # --- Diagnostic Print Removed ---
     # if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
+        # Apply limit if specified
+        if limit_to_top_n is not None:
+            df = df.head(limit_to_top_n)
     return df
+def get_candy_leaderboard(rank_data, limit_to_top_n=None):
     data = rank_data.get("Candy Crush", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
+        # Apply limit if specified
+        if limit_to_top_n is not None:
+            df = df.head(limit_to_top_n)
     return df
+def get_tetris_planning_leaderboard(rank_data, limit_to_top_n=None):
     data = rank_data.get("Tetris", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
+        # Apply limit if specified
+        if limit_to_top_n is not None:
+            df = df.head(limit_to_top_n)
     return df
+def get_ace_attorney_leaderboard(rank_data, limit_to_top_n=None):
     data = rank_data.get("Ace Attorney", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)  # Higher score is better
+        # Apply limit if specified
+        if limit_to_top_n is not None:
+            df = df.head(limit_to_top_n)
     return df
+def get_mario_planning_leaderboard(rank_data, limit_to_top_n=None):
     data = rank_data.get("Super Mario Bros", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
+        # Apply limit if specified
+        if limit_to_top_n is not None:
+            df = df.head(limit_to_top_n)
     return df
 def calculate_rank_and_completeness(rank_data, selected_games):
     return df_results
+def get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None):
     """
     Get combined leaderboard for selected games
     Args:
         rank_data (dict): Dictionary containing rank data
         selected_games (dict): Dictionary of game names and their selection status
+        limit_to_top_n (int, optional): Limit results to top N entries. None means no limit.
     Returns:
         pd.DataFrame: Combined leaderboard DataFrame
     # Create DataFrame
     df_results = pd.DataFrame(results)
+    # Calculate normalized scores and average normalized score
     if not df_results.empty:
+        # Import the normalize_values function from data_visualization
+        from data_visualization import normalize_values
+        # Calculate normalized scores for each game
+        game_score_columns = []
         for game in GAME_ORDER:
+            score_col = f"{game} Score"
+            if score_col in df_results.columns:
+                game_score_columns.append(score_col)
+                # Get numeric values, replacing 'n/a' with NaN
+                # Use where() to avoid FutureWarning about downcasting in replace()
+                series = df_results[score_col].copy()
+                series = series.where(series != 'n/a', np.nan)
+                numeric_scores = pd.to_numeric(series, errors='coerce')
+                # Skip games where all scores are NaN or 0
+                valid_scores = numeric_scores.dropna()
+                if len(valid_scores) > 0 and valid_scores.sum() > 0:
+                    mean = valid_scores.mean()
+                    std = valid_scores.std() if len(valid_scores) > 1 else 0
+                    # Calculate normalized scores for all players
+                    normalized_scores = []
+                    for _, row in df_results.iterrows():
+                        score = row[score_col]
+                        if score == 'n/a' or pd.isna(score):
+                            normalized_scores.append(0)
+                        else:
+                            normalized_scores.append(normalize_values([float(score)], mean, std)[0])
+                    df_results[f"norm_{score_col}"] = normalized_scores
+                else:
+                    # If no valid scores, set all normalized scores to 0
+                    df_results[f"norm_{score_col}"] = 0
+        # Calculate average normalized score across games
+        normalized_columns = [f"norm_{col}" for col in game_score_columns if f"norm_{col}" in df_results.columns]
+        if normalized_columns:
+            df_results["Avg Normalized Score"] = df_results[normalized_columns].mean(axis=1).round(2)
+        else:
+            df_results["Avg Normalized Score"] = 0.0
+        # Reorder columns to put Avg Normalized Score after Organization
+        base_columns = ["Player", "Organization", "Avg Normalized Score"]
+        game_columns = [col for col in df_results.columns if col.endswith(" Score") and not col.startswith("norm_") and col != "Avg Normalized Score"]
+        other_columns = [col for col in df_results.columns if col not in base_columns + game_columns and not col.startswith("norm_")]
+        # Create final column order
+        final_columns = base_columns + game_columns + other_columns
+        df_results = df_results[final_columns]
+        # Sort by average normalized score in descending order
+        df_results = df_results.sort_values("Avg Normalized Score", ascending=False)
+        # Apply limit if specified
+        if limit_to_top_n is not None:
+            df_results = df_results.head(limit_to_top_n)
     return df_results

rank_data_03_25_2025.json CHANGED Viewed

@@ -3,61 +3,61 @@
         "runs": 3,
         "results": [
             {
-                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 1267.7,
                 "detail_data": "709,1532,1562",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + claude-3-7-sonnet-20250219",
                 "score": 1418.7,
                 "detail_data": "2015,709,1532",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + gemini-2.5-flash-preview-04-17",
                 "score": 1385.0,
                 "detail_data": "1672,1266,1247",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + gemini-2.5-pro-preview-05-06",
                 "score": 1498.3,
                 "detail_data": "1561,1271,1663",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 1468.7,
                 "detail_data": "898,2008,1500",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 2126.3,
                 "detail_data": "1531,722,4126",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 2047.3,
                 "detail_data": "2017,2590,1535",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + o1-2024-12-17",
                 "score": 855,
                 "detail_data": "855",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + o3-2025-04-16",
                 "score": 3445,
                 "detail_data": "3445",
                 "progress": "1-1"
             },
             {
-                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 1448.0,
                 "detail_data": "1525,1263,1556",
                 "progress": "1-1"
@@ -74,79 +74,79 @@
         "runs": 3,
         "results": [
             {
-                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 1914.67,
                 "details": "1352,2860,1532",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + claude-3-7-sonnet-20250219",
                 "score": 2624,
                 "details": "2560,3224,2088",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + deepseek-r1-0120",
                 "score": 1873.33,
                 "details": "700,1240,3680",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + gemini-2.5-flash-preview-04-17",
                 "score": 1697.33,
                 "details": "1304,1316,2472",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + gemini-2.5-pro-preview-05-06",
                 "score": 3586.67,
                 "details": "5300,2400,3060",
                 "highest_tail": 512
             },
             {
-                "model": "gamingagent + grok-3-mini-beta",
                 "score": 4036,
                 "details": "6412,2492,3204",
                 "highest_tail": 512
             },
             {
-                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 1586.67,
                 "details": "1404,1272,2084",
                 "highest_tail": 128
             },
             {
-                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 1656,
                 "details": "1156,2664,1148",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 1656,
                 "details": "1604,1284,2080",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + o1-2024-12-17",
                 "score": 7580,
                 "details": "7580",
                 "highest_tail": 512
             },
             {
-                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 2757.33,
                 "details": "3132,2004,3136",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + o3-2025-04-16",
                 "score": 7120,
                 "details": "7120",
                 "highest_tail": 512
             },
             {
-                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 4432.0,
                 "details": "4928,5456,2912",
                 "highest_tail": 512
@@ -158,25 +158,25 @@
                 "highest_tail": 128
             },
             {
-                "model": "gamingagent + claude-opus-4-20250514",
                 "score": 3036.0,
                 "details": "3036.0",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + claude-sonnet-4-20250514",
                 "score": 3136,
                 "details": "2148,2360,4900",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + deepseek-r1-0528",
                 "score": 3330.0,
                 "details": "3260,3400",
                 "highest_tail": 256
             },
             {
-                "model": "gamingagent + qwen3-235B-A22B-fp8",
                 "score": 2144.0,
                 "details": "1436,2556,2440",
                 "highest_tail": 256
@@ -187,67 +187,67 @@
         "runs": 3,
         "results": [
             {
-                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 14.7,
                 "details": "16,14,14"
             },
             {
-                "model": "gamingagent + claude-3-7-sonnet-20250219",
                 "score": 16.3,
                 "details": "19,15,15"
             },
             {
-                "model": "gamingagent + deepseek-r1-0120",
                 "score": 14.3,
                 "details": "15,14,14"
             },
             {
-                "model": "gamingagent + gemini-2.5-flash-preview-04-17",
                 "score": 16.3,
                 "details": "20,14,15"
             },
             {
-                "model": "gamingagent + gemini-2.5-pro-preview-05-06",
                 "score": 23.3,
                 "details": "23,23,24"
             },
             {
-                "model": "gamingagent + grok-3-mini-beta",
                 "score": 21.3,
                 "details": "20,15,29"
             },
             {
-                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 10.3,
                 "details": "9,10,12"
             },
             {
-                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 13.7,
                 "details": "13,14,14"
             },
             {
-                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 14,
                 "details": "18,11,13"
             },
             {
-                "model": "gamingagent + o1-2024-12-17",
                 "score": 35,
                 "details": "35"
             },
             {
-                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 11.7,
                 "details": "11,11,13"
             },
             {
-                "model": "gamingagent + o3-2025-04-16",
                 "score": 42,
                 "details": "42"
             },
             {
-                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 25.3,
                 "details": "22,35,19"
             },
@@ -257,22 +257,22 @@
                 "details": ""
             },
             {
-                "model": "gamingagent + claude-opus-4-20250514",
                 "score": 20,
                 "details": "17,18,25"
             },
             {
-                "model": "gamingagent + claude-sonnet-4-20250514",
                 "score": 19.33,
                 "details": "20,17,21"
             },
             {
-                "model": "gamingagent + deepseek-r1-0528",
                 "score": 33.67,
                 "details": "26,34,41"
             },
             {
-                "model": "gamingagent + qwen3-235B-A22B-fp8",
                 "score": 11.67,
                 "details": "13,14,8"
             }
@@ -282,67 +282,67 @@
         "runs": 3,
         "results": [
             {
-                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 106,
                 "details": "92,165,61"
             },
             {
-                "model": "gamingagent + claude-3-7-sonnet-20250219",
                 "score": 484,
                 "details": "535,428,489"
             },
             {
-                "model": "gamingagent + deepseek-r1-0120",
                 "score": 447.3,
                 "details": "409,436,497"
             },
             {
-                "model": "gamingagent + gemini-2.5-flash-preview-04-17",
                 "score": 334.7,
                 "details": "259,372,373"
             },
             {
-                "model": "gamingagent + gemini-2.5-pro-preview-05-06",
                 "score": 416.3,
                 "details": "411,414,424"
             },
             {
-                "model": "gamingagent + grok-3-mini-beta",
                 "score": 254,
                 "details": "299,332,131"
             },
             {
-                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 128.7,
                 "details": "67,139,180"
             },
             {
-                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 182,
                 "details": "163,215,168"
             },
             {
-                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 147.3,
                 "details": "131,104,207"
             },
             {
-                "model": "gamingagent + o1-2024-12-17",
                 "score": 159,
                 "details": "159"
             },
             {
-                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 48,
                 "details": "21,86,37"
             },
             {
-                "model": "gamingagent + o3-2025-04-16",
                 "score": 647,
                 "details": "647"
             },
             {
-                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 487.3,
                 "details": "259,591,612"
             },
@@ -352,22 +352,22 @@
                 "details": ""
             },
             {
-                "model": "gamingagent + claude-opus-4-20250514",
                 "score": 464,
                 "details": "593,406,393"
             },
             {
-                "model": "gamingagent + claude-sonnet-4-20250514",
                 "score": 478.33,
                 "details": "545,468,422"
             },
             {
-                "model": "gamingagent + deepseek-r1-0528",
                 "score": 491.67,
                 "details": "464,463,548"
             },
             {
-                "model": "gamingagent + qwen3-235B-A22B-fp8",
                 "score": 363.33,
                 "details": "365,372,353"
             }
@@ -377,79 +377,79 @@
         "runs": 3,
         "results": [
             {
-                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
-                "model": "gamingagent + claude-3-7-sonnet-20250219",
                 "score": 2.33,
                 "detail_box_on_target": "2,4,1",
                 "cracked_levels": "1,2,0"
             },
             {
-                "model": "gamingagent + deepseek-r1-0120",
                 "score": 1.33,
                 "detail_box_on_target": "2,0,2",
                 "cracked_levels": "1,0,1"
             },
             {
-                "model": "gamingagent + gemini-2.5-flash-preview-04-17",
                 "score": 1.67,
                 "detail_box_on_target": "3,0,2",
                 "cracked_levels": "2,0,1"
             },
             {
-                "model": "gamingagent + gemini-2.5-pro-preview-05-06",
                 "score": 4.33,
                 "detail_box_on_target": "4,4,5",
                 "cracked_levels": "2,2,3"
             },
             {
-                "model": "gamingagent + grok-3-mini-beta",
                 "score": 5.67,
                 "detail_box_on_target": "5,6,6",
                 "cracked_levels": "3,3,3"
             },
             {
-                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
-                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
-                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
-                "model": "gamingagent + o1-2024-12-17",
                 "score": 2.33,
                 "detail_box_on_target": "2,2,3",
                 "cracked_levels": "1,1,2"
             },
             {
-                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 1.33,
                 "detail_box_on_target": "1,2,1",
                 "cracked_levels": "0,1,0"
             },
             {
-                "model": "gamingagent + o3-2025-04-16",
                 "score": 8,
                 "detail_box_on_target": "10,6",
                 "cracked_levels": "5,3"
             },
             {
-                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 5.33,
                 "detail_box_on_target": "4,6,6",
                 "cracked_levels": "2,2,3"
@@ -461,22 +461,22 @@
                 "cracked_levels": "0,0,0"
             },
             {
-                "model": "gamingagent + claude-opus-4-20250514",
                 "score": 4,
                 "details": "4,4,4"
             },
             {
-                "model": "gamingagent + claude-sonnet-4-20250514",
                 "score": 3,
                 "details": "2,2,5"
             },
             {
-                "model": "gamingagent + deepseek-r1-0528",
                 "score": 4.67,
                 "details": "4,4,6"
             },
             {
-                "model": "gamingagent + qwen3-235B-A22B-fp8",
                 "score": 2.33,
                 "details": "1,2,4"
             }
@@ -486,79 +486,79 @@
         "runs": 1,
         "results": [
             {
-                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "1/3"
             },
             {
-                "model": "gamingagent + claude-3-7-sonnet-20250219",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "5/11"
             },
             {
-                "model": "gamingagent + deepseek-r1-0120",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
-                "model": "gamingagent + gemini-2.5-flash-preview-04-17",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "1/7"
             },
             {
-                "model": "gamingagent + gemini-2.5-pro-preview-05-06",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "2/3"
             },
             {
-                "model": "gamingagent + grok-3-mini-beta",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
-                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
-                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "2/3"
             },
             {
-                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
-                "model": "gamingagent + o1-2024-12-17",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "6/11"
             },
             {
-                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
-                "model": "gamingagent + o3-2025-04-16",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "1/2"
             },
             {
-                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "2/5"
@@ -570,17 +570,17 @@
                 "evaluator result": "0"
             },
             {
-                "model": "gamingagent + claude-opus-4-20250514",
                 "score": 6,
                 "details": "6"
             },
             {
-                "model": "gamingagent + claude-sonnet-4-20250514",
                 "score": 3.67,
                 "details": "3,4,4"
             },
             {
-                "model": "gamingagent + gemini-2.5-flash-preview-05-20",
                 "score": 4.33,
                 "details": "3,4,6"
             }

         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022 (⚔️)",
                 "score": 1267.7,
                 "detail_data": "709,1532,1562",
                 "progress": "1-1"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (⚔️)",
                 "score": 1418.7,
                 "detail_data": "2015,709,1532",
                 "progress": "1-1"
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
                 "score": 1385.0,
                 "detail_data": "1672,1266,1247",
                 "progress": "1-1"
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
                 "score": 1498.3,
                 "detail_data": "1561,1271,1663",
                 "progress": "1-1"
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
                 "score": 1468.7,
                 "detail_data": "898,2008,1500",
                 "progress": "1-1"
             },
             {
+                "model": "gpt-4.1-2025-04-14 (⚔️)",
                 "score": 2126.3,
                 "detail_data": "1531,722,4126",
                 "progress": "1-1"
             },
             {
+                "model": "gpt-4o-2024-11-20 (⚔️)",
                 "score": 2047.3,
                 "detail_data": "2017,2590,1535",
                 "progress": "1-1"
             },
             {
+                "model": "o1-2024-12-17 (⚔️)",
                 "score": 855,
                 "detail_data": "855",
                 "progress": "1-1"
             },
             {
+                "model": "o3-2025-04-16 (⚔️)",
                 "score": 3445,
                 "detail_data": "3445",
                 "progress": "1-1"
             },
             {
+                "model": "o4-mini-2025-04-16 (⚔️)",
                 "score": 1448.0,
                 "detail_data": "1525,1263,1556",
                 "progress": "1-1"
         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022 (⚔️)",
                 "score": 1914.67,
                 "details": "1352,2860,1532",
                 "highest_tail": 256
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (⚔️)",
                 "score": 2624,
                 "details": "2560,3224,2088",
                 "highest_tail": 256
             },
             {
+                "model": "deepseek-r1-0120 (⚔️)",
                 "score": 1873.33,
                 "details": "700,1240,3680",
                 "highest_tail": 256
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
                 "score": 1697.33,
                 "details": "1304,1316,2472",
                 "highest_tail": 256
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
                 "score": 3586.67,
                 "details": "5300,2400,3060",
                 "highest_tail": 512
             },
             {
+                "model": "grok-3-mini-beta (⚔️)",
                 "score": 4036,
                 "details": "6412,2492,3204",
                 "highest_tail": 512
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
                 "score": 1586.67,
                 "details": "1404,1272,2084",
                 "highest_tail": 128
             },
             {
+                "model": "gpt-4.1-2025-04-14 (⚔️)",
                 "score": 1656,
                 "details": "1156,2664,1148",
                 "highest_tail": 256
             },
             {
+                "model": "gpt-4o-2024-11-20 (⚔️)",
                 "score": 1656,
                 "details": "1604,1284,2080",
                 "highest_tail": 256
             },
             {
+                "model": "o1-2024-12-17 (⚔️)",
                 "score": 7580,
                 "details": "7580",
                 "highest_tail": 512
             },
             {
+                "model": "o1-mini-2024-09-12 (⚔️)",
                 "score": 2757.33,
                 "details": "3132,2004,3136",
                 "highest_tail": 256
             },
             {
+                "model": "o3-2025-04-16 (⚔️)",
                 "score": 7120,
                 "details": "7120",
                 "highest_tail": 512
             },
             {
+                "model": "o4-mini-2025-04-16 (⚔️)",
                 "score": 4432.0,
                 "details": "4928,5456,2912",
                 "highest_tail": 512
                 "highest_tail": 128
             },
             {
+                "model": "claude-opus-4-20250514 (⚔️)",
                 "score": 3036.0,
                 "details": "3036.0",
                 "highest_tail": 256
             },
             {
+                "model": "claude-sonnet-4-20250514 (⚔️)",
                 "score": 3136,
                 "details": "2148,2360,4900",
                 "highest_tail": 256
             },
             {
+                "model": "deepseek-r1-0528 (⚔️)",
                 "score": 3330.0,
                 "details": "3260,3400",
                 "highest_tail": 256
             },
             {
+                "model": "qwen3-235B-A22B-fp8 (⚔️)",
                 "score": 2144.0,
                 "details": "1436,2556,2440",
                 "highest_tail": 256
         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022 (⚔️)",
                 "score": 14.7,
                 "details": "16,14,14"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (⚔️)",
                 "score": 16.3,
                 "details": "19,15,15"
             },
             {
+                "model": "deepseek-r1-0120 (⚔️)",
                 "score": 14.3,
                 "details": "15,14,14"
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
                 "score": 16.3,
                 "details": "20,14,15"
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
                 "score": 23.3,
                 "details": "23,23,24"
             },
             {
+                "model": "grok-3-mini-beta (⚔️)",
                 "score": 21.3,
                 "details": "20,15,29"
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
                 "score": 10.3,
                 "details": "9,10,12"
             },
             {
+                "model": "gpt-4.1-2025-04-14 (⚔️)",
                 "score": 13.7,
                 "details": "13,14,14"
             },
             {
+                "model": "gpt-4o-2024-11-20 (⚔️)",
                 "score": 14,
                 "details": "18,11,13"
             },
             {
+                "model": "o1-2024-12-17 (⚔️)",
                 "score": 35,
                 "details": "35"
             },
             {
+                "model": "o1-mini-2024-09-12 (⚔️)",
                 "score": 11.7,
                 "details": "11,11,13"
             },
             {
+                "model": "o3-2025-04-16 (⚔️)",
                 "score": 42,
                 "details": "42"
             },
             {
+                "model": "o4-mini-2025-04-16 (⚔️)",
                 "score": 25.3,
                 "details": "22,35,19"
             },
                 "details": ""
             },
             {
+                "model": "claude-opus-4-20250514 (⚔️)",
                 "score": 20,
                 "details": "17,18,25"
             },
             {
+                "model": "claude-sonnet-4-20250514 (⚔️)",
                 "score": 19.33,
                 "details": "20,17,21"
             },
             {
+                "model": "deepseek-r1-0528 (⚔️)",
                 "score": 33.67,
                 "details": "26,34,41"
             },
             {
+                "model": "qwen3-235B-A22B-fp8 (⚔️)",
                 "score": 11.67,
                 "details": "13,14,8"
             }
         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022 (⚔️)",
                 "score": 106,
                 "details": "92,165,61"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (⚔️)",
                 "score": 484,
                 "details": "535,428,489"
             },
             {
+                "model": "deepseek-r1-0120 (⚔️)",
                 "score": 447.3,
                 "details": "409,436,497"
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
                 "score": 334.7,
                 "details": "259,372,373"
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
                 "score": 416.3,
                 "details": "411,414,424"
             },
             {
+                "model": "grok-3-mini-beta (⚔️)",
                 "score": 254,
                 "details": "299,332,131"
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
                 "score": 128.7,
                 "details": "67,139,180"
             },
             {
+                "model": "gpt-4.1-2025-04-14 (⚔️)",
                 "score": 182,
                 "details": "163,215,168"
             },
             {
+                "model": "gpt-4o-2024-11-20 (⚔️)",
                 "score": 147.3,
                 "details": "131,104,207"
             },
             {
+                "model": "o1-2024-12-17 (⚔️)",
                 "score": 159,
                 "details": "159"
             },
             {
+                "model": "o1-mini-2024-09-12 (⚔️)",
                 "score": 48,
                 "details": "21,86,37"
             },
             {
+                "model": "o3-2025-04-16 (⚔️)",
                 "score": 647,
                 "details": "647"
             },
             {
+                "model": "o4-mini-2025-04-16 (⚔️)",
                 "score": 487.3,
                 "details": "259,591,612"
             },
                 "details": ""
             },
             {
+                "model": "claude-opus-4-20250514 (⚔️)",
                 "score": 464,
                 "details": "593,406,393"
             },
             {
+                "model": "claude-sonnet-4-20250514 (⚔️)",
                 "score": 478.33,
                 "details": "545,468,422"
             },
             {
+                "model": "deepseek-r1-0528 (⚔️)",
                 "score": 491.67,
                 "details": "464,463,548"
             },
             {
+                "model": "qwen3-235B-A22B-fp8 (⚔️)",
                 "score": 363.33,
                 "details": "365,372,353"
             }
         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022 (⚔️)",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (⚔️)",
                 "score": 2.33,
                 "detail_box_on_target": "2,4,1",
                 "cracked_levels": "1,2,0"
             },
             {
+                "model": "deepseek-r1-0120 (⚔️)",
                 "score": 1.33,
                 "detail_box_on_target": "2,0,2",
                 "cracked_levels": "1,0,1"
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
                 "score": 1.67,
                 "detail_box_on_target": "3,0,2",
                 "cracked_levels": "2,0,1"
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
                 "score": 4.33,
                 "detail_box_on_target": "4,4,5",
                 "cracked_levels": "2,2,3"
             },
             {
+                "model": "grok-3-mini-beta (⚔️)",
                 "score": 5.67,
                 "detail_box_on_target": "5,6,6",
                 "cracked_levels": "3,3,3"
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
+                "model": "gpt-4.1-2025-04-14 (⚔️)",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
+                "model": "gpt-4o-2024-11-20 (⚔️)",
                 "score": 0,
                 "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             },
             {
+                "model": "o1-2024-12-17 (⚔️)",
                 "score": 2.33,
                 "detail_box_on_target": "2,2,3",
                 "cracked_levels": "1,1,2"
             },
             {
+                "model": "o1-mini-2024-09-12 (⚔️)",
                 "score": 1.33,
                 "detail_box_on_target": "1,2,1",
                 "cracked_levels": "0,1,0"
             },
             {
+                "model": "o3-2025-04-16 (⚔️)",
                 "score": 8,
                 "detail_box_on_target": "10,6",
                 "cracked_levels": "5,3"
             },
             {
+                "model": "o4-mini-2025-04-16 (⚔️)",
                 "score": 5.33,
                 "detail_box_on_target": "4,6,6",
                 "cracked_levels": "2,2,3"
                 "cracked_levels": "0,0,0"
             },
             {
+                "model": "claude-opus-4-20250514 (⚔️)",
                 "score": 4,
                 "details": "4,4,4"
             },
             {
+                "model": "claude-sonnet-4-20250514 (⚔️)",
                 "score": 3,
                 "details": "2,2,5"
             },
             {
+                "model": "deepseek-r1-0528 (⚔️)",
                 "score": 4.67,
                 "details": "4,4,6"
             },
             {
+                "model": "qwen3-235B-A22B-fp8 (⚔️)",
                 "score": 2.33,
                 "details": "1,2,4"
             }
         "runs": 1,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022 (⚔️)",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "1/3"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (⚔️)",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "5/11"
             },
             {
+                "model": "deepseek-r1-0120 (⚔️)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "1/7"
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "2/3"
             },
             {
+                "model": "grok-3-mini-beta (⚔️)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
+                "model": "gpt-4.1-2025-04-14 (⚔️)",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "2/3"
             },
             {
+                "model": "gpt-4o-2024-11-20 (⚔️)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
+                "model": "o1-2024-12-17 (⚔️)",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "6/11"
             },
             {
+                "model": "o1-mini-2024-09-12 (⚔️)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
+                "model": "o3-2025-04-16 (⚔️)",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "1/2"
             },
             {
+                "model": "o4-mini-2025-04-16 (⚔️)",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "2/5"
                 "evaluator result": "0"
             },
             {
+                "model": "claude-opus-4-20250514 (⚔️)",
                 "score": 6,
                 "details": "6"
             },
             {
+                "model": "claude-sonnet-4-20250514 (⚔️)",
                 "score": 3.67,
                 "details": "3,4,4"
             },
             {
+                "model": "gemini-2.5-flash-preview-05-20 (⚔️)",
                 "score": 4.33,
                 "details": "3,4,6"
             }