lmgame_bench

Running

App Files Community

Yuxuan-Zhang-Dexter committed on Jun 3, 2025

Commit

bce84cc

1 Parent(s): 5d62091

update new leaderboard data

Browse files

Files changed (5) hide show

app.py +113 -90
assets/model_color.json +6 -2
data_visualization.py +69 -67
leaderboard_utils.py +154 -64
rank_data_03_25_2025.json +359 -336

app.py CHANGED Viewed

@@ -10,7 +10,7 @@ from datetime import datetime, timedelta
 import matplotlib.pyplot as plt
 from leaderboard_utils import (
     get_organization,
-    get_mario_leaderboard,
     get_sokoban_leaderboard,
     get_2048_leaderboard,
     get_candy_leaderboard,
@@ -50,20 +50,22 @@ with open(TIME_POINTS["03/25/2025"], "r") as f:
 leaderboard_state = {
     "current_game": None,
     "previous_overall": {
-        "Super Mario Bros": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
-        "Tetris (complete)": True,
         "Tetris (planning only)": True,
         "Ace Attorney": True
     },
     "previous_details": {
-        "Super Mario Bros": False,
         "Sokoban": False,
         "2048": False,
         "Candy Crush": False,
-        "Tetris (complete)": False,
         "Tetris (planning only)": False,
         "Ace Attorney": False
     }
@@ -107,7 +109,7 @@ def prepare_dataframe_for_display(df, for_game=None):
         if col.endswith(' Score'):
             display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
-    # If we're in detailed view, add a formatted rank column
     if for_game:
         # Sort by relevant score column
         score_col = f"{for_game} Score"
@@ -116,10 +118,30 @@ def prepare_dataframe_for_display(df, for_game=None):
             display_df[score_col] = pd.to_numeric(display_df[score_col], errors='coerce')
             # Sort by score in descending order
             display_df = display_df.sort_values(by=score_col, ascending=False)
-            # Add rank column based on the sort
-            display_df.insert(0, 'Rank', range(1, len(display_df) + 1))
             # Filter out models that didn't participate
             display_df = display_df[~display_df[score_col].isna()]
     # Add line breaks to column headers
     new_columns = {}
@@ -129,9 +151,6 @@ def prepare_dataframe_for_display(df, for_game=None):
             game_name = col.replace(' Score', '')
             new_col = f"{game_name}\nScore"
             new_columns[col] = new_col
-        # Keep Organization without line breaks
-        # elif col == 'Organization':
-        #    new_columns[col] = 'Organi-\nzation'
     # Rename columns with new line breaks
     if new_columns:
@@ -158,32 +177,35 @@ def update_df_with_height(df):
                      # max_height=None,  # Remove height limitation - COMMENTED OUT
                      column_widths=col_widths)
-def update_leaderboard(mario_overall, mario_details,
                        sokoban_overall, sokoban_details,
                        _2048_overall, _2048_details,
                        candy_overall, candy_details,
-                       tetris_overall, tetris_details,
                        tetris_plan_overall, tetris_plan_details,
                        ace_attorney_overall, ace_attorney_details):
     global leaderboard_state
     # Convert current checkbox states to dictionary for easier comparison
     current_overall = {
-        "Super Mario Bros": mario_overall,
         "Sokoban": sokoban_overall,
         "2048": _2048_overall,
         "Candy Crush": candy_overall,
-        "Tetris (complete)": tetris_overall,
         "Tetris (planning only)": tetris_plan_overall,
         "Ace Attorney": ace_attorney_overall
     }
     current_details = {
-        "Super Mario Bros": mario_details,
         "Sokoban": sokoban_details,
         "2048": _2048_details,
         "Candy Crush": candy_details,
-        "Tetris (complete)": tetris_details,
         "Tetris (planning only)": tetris_plan_details,
         "Ace Attorney": ace_attorney_details
     }
@@ -266,11 +288,12 @@ def update_leaderboard(mario_overall, mario_details,
     # Build dictionary for selected games
     selected_games = {
-        "Super Mario Bros": current_overall["Super Mario Bros"],
         "Sokoban": current_overall["Sokoban"],
         "2048": current_overall["2048"],
         "Candy Crush": current_overall["Candy Crush"],
-        "Tetris (complete)": current_overall["Tetris (complete)"],
         "Tetris (planning only)": current_overall["Tetris (planning only)"],
         "Ace Attorney": current_overall["Ace Attorney"]
     }
@@ -278,54 +301,49 @@ def update_leaderboard(mario_overall, mario_details,
     # Get the appropriate DataFrame and charts based on current state
     if leaderboard_state["current_game"]:
         # For detailed view
-        if leaderboard_state["current_game"] == "Super Mario Bros":
-            df = get_mario_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Sokoban":
             df = get_sokoban_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "2048":
             df = get_2048_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Candy Crush":
             df = get_candy_leaderboard(rank_data)
-        elif leaderboard_state["current_game"] == "Tetris (complete)":
-            df = get_tetris_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Tetris (planning only)":
             df = get_tetris_planning_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Ace Attorney":
             df = get_ace_attorney_leaderboard(rank_data)
-        # Format the DataFrame for display
         display_df = prepare_dataframe_for_display(df, leaderboard_state["current_game"])
-        # Always create a new chart for detailed view
         chart = create_horizontal_bar_chart(df, leaderboard_state["current_game"])
-        # Use the same chart for all visualizations in detailed view
-        radar_chart = chart
-        group_bar_chart = chart
     else:
         # For overall view
-        df, _ = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
-        # Format the DataFrame for display
         display_df = prepare_dataframe_for_display(df)
-        # Use the same selected_games for radar chart
         _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
-        chart = radar_chart
-        group_bar_chart = radar_chart  # Use radar chart instead of bar chart
-    # Return exactly 18 values to match the expected outputs
-    return (update_df_with_height(display_df), chart, radar_chart, radar_chart,
-            current_overall["Super Mario Bros"], current_details["Super Mario Bros"],
             current_overall["Sokoban"], current_details["Sokoban"],
             current_overall["2048"], current_details["2048"],
             current_overall["Candy Crush"], current_details["Candy Crush"],
-            current_overall["Tetris (complete)"], current_details["Tetris (complete)"],
             current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"],
             current_overall["Ace Attorney"], current_details["Ace Attorney"])
-def update_leaderboard_with_time(time_point, mario_overall, mario_details,
                                sokoban_overall, sokoban_details,
                                _2048_overall, _2048_details,
                                candy_overall, candy_details,
-                               tetris_overall, tetris_details,
                                tetris_plan_overall, tetris_plan_details,
                                ace_attorney_overall, ace_attorney_details):
     # Load rank data for the selected time point
@@ -334,12 +352,13 @@ def update_leaderboard_with_time(time_point, mario_overall, mario_details,
     if new_rank_data is not None:
         rank_data = new_rank_data
-    # Use the existing update_leaderboard function
-    return update_leaderboard(mario_overall, mario_details,
                             sokoban_overall, sokoban_details,
                             _2048_overall, _2048_details,
                             candy_overall, candy_details,
-                            tetris_overall, tetris_details,
                             tetris_plan_overall, tetris_plan_details,
                             ace_attorney_overall, ace_attorney_details)
@@ -348,20 +367,22 @@ def get_initial_state():
     return {
         "current_game": None,
         "previous_overall": {
-            "Super Mario Bros": True,
             "Sokoban": True,
             "2048": True,
             "Candy Crush": True,
-            "Tetris (complete)": True,
             "Tetris (planning only)": True,
             "Ace Attorney": True
         },
         "previous_details": {
-            "Super Mario Bros": False,
             "Sokoban": False,
             "2048": False,
             "Candy Crush": False,
-            "Tetris (complete)": False,
             "Tetris (planning only)": False,
             "Ace Attorney": False
         }
@@ -370,36 +391,27 @@ def get_initial_state():
 def clear_filters():
     global leaderboard_state
-    # Reset all checkboxes to default state
     selected_games = {
-        "Super Mario Bros": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
-        "Tetris (complete)": True,
         "Tetris (planning only)": True,
         "Ace Attorney": True
     }
-    # Get the combined leaderboard and group bar chart
     df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
-    # Format the DataFrame for display
     display_df = prepare_dataframe_for_display(df)
-    # Get the radar chart using the same selected games
     _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
-    # Reset the leaderboard state to match the default checkbox states
     leaderboard_state = get_initial_state()
-    # Return exactly 16 values to match the expected outputs
-    return (update_df_with_height(display_df), radar_chart, radar_chart, radar_chart,
-            True, False,  # mario
             True, False,  # sokoban
             True, False,  # 2048
             True, False,  # candy
-            True, False,  # tetris
             True, False,  # tetris plan
             True, False)  # ace attorney
@@ -712,7 +724,7 @@ def build_app():
             margin-top: 40px !important;
         }
     """) as demo:
-        gr.Markdown("# 🎮 Game Arena: Gaming Agent 🎲")
         # Add custom JavaScript for table header line breaks
         gr.HTML("""
@@ -861,29 +873,34 @@ def build_app():
                                 label="Comparative Analysis (Radar Chart)",
                                 elem_classes="visualization-container"
                             )
-                        # Comment out the Group Bar Chart tab
-                        # with gr.Tab("📊 Group Bar Chart"):
-                        #     group_bar_visualization = gr.Plot(
-                        #         label="Comparative Analysis (Group Bar Chart)",
-                        #         elem_classes="visualization-container"
-                        #     )
                             gr.Markdown(
-                                "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
-                                elem_classes="radar-tip"
                             )
                 # Hidden placeholder for group bar visualization (to maintain code references)
-                group_bar_visualization = gr.Plot(visible=False)
                 # Game selection section
                 with gr.Row():
                     gr.Markdown("### 🎮 Game Selection")
                 with gr.Row():
-                    with gr.Column():
-                        gr.Markdown("**🎮 Super Mario Bros**")
-                        mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
-                        mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
-                    with gr.Column():
                         gr.Markdown("**📦 Sokoban**")
                         sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
                         sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
@@ -895,10 +912,10 @@ def build_app():
                         gr.Markdown("**🍬 Candy Crush**")
                         candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
                         candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
-                    with gr.Column():
-                        gr.Markdown("**🎯 Tetris (complete)**")
-                        tetris_overall = gr.Checkbox(label="Tetris (complete) Score", value=True)
-                        tetris_details = gr.Checkbox(label="Tetris (complete) Details", value=False)
                     with gr.Column():
                         gr.Markdown("**📋 Tetris (planning)**")
                         tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
@@ -927,11 +944,12 @@ def build_app():
                 # Get initial leaderboard dataframe
                 initial_df = get_combined_leaderboard(rank_data, {
-                    "Super Mario Bros": True,
                     "Sokoban": True,
                     "2048": True,
                     "Candy Crush": True,
-                    "Tetris (complete)": True,
                     "Tetris (planning only)": True,
                     "Ace Attorney": True
                 })
@@ -967,13 +985,14 @@ def build_app():
                 with gr.Row():
                     score_note = add_score_note()
-                # List of all checkboxes
                 checkbox_list = [
-                    mario_overall, mario_details,
                     sokoban_overall, sokoban_details,
                     _2048_overall, _2048_details,
                     candy_overall, candy_details,
-                    tetris_overall, tetris_details,
                     tetris_plan_overall, tetris_plan_details,
                     ace_attorney_overall, ace_attorney_details
                 ]
@@ -981,10 +1000,14 @@ def build_app():
                 # Update visualizations when checkboxes change
                 def update_visualizations(*checkbox_states):
                     # Check if any details checkbox is selected
                     is_details_view = any([
-                        checkbox_states[1], checkbox_states[3], checkbox_states[5],
-                        checkbox_states[7], checkbox_states[9], checkbox_states[11],
-                        checkbox_states[13]  # Ace Attorney details checkbox
                     ])
                     # Update visibility of visualization blocks
@@ -1010,7 +1033,7 @@ def build_app():
                             leaderboard_df,
                             detailed_visualization,
                             radar_visualization,
-                            group_bar_visualization
                         ] + checkbox_list
                     )
@@ -1022,7 +1045,7 @@ def build_app():
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
-                        group_bar_visualization
                     ] + checkbox_list
                 )
@@ -1034,7 +1057,7 @@ def build_app():
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
-                        group_bar_visualization
                     ] + checkbox_list
                 )

 import matplotlib.pyplot as plt
 from leaderboard_utils import (
     get_organization,
+    get_mario_planning_leaderboard,
     get_sokoban_leaderboard,
     get_2048_leaderboard,
     get_candy_leaderboard,
 leaderboard_state = {
     "current_game": None,
     "previous_overall": {
+        # "Super Mario Bros": True, # Commented out
+        "Super Mario Bros (planning only)": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
+        # "Tetris (complete)", # Commented out
         "Tetris (planning only)": True,
         "Ace Attorney": True
     },
     "previous_details": {
+        # "Super Mario Bros": False, # Commented out
+        "Super Mario Bros (planning only)": False,
         "Sokoban": False,
         "2048": False,
         "Candy Crush": False,
+        # "Tetris (complete)": False, # Commented out
         "Tetris (planning only)": False,
         "Ace Attorney": False
     }
         if col.endswith(' Score'):
             display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
+    # If we're in detailed view, sort by score
     if for_game:
         # Sort by relevant score column
         score_col = f"{for_game} Score"
             display_df[score_col] = pd.to_numeric(display_df[score_col], errors='coerce')
             # Sort by score in descending order
             display_df = display_df.sort_values(by=score_col, ascending=False)
             # Filter out models that didn't participate
             display_df = display_df[~display_df[score_col].isna()]
+    else:
+        # For overall view, sort by average of game scores (implicitly used for ranking)
+        # but we won't add an explicit 'Rank' or 'Average Rank' column to the final display_df
+        # Calculate an internal sorting key based on average scores, but don't add it to the display_df
+        score_cols = [col for col in display_df.columns if col.endswith(' Score')]
+        if score_cols:
+            temp_sort_df = display_df.copy()
+            for col in score_cols:
+                temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
+            # Calculate average of the game scores (use mean of ranks from utils for actual ranking logic if different)
+            # For display sorting, let's use a simple average of available scores.
+            # The actual ranking for 'Average Rank' in leaderboard_utils uses mean of ranks, which is more robust.
+            # Here we just need a consistent sort order.
+            # Create a temporary column for sorting
+            temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
+            # Sort by this temporary average score (higher is better for scores)
+            # and then by Player name as a tie-breaker
+            display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
     # Add line breaks to column headers
     new_columns = {}
             game_name = col.replace(' Score', '')
             new_col = f"{game_name}\nScore"
             new_columns[col] = new_col
     # Rename columns with new line breaks
     if new_columns:
                      # max_height=None,  # Remove height limitation - COMMENTED OUT
                      column_widths=col_widths)
+def update_leaderboard(# mario_overall, mario_details, # Commented out
+                       mario_plan_overall, mario_plan_details, # Added
                        sokoban_overall, sokoban_details,
                        _2048_overall, _2048_details,
                        candy_overall, candy_details,
+                       # tetris_overall, tetris_details, # Commented out
                        tetris_plan_overall, tetris_plan_details,
                        ace_attorney_overall, ace_attorney_details):
     global leaderboard_state
     # Convert current checkbox states to dictionary for easier comparison
     current_overall = {
+        # "Super Mario Bros": mario_overall, # Commented out
+        "Super Mario Bros (planning only)": mario_plan_overall,
         "Sokoban": sokoban_overall,
         "2048": _2048_overall,
         "Candy Crush": candy_overall,
+        # "Tetris (complete)": tetris_overall, # Commented out
         "Tetris (planning only)": tetris_plan_overall,
         "Ace Attorney": ace_attorney_overall
     }
     current_details = {
+        # "Super Mario Bros": mario_details, # Commented out
+        "Super Mario Bros (planning only)": mario_plan_details,
         "Sokoban": sokoban_details,
         "2048": _2048_details,
         "Candy Crush": candy_details,
+        # "Tetris (complete)": tetris_details, # Commented out
         "Tetris (planning only)": tetris_plan_details,
         "Ace Attorney": ace_attorney_details
     }
     # Build dictionary for selected games
     selected_games = {
+        # "Super Mario Bros": current_overall["Super Mario Bros"], # Commented out
+        "Super Mario Bros (planning only)": current_overall["Super Mario Bros (planning only)"],
         "Sokoban": current_overall["Sokoban"],
         "2048": current_overall["2048"],
         "Candy Crush": current_overall["Candy Crush"],
+        # "Tetris (complete)": current_overall["Tetris (complete)"], # Commented out
         "Tetris (planning only)": current_overall["Tetris (planning only)"],
         "Ace Attorney": current_overall["Ace Attorney"]
     }
     # Get the appropriate DataFrame and charts based on current state
     if leaderboard_state["current_game"]:
         # For detailed view
+        # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
+        #     df = get_mario_leaderboard(rank_data)
+        if leaderboard_state["current_game"] == "Super Mario Bros (planning only)":
+            df = get_mario_planning_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Sokoban":
             df = get_sokoban_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "2048":
             df = get_2048_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Candy Crush":
             df = get_candy_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Tetris (planning only)":
             df = get_tetris_planning_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Ace Attorney":
             df = get_ace_attorney_leaderboard(rank_data)
+        else: # Should not happen if current_game is one of the known games
+            df = pd.DataFrame() # Empty df
         display_df = prepare_dataframe_for_display(df, leaderboard_state["current_game"])
         chart = create_horizontal_bar_chart(df, leaderboard_state["current_game"])
+        radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
+        group_bar_chart = chart
     else:
         # For overall view
+        df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
         display_df = prepare_dataframe_for_display(df)
         _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
+        chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
+    # Return values, including all four plot placeholders
+    return (update_df_with_height(display_df), chart, radar_chart, group_bar_chart,
+            current_overall["Super Mario Bros (planning only)"], current_details["Super Mario Bros (planning only)"],
             current_overall["Sokoban"], current_details["Sokoban"],
             current_overall["2048"], current_details["2048"],
             current_overall["Candy Crush"], current_details["Candy Crush"],
             current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"],
             current_overall["Ace Attorney"], current_details["Ace Attorney"])
+def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # Commented out
+                               mario_plan_overall, mario_plan_details, # Added
                                sokoban_overall, sokoban_details,
                                _2048_overall, _2048_details,
                                candy_overall, candy_details,
+                               # tetris_overall, tetris_details, # Commented out
                                tetris_plan_overall, tetris_plan_details,
                                ace_attorney_overall, ace_attorney_details):
     # Load rank data for the selected time point
     if new_rank_data is not None:
         rank_data = new_rank_data
+    # Use the existing update_leaderboard function, including Super Mario (planning only)
+    return update_leaderboard(# mario_overall, mario_details, # Commented out
+                            mario_plan_overall, mario_plan_details, # Added
                             sokoban_overall, sokoban_details,
                             _2048_overall, _2048_details,
                             candy_overall, candy_details,
+                            # tetris_overall, tetris_details, # Commented out
                             tetris_plan_overall, tetris_plan_details,
                             ace_attorney_overall, ace_attorney_details)
     return {
         "current_game": None,
         "previous_overall": {
+            # "Super Mario Bros": True, # Commented out
+            "Super Mario Bros (planning only)": True,
             "Sokoban": True,
             "2048": True,
             "Candy Crush": True,
+            # "Tetris (complete)", # Commented out
             "Tetris (planning only)": True,
             "Ace Attorney": True
         },
         "previous_details": {
+            # "Super Mario Bros": False, # Commented out
+            "Super Mario Bros (planning only)": False,
             "Sokoban": False,
             "2048": False,
             "Candy Crush": False,
+            # "Tetris (complete)": False, # Commented out
             "Tetris (planning only)": False,
             "Ace Attorney": False
         }
 def clear_filters():
     global leaderboard_state
     selected_games = {
+        "Super Mario Bros (planning only)": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
         "Tetris (planning only)": True,
         "Ace Attorney": True
     }
     df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
     display_df = prepare_dataframe_for_display(df)
     _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
     leaderboard_state = get_initial_state()
+    # Return values, including all four plot placeholders
+    return (update_df_with_height(display_df), radar_chart, radar_chart, group_bar_chart,
+            True, False, # mario_plan
             True, False,  # sokoban
             True, False,  # 2048
             True, False,  # candy
             True, False,  # tetris plan
             True, False)  # ace attorney
             margin-top: 40px !important;
         }
     """) as demo:
+        gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
         # Add custom JavaScript for table header line breaks
         gr.HTML("""
                                 label="Comparative Analysis (Radar Chart)",
                                 elem_classes="visualization-container"
                             )
                             gr.Markdown(
+                                    "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
+                                    elem_classes="radar-tip"
+                                )
+                        # Group Bar Chart tab (restored; previously commented out)
+                        with gr.Tab("📊 Group Bar Chart"):
+                            group_bar_visualization = gr.Plot(
+                                label="Comparative Analysis (Group Bar Chart)",
+                                elem_classes="visualization-container"
                             )
                 # Hidden placeholder for group bar visualization (to maintain code references)
+                # group_bar_visualization = gr.Plot(visible=False)
                 # Game selection section
                 with gr.Row():
                     gr.Markdown("### 🎮 Game Selection")
                 with gr.Row():
+                    # with gr.Column(): # Commented out Super Mario Bros UI
+                    #     gr.Markdown("**🎮 Super Mario Bros**")
+                    #     mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
+                    #     mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
+                    with gr.Column(): # Added Super Mario Bros (planning only) UI
+                        gr.Markdown("**📝 Super Mario Bros (planning only)**")
+                        mario_plan_overall = gr.Checkbox(label="Super Mario Bros (planning only) Score", value=True)
+                        mario_plan_details = gr.Checkbox(label="Super Mario Bros (planning only) Details", value=False)
+                    with gr.Column(): # Sokoban is now after mario_plan
                         gr.Markdown("**📦 Sokoban**")
                         sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
                         sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
                         gr.Markdown("**🍬 Candy Crush**")
                         candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
                         candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
+                    # with gr.Column(): # Commented out Tetris (complete) UI
+                    #     gr.Markdown("**🎯 Tetris (complete)**")
+                    #     tetris_overall = gr.Checkbox(label="Tetris (complete) Score", value=True)
+                    #     tetris_details = gr.Checkbox(label="Tetris (complete) Details", value=False)
                     with gr.Column():
                         gr.Markdown("**📋 Tetris (planning)**")
                         tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
                 # Get initial leaderboard dataframe
                 initial_df = get_combined_leaderboard(rank_data, {
+                    # "Super Mario Bros": True, # Commented out
+                    "Super Mario Bros (planning only)": True,
                     "Sokoban": True,
                     "2048": True,
                     "Candy Crush": True,
+                    # "Tetris (complete)": True, # Commented out
                     "Tetris (planning only)": True,
                     "Ace Attorney": True
                 })
                 with gr.Row():
                     score_note = add_score_note()
+                # List of all checkboxes, including Super Mario Bros (planning only)
                 checkbox_list = [
+                    # mario_overall, mario_details, # Commented out
+                    mario_plan_overall, mario_plan_details,
                     sokoban_overall, sokoban_details,
                     _2048_overall, _2048_details,
                     candy_overall, candy_details,
+                    # tetris_overall, tetris_details, # Commented out
                     tetris_plan_overall, tetris_plan_details,
                     ace_attorney_overall, ace_attorney_details
                 ]
                 # Update visualizations when checkboxes change
                 def update_visualizations(*checkbox_states):
                     # Check if any details checkbox is selected
+                    # Adjusted indices due to addition of Super Mario (planning only)
                     is_details_view = any([
+                        checkbox_states[1], # Mario Plan details
+                        checkbox_states[3], # Sokoban details
+                        checkbox_states[5], # 2048 details
+                        checkbox_states[7], # Candy Crush details
+                        checkbox_states[9], # Tetris (planning only) details
+                        checkbox_states[11]  # Ace Attorney details
                     ])
                     # Update visibility of visualization blocks
                             leaderboard_df,
                             detailed_visualization,
                             radar_visualization,
+                            group_bar_visualization # RESTORED
                         ] + checkbox_list
                     )
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
+                        group_bar_visualization # RESTORED
                     ] + checkbox_list
                 )
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
+                        group_bar_visualization # RESTORED
                     ] + checkbox_list
                 )

assets/model_color.json CHANGED Viewed

@@ -1,12 +1,14 @@
 {
     "claude-3-7-sonnet-20250219": "#4A90E2",
-    "claude-3-7-sonnet-20250219(thinking)": "#2E5C8A",
     "claude-3-5-haiku-20241022": "#7FB5E6",
     "claude-3-5-sonnet-20241022": "#1A4C7C",
     "gemini-2.0-flash": "#FF4081",
     "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
     "gemini-2.5-pro-exp-03-25": "#FF80AB",
     "gemini-2.5-flash-preview-04-17": "#F06292",
     "gpt-4o-2024-11-20": "#00BFA5",
     "gpt-4.5-preview-2025-02-27": "#00796B",
     "gpt-4.1-2025-04-14": "#00897B",
@@ -17,7 +19,9 @@
     "o4-mini-2025-04-16": "#00ACC1",
     "grok-3-beta": "#FF7043",
     "grok-3-mini-beta": "#FF8A65",
     "deepseek-v3": "#FFC107",
     "deepseek-r1": "#FFA000",
-    "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA"
 }

 {
     "claude-3-7-sonnet-20250219": "#4A90E2",
+    "claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
     "claude-3-5-haiku-20241022": "#7FB5E6",
     "claude-3-5-sonnet-20241022": "#1A4C7C",
     "gemini-2.0-flash": "#FF4081",
     "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
     "gemini-2.5-pro-exp-03-25": "#FF80AB",
     "gemini-2.5-flash-preview-04-17": "#F06292",
+    "gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
+    "gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
     "gpt-4o-2024-11-20": "#00BFA5",
     "gpt-4.5-preview-2025-02-27": "#00796B",
     "gpt-4.1-2025-04-14": "#00897B",
     "o4-mini-2025-04-16": "#00ACC1",
     "grok-3-beta": "#FF7043",
     "grok-3-mini-beta": "#FF8A65",
+    "grok-3-mini-beta (thinking)": "#F57C00",
     "deepseek-v3": "#FFC107",
     "deepseek-r1": "#FFA000",
+    "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
+    "Random (x30)": "#9E9E9E"
 }

data_visualization.py CHANGED Viewed

@@ -56,76 +56,76 @@ def simplify_model_name(name):
     return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
 def create_horizontal_bar_chart(df, game_name):
-    if game_name == "Super Mario Bros":
-        score_col = "Score"
-        df_sorted = df.sort_values(by=score_col, ascending=True)
-    elif game_name == "Sokoban":
-        # Process Sokoban scores by splitting and getting max level
-        def get_max_level(levels_str):
-            try:
-                # Split by semicolon, strip whitespace, filter empty strings, convert to integers
-                levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
-                return max(levels) if levels else 0
-            except:
-                return 0
-        # Create a temporary column with max levels
-        df['Max Level'] = df['Levels Cracked'].apply(get_max_level)
-        df_sorted = df.sort_values(by='Max Level', ascending=True)
-        score_col = 'Max Level'
-    elif game_name == "2048":
-        score_col = "Score"
-        df_sorted = df.sort_values(by=score_col, ascending=True)
-    elif game_name == "Candy Crush":
-        score_col = "Average Score"
-        df_sorted = df.sort_values(by=score_col, ascending=True)
-    elif game_name in ["Tetris (complete)", "Tetris (planning only)"]:
-        score_col = "Score"
-        df_sorted = df.sort_values(by=score_col, ascending=True)
-    elif game_name == "Ace Attorney":
-        score_col = "Score"
-        df_sorted = df.sort_values(by=score_col, ascending=True)
-    else:
-        return None
-    x = df_sorted[score_col]
-    y = [f"{row['Player']} [{row['Organization']}]" for _, row in df_sorted.iterrows()]
-    colors = [MODEL_COLORS.get(row['Player'], '#808080') for _, row in df_sorted.iterrows()]
-    texts = [f"{v:.1f}" if game_name == "Candy Crush" else f"{int(v)}" for v in x]
-    fig = go.Figure(go.Bar(
-        x=x,
-        y=y,
-        orientation='h',
-        marker_color=colors,
-        text=texts,
-        textposition='auto',
-        hovertemplate='%{y}<br>Score: %{x}<extra></extra>'
-    ))
     fig.update_layout(
-        autosize=False,
-        width=1000,
-        height=600,
-        margin=dict(l=200, r=200, t=20, b=20),
         title=dict(
-            text=f"{game_name} Performance",
-            pad=dict(t=10),
-            font=dict(size=20)
         ),
-        yaxis=dict(automargin=True),
-        legend=dict(
-            font=dict(size=12),
-            itemsizing='trace',
-            x=1.1,
-            y=1,
-            xanchor='left',
-            yanchor='top',
-            bgcolor='rgba(255,255,255,0.6)',
-            bordercolor='gray',
-            borderwidth=1
-        )
     )
     return fig
 def create_radar_charts(df):
@@ -324,8 +324,10 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     # Format game names
     formatted_games = []
     for game in selected_games:
-        if game == 'Super Mario Bros':
             formatted_games.append('Super Mario')  # Simplified name
         else:
             formatted_games.append(game)  # Keep other names as is
@@ -387,7 +389,7 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     fig.update_layout(
         autosize=False,
         width=1000,
-        height=620,  # Increased height to accommodate legend
         margin=dict(l=400, r=200, t=20, b=20),
         title=dict(
             text="AI Normalized Performance Across Games",

     return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
 def create_horizontal_bar_chart(df, game_name):
     """Create a horizontal bar chart of player scores for one game.

     Args:
         df: Leaderboard DataFrame with at least "Player" and "Score"
             columns. The frame is not modified.
         game_name: Game label used in the chart title and in the
             placeholder messages.

     Returns:
         A plotly ``go.Figure``. When ``df`` is None or empty, a required
         column is missing, or no score can be parsed as a number, an empty
         figure whose title explains the problem is returned instead.
     """
     score_col = "Score"  # Standardized score column name

     if df is None or df.empty:
         # Return a placeholder figure if there's no data
         fig = go.Figure()
         fig.update_layout(
             title=f"No data available for {game_name}",
             xaxis_title="Score",
             yaxis_title="Player",
             plot_bgcolor='rgba(0,0,0,0)',
             paper_bgcolor='rgba(0,0,0,0)',
             font=dict(color='#2c3e50')
         )
         return fig

     # Both columns are read below; report a missing one instead of raising.
     for required in (score_col, "Player"):
         if required not in df.columns:
             fig = go.Figure()
             fig.update_layout(title=f"'{required}' column not found for {game_name}")
             return fig

     # Coerce scores on a new frame so the caller's DataFrame is left untouched,
     # then drop rows whose score could not be parsed.
     numeric_scores = pd.to_numeric(df[score_col], errors='coerce')
     df_cleaned = df.assign(**{score_col: numeric_scores}).dropna(subset=[score_col])

     if df_cleaned.empty:
         fig = go.Figure()
         fig.update_layout(title=f"No valid score data to plot for {game_name}")
         return fig

     # Plotly places the first y category at the bottom of the axis, so an
     # ascending sort puts the highest score at the top of the chart.
     df_sorted = df_cleaned.sort_values(by=score_col, ascending=True)

     fig = go.Figure(
         go.Bar(
             y=df_sorted['Player'],
             x=df_sorted[score_col],
             orientation='h',
             marker=dict(
                 color=df_sorted[score_col],
                 colorscale='Viridis',  # Example colorscale, can be changed
                 line=dict(color='#2c3e50', width=1)
             ),
             hovertext=df_sorted[score_col].round(2).astype(str) + ' points',
             hoverinfo='y+text'
         )
     )
     fig.update_layout(
         title=dict(
             text=f'{game_name} Scores',
             x=0.5,
             font=dict(size=20, color='#2c3e50')
         ),
         xaxis_title="Score",
         yaxis_title="Player",
         plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
         paper_bgcolor='rgba(0,0,0,0)',  # Transparent paper background
         font=dict(color='#2c3e50'),  # Dark text for readability on light backgrounds
         margin=dict(l=150, r=20, t=50, b=50),  # Leave room for player names
         yaxis=dict(
             automargin=True,
             tickfont=dict(size=10)
         ),
         xaxis=dict(gridcolor='#e0e0e0')  # Light gridlines for x-axis
     )
     return fig
 def create_radar_charts(df):
     # Format game names
     formatted_games = []
     for game in selected_games:
+        if game == 'Super Mario Bros (planning only)':
             formatted_games.append('Super Mario')  # Simplified name
+        elif game == 'Tetris (planning only)':
+            formatted_games.append('Tetris')
         else:
             formatted_games.append(game)  # Keep other names as is
     fig.update_layout(
         autosize=False,
         width=1000,
+        height=700,  # Increased height to accommodate legend
         margin=dict(l=400, r=200, t=20, b=20),
         title=dict(
             text="AI Normalized Performance Across Games",

leaderboard_utils.py CHANGED Viewed

@@ -4,11 +4,12 @@ import numpy as np
 # Define game order
 GAME_ORDER = [
-    "Super Mario Bros",
     "Sokoban",
     "2048",
     "Candy Crush",
-    "Tetris (complete)",
     "Tetris (planning only)",
     "Ace Attorney"
 ]
@@ -41,31 +42,86 @@ def get_mario_leaderboard(rank_data):
     })
     df["Organization"] = df["Player"].apply(get_organization)
     df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
     return df
 def get_sokoban_leaderboard(rank_data):
     data = rank_data.get("Sokoban", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
-        "model": "Player",
-        "levels_cracked": "Levels Cracked",
-        "steps": "Steps"
     })
     df["Organization"] = df["Player"].apply(get_organization)
-    df = df[["Player", "Organization", "Levels Cracked", "Steps"]]
     return df
 def get_2048_leaderboard(rank_data):
     data = rank_data.get("2048", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
-        "model": "Player",
-        "score": "Score",
-        "steps": "Steps",
-        "time": "Time"
     })
-    df["Organization"] = df["Player"].apply(get_organization)
-    df = df[["Player", "Organization", "Score", "Steps", "Time"]]
     return df
 def get_candy_leaderboard(rank_data):
@@ -73,12 +129,18 @@ def get_candy_leaderboard(rank_data):
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
-        "score_runs": "Score Runs",
-        "average_score": "Average Score",
-        "steps": "Steps"
     })
     df["Organization"] = df["Player"].apply(get_organization)
-    df = df[["Player", "Organization", "Score Runs", "Average Score", "Steps"]]
     return df
 def get_tetris_leaderboard(rank_data):
@@ -98,11 +160,19 @@ def get_tetris_planning_leaderboard(rank_data):
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
-        "score": "Score",
-        "steps_blocks": "Steps"
     })
     df["Organization"] = df["Player"].apply(get_organization)
-    df = df[["Player", "Organization", "Score", "Steps"]]
     return df
 def get_ace_attorney_leaderboard(rank_data):
@@ -110,14 +180,41 @@ def get_ace_attorney_leaderboard(rank_data):
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
-        "levels_cracked": "Levels Cracked",
-        "lives_left": "Lives Left",
-        "cracked_details": "Progress",
         "score": "Score",
-        "note": "Notes"
     })
     df["Organization"] = df["Player"].apply(get_organization)
-    df = df[["Player", "Organization", "Levels Cracked", "Lives Left", "Progress", "Score", "Notes"]]
     return df
 def calculate_rank_and_completeness(rank_data, selected_games):
@@ -125,16 +222,18 @@ def calculate_rank_and_completeness(rank_data, selected_games):
     game_dfs = {}
     # Get DataFrames for selected games
-    if selected_games.get("Super Mario Bros"):
-        game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
         game_dfs["2048"] = get_2048_leaderboard(rank_data)
     if selected_games.get("Candy Crush"):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
-    if selected_games.get("Tetris (complete)"):
-        game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
@@ -163,29 +262,22 @@ def calculate_rank_and_completeness(rank_data, selected_games):
                 if player in df["Player"].values:
                     games_played += 1
                     # Get player's score based on game type
-                    if game == "Super Mario Bros":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Sokoban":
-                        # Parse Sokoban score string and get maximum level
-                        levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
-                        try:
-                            # Split by semicolon, strip whitespace, filter empty strings, convert to integers
-                            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
-                            player_score = max(levels) if levels else 0
-                        except:
-                            player_score = 0
-                        # Calculate rank based on maximum level
-                        rank = len(df[df["Levels Cracked"].apply(
-                            lambda x: max([int(y.strip()) for y in x.split(";") if y.strip()]) > player_score
-                        )]) + 1
                     elif game == "2048":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Candy Crush":
-                        player_score = df[df["Player"] == player]["Average Score"].iloc[0]
-                        rank = len(df[df["Average Score"] > player_score]) + 1
-                    elif game in ["Tetris (complete)", "Tetris (planning only)"]:
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Ace Attorney":
@@ -197,12 +289,12 @@ def calculate_rank_and_completeness(rank_data, selected_games):
                 else:
                     player_data[f"{game} Score"] = 'n/a'
-        # Calculate average rank and completeness for sorting only
         if ranks:
-            player_data["Sort Rank"] = round(np.mean(ranks), 2)
             player_data["Games Played"] = games_played
         else:
-            player_data["Sort Rank"] = float('inf')
             player_data["Games Played"] = 0
         results.append(player_data)
@@ -210,13 +302,13 @@ def calculate_rank_and_completeness(rank_data, selected_games):
     # Create DataFrame and sort by average rank and completeness
     df_results = pd.DataFrame(results)
     if not df_results.empty:
-        # Sort by average rank (ascending) and completeness (descending)
         df_results = df_results.sort_values(
-            by=["Sort Rank", "Games Played"],
             ascending=[True, False]
         )
         # Drop the sorting columns
-        df_results = df_results.drop(["Sort Rank", "Games Played"], axis=1)
     return df_results
@@ -235,16 +327,18 @@ def get_combined_leaderboard(rank_data, selected_games):
     game_dfs = {}
     # Get DataFrames for selected games
-    if selected_games.get("Super Mario Bros"):
-        game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
         game_dfs["2048"] = get_2048_leaderboard(rank_data)
     if selected_games.get("Candy Crush"):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
-    if selected_games.get("Tetris (complete)"):
-        game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
@@ -269,21 +363,17 @@ def get_combined_leaderboard(rank_data, selected_games):
             if game in game_dfs:
                 df = game_dfs[game]
                 if player in df["Player"].values:
-                    if game == "Super Mario Bros":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Sokoban":
-                        # Parse Sokoban score string and get maximum level
-                        levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
-                        try:
-                            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
-                            player_data[f"{game} Score"] = max(levels) if levels else 0
-                        except:
-                            player_data[f"{game} Score"] = 0
                     elif game == "2048":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Candy Crush":
-                        player_data[f"{game} Score"] = df[df["Player"] == player]["Average Score"].iloc[0]
-                    elif game in ["Tetris (complete)", "Tetris (planning only)"]:
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Ace Attorney":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]

 # Define game order
 GAME_ORDER = [
+    # "Super Mario Bros", # Commented out
+    "Super Mario Bros (planning only)",
     "Sokoban",
     "2048",
     "Candy Crush",
+    # "Tetris (complete)", # Commented out
     "Tetris (planning only)",
     "Ace Attorney"
 ]
     })
     df["Organization"] = df["Player"].apply(get_organization)
     df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
+    if "Score" in df.columns:
+        df = df.sort_values("Score", ascending=False)
     return df
 def get_sokoban_leaderboard(rank_data):
     """Build the Sokoban leaderboard table from the raw ranking data.

     Args:
         rank_data: Mapping of game name to a dict whose "results" entry is a
             list of per-model result dicts.

     Returns:
         A DataFrame with the columns among "Player", "Organization", "Score",
         "Levels Cracked", "Detail Box On Target" and "Steps" that are present
         in the data, sorted by numeric "Score" in descending order. If the
         results carry no "model" field (including the no-results case), an
         empty DataFrame with all of those columns is returned.
     """
     columns_to_keep = [
         "Player", "Organization", "Score",
         "Levels Cracked", "Detail Box On Target", "Steps",
     ]
     results = rank_data.get("Sokoban", {}).get("results", [])
     df = pd.DataFrame(results).rename(columns={
         "model": "Player",
         "score": "Score",
         "steps": "Steps",
         "detail_box_on_target": "Detail Box On Target",
         "cracked_levels": "Levels Cracked",
     })

     # Without a "Player" column there is nothing to rank; hand back an empty
     # table instead of raising KeyError on df["Player"].
     if "Player" not in df.columns:
         return pd.DataFrame(columns=columns_to_keep)

     df["Organization"] = df["Player"].apply(get_organization)

     # Keep only the columns that exist after renaming. The explicit copy makes
     # the Score assignment below operate on an independent frame.
     present = [col for col in columns_to_keep if col in df.columns]
     df = df[present].copy()

     if "Score" in df.columns:
         df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
         df = df.sort_values("Score", ascending=False)
     return df
 def get_2048_leaderboard(rank_data):
     """Build the 2048 leaderboard table from the raw ranking data.

     Args:
         rank_data: Mapping of game name to a dict whose "results" entry is a
             list of per-model result dicts.

     Returns:
         A DataFrame with exactly the columns "Player", "Organization",
         "Score", "Highest Tail" and "Details", sorted by numeric "Score" in
         descending order. Columns absent from the data are filled with NaN;
         "Organization" falls back to "unknown" when there is no "Player"
         column to derive it from.
     """
     columns_to_keep = ["Player", "Organization", "Score", "Highest Tail", "Details"]
     results = rank_data.get("2048", {}).get("results", [])
     df = pd.DataFrame(results).rename(columns={
         "model": "Player",
         "score": "Score",
         "details": "Details",
         "highest_tail": "Highest Tail",
     })

     if "Player" in df.columns:
         df["Organization"] = df["Player"].apply(get_organization)
     else:
         # No "model" field in the data: keep the column with a fallback value.
         df["Organization"] = "unknown"

     # Guarantee every expected column exists so the selection below cannot fail.
     for column in columns_to_keep:
         if column not in df.columns:
             df[column] = np.nan

     # Copy the selection so the Score assignment targets an independent frame.
     df = df[columns_to_keep].copy()
     df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
     return df.sort_values("Score", ascending=False)
 def get_candy_leaderboard(rank_data):
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
+        "score": "Score",
+        "details": "Details"
     })
     df["Organization"] = df["Player"].apply(get_organization)
+    columns_to_keep = ["Player", "Organization", "Score", "Details"]
+    df_columns = [col for col in columns_to_keep if col in df.columns]
+    df = df[df_columns]
+    if "Score" in df.columns:
+        df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
+        df = df.sort_values("Score", ascending=False)
     return df
 def get_tetris_leaderboard(rank_data):
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
+        "score": "Score",       # From new JSON structure
+        "details": "Details"    # From new JSON structure
+        # Old fields like "steps_blocks", "rank" are removed
     })
     df["Organization"] = df["Player"].apply(get_organization)
+    columns_to_keep = ["Player", "Organization", "Score", "Details"]
+    df_columns = [col for col in columns_to_keep if col in df.columns]
+    df = df[df_columns]
+    if "Score" in df.columns:
+        df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
+        df = df.sort_values("Score", ascending=False)
     return df
 def get_ace_attorney_leaderboard(rank_data):
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
         "score": "Score",
+        "progress": "Progress",
+        "evaluator result": "Evaluator Result"
     })
     df["Organization"] = df["Player"].apply(get_organization)
+    # Define columns to keep, including Evaluator Result
+    columns_to_keep = ["Player", "Organization", "Score", "Progress", "Evaluator Result"]
+    # Filter to only columns that actually exist in the DataFrame after renaming
+    df_columns = [col for col in columns_to_keep if col in df.columns]
+    df = df[df_columns]
+    if "Score" in df.columns:
+        df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
+        df = df.sort_values("Score", ascending=False)  # Higher score is better
+    return df
def get_mario_planning_leaderboard(rank_data):
    """Build the "Super Mario Bros (planning only)" leaderboard table.

    Args:
        rank_data: Mapping of game name to a dict whose "results" entry is a
            list of per-model result dicts.

    Returns:
        A DataFrame with the columns among "Player", "Organization", "Score",
        "Progress" and "Detail Data" that are present in the data, sorted by
        numeric "Score" in descending order. If the results carry no "model"
        field (including the no-results case), an empty DataFrame with all of
        those columns is returned.
    """
    columns_to_keep = ["Player", "Organization", "Score", "Progress", "Detail Data"]
    results = rank_data.get("Super Mario Bros (planning only)", {}).get("results", [])
    df = pd.DataFrame(results).rename(columns={
        "model": "Player",
        "score": "Score",
        "detail_data": "Detail Data",
        "progress": "Progress",
    })

    # Without a "Player" column there is nothing to rank; hand back an empty
    # table instead of raising KeyError on df["Player"].
    if "Player" not in df.columns:
        return pd.DataFrame(columns=columns_to_keep)

    df["Organization"] = df["Player"].apply(get_organization)

    # Keep only the columns that exist after renaming. The explicit copy makes
    # the Score assignment below operate on an independent frame.
    present = [col for col in columns_to_keep if col in df.columns]
    df = df[present].copy()

    if "Score" in df.columns:
        df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
        df = df.sort_values("Score", ascending=False)
    return df
 def calculate_rank_and_completeness(rank_data, selected_games):
     game_dfs = {}
     # Get DataFrames for selected games
+    # if selected_games.get("Super Mario Bros"): # Commented out
+    #     game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
+    if selected_games.get("Super Mario Bros (planning only)"):
+        game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
         game_dfs["2048"] = get_2048_leaderboard(rank_data)
     if selected_games.get("Candy Crush"):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
+    # if selected_games.get("Tetris (complete)"): # Commented out
+    #     game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
                 if player in df["Player"].values:
                     games_played += 1
                     # Get player's score based on game type
+                    # if game == "Super Mario Bros": # Commented out
+                    #     player_score = df[df["Player"] == player]["Score"].iloc[0]
+                    #     rank = len(df[df["Score"] > player_score]) + 1
+                    if game == "Super Mario Bros (planning only)":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Sokoban":
+                        player_score = df[df["Player"] == player]["Score"].iloc[0]
+                        rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "2048":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Candy Crush":
+                        player_score = df[df["Player"] == player]["Score"].iloc[0]
+                        rank = len(df[df["Score"] > player_score]) + 1
+                    elif game in ["Tetris (planning only)"]:
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Ace Attorney":
                 else:
                     player_data[f"{game} Score"] = 'n/a'
+        # Calculate average rank and completeness for sorting
         if ranks:
+            player_data["Average Rank"] = round(np.mean(ranks), 2)
             player_data["Games Played"] = games_played
         else:
+            player_data["Average Rank"] = float('inf')
             player_data["Games Played"] = 0
         results.append(player_data)
     # Create DataFrame and sort by average rank and completeness
     df_results = pd.DataFrame(results)
     if not df_results.empty:
+        # Sort by average rank (ascending) and games played (descending)
         df_results = df_results.sort_values(
+            by=["Average Rank", "Games Played"],
             ascending=[True, False]
         )
         # Drop the sorting columns
+        df_results = df_results.drop(["Average Rank", "Games Played"], axis=1)
     return df_results
     game_dfs = {}
     # Get DataFrames for selected games
+    # if selected_games.get("Super Mario Bros"): # Commented out
+    #     game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
+    if selected_games.get("Super Mario Bros (planning only)"):
+        game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
         game_dfs["2048"] = get_2048_leaderboard(rank_data)
     if selected_games.get("Candy Crush"):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
+    # if selected_games.get("Tetris (complete)"): # Commented out
+    #     game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
             if game in game_dfs:
                 df = game_dfs[game]
                 if player in df["Player"].values:
+                    # if game == "Super Mario Bros": # Commented out
+                    #     player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                    if game == "Super Mario Bros (planning only)":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Sokoban":
+                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "2048":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Candy Crush":
+                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                    elif game in ["Tetris (planning only)"]:
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Ace Attorney":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]

rank_data_03_25_2025.json CHANGED Viewed

@@ -3,156 +3,200 @@
         "runs": 5,
         "results": [
             {
-                "model": "gpt-4.1-2025-04-14",
-                "score": 740,
                 "progress": "1-1",
-                "time_s": 68.6,
-                "rank": 1
             },
             {
-                "model": "claude-3-7-sonnet-20250219",
-                "score": 710,
                 "progress": "1-1",
-                "time_s": 64.2,
-                "rank": 2
             },
             {
                 "model": "gpt-4o-2024-11-20",
                 "score": 560,
                 "progress": "1-1",
-                "time_s": 58.6,
-                "rank": 3
             },
             {
                 "model": "gemini-2.0-flash",
                 "score": 320,
                 "progress": "1-1",
-                "time_s": 51.8,
-                "rank": 4
             },
             {
                 "model": "claude-3-5-haiku-20241022",
                 "score": 140,
                 "progress": "1-1",
-                "time_s": 76.4,
-                "rank": 5
             },
             {
                 "model": "gpt-4.5-preview-2025-02-27",
                 "score": 160,
                 "progress": "1-1",
-                "time_s": 62.8,
-                "rank": 6
             }
         ]
     },
-    "2048": {
-        "runs": 1,
         "results": [
             {
-                "model": "claude-3-7-sonnet-20250219(thinking)",
-                "score": 256,
-                "steps": 114,
-                "time": ">200",
-                "rank": 1
             },
             {
-                "model": "grok-3-mini-beta",
-                "score": 256,
-                "steps": 108,
-                "time": "58:09",
-                "rank": 1
             },
             {
                 "model": "o1-2024-12-17",
-                "score": 256,
-                "steps": 116,
-                "time": ">200",
-                "rank": 1
             },
             {
                 "model": "o3-2025-04-16",
-                "score": 256,
-                "steps": 108,
-                "time": "58:09",
-                "rank": 1
             },
             {
-                "model": "claude-3-7-sonnet-20250219",
-                "score": 256,
-                "steps": 130,
-                "time": "20:36",
-                "rank": 4
             },
             {
-                "model": "deepseek-v3",
-                "score": 256,
-                "steps": 216,
-                "time": "54.02",
-                "rank": 5
-            },
             {
-                "model": "gemini-2.5-flash-preview-04-17",
-                "score": 128,
-                "steps": 71,
-                "time": "41:42",
-                "rank": 6
             },
             {
-                "model": "gemini-2.0-flash",
-                "score": 128,
-                "steps": 111,
-                "time": "18:43",
-                "rank": 7
             },
             {
-                "model": "gemini-2.0-flash-thinking-exp-1219",
-                "score": 128,
-                "steps": 132,
-                "time": ">100",
-                "rank": 8
             },
             {
-                "model": "gemini-2.5-pro-exp-03-25",
-                "score": 128,
-                "steps": 138,
-                "time": "169",
-                "rank": 9
             },
             {
-                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
-                "score": 128,
-                "steps": 145,
-                "time": ">100",
-                "rank": 10
             },
             {
-                "model": "o4-mini-2025-04-16",
-                "score": 128,
-                "steps": "",
-                "time": "",
-                "rank": 11
             },
             {
-                "model": "claude-3-5-sonnet-20241022",
-                "score": 64,
-                "steps": 92,
-                "time": "9:2",
-                "rank": 13
             },
             {
-                "model": "gpt-4.5-preview-2025-02-27",
-                "score": 34,
-                "steps": 34,
-                "time": "8:25",
-                "rank": 14
             },
             {
                 "model": "gpt-4o-2024-11-20",
-                "score": 16,
-                "steps": 21,
-                "time": "1:17",
-                "rank": 15
             }
         ]
     },
@@ -189,28 +233,74 @@
         "runs": 3,
         "results": [
             {
-                "model": "claude-3-7-sonnet-20250219",
-                "score": 110,
-                "steps_blocks": 29,
-                "rank": 1
             },
             {
-                "model": "claude-3-5-haiku-20241022",
-                "score": 92,
-                "steps_blocks": 25,
-                "rank": 2
             },
             {
-                "model": "gemini-2.0-flash",
-                "score": 87,
-                "steps_blocks": 24,
-                "rank": 3
             },
             {
                 "model": "gpt-4o-2024-11-20",
-                "score": 56,
-                "steps_blocks": 20,
-                "rank": 4
             }
         ]
     },
@@ -218,102 +308,74 @@
         "runs": 3,
         "results": [
             {
-                "model": "o4-mini-2025-04-16",
-                "score_runs": "123,131",
-                "average_score": 127,
-                "steps": 25,
-                "rank": 1
             },
             {
-                "model": "o3-2025-04-16",
-                "score_runs": "115, 122",
-                "average_score": 118.5,
-                "steps": 25,
-                "rank": 2
             },
             {
-                "model": "o3-mini-2025-01-31(medium)",
-                "score_runs": "90;109;120",
-                "average_score": 106.33,
-                "steps": 25,
-                "rank": 3
             },
             {
-                "model": "grok-3-mini-beta",
-                "score_runs": "106",
-                "average_score": 106,
-                "steps": 25,
-                "rank": 4
             },
             {
-                "model": "o1-2024-12-17",
-                "score_runs": "96;114;83",
-                "average_score": 97.67,
-                "steps": 25,
-                "rank": 5
             },
             {
-                "model": "deepseek-r1",
-                "score_runs": "62;108;105",
-                "average_score": 91.67,
-                "steps": 25,
-                "rank": 6
             },
             {
-                "model": "gemini-2.5-flash-preview-04-17",
-                "score_runs": "59",
-                "average_score": 59,
-                "steps": 25,
-                "rank": 7
             },
             {
-                "model": "gemini-2.5-pro-exp-03-25",
-                "score_runs": "50;36;68",
-                "average_score": 51.33,
-                "steps": 25,
-                "rank": 8
             },
             {
-                "model": "claude-3-7-sonnet-20250219(thinking)",
-                "score_runs": "36;46;24",
-                "average_score": 35.33,
-                "steps": 25,
-                "rank": 9
             },
             {
-                "model": "gemini-2.0-flash-thinking-exp-1219",
-                "score_runs": "0;15;39",
-                "average_score": 18,
-                "steps": 25,
-                "rank": 10
             },
             {
-                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
-                "score_runs": "6;0;0",
-                "average_score": 2,
-                "steps": 25,
-                "rank": 11
             },
             {
-                "model": "gpt-4.1-2025-04-14",
-                "score_runs": "0;3;3",
-                "average_score": 2,
-                "steps": 25,
-                "rank": 12
             },
             {
-                "model": "claude-3-5-sonnet-20241022",
-                "score_runs": "3;0;0",
-                "average_score": 1,
-                "steps": 25,
-                "rank": 13
             },
             {
-                "model": "deepseek-v3",
-                "score_runs": "0;0;0",
-                "average_score": 0,
-                "steps": 25,
-                "rank": 14
             }
         ]
     },
@@ -321,216 +383,177 @@
         "runs": 3,
         "results": [
             {
-                "model": "o3-2025-04-16",
-                "levels_cracked": "5",
-                "steps": "[16, 40, 59, 110]",
-                "rank": 1
-            },
-            {
-                "model": "grok-3-mini-beta",
-                "levels_cracked": "3",
-                "steps": "[14, 36, 55, 78]",
-                "rank": 2
             },
             {
-                "model": "o3-mini-2025-01-31(medium)",
-                "levels_cracked": "2; 3; 2",
-                "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
-                "rank": 3
             },
             {
-                "model": "gemini-2.5-pro-exp-03-25",
-                "levels_cracked": "2;2;3",
-                "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
-                "rank": 4
-            },
-            {
-                "model": "gemini-2.5-flash-preview-04-17",
-                "levels_cracked": "2",
-                "steps": "[24, 50, 60]",
-                "rank": 5
             },
             {
-                "model": "o4-mini-2025-04-16",
-                "levels_cracked": "2",
-                "steps": "",
-                "rank": 6
             },
             {
-                "model": "claude-3-7-sonnet-20250219(thinking)",
-                "levels_cracked": "1; 2; 0",
-                "steps": "[17,35];[15,40,43];[4]",
-                "rank": 7
             },
             {
-                "model": "o1-2024-12-17",
-                "levels_cracked": "1; 1; 1",
-                "steps": null,
-                "rank": 8
             },
             {
-                "model": "deepseek-r1",
-                "levels_cracked": "1; 0; 1",
-                "steps": "[19,42];[13];[19,36]",
-                "note": "stuck",
-                "rank": 9
             },
             {
-                "model": "o1-mini-2024-09-12",
-                "levels_cracked": "0;1;0",
-                "steps": null,
-                "rank": 10
             },
             {
-                "model": "gemini-2.0-flash-thinking-exp-1219",
-                "levels_cracked": "0; 0; 0",
-                "steps": "[23]; [14]; [14]",
-                "rank": 11
             },
             {
-                "model": "gpt-4o-2024-11-20",
-                "levels_cracked": "0; 0; 0",
-                "steps": "[68];[105];[168]",
-                "note": "stuck in a loop",
-                "rank": 12
             },
             {
-                "model": "claude-3-5-sonnet-20241022",
-                "levels_cracked": "0; 0; 0",
-                "steps": "[21]; [30]; [51]",
-                "note": "stuck in a loop",
-                "rank": 13
             },
             {
-                "model": "deepseek-v3",
-                "levels_cracked": "0; 0; 0",
-                "steps": "[9]; [47]; [64]",
-                "rank": 14
             },
             {
-                "model": "gpt-4.1-2025-04-14",
-                "levels_cracked": "0; 0; 0",
-                "steps": "[9]; [47]; [64]",
-                "rank": 15
             },
             {
-                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
-                "levels_cracked": "0;0;0",
-                "steps": "[5]",
-                "rank": 17
             }
         ]
     },
     "Ace Attorney": {
-        "runs": 2,
         "results": [
             {
-                "model": "o1-2024-12-17",
-                "levels_cracked": "3; 3",
-                "lives_left": "[5, 3, 3, 0],[4, 5, 3, 0]",
-                "cracked_details": "4: 7/8",
-                "rank": 1,
-                "score": 26,
-                "note": "stuck at the end not present evidence"
             },
             {
-                "model": "o3-2025-04-16",
-                "levels_cracked": "3",
-                "lives_left": "[5, 3, 3, 0]",
-                "cracked_details": "4: 4/8",
-                "rank": 2,
-                "score": 23,
-                "note": "failed to present evidence"
-            },
-            {
-                "model": "gemini-2.5-pro-exp-03-25",
-                "levels_cracked": "2; 3",
-                "lives_left": "[5,5,0]; [5, 5, 4, 0]",
-                "cracked_details": "4: 0/8",
-                "rank": 3,
-                "score": 20,
-                "note": "failed to present evidence"
-            },
-            {
-                "model": "claude-3-7-sonnet-20250219(thinking)",
-                "levels_cracked": "1; 1",
-                "lives_left": "[3,0]; [5,0]",
-                "cracked_details": "2: 3/9",
-                "rank": 4,
-                "score": 8,
-                "note": "failed to present evidence"
             },
             {
-                "model": "grok-3-mini-beta",
-                "levels_cracked": "1",
-                "lives_left": "[3, 0]",
-                "cracked_details": "2: 2/9",
-                "rank": 5,
                 "score": 7,
-                "note": "failed to present evidence"
             },
             {
-                "model": "claude-3-5-sonnet-20241022",
-                "levels_cracked": "1",
-                "lives_left": "5, 5",
-                "cracked_details": "1:1/8",
-                "rank": 6,
-                "score": 6,
-                "note": "stuck in loop"
             },
             {
                 "model": "gpt-4.1-2025-04-14",
-                "levels_cracked": "1",
-                "lives_left": "[4,5]",
-                "cracked_details": "1: 1/8",
-                "rank": 7,
-                "score": 6,
-                "note": "stuck in loop"
-            },
-            {
-                "model": "gemini-2.5-flash-preview-04-17",
-                "levels_cracked": "0",
-                "lives_left": "0",
-                "cracked_details": "1: 4/5",
-                "rank": 8,
-                "score": 4,
-                "note": "stuck in the last option section"
             },
             {
-                "model": "gemini-2.0-flash-thinking-exp-1219",
-                "levels_cracked": "0",
-                "lives_left": "0",
-                "cracked_details": "1: 4/5",
-                "rank": 9,
-                "score": 4,
-                "note": "stuck in the last option section"
             },
             {
-                "model": "deepseek-r1",
-                "levels_cracked": "0",
-                "lives_left": "0",
-                "cracked_details": "1: 4/5",
-                "rank": 10,
-                "score": 4,
-                "note": "stuck in the 3rd evidence present"
             },
             {
                 "model": "o4-mini-2025-04-16",
-                "levels_cracked": "0",
-                "lives_left": "0",
-                "cracked_details": "1:1/5",
-                "rank": 11,
-                "score": 1,
-                "note": "failed to present evidence"
-            },
-            {
-                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
-                "levels_cracked": "0",
-                "lives_left": "0",
-                "cracked_details": "0:0/5",
-                "rank": 13,
                 "score": 0,
-                "note": "failed to present evidence"
             }
         ]
     }

         "runs": 5,
         "results": [
             {
+                "model": "claude-3-7-sonnet-20250219",
+                "score": 710,
                 "progress": "1-1",
+                "time_s": 64.2
             },
             {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 740,
                 "progress": "1-1",
+                "time_s": 68.6
             },
             {
                 "model": "gpt-4o-2024-11-20",
                 "score": 560,
                 "progress": "1-1",
+                "time_s": 58.6
             },
             {
                 "model": "gemini-2.0-flash",
                 "score": 320,
                 "progress": "1-1",
+                "time_s": 51.8
             },
             {
                 "model": "claude-3-5-haiku-20241022",
                 "score": 140,
                 "progress": "1-1",
+                "time_s": 76.4
             },
             {
                 "model": "gpt-4.5-preview-2025-02-27",
                 "score": 160,
                 "progress": "1-1",
+                "time_s": 62.8
             }
         ]
     },
+    "Super Mario Bros (planning only)": {
+        "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 1267.7,
+                "detail_data": "709;1532;1562",
+                "progress": "1-1"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 1418.7,
+                "detail_data": "2015;709;1532",
+                "progress": "1-1"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 1385.0,
+                "detail_data": "1672;1266;1247",
+                "progress": "1-1"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 1498.3,
+                "detail_data": "1561;1271;1663",
+                "progress": "1-1"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 1468.7,
+                "detail_data": "898;2008;1500",
+                "progress": "1-1"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 2126.3,
+                "detail_data": "1531;722;4126",
+                "progress": "1-1"
+            },
+            {
+                "model": "gpt-4o-2024-11-20",
+                "score": 2047.3,
+                "detail_data": "2017;2590;1535",
+                "progress": "1-1"
             },
             {
                 "model": "o1-2024-12-17",
+                "score": 855,
+                "detail_data": "855",
+                "progress": "1-1"
             },
             {
                 "model": "o3-2025-04-16",
+                "score": 3445,
+                "detail_data": "3445",
+                "progress": "1-1"
             },
             {
+                "model": "o4-mini-2025-04-16",
+                "score": 1448.0,
+                "detail_data": "1525;1263;1556",
+                "progress": "1-1"
             },
             {
+                "model": "Random (x30)",
+                "score": 986.97,
+                "detail_data": "986.97",
+                "progress": "1-1"
+            }
+        ]
+    },
+    "2048": {
+        "runs": 3,
+        "results": [
             {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 108.2,
+                "details": "1352;2860;1532",
+                "highest_tail": 128
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 113.3,
+                "details": "2560;3224;2088",
+                "highest_tail": 256
             },
             {
+                "model": "deepseek-r1",
+                "score": 105.2,
+                "details": "700;1240;3680",
+                "highest_tail": 128
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 106.6,
+                "details": "1304;1316;2472",
+                "highest_tail": 256
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 117.3,
+                "details": "5300;2400;3060",
+                "highest_tail": 256
             },
             {
+                "model": "grok-3-mini-beta (thinking)",
+                "score": 118.6,
+                "details": "6412;2492;3204",
+                "highest_tail": 256
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 106,
+                "details": "1404;1272;2084",
+                "highest_tail": 128
             },
             {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 105.7,
+                "details": "1156;2664;1148",
+                "highest_tail": 128
             },
             {
                 "model": "gpt-4o-2024-11-20",
+                "score": 106.7,
+                "details": "1604;1284;2080",
+                "highest_tail": 256
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 128.9,
+                "details": "3132;2004;3136",
+                "highest_tail": 512
+            },
+            {
+                "model": "o1-mini-2024-09-12",
+                "score": 114.0,
+                "details": "21;86;37",
+                "highest_tail": 256
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 128.0,
+                "details": "7120",
+                "highest_tail": 512
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 120.6,
+                "details": "4928;5456;2912",
+                "highest_tail": 256
+            },
+            {
+                "model": "Random (x30)",
+                "score": 100.4,
+                "details": "",
+                "highest_tail": 128
             }
         ]
     },
         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 14.7,
+                "details": "16;14;14"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 16.3,
+                "details": "19;15;15"
             },
             {
+                "model": "deepseek-r1",
+                "score": 14.3,
+                "details": "15;14;14"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 16.3,
+                "details": "20;14;15"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 23.3,
+                "details": "23;23;24"
+            },
+            {
+                "model": "grok-3-mini-beta (thinking)",
+                "score": 21.3,
+                "details": "20;15;29"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 10.3,
+                "details": "9;10;12"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 13.7,
+                "details": "13;14;14"
             },
             {
                 "model": "gpt-4o-2024-11-20",
+                "score": 14,
+                "details": "18;11;13"
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 35,
+                "details": "35"
+            },
+            {
+                "model": "o1-mini-2024-09-12",
+                "score": 11.7,
+                "details": "11;11;13"
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 42,
+                "details": "42"
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 25.3,
+                "details": "22;35;19"
+            },
+            {
+                "model": "Random (x30)",
+                "score": 10.2,
+                "details": ""
             }
         ]
     },
         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 106,
+                "details": "92;165;61"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 484,
+                "details": "535;428;489"
             },
             {
+                "model": "deepseek-r1",
+                "score": 447.3,
+                "details": "409;436;497"
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 334.7,
+                "details": "259;372;373"
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 416.3,
+                "details": "411;414;424"
             },
             {
+                "model": "grok-3-mini-beta (thinking)",
+                "score": 254,
+                "details": "299;332;131"
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 128.7,
+                "details": "67;139;180"
             },
             {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 182,
+                "details": "163;215;168"
             },
             {
+                "model": "gpt-4o-2024-11-20",
+                "score": 147.3,
+                "details": "131;104;207"
             },
             {
+                "model": "o1-2024-12-17",
+                "score": 159,
+                "details": "159"
             },
             {
+                "model": "o1-mini-2024-09-12",
+                "score": 48,
+                "details": "21;86;37"
             },
             {
+                "model": "o3-2025-04-16",
+                "score": 647,
+                "details": "647"
             },
             {
+                "model": "o4-mini-2025-04-16",
+                "score": 487.3,
+                "details": "259;591;612"
             },
             {
+                "model": "Random (x30)",
+                "score": 116.5,
+                "details": ""
             }
         ]
     },
         "runs": 3,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 0,
+                "detail_box_on_target": "0;0;0",
+                "cracked_levels": "0;0;0"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 2.33,
+                "detail_box_on_target": "2;4;1",
+                "cracked_levels": "1;2;0"
             },
             {
+                "model": "deepseek-r1",
+                "score": 1.33,
+                "detail_box_on_target": "2;0;2",
+                "cracked_levels": "1;0;1"
             },
             {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 1.67,
+                "detail_box_on_target": "3;0;2",
+                "cracked_levels": "2;0;1"
             },
             {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 4.33,
+                "detail_box_on_target": "4;4;5",
+                "cracked_levels": "2;2;3"
             },
             {
+                "model": "grok-3-mini-beta (thinking)",
+                "score": 5.67,
+                "detail_box_on_target": "5;6;6",
+                "cracked_levels": "3;3;3"
             },
             {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 0,
+                "detail_box_on_target": "0;0;0",
+                "cracked_levels": "0;0;0"
             },
             {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 0,
+                "detail_box_on_target": "0;0;0",
+                "cracked_levels": "0;0;0"
             },
             {
+                "model": "gpt-4o-2024-11-20",
+                "score": 0,
+                "detail_box_on_target": "0;0;0",
+                "cracked_levels": "0;0;0"
             },
             {
+                "model": "o1-2024-12-17",
+                "score": 2.33,
+                "detail_box_on_target": "2;2;3",
+                "cracked_levels": "1;1;2"
             },
             {
+                "model": "o1-mini-2024-09-12",
+                "score": 1.33,
+                "detail_box_on_target": "1;2;1",
+                "cracked_levels": "0;1;0"
             },
             {
+                "model": "o3-2025-04-16",
+                "score": 8,
+                "detail_box_on_target": "10;6",
+                "cracked_levels": "5;3"
             },
             {
+                "model": "o4-mini-2025-04-16",
+                "score": 5.33,
+                "detail_box_on_target": "4;6;6",
+                "cracked_levels": "2;2;3"
             },
             {
+                "model": "Random (x30)",
+                "score": 0,
+                "detail_box_on_target": "0,0,0",
+                "cracked_levels": "0,0,0"
             }
         ]
     },
     "Ace Attorney": {
+        "runs": 1,
         "results": [
             {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 2,
+                "progress": "1:2/5",
+                "evaluator result": "1/3"
             },
             {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 7,
+                "progress": "2:2/9",
+                "evaluator result": "5/11"
             },
             {
+                "model": "deepseek-r1",
+                "score": 0,
+                "progress": "0",
+                "evaluator result": "1/5"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 4,
+                "progress": "1:4/5",
+                "evaluator result": "1/7"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 7,
+                "progress": "2:2/9",
+                "evaluator result": "2/3"
             },
             {
+                "model": "grok-3-mini-beta (thinking)",
+                "score": 0,
+                "progress": "0",
+                "evaluator result": "0"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 0,
+                "progress": "0",
+                "evaluator result": "0"
             },
             {
                 "model": "gpt-4.1-2025-04-14",
+                "score": 2,
+                "progress": "1:2/5",
+                "evaluator result": "2/3"
             },
             {
+                "model": "gpt-4o-2024-11-20",
+                "score": 0,
+                "progress": "0",
+                "evaluator result": "0"
             },
             {
+                "model": "o1-2024-12-17",
+                "score": 16,
+                "progress": "3: 2/8",
+                "evaluator result": "6/11"
+            },
+            {
+                "model": "o1-mini-2024-09-12",
+                "score": 0,
+                "progress": "0",
+                "evaluator result": "1/5"
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 16,
+                "progress": "3: 2/8",
+                "evaluator result": "1/2"
             },
             {
                 "model": "o4-mini-2025-04-16",
+                "score": 4,
+                "progress": "1:4/5",
+                "evaluator result": "2/5"
+            },
+            {
+                "model": "Random (x30)",
                 "score": 0,
+                "progress": "0",
+                "evaluator result": "0"
             }
         ]
     }