lmgame_bench

Running

App Files Files Community

Yuxuan-Zhang-Dexter committed on Jun 9, 2025

Commit

3856741

1 Parent(s): 8290468

update agent and model leaderboard two tabs

Browse files

Files changed (6) hide show

app.py +325 -82
assets/model_color.json +38 -2
data_visualization.py +82 -59
leaderboard_utils.py +19 -46
rank_data_03_25_2025.json +266 -237
rank_single_model_03_25_2025.json +473 -0

app.py CHANGED Viewed

@@ -14,7 +14,6 @@ from leaderboard_utils import (
     get_sokoban_leaderboard,
     get_2048_leaderboard,
     get_candy_leaderboard,
-    get_tetris_leaderboard,
     get_tetris_planning_leaderboard,
     get_ace_attorney_leaderboard,
     get_combined_leaderboard,
@@ -22,11 +21,7 @@ from leaderboard_utils import (
 )
 from data_visualization import (
     get_combined_leaderboard_with_group_bar,
-    create_organization_radar_chart,
-    create_top_players_radar_chart,
-    create_player_radar_chart,
     create_horizontal_bar_chart,
-    normalize_values,
     get_combined_leaderboard_with_single_radar
 )
 from gallery_tab import create_video_gallery
@@ -46,27 +41,31 @@ TIME_POINTS = {
 with open(TIME_POINTS["03/25/2025"], "r") as f:
     rank_data = json.load(f)
 # Add leaderboard state at the top level
 leaderboard_state = {
     "current_game": None,
     "previous_overall": {
         # "Super Mario Bros": True, # Commented out
-        "Super Mario Bros (planning only)": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
-        # "Tetris (complete)", # Commented out
-        "Tetris (planning only)": True,
         "Ace Attorney": True
     },
     "previous_details": {
         # "Super Mario Bros": False, # Commented out
-        "Super Mario Bros (planning only)": False,
         "Sokoban": False,
         "2048": False,
         "Candy Crush": False,
-        # "Tetris (complete)": False, # Commented out
-        "Tetris (planning only)": False,
         "Ace Attorney": False
     }
 }
@@ -184,29 +183,34 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
                        candy_overall, candy_details,
                        # tetris_overall, tetris_details, # Commented out
                        tetris_plan_overall, tetris_plan_details,
-                       ace_attorney_overall, ace_attorney_details):
     global leaderboard_state
     # Convert current checkbox states to dictionary for easier comparison
     current_overall = {
         # "Super Mario Bros": mario_overall, # Commented out
-        "Super Mario Bros (planning only)": mario_plan_overall,
         "Sokoban": sokoban_overall,
         "2048": _2048_overall,
         "Candy Crush": candy_overall,
-        # "Tetris (complete)": tetris_overall, # Commented out
-        "Tetris (planning only)": tetris_plan_overall,
         "Ace Attorney": ace_attorney_overall
     }
     current_details = {
         # "Super Mario Bros": mario_details, # Commented out
-        "Super Mario Bros (planning only)": mario_plan_details,
         "Sokoban": sokoban_details,
         "2048": _2048_details,
         "Candy Crush": candy_details,
-        # "Tetris (complete)": tetris_details, # Commented out
-        "Tetris (planning only)": tetris_plan_details,
         "Ace Attorney": ace_attorney_details
     }
@@ -289,12 +293,12 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
     # Build dictionary for selected games
     selected_games = {
         # "Super Mario Bros": current_overall["Super Mario Bros"], # Commented out
-        "Super Mario Bros (planning only)": current_overall["Super Mario Bros (planning only)"],
         "Sokoban": current_overall["Sokoban"],
         "2048": current_overall["2048"],
         "Candy Crush": current_overall["Candy Crush"],
-        # "Tetris (complete)": current_overall["Tetris (complete)"], # Commented out
-        "Tetris (planning only)": current_overall["Tetris (planning only)"],
         "Ace Attorney": current_overall["Ace Attorney"]
     }
@@ -302,19 +306,19 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
     if leaderboard_state["current_game"]:
         # For detailed view
         # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
-        #     df = get_mario_leaderboard(rank_data)
-        if leaderboard_state["current_game"] == "Super Mario Bros (planning only)":
-            df = get_mario_planning_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Sokoban":
-            df = get_sokoban_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "2048":
-            df = get_2048_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Candy Crush":
-            df = get_candy_leaderboard(rank_data)
-        elif leaderboard_state["current_game"] == "Tetris (planning only)":
-            df = get_tetris_planning_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Ace Attorney":
-            df = get_ace_attorney_leaderboard(rank_data)
         else: # Should not happen if current_game is one of the known games
             df = pd.DataFrame() # Empty df
@@ -324,18 +328,18 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
         group_bar_chart = chart
     else:
         # For overall view
-        df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
         display_df = prepare_dataframe_for_display(df)
-        _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
         chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
     # Return values, including all four plot placeholders
     return (update_df_with_height(display_df), chart, radar_chart, group_bar_chart,
-            current_overall["Super Mario Bros (planning only)"], current_details["Super Mario Bros (planning only)"],
             current_overall["Sokoban"], current_details["Sokoban"],
             current_overall["2048"], current_details["2048"],
             current_overall["Candy Crush"], current_details["Candy Crush"],
-            current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"],
             current_overall["Ace Attorney"], current_details["Ace Attorney"])
 def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # Commented out
@@ -352,7 +356,7 @@ def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # C
     if new_rank_data is not None:
         rank_data = new_rank_data
-    # Use the existing update_leaderboard function, including Super Mario (planning only)
     return update_leaderboard(# mario_overall, mario_details, # Commented out
                             mario_plan_overall, mario_plan_details, # Added
                             sokoban_overall, sokoban_details,
@@ -362,47 +366,63 @@ def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # C
                             tetris_plan_overall, tetris_plan_details,
                             ace_attorney_overall, ace_attorney_details)
 def get_initial_state():
     """Get the initial state for the leaderboard"""
     return {
         "current_game": None,
         "previous_overall": {
             # "Super Mario Bros": True, # Commented out
-            "Super Mario Bros (planning only)": True,
             "Sokoban": True,
             "2048": True,
             "Candy Crush": True,
-            # "Tetris (complete)", # Commented out
-            "Tetris (planning only)": True,
             "Ace Attorney": True
         },
         "previous_details": {
             # "Super Mario Bros": False, # Commented out
-            "Super Mario Bros (planning only)": False,
             "Sokoban": False,
             "2048": False,
             "Candy Crush": False,
-            # "Tetris (complete)": False, # Commented out
-            "Tetris (planning only)": False,
             "Ace Attorney": False
         }
     }
-def clear_filters():
     global leaderboard_state
     selected_games = {
-        "Super Mario Bros (planning only)": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
-        "Tetris (planning only)": True,
         "Ace Attorney": True
     }
-    df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
     display_df = prepare_dataframe_for_display(df)
-    _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
     leaderboard_state = get_initial_state()
@@ -412,7 +432,7 @@ def clear_filters():
             True, False,  # sokoban
             True, False,  # 2048
             True, False,  # candy
-            True, False,  # tetris plan
             True, False)  # ace attorney
 def create_timeline_slider():
@@ -527,7 +547,7 @@ def build_app():
     with gr.Blocks(css="""
         /* Fix for scrolling issues */
         html, body {
-            overflow-y: hidden !important;
             overflow-x: hidden !important;
             width: 100% !important;
             height: 100% !important;
@@ -750,18 +770,18 @@ def build_app():
                 let newContent = header.innerHTML;
-                // Format Super Mario Bros header
                 if (text.includes('Super Mario Bros')) {
                     newContent = newContent.replace(/Super\s+Mario\s+Bros/g, 'Super<br>Mario Bros');
                 }
-                // Format Tetris headers
-                if (text.includes('Tetris (complete)')) {
                     newContent = newContent.replace(/Tetris\s+\(complete\)/g, 'Tetris<br>(complete)');
                 }
-                if (text.includes('Tetris (planning only)')) {
-                    newContent = newContent.replace(/Tetris\s+\(planning\s+only\)/g, 'Tetris<br>(planning)');
                 }
                 // Format Candy Crush header
@@ -853,7 +873,7 @@ def build_app():
         """)
         with gr.Tabs():
-            with gr.Tab("🏆 Leaderboard"):
                 # Visualization section
                 with gr.Row():
                     gr.Markdown("### 📊 Data Visualization")
@@ -879,6 +899,17 @@ def build_app():
                                 )
                         # Comment out the Group Bar Chart tab
                         with gr.Tab("📊 Group Bar Chart"):
                             group_bar_visualization = gr.Plot(
                                 label="Comparative Analysis (Group Bar Chart)",
                                 elem_classes="visualization-container"
@@ -892,14 +923,14 @@ def build_app():
                 with gr.Row():
                     gr.Markdown("### 🎮 Game Selection")
                 with gr.Row():
-                    # with gr.Column(): # Commented out Super Mario Bros UI
                     #     gr.Markdown("**🎮 Super Mario Bros**")
-                    #     mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
-                    #     mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
-                    with gr.Column(): # Added Super Mario Bros (planning only) UI
-                        gr.Markdown("**📝 Super Mario Bros (planning only)**")
-                        mario_plan_overall = gr.Checkbox(label="Super Mario Bros (planning only) Score", value=True)
-                        mario_plan_details = gr.Checkbox(label="Super Mario Bros (planning only) Details", value=False)
                     with gr.Column(): # Sokoban is now after mario_plan
                         gr.Markdown("**📦 Sokoban**")
                         sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
@@ -912,14 +943,14 @@ def build_app():
                         gr.Markdown("**🍬 Candy Crush**")
                         candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
                         candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
-                    # with gr.Column(): # Commented out Tetris (complete) UI
-                    #     gr.Markdown("**🎯 Tetris (complete)**")
-                    #     tetris_overall = gr.Checkbox(label="Tetris (complete) Score", value=True)
-                    #     tetris_details = gr.Checkbox(label="Tetris (complete) Details", value=False)
                     with gr.Column():
-                        gr.Markdown("**📋 Tetris (planning)**")
-                        tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
-                        tetris_plan_details = gr.Checkbox(label="Tetris (planning) Details", value=False)
                     with gr.Column():
                         gr.Markdown("**⚖️ Ace Attorney**")
                         ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
@@ -945,12 +976,12 @@ def build_app():
                 # Get initial leaderboard dataframe
                 initial_df = get_combined_leaderboard(rank_data, {
                     # "Super Mario Bros": True, # Commented out
-                    "Super Mario Bros (planning only)": True,
                     "Sokoban": True,
                     "2048": True,
                     "Candy Crush": True,
-                    # "Tetris (complete)": True, # Commented out
-                    "Tetris (planning only)": True,
                     "Ace Attorney": True
                 })
@@ -985,7 +1016,7 @@ def build_app():
                 with gr.Row():
                     score_note = add_score_note()
-                # List of all checkboxes, including Super Mario Bros (planning only)
                 checkbox_list = [
                     # mario_overall, mario_details, # Commented out
                     mario_plan_overall, mario_plan_details,
@@ -1000,13 +1031,13 @@ def build_app():
                 # Update visualizations when checkboxes change
                 def update_visualizations(*checkbox_states):
                     # Check if any details checkbox is selected
-                    # Adjusted indices due to addition of Super Mario (planning only)
                     is_details_view = any([
                         checkbox_states[1], # Mario Plan details
                         checkbox_states[3], # Sokoban details
                         checkbox_states[5], # 2048 details
                         checkbox_states[7], # Candy Crush details
-                        checkbox_states[9], # Tetris (planning only) details
                         checkbox_states[11]  # Ace Attorney details
                     ])
@@ -1027,40 +1058,252 @@ def build_app():
                 # Update leaderboard and visualizations when checkboxes change
                 for checkbox in checkbox_list:
                     checkbox.change(
-                        update_leaderboard,
-                        inputs=checkbox_list,
                         outputs=[
                             leaderboard_df,
                             detailed_visualization,
                             radar_visualization,
-                            group_bar_visualization # RESTORED
                         ] + checkbox_list
                     )
                 # Update when clear button is clicked
                 clear_btn.click(
-                    clear_filters,
-                    inputs=[],
                     outputs=[
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
-                        group_bar_visualization # RESTORED
                     ] + checkbox_list
                 )
                 # Initialize the app
                 demo.load(
-                    fn=clear_filters,
                     inputs=[],
                     outputs=[
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
-                        group_bar_visualization # RESTORED
                     ] + checkbox_list
                 )
             with gr.Tab("🎥 Gallery"):
                 video_gallery = create_video_gallery()

     get_sokoban_leaderboard,
     get_2048_leaderboard,
     get_candy_leaderboard,
     get_tetris_planning_leaderboard,
     get_ace_attorney_leaderboard,
     get_combined_leaderboard,
 )
 from data_visualization import (
     get_combined_leaderboard_with_group_bar,
     create_horizontal_bar_chart,
     get_combined_leaderboard_with_single_radar
 )
 from gallery_tab import create_video_gallery
 with open(TIME_POINTS["03/25/2025"], "r") as f:
     rank_data = json.load(f)
+# Load the model leaderboard data
+with open("rank_single_model_03_25_2025.json", "r") as f:
+    model_rank_data = json.load(f)
 # Add leaderboard state at the top level
 leaderboard_state = {
     "current_game": None,
     "previous_overall": {
         # "Super Mario Bros": True, # Commented out
+        "Super Mario Bros": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
+        # "Tetris(complete)", # Commented out
+        "Tetris": True,
         "Ace Attorney": True
     },
     "previous_details": {
         # "Super Mario Bros": False, # Commented out
+        "Super Mario Bros": False,
         "Sokoban": False,
         "2048": False,
         "Candy Crush": False,
+        # "Tetris(complete)": False, # Commented out
+        "Tetris": False,
         "Ace Attorney": False
     }
 }
                        candy_overall, candy_details,
                        # tetris_overall, tetris_details, # Commented out
                        tetris_plan_overall, tetris_plan_details,
+                       ace_attorney_overall, ace_attorney_details,
+                       top_n=10,
+                       data_source=None):
     global leaderboard_state
+    # Use provided data source or default to rank_data
+    data = data_source if data_source is not None else rank_data
     # Convert current checkbox states to dictionary for easier comparison
     current_overall = {
         # "Super Mario Bros": mario_overall, # Commented out
+        "Super Mario Bros": mario_plan_overall,
         "Sokoban": sokoban_overall,
         "2048": _2048_overall,
         "Candy Crush": candy_overall,
+        # "Tetris(complete)": tetris_overall, # Commented out
+        "Tetris": tetris_plan_overall,
         "Ace Attorney": ace_attorney_overall
     }
     current_details = {
         # "Super Mario Bros": mario_details, # Commented out
+        "Super Mario Bros": mario_plan_details,
         "Sokoban": sokoban_details,
         "2048": _2048_details,
         "Candy Crush": candy_details,
+        # "Tetris(complete)": tetris_details, # Commented out
+        "Tetris": tetris_plan_details,
         "Ace Attorney": ace_attorney_details
     }
     # Build dictionary for selected games
     selected_games = {
         # "Super Mario Bros": current_overall["Super Mario Bros"], # Commented out
+        "Super Mario Bros": current_overall["Super Mario Bros"],
         "Sokoban": current_overall["Sokoban"],
         "2048": current_overall["2048"],
         "Candy Crush": current_overall["Candy Crush"],
+        # "Tetris(complete)": current_overall["Tetris(complete)"], # Commented out
+        "Tetris": current_overall["Tetris"],
         "Ace Attorney": current_overall["Ace Attorney"]
     }
     if leaderboard_state["current_game"]:
         # For detailed view
         # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
+        #     df = get_mario_leaderboard(data)
+        if leaderboard_state["current_game"] == "Super Mario Bros":
+            df = get_mario_planning_leaderboard(data)
         elif leaderboard_state["current_game"] == "Sokoban":
+            df = get_sokoban_leaderboard(data)
         elif leaderboard_state["current_game"] == "2048":
+            df = get_2048_leaderboard(data)
         elif leaderboard_state["current_game"] == "Candy Crush":
+            df = get_candy_leaderboard(data)
+        elif leaderboard_state["current_game"] == "Tetris":
+            df = get_tetris_planning_leaderboard(data)
         elif leaderboard_state["current_game"] == "Ace Attorney":
+            df = get_ace_attorney_leaderboard(data)
         else: # Should not happen if current_game is one of the known games
             df = pd.DataFrame() # Empty df
         group_bar_chart = chart
     else:
         # For overall view
+        df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
         display_df = prepare_dataframe_for_display(df)
+        _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
         chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
     # Return values, including all four plot placeholders
     return (update_df_with_height(display_df), chart, radar_chart, group_bar_chart,
+            current_overall["Super Mario Bros"], current_details["Super Mario Bros"],
             current_overall["Sokoban"], current_details["Sokoban"],
             current_overall["2048"], current_details["2048"],
             current_overall["Candy Crush"], current_details["Candy Crush"],
+            current_overall["Tetris"], current_details["Tetris"],
             current_overall["Ace Attorney"], current_details["Ace Attorney"])
 def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # Commented out
     if new_rank_data is not None:
         rank_data = new_rank_data
+    # Use the existing update_leaderboard function, including Super Mario
     return update_leaderboard(# mario_overall, mario_details, # Commented out
                             mario_plan_overall, mario_plan_details, # Added
                             sokoban_overall, sokoban_details,
                             tetris_plan_overall, tetris_plan_details,
                             ace_attorney_overall, ace_attorney_details)
+def get_total_model_count(data_source):
+    """Return the number of distinct "Player" values in the combined leaderboard.
+
+    Builds the combined leaderboard for ``data_source`` with all six games
+    enabled and counts the unique entries of its "Player" column. The result
+    is used as the upper bound of the "top N" sliders.
+
+    Args:
+        data_source: Rank data passed straight through to
+            ``get_combined_leaderboard`` (presumably the dict loaded from one
+            of the rank JSON files -- confirm against the caller).
+
+    Returns:
+        int: Count of unique values in the "Player" column.
+    """
+    # Enable every game so that no player is dropped by a game filter.
+    selected_games = {
+        "Super Mario Bros": True,
+        "Sokoban": True,
+        "2048": True,
+        "Candy Crush": True,
+        "Tetris": True,
+        "Ace Attorney": True
+    }
+    df = get_combined_leaderboard(data_source, selected_games)
+    return len(df["Player"].unique())
 def get_initial_state():
     """Get the initial state for the leaderboard"""
     return {
         "current_game": None,
         "previous_overall": {
             # "Super Mario Bros": True, # Commented out
+            "Super Mario Bros": True,
             "Sokoban": True,
             "2048": True,
             "Candy Crush": True,
+            # "Tetris(complete)", # Commented out
+            "Tetris": True,
             "Ace Attorney": True
         },
         "previous_details": {
             # "Super Mario Bros": False, # Commented out
+            "Super Mario Bros": False,
             "Sokoban": False,
             "2048": False,
             "Candy Crush": False,
+            # "Tetris(complete)": False, # Commented out
+            "Tetris": False,
             "Ace Attorney": False
         }
     }
+def clear_filters(top_n=10, data_source=None):
     global leaderboard_state
+    # Use provided data source or default to rank_data
+    data = data_source if data_source is not None else rank_data
     selected_games = {
+        "Super Mario Bros": True,
         "Sokoban": True,
         "2048": True,
         "Candy Crush": True,
+        "Tetris": True,
         "Ace Attorney": True
     }
+    df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
     display_df = prepare_dataframe_for_display(df)
+    _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
     leaderboard_state = get_initial_state()
             True, False,  # sokoban
             True, False,  # 2048
             True, False,  # candy
+            True, False,  # tetris
             True, False)  # ace attorney
 def create_timeline_slider():
     with gr.Blocks(css="""
         /* Fix for scrolling issues */
         html, body {
+            overflow-y: auto !important;
             overflow-x: hidden !important;
             width: 100% !important;
             height: 100% !important;
                 let newContent = header.innerHTML;
+                // Format Super Mario Brosheader
                 if (text.includes('Super Mario Bros')) {
                     newContent = newContent.replace(/Super\s+Mario\s+Bros/g, 'Super<br>Mario Bros');
                 }
+                // Format Tetrisheaders
+                if (text.includes('Tetris(complete)')) {
                     newContent = newContent.replace(/Tetris\s+\(complete\)/g, 'Tetris<br>(complete)');
                 }
+                if (text.includes('Tetris')) {
+                    newContent = newContent.replace(/Tetris\s+\(planning\s+only\)/g, 'Tetris');
                 }
                 // Format Candy Crush header
         """)
         with gr.Tabs():
+            with gr.Tab("🏆 Agent Leaderboard"):
                 # Visualization section
                 with gr.Row():
                     gr.Markdown("### 📊 Data Visualization")
                                 )
                         # Comment out the Group Bar Chart tab
                         with gr.Tab("📊 Group Bar Chart"):
+                            with gr.Row():
+                                # Calculate dynamic maximum based on total models
+                                agent_max_models = get_total_model_count(rank_data)
+                                top_n_slider = gr.Slider(
+                                    minimum=1,
+                                    maximum=agent_max_models,
+                                    step=1,
+                                    value=min(10, agent_max_models),
+                                    label=f"Number of Top Models to Display (max: {agent_max_models})",
+                                    elem_classes="top-n-slider"
+                                )
                             group_bar_visualization = gr.Plot(
                                 label="Comparative Analysis (Group Bar Chart)",
                                 elem_classes="visualization-container"
                 with gr.Row():
                     gr.Markdown("### 🎮 Game Selection")
                 with gr.Row():
+                    # with gr.Column(): # Commented out Super Mario Bros UI
                     #     gr.Markdown("**🎮 Super Mario Bros**")
+                    #     mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
+                    #     mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
+                    with gr.Column(): # Added Super Mario Bros UI
+                        gr.Markdown("**🎮 Super Mario Bros**")
+                        mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
+                        mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
                     with gr.Column(): # Sokoban is now after mario_plan
                         gr.Markdown("**📦 Sokoban**")
                         sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
                         gr.Markdown("**🍬 Candy Crush**")
                         candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
                         candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
+                    # with gr.Column(): # Commented out Tetris(complete) UI
+                    #     gr.Markdown("**🎯 Tetris(complete)**")
+                    #     tetris_overall = gr.Checkbox(label="Tetris(complete) Score", value=True)
+                    #     tetris_details = gr.Checkbox(label="Tetris(complete) Details", value=False)
                     with gr.Column():
+                        gr.Markdown("**🎯 Tetris**")
+                        tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
+                        tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
                     with gr.Column():
                         gr.Markdown("**⚖️ Ace Attorney**")
                         ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
                 # Get initial leaderboard dataframe
                 initial_df = get_combined_leaderboard(rank_data, {
                     # "Super Mario Bros": True, # Commented out
+                    "Super Mario Bros": True,
                     "Sokoban": True,
                     "2048": True,
                     "Candy Crush": True,
+                    # "Tetris(complete)": True, # Commented out
+                    "Tetris": True,
                     "Ace Attorney": True
                 })
                 with gr.Row():
                     score_note = add_score_note()
+                # List of all checkboxes, including Super Mario Bros
                 checkbox_list = [
                     # mario_overall, mario_details, # Commented out
                     mario_plan_overall, mario_plan_details,
                 # Update visualizations when checkboxes change
                 def update_visualizations(*checkbox_states):
                     # Check if any details checkbox is selected
+                    # Adjusted indices due to addition of Super Mario
                     is_details_view = any([
                         checkbox_states[1], # Mario Plan details
                         checkbox_states[3], # Sokoban details
                         checkbox_states[5], # 2048 details
                         checkbox_states[7], # Candy Crush details
+                        checkbox_states[9], # Tetris details
                         checkbox_states[11]  # Ace Attorney details
                     ])
                 # Update leaderboard and visualizations when checkboxes change
                 for checkbox in checkbox_list:
                     checkbox.change(
+                        lambda *args: update_leaderboard(*args, data_source=rank_data),
+                        inputs=checkbox_list + [top_n_slider],
                         outputs=[
                             leaderboard_df,
                             detailed_visualization,
                             radar_visualization,
+                            group_bar_visualization
                         ] + checkbox_list
                     )
+                # Update when top_n_slider changes
+                top_n_slider.change(
+                    lambda *args: update_leaderboard(*args, data_source=rank_data),
+                    inputs=checkbox_list + [top_n_slider],
+                    outputs=[
+                        leaderboard_df,
+                        detailed_visualization,
+                        radar_visualization,
+                        group_bar_visualization
+                    ] + checkbox_list
+                )
                 # Update when clear button is clicked
                 clear_btn.click(
+                    lambda *args: clear_filters(*args, data_source=rank_data),
+                    inputs=[top_n_slider],
                     outputs=[
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
+                        group_bar_visualization
                     ] + checkbox_list
                 )
                 # Initialize the app
                 demo.load(
+                    lambda: clear_filters(data_source=rank_data),
                     inputs=[],
                     outputs=[
                         leaderboard_df,
                         detailed_visualization,
                         radar_visualization,
+                        group_bar_visualization
                     ] + checkbox_list
                 )
+            with gr.Tab("🤖 Model Leaderboard"):
+                # Visualization section
+                with gr.Row():
+                    gr.Markdown("### 📊 Data Visualization")
+                # Detailed view visualization (single chart)
+                model_detailed_visualization = gr.Plot(
+                    label="Performance Visualization",
+                    visible=False,
+                    elem_classes="visualization-container"
+                )
+                with gr.Column(visible=True) as model_overall_visualizations:
+                    with gr.Tabs():
+                        with gr.Tab("📈 Radar Chart"):
+                            model_radar_visualization = gr.Plot(
+                                label="Comparative Analysis (Radar Chart)",
+                                elem_classes="visualization-container"
+                            )
+                            gr.Markdown(
+                                    "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
+                                    elem_classes="radar-tip"
+                                )
+                        with gr.Tab("📊 Group Bar Chart"):
+                            with gr.Row():
+                                # Calculate dynamic maximum based on total models
+                                model_max_models = get_total_model_count(model_rank_data)
+                                model_top_n_slider = gr.Slider(
+                                    minimum=1,
+                                    maximum=model_max_models,
+                                    step=1,
+                                    value=min(10, model_max_models),
+                                    label=f"Number of Top Models to Display (max: {model_max_models})",
+                                    elem_classes="top-n-slider"
+                                )
+                            model_group_bar_visualization = gr.Plot(
+                                label="Comparative Analysis (Group Bar Chart)",
+                                elem_classes="visualization-container"
+                            )
+                # Game selection section
+                with gr.Row():
+                    gr.Markdown("### 🎮 Game Selection")
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("**🎮 Super Mario Bros**")
+                        model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
+                        model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
+                    with gr.Column():
+                        gr.Markdown("**📦 Sokoban**")
+                        model_sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
+                        model_sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
+                    with gr.Column():
+                        gr.Markdown("**🔢 2048**")
+                        model_2048_overall = gr.Checkbox(label="2048 Score", value=True)
+                        model_2048_details = gr.Checkbox(label="2048 Details", value=False)
+                    with gr.Column():
+                        gr.Markdown("**🍬 Candy Crush**")
+                        model_candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
+                        model_candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
+                    with gr.Column():
+                        gr.Markdown("**🎯 Tetris**")
+                        model_tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
+                        model_tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
+                    with gr.Column():
+                        gr.Markdown("**⚖️ Ace Attorney**")
+                        model_ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
+                        model_ace_attorney_details = gr.Checkbox(label="Ace Attorney Details", value=False)
+                # Controls
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.Markdown("**⏰ Time Tracker**")
+                        model_timeline = create_timeline_slider()
+                    with gr.Column(scale=1):
+                        gr.Markdown("**🔄 Controls**")
+                        model_clear_btn = gr.Button("Reset Filters", variant="secondary")
+                # Leaderboard table
+                with gr.Row():
+                    gr.Markdown("### 📋 Detailed Results")
+                # Get initial leaderboard dataframe
+                model_initial_df = get_combined_leaderboard(model_rank_data, {
+                    "Super Mario Bros": True,
+                    "Sokoban": True,
+                    "2048": True,
+                    "Candy Crush": True,
+                    "Tetris": True,
+                    "Ace Attorney": True
+                })
+                # Format the DataFrame for display
+                model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
+                # Create a standard DataFrame component with enhanced styling
+                with gr.Row():
+                    model_leaderboard_df = gr.DataFrame(
+                        value=model_initial_display_df,
+                        interactive=True,
+                        elem_id="model-leaderboard-table",
+                        elem_classes="table-container",
+                        wrap=True,
+                        show_row_numbers=True,
+                        show_fullscreen_button=True,
+                        line_breaks=True,
+                        max_height=1000,
+                        show_search="search",
+                        column_widths=col_widths
+                    )
+                # Add the score note below the table
+                with gr.Row():
+                    model_score_note = add_score_note()
+                # List of all checkboxes for model leaderboard
+                model_checkbox_list = [
+                    model_mario_plan_overall, model_mario_plan_details,
+                    model_sokoban_overall, model_sokoban_details,
+                    model_2048_overall, model_2048_details,
+                    model_candy_overall, model_candy_details,
+                    model_tetris_plan_overall, model_tetris_plan_details,
+                    model_ace_attorney_overall, model_ace_attorney_details
+                ]
+                # Toggle between the single detailed plot and the overall (radar / group bar) charts column, based on the current checkbox values
+                def update_model_visualizations(*checkbox_states):
+                    # checkbox_states arrive in model_checkbox_list order (overall, details per game), so the odd indices are the "Details" checkboxes
+                    is_details_view = any([
+                        checkbox_states[1], # Super Mario Bros details
+                        checkbox_states[3], # Sokoban details
+                        checkbox_states[5], # 2048 details
+                        checkbox_states[7], # Candy Crush details
+                        checkbox_states[9], # Tetris details
+                        checkbox_states[11]  # Ace Attorney details
+                    ])
+                    # Show the detailed plot only when some details checkbox is ticked; otherwise show the overall charts column
+                    return {
+                        model_detailed_visualization: gr.update(visible=is_details_view),
+                        model_overall_visualizations: gr.update(visible=not is_details_view)
+                    }
+                # Add change event to all checkboxes
+                for checkbox in model_checkbox_list:
+                    checkbox.change(
+                        update_model_visualizations,
+                        inputs=model_checkbox_list,
+                        outputs=[model_detailed_visualization, model_overall_visualizations]
+                    )
+                # Update leaderboard and visualizations when checkboxes change
+                for checkbox in model_checkbox_list:
+                    checkbox.change(
+                        lambda *args: update_leaderboard(*args, data_source=model_rank_data),
+                        inputs=model_checkbox_list + [model_top_n_slider],
+                        outputs=[
+                            model_leaderboard_df,
+                            model_detailed_visualization,
+                            model_radar_visualization,
+                            model_group_bar_visualization
+                        ] + model_checkbox_list
+                    )
+                # Update when model top_n_slider changes
+                model_top_n_slider.change(
+                    lambda *args: update_leaderboard(*args, data_source=model_rank_data),
+                    inputs=model_checkbox_list + [model_top_n_slider],
+                    outputs=[
+                        model_leaderboard_df,
+                        model_detailed_visualization,
+                        model_radar_visualization,
+                        model_group_bar_visualization
+                    ] + model_checkbox_list
+                )
+                # Update when clear button is clicked
+                model_clear_btn.click(
+                    lambda *args: clear_filters(*args, data_source=model_rank_data),
+                    inputs=[model_top_n_slider],
+                    outputs=[
+                        model_leaderboard_df,
+                        model_detailed_visualization,
+                        model_radar_visualization,
+                        model_group_bar_visualization
+                    ] + model_checkbox_list
+                )
+                # Initialize the model leaderboard
+                demo.load(
+                    lambda: clear_filters(data_source=model_rank_data),
+                    inputs=[],
+                    outputs=[
+                        model_leaderboard_df,
+                        model_detailed_visualization,
+                        model_radar_visualization,
+                        model_group_bar_visualization
+                    ] + model_checkbox_list
+                )
             with gr.Tab("🎥 Gallery"):
                 video_gallery = create_video_gallery()

assets/model_color.json CHANGED Viewed

@@ -3,12 +3,16 @@
     "claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
     "claude-3-5-haiku-20241022": "#7FB5E6",
     "claude-3-5-sonnet-20241022": "#1A4C7C",
     "gemini-2.0-flash": "#FF4081",
     "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
     "gemini-2.5-pro-exp-03-25": "#FF80AB",
     "gemini-2.5-flash-preview-04-17": "#F06292",
     "gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
     "gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
     "gpt-4o-2024-11-20": "#00BFA5",
     "gpt-4.5-preview-2025-02-27": "#00796B",
     "gpt-4.1-2025-04-14": "#00897B",
@@ -21,7 +25,39 @@
     "grok-3-mini-beta": "#FF8A65",
     "grok-3-mini-beta (thinking)": "#F57C00",
     "deepseek-v3": "#FFC107",
-    "deepseek-r1": "#FFA000",
     "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
-    "Random (x30)": "#9E9E9E"
 }

     "claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
     "claude-3-5-haiku-20241022": "#7FB5E6",
     "claude-3-5-sonnet-20241022": "#1A4C7C",
+    "claude-opus-4-20250514": "#3A80D2",
+    "claude-sonnet-4-20250514": "#5A9FE2",
     "gemini-2.0-flash": "#FF4081",
     "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
     "gemini-2.5-pro-exp-03-25": "#FF80AB",
     "gemini-2.5-flash-preview-04-17": "#F06292",
     "gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
+    "gemini-2.5-flash-preview-05-20": "#F8BBD9",
     "gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
+    "gemini-2.5-pro-preview-06-05": "#EC407A",
     "gpt-4o-2024-11-20": "#00BFA5",
     "gpt-4.5-preview-2025-02-27": "#00796B",
     "gpt-4.1-2025-04-14": "#00897B",
     "grok-3-mini-beta": "#FF8A65",
     "grok-3-mini-beta (thinking)": "#F57C00",
     "deepseek-v3": "#FFC107",
+    "deepseek-r1-0120": "#FFA000",
+    "deepseek-r1-0528": "#FFB300",
     "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
+    "qwen3-235B-A22B-fp8": "#6A1B9A",
+    "random (x30)": "#9E9E9E",
+    "gamingagent + claude-3-7-sonnet-20250219": "#4A90E2",
+    "gamingagent + claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
+    "gamingagent + claude-3-5-haiku-20241022": "#7FB5E6",
+    "gamingagent + claude-3-5-sonnet-20241022": "#1A4C7C",
+    "gamingagent + claude-opus-4-20250514": "#3A80D2",
+    "gamingagent + claude-sonnet-4-20250514": "#5A9FE2",
+    "gamingagent + gemini-2.0-flash": "#FF4081",
+    "gamingagent + gemini-2.0-flash-thinking-exp-1219": "#C2185B",
+    "gamingagent + gemini-2.5-pro-exp-03-25": "#FF80AB",
+    "gamingagent + gemini-2.5-flash-preview-04-17": "#F06292",
+    "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
+    "gamingagent + gemini-2.5-flash-preview-05-20": "#F8BBD9",
+    "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
+    "gamingagent + gemini-2.5-pro-preview-06-05": "#EC407A",
+    "gamingagent + gpt-4o-2024-11-20": "#00BFA5",
+    "gamingagent + gpt-4.5-preview-2025-02-27": "#00796B",
+    "gamingagent + gpt-4.1-2025-04-14": "#00897B",
+    "gamingagent + o1-2024-12-17": "#4DB6AC",
+    "gamingagent + o1-mini-2024-09-12": "#26A69A",
+    "gamingagent + o3-mini-2025-01-31(medium)": "#80CBC4",
+    "gamingagent + o3-2025-04-16": "#26C6DA",
+    "gamingagent + o4-mini-2025-04-16": "#00ACC1",
+    "gamingagent + grok-3-beta": "#FF7043",
+    "gamingagent + grok-3-mini-beta": "#FF8A65",
+    "gamingagent + grok-3-mini-beta (thinking)": "#F57C00",
+    "gamingagent + deepseek-v3": "#FFC107",
+    "gamingagent + deepseek-r1-0120": "#FFA000",
+    "gamingagent + deepseek-r1-0528": "#FFB300",
+    "gamingagent + llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
+    "gamingagent + qwen3-235B-A22B-fp8": "#6A1B9A"
 }

data_visualization.py CHANGED Viewed

@@ -3,13 +3,6 @@ import numpy as np
 import pandas as pd
 import json
 from leaderboard_utils import (
-    get_organization,
-    get_mario_leaderboard,
-    get_sokoban_leaderboard,
-    get_2048_leaderboard,
-    get_candy_leaderboard,
-    get_tetris_leaderboard,
-    get_tetris_planning_leaderboard,
     get_combined_leaderboard,
     GAME_ORDER
 )
@@ -186,7 +179,7 @@ def get_combined_leaderboard_with_radar(rank_data, selected_games):
     df_viz = df.copy()
     return df, create_radar_charts(df_viz)
-def create_group_bar_chart(df):
     game_cols = {}
     for game in GAME_ORDER:
         col = f"{game} Score"
@@ -231,56 +224,89 @@ def create_group_bar_chart(df):
     # Create mapping from original to formatted names
     game_display_map = dict(zip(sorted_games, formatted_games))
-    # Group models by prefix, then sort alphabetically
-    model_groups = {}
-    for player in df["Player"].unique():
-        prefix = player.split('-')[0]
-        model_groups.setdefault(prefix, []).append(player)
-    ordered_players = []
-    for prefix in sorted(model_groups):
-        ordered_players.extend(sorted(model_groups[prefix]))
-    # Create one trace per player
     fig = go.Figure()
-    for player in ordered_players:
-        row = df[df["Player"] == player]
-        if row.empty:
-            continue
-        row = row.iloc[0]
         y_vals = []
-        has_data = False
         for game in sorted_games:
-            col = f"norm_{game} Score"
-            val = row.get(col, np.nan)
-            if not np.isnan(val):
-                has_data = True
-            y_vals.append(val if not np.isnan(val) else 0)
-        if not has_data:
-            continue
-        fig.add_trace(go.Bar(
-            name=row["Player"],
-            x=[game_display_map[game] for game in sorted_games],
-            y=y_vals,
-            marker_color=MODEL_COLORS.get(player, '#808080'),
-            hovertemplate="<b>%{fullData.name}</b><br>Score: %{y:.1f}<extra></extra>"
-        ))
     fig.update_layout(
-        autosize=False,
-        width=1000,
-        height=800,
-        margin=dict(l=200, r=200, t=20, b=20),
-        title=dict(text="Grouped Bar Chart of AI Models (Consistent Trace Grouping)", pad=dict(t=10)),
-        xaxis_title="Games",
         yaxis_title="Normalized Score",
         xaxis=dict(
             categoryorder='array',
-            categoryarray=[game_display_map[g] for g in sorted_games],
-            tickangle=0  # Keep text horizontal since we're using line breaks
         ),
         barmode='group',
         bargap=0.2,        # Gap between game categories
@@ -303,11 +329,11 @@ def create_group_bar_chart(df):
-def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
     df = get_combined_leaderboard(rank_data, selected_games)
     # Create a copy for visualization to avoid modifying the original
     df_viz = df.copy()
-    return df, create_group_bar_chart(df_viz)
 def hex_to_rgba(hex_color, alpha=0.2):
     hex_color = hex_color.lstrip('#')
@@ -324,10 +350,8 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     # Format game names
     formatted_games = []
     for game in selected_games:
-        if game == 'Super Mario Bros (planning only)':
-            formatted_games.append('Super Mario')  # Simplified name
-        elif game == 'Tetris (planning only)':
-            formatted_games.append('Tetris')
         else:
             formatted_games.append(game)  # Keep other names as is
@@ -387,10 +411,9 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
         ))
     fig.update_layout(
-        autosize=False,
-        width=1000,
-        height=700,  # Increased height to accommodate legend
-        margin=dict(l=400, r=200, t=20, b=20),
         title=dict(
             text="AI Normalized Performance Across Games",
             x=0.5,

 import pandas as pd
 import json
 from leaderboard_utils import (
     get_combined_leaderboard,
     GAME_ORDER
 )
     df_viz = df.copy()
     return df, create_radar_charts(df_viz)
+def create_group_bar_chart(df, top_n=10):
     game_cols = {}
     for game in GAME_ORDER:
         col = f"{game} Score"
     # Create mapping from original to formatted names
     game_display_map = dict(zip(sorted_games, formatted_games))
+    # For each game, get top performers and create combined x-axis categories
     fig = go.Figure()
+    all_x_categories = []
+    all_players = set()
+    unique_x_labels = []
+    # First pass: collect all players and create x-axis categories
+    game_rankings = {}
+    for game in sorted_games:
+        col = f"norm_{game} Score"
+        # Get valid scores for this game and sort by score (highest first)
+        game_data = df[df[col].notna()].copy()
+        game_data = game_data.sort_values(by=col, ascending=False)
+        # Store rankings for this game (limit to top_n)
+        game_rankings[game] = []
+        for i, (_, row) in enumerate(game_data.iterrows()):
+            if i >= top_n:  # Limit to top_n performers
+                break
+            player = row["Player"]
+            score = row[col]
+            rank = i + 1
+            x_category = f"{game_display_map[game]}<br>#{rank}"
+            game_rankings[game].append({
+                'player': player,
+                'score': score,
+                'x_category': x_category,
+                'rank': rank
+            })
+            all_x_categories.append(x_category)
+            all_players.add(player)
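+            # Show the game label only at the middle rank of the top_n window; a game with fewer than (top_n + 1) // 2 ranked players never reaches that rank and gets no label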
+            middle_position = (top_n + 1) // 2
+            if rank == middle_position:
+                # Special case: abbreviate "Super Mario Bros" to "SMB" on the axis
+                if game == "Super Mario Bros":
+                    unique_x_labels.append("SMB")
+                else:
+                    unique_x_labels.append(game_display_map[game])  # Show just game name without rank
+            else:
+                unique_x_labels.append("")  # Empty string for other ranks
+    # Second pass: create traces for each player
+    for player in sorted(all_players):
+        x_vals = []
         y_vals = []
         for game in sorted_games:
+            # Find this player's data for this game
+            player_data = None
+            for data in game_rankings[game]:
+                if data['player'] == player:
+                    player_data = data
+                    break
+            if player_data:
+                x_vals.append(player_data['x_category'])
+                y_vals.append(player_data['score'])
+        if x_vals:  # Only add trace if player has data
+            fig.add_trace(go.Bar(
+                name=player,
+                x=x_vals,
+                y=y_vals,
+                marker_color=MODEL_COLORS.get(player, '#808080'),
+                hovertemplate="<b>%{fullData.name}</b><br>Score: %{y:.1f}<extra></extra>"
+            ))
     fig.update_layout(
+        autosize=True,
+        height=550,
+        margin=dict(l=50, r=50, t=20, b=20),
+        title=dict(text=f"Grouped Bar Chart - Top {top_n} Performers by Game", pad=dict(t=10)),
+        xaxis_title="Games (Ranked by Performance)",
         yaxis_title="Normalized Score",
         xaxis=dict(
             categoryorder='array',
+            categoryarray=all_x_categories,
+            tickangle=0,  # Keep text horizontal since we're using line breaks
+            ticktext=unique_x_labels,  # Game name at each game's middle rank only; every other tick label is an empty string
+            tickvals=all_x_categories
         ),
         barmode='group',
         bargap=0.2,        # Gap between game categories
+def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=10):  # returns (combined leaderboard DataFrame, result of create_group_bar_chart)
     df = get_combined_leaderboard(rank_data, selected_games)
     # Hand the chart builder a copy so the DataFrame returned to the caller is not the object it works on
     df_viz = df.copy()
+    return df, create_group_bar_chart(df_viz, top_n)  # top_n caps how many top performers are charted per game
 def hex_to_rgba(hex_color, alpha=0.2):
     hex_color = hex_color.lstrip('#')
     # Format game names
     formatted_games = []
     for game in selected_games:
+        if game == 'Super Mario Bros':
+            formatted_games.append('SMB')  # Abbreviated display name for Super Mario Bros
         else:
             formatted_games.append(game)  # Keep other names as is
         ))
     fig.update_layout(
+        autosize=True,
+        height=550,  # Reduced height for better proportion with legend
+        margin=dict(l=400, r=100, t=20, b=20),
         title=dict(
             text="AI Normalized Performance Across Games",
             x=0.5,

leaderboard_utils.py CHANGED Viewed

@@ -5,12 +5,12 @@ import numpy as np
 # Define game order
 GAME_ORDER = [
     # "Super Mario Bros", # Commented out
-    "Super Mario Bros (planning only)",
     "Sokoban",
     "2048",
     "Candy Crush",
     # "Tetris (complete)", # Commented out
-    "Tetris (planning only)",
     "Ace Attorney"
 ]
@@ -31,20 +31,6 @@ def get_organization(model_name):
     else:
         return "unknown"
-def get_mario_leaderboard(rank_data):
-    data = rank_data.get("Super Mario Bros", {}).get("results", [])
-    df = pd.DataFrame(data)
-    df = df.rename(columns={
-        "model": "Player",
-        "progress": "Progress (current/total)",
-        "score": "Score",
-        "time_s": "Time (s)"
-    })
-    df["Organization"] = df["Player"].apply(get_organization)
-    df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
-    if "Score" in df.columns:
-        df = df.sort_values("Score", ascending=False)
-    return df
 def get_sokoban_leaderboard(rank_data):
     data = rank_data.get("Sokoban", {}).get("results", [])
@@ -143,20 +129,8 @@ def get_candy_leaderboard(rank_data):
         df = df.sort_values("Score", ascending=False)
     return df
-def get_tetris_leaderboard(rank_data):
-    data = rank_data.get("Tetris (complete)", {}).get("results", [])
-    df = pd.DataFrame(data)
-    df = df.rename(columns={
-        "model": "Player",
-        "score": "Score",
-        "steps_blocks": "Steps"
-    })
-    df["Organization"] = df["Player"].apply(get_organization)
-    df = df[["Player", "Organization", "Score", "Steps"]]
-    return df
 def get_tetris_planning_leaderboard(rank_data):
-    data = rank_data.get("Tetris (planning only)", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
@@ -181,13 +155,12 @@ def get_ace_attorney_leaderboard(rank_data):
     df = df.rename(columns={
         "model": "Player",
         "score": "Score",
-        "progress": "Progress",
-        "evaluator result": "Evaluator Result"
     })
     df["Organization"] = df["Player"].apply(get_organization)
-    # Define columns to keep, including Evaluator Result
-    columns_to_keep = ["Player", "Organization", "Score", "Progress", "Evaluator Result"]
     # Filter to only columns that actually exist in the DataFrame after renaming
     df_columns = [col for col in columns_to_keep if col in df.columns]
     df = df[df_columns]
@@ -198,7 +171,7 @@ def get_ace_attorney_leaderboard(rank_data):
     return df
 def get_mario_planning_leaderboard(rank_data):
-    data = rank_data.get("Super Mario Bros (planning only)", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
@@ -224,8 +197,8 @@ def calculate_rank_and_completeness(rank_data, selected_games):
     # Get DataFrames for selected games
     # if selected_games.get("Super Mario Bros"): # Commented out
     #     game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
-    if selected_games.get("Super Mario Bros (planning only)"):
-        game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
@@ -234,8 +207,8 @@ def calculate_rank_and_completeness(rank_data, selected_games):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
     # if selected_games.get("Tetris (complete)"): # Commented out
     #     game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
-    if selected_games.get("Tetris (planning only)"):
-        game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
         game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
@@ -265,7 +238,7 @@ def calculate_rank_and_completeness(rank_data, selected_games):
                     # if game == "Super Mario Bros": # Commented out
                     #     player_score = df[df["Player"] == player]["Score"].iloc[0]
                     #     rank = len(df[df["Score"] > player_score]) + 1
-                    if game == "Super Mario Bros (planning only)":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Sokoban":
@@ -277,7 +250,7 @@ def calculate_rank_and_completeness(rank_data, selected_games):
                     elif game == "Candy Crush":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
-                    elif game in ["Tetris (planning only)"]:
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Ace Attorney":
@@ -329,8 +302,8 @@ def get_combined_leaderboard(rank_data, selected_games):
     # Get DataFrames for selected games
     # if selected_games.get("Super Mario Bros"): # Commented out
     #     game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
-    if selected_games.get("Super Mario Bros (planning only)"):
-        game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
@@ -339,8 +312,8 @@ def get_combined_leaderboard(rank_data, selected_games):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
     # if selected_games.get("Tetris (complete)"): # Commented out
     #     game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
-    if selected_games.get("Tetris (planning only)"):
-        game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
         game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
@@ -365,7 +338,7 @@ def get_combined_leaderboard(rank_data, selected_games):
                 if player in df["Player"].values:
                     # if game == "Super Mario Bros": # Commented out
                     #     player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
-                    if game == "Super Mario Bros (planning only)":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Sokoban":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
@@ -373,7 +346,7 @@ def get_combined_leaderboard(rank_data, selected_games):
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Candy Crush":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
-                    elif game in ["Tetris (planning only)"]:
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Ace Attorney":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]

 # Define game order
 GAME_ORDER = [
     # "Super Mario Bros", # Commented out
+    "Super Mario Bros",
     "Sokoban",
     "2048",
     "Candy Crush",
     # "Tetris (complete)", # Commented out
+    "Tetris",
     "Ace Attorney"
 ]
     else:
         return "unknown"
 def get_sokoban_leaderboard(rank_data):
     data = rank_data.get("Sokoban", {}).get("results", [])
         df = df.sort_values("Score", ascending=False)
     return df
 def get_tetris_planning_leaderboard(rank_data):
+    data = rank_data.get("Tetris", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
     df = df.rename(columns={
         "model": "Player",
         "score": "Score",
+        "progress": "Progress"
     })
     df["Organization"] = df["Player"].apply(get_organization)
+    # Define columns to keep
+    columns_to_keep = ["Player", "Organization", "Score", "Progress"]
     # Filter to only columns that actually exist in the DataFrame after renaming
     df_columns = [col for col in columns_to_keep if col in df.columns]
     df = df[df_columns]
     return df
 def get_mario_planning_leaderboard(rank_data):
+    data = rank_data.get("Super Mario Bros", {}).get("results", [])
     df = pd.DataFrame(data)
     df = df.rename(columns={
         "model": "Player",
     # Get DataFrames for selected games
     # if selected_games.get("Super Mario Bros"): # Commented out
     #     game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
+    if selected_games.get("Super Mario Bros"):
+        game_dfs["Super Mario Bros"] = get_mario_planning_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
     # if selected_games.get("Tetris (complete)"): # Commented out
     #     game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
+    if selected_games.get("Tetris"):
+        game_dfs["Tetris"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
         game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
                     # if game == "Super Mario Bros": # Commented out
                     #     player_score = df[df["Player"] == player]["Score"].iloc[0]
                     #     rank = len(df[df["Score"] > player_score]) + 1
+                    if game == "Super Mario Bros":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Sokoban":
                     elif game == "Candy Crush":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
+                    elif game in ["Tetris"]:
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
                     elif game == "Ace Attorney":
     # Get DataFrames for selected games
     # if selected_games.get("Super Mario Bros"): # Commented out
     #     game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
+    if selected_games.get("Super Mario Bros"):
+        game_dfs["Super Mario Bros"] = get_mario_planning_leaderboard(rank_data)
     if selected_games.get("Sokoban"):
         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
     if selected_games.get("2048"):
         game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
     # if selected_games.get("Tetris (complete)"): # Commented out
     #     game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
+    if selected_games.get("Tetris"):
+        game_dfs["Tetris"] = get_tetris_planning_leaderboard(rank_data)
     if selected_games.get("Ace Attorney"):
         game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
                 if player in df["Player"].values:
                     # if game == "Super Mario Bros": # Commented out
                     #     player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                    if game == "Super Mario Bros":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Sokoban":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Candy Crush":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                    elif game in ["Tetris"]:
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                     elif game == "Ace Attorney":
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]

rank_data_03_25_2025.json CHANGED Viewed

@@ -1,112 +1,71 @@
 {
     "Super Mario Bros": {
-        "runs": 5,
-        "results": [
-            {
-                "model": "claude-3-7-sonnet-20250219",
-                "score": 710,
-                "progress": "1-1",
-                "time_s": 64.2
-            },
-            {
-                "model": "gpt-4.1-2025-04-14",
-                "score": 740,
-                "progress": "1-1",
-                "time_s": 68.6
-            },
-            {
-                "model": "gpt-4o-2024-11-20",
-                "score": 560,
-                "progress": "1-1",
-                "time_s": 58.6
-            },
-            {
-                "model": "gemini-2.0-flash",
-                "score": 320,
-                "progress": "1-1",
-                "time_s": 51.8
-            },
-            {
-                "model": "claude-3-5-haiku-20241022",
-                "score": 140,
-                "progress": "1-1",
-                "time_s": 76.4
-            },
-            {
-                "model": "gpt-4.5-preview-2025-02-27",
-                "score": 160,
-                "progress": "1-1",
-                "time_s": 62.8
-            }
-        ]
-    },
-    "Super Mario Bros (planning only)": {
         "runs": 3,
         "results": [
             {
-                "model": "claude-3-5-sonnet-20241022",
                 "score": 1267.7,
-                "detail_data": "709;1532;1562",
                 "progress": "1-1"
             },
             {
-                "model": "claude-3-7-sonnet-20250219 (thinking)",
                 "score": 1418.7,
-                "detail_data": "2015;709;1532",
                 "progress": "1-1"
             },
             {
-                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 1385.0,
-                "detail_data": "1672;1266;1247",
                 "progress": "1-1"
             },
             {
-                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 1498.3,
-                "detail_data": "1561;1271;1663",
                 "progress": "1-1"
             },
             {
-                "model": "llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 1468.7,
-                "detail_data": "898;2008;1500",
                 "progress": "1-1"
             },
             {
-                "model": "gpt-4.1-2025-04-14",
                 "score": 2126.3,
-                "detail_data": "1531;722;4126",
                 "progress": "1-1"
             },
             {
-                "model": "gpt-4o-2024-11-20",
                 "score": 2047.3,
-                "detail_data": "2017;2590;1535",
                 "progress": "1-1"
             },
             {
-                "model": "o1-2024-12-17",
                 "score": 855,
-                "detail_data": "855",
                 "progress": "1-1"
             },
             {
-                "model": "o3-2025-04-16",
                 "score": 3445,
-                "detail_data": "3445",
                 "progress": "1-1"
             },
             {
-                "model": "o4-mini-2025-04-16",
                 "score": 1448.0,
-                "detail_data": "1525;1263;1556",
                 "progress": "1-1"
             },
             {
-                "model": "Random (x30)",
                 "score": 986.97,
-                "detail_data": "986.97",
                 "progress": "1-1"
             }
         ]
@@ -115,192 +74,207 @@
         "runs": 3,
         "results": [
             {
-                "model": "claude-3-5-sonnet-20241022",
-                "score": 108.2,
-                "details": "1352;2860;1532",
-                "highest_tail": 128
             },
             {
-                "model": "claude-3-7-sonnet-20250219 (thinking)",
-                "score": 113.3,
-                "details": "2560;3224;2088",
                 "highest_tail": 256
             },
             {
-                "model": "deepseek-r1",
-                "score": 105.2,
-                "details": "700;1240;3680",
-                "highest_tail": 128
             },
             {
-                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
-                "score": 106.6,
-                "details": "1304;1316;2472",
                 "highest_tail": 256
             },
             {
-                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
-                "score": 117.3,
-                "details": "5300;2400;3060",
-                "highest_tail": 256
             },
             {
-                "model": "grok-3-mini-beta (thinking)",
-                "score": 118.6,
-                "details": "6412;2492;3204",
-                "highest_tail": 256
             },
             {
-                "model": "llama-4-maverick-17b-128e-instruct-fp8",
-                "score": 106,
-                "details": "1404;1272;2084",
                 "highest_tail": 128
             },
             {
-                "model": "gpt-4.1-2025-04-14",
-                "score": 105.7,
-                "details": "1156;2664;1148",
-                "highest_tail": 128
             },
             {
-                "model": "gpt-4o-2024-11-20",
-                "score": 106.7,
-                "details": "1604;1284;2080",
                 "highest_tail": 256
             },
             {
-                "model": "o1-2024-12-17",
-                "score": 128.9,
-                "details": "3132;2004;3136",
                 "highest_tail": 512
             },
             {
-                "model": "o1-mini-2024-09-12",
-                "score": 114.0,
-                "details": "21;86;37",
                 "highest_tail": 256
             },
             {
-                "model": "o3-2025-04-16",
-                "score": 128.0,
                 "details": "7120",
                 "highest_tail": 512
             },
             {
-                "model": "o4-mini-2025-04-16",
-                "score": 120.6,
-                "details": "4928;5456;2912",
-                "highest_tail": 256
             },
             {
-                "model": "Random (x30)",
-                "score": 100.4,
                 "details": "",
                 "highest_tail": 128
-            }
-        ]
-    },
-    "Tetris (complete)": {
-        "runs": 3,
-        "results": [
             {
-                "model": "claude-3-7-sonnet-20250219",
-                "score": 95,
-                "steps_blocks": 27,
-                "rank": 1
             },
             {
-                "model": "claude-3-5-haiku-20241022",
-                "score": 90,
-                "steps_blocks": 25,
-                "rank": 2
             },
             {
-                "model": "gemini-2.0-flash",
-                "score": 82,
-                "steps_blocks": 23,
-                "rank": 3
             },
             {
-                "model": "gpt-4o-2024-11-20",
-                "score": 54,
-                "steps_blocks": 19,
-                "rank": 4
             }
         ]
     },
-    "Tetris (planning only)": {
         "runs": 3,
         "results": [
             {
-                "model": "claude-3-5-sonnet-20241022",
                 "score": 14.7,
-                "details": "16;14;14"
             },
             {
-                "model": "claude-3-7-sonnet-20250219 (thinking)",
                 "score": 16.3,
-                "details": "19;15;15"
             },
             {
-                "model": "deepseek-r1",
                 "score": 14.3,
-                "details": "15;14;14"
             },
             {
-                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 16.3,
-                "details": "20;14;15"
             },
             {
-                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 23.3,
-                "details": "23;23;24"
             },
             {
-                "model": "grok-3-mini-beta (thinking)",
                 "score": 21.3,
-                "details": "20;15;29"
             },
             {
-                "model": "llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 10.3,
-                "details": "9;10;12"
             },
             {
-                "model": "gpt-4.1-2025-04-14",
                 "score": 13.7,
-                "details": "13;14;14"
             },
             {
-                "model": "gpt-4o-2024-11-20",
                 "score": 14,
-                "details": "18;11;13"
             },
             {
-                "model": "o1-2024-12-17",
                 "score": 35,
                 "details": "35"
             },
             {
-                "model": "o1-mini-2024-09-12",
                 "score": 11.7,
-                "details": "11;11;13"
             },
             {
-                "model": "o3-2025-04-16",
                 "score": 42,
                 "details": "42"
             },
             {
-                "model": "o4-mini-2025-04-16",
                 "score": 25.3,
-                "details": "22;35;19"
             },
             {
-                "model": "Random (x30)",
                 "score": 10.2,
                 "details": ""
             }
         ]
     },
@@ -308,74 +282,94 @@
         "runs": 3,
         "results": [
             {
-                "model": "claude-3-5-sonnet-20241022",
                 "score": 106,
-                "details": "92;165;61"
             },
             {
-                "model": "claude-3-7-sonnet-20250219 (thinking)",
                 "score": 484,
-                "details": "535;428;489"
             },
             {
-                "model": "deepseek-r1",
                 "score": 447.3,
-                "details": "409;436;497"
             },
             {
-                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 334.7,
-                "details": "259;372;373"
             },
             {
-                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 416.3,
-                "details": "411;414;424"
             },
             {
-                "model": "grok-3-mini-beta (thinking)",
                 "score": 254,
-                "details": "299;332;131"
             },
             {
-                "model": "llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 128.7,
-                "details": "67;139;180"
             },
             {
-                "model": "gpt-4.1-2025-04-14",
                 "score": 182,
-                "details": "163;215;168"
             },
             {
-                "model": "gpt-4o-2024-11-20",
                 "score": 147.3,
-                "details": "131;104;207"
             },
             {
-                "model": "o1-2024-12-17",
                 "score": 159,
                 "details": "159"
             },
             {
-                "model": "o1-mini-2024-09-12",
                 "score": 48,
-                "details": "21;86;37"
             },
             {
-                "model": "o3-2025-04-16",
                 "score": 647,
                 "details": "647"
             },
             {
-                "model": "o4-mini-2025-04-16",
                 "score": 487.3,
-                "details": "259;591;612"
             },
             {
-                "model": "Random (x30)",
                 "score": 116.5,
                 "details": ""
             }
         ]
     },
@@ -383,88 +377,108 @@
         "runs": 3,
         "results": [
             {
-                "model": "claude-3-5-sonnet-20241022",
                 "score": 0,
-                "detail_box_on_target": "0;0;0",
-                "cracked_levels": "0;0;0"
             },
             {
-                "model": "claude-3-7-sonnet-20250219 (thinking)",
                 "score": 2.33,
-                "detail_box_on_target": "2;4;1",
-                "cracked_levels": "1;2;0"
             },
             {
-                "model": "deepseek-r1",
                 "score": 1.33,
-                "detail_box_on_target": "2;0;2",
-                "cracked_levels": "1;0;1"
             },
             {
-                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 1.67,
-                "detail_box_on_target": "3;0;2",
-                "cracked_levels": "2;0;1"
             },
             {
-                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 4.33,
-                "detail_box_on_target": "4;4;5",
-                "cracked_levels": "2;2;3"
             },
             {
-                "model": "grok-3-mini-beta (thinking)",
                 "score": 5.67,
-                "detail_box_on_target": "5;6;6",
-                "cracked_levels": "3;3;3"
             },
             {
-                "model": "llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 0,
-                "detail_box_on_target": "0;0;0",
-                "cracked_levels": "0;0;0"
             },
             {
-                "model": "gpt-4.1-2025-04-14",
                 "score": 0,
-                "detail_box_on_target": "0;0;0",
-                "cracked_levels": "0;0;0"
             },
             {
-                "model": "gpt-4o-2024-11-20",
                 "score": 0,
-                "detail_box_on_target": "0;0;0",
-                "cracked_levels": "0;0;0"
             },
             {
-                "model": "o1-2024-12-17",
                 "score": 2.33,
-                "detail_box_on_target": "2;2;3",
-                "cracked_levels": "1;1;2"
             },
             {
-                "model": "o1-mini-2024-09-12",
                 "score": 1.33,
-                "detail_box_on_target": "1;2;1",
-                "cracked_levels": "0;1;0"
             },
             {
-                "model": "o3-2025-04-16",
                 "score": 8,
-                "detail_box_on_target": "10;6",
-                "cracked_levels": "5;3"
             },
             {
-                "model": "o4-mini-2025-04-16",
                 "score": 5.33,
-                "detail_box_on_target": "4;6;6",
-                "cracked_levels": "2;2;3"
             },
             {
-                "model": "Random (x30)",
                 "score": 0,
-                "detail_box_on_target": "0,0,0",
                 "cracked_levels": "0,0,0"
             }
         ]
     },
@@ -472,88 +486,103 @@
         "runs": 1,
         "results": [
             {
-                "model": "claude-3-5-sonnet-20241022",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "1/3"
             },
             {
-                "model": "claude-3-7-sonnet-20250219 (thinking)",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "5/11"
             },
             {
-                "model": "deepseek-r1",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
-                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "1/7"
             },
             {
-                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "2/3"
             },
             {
-                "model": "grok-3-mini-beta (thinking)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
-                "model": "llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
-                "model": "gpt-4.1-2025-04-14",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "2/3"
             },
             {
-                "model": "gpt-4o-2024-11-20",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
-                "model": "o1-2024-12-17",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "6/11"
             },
             {
-                "model": "o1-mini-2024-09-12",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
-                "model": "o3-2025-04-16",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "1/2"
             },
             {
-                "model": "o4-mini-2025-04-16",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "2/5"
             },
             {
-                "model": "Random (x30)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             }
         ]
     }

 {
     "Super Mario Bros": {
         "runs": 3,
         "results": [
             {
+                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 1267.7,
+                "detail_data":"709,1532,1562",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
                 "score": 1418.7,
+                "detail_data":"2015,709,1532",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 1385.0,
+                "detail_data":"1672,1266,1247",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 1498.3,
+                "detail_data":"1561,1271,1663",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 1468.7,
+                "detail_data":"898,2008,1500",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 2126.3,
+                "detail_data":"1531,722,4126",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 2047.3,
+                "detail_data":"2017,2590,1535",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + o1-2024-12-17",
                 "score": 855,
+                "detail_data":"855",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + o3-2025-04-16",
                 "score": 3445,
+                "detail_data":"3445",
                 "progress": "1-1"
             },
             {
+                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 1448.0,
+                "detail_data":"1525,1263,1556",
                 "progress": "1-1"
             },
             {
+                "model": "random (x30)",
                 "score": 986.97,
+                "detail_data":"986.97",
                 "progress": "1-1"
             }
         ]
         "runs": 3,
         "results": [
             {
+                "model": "gamingagent + claude-3-5-sonnet-20241022",
+                "score": 1914.67,
+                "details": "1352,2860,1532",
+                "highest_tail": 256
             },
             {
+                "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
+                "score": 2624,
+                "details": "2560,3224,2088",
                 "highest_tail": 256
             },
             {
+                "model": "gamingagent + deepseek-r1-0120",
+                "score": 1873.33,
+                "details": "700,1240,3680",
+                "highest_tail": 256
             },
             {
+                "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 1697.33,
+                "details": "1304,1316,2472",
                 "highest_tail": 256
             },
             {
+                "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 3586.67,
+                "details": "5300,2400,3060",
+                "highest_tail": 512
             },
             {
+                "model": "gamingagent + grok-3-mini-beta (thinking)",
+                "score": 4036,
+                "details": "6412,2492,3204",
+                "highest_tail": 512
             },
             {
+                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 1586.67,
+                "details": "1404,1272,2084",
                 "highest_tail": 128
             },
             {
+                "model": "gamingagent + gpt-4.1-2025-04-14",
+                "score": 1656,
+                "details": "1156,2664,1148",
+                "highest_tail": 256
             },
             {
+                "model": "gamingagent + gpt-4o-2024-11-20",
+                "score": 1656,
+                "details": "1604,1284,2080",
                 "highest_tail": 256
             },
             {
+                "model": "gamingagent + o1-2024-12-17",
+                "score": 7580,
+                "details": "7580",
                 "highest_tail": 512
             },
             {
+                "model": "gamingagent + o1-mini-2024-09-12",
+                "score": 2757.33,
+                "details": "3132,2004,3136",
                 "highest_tail": 256
             },
             {
+                "model": "gamingagent + o3-2025-04-16",
+                "score": 7120,
                 "details": "7120",
                 "highest_tail": 512
             },
             {
+                "model": "gamingagent + o4-mini-2025-04-16",
+                "score": 4432.0,
+                "details": "4928,5456,2912",
+                "highest_tail": 512
             },
             {
+                "model": "random (x30)",
+                "score": 1213.33,
                 "details": "",
                 "highest_tail": 128
+            },
             {
+                "model": "gamingagent + claude-opus-4-20250514",
+                "score": 3036.0,
+                "details": "3036.0",
+                "highest_tail": 256
             },
             {
+                "model": "gamingagent + claude-sonnet-4-20250514",
+                "score": 3136,
+                "details": "2148,2360,4900",
+                "highest_tail": 256
             },
             {
+                "model": "gamingagent + deepseek-r1-0528",
+                "score": 3330.0,
+                "details": "3260,3400",
+                "highest_tail": 256
             },
             {
+                "model": "gamingagent + qwen3-235B-A22B-fp8",
+                "score": 2144.0,
+                "details": "1436,2556,2440",
+                "highest_tail": 256
             }
         ]
     },
+    "Tetris": {
         "runs": 3,
         "results": [
             {
+                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 14.7,
+                "details": "16,14,14"
             },
             {
+                "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
                 "score": 16.3,
+                "details": "19,15,15"
             },
             {
+                "model": "gamingagent + deepseek-r1-0120",
                 "score": 14.3,
+                "details": "15,14,14"
             },
             {
+                "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 16.3,
+                "details": "20,14,15"
             },
             {
+                "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 23.3,
+                "details": "23,23,24"
             },
             {
+                "model": "gamingagent + grok-3-mini-beta (thinking)",
                 "score": 21.3,
+                "details": "20,15,29"
             },
             {
+                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 10.3,
+                "details": "9,10,12"
             },
             {
+                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 13.7,
+                "details": "13,14,14"
             },
             {
+                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 14,
+                "details": "18,11,13"
             },
             {
+                "model": "gamingagent + o1-2024-12-17",
                 "score": 35,
                 "details": "35"
             },
             {
+                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 11.7,
+                "details": "11,11,13"
             },
             {
+                "model": "gamingagent + o3-2025-04-16",
                 "score": 42,
                 "details": "42"
             },
             {
+                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 25.3,
+                "details": "22,35,19"
             },
             {
+                "model": "random (x30)",
                 "score": 10.2,
                 "details": ""
+            },
+            {
+                "model": "gamingagent + claude-opus-4-20250514",
+                "score": 20,
+                "details": "17,18,25"
+            },
+            {
+                "model": "gamingagent + claude-sonnet-4-20250514",
+                "score": 19.33,
+                "details": "20,17,21"
+            },
+            {
+                "model": "gamingagent + deepseek-r1-0528",
+                "score": 33.67,
+                "details": "26,34,41"
+            },
+            {
+                "model": "gamingagent + qwen3-235B-A22B-fp8",
+                "score": 11.67,
+                "details": "13,14,8"
             }
         ]
     },
         "runs": 3,
         "results": [
             {
+                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 106,
+                "details": "92,165,61"
             },
             {
+                "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
                 "score": 484,
+                "details": "535,428,489"
             },
             {
+                "model": "gamingagent + deepseek-r1-0120",
                 "score": 447.3,
+                "details": "409,436,497"
             },
             {
+                "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 334.7,
+                "details": "259,372,373"
             },
             {
+                "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 416.3,
+                "details": "411,414,424"
             },
             {
+                "model": "gamingagent + grok-3-mini-beta (thinking)",
                 "score": 254,
+                "details": "299,332,131"
             },
             {
+                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 128.7,
+                "details": "67,139,180"
             },
             {
+                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 182,
+                "details": "163,215,168"
             },
             {
+                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 147.3,
+                "details": "131,104,207"
             },
             {
+                "model": "gamingagent + o1-2024-12-17",
                 "score": 159,
                 "details": "159"
             },
             {
+                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 48,
+                "details": "21,86,37"
             },
             {
+                "model": "gamingagent + o3-2025-04-16",
                 "score": 647,
                 "details": "647"
             },
             {
+                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 487.3,
+                "details": "259,591,612"
             },
             {
+                "model": "random (x30)",
                 "score": 116.5,
                 "details": ""
+            },
+            {
+                "model": "gamingagent + claude-opus-4-20250514",
+                "score": 464,
+                "details": "593,406,393"
+            },
+            {
+                "model": "gamingagent + claude-sonnet-4-20250514",
+                "score": 478.33,
+                "details": "545,468,422"
+            },
+            {
+                "model": "gamingagent + deepseek-r1-0528",
+                "score": 491.67,
+                "details": "464,463,548"
+            },
+            {
+                "model": "gamingagent + qwen3-235B-A22B-fp8",
+                "score": 363.33,
+                "details": "365,372,353"
             }
         ]
     },
         "runs": 3,
         "results": [
             {
+                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
             },
             {
+                "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
                 "score": 2.33,
+                "detail_box_on_target":"2,4,1",
+                "cracked_levels": "1,2,0"
             },
             {
+                "model": "gamingagent + deepseek-r1-0120",
                 "score": 1.33,
+                "detail_box_on_target":"2,0,2",
+                "cracked_levels": "1,0,1"
             },
             {
+                "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 1.67,
+                "detail_box_on_target":"3,0,2",
+                "cracked_levels": "2,0,1"
             },
             {
+                "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 4.33,
+                "detail_box_on_target":"4,4,5",
+                "cracked_levels": "2,2,3"
             },
             {
+                "model": "gamingagent + grok-3-mini-beta (thinking)",
                 "score": 5.67,
+                "detail_box_on_target":"5,6,6",
+                "cracked_levels": "3,3,3"
             },
             {
+                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
             },
             {
+                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
             },
             {
+                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
             },
             {
+                "model": "gamingagent + o1-2024-12-17",
                 "score": 2.33,
+                "detail_box_on_target":"2,2,3",
+                "cracked_levels": "1,1,2"
             },
             {
+                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 1.33,
+                "detail_box_on_target":"1,2,1",
+                "cracked_levels": "0,1,0"
             },
             {
+                "model": "gamingagent + o3-2025-04-16",
                 "score": 8,
+                "detail_box_on_target":"10,6",
+                "cracked_levels": "5,3"
             },
             {
+                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 5.33,
+                "detail_box_on_target":"4,6,6",
+                "cracked_levels": "2,2,3"
             },
             {
+                "model": "random (x30)",
                 "score": 0,
+                "detail_box_on_target":"0,0,0",
                 "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "gamingagent + claude-opus-4-20250514",
+                "score": 4,
+                "details": "4,4,4"
+            },
+            {
+                "model": "gamingagent + claude-sonnet-4-20250514",
+                "score": 3,
+                "details": "2,2,5"
+            },
+            {
+                "model": "gamingagent + deepseek-r1-0528",
+                "score": 4.67,
+                "details": "4,4,6"
+            },
+            {
+                "model": "gamingagent + qwen3-235B-A22B-fp8",
+                "score": 2.33,
+                "details": "1,2,4"
             }
         ]
     },
         "runs": 1,
         "results": [
             {
+                "model": "gamingagent + claude-3-5-sonnet-20241022",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "1/3"
             },
             {
+                "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "5/11"
             },
             {
+                "model": "gamingagent + deepseek-r1-0120",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
+                "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "1/7"
             },
             {
+                "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
                 "score": 7,
                 "progress": "2:2/9",
                 "evaluator result": "2/3"
             },
             {
+                "model": "gamingagent + grok-3-mini-beta (thinking)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
+                "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
+                "model": "gamingagent + gpt-4.1-2025-04-14",
                 "score": 2,
                 "progress": "1:2/5",
                 "evaluator result": "2/3"
             },
             {
+                "model": "gamingagent + gpt-4o-2024-11-20",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
             },
             {
+                "model": "gamingagent + o1-2024-12-17",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "6/11"
             },
             {
+                "model": "gamingagent + o1-mini-2024-09-12",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "1/5"
             },
             {
+                "model": "gamingagent + o3-2025-04-16",
                 "score": 16,
                 "progress": "3: 2/8",
                 "evaluator result": "1/2"
             },
             {
+                "model": "gamingagent + o4-mini-2025-04-16",
                 "score": 4,
                 "progress": "1:4/5",
                 "evaluator result": "2/5"
             },
             {
+                "model": "random (x30)",
                 "score": 0,
                 "progress": "0",
                 "evaluator result": "0"
+            },
+            {
+                "model": "gamingagent + claude-opus-4-20250514",
+                "score": 6,
+                "details": "6"
+            },
+            {
+                "model": "gamingagent + claude-sonnet-4-20250514",
+                "score": 3.67,
+                "details": "3,4,4"
+            },
+            {
+                "model": "gamingagent + gemini-2.5-flash-preview-05-20",
+                "score": 4.33,
+                "details": "3,4,6"
             }
         ]
     }

rank_single_model_03_25_2025.json ADDED Viewed

	@@ -0,0 +1,473 @@

+{
+    "Super Mario Bros": {
+        "runs": 3,
+        "results": [
+            {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 1540.0,
+                "detail_data":"1551,1515,1554",
+                "progress": "1-1"
+            },
+            {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 1430.0,
+                "detail_data":"1532,1515,1243",
+                "progress": "1-1"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 1540.7,
+                "detail_data":"1794,1270,1558",
+                "progress": "1-1"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 1025.3,
+                "detail_data":"820,1534,722",
+                "progress": "1-1"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 786.0,
+                "detail_data":"837,300,1221",
+                "progress": "1-1"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 1991.3,
+                "detail_data":"1563,1257,3154",
+                "progress": "1-1"
+            },
+            {
+                "model": "gpt-4o-2024-11-20",
+                "score": 1028.3,
+                "detail_data":"1565,297,1223",
+                "progress": "1-1"
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 1434.0,
+                "detail_data":"1434",
+                "progress": "1-1"
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 1955.0,
+                "detail_data":"1955",
+                "progress": "1-1"
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 1348.3,
+                "detail_data":"1554,1245,1246",
+                "progress": "1-1"
+            },
+            {
+                "model": "random (x30)",
+                "score": 986.97,
+                "detail_data":"986.97",
+                "progress": "1-1"
+            }
+        ]
+    },
+    "2048": {
+        "runs": 3,
+        "results": [
+            {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 17.0,
+                "details": "188,20,44",
+                "highest_tail": 32
+            },
+            {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 126.3,
+                "details": "1596,4256,3008",
+                "highest_tail": 512
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 97.7,
+                "details": "2228,1424,1564",
+                "highest_tail": 256
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 120.5,
+                "details": "5784,3544,3704",
+                "highest_tail": 512
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 44.6,
+                "details": "16,56,12",
+                "highest_tail": 64
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 94.5,
+                "details": "264,500,2576",
+                "highest_tail": 256
+            },
+            {
+                "model": "gpt-4o-2024-11-20",
+                "score": 70.4,
+                "details": "292,196,40",
+                "highest_tail": 32
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 128.1,
+                "details": "7176",
+                "highest_tail": 512
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 128.2,
+                "details": "7220",
+                "highest_tail": 512
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 97.6,
+                "details": "3004,84,2560",
+                "highest_tail": 256
+            },
+            {
+                "model": "random (x30)",
+                "score": 100.4,
+                "details": "",
+                "highest_tail": 128
+            },
+            {
+                "model": "gemini-2.5-flash-preview-05-20",
+                "score": 2750,
+                "details": "3128, 2758, 2364",
+                "highest_tail": 128
+            },
+            {
+                "model": "claude-sonnet-4-20250514",
+                "score": 3844,
+                "details": "3280,5024,3228",
+                "highest_tail": 512
+            },
+            {
+                "model": "gemini-2.5-pro-preview-06-05",
+                "score": 1,
+                "details": "3232,x,1628",
+                "highest_tail": 128
+            }
+        ]
+    },
+    "Tetris": {
+        "runs": 3,
+        "results": [
+            {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 12.3,
+                "details": "10,15,12"
+            },
+            {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 13.0,
+                "details": "13,13,13"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 19.0,
+                "details": "15,18,24"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 12.3,
+                "details": "15,9,13"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 11.7,
+                "details": "13,11,11"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 13.0,
+                "details": "11,14,14"
+            },
+            {
+                "model": "gpt-4o-2024-11-20",
+                "score": 14.7,
+                "details": "14,17,13"
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 13.0,
+                "details": "13.0"
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 31.0,
+                "details": "31.0"
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 15.0,
+                "details": "14,12,19"
+            },
+            {
+                "model": "random (x30)",
+                "score": 10.2,
+                "details": ""
+            },
+            {
+                "model": "gemini-2.5-flash-preview-05-20",
+                "score": 16,
+                "details": "15,15,18"
+            },
+            {
+                "model": "claude-sonnet-4-20250514",
+                "score": 13.67,
+                "details": "15,12,14"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-06-05",
+                "score": 12,
+                "details": "12"
+            }
+        ]
+    },
+    "Candy Crush": {
+        "runs": 3,
+        "results": [
+            {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 17.0,
+                "details": "15,36,0"
+            },
+            {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 126.3,
+                "details": "148,182,49"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 97.7,
+                "details": "60,101,132"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 177.3,
+                "details": "117,169,246"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 32.3,
+                "details": "18,79,0"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 101.0,
+                "details": "13,238,52"
+            },
+            {
+                "model": "gpt-4o-2024-11-20",
+                "score": 59.0,
+                "details": "29,122,26"
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 90.0,
+                "details": "90.0"
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 106.0,
+                "details": "106.0"
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 110.7,
+                "details": "259,591,612"
+            },
+            {
+                "model": "random (x30)",
+                "score": 116.5,
+                "details": ""
+            },
+            {
+                "model": "gemini-2.5-flash-preview-05-20",
+                "score": 254,
+                "details": "276,191,295"
+            },
+            {
+                "model": "claude-sonnet-4-20250514",
+                "score": 557.67,
+                "details": "503, 557, 613"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-06-05",
+                "score": 496,
+                "details": "461,556,471"
+            }
+        ]
+    },
+    "Sokoban": {
+        "runs": 3,
+        "results": [
+            {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 1,
+                "detail_box_on_target":"1,1,1",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "gpt-4o-2024-11-20",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 0,
+                "detail_box_on_target":"0",
+                "cracked_levels": "0"
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 2,
+                "detail_box_on_target":"2",
+                "cracked_levels": "1"
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 1.33,
+                "detail_box_on_target":"1,2,1",
+                "cracked_levels": "0,1,0"
+            },
+            {
+                "model": "random (x30)",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "claude-sonnet-4-20250514",
+                "score": 0,
+                "detail_box_on_target":"0,0,0",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-06-05",
+                "score": 0.33,
+                "detail_box_on_target": "0,0,1",
+                "cracked_levels": "0,0,0"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-05-20",
+                "score": 0,
+                "detail_box_on_target": "0,0,0",
+                "cracked_levels": "0,0,0"
+            }
+        ]
+    },
+    "Ace Attorney": {
+        "runs": 1,
+        "results": [
+            {
+                "model": "claude-3-5-sonnet-20241022",
+                "score": 1,
+                "progress": "1:1/5"
+            },
+            {
+                "model": "claude-3-7-sonnet-20250219 (thinking)",
+                "score": 3,
+                "progress": "1:3/5"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-04-17 (thinking)",
+                "score": 1,
+                "progress": "1:1/5"
+            },
+            {
+                "model": "gemini-2.5-pro-preview-05-06 (thinking)",
+                "score": 8,
+                "progress": "2:3/9"
+            },
+            {
+                "model": "llama-4-maverick-17b-128e-instruct-fp8",
+                "score": 0,
+                "progress": "0"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "score": 0,
+                "progress": "0"
+            },
+            {
+                "model": "gpt-4o-2024-11-20",
+                "score": 0,
+                "progress": "0"
+            },
+            {
+                "model": "o1-2024-12-17",
+                "score": 3,
+                "progress": "1:3/5"
+            },
+            {
+                "model": "o3-2025-04-16",
+                "score": 8,
+                "progress": "2:3/9"
+            },
+            {
+                "model": "o4-mini-2025-04-16",
+                "score": 2,
+                "progress": "1:2/5"
+            },
+            {
+                "model": "random (x30)",
+                "score": 0,
+                "progress": "0"
+            },
+            {
+                "model": "gemini-2.5-flash-preview-05-20",
+                "score": 2.33,
+                "details": "1,4,2",
+                "progress": "1:4/5"
+            },
+            {
+                "model": "claude-sonnet-4-20250514",
+                "score": 1.33,
+                "details": "0,2,2",
+                "progress": "1:2/5"
+            }
+        ]
+    }
+}