Spaces:
Running
Running
Yuxuan-Zhang-Dexter
committed on
Commit
·
6d4c755
1
Parent(s):
865dbef
update leaderboard with new agentic leaderboard layout
Browse files- app.py +162 -98
- assets/model_color.json +27 -27
- data_visualization.py +340 -20
- generate_normalized_cache.py +57 -0
- leaderboard_utils.py +81 -18
- rank_data_03_25_2025.json +94 -94
app.py
CHANGED
|
@@ -38,11 +38,11 @@ TIME_POINTS = {
|
|
| 38 |
}
|
| 39 |
|
| 40 |
# Load the initial JSON file with rank data
|
| 41 |
-
with open(TIME_POINTS["03/25/2025"], "r") as f:
|
| 42 |
rank_data = json.load(f)
|
| 43 |
|
| 44 |
# Load the model leaderboard data
|
| 45 |
-
with open("rank_single_model_03_25_2025.json", "r") as f:
|
| 46 |
model_rank_data = json.load(f)
|
| 47 |
|
| 48 |
# Add leaderboard state at the top level
|
|
@@ -72,17 +72,17 @@ leaderboard_state = {
|
|
| 72 |
|
| 73 |
|
| 74 |
# Load video links and news data
|
| 75 |
-
with open('assets/game_video_link.json', 'r') as f:
|
| 76 |
VIDEO_LINKS = json.load(f)
|
| 77 |
|
| 78 |
-
with open('assets/news.json', 'r') as f:
|
| 79 |
NEWS_DATA = json.load(f)
|
| 80 |
|
| 81 |
def load_rank_data(time_point):
|
| 82 |
"""Load rank data for a specific time point"""
|
| 83 |
if time_point in TIME_POINTS:
|
| 84 |
try:
|
| 85 |
-
with open(TIME_POINTS[time_point], "r") as f:
|
| 86 |
return json.load(f)
|
| 87 |
except FileNotFoundError:
|
| 88 |
return None
|
|
@@ -105,7 +105,7 @@ def prepare_dataframe_for_display(df, for_game=None):
|
|
| 105 |
|
| 106 |
# Replace '_' with '-' for better display
|
| 107 |
for col in display_df.columns:
|
| 108 |
-
if col.endswith(' Score'):
|
| 109 |
display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
|
| 110 |
|
| 111 |
# If we're in detailed view, sort by score
|
|
@@ -120,36 +120,47 @@ def prepare_dataframe_for_display(df, for_game=None):
|
|
| 120 |
# Filter out models that didn't participate
|
| 121 |
display_df = display_df[~display_df[score_col].isna()]
|
| 122 |
else:
|
| 123 |
-
# For overall view, sort by average
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
|
| 132 |
-
|
| 133 |
-
# Calculate average of the game scores (use mean of ranks from utils for actual ranking logic if different)
|
| 134 |
-
# For display sorting, let's use a simple average of available scores.
|
| 135 |
-
# The actual ranking for 'Average Rank' in leaderboard_utils uses mean of ranks, which is more robust.
|
| 136 |
-
# Here we just need a consistent sort order.
|
| 137 |
-
|
| 138 |
-
# Create a temporary column for sorting
|
| 139 |
-
temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
|
| 140 |
-
|
| 141 |
-
# Sort by this temporary average score (higher is better for scores)
|
| 142 |
-
# and then by Player name as a tie-breaker
|
| 143 |
-
display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
|
| 144 |
|
| 145 |
# Add line breaks to column headers
|
| 146 |
new_columns = {}
|
| 147 |
for col in display_df.columns:
|
| 148 |
-
if col.endswith(' Score'):
|
| 149 |
# Replace 'Game Name Score' with 'Game Name\nScore'
|
| 150 |
game_name = col.replace(' Score', '')
|
| 151 |
new_col = f"{game_name}\nScore"
|
| 152 |
new_columns[col] = new_col
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
# Rename columns with new line breaks
|
| 155 |
if new_columns:
|
|
@@ -164,8 +175,14 @@ def update_df_with_height(df):
|
|
| 164 |
col_widths = ["40px"] # Row number column width
|
| 165 |
col_widths.append("230px") # Player column - reduced by 20px
|
| 166 |
col_widths.append("120px") # Organization column
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
# Add game score columns
|
| 168 |
-
|
|
|
|
| 169 |
col_widths.append("120px")
|
| 170 |
|
| 171 |
return gr.update(value=df,
|
|
@@ -184,7 +201,7 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
|
|
| 184 |
# tetris_overall, tetris_details, # Commented out
|
| 185 |
tetris_plan_overall, tetris_plan_details,
|
| 186 |
ace_attorney_overall, ace_attorney_details,
|
| 187 |
-
top_n=
|
| 188 |
data_source=None):
|
| 189 |
global leaderboard_state
|
| 190 |
|
|
@@ -304,21 +321,22 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
|
|
| 304 |
|
| 305 |
# Get the appropriate DataFrame and charts based on current state
|
| 306 |
if leaderboard_state["current_game"]:
|
| 307 |
-
# For detailed view
|
|
|
|
| 308 |
# if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
|
| 309 |
# df = get_mario_leaderboard(data)
|
| 310 |
if leaderboard_state["current_game"] == "Super Mario Bros":
|
| 311 |
-
df = get_mario_planning_leaderboard(data)
|
| 312 |
elif leaderboard_state["current_game"] == "Sokoban":
|
| 313 |
-
df = get_sokoban_leaderboard(data)
|
| 314 |
elif leaderboard_state["current_game"] == "2048":
|
| 315 |
-
df = get_2048_leaderboard(data)
|
| 316 |
elif leaderboard_state["current_game"] == "Candy Crush":
|
| 317 |
-
df = get_candy_leaderboard(data)
|
| 318 |
elif leaderboard_state["current_game"] == "Tetris":
|
| 319 |
-
df = get_tetris_planning_leaderboard(data)
|
| 320 |
elif leaderboard_state["current_game"] == "Ace Attorney":
|
| 321 |
-
df = get_ace_attorney_leaderboard(data)
|
| 322 |
else: # Should not happen if current_game is one of the known games
|
| 323 |
df = pd.DataFrame() # Empty df
|
| 324 |
|
|
@@ -327,10 +345,12 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
|
|
| 327 |
radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
|
| 328 |
group_bar_chart = chart
|
| 329 |
else:
|
| 330 |
-
# For overall view
|
| 331 |
-
|
|
|
|
| 332 |
display_df = prepare_dataframe_for_display(df)
|
| 333 |
-
|
|
|
|
| 334 |
chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
|
| 335 |
|
| 336 |
# Return values, including all four plot placeholders
|
|
@@ -405,7 +425,7 @@ def get_initial_state():
|
|
| 405 |
}
|
| 406 |
}
|
| 407 |
|
| 408 |
-
def clear_filters(top_n=
|
| 409 |
global leaderboard_state
|
| 410 |
|
| 411 |
# Use provided data source or default to rank_data
|
|
@@ -420,9 +440,12 @@ def clear_filters(top_n=10, data_source=None):
|
|
| 420 |
"Ace Attorney": True
|
| 421 |
}
|
| 422 |
|
| 423 |
-
|
|
|
|
|
|
|
| 424 |
display_df = prepare_dataframe_for_display(df)
|
| 425 |
-
|
|
|
|
| 426 |
|
| 427 |
leaderboard_state = get_initial_state()
|
| 428 |
|
|
@@ -675,9 +698,18 @@ def build_app():
|
|
| 675 |
max-width: 140px !important;
|
| 676 |
}
|
| 677 |
|
| 678 |
-
/*
|
| 679 |
-
.table-container th:nth-child(
|
| 680 |
-
.table-container td:nth-child(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
width: 120px !important;
|
| 682 |
min-width: 100px !important;
|
| 683 |
max-width: 140px !important;
|
|
@@ -743,6 +775,27 @@ def build_app():
|
|
| 743 |
width: 100% !important;
|
| 744 |
margin-top: 40px !important;
|
| 745 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 746 |
""") as demo:
|
| 747 |
gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
|
| 748 |
|
|
@@ -875,6 +928,14 @@ def build_app():
|
|
| 875 |
with gr.Tabs():
|
| 876 |
with gr.Tab("🏆 Agent Leaderboard"):
|
| 877 |
# Visualization section
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 878 |
with gr.Row():
|
| 879 |
gr.Markdown("### 📊 Data Visualization")
|
| 880 |
|
|
@@ -884,6 +945,19 @@ def build_app():
|
|
| 884 |
visible=False,
|
| 885 |
elem_classes="visualization-container"
|
| 886 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 887 |
|
| 888 |
with gr.Column(visible=True) as overall_visualizations:
|
| 889 |
with gr.Tabs():
|
|
@@ -894,45 +968,32 @@ def build_app():
|
|
| 894 |
elem_classes="visualization-container"
|
| 895 |
)
|
| 896 |
gr.Markdown(
|
| 897 |
-
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison
|
| 898 |
elem_classes="radar-tip"
|
| 899 |
)
|
| 900 |
-
# Comment out the Group Bar Chart tab
|
| 901 |
with gr.Tab("📊 Group Bar Chart"):
|
| 902 |
-
with gr.Row():
|
| 903 |
-
# Calculate dynamic maximum based on total models
|
| 904 |
-
agent_max_models = get_total_model_count(rank_data)
|
| 905 |
-
top_n_slider = gr.Slider(
|
| 906 |
-
minimum=1,
|
| 907 |
-
maximum=agent_max_models,
|
| 908 |
-
step=1,
|
| 909 |
-
value=min(10, agent_max_models),
|
| 910 |
-
label=f"Number of Top Models to Display (max: {agent_max_models})",
|
| 911 |
-
elem_classes="top-n-slider"
|
| 912 |
-
)
|
| 913 |
group_bar_visualization = gr.Plot(
|
| 914 |
label="Comparative Analysis (Group Bar Chart)",
|
| 915 |
elem_classes="visualization-container"
|
| 916 |
)
|
| 917 |
gr.Markdown(
|
| 918 |
-
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison
|
| 919 |
elem_classes="radar-tip"
|
| 920 |
)
|
| 921 |
-
|
| 922 |
|
| 923 |
# Hidden placeholder for group bar visualization (to maintain code references)
|
| 924 |
# group_bar_visualization = gr.Plot(visible=False)
|
| 925 |
|
| 926 |
# Game selection section
|
| 927 |
with gr.Row():
|
| 928 |
-
gr.Markdown("###
|
| 929 |
with gr.Row():
|
| 930 |
# with gr.Column(): # Commented out Super Mario BrosUI
|
| 931 |
# gr.Markdown("**🎮 Super Mario Bros**")
|
| 932 |
# mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
|
| 933 |
# mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
|
| 934 |
with gr.Column(): # Added Super Mario BrosUI
|
| 935 |
-
gr.Markdown("
|
| 936 |
mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
| 937 |
mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
| 938 |
with gr.Column(): # Sokoban is now after mario_plan
|
|
@@ -972,12 +1033,16 @@ def build_app():
|
|
| 972 |
# Leaderboard table
|
| 973 |
with gr.Row():
|
| 974 |
gr.Markdown("### 📋 Detailed Results")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 975 |
|
| 976 |
# Add reference to Jupyter notebook
|
| 977 |
with gr.Row():
|
| 978 |
gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
|
| 979 |
|
| 980 |
-
# Get initial leaderboard dataframe
|
| 981 |
initial_df = get_combined_leaderboard(rank_data, {
|
| 982 |
# "Super Mario Bros": True, # Commented out
|
| 983 |
"Super Mario Bros": True,
|
|
@@ -987,7 +1052,7 @@ def build_app():
|
|
| 987 |
# "Tetris(complete)": True, # Commented out
|
| 988 |
"Tetris": True,
|
| 989 |
"Ace Attorney": True
|
| 990 |
-
})
|
| 991 |
|
| 992 |
# Format the DataFrame for display
|
| 993 |
initial_display_df = prepare_dataframe_for_display(initial_df)
|
|
@@ -996,8 +1061,14 @@ def build_app():
|
|
| 996 |
col_widths = ["40px"] # Row number column width
|
| 997 |
col_widths.append("230px") # Player column - reduced by 20px
|
| 998 |
col_widths.append("120px") # Organization column
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 999 |
# Add game score columns
|
| 1000 |
-
|
|
|
|
| 1001 |
col_widths.append("120px")
|
| 1002 |
|
| 1003 |
# Create a standard DataFrame component with enhanced styling
|
|
@@ -1062,8 +1133,8 @@ def build_app():
|
|
| 1062 |
# Update leaderboard and visualizations when checkboxes change
|
| 1063 |
for checkbox in checkbox_list:
|
| 1064 |
checkbox.change(
|
| 1065 |
-
lambda *args: update_leaderboard(*args, data_source=rank_data),
|
| 1066 |
-
inputs=checkbox_list
|
| 1067 |
outputs=[
|
| 1068 |
leaderboard_df,
|
| 1069 |
detailed_visualization,
|
|
@@ -1072,22 +1143,10 @@ def build_app():
|
|
| 1072 |
] + checkbox_list
|
| 1073 |
)
|
| 1074 |
|
| 1075 |
-
# Update when top_n_slider changes
|
| 1076 |
-
top_n_slider.change(
|
| 1077 |
-
lambda *args: update_leaderboard(*args, data_source=rank_data),
|
| 1078 |
-
inputs=checkbox_list + [top_n_slider],
|
| 1079 |
-
outputs=[
|
| 1080 |
-
leaderboard_df,
|
| 1081 |
-
detailed_visualization,
|
| 1082 |
-
radar_visualization,
|
| 1083 |
-
group_bar_visualization
|
| 1084 |
-
] + checkbox_list
|
| 1085 |
-
)
|
| 1086 |
-
|
| 1087 |
# Update when clear button is clicked
|
| 1088 |
clear_btn.click(
|
| 1089 |
-
lambda
|
| 1090 |
-
inputs=[
|
| 1091 |
outputs=[
|
| 1092 |
leaderboard_df,
|
| 1093 |
detailed_visualization,
|
|
@@ -1096,7 +1155,7 @@ def build_app():
|
|
| 1096 |
] + checkbox_list
|
| 1097 |
)
|
| 1098 |
|
| 1099 |
-
# Initialize the
|
| 1100 |
demo.load(
|
| 1101 |
lambda: clear_filters(data_source=rank_data),
|
| 1102 |
inputs=[],
|
|
@@ -1119,6 +1178,20 @@ def build_app():
|
|
| 1119 |
visible=False,
|
| 1120 |
elem_classes="visualization-container"
|
| 1121 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1122 |
|
| 1123 |
with gr.Column(visible=True) as model_overall_visualizations:
|
| 1124 |
with gr.Tabs():
|
|
@@ -1132,17 +1205,6 @@ def build_app():
|
|
| 1132 |
elem_classes="radar-tip"
|
| 1133 |
)
|
| 1134 |
with gr.Tab("📊 Group Bar Chart"):
|
| 1135 |
-
with gr.Row():
|
| 1136 |
-
# Calculate dynamic maximum based on total models
|
| 1137 |
-
model_max_models = get_total_model_count(model_rank_data)
|
| 1138 |
-
model_top_n_slider = gr.Slider(
|
| 1139 |
-
minimum=1,
|
| 1140 |
-
maximum=model_max_models,
|
| 1141 |
-
step=1,
|
| 1142 |
-
value=min(10, model_max_models),
|
| 1143 |
-
label=f"Number of Top Models to Display (max: {model_max_models})",
|
| 1144 |
-
elem_classes="top-n-slider"
|
| 1145 |
-
)
|
| 1146 |
model_group_bar_visualization = gr.Plot(
|
| 1147 |
label="Comparative Analysis (Group Bar Chart)",
|
| 1148 |
elem_classes="visualization-container"
|
|
@@ -1154,10 +1216,10 @@ def build_app():
|
|
| 1154 |
|
| 1155 |
# Game selection section
|
| 1156 |
with gr.Row():
|
| 1157 |
-
gr.Markdown("###
|
| 1158 |
with gr.Row():
|
| 1159 |
with gr.Column():
|
| 1160 |
-
gr.Markdown("
|
| 1161 |
model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
| 1162 |
model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
| 1163 |
with gr.Column():
|
|
@@ -1193,8 +1255,10 @@ def build_app():
|
|
| 1193 |
# Leaderboard table
|
| 1194 |
with gr.Row():
|
| 1195 |
gr.Markdown("### 📋 Detailed Results")
|
|
|
|
|
|
|
| 1196 |
|
| 1197 |
-
# Get initial leaderboard dataframe
|
| 1198 |
model_initial_df = get_combined_leaderboard(model_rank_data, {
|
| 1199 |
"Super Mario Bros": True,
|
| 1200 |
"Sokoban": True,
|
|
@@ -1202,7 +1266,7 @@ def build_app():
|
|
| 1202 |
"Candy Crush": True,
|
| 1203 |
"Tetris": True,
|
| 1204 |
"Ace Attorney": True
|
| 1205 |
-
})
|
| 1206 |
|
| 1207 |
# Format the DataFrame for display
|
| 1208 |
model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
|
|
@@ -1300,7 +1364,7 @@ def build_app():
|
|
| 1300 |
] + model_checkbox_list
|
| 1301 |
)
|
| 1302 |
|
| 1303 |
-
# Initialize the model leaderboard
|
| 1304 |
demo.load(
|
| 1305 |
lambda: clear_filters(data_source=model_rank_data),
|
| 1306 |
inputs=[],
|
|
|
|
| 38 |
}
|
| 39 |
|
| 40 |
# Load the initial JSON file with rank data
|
| 41 |
+
with open(TIME_POINTS["03/25/2025"], "r", encoding='utf-8') as f:
|
| 42 |
rank_data = json.load(f)
|
| 43 |
|
| 44 |
# Load the model leaderboard data
|
| 45 |
+
with open("rank_single_model_03_25_2025.json", "r", encoding='utf-8') as f:
|
| 46 |
model_rank_data = json.load(f)
|
| 47 |
|
| 48 |
# Add leaderboard state at the top level
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
# Load video links and news data
|
| 75 |
+
with open('assets/game_video_link.json', 'r', encoding='utf-8') as f:
|
| 76 |
VIDEO_LINKS = json.load(f)
|
| 77 |
|
| 78 |
+
with open('assets/news.json', 'r', encoding='utf-8') as f:
|
| 79 |
NEWS_DATA = json.load(f)
|
| 80 |
|
| 81 |
def load_rank_data(time_point):
|
| 82 |
"""Load rank data for a specific time point"""
|
| 83 |
if time_point in TIME_POINTS:
|
| 84 |
try:
|
| 85 |
+
with open(TIME_POINTS[time_point], "r", encoding='utf-8') as f:
|
| 86 |
return json.load(f)
|
| 87 |
except FileNotFoundError:
|
| 88 |
return None
|
|
|
|
| 105 |
|
| 106 |
# Replace '_' with '-' for better display
|
| 107 |
for col in display_df.columns:
|
| 108 |
+
if col.endswith(' Score') and col != 'Avg Normalized Score':
|
| 109 |
display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
|
| 110 |
|
| 111 |
# If we're in detailed view, sort by score
|
|
|
|
| 120 |
# Filter out models that didn't participate
|
| 121 |
display_df = display_df[~display_df[score_col].isna()]
|
| 122 |
else:
|
| 123 |
+
# For overall view, sort by average normalized score if available, otherwise fallback to average scores
|
| 124 |
+
if 'Avg Normalized Score' in display_df.columns:
|
| 125 |
+
# Sort by average normalized score (already calculated in leaderboard_utils)
|
| 126 |
+
display_df = display_df.sort_values(by='Avg Normalized Score', ascending=False)
|
| 127 |
+
else:
|
| 128 |
+
# Calculate an internal sorting key based on average scores, but don't add it to the display_df
|
| 129 |
+
score_cols = [col for col in display_df.columns if col.endswith(' Score')]
|
| 130 |
+
if score_cols:
|
| 131 |
+
temp_sort_df = display_df.copy()
|
| 132 |
+
for col in score_cols:
|
| 133 |
+
temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
|
| 134 |
+
|
| 135 |
+
# Create a temporary column for sorting
|
| 136 |
+
temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
|
| 137 |
+
|
| 138 |
+
# Sort by this temporary average score (higher is better for scores)
|
| 139 |
+
# and then by Player name as a tie-breaker
|
| 140 |
+
display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
|
| 141 |
+
|
| 142 |
+
# Add medal emojis for top 3 performers
|
| 143 |
+
if len(display_df) > 0 and 'Player' in display_df.columns:
|
| 144 |
+
# Reset index to get proper ranking after sorting
|
| 145 |
+
display_df = display_df.reset_index(drop=True)
|
| 146 |
|
| 147 |
+
# Add medal emojis to Player names for top 3
|
| 148 |
+
medal_emojis = ['🥇', '🥈', '🥉']
|
| 149 |
+
for i in range(min(3, len(display_df))):
|
| 150 |
+
original_name = display_df.loc[i, 'Player']
|
| 151 |
+
display_df.loc[i, 'Player'] = f"{medal_emojis[i]} {original_name}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
# Add line breaks to column headers
|
| 154 |
new_columns = {}
|
| 155 |
for col in display_df.columns:
|
| 156 |
+
if col.endswith(' Score') and col != 'Avg Normalized Score':
|
| 157 |
# Replace 'Game Name Score' with 'Game Name\nScore'
|
| 158 |
game_name = col.replace(' Score', '')
|
| 159 |
new_col = f"{game_name}\nScore"
|
| 160 |
new_columns[col] = new_col
|
| 161 |
+
elif col == 'Avg Normalized Score':
|
| 162 |
+
# Add line break to Avg Normalized Score column
|
| 163 |
+
new_columns[col] = "Avg Normalized\nScore"
|
| 164 |
|
| 165 |
# Rename columns with new line breaks
|
| 166 |
if new_columns:
|
|
|
|
| 175 |
col_widths = ["40px"] # Row number column width
|
| 176 |
col_widths.append("230px") # Player column - reduced by 20px
|
| 177 |
col_widths.append("120px") # Organization column
|
| 178 |
+
|
| 179 |
+
# Check if there's an Avg Normalized Score column
|
| 180 |
+
if any('Avg Normalized' in col for col in df.columns):
|
| 181 |
+
col_widths.append("140px") # Avg Normalized Score column - slightly wider
|
| 182 |
+
|
| 183 |
# Add game score columns
|
| 184 |
+
remaining_cols = len(df.columns) - len(col_widths) + 1 # +1 because we subtracted row number column
|
| 185 |
+
for _ in range(remaining_cols):
|
| 186 |
col_widths.append("120px")
|
| 187 |
|
| 188 |
return gr.update(value=df,
|
|
|
|
| 201 |
# tetris_overall, tetris_details, # Commented out
|
| 202 |
tetris_plan_overall, tetris_plan_details,
|
| 203 |
ace_attorney_overall, ace_attorney_details,
|
| 204 |
+
top_n=3,
|
| 205 |
data_source=None):
|
| 206 |
global leaderboard_state
|
| 207 |
|
|
|
|
| 321 |
|
| 322 |
# Get the appropriate DataFrame and charts based on current state
|
| 323 |
if leaderboard_state["current_game"]:
|
| 324 |
+
# For detailed view - use slider value for both leaderboards
|
| 325 |
+
limit = top_n
|
| 326 |
# if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
|
| 327 |
# df = get_mario_leaderboard(data)
|
| 328 |
if leaderboard_state["current_game"] == "Super Mario Bros":
|
| 329 |
+
df = get_mario_planning_leaderboard(data, limit)
|
| 330 |
elif leaderboard_state["current_game"] == "Sokoban":
|
| 331 |
+
df = get_sokoban_leaderboard(data, limit)
|
| 332 |
elif leaderboard_state["current_game"] == "2048":
|
| 333 |
+
df = get_2048_leaderboard(data, limit)
|
| 334 |
elif leaderboard_state["current_game"] == "Candy Crush":
|
| 335 |
+
df = get_candy_leaderboard(data, limit)
|
| 336 |
elif leaderboard_state["current_game"] == "Tetris":
|
| 337 |
+
df = get_tetris_planning_leaderboard(data, limit)
|
| 338 |
elif leaderboard_state["current_game"] == "Ace Attorney":
|
| 339 |
+
df = get_ace_attorney_leaderboard(data, limit)
|
| 340 |
else: # Should not happen if current_game is one of the known games
|
| 341 |
df = pd.DataFrame() # Empty df
|
| 342 |
|
|
|
|
| 345 |
radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
|
| 346 |
group_bar_chart = chart
|
| 347 |
else:
|
| 348 |
+
# For overall view - use slider value for both leaderboards
|
| 349 |
+
limit = top_n
|
| 350 |
+
df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
|
| 351 |
display_df = prepare_dataframe_for_display(df)
|
| 352 |
+
# Pass appropriate title and top_n based on data source
|
| 353 |
+
_, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
|
| 354 |
chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
|
| 355 |
|
| 356 |
# Return values, including all four plot placeholders
|
|
|
|
| 425 |
}
|
| 426 |
}
|
| 427 |
|
| 428 |
+
def clear_filters(top_n=3, data_source=None):
|
| 429 |
global leaderboard_state
|
| 430 |
|
| 431 |
# Use provided data source or default to rank_data
|
|
|
|
| 440 |
"Ace Attorney": True
|
| 441 |
}
|
| 442 |
|
| 443 |
+
# Use slider value for both leaderboards
|
| 444 |
+
limit = top_n
|
| 445 |
+
df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
|
| 446 |
display_df = prepare_dataframe_for_display(df)
|
| 447 |
+
# Pass top_n parameter for consistent titles
|
| 448 |
+
_, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
|
| 449 |
|
| 450 |
leaderboard_state = get_initial_state()
|
| 451 |
|
|
|
|
| 698 |
max-width: 140px !important;
|
| 699 |
}
|
| 700 |
|
| 701 |
+
/* Avg Normalized Score column (4th column) */
|
| 702 |
+
.table-container th:nth-child(4),
|
| 703 |
+
.table-container td:nth-child(4) {
|
| 704 |
+
width: 140px !important;
|
| 705 |
+
min-width: 120px !important;
|
| 706 |
+
max-width: 160px !important;
|
| 707 |
+
text-align: center !important;
|
| 708 |
+
}
|
| 709 |
+
|
| 710 |
+
/* Game score columns (5th column onwards) */
|
| 711 |
+
.table-container th:nth-child(n+5),
|
| 712 |
+
.table-container td:nth-child(n+5) {
|
| 713 |
width: 120px !important;
|
| 714 |
min-width: 100px !important;
|
| 715 |
max-width: 140px !important;
|
|
|
|
| 775 |
width: 100% !important;
|
| 776 |
margin-top: 40px !important;
|
| 777 |
}
|
| 778 |
+
|
| 779 |
+
.welcome-message {
|
| 780 |
+
background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
|
| 781 |
+
color: #333;
|
| 782 |
+
padding: 20px;
|
| 783 |
+
border-radius: 10px;
|
| 784 |
+
margin: 20px 0;
|
| 785 |
+
text-align: center;
|
| 786 |
+
box-shadow: 0 4px 15px rgba(0,0,0,0.05);
|
| 787 |
+
}
|
| 788 |
+
|
| 789 |
+
.welcome-message h3 {
|
| 790 |
+
margin: 0 0 10px 0;
|
| 791 |
+
font-size: 1.3em;
|
| 792 |
+
}
|
| 793 |
+
|
| 794 |
+
.welcome-message p {
|
| 795 |
+
margin: 0;
|
| 796 |
+
font-size: 1.1em;
|
| 797 |
+
line-height: 1.5;
|
| 798 |
+
}
|
| 799 |
""") as demo:
|
| 800 |
gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
|
| 801 |
|
|
|
|
| 928 |
with gr.Tabs():
|
| 929 |
with gr.Tab("🏆 Agent Leaderboard"):
|
| 930 |
# Visualization section
|
| 931 |
+
|
| 932 |
+
with gr.Row():
|
| 933 |
+
gr.Markdown("""
|
| 934 |
+
**🎮 Welcome to LMGame Bench!**
|
| 935 |
+
|
| 936 |
+
We welcome everyone to implement their own gaming agents by replacing our baseAgent in `customer_runner.py` and test them on our benchmark. Join the competition and see how your agent performs!
|
| 937 |
+
""", elem_classes="welcome-message")
|
| 938 |
+
|
| 939 |
with gr.Row():
|
| 940 |
gr.Markdown("### 📊 Data Visualization")
|
| 941 |
|
|
|
|
| 945 |
visible=False,
|
| 946 |
elem_classes="visualization-container"
|
| 947 |
)
|
| 948 |
+
# with gr.Row():
|
| 949 |
+
# # Calculate dynamic maximum based on total models
|
| 950 |
+
# agent_max_models = get_total_model_count(rank_data)
|
| 951 |
+
# top_n_slider = gr.Slider(
|
| 952 |
+
# minimum=1,
|
| 953 |
+
# maximum=agent_max_models,
|
| 954 |
+
# step=1,
|
| 955 |
+
# value=min(3, agent_max_models),
|
| 956 |
+
# label=f"Number of Top Models to Display in All Views (max: {agent_max_models})",
|
| 957 |
+
# elem_classes="top-n-slider"
|
| 958 |
+
# )
|
| 959 |
+
|
| 960 |
+
|
| 961 |
|
| 962 |
with gr.Column(visible=True) as overall_visualizations:
|
| 963 |
with gr.Tabs():
|
|
|
|
| 968 |
elem_classes="visualization-container"
|
| 969 |
)
|
| 970 |
gr.Markdown(
|
| 971 |
+
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
|
| 972 |
elem_classes="radar-tip"
|
| 973 |
)
|
|
|
|
| 974 |
with gr.Tab("📊 Group Bar Chart"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 975 |
group_bar_visualization = gr.Plot(
|
| 976 |
label="Comparative Analysis (Group Bar Chart)",
|
| 977 |
elem_classes="visualization-container"
|
| 978 |
)
|
| 979 |
gr.Markdown(
|
| 980 |
+
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
|
| 981 |
elem_classes="radar-tip"
|
| 982 |
)
|
|
|
|
| 983 |
|
| 984 |
# Hidden placeholder for group bar visualization (to maintain code references)
|
| 985 |
# group_bar_visualization = gr.Plot(visible=False)
|
| 986 |
|
| 987 |
# Game selection section
|
| 988 |
with gr.Row():
|
| 989 |
+
gr.Markdown("### 🕹️ Game Selection")
|
| 990 |
with gr.Row():
|
| 991 |
# with gr.Column(): # Commented out Super Mario BrosUI
|
| 992 |
# gr.Markdown("**🎮 Super Mario Bros**")
|
| 993 |
# mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
|
| 994 |
# mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
|
| 995 |
with gr.Column(): # Added Super Mario BrosUI
|
| 996 |
+
gr.Markdown("**🍄 Super Mario Bros**")
|
| 997 |
mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
| 998 |
mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
| 999 |
with gr.Column(): # Sokoban is now after mario_plan
|
|
|
|
| 1033 |
# Leaderboard table
|
| 1034 |
with gr.Row():
|
| 1035 |
gr.Markdown("### 📋 Detailed Results")
|
| 1036 |
+
with gr.Row():
|
| 1037 |
+
gr.Markdown("*⚔️ - Model with our gaming agent*", elem_classes="radar-tip")
|
| 1038 |
+
|
| 1039 |
+
# Welcome message for custom gaming agents
|
| 1040 |
|
| 1041 |
# Add reference to Jupyter notebook
|
| 1042 |
with gr.Row():
|
| 1043 |
gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
|
| 1044 |
|
| 1045 |
+
# Get initial leaderboard dataframe (limited to the top 3 models; the agent leaderboard's top-N slider is currently commented out)
|
| 1046 |
initial_df = get_combined_leaderboard(rank_data, {
|
| 1047 |
# "Super Mario Bros": True, # Commented out
|
| 1048 |
"Super Mario Bros": True,
|
|
|
|
| 1052 |
# "Tetris(complete)": True, # Commented out
|
| 1053 |
"Tetris": True,
|
| 1054 |
"Ace Attorney": True
|
| 1055 |
+
}, limit_to_top_n=min(3, get_total_model_count(rank_data)))
|
| 1056 |
|
| 1057 |
# Format the DataFrame for display
|
| 1058 |
initial_display_df = prepare_dataframe_for_display(initial_df)
|
|
|
|
| 1061 |
col_widths = ["40px"] # Row number column width
|
| 1062 |
col_widths.append("230px") # Player column - reduced by 20px
|
| 1063 |
col_widths.append("120px") # Organization column
|
| 1064 |
+
|
| 1065 |
+
# Check if there's an Avg Normalized Score column
|
| 1066 |
+
if any('Avg Normalized' in col for col in initial_display_df.columns):
|
| 1067 |
+
col_widths.append("140px") # Avg Normalized Score column - slightly wider
|
| 1068 |
+
|
| 1069 |
# Add game score columns
|
| 1070 |
+
remaining_cols = len(initial_display_df.columns) - len(col_widths) + 1 # +1 because we subtracted row number column
|
| 1071 |
+
for _ in range(remaining_cols):
|
| 1072 |
col_widths.append("120px")
|
| 1073 |
|
| 1074 |
# Create a standard DataFrame component with enhanced styling
|
|
|
|
| 1133 |
# Update leaderboard and visualizations when checkboxes change
|
| 1134 |
for checkbox in checkbox_list:
|
| 1135 |
checkbox.change(
|
| 1136 |
+
lambda *args: update_leaderboard(*args, top_n=3, data_source=rank_data),
|
| 1137 |
+
inputs=checkbox_list,
|
| 1138 |
outputs=[
|
| 1139 |
leaderboard_df,
|
| 1140 |
detailed_visualization,
|
|
|
|
| 1143 |
] + checkbox_list
|
| 1144 |
)
|
| 1145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1146 |
# Update when clear button is clicked
|
| 1147 |
clear_btn.click(
|
| 1148 |
+
lambda: clear_filters(top_n=3, data_source=rank_data),
|
| 1149 |
+
inputs=[],
|
| 1150 |
outputs=[
|
| 1151 |
leaderboard_df,
|
| 1152 |
detailed_visualization,
|
|
|
|
| 1155 |
] + checkbox_list
|
| 1156 |
)
|
| 1157 |
|
| 1158 |
+
# Initialize the agent leaderboard (uses clear_filters' default top_n=3 limit)
|
| 1159 |
demo.load(
|
| 1160 |
lambda: clear_filters(data_source=rank_data),
|
| 1161 |
inputs=[],
|
|
|
|
| 1178 |
visible=False,
|
| 1179 |
elem_classes="visualization-container"
|
| 1180 |
)
|
| 1181 |
+
|
| 1182 |
+
with gr.Row():
|
| 1183 |
+
# Calculate dynamic maximum based on total models
|
| 1184 |
+
model_max_models = get_total_model_count(model_rank_data)
|
| 1185 |
+
model_top_n_slider = gr.Slider(
|
| 1186 |
+
minimum=1,
|
| 1187 |
+
maximum=model_max_models,
|
| 1188 |
+
step=1,
|
| 1189 |
+
value=min(5, model_max_models),
|
| 1190 |
+
label=f"Number of Top Models to Display in All Views (max: {model_max_models})",
|
| 1191 |
+
elem_classes="top-n-slider"
|
| 1192 |
+
)
|
| 1193 |
+
|
| 1194 |
+
|
| 1195 |
|
| 1196 |
with gr.Column(visible=True) as model_overall_visualizations:
|
| 1197 |
with gr.Tabs():
|
|
|
|
| 1205 |
elem_classes="radar-tip"
|
| 1206 |
)
|
| 1207 |
with gr.Tab("📊 Group Bar Chart"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1208 |
model_group_bar_visualization = gr.Plot(
|
| 1209 |
label="Comparative Analysis (Group Bar Chart)",
|
| 1210 |
elem_classes="visualization-container"
|
|
|
|
| 1216 |
|
| 1217 |
# Game selection section
|
| 1218 |
with gr.Row():
|
| 1219 |
+
gr.Markdown("### 🕹️ Game Selection")
|
| 1220 |
with gr.Row():
|
| 1221 |
with gr.Column():
|
| 1222 |
+
gr.Markdown("**🍄 Super Mario Bros**")
|
| 1223 |
model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
| 1224 |
model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
| 1225 |
with gr.Column():
|
|
|
|
| 1255 |
# Leaderboard table
|
| 1256 |
with gr.Row():
|
| 1257 |
gr.Markdown("### 📋 Detailed Results")
|
| 1258 |
+
with gr.Row():
|
| 1259 |
+
gr.Markdown("*💡 The slider above controls how many top models are shown in the radar chart, bar chart, and data table.*", elem_classes="radar-tip")
|
| 1260 |
|
| 1261 |
+
# Get initial leaderboard dataframe (limited by default slider value for model leaderboard)
|
| 1262 |
model_initial_df = get_combined_leaderboard(model_rank_data, {
|
| 1263 |
"Super Mario Bros": True,
|
| 1264 |
"Sokoban": True,
|
|
|
|
| 1266 |
"Candy Crush": True,
|
| 1267 |
"Tetris": True,
|
| 1268 |
"Ace Attorney": True
|
| 1269 |
+
}, limit_to_top_n=min(5, get_total_model_count(model_rank_data)))
|
| 1270 |
|
| 1271 |
# Format the DataFrame for display
|
| 1272 |
model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
|
|
|
|
| 1364 |
] + model_checkbox_list
|
| 1365 |
)
|
| 1366 |
|
| 1367 |
+
# Initialize the model leaderboard (with default slider limit)
|
| 1368 |
demo.load(
|
| 1369 |
lambda: clear_filters(data_source=model_rank_data),
|
| 1370 |
inputs=[],
|
assets/model_color.json
CHANGED
|
@@ -27,31 +27,31 @@
|
|
| 27 |
"llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
|
| 28 |
"qwen3-235B-A22B-fp8": "#6A1B9A",
|
| 29 |
"random (x30)": "#9E9E9E",
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
| 40 |
-
"
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
-
"
|
| 48 |
-
"
|
| 49 |
-
"
|
| 50 |
-
"
|
| 51 |
-
"
|
| 52 |
-
"
|
| 53 |
-
"
|
| 54 |
-
"
|
| 55 |
-
"
|
| 56 |
-
"
|
| 57 |
}
|
|
|
|
| 27 |
"llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
|
| 28 |
"qwen3-235B-A22B-fp8": "#6A1B9A",
|
| 29 |
"random (x30)": "#9E9E9E",
|
| 30 |
+
"claude-3-7-sonnet-20250219 (⚔️)": "#4A90E2",
|
| 31 |
+
"claude-3-5-haiku-20241022 (⚔️)": "#7FB5E6",
|
| 32 |
+
"claude-3-5-sonnet-20241022 (⚔️)": "#1A4C7C",
|
| 33 |
+
"claude-opus-4-20250514 (⚔️)": "#3A80D2",
|
| 34 |
+
"claude-sonnet-4-20250514 (⚔️)": "#5A9FE2",
|
| 35 |
+
"gemini-2.0-flash (⚔️)": "#FF4081",
|
| 36 |
+
"gemini-2.0-flash-thinking-exp-1219 (⚔️)": "#C2185B",
|
| 37 |
+
"gemini-2.5-pro-exp-03-25 (⚔️)": "#FF80AB",
|
| 38 |
+
"gemini-2.5-flash-preview-04-17 (⚔️)": "#F06292",
|
| 39 |
+
"gemini-2.5-flash-preview-05-20 (⚔️)": "#F8BBD9",
|
| 40 |
+
"gemini-2.5-pro-preview-05-06 (⚔️)": "#AD1457",
|
| 41 |
+
"gemini-2.5-pro-preview-06-05 (⚔️)": "#EC407A",
|
| 42 |
+
"gpt-4o-2024-11-20 (⚔️)": "#00BFA5",
|
| 43 |
+
"gpt-4.5-preview-2025-02-27 (⚔️)": "#00796B",
|
| 44 |
+
"gpt-4.1-2025-04-14 (⚔️)": "#00897B",
|
| 45 |
+
"o1-2024-12-17 (⚔️)": "#4DB6AC",
|
| 46 |
+
"o1-mini-2024-09-12 (⚔️)": "#26A69A",
|
| 47 |
+
"o3-mini-2025-01-31(medium) (⚔️)": "#80CBC4",
|
| 48 |
+
"o3-2025-04-16 (⚔️)": "#26C6DA",
|
| 49 |
+
"o4-mini-2025-04-16 (⚔️)": "#00ACC1",
|
| 50 |
+
"grok-3-beta (⚔️)": "#FF7043",
|
| 51 |
+
"grok-3-mini-beta (⚔️)": "#FF8A65",
|
| 52 |
+
"deepseek-v3 (⚔️)": "#FFC107",
|
| 53 |
+
"deepseek-r1-0120 (⚔️)": "#FFA000",
|
| 54 |
+
"deepseek-r1-0528 (⚔️)": "#FFB300",
|
| 55 |
+
"llama-4-maverick-17b-128e-instruct-fp8 (⚔️)": "#8E24AA",
|
| 56 |
+
"qwen3-235B-A22B-fp8 (⚔️)": "#6A1B9A"
|
| 57 |
}
|
data_visualization.py
CHANGED
|
@@ -2,13 +2,15 @@ import plotly.graph_objects as go
|
|
| 2 |
import numpy as np
|
| 3 |
import pandas as pd
|
| 4 |
import json
|
|
|
|
|
|
|
| 5 |
from leaderboard_utils import (
|
| 6 |
get_combined_leaderboard,
|
| 7 |
GAME_ORDER
|
| 8 |
)
|
| 9 |
|
| 10 |
# Load model colors
|
| 11 |
-
with open('assets/model_color.json', 'r') as f:
|
| 12 |
MODEL_COLORS = json.load(f)
|
| 13 |
|
| 14 |
GAME_SCORE_COLUMNS = {
|
|
@@ -126,7 +128,7 @@ def create_radar_charts(df):
|
|
| 126 |
categories = [c.replace(" Score", "") for c in game_cols]
|
| 127 |
|
| 128 |
for col in game_cols:
|
| 129 |
-
vals = df[col].replace("n/a", 0).astype(float)
|
| 130 |
mean, std = vals.mean(), vals.std()
|
| 131 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
| 132 |
|
|
@@ -179,7 +181,7 @@ def get_combined_leaderboard_with_radar(rank_data, selected_games):
|
|
| 179 |
df_viz = df.copy()
|
| 180 |
return df, create_radar_charts(df_viz)
|
| 181 |
|
| 182 |
-
def create_group_bar_chart(df, top_n=
|
| 183 |
game_cols = {}
|
| 184 |
for game in GAME_ORDER:
|
| 185 |
col = f"{game} Score"
|
|
@@ -330,8 +332,8 @@ def create_group_bar_chart(df, top_n=10):
|
|
| 330 |
|
| 331 |
|
| 332 |
|
| 333 |
-
def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=
|
| 334 |
-
df = get_combined_leaderboard(rank_data, selected_games)
|
| 335 |
# Create a copy for visualization to avoid modifying the original
|
| 336 |
df_viz = df.copy()
|
| 337 |
return df, create_group_bar_chart(df_viz, top_n)
|
|
@@ -344,7 +346,7 @@ def hex_to_rgba(hex_color, alpha=0.2):
|
|
| 344 |
return f'rgba({r}, {g}, {b}, {alpha})'
|
| 345 |
|
| 346 |
|
| 347 |
-
def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
| 348 |
if selected_games is None:
|
| 349 |
selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
|
| 350 |
|
|
@@ -359,11 +361,25 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
| 359 |
game_cols = [f"{game} Score" for game in selected_games]
|
| 360 |
categories = formatted_games
|
| 361 |
|
| 362 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
for col in game_cols:
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
# Group players by prefix and sort alphabetically
|
| 369 |
model_groups = {}
|
|
@@ -411,12 +427,23 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
| 411 |
hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
|
| 412 |
))
|
| 413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
fig.update_layout(
|
| 415 |
autosize=True,
|
| 416 |
height=550, # Reduced height for better proportion with legend
|
| 417 |
margin=dict(l=400, r=100, t=20, b=20),
|
| 418 |
title=dict(
|
| 419 |
-
text=
|
| 420 |
x=0.5,
|
| 421 |
xanchor='center',
|
| 422 |
yanchor='top',
|
|
@@ -462,12 +489,20 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
| 462 |
|
| 463 |
return fig
|
| 464 |
|
| 465 |
-
def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
|
| 466 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
selected_game_names = [g for g, sel in selected_games.items() if sel]
|
| 468 |
-
|
|
|
|
| 469 |
df_viz = df.copy()
|
| 470 |
-
|
|
|
|
|
|
|
| 471 |
|
| 472 |
def create_organization_radar_chart(rank_data):
|
| 473 |
df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
|
|
@@ -477,7 +512,7 @@ def create_organization_radar_chart(rank_data):
|
|
| 477 |
|
| 478 |
avg_df = pd.DataFrame([
|
| 479 |
{
|
| 480 |
-
**{col: df[df["Organization"] == org][col].
|
| 481 |
"Organization": org
|
| 482 |
}
|
| 483 |
for org in orgs
|
|
@@ -533,7 +568,10 @@ def create_top_players_radar_chart(rank_data, n=5):
|
|
| 533 |
|
| 534 |
for col in game_cols:
|
| 535 |
# Replace "n/a" with 0 and handle downcasting properly
|
| 536 |
-
|
|
|
|
|
|
|
|
|
|
| 537 |
mean, std = vals.mean(), vals.std()
|
| 538 |
top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
| 539 |
|
|
@@ -589,8 +627,15 @@ def create_player_radar_chart(rank_data, player_name):
|
|
| 589 |
|
| 590 |
for col in game_cols:
|
| 591 |
# Replace "n/a" with 0 and handle downcasting properly
|
| 592 |
-
|
| 593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
| 595 |
|
| 596 |
fig = go.Figure()
|
|
@@ -628,6 +673,281 @@ def create_player_radar_chart(rank_data, player_name):
|
|
| 628 |
)
|
| 629 |
return fig
|
| 630 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
|
| 632 |
def save_visualization(fig, filename):
|
| 633 |
-
fig.write_image(filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import pandas as pd
|
| 4 |
import json
|
| 5 |
+
import os
|
| 6 |
+
from datetime import datetime
|
| 7 |
from leaderboard_utils import (
|
| 8 |
get_combined_leaderboard,
|
| 9 |
GAME_ORDER
|
| 10 |
)
|
| 11 |
|
| 12 |
# Load model colors
|
| 13 |
+
with open('assets/model_color.json', 'r', encoding='utf-8') as f:
|
| 14 |
MODEL_COLORS = json.load(f)
|
| 15 |
|
| 16 |
GAME_SCORE_COLUMNS = {
|
|
|
|
| 128 |
categories = [c.replace(" Score", "") for c in game_cols]
|
| 129 |
|
| 130 |
for col in game_cols:
|
| 131 |
+
vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
|
| 132 |
mean, std = vals.mean(), vals.std()
|
| 133 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
| 134 |
|
|
|
|
| 181 |
df_viz = df.copy()
|
| 182 |
return df, create_radar_charts(df_viz)
|
| 183 |
|
| 184 |
+
def create_group_bar_chart(df, top_n=5):
|
| 185 |
game_cols = {}
|
| 186 |
for game in GAME_ORDER:
|
| 187 |
col = f"{game} Score"
|
|
|
|
| 332 |
|
| 333 |
|
| 334 |
|
| 335 |
+
def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=5, limit_to_top_n=None):
    """
    Build the combined leaderboard together with its group bar chart

    Args:
        rank_data (dict): Raw rank data, forwarded to get_combined_leaderboard
        selected_games (dict): Game names mapped to their selection status
        top_n (int): Forwarded to create_group_bar_chart
        limit_to_top_n (int | None): Forwarded to get_combined_leaderboard as
            its third argument

    Returns:
        tuple: (leaderboard DataFrame, figure from create_group_bar_chart)
    """
    leaderboard = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)
    # The chart builder works on its own copy so the returned table is never
    # modified by visualization code.
    chart = create_group_bar_chart(leaderboard.copy(), top_n)
    return leaderboard, chart
|
|
|
|
| 346 |
return f'rgba({r}, {g}, {b}, {alpha})'
|
| 347 |
|
| 348 |
|
| 349 |
+
def create_single_radar_chart(df, selected_games=None, highlight_models=None, chart_title=None, top_n=None, full_df=None):
|
| 350 |
if selected_games is None:
|
| 351 |
selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
|
| 352 |
|
|
|
|
| 361 |
game_cols = [f"{game} Score" for game in selected_games]
|
| 362 |
categories = formatted_games
|
| 363 |
|
| 364 |
+
# Use full dataset for normalization to keep consistent scale
|
| 365 |
+
# If full_df is not provided, use the current df (fallback for backward compatibility)
|
| 366 |
+
normalization_df = full_df if full_df is not None else df
|
| 367 |
+
|
| 368 |
+
# Normalize using the full dataset but apply to the limited df
|
| 369 |
for col in game_cols:
|
| 370 |
+
# Get normalization parameters from full dataset
|
| 371 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
| 372 |
+
full_series = normalization_df[col].copy()
|
| 373 |
+
full_series = full_series.where(full_series != "n/a", 0)
|
| 374 |
+
full_vals = full_series.astype(float)
|
| 375 |
+
mean, std = full_vals.mean(), full_vals.std()
|
| 376 |
+
|
| 377 |
+
# Apply normalization to the limited df
|
| 378 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
| 379 |
+
limited_series = df[col].copy()
|
| 380 |
+
limited_series = limited_series.where(limited_series != "n/a", 0)
|
| 381 |
+
limited_vals = limited_series.astype(float)
|
| 382 |
+
df[f"norm_{col}"] = normalize_values(limited_vals, mean, std)
|
| 383 |
|
| 384 |
# Group players by prefix and sort alphabetically
|
| 385 |
model_groups = {}
|
|
|
|
| 427 |
hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
|
| 428 |
))
|
| 429 |
|
| 430 |
+
# Dynamic title based on the data source and top_n
|
| 431 |
+
if chart_title is None:
|
| 432 |
+
if top_n is not None:
|
| 433 |
+
chart_title = f"Radar Chart - Top {top_n} Performers by Game"
|
| 434 |
+
else:
|
| 435 |
+
# Fallback title
|
| 436 |
+
if len(df) <= 10:
|
| 437 |
+
chart_title = "🎮 Agent Performance Across Games"
|
| 438 |
+
else:
|
| 439 |
+
chart_title = "🤖 Model Performance Across Games"
|
| 440 |
+
|
| 441 |
fig.update_layout(
|
| 442 |
autosize=True,
|
| 443 |
height=550, # Reduced height for better proportion with legend
|
| 444 |
margin=dict(l=400, r=100, t=20, b=20),
|
| 445 |
title=dict(
|
| 446 |
+
text=chart_title,
|
| 447 |
x=0.5,
|
| 448 |
xanchor='center',
|
| 449 |
yanchor='top',
|
|
|
|
| 489 |
|
| 490 |
return fig
|
| 491 |
|
| 492 |
+
def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None, limit_to_top_n=None, chart_title=None, top_n=None):
    """
    Build the combined leaderboard together with a single radar chart

    The leaderboard is fetched twice: once without a limit, so the radar chart
    can normalize against every row, and once with limit_to_top_n for the rows
    that are actually displayed.

    Args:
        rank_data (dict): Raw rank data, forwarded to get_combined_leaderboard
        selected_games (dict): Game names mapped to their selection status
        highlight_models (list | None): Forwarded to create_single_radar_chart
        limit_to_top_n (int | None): Row limit for the displayed leaderboard
        chart_title (str | None): Forwarded to create_single_radar_chart
        top_n (int | None): Forwarded to create_single_radar_chart

    Returns:
        tuple: (limited leaderboard DataFrame, radar chart figure)
    """
    # Unlimited table: source of the normalization statistics.
    normalization_source = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None)

    # Limited table: what is shown to the user and returned to the caller.
    leaderboard = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)

    active_games = [name for name, is_selected in selected_games.items() if is_selected]

    # Both frames are copied because the chart builder adds norm_* columns.
    chart = create_single_radar_chart(
        leaderboard.copy(),
        active_games,
        highlight_models,
        chart_title,
        top_n,
        normalization_source.copy(),
    )
    return leaderboard, chart
|
| 506 |
|
| 507 |
def create_organization_radar_chart(rank_data):
|
| 508 |
df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
|
|
|
|
| 512 |
|
| 513 |
avg_df = pd.DataFrame([
|
| 514 |
{
|
| 515 |
+
**{col: df[df["Organization"] == org][col].where(df[df["Organization"] == org][col] != "n/a", 0).astype(float).mean() for col in game_cols},
|
| 516 |
"Organization": org
|
| 517 |
}
|
| 518 |
for org in orgs
|
|
|
|
| 568 |
|
| 569 |
for col in game_cols:
|
| 570 |
# Replace "n/a" with 0 and handle downcasting properly
|
| 571 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
| 572 |
+
series = top_df[col].copy()
|
| 573 |
+
series = series.where(series != "n/a", 0)
|
| 574 |
+
vals = series.astype(float)
|
| 575 |
mean, std = vals.mean(), vals.std()
|
| 576 |
top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
| 577 |
|
|
|
|
| 627 |
|
| 628 |
for col in game_cols:
|
| 629 |
# Replace "n/a" with 0 and handle downcasting properly
|
| 630 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
| 631 |
+
player_series = player_df[col].copy()
|
| 632 |
+
player_series = player_series.where(player_series != "n/a", 0)
|
| 633 |
+
vals = player_series.astype(float)
|
| 634 |
+
|
| 635 |
+
df_series = df[col].copy()
|
| 636 |
+
df_series = df_series.where(df_series != "n/a", 0)
|
| 637 |
+
df_vals = df_series.astype(float)
|
| 638 |
+
mean, std = df_vals.mean(), df_vals.std()
|
| 639 |
player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
| 640 |
|
| 641 |
fig = go.Figure()
|
|
|
|
| 673 |
)
|
| 674 |
return fig
|
| 675 |
|
| 676 |
+
def save_normalized_data(df, selected_games, filename="normalized_data.json"):
    """
    Save normalized data to a JSON file for caching

    Args:
        df (pd.DataFrame): DataFrame with raw scores; must have a "Player"
            column, and "n/a" marks a missing score
        selected_games (dict): Dictionary of selected games. It is stored in
            the cache as metadata only: every GAME_ORDER game that has a
            "<game> Score" column in df is processed regardless of selection
        filename (str): Output filename, created inside the "cache" directory

    Returns:
        str: Path of the written cache file
    """
    game_cols = [f"{game} Score" for game in GAME_ORDER if f"{game} Score" in df.columns]

    normalization_data = {
        "timestamp": datetime.now().isoformat(),
        "selected_games": selected_games,
        "games": {},
        "players": {}
    }
    game_stats = normalization_data["games"]

    # Store normalization parameters per game ("n/a" counts as 0 here)
    for col in game_cols:
        game_name = col.replace(" Score", "")
        vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
        mean, std = vals.mean(), vals.std()

        game_stats[game_name] = {
            "mean": mean,
            "std": std,
            "raw_scores": vals.to_dict()
        }

    # Store raw and normalized scores per player
    for _, row in df.iterrows():
        player = row["Player"]
        player_data = {"organization": row.get("Organization", "unknown")}

        for col in game_cols:
            game_name = col.replace(" Score", "")
            raw_score = row[col]

            if raw_score != "n/a":
                raw_score = float(raw_score)
                stats = game_stats[game_name]
                normalized = normalize_values([raw_score], stats["mean"], stats["std"])[0]
            else:
                # Missing score: keep the "n/a" marker and plot it as 0
                normalized = 0

            player_data[f"{game_name}_raw"] = raw_score
            player_data[f"{game_name}_normalized"] = normalized

        normalization_data["players"][player] = player_data

    # Save to file
    os.makedirs("cache", exist_ok=True)
    filepath = os.path.join("cache", filename)

    # Explicit encoding, matching the other JSON files opened in this module
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(normalization_data, f, indent=2)

    print(f"Normalized data saved to {filepath}")
    return filepath
|
| 739 |
+
|
| 740 |
+
def load_normalized_data(filename="normalized_data.json"):
    """
    Load normalized data from a JSON file

    Args:
        filename (str): Input filename, looked up inside the "cache" directory

    Returns:
        dict: Normalized data, or None if the file doesn't exist or cannot be
        read or parsed
    """
    filepath = os.path.join("cache", filename)

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing cache is the normal "not generated yet" case: stay silent
        return None
    except (OSError, ValueError) as e:
        # OSError covers unreadable paths; ValueError covers invalid JSON
        # (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError)
        print(f"Error loading normalized data: {e}")
        return None

    print(f"Normalized data loaded from {filepath}")
    return data
|
| 763 |
+
|
| 764 |
+
def get_normalized_scores_from_cache(players, games, cache_data):
    """
    Extract normalized scores from cached data

    Players that are not present in cache_data["players"] are skipped, so the
    result may have fewer rows than `players`.

    Args:
        players (list): List of player names
        games (list): List of game names
        cache_data (dict): Cached normalization data with a "players" mapping
            whose entries hold "<game>_raw" and "<game>_normalized" keys

    Returns:
        pd.DataFrame: One row per cached player with a "Player" column and,
        for each game, "<game> Score" and "norm_<game> Score" columns
    """
    cached_players = cache_data["players"]
    rows = []

    for player in players:
        if player not in cached_players:
            continue

        entry = cached_players[player]
        record = {"Player": player}

        for game in games:
            raw_key = f"{game}_raw"
            if raw_key in entry:
                record[f"{game} Score"] = entry[raw_key]
                # Fall back to 0 rather than raising KeyError if a cache
                # entry has a raw score but no normalized partner
                record[f"norm_{game} Score"] = entry.get(f"{game}_normalized", 0)
            else:
                record[f"{game} Score"] = "n/a"
                record[f"norm_{game} Score"] = 0

        rows.append(record)

    return pd.DataFrame(rows)
|
| 797 |
|
| 798 |
def save_visualization(fig, filename):
    """
    Write a figure to an image file

    Args:
        fig: Object exposing write_image(filename); presumably a Plotly
            figure, since this module builds its charts with
            plotly.graph_objects (confirm against callers)
        filename (str): Destination path, passed straight to fig.write_image
    """
    fig.write_image(filename)
|
| 800 |
+
|
| 801 |
+
def generate_and_save_normalized_data(rank_data, filename="normalized_data.json"):
    """
    Generate normalized data for all games and save to file

    Args:
        rank_data (dict): Raw rank data
        filename (str): Output filename, forwarded to save_normalized_data

    Returns:
        str: Path to saved file
    """
    # Every game in GAME_ORDER is marked as selected
    every_game = dict.fromkeys(GAME_ORDER, True)

    # Combined leaderboard without a row limit
    leaderboard = get_combined_leaderboard(rank_data, every_game)

    return save_normalized_data(leaderboard, every_game, filename)
|
| 820 |
+
|
| 821 |
+
def create_single_radar_chart_with_cache(df, selected_games=None, highlight_models=None, use_cache=True, cache_filename="normalized_data.json"):
    """
    Create radar chart with optional caching support

    When a cache is loaded, normalized scores come from it and players that
    are missing from the cache are left out of the chart. Otherwise scores are
    normalized on the fly from df, with "n/a" treated as 0.

    Args:
        df (pd.DataFrame): Leaderboard with a "Player" column. The cached path
            also reads "Organization"; the on-the-fly path reads a
            "<game> Score" column for every selected game
        selected_games (list | None): Game names to plot; defaults to
            Super Mario Bros, 2048, Candy Crush, Sokoban and Ace Attorney
        highlight_models (list | None): Players drawn in red with heavier lines
        use_cache (bool): Whether to try load_normalized_data first
        cache_filename (str): Cache filename passed to load_normalized_data

    Returns:
        go.Figure: Radar chart with one closed trace per player
    """
    if selected_games is None:
        selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']

    # Try to load from cache first
    cached_data = load_normalized_data(cache_filename) if use_cache else None

    if cached_data:
        # Use cached normalized data
        players = df["Player"].tolist()
        df_normalized = get_normalized_scores_from_cache(players, selected_games, cached_data)
        # Merge with original df to get Organization info
        df_normalized = df_normalized.merge(df[["Player", "Organization"]], on="Player", how="left")
    else:
        # Fall back to on-the-fly normalization
        df_normalized = df.copy()
        for game in selected_games:
            col = f"{game} Score"
            vals = df_normalized[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
            mean, std = vals.mean(), vals.std()
            df_normalized[f"norm_{col}"] = normalize_values(vals, mean, std)

    # Axis labels: only Super Mario Bros is abbreviated
    categories = ['SMB' if game == 'Super Mario Bros' else game for game in selected_games]

    # Group players by prefix and sort alphabetically
    model_groups = {}
    for player in df_normalized["Player"]:
        model_groups.setdefault(get_model_prefix(player), []).append(player)

    # Prefixes and the members of each group are ordered case-insensitively
    grouped_players = []
    for prefix in sorted(model_groups, key=str.lower):
        grouped_players.extend(sorted(model_groups[prefix], key=str.lower))

    fig = go.Figure()

    for player in grouped_players:
        row = df_normalized[df_normalized["Player"] == player]
        if row.empty:
            continue
        row = row.iloc[0]

        is_highlighted = highlight_models and player in highlight_models
        color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
        fillcolor = 'rgba(255, 0, 0, 0.4)' if is_highlighted else hex_to_rgba(color, 0.2)

        # Both the cached and the on-the-fly path use the same column names,
        # so a single lookup serves both
        r = [row[f"norm_{game} Score"] for game in selected_games]

        display_name = player.lower()

        fig.add_trace(go.Scatterpolar(
            # Repeat the first point so the polygon closes
            r=r + [r[0]],
            theta=categories + [categories[0]],
            mode='lines+markers',
            fill='toself',
            name=display_name,
            line=dict(color=color, width=6 if is_highlighted else 2),
            marker=dict(color=color, size=10 if is_highlighted else 6),
            fillcolor=fillcolor,
            opacity=1.0 if is_highlighted else 0.7,
            hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
        ))

    fig.update_layout(
        autosize=True,
        height=550,
        margin=dict(l=400, r=100, t=20, b=20),
        title=dict(
            text="AI Normalized Performance Across Games",
            x=0.5,
            xanchor='center',
            yanchor='top',
            y=0.95,
            font=dict(size=20),
            pad=dict(b=20)
        ),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                tickangle=45,
                tickfont=dict(size=12),
                gridcolor='lightgray',
                gridwidth=1,
                angle=45
            ),
            angularaxis=dict(
                tickfont=dict(size=14, weight='bold'),
                tickangle=0
            )
        ),
        legend=dict(
            font=dict(size=12),
            title="Choose your model 💡 (click / double-click)",
            itemsizing='trace',
            x=-1.4,
            y=0.8,
            yanchor='top',
            xanchor='left',
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='gray',
            borderwidth=1,
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        )
    )

    return fig
|
generate_normalized_cache.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to generate normalized data cache for faster visualization loading.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python generate_normalized_cache.py [input_file] [output_file]
|
| 7 |
+
|
| 8 |
+
Example:
|
| 9 |
+
python generate_normalized_cache.py data/rank_data.json normalized_data.json
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
import json
|
| 14 |
+
from data_visualization import generate_and_save_normalized_data, load_normalized_data
|
| 15 |
+
|
| 16 |
+
def main():
    """
    Generate the normalized data cache and report the result on stdout

    Reads the rank data JSON named by the first command line argument
    (default "data/rank_data.json"), writes the cache under the filename given
    by the second argument (default "normalized_data.json"), then reloads the
    cache to verify it. Failures are printed rather than raised.
    """
    # Default files
    input_file = "data/rank_data.json"  # Update this path as needed
    output_file = "normalized_data.json"

    # Handle command line arguments
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]

    try:
        # Load rank data
        print(f"Loading rank data from {input_file}...")
        # Explicit encoding so the file reads the same on every platform
        with open(input_file, 'r', encoding='utf-8') as f:
            rank_data = json.load(f)

        # Generate and save normalized data
        print("Generating normalized data...")
        saved_path = generate_and_save_normalized_data(rank_data, output_file)

        # Verify the saved data
        print("Verifying saved data...")
        cached_data = load_normalized_data(output_file)

        if cached_data:
            print("✅ Successfully generated normalized data cache!")
            print(f"📁 Saved to: {saved_path}")
            print(f"🎮 Games included: {list(cached_data['games'].keys())}")
            print(f"👥 Players included: {len(cached_data['players'])}")
            print(f"📅 Generated at: {cached_data['timestamp']}")
        else:
            print("❌ Failed to verify cached data")

    except FileNotFoundError:
        print(f"❌ Error: Could not find input file '{input_file}'")
        print("Please check the file path and try again.")
    except Exception as e:
        # Top-level boundary of a command line script: report any other
        # failure instead of showing a traceback
        print(f"❌ Error: {e}")
|
| 55 |
+
|
| 56 |
+
if __name__ == "__main__":
|
| 57 |
+
main()
|
leaderboard_utils.py
CHANGED
|
@@ -32,7 +32,7 @@ def get_organization(model_name):
|
|
| 32 |
return "unknown"
|
| 33 |
|
| 34 |
|
| 35 |
-
def get_sokoban_leaderboard(rank_data):
|
| 36 |
data = rank_data.get("Sokoban", {}).get("results", [])
|
| 37 |
df = pd.DataFrame(data)
|
| 38 |
df = df.rename(columns={
|
|
@@ -53,9 +53,12 @@ def get_sokoban_leaderboard(rank_data):
|
|
| 53 |
if "Score" in df.columns:
|
| 54 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 55 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
|
|
|
| 56 |
return df
|
| 57 |
|
| 58 |
-
def get_2048_leaderboard(rank_data):
|
| 59 |
data = rank_data.get("2048", {}).get("results", [])
|
| 60 |
# --- Diagnostic Print Removed ---
|
| 61 |
# if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
|
|
@@ -108,9 +111,12 @@ def get_2048_leaderboard(rank_data):
|
|
| 108 |
if "Score" in df.columns:
|
| 109 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 110 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
|
|
|
| 111 |
return df
|
| 112 |
|
| 113 |
-
def get_candy_leaderboard(rank_data):
|
| 114 |
data = rank_data.get("Candy Crush", {}).get("results", [])
|
| 115 |
df = pd.DataFrame(data)
|
| 116 |
df = df.rename(columns={
|
|
@@ -127,9 +133,12 @@ def get_candy_leaderboard(rank_data):
|
|
| 127 |
if "Score" in df.columns:
|
| 128 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 129 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
|
|
|
| 130 |
return df
|
| 131 |
|
| 132 |
-
def get_tetris_planning_leaderboard(rank_data):
|
| 133 |
data = rank_data.get("Tetris", {}).get("results", [])
|
| 134 |
df = pd.DataFrame(data)
|
| 135 |
df = df.rename(columns={
|
|
@@ -147,9 +156,12 @@ def get_tetris_planning_leaderboard(rank_data):
|
|
| 147 |
if "Score" in df.columns:
|
| 148 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 149 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
|
|
|
| 150 |
return df
|
| 151 |
|
| 152 |
-
def get_ace_attorney_leaderboard(rank_data):
|
| 153 |
data = rank_data.get("Ace Attorney", {}).get("results", [])
|
| 154 |
df = pd.DataFrame(data)
|
| 155 |
df = df.rename(columns={
|
|
@@ -168,9 +180,12 @@ def get_ace_attorney_leaderboard(rank_data):
|
|
| 168 |
if "Score" in df.columns:
|
| 169 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 170 |
df = df.sort_values("Score", ascending=False) # Higher score is better
|
|
|
|
|
|
|
|
|
|
| 171 |
return df
|
| 172 |
|
| 173 |
-
def get_mario_planning_leaderboard(rank_data):
|
| 174 |
data = rank_data.get("Super Mario Bros", {}).get("results", [])
|
| 175 |
df = pd.DataFrame(data)
|
| 176 |
df = df.rename(columns={
|
|
@@ -188,6 +203,9 @@ def get_mario_planning_leaderboard(rank_data):
|
|
| 188 |
if "Score" in df.columns:
|
| 189 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 190 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
|
|
|
| 191 |
return df
|
| 192 |
|
| 193 |
def calculate_rank_and_completeness(rank_data, selected_games):
|
|
@@ -285,13 +303,14 @@ def calculate_rank_and_completeness(rank_data, selected_games):
|
|
| 285 |
|
| 286 |
return df_results
|
| 287 |
|
| 288 |
-
def get_combined_leaderboard(rank_data, selected_games):
|
| 289 |
"""
|
| 290 |
Get combined leaderboard for selected games
|
| 291 |
|
| 292 |
Args:
|
| 293 |
rank_data (dict): Dictionary containing rank data
|
| 294 |
selected_games (dict): Dictionary of game names and their selection status
|
|
|
|
| 295 |
|
| 296 |
Returns:
|
| 297 |
pd.DataFrame: Combined leaderboard DataFrame
|
|
@@ -358,20 +377,64 @@ def get_combined_leaderboard(rank_data, selected_games):
|
|
| 358 |
# Create DataFrame
|
| 359 |
df_results = pd.DataFrame(results)
|
| 360 |
|
| 361 |
-
#
|
| 362 |
if not df_results.empty:
|
| 363 |
-
#
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
| 365 |
for game in GAME_ORDER:
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
-
# Sort by
|
| 372 |
-
df_results = df_results.sort_values("
|
| 373 |
|
| 374 |
-
#
|
| 375 |
-
|
|
|
|
| 376 |
|
| 377 |
return df_results
|
|
|
|
| 32 |
return "unknown"
|
| 33 |
|
| 34 |
|
| 35 |
+
def get_sokoban_leaderboard(rank_data, limit_to_top_n=None):
|
| 36 |
data = rank_data.get("Sokoban", {}).get("results", [])
|
| 37 |
df = pd.DataFrame(data)
|
| 38 |
df = df.rename(columns={
|
|
|
|
| 53 |
if "Score" in df.columns:
|
| 54 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 55 |
df = df.sort_values("Score", ascending=False)
|
| 56 |
+
# Apply limit if specified
|
| 57 |
+
if limit_to_top_n is not None:
|
| 58 |
+
df = df.head(limit_to_top_n)
|
| 59 |
return df
|
| 60 |
|
| 61 |
+
def get_2048_leaderboard(rank_data, limit_to_top_n=None):
|
| 62 |
data = rank_data.get("2048", {}).get("results", [])
|
| 63 |
# --- Diagnostic Print Removed ---
|
| 64 |
# if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
|
|
|
|
| 111 |
if "Score" in df.columns:
|
| 112 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 113 |
df = df.sort_values("Score", ascending=False)
|
| 114 |
+
# Apply limit if specified
|
| 115 |
+
if limit_to_top_n is not None:
|
| 116 |
+
df = df.head(limit_to_top_n)
|
| 117 |
return df
|
| 118 |
|
| 119 |
+
def get_candy_leaderboard(rank_data, limit_to_top_n=None):
|
| 120 |
data = rank_data.get("Candy Crush", {}).get("results", [])
|
| 121 |
df = pd.DataFrame(data)
|
| 122 |
df = df.rename(columns={
|
|
|
|
| 133 |
if "Score" in df.columns:
|
| 134 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 135 |
df = df.sort_values("Score", ascending=False)
|
| 136 |
+
# Apply limit if specified
|
| 137 |
+
if limit_to_top_n is not None:
|
| 138 |
+
df = df.head(limit_to_top_n)
|
| 139 |
return df
|
| 140 |
|
| 141 |
+
def get_tetris_planning_leaderboard(rank_data, limit_to_top_n=None):
|
| 142 |
data = rank_data.get("Tetris", {}).get("results", [])
|
| 143 |
df = pd.DataFrame(data)
|
| 144 |
df = df.rename(columns={
|
|
|
|
| 156 |
if "Score" in df.columns:
|
| 157 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 158 |
df = df.sort_values("Score", ascending=False)
|
| 159 |
+
# Apply limit if specified
|
| 160 |
+
if limit_to_top_n is not None:
|
| 161 |
+
df = df.head(limit_to_top_n)
|
| 162 |
return df
|
| 163 |
|
| 164 |
+
def get_ace_attorney_leaderboard(rank_data, limit_to_top_n=None):
|
| 165 |
data = rank_data.get("Ace Attorney", {}).get("results", [])
|
| 166 |
df = pd.DataFrame(data)
|
| 167 |
df = df.rename(columns={
|
|
|
|
| 180 |
if "Score" in df.columns:
|
| 181 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 182 |
df = df.sort_values("Score", ascending=False) # Higher score is better
|
| 183 |
+
# Apply limit if specified
|
| 184 |
+
if limit_to_top_n is not None:
|
| 185 |
+
df = df.head(limit_to_top_n)
|
| 186 |
return df
|
| 187 |
|
| 188 |
+
def get_mario_planning_leaderboard(rank_data, limit_to_top_n=None):
|
| 189 |
data = rank_data.get("Super Mario Bros", {}).get("results", [])
|
| 190 |
df = pd.DataFrame(data)
|
| 191 |
df = df.rename(columns={
|
|
|
|
| 203 |
if "Score" in df.columns:
|
| 204 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
| 205 |
df = df.sort_values("Score", ascending=False)
|
| 206 |
+
# Apply limit if specified
|
| 207 |
+
if limit_to_top_n is not None:
|
| 208 |
+
df = df.head(limit_to_top_n)
|
| 209 |
return df
|
| 210 |
|
| 211 |
def calculate_rank_and_completeness(rank_data, selected_games):
|
|
|
|
| 303 |
|
| 304 |
return df_results
|
| 305 |
|
| 306 |
+
def get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None):
|
| 307 |
"""
|
| 308 |
Get combined leaderboard for selected games
|
| 309 |
|
| 310 |
Args:
|
| 311 |
rank_data (dict): Dictionary containing rank data
|
| 312 |
selected_games (dict): Dictionary of game names and their selection status
|
| 313 |
+
limit_to_top_n (int, optional): Limit results to top N entries. None means no limit.
|
| 314 |
|
| 315 |
Returns:
|
| 316 |
pd.DataFrame: Combined leaderboard DataFrame
|
|
|
|
| 377 |
# Create DataFrame
|
| 378 |
df_results = pd.DataFrame(results)
|
| 379 |
|
| 380 |
+
# Calculate normalized scores and average normalized score
|
| 381 |
if not df_results.empty:
|
| 382 |
+
# Import the normalize_values function from data_visualization
|
| 383 |
+
from data_visualization import normalize_values
|
| 384 |
+
|
| 385 |
+
# Calculate normalized scores for each game
|
| 386 |
+
game_score_columns = []
|
| 387 |
for game in GAME_ORDER:
|
| 388 |
+
score_col = f"{game} Score"
|
| 389 |
+
if score_col in df_results.columns:
|
| 390 |
+
game_score_columns.append(score_col)
|
| 391 |
+
# Get numeric values, replacing 'n/a' with NaN
|
| 392 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
| 393 |
+
series = df_results[score_col].copy()
|
| 394 |
+
series = series.where(series != 'n/a', np.nan)
|
| 395 |
+
numeric_scores = pd.to_numeric(series, errors='coerce')
|
| 396 |
+
|
| 397 |
+
# Skip games where all scores are NaN or 0
|
| 398 |
+
valid_scores = numeric_scores.dropna()
|
| 399 |
+
if len(valid_scores) > 0 and valid_scores.sum() > 0:
|
| 400 |
+
mean = valid_scores.mean()
|
| 401 |
+
std = valid_scores.std() if len(valid_scores) > 1 else 0
|
| 402 |
+
|
| 403 |
+
# Calculate normalized scores for all players
|
| 404 |
+
normalized_scores = []
|
| 405 |
+
for _, row in df_results.iterrows():
|
| 406 |
+
score = row[score_col]
|
| 407 |
+
if score == 'n/a' or pd.isna(score):
|
| 408 |
+
normalized_scores.append(0)
|
| 409 |
+
else:
|
| 410 |
+
normalized_scores.append(normalize_values([float(score)], mean, std)[0])
|
| 411 |
+
|
| 412 |
+
df_results[f"norm_{score_col}"] = normalized_scores
|
| 413 |
+
else:
|
| 414 |
+
# If no valid scores, set all normalized scores to 0
|
| 415 |
+
df_results[f"norm_{score_col}"] = 0
|
| 416 |
+
|
| 417 |
+
# Calculate average normalized score across games
|
| 418 |
+
normalized_columns = [f"norm_{col}" for col in game_score_columns if f"norm_{col}" in df_results.columns]
|
| 419 |
+
if normalized_columns:
|
| 420 |
+
df_results["Avg Normalized Score"] = df_results[normalized_columns].mean(axis=1).round(2)
|
| 421 |
+
else:
|
| 422 |
+
df_results["Avg Normalized Score"] = 0.0
|
| 423 |
+
|
| 424 |
+
# Reorder columns to put Avg Normalized Score after Organization
|
| 425 |
+
base_columns = ["Player", "Organization", "Avg Normalized Score"]
|
| 426 |
+
game_columns = [col for col in df_results.columns if col.endswith(" Score") and not col.startswith("norm_") and col != "Avg Normalized Score"]
|
| 427 |
+
other_columns = [col for col in df_results.columns if col not in base_columns + game_columns and not col.startswith("norm_")]
|
| 428 |
+
|
| 429 |
+
# Create final column order
|
| 430 |
+
final_columns = base_columns + game_columns + other_columns
|
| 431 |
+
df_results = df_results[final_columns]
|
| 432 |
|
| 433 |
+
# Sort by average normalized score in descending order
|
| 434 |
+
df_results = df_results.sort_values("Avg Normalized Score", ascending=False)
|
| 435 |
|
| 436 |
+
# Apply limit if specified
|
| 437 |
+
if limit_to_top_n is not None:
|
| 438 |
+
df_results = df_results.head(limit_to_top_n)
|
| 439 |
|
| 440 |
return df_results
|
rank_data_03_25_2025.json
CHANGED
|
@@ -3,61 +3,61 @@
|
|
| 3 |
"runs": 3,
|
| 4 |
"results": [
|
| 5 |
{
|
| 6 |
-
"model": "
|
| 7 |
"score": 1267.7,
|
| 8 |
"detail_data": "709,1532,1562",
|
| 9 |
"progress": "1-1"
|
| 10 |
},
|
| 11 |
{
|
| 12 |
-
"model": "
|
| 13 |
"score": 1418.7,
|
| 14 |
"detail_data": "2015,709,1532",
|
| 15 |
"progress": "1-1"
|
| 16 |
},
|
| 17 |
{
|
| 18 |
-
"model": "
|
| 19 |
"score": 1385.0,
|
| 20 |
"detail_data": "1672,1266,1247",
|
| 21 |
"progress": "1-1"
|
| 22 |
},
|
| 23 |
{
|
| 24 |
-
"model": "
|
| 25 |
"score": 1498.3,
|
| 26 |
"detail_data": "1561,1271,1663",
|
| 27 |
"progress": "1-1"
|
| 28 |
},
|
| 29 |
{
|
| 30 |
-
"model": "
|
| 31 |
"score": 1468.7,
|
| 32 |
"detail_data": "898,2008,1500",
|
| 33 |
"progress": "1-1"
|
| 34 |
},
|
| 35 |
{
|
| 36 |
-
"model": "
|
| 37 |
"score": 2126.3,
|
| 38 |
"detail_data": "1531,722,4126",
|
| 39 |
"progress": "1-1"
|
| 40 |
},
|
| 41 |
{
|
| 42 |
-
"model": "
|
| 43 |
"score": 2047.3,
|
| 44 |
"detail_data": "2017,2590,1535",
|
| 45 |
"progress": "1-1"
|
| 46 |
},
|
| 47 |
{
|
| 48 |
-
"model": "
|
| 49 |
"score": 855,
|
| 50 |
"detail_data": "855",
|
| 51 |
"progress": "1-1"
|
| 52 |
},
|
| 53 |
{
|
| 54 |
-
"model": "
|
| 55 |
"score": 3445,
|
| 56 |
"detail_data": "3445",
|
| 57 |
"progress": "1-1"
|
| 58 |
},
|
| 59 |
{
|
| 60 |
-
"model": "
|
| 61 |
"score": 1448.0,
|
| 62 |
"detail_data": "1525,1263,1556",
|
| 63 |
"progress": "1-1"
|
|
@@ -74,79 +74,79 @@
|
|
| 74 |
"runs": 3,
|
| 75 |
"results": [
|
| 76 |
{
|
| 77 |
-
"model": "
|
| 78 |
"score": 1914.67,
|
| 79 |
"details": "1352,2860,1532",
|
| 80 |
"highest_tail": 256
|
| 81 |
},
|
| 82 |
{
|
| 83 |
-
"model": "
|
| 84 |
"score": 2624,
|
| 85 |
"details": "2560,3224,2088",
|
| 86 |
"highest_tail": 256
|
| 87 |
},
|
| 88 |
{
|
| 89 |
-
"model": "
|
| 90 |
"score": 1873.33,
|
| 91 |
"details": "700,1240,3680",
|
| 92 |
"highest_tail": 256
|
| 93 |
},
|
| 94 |
{
|
| 95 |
-
"model": "
|
| 96 |
"score": 1697.33,
|
| 97 |
"details": "1304,1316,2472",
|
| 98 |
"highest_tail": 256
|
| 99 |
},
|
| 100 |
{
|
| 101 |
-
"model": "
|
| 102 |
"score": 3586.67,
|
| 103 |
"details": "5300,2400,3060",
|
| 104 |
"highest_tail": 512
|
| 105 |
},
|
| 106 |
{
|
| 107 |
-
"model": "
|
| 108 |
"score": 4036,
|
| 109 |
"details": "6412,2492,3204",
|
| 110 |
"highest_tail": 512
|
| 111 |
},
|
| 112 |
{
|
| 113 |
-
"model": "
|
| 114 |
"score": 1586.67,
|
| 115 |
"details": "1404,1272,2084",
|
| 116 |
"highest_tail": 128
|
| 117 |
},
|
| 118 |
{
|
| 119 |
-
"model": "
|
| 120 |
"score": 1656,
|
| 121 |
"details": "1156,2664,1148",
|
| 122 |
"highest_tail": 256
|
| 123 |
},
|
| 124 |
{
|
| 125 |
-
"model": "
|
| 126 |
"score": 1656,
|
| 127 |
"details": "1604,1284,2080",
|
| 128 |
"highest_tail": 256
|
| 129 |
},
|
| 130 |
{
|
| 131 |
-
"model": "
|
| 132 |
"score": 7580,
|
| 133 |
"details": "7580",
|
| 134 |
"highest_tail": 512
|
| 135 |
},
|
| 136 |
{
|
| 137 |
-
"model": "
|
| 138 |
"score": 2757.33,
|
| 139 |
"details": "3132,2004,3136",
|
| 140 |
"highest_tail": 256
|
| 141 |
},
|
| 142 |
{
|
| 143 |
-
"model": "
|
| 144 |
"score": 7120,
|
| 145 |
"details": "7120",
|
| 146 |
"highest_tail": 512
|
| 147 |
},
|
| 148 |
{
|
| 149 |
-
"model": "
|
| 150 |
"score": 4432.0,
|
| 151 |
"details": "4928,5456,2912",
|
| 152 |
"highest_tail": 512
|
|
@@ -158,25 +158,25 @@
|
|
| 158 |
"highest_tail": 128
|
| 159 |
},
|
| 160 |
{
|
| 161 |
-
"model": "
|
| 162 |
"score": 3036.0,
|
| 163 |
"details": "3036.0",
|
| 164 |
"highest_tail": 256
|
| 165 |
},
|
| 166 |
{
|
| 167 |
-
"model": "
|
| 168 |
"score": 3136,
|
| 169 |
"details": "2148,2360,4900",
|
| 170 |
"highest_tail": 256
|
| 171 |
},
|
| 172 |
{
|
| 173 |
-
"model": "
|
| 174 |
"score": 3330.0,
|
| 175 |
"details": "3260,3400",
|
| 176 |
"highest_tail": 256
|
| 177 |
},
|
| 178 |
{
|
| 179 |
-
"model": "
|
| 180 |
"score": 2144.0,
|
| 181 |
"details": "1436,2556,2440",
|
| 182 |
"highest_tail": 256
|
|
@@ -187,67 +187,67 @@
|
|
| 187 |
"runs": 3,
|
| 188 |
"results": [
|
| 189 |
{
|
| 190 |
-
"model": "
|
| 191 |
"score": 14.7,
|
| 192 |
"details": "16,14,14"
|
| 193 |
},
|
| 194 |
{
|
| 195 |
-
"model": "
|
| 196 |
"score": 16.3,
|
| 197 |
"details": "19,15,15"
|
| 198 |
},
|
| 199 |
{
|
| 200 |
-
"model": "
|
| 201 |
"score": 14.3,
|
| 202 |
"details": "15,14,14"
|
| 203 |
},
|
| 204 |
{
|
| 205 |
-
"model": "
|
| 206 |
"score": 16.3,
|
| 207 |
"details": "20,14,15"
|
| 208 |
},
|
| 209 |
{
|
| 210 |
-
"model": "
|
| 211 |
"score": 23.3,
|
| 212 |
"details": "23,23,24"
|
| 213 |
},
|
| 214 |
{
|
| 215 |
-
"model": "
|
| 216 |
"score": 21.3,
|
| 217 |
"details": "20,15,29"
|
| 218 |
},
|
| 219 |
{
|
| 220 |
-
"model": "
|
| 221 |
"score": 10.3,
|
| 222 |
"details": "9,10,12"
|
| 223 |
},
|
| 224 |
{
|
| 225 |
-
"model": "
|
| 226 |
"score": 13.7,
|
| 227 |
"details": "13,14,14"
|
| 228 |
},
|
| 229 |
{
|
| 230 |
-
"model": "
|
| 231 |
"score": 14,
|
| 232 |
"details": "18,11,13"
|
| 233 |
},
|
| 234 |
{
|
| 235 |
-
"model": "
|
| 236 |
"score": 35,
|
| 237 |
"details": "35"
|
| 238 |
},
|
| 239 |
{
|
| 240 |
-
"model": "
|
| 241 |
"score": 11.7,
|
| 242 |
"details": "11,11,13"
|
| 243 |
},
|
| 244 |
{
|
| 245 |
-
"model": "
|
| 246 |
"score": 42,
|
| 247 |
"details": "42"
|
| 248 |
},
|
| 249 |
{
|
| 250 |
-
"model": "
|
| 251 |
"score": 25.3,
|
| 252 |
"details": "22,35,19"
|
| 253 |
},
|
|
@@ -257,22 +257,22 @@
|
|
| 257 |
"details": ""
|
| 258 |
},
|
| 259 |
{
|
| 260 |
-
"model": "
|
| 261 |
"score": 20,
|
| 262 |
"details": "17,18,25"
|
| 263 |
},
|
| 264 |
{
|
| 265 |
-
"model": "
|
| 266 |
"score": 19.33,
|
| 267 |
"details": "20,17,21"
|
| 268 |
},
|
| 269 |
{
|
| 270 |
-
"model": "
|
| 271 |
"score": 33.67,
|
| 272 |
"details": "26,34,41"
|
| 273 |
},
|
| 274 |
{
|
| 275 |
-
"model": "
|
| 276 |
"score": 11.67,
|
| 277 |
"details": "13,14,8"
|
| 278 |
}
|
|
@@ -282,67 +282,67 @@
|
|
| 282 |
"runs": 3,
|
| 283 |
"results": [
|
| 284 |
{
|
| 285 |
-
"model": "
|
| 286 |
"score": 106,
|
| 287 |
"details": "92,165,61"
|
| 288 |
},
|
| 289 |
{
|
| 290 |
-
"model": "
|
| 291 |
"score": 484,
|
| 292 |
"details": "535,428,489"
|
| 293 |
},
|
| 294 |
{
|
| 295 |
-
"model": "
|
| 296 |
"score": 447.3,
|
| 297 |
"details": "409,436,497"
|
| 298 |
},
|
| 299 |
{
|
| 300 |
-
"model": "
|
| 301 |
"score": 334.7,
|
| 302 |
"details": "259,372,373"
|
| 303 |
},
|
| 304 |
{
|
| 305 |
-
"model": "
|
| 306 |
"score": 416.3,
|
| 307 |
"details": "411,414,424"
|
| 308 |
},
|
| 309 |
{
|
| 310 |
-
"model": "
|
| 311 |
"score": 254,
|
| 312 |
"details": "299,332,131"
|
| 313 |
},
|
| 314 |
{
|
| 315 |
-
"model": "
|
| 316 |
"score": 128.7,
|
| 317 |
"details": "67,139,180"
|
| 318 |
},
|
| 319 |
{
|
| 320 |
-
"model": "
|
| 321 |
"score": 182,
|
| 322 |
"details": "163,215,168"
|
| 323 |
},
|
| 324 |
{
|
| 325 |
-
"model": "
|
| 326 |
"score": 147.3,
|
| 327 |
"details": "131,104,207"
|
| 328 |
},
|
| 329 |
{
|
| 330 |
-
"model": "
|
| 331 |
"score": 159,
|
| 332 |
"details": "159"
|
| 333 |
},
|
| 334 |
{
|
| 335 |
-
"model": "
|
| 336 |
"score": 48,
|
| 337 |
"details": "21,86,37"
|
| 338 |
},
|
| 339 |
{
|
| 340 |
-
"model": "
|
| 341 |
"score": 647,
|
| 342 |
"details": "647"
|
| 343 |
},
|
| 344 |
{
|
| 345 |
-
"model": "
|
| 346 |
"score": 487.3,
|
| 347 |
"details": "259,591,612"
|
| 348 |
},
|
|
@@ -352,22 +352,22 @@
|
|
| 352 |
"details": ""
|
| 353 |
},
|
| 354 |
{
|
| 355 |
-
"model": "
|
| 356 |
"score": 464,
|
| 357 |
"details": "593,406,393"
|
| 358 |
},
|
| 359 |
{
|
| 360 |
-
"model": "
|
| 361 |
"score": 478.33,
|
| 362 |
"details": "545,468,422"
|
| 363 |
},
|
| 364 |
{
|
| 365 |
-
"model": "
|
| 366 |
"score": 491.67,
|
| 367 |
"details": "464,463,548"
|
| 368 |
},
|
| 369 |
{
|
| 370 |
-
"model": "
|
| 371 |
"score": 363.33,
|
| 372 |
"details": "365,372,353"
|
| 373 |
}
|
|
@@ -377,79 +377,79 @@
|
|
| 377 |
"runs": 3,
|
| 378 |
"results": [
|
| 379 |
{
|
| 380 |
-
"model": "
|
| 381 |
"score": 0,
|
| 382 |
"detail_box_on_target": "0,0,0",
|
| 383 |
"cracked_levels": "0,0,0"
|
| 384 |
},
|
| 385 |
{
|
| 386 |
-
"model": "
|
| 387 |
"score": 2.33,
|
| 388 |
"detail_box_on_target": "2,4,1",
|
| 389 |
"cracked_levels": "1,2,0"
|
| 390 |
},
|
| 391 |
{
|
| 392 |
-
"model": "
|
| 393 |
"score": 1.33,
|
| 394 |
"detail_box_on_target": "2,0,2",
|
| 395 |
"cracked_levels": "1,0,1"
|
| 396 |
},
|
| 397 |
{
|
| 398 |
-
"model": "
|
| 399 |
"score": 1.67,
|
| 400 |
"detail_box_on_target": "3,0,2",
|
| 401 |
"cracked_levels": "2,0,1"
|
| 402 |
},
|
| 403 |
{
|
| 404 |
-
"model": "
|
| 405 |
"score": 4.33,
|
| 406 |
"detail_box_on_target": "4,4,5",
|
| 407 |
"cracked_levels": "2,2,3"
|
| 408 |
},
|
| 409 |
{
|
| 410 |
-
"model": "
|
| 411 |
"score": 5.67,
|
| 412 |
"detail_box_on_target": "5,6,6",
|
| 413 |
"cracked_levels": "3,3,3"
|
| 414 |
},
|
| 415 |
{
|
| 416 |
-
"model": "
|
| 417 |
"score": 0,
|
| 418 |
"detail_box_on_target": "0,0,0",
|
| 419 |
"cracked_levels": "0,0,0"
|
| 420 |
},
|
| 421 |
{
|
| 422 |
-
"model": "
|
| 423 |
"score": 0,
|
| 424 |
"detail_box_on_target": "0,0,0",
|
| 425 |
"cracked_levels": "0,0,0"
|
| 426 |
},
|
| 427 |
{
|
| 428 |
-
"model": "
|
| 429 |
"score": 0,
|
| 430 |
"detail_box_on_target": "0,0,0",
|
| 431 |
"cracked_levels": "0,0,0"
|
| 432 |
},
|
| 433 |
{
|
| 434 |
-
"model": "
|
| 435 |
"score": 2.33,
|
| 436 |
"detail_box_on_target": "2,2,3",
|
| 437 |
"cracked_levels": "1,1,2"
|
| 438 |
},
|
| 439 |
{
|
| 440 |
-
"model": "
|
| 441 |
"score": 1.33,
|
| 442 |
"detail_box_on_target": "1,2,1",
|
| 443 |
"cracked_levels": "0,1,0"
|
| 444 |
},
|
| 445 |
{
|
| 446 |
-
"model": "
|
| 447 |
"score": 8,
|
| 448 |
"detail_box_on_target": "10,6",
|
| 449 |
"cracked_levels": "5,3"
|
| 450 |
},
|
| 451 |
{
|
| 452 |
-
"model": "
|
| 453 |
"score": 5.33,
|
| 454 |
"detail_box_on_target": "4,6,6",
|
| 455 |
"cracked_levels": "2,2,3"
|
|
@@ -461,22 +461,22 @@
|
|
| 461 |
"cracked_levels": "0,0,0"
|
| 462 |
},
|
| 463 |
{
|
| 464 |
-
"model": "
|
| 465 |
"score": 4,
|
| 466 |
"details": "4,4,4"
|
| 467 |
},
|
| 468 |
{
|
| 469 |
-
"model": "
|
| 470 |
"score": 3,
|
| 471 |
"details": "2,2,5"
|
| 472 |
},
|
| 473 |
{
|
| 474 |
-
"model": "
|
| 475 |
"score": 4.67,
|
| 476 |
"details": "4,4,6"
|
| 477 |
},
|
| 478 |
{
|
| 479 |
-
"model": "
|
| 480 |
"score": 2.33,
|
| 481 |
"details": "1,2,4"
|
| 482 |
}
|
|
@@ -486,79 +486,79 @@
|
|
| 486 |
"runs": 1,
|
| 487 |
"results": [
|
| 488 |
{
|
| 489 |
-
"model": "
|
| 490 |
"score": 2,
|
| 491 |
"progress": "1:2/5",
|
| 492 |
"evaluator result": "1/3"
|
| 493 |
},
|
| 494 |
{
|
| 495 |
-
"model": "
|
| 496 |
"score": 7,
|
| 497 |
"progress": "2:2/9",
|
| 498 |
"evaluator result": "5/11"
|
| 499 |
},
|
| 500 |
{
|
| 501 |
-
"model": "
|
| 502 |
"score": 0,
|
| 503 |
"progress": "0",
|
| 504 |
"evaluator result": "1/5"
|
| 505 |
},
|
| 506 |
{
|
| 507 |
-
"model": "
|
| 508 |
"score": 4,
|
| 509 |
"progress": "1:4/5",
|
| 510 |
"evaluator result": "1/7"
|
| 511 |
},
|
| 512 |
{
|
| 513 |
-
"model": "
|
| 514 |
"score": 7,
|
| 515 |
"progress": "2:2/9",
|
| 516 |
"evaluator result": "2/3"
|
| 517 |
},
|
| 518 |
{
|
| 519 |
-
"model": "
|
| 520 |
"score": 0,
|
| 521 |
"progress": "0",
|
| 522 |
"evaluator result": "0"
|
| 523 |
},
|
| 524 |
{
|
| 525 |
-
"model": "
|
| 526 |
"score": 0,
|
| 527 |
"progress": "0",
|
| 528 |
"evaluator result": "0"
|
| 529 |
},
|
| 530 |
{
|
| 531 |
-
"model": "
|
| 532 |
"score": 2,
|
| 533 |
"progress": "1:2/5",
|
| 534 |
"evaluator result": "2/3"
|
| 535 |
},
|
| 536 |
{
|
| 537 |
-
"model": "
|
| 538 |
"score": 0,
|
| 539 |
"progress": "0",
|
| 540 |
"evaluator result": "0"
|
| 541 |
},
|
| 542 |
{
|
| 543 |
-
"model": "
|
| 544 |
"score": 16,
|
| 545 |
"progress": "3: 2/8",
|
| 546 |
"evaluator result": "6/11"
|
| 547 |
},
|
| 548 |
{
|
| 549 |
-
"model": "
|
| 550 |
"score": 0,
|
| 551 |
"progress": "0",
|
| 552 |
"evaluator result": "1/5"
|
| 553 |
},
|
| 554 |
{
|
| 555 |
-
"model": "
|
| 556 |
"score": 16,
|
| 557 |
"progress": "3: 2/8",
|
| 558 |
"evaluator result": "1/2"
|
| 559 |
},
|
| 560 |
{
|
| 561 |
-
"model": "
|
| 562 |
"score": 4,
|
| 563 |
"progress": "1:4/5",
|
| 564 |
"evaluator result": "2/5"
|
|
@@ -570,17 +570,17 @@
|
|
| 570 |
"evaluator result": "0"
|
| 571 |
},
|
| 572 |
{
|
| 573 |
-
"model": "
|
| 574 |
"score": 6,
|
| 575 |
"details": "6"
|
| 576 |
},
|
| 577 |
{
|
| 578 |
-
"model": "
|
| 579 |
"score": 3.67,
|
| 580 |
"details": "3,4,4"
|
| 581 |
},
|
| 582 |
{
|
| 583 |
-
"model": "
|
| 584 |
"score": 4.33,
|
| 585 |
"details": "3,4,6"
|
| 586 |
}
|
|
|
|
| 3 |
"runs": 3,
|
| 4 |
"results": [
|
| 5 |
{
|
| 6 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
| 7 |
"score": 1267.7,
|
| 8 |
"detail_data": "709,1532,1562",
|
| 9 |
"progress": "1-1"
|
| 10 |
},
|
| 11 |
{
|
| 12 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
| 13 |
"score": 1418.7,
|
| 14 |
"detail_data": "2015,709,1532",
|
| 15 |
"progress": "1-1"
|
| 16 |
},
|
| 17 |
{
|
| 18 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
| 19 |
"score": 1385.0,
|
| 20 |
"detail_data": "1672,1266,1247",
|
| 21 |
"progress": "1-1"
|
| 22 |
},
|
| 23 |
{
|
| 24 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
| 25 |
"score": 1498.3,
|
| 26 |
"detail_data": "1561,1271,1663",
|
| 27 |
"progress": "1-1"
|
| 28 |
},
|
| 29 |
{
|
| 30 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
| 31 |
"score": 1468.7,
|
| 32 |
"detail_data": "898,2008,1500",
|
| 33 |
"progress": "1-1"
|
| 34 |
},
|
| 35 |
{
|
| 36 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
| 37 |
"score": 2126.3,
|
| 38 |
"detail_data": "1531,722,4126",
|
| 39 |
"progress": "1-1"
|
| 40 |
},
|
| 41 |
{
|
| 42 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
| 43 |
"score": 2047.3,
|
| 44 |
"detail_data": "2017,2590,1535",
|
| 45 |
"progress": "1-1"
|
| 46 |
},
|
| 47 |
{
|
| 48 |
+
"model": "o1-2024-12-17 (⚔️)",
|
| 49 |
"score": 855,
|
| 50 |
"detail_data": "855",
|
| 51 |
"progress": "1-1"
|
| 52 |
},
|
| 53 |
{
|
| 54 |
+
"model": "o3-2025-04-16 (⚔️)",
|
| 55 |
"score": 3445,
|
| 56 |
"detail_data": "3445",
|
| 57 |
"progress": "1-1"
|
| 58 |
},
|
| 59 |
{
|
| 60 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
| 61 |
"score": 1448.0,
|
| 62 |
"detail_data": "1525,1263,1556",
|
| 63 |
"progress": "1-1"
|
|
|
|
| 74 |
"runs": 3,
|
| 75 |
"results": [
|
| 76 |
{
|
| 77 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
| 78 |
"score": 1914.67,
|
| 79 |
"details": "1352,2860,1532",
|
| 80 |
"highest_tail": 256
|
| 81 |
},
|
| 82 |
{
|
| 83 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
| 84 |
"score": 2624,
|
| 85 |
"details": "2560,3224,2088",
|
| 86 |
"highest_tail": 256
|
| 87 |
},
|
| 88 |
{
|
| 89 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
| 90 |
"score": 1873.33,
|
| 91 |
"details": "700,1240,3680",
|
| 92 |
"highest_tail": 256
|
| 93 |
},
|
| 94 |
{
|
| 95 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
| 96 |
"score": 1697.33,
|
| 97 |
"details": "1304,1316,2472",
|
| 98 |
"highest_tail": 256
|
| 99 |
},
|
| 100 |
{
|
| 101 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
| 102 |
"score": 3586.67,
|
| 103 |
"details": "5300,2400,3060",
|
| 104 |
"highest_tail": 512
|
| 105 |
},
|
| 106 |
{
|
| 107 |
+
"model": "grok-3-mini-beta (⚔️)",
|
| 108 |
"score": 4036,
|
| 109 |
"details": "6412,2492,3204",
|
| 110 |
"highest_tail": 512
|
| 111 |
},
|
| 112 |
{
|
| 113 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
| 114 |
"score": 1586.67,
|
| 115 |
"details": "1404,1272,2084",
|
| 116 |
"highest_tail": 128
|
| 117 |
},
|
| 118 |
{
|
| 119 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
| 120 |
"score": 1656,
|
| 121 |
"details": "1156,2664,1148",
|
| 122 |
"highest_tail": 256
|
| 123 |
},
|
| 124 |
{
|
| 125 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
| 126 |
"score": 1656,
|
| 127 |
"details": "1604,1284,2080",
|
| 128 |
"highest_tail": 256
|
| 129 |
},
|
| 130 |
{
|
| 131 |
+
"model": "o1-2024-12-17 (⚔️)",
|
| 132 |
"score": 7580,
|
| 133 |
"details": "7580",
|
| 134 |
"highest_tail": 512
|
| 135 |
},
|
| 136 |
{
|
| 137 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
| 138 |
"score": 2757.33,
|
| 139 |
"details": "3132,2004,3136",
|
| 140 |
"highest_tail": 256
|
| 141 |
},
|
| 142 |
{
|
| 143 |
+
"model": "o3-2025-04-16 (⚔️)",
|
| 144 |
"score": 7120,
|
| 145 |
"details": "7120",
|
| 146 |
"highest_tail": 512
|
| 147 |
},
|
| 148 |
{
|
| 149 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
| 150 |
"score": 4432.0,
|
| 151 |
"details": "4928,5456,2912",
|
| 152 |
"highest_tail": 512
|
|
|
|
| 158 |
"highest_tail": 128
|
| 159 |
},
|
| 160 |
{
|
| 161 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
| 162 |
"score": 3036.0,
|
| 163 |
"details": "3036.0",
|
| 164 |
"highest_tail": 256
|
| 165 |
},
|
| 166 |
{
|
| 167 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
| 168 |
"score": 3136,
|
| 169 |
"details": "2148,2360,4900",
|
| 170 |
"highest_tail": 256
|
| 171 |
},
|
| 172 |
{
|
| 173 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
| 174 |
"score": 3330.0,
|
| 175 |
"details": "3260,3400",
|
| 176 |
"highest_tail": 256
|
| 177 |
},
|
| 178 |
{
|
| 179 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
| 180 |
"score": 2144.0,
|
| 181 |
"details": "1436,2556,2440",
|
| 182 |
"highest_tail": 256
|
|
|
|
| 187 |
"runs": 3,
|
| 188 |
"results": [
|
| 189 |
{
|
| 190 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
| 191 |
"score": 14.7,
|
| 192 |
"details": "16,14,14"
|
| 193 |
},
|
| 194 |
{
|
| 195 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
| 196 |
"score": 16.3,
|
| 197 |
"details": "19,15,15"
|
| 198 |
},
|
| 199 |
{
|
| 200 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
| 201 |
"score": 14.3,
|
| 202 |
"details": "15,14,14"
|
| 203 |
},
|
| 204 |
{
|
| 205 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
| 206 |
"score": 16.3,
|
| 207 |
"details": "20,14,15"
|
| 208 |
},
|
| 209 |
{
|
| 210 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
| 211 |
"score": 23.3,
|
| 212 |
"details": "23,23,24"
|
| 213 |
},
|
| 214 |
{
|
| 215 |
+
"model": "grok-3-mini-beta (⚔️)",
|
| 216 |
"score": 21.3,
|
| 217 |
"details": "20,15,29"
|
| 218 |
},
|
| 219 |
{
|
| 220 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
| 221 |
"score": 10.3,
|
| 222 |
"details": "9,10,12"
|
| 223 |
},
|
| 224 |
{
|
| 225 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
| 226 |
"score": 13.7,
|
| 227 |
"details": "13,14,14"
|
| 228 |
},
|
| 229 |
{
|
| 230 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
| 231 |
"score": 14,
|
| 232 |
"details": "18,11,13"
|
| 233 |
},
|
| 234 |
{
|
| 235 |
+
"model": "o1-2024-12-17 (⚔️)",
|
| 236 |
"score": 35,
|
| 237 |
"details": "35"
|
| 238 |
},
|
| 239 |
{
|
| 240 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
| 241 |
"score": 11.7,
|
| 242 |
"details": "11,11,13"
|
| 243 |
},
|
| 244 |
{
|
| 245 |
+
"model": "o3-2025-04-16 (⚔️)",
|
| 246 |
"score": 42,
|
| 247 |
"details": "42"
|
| 248 |
},
|
| 249 |
{
|
| 250 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
| 251 |
"score": 25.3,
|
| 252 |
"details": "22,35,19"
|
| 253 |
},
|
|
|
|
| 257 |
"details": ""
|
| 258 |
},
|
| 259 |
{
|
| 260 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
| 261 |
"score": 20,
|
| 262 |
"details": "17,18,25"
|
| 263 |
},
|
| 264 |
{
|
| 265 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
| 266 |
"score": 19.33,
|
| 267 |
"details": "20,17,21"
|
| 268 |
},
|
| 269 |
{
|
| 270 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
| 271 |
"score": 33.67,
|
| 272 |
"details": "26,34,41"
|
| 273 |
},
|
| 274 |
{
|
| 275 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
| 276 |
"score": 11.67,
|
| 277 |
"details": "13,14,8"
|
| 278 |
}
|
|
|
|
| 282 |
"runs": 3,
|
| 283 |
"results": [
|
| 284 |
{
|
| 285 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
| 286 |
"score": 106,
|
| 287 |
"details": "92,165,61"
|
| 288 |
},
|
| 289 |
{
|
| 290 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
| 291 |
"score": 484,
|
| 292 |
"details": "535,428,489"
|
| 293 |
},
|
| 294 |
{
|
| 295 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
| 296 |
"score": 447.3,
|
| 297 |
"details": "409,436,497"
|
| 298 |
},
|
| 299 |
{
|
| 300 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
| 301 |
"score": 334.7,
|
| 302 |
"details": "259,372,373"
|
| 303 |
},
|
| 304 |
{
|
| 305 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
| 306 |
"score": 416.3,
|
| 307 |
"details": "411,414,424"
|
| 308 |
},
|
| 309 |
{
|
| 310 |
+
"model": "grok-3-mini-beta (⚔️)",
|
| 311 |
"score": 254,
|
| 312 |
"details": "299,332,131"
|
| 313 |
},
|
| 314 |
{
|
| 315 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
| 316 |
"score": 128.7,
|
| 317 |
"details": "67,139,180"
|
| 318 |
},
|
| 319 |
{
|
| 320 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
| 321 |
"score": 182,
|
| 322 |
"details": "163,215,168"
|
| 323 |
},
|
| 324 |
{
|
| 325 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
| 326 |
"score": 147.3,
|
| 327 |
"details": "131,104,207"
|
| 328 |
},
|
| 329 |
{
|
| 330 |
+
"model": "o1-2024-12-17 (⚔️)",
|
| 331 |
"score": 159,
|
| 332 |
"details": "159"
|
| 333 |
},
|
| 334 |
{
|
| 335 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
| 336 |
"score": 48,
|
| 337 |
"details": "21,86,37"
|
| 338 |
},
|
| 339 |
{
|
| 340 |
+
"model": "o3-2025-04-16 (⚔️)",
|
| 341 |
"score": 647,
|
| 342 |
"details": "647"
|
| 343 |
},
|
| 344 |
{
|
| 345 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
| 346 |
"score": 487.3,
|
| 347 |
"details": "259,591,612"
|
| 348 |
},
|
|
|
|
| 352 |
"details": ""
|
| 353 |
},
|
| 354 |
{
|
| 355 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
| 356 |
"score": 464,
|
| 357 |
"details": "593,406,393"
|
| 358 |
},
|
| 359 |
{
|
| 360 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
| 361 |
"score": 478.33,
|
| 362 |
"details": "545,468,422"
|
| 363 |
},
|
| 364 |
{
|
| 365 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
| 366 |
"score": 491.67,
|
| 367 |
"details": "464,463,548"
|
| 368 |
},
|
| 369 |
{
|
| 370 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
| 371 |
"score": 363.33,
|
| 372 |
"details": "365,372,353"
|
| 373 |
}
|
|
|
|
| 377 |
"runs": 3,
|
| 378 |
"results": [
|
| 379 |
{
|
| 380 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
| 381 |
"score": 0,
|
| 382 |
"detail_box_on_target": "0,0,0",
|
| 383 |
"cracked_levels": "0,0,0"
|
| 384 |
},
|
| 385 |
{
|
| 386 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
| 387 |
"score": 2.33,
|
| 388 |
"detail_box_on_target": "2,4,1",
|
| 389 |
"cracked_levels": "1,2,0"
|
| 390 |
},
|
| 391 |
{
|
| 392 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
| 393 |
"score": 1.33,
|
| 394 |
"detail_box_on_target": "2,0,2",
|
| 395 |
"cracked_levels": "1,0,1"
|
| 396 |
},
|
| 397 |
{
|
| 398 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
| 399 |
"score": 1.67,
|
| 400 |
"detail_box_on_target": "3,0,2",
|
| 401 |
"cracked_levels": "2,0,1"
|
| 402 |
},
|
| 403 |
{
|
| 404 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
| 405 |
"score": 4.33,
|
| 406 |
"detail_box_on_target": "4,4,5",
|
| 407 |
"cracked_levels": "2,2,3"
|
| 408 |
},
|
| 409 |
{
|
| 410 |
+
"model": "grok-3-mini-beta (⚔️)",
|
| 411 |
"score": 5.67,
|
| 412 |
"detail_box_on_target": "5,6,6",
|
| 413 |
"cracked_levels": "3,3,3"
|
| 414 |
},
|
| 415 |
{
|
| 416 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
| 417 |
"score": 0,
|
| 418 |
"detail_box_on_target": "0,0,0",
|
| 419 |
"cracked_levels": "0,0,0"
|
| 420 |
},
|
| 421 |
{
|
| 422 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
| 423 |
"score": 0,
|
| 424 |
"detail_box_on_target": "0,0,0",
|
| 425 |
"cracked_levels": "0,0,0"
|
| 426 |
},
|
| 427 |
{
|
| 428 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
| 429 |
"score": 0,
|
| 430 |
"detail_box_on_target": "0,0,0",
|
| 431 |
"cracked_levels": "0,0,0"
|
| 432 |
},
|
| 433 |
{
|
| 434 |
+
"model": "o1-2024-12-17 (⚔️)",
|
| 435 |
"score": 2.33,
|
| 436 |
"detail_box_on_target": "2,2,3",
|
| 437 |
"cracked_levels": "1,1,2"
|
| 438 |
},
|
| 439 |
{
|
| 440 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
| 441 |
"score": 1.33,
|
| 442 |
"detail_box_on_target": "1,2,1",
|
| 443 |
"cracked_levels": "0,1,0"
|
| 444 |
},
|
| 445 |
{
|
| 446 |
+
"model": "o3-2025-04-16 (⚔️)",
|
| 447 |
"score": 8,
|
| 448 |
"detail_box_on_target": "10,6",
|
| 449 |
"cracked_levels": "5,3"
|
| 450 |
},
|
| 451 |
{
|
| 452 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
| 453 |
"score": 5.33,
|
| 454 |
"detail_box_on_target": "4,6,6",
|
| 455 |
"cracked_levels": "2,2,3"
|
|
|
|
| 461 |
"cracked_levels": "0,0,0"
|
| 462 |
},
|
| 463 |
{
|
| 464 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
| 465 |
"score": 4,
|
| 466 |
"details": "4,4,4"
|
| 467 |
},
|
| 468 |
{
|
| 469 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
| 470 |
"score": 3,
|
| 471 |
"details": "2,2,5"
|
| 472 |
},
|
| 473 |
{
|
| 474 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
| 475 |
"score": 4.67,
|
| 476 |
"details": "4,4,6"
|
| 477 |
},
|
| 478 |
{
|
| 479 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
| 480 |
"score": 2.33,
|
| 481 |
"details": "1,2,4"
|
| 482 |
}
|
|
|
|
| 486 |
"runs": 1,
|
| 487 |
"results": [
|
| 488 |
{
|
| 489 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
| 490 |
"score": 2,
|
| 491 |
"progress": "1:2/5",
|
| 492 |
"evaluator result": "1/3"
|
| 493 |
},
|
| 494 |
{
|
| 495 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
| 496 |
"score": 7,
|
| 497 |
"progress": "2:2/9",
|
| 498 |
"evaluator result": "5/11"
|
| 499 |
},
|
| 500 |
{
|
| 501 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
| 502 |
"score": 0,
|
| 503 |
"progress": "0",
|
| 504 |
"evaluator result": "1/5"
|
| 505 |
},
|
| 506 |
{
|
| 507 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
| 508 |
"score": 4,
|
| 509 |
"progress": "1:4/5",
|
| 510 |
"evaluator result": "1/7"
|
| 511 |
},
|
| 512 |
{
|
| 513 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
| 514 |
"score": 7,
|
| 515 |
"progress": "2:2/9",
|
| 516 |
"evaluator result": "2/3"
|
| 517 |
},
|
| 518 |
{
|
| 519 |
+
"model": "grok-3-mini-beta (⚔️)",
|
| 520 |
"score": 0,
|
| 521 |
"progress": "0",
|
| 522 |
"evaluator result": "0"
|
| 523 |
},
|
| 524 |
{
|
| 525 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
| 526 |
"score": 0,
|
| 527 |
"progress": "0",
|
| 528 |
"evaluator result": "0"
|
| 529 |
},
|
| 530 |
{
|
| 531 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
| 532 |
"score": 2,
|
| 533 |
"progress": "1:2/5",
|
| 534 |
"evaluator result": "2/3"
|
| 535 |
},
|
| 536 |
{
|
| 537 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
| 538 |
"score": 0,
|
| 539 |
"progress": "0",
|
| 540 |
"evaluator result": "0"
|
| 541 |
},
|
| 542 |
{
|
| 543 |
+
"model": "o1-2024-12-17 (⚔️)",
|
| 544 |
"score": 16,
|
| 545 |
"progress": "3: 2/8",
|
| 546 |
"evaluator result": "6/11"
|
| 547 |
},
|
| 548 |
{
|
| 549 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
| 550 |
"score": 0,
|
| 551 |
"progress": "0",
|
| 552 |
"evaluator result": "1/5"
|
| 553 |
},
|
| 554 |
{
|
| 555 |
+
"model": "o3-2025-04-16 (⚔️)",
|
| 556 |
"score": 16,
|
| 557 |
"progress": "3: 2/8",
|
| 558 |
"evaluator result": "1/2"
|
| 559 |
},
|
| 560 |
{
|
| 561 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
| 562 |
"score": 4,
|
| 563 |
"progress": "1:4/5",
|
| 564 |
"evaluator result": "2/5"
|
|
|
|
| 570 |
"evaluator result": "0"
|
| 571 |
},
|
| 572 |
{
|
| 573 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
| 574 |
"score": 6,
|
| 575 |
"details": "6"
|
| 576 |
},
|
| 577 |
{
|
| 578 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
| 579 |
"score": 3.67,
|
| 580 |
"details": "3,4,4"
|
| 581 |
},
|
| 582 |
{
|
| 583 |
+
"model": "gemini-2.5-flash-preview-05-20 (⚔️)",
|
| 584 |
"score": 4.33,
|
| 585 |
"details": "3,4,6"
|
| 586 |
}
|