Spaces:
Running
Running
Commit
·
cc8a66b
1
Parent(s):
49c2344
update
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import requests
|
|
| 6 |
import huggingface_hub
|
| 7 |
from huggingface_hub.utils._errors import EntryNotFoundError, RepositoryNotFoundError
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
webhook_url = os.environ.get("WEBHOOK_URL")
|
|
@@ -119,21 +120,25 @@ def get_folders_matching_format(directory):
|
|
| 119 |
|
| 120 |
|
| 121 |
def get_unique_column_names(all_data):
|
| 122 |
-
column_names =
|
| 123 |
|
| 124 |
for folder_name, files in all_data.items():
|
| 125 |
for file_name, sheets in files.items():
|
| 126 |
for sheet_name, dataframe in sheets.items():
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
column_names.remove('Parameters Count (B)')
|
| 131 |
|
| 132 |
-
return list(column_names)
|
| 133 |
|
| 134 |
|
| 135 |
-
def update_table(period: str,
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
target_data = all_data[period]
|
| 138 |
target_metric = metric_to_sheet[metric]
|
| 139 |
|
|
@@ -142,15 +147,38 @@ def update_table(period: str, models: list, metric: str, visible_columns: list,
|
|
| 142 |
combined_data = pd.concat([target_data[model][target_metric] for model in target_model_size], axis=0)
|
| 143 |
combined_data['Name'] = combined_data['Name'].apply(lambda x: x.replace('.pth', ''))
|
| 144 |
|
|
|
|
|
|
|
| 145 |
if 'Average (The lower the better)' in combined_data.columns:
|
| 146 |
relevant_columns = [col for col in visible_columns if
|
| 147 |
col not in ['Name', 'Parameters Count (B)', 'Average (The lower the better)']]
|
| 148 |
combined_data['Average (The lower the better)'] = round(combined_data[relevant_columns].mean(axis=1), 3)
|
| 149 |
|
| 150 |
sorted_data = combined_data.sort_values(by=sort_by, ascending=ascending)
|
| 151 |
-
|
|
|
|
| 152 |
filtered_data = sorted_data[visible_columns]
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
else:
|
| 155 |
return pd.DataFrame()
|
| 156 |
|
|
@@ -210,10 +238,25 @@ initial_period = time_list[-1]
|
|
| 210 |
initial_models = model_size_list[:1]
|
| 211 |
initial_metric = metric_list[0]
|
| 212 |
initial_columns = get_unique_column_names(all_data)
|
|
|
|
| 213 |
|
| 214 |
-
initial_data = update_table(initial_period, initial_models, initial_metric, initial_columns)
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
gr.HTML('<h1 style="text-align:center"><span style="font-size:1.3em">π Uncheatable Eval Leaderboard</span></h1>')
|
| 218 |
gr.HTML(
|
| 219 |
"<h1 style='text-align:center'><span style='font-size:0.8em'>Welcome to Uncheatable Eval, where fancy fine-tuning and cheating wonβt work π«; only compute π», data π, and real innovation π₯ can prevail!</span></h1>")
|
|
@@ -225,20 +268,30 @@ with gr.Blocks(css=".gradio-container{max-width:95%!important} .tab-buttons butt
|
|
| 225 |
model_selector = gr.CheckboxGroup(label="Model", choices=model_size_list, value=model_size_list[0])
|
| 226 |
metric_selector = gr.Dropdown(label="Metric", choices=metric_list, value=metric_list[0])
|
| 227 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
| 228 |
colfilter = gr.CheckboxGroup(label="Data Source",
|
| 229 |
choices=get_unique_column_names(all_data),
|
| 230 |
value=get_unique_column_names(all_data))
|
| 231 |
|
| 232 |
-
table = gr.Dataframe(initial_data)
|
| 233 |
|
| 234 |
-
period_selector.change(update_table,
|
|
|
|
| 235 |
outputs=table)
|
| 236 |
-
model_selector.change(update_table,
|
|
|
|
| 237 |
outputs=table)
|
| 238 |
-
metric_selector.change(update_table,
|
|
|
|
| 239 |
outputs=table)
|
| 240 |
-
colfilter.change(update_table,
|
|
|
|
| 241 |
outputs=table)
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
with gr.Tab("π MultiLang"):
|
| 244 |
gr.Markdown("## Coming soon...")
|
|
|
|
| 6 |
import huggingface_hub
|
| 7 |
from huggingface_hub.utils._errors import EntryNotFoundError, RepositoryNotFoundError
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
+
from matplotlib.colors import LinearSegmentedColormap
|
| 10 |
|
| 11 |
load_dotenv()
|
| 12 |
webhook_url = os.environ.get("WEBHOOK_URL")
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
def get_unique_column_names(all_data):
    """Collect the unique data-source column names across every sheet.

    Walks the nested mapping ``all_data`` (period -> file -> sheet ->
    DataFrame) and records each column name the first time it appears,
    skipping the non-data columns ``'Name'``,
    ``'Average (The lower the better)'`` and ``'Parameters Count (B)'``.

    Returns a list of column names in first-seen order (a plain dict is
    used as an insertion-ordered set).
    """
    excluded = ('Name', 'Average (The lower the better)', 'Parameters Count (B)')
    seen = {}
    for files in all_data.values():
        for sheets in files.values():
            for frame in sheets.values():
                seen.update(
                    (name, None) for name in frame.columns if name not in excluded
                )
    return list(seen)
|
| 133 |
|
| 134 |
|
| 135 |
+
def update_table(period: str,
|
| 136 |
+
models: list,
|
| 137 |
+
metric: str,
|
| 138 |
+
visible_columns: list,
|
| 139 |
+
color_columns: list,
|
| 140 |
+
sort_by: str = 'Average (The lower the better)',
|
| 141 |
+
ascending: bool = True):
|
| 142 |
target_data = all_data[period]
|
| 143 |
target_metric = metric_to_sheet[metric]
|
| 144 |
|
|
|
|
| 147 |
combined_data = pd.concat([target_data[model][target_metric] for model in target_model_size], axis=0)
|
| 148 |
combined_data['Name'] = combined_data['Name'].apply(lambda x: x.replace('.pth', ''))
|
| 149 |
|
| 150 |
+
combined_data.reset_index(drop=True, inplace=True)
|
| 151 |
+
|
| 152 |
if 'Average (The lower the better)' in combined_data.columns:
|
| 153 |
relevant_columns = [col for col in visible_columns if
|
| 154 |
col not in ['Name', 'Parameters Count (B)', 'Average (The lower the better)']]
|
| 155 |
combined_data['Average (The lower the better)'] = round(combined_data[relevant_columns].mean(axis=1), 3)
|
| 156 |
|
| 157 |
sorted_data = combined_data.sort_values(by=sort_by, ascending=ascending)
|
| 158 |
+
sorted_data = sorted_data.rename(columns={'Average (The lower the better)': 'Average (lower=better)'})
|
| 159 |
+
visible_columns = ['Name', 'Parameters Count (B)', 'Average (lower=better)'] + visible_columns
|
| 160 |
filtered_data = sorted_data[visible_columns]
|
| 161 |
+
|
| 162 |
+
filtered_data.columns = [col.replace('_', ' ') for col in filtered_data.columns]
|
| 163 |
+
|
| 164 |
+
formatter = {col: "{:.3f}" for col in filtered_data.columns if
|
| 165 |
+
filtered_data[col].dtype in ['float64', 'float32']}
|
| 166 |
+
|
| 167 |
+
# color gradient
|
| 168 |
+
colors = ["#63be7b", "#ffffff", "#f8696b"]
|
| 169 |
+
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
|
| 170 |
+
target_color_columns = []
|
| 171 |
+
if 'Average' in color_columns:
|
| 172 |
+
target_color_columns.append('Average (lower=better)')
|
| 173 |
+
if 'Individual Tests' in color_columns:
|
| 174 |
+
target_color_columns.extend([col for col in filtered_data.columns if col not in ['Name', 'Parameters Count (B)', 'Average (lower=better)']])
|
| 175 |
+
|
| 176 |
+
styler = filtered_data.style.format(formatter).background_gradient(
|
| 177 |
+
cmap=cmap,
|
| 178 |
+
subset=target_color_columns
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
return styler
|
| 182 |
else:
|
| 183 |
return pd.DataFrame()
|
| 184 |
|
|
|
|
| 238 |
initial_models = model_size_list[:1]
|
| 239 |
initial_metric = metric_list[0]
|
| 240 |
initial_columns = get_unique_column_names(all_data)
|
| 241 |
+
initial_colors = ['Average']
|
| 242 |
|
| 243 |
+
initial_data = update_table(initial_period, initial_models, initial_metric, initial_columns, initial_colors)
|
| 244 |
|
| 245 |
+
css = '''
|
| 246 |
+
.gradio-container {
|
| 247 |
+
max-width: 95% !important;
|
| 248 |
+
}
|
| 249 |
+
.tab-buttons button {
|
| 250 |
+
font-size: 1.3em;
|
| 251 |
+
}
|
| 252 |
+
.gr-dataframe th {
|
| 253 |
+
white-space: normal;
|
| 254 |
+
word-break: break-word;
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
'''
|
| 258 |
+
|
| 259 |
+
with gr.Blocks(css=css) as demo:
|
| 260 |
gr.HTML('<h1 style="text-align:center"><span style="font-size:1.3em">π Uncheatable Eval Leaderboard</span></h1>')
|
| 261 |
gr.HTML(
|
| 262 |
"<h1 style='text-align:center'><span style='font-size:0.8em'>Welcome to Uncheatable Eval, where fancy fine-tuning and cheating wonβt work π«; only compute π», data π, and real innovation π₯ can prevail!</span></h1>")
|
|
|
|
| 268 |
model_selector = gr.CheckboxGroup(label="Model", choices=model_size_list, value=model_size_list[0])
|
| 269 |
metric_selector = gr.Dropdown(label="Metric", choices=metric_list, value=metric_list[0])
|
| 270 |
with gr.Column():
|
| 271 |
+
color_selector = gr.CheckboxGroup(label="Colored Columns",
|
| 272 |
+
choices=['Average', 'Individual Tests'],
|
| 273 |
+
value=['Average'])
|
| 274 |
colfilter = gr.CheckboxGroup(label="Data Source",
|
| 275 |
choices=get_unique_column_names(all_data),
|
| 276 |
value=get_unique_column_names(all_data))
|
| 277 |
|
| 278 |
+
table = gr.Dataframe(initial_data, column_widths=[110, 35, 35, 35, 35, 35, 35, 35, 35, 35], wrap=True)
|
| 279 |
|
| 280 |
+
period_selector.change(update_table,
|
| 281 |
+
inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
|
| 282 |
outputs=table)
|
| 283 |
+
model_selector.change(update_table,
|
| 284 |
+
inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
|
| 285 |
outputs=table)
|
| 286 |
+
metric_selector.change(update_table,
|
| 287 |
+
inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
|
| 288 |
outputs=table)
|
| 289 |
+
colfilter.change(update_table,
|
| 290 |
+
inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
|
| 291 |
outputs=table)
|
| 292 |
+
color_selector.change(update_table,
|
| 293 |
+
inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
|
| 294 |
+
outputs=table)
|
| 295 |
|
| 296 |
with gr.Tab("π MultiLang"):
|
| 297 |
gr.Markdown("## Coming soon...")
|