Update app.py
app.py (CHANGED)
@@ -321,10 +321,10 @@ def save_text(text_content):
         gr.Error(f"Error saving file: {e}")
         return None
 
-def load_leaderboard():
+def load_leaderboard(benchmark_filter):
     """
-    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for …
-    and prepares data for …
+    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for the selected benchmark,
+    and prepares data for the leaderboard table.
     """
     try:
         df = pd.read_json("eval.jsonl", lines=True)
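The new `benchmark_filter` parameter replaces the old zero-argument signature, so one function can now serve either leaderboard. Because the loader ingests `eval.jsonl` via `pd.read_json(..., lines=True)`, each line of that file must be one self-contained JSON object. A minimal sketch of the record shape the function relies on; the `model_id`, `benchmark`, `subject`, and `accuracy` keys are the ones the code references, while the concrete models and scores here are invented placeholders:

```python
import json

# Illustrative eval.jsonl records. Field names match those used in app.py;
# the model names and accuracy values are made-up placeholder data.
records = [
    {"model_id": "org/model-a", "benchmark": "MMLU", "subject": "ALL", "accuracy": 67.3},
    {"model_id": "org/model-b", "benchmark": "MMLU-Pro", "subject": "ALL", "accuracy": 41.8},
]

with open("eval.jsonl", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")  # one JSON object per line
```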
@@ -335,68 +335,36 @@ def load_leaderboard():
 
         if df.empty:
             gr.Warning("No valid evaluation data found to populate the leaderboard.")
-
-            return (
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-            )
-
-        # Filter for MMLU data
-        df_mmlu = df[df['benchmark'] == 'MMLU']
-        if 'subject' in df_mmlu.columns:
-            # For MMLU, if "ALL" subjects are evaluated, consider the overall accuracy.
-            # Otherwise, average specific subject accuracies.
-            df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            # If a model only has specific subject evaluations, average those.
-            # This is a simplification; a more robust approach might be to calculate weighted average.
-            # For now, if "ALL" exists, we use that; otherwise, we average available subjects.
-
-            # If no 'ALL' subject records, average across available subjects for MMLU
-            if df_mmlu_grouped.empty:
-                df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
-        else: # Handle older eval.jsonl
-            df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+        # Filter data based on the selected benchmark
+        df_filtered = df[df['benchmark'] == benchmark_filter]
 
+        if df_filtered.empty:
+            gr.Warning(f"No evaluation data for {benchmark_filter} found yet.")
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
-        df_mmlu_grouped.columns = ["Model ID", "Average Accuracy (%)"]
-        df_mmlu_sorted = df_mmlu_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
+        # For the leaderboard, we typically want the average across all subjects within that benchmark.
+        # So we group by model_id and take the mean of accuracy.
+        df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
+        df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+        df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
 
-        # Filter for MMLU-Pro data
-        df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
-        if 'subject' in df_mmlu_pro.columns:
-            df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            if df_mmlu_pro_grouped.empty:
-                df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-        else: # Handle older eval.jsonl
-            df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-
-
-        df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
-        df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
-
-        # Return two dataframes as lists of dictionaries
-        return df_mmlu_sorted.to_dict('records'), df_mmlu_pro_sorted.to_dict('records')
+        return df_sorted.to_dict('records')
 
     except FileNotFoundError:
         gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
    except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc() # Print full traceback for debugging
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
 
 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
     /* Import Google Font - Inter */
-    @import url('https://fonts.…
+    @import url('https://fonts.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
 
     /* General body and container styling */
     body {
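One nit on the CSS change: `https://fonts.com/css2?...` is not the Google Fonts host. The Inter stylesheet is normally served from `fonts.googleapis.com`, so the new `@import` URL looks accidentally truncated and will likely fail to load the font.

The refactor itself collapses the duplicated MMLU and MMLU-Pro branches into a single filter, group, and sort pipeline. A standalone sketch of that aggregation on toy data; the column names match the function, while the rows are invented:

```python
import pandas as pd

# Toy evaluation rows mirroring the columns load_leaderboard expects.
df = pd.DataFrame({
    "model_id":  ["m1", "m1", "m2"],
    "benchmark": ["MMLU", "MMLU", "MMLU"],
    "subject":   ["math", "history", "ALL"],
    "accuracy":  [60.0, 70.0, 55.0],
})

# Same steps as the new code: filter by benchmark, average per model, sort.
df_filtered = df[df["benchmark"] == "MMLU"]
df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

print(df_sorted.to_dict("records"))
# [{'Model ID': 'm1', 'Average Accuracy (%)': 65.0},
#  {'Model ID': 'm2', 'Average Accuracy (%)': 55.0}]
```

Note one behavior change: the old code preferred `subject == 'ALL'` rows and only fell back to per-subject means when none existed, while the new code takes an unweighted mean over every row for the benchmark. A model logged with both an 'ALL' row and per-subject rows now gets all of them averaged together.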
@@ -728,31 +696,30 @@ with gr.Blocks(css="""
         </div>
         """)
 
-    # …
-    gr.…
-        headers=["Model ID", "Average Accuracy (%)"],
-        interactive=False,
-        datatype=["str", "number"],
-        row_count=10,
-        col_count=2,
-        label="MMLU Leaderboard Data",
-        elem_classes="leaderboard-table" # Apply custom class for styling
+    # Leaderboard Type Toggle
+    leaderboard_type_toggle = gr.Radio(
+        ["MMLU", "MMLU-Pro"],
+        label="Select Benchmark for Leaderboard",
+        value="MMLU", # Default to MMLU
+        interactive=True,
+        container=False, # Make it inline with content
+        elem_id="leaderboard-toggle"
     )
 
-    # …
-    gr.…
+    # Leaderboard Table
+    leaderboard_table_output = gr.Dataframe(
         headers=["Model ID", "Average Accuracy (%)"],
         interactive=False,
         datatype=["str", "number"],
         row_count=10,
         col_count=2,
-        label="…
+        label="Benchmark Leaderboard Data",
         elem_classes="leaderboard-table" # Apply custom class for styling
     )
 
-    # …
-    demo.load(load_leaderboard, inputs=[], outputs=[…
+    # Initial load and dynamic update for the leaderboard
+    demo.load(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
+    leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
 
     # Launch the Gradio app
-    demo.launch()
+demo.launch()
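On the UI side, the commit replaces the two fixed tables with one `gr.Radio` toggle driving a single `gr.Dataframe`, registered twice: `demo.load(...)` fills the table when the page opens, using the radio's default value of "MMLU", and `.change(...)` refreshes it whenever the user flips the toggle. A self-contained sketch of that wiring, with shortened component names and fabricated stand-in scores instead of the real `eval.jsonl` aggregation:

```python
import gradio as gr
import pandas as pd

# Stand-in leaderboard data; the real app recomputes this from eval.jsonl.
FAKE_SCORES = {
    "MMLU": pd.DataFrame({"Model ID": ["m1", "m2"],
                          "Average Accuracy (%)": [65.0, 55.0]}),
    "MMLU-Pro": pd.DataFrame({"Model ID": ["m2", "m1"],
                              "Average Accuracy (%)": [40.0, 38.0]}),
}

def load_leaderboard(benchmark_filter):
    # Return a pandas DataFrame, a type gr.Dataframe accepts directly.
    return FAKE_SCORES[benchmark_filter]

with gr.Blocks() as demo:
    toggle = gr.Radio(["MMLU", "MMLU-Pro"], value="MMLU",
                      label="Select Benchmark for Leaderboard")
    table = gr.Dataframe(headers=["Model ID", "Average Accuracy (%)"],
                         interactive=False)
    # Populate once at startup, then refresh on every toggle change.
    demo.load(load_leaderboard, inputs=[toggle], outputs=[table])
    toggle.change(load_leaderboard, inputs=[toggle], outputs=[table])

demo.launch()
```

The commit returns `df_sorted.to_dict('records')` instead of a DataFrame; if the deployed Gradio version does not accept a list of dicts as a `gr.Dataframe` value, returning the sorted DataFrame directly is the safer choice.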