Spaces:
Sleeping
Sleeping
shyuli committed on
Commit ·
87688ca
1
Parent(s): 8ef7e14
version v0.1
Browse files- debug_df.py +0 -59
- requirements.txt +1 -1
- src/populate.py +21 -3
debug_df.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
"""Debug helper: build the leaderboard DataFrame and inspect its columns.

Run from the repository root. Prints the columns declared on
``AutoEvalColumn``, loads the raw eval results from EVAL_RESULTS_PATH /
EVAL_REQUESTS_PATH, mirrors the DataFrame construction done in
``src/populate.py``, and reports whether the "average" column is present.
Intended for manual debugging only; all output goes to stdout.
"""

import json
import os
import traceback

import pandas as pd

from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results

# Paths, relative to the repository root.
EVAL_RESULTS_PATH = "./eval-results"
EVAL_REQUESTS_PATH = "./eval-queue"

# Column definitions taken from the shared display utilities.
# NOTE(review): `cols` is computed but not used below — kept for parity
# with the equivalent code in src/populate.py.
cols = [c.name for c in AutoEvalColumn.__dict__.values() if hasattr(c, "name") and not getattr(c, "hidden", False)]
benchmark_cols = ["NQ", "TriviaQA", "PopQA", "HotpotQA", "2wiki", "Musique", "Bamboogle", "FictionalHot"]


def main() -> None:
    """Print diagnostic information about the leaderboard DataFrame build."""
    print("Expected columns from AutoEvalColumn:")
    for c in AutoEvalColumn.__dict__.values():
        if hasattr(c, "name"):
            print(f" {c.name}")

    print("\nTrying to get raw data...")
    try:
        raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
        print(f"Found {len(raw_data)} raw data entries")

        if raw_data:
            print("\nFirst entry structure:")
            first_entry = raw_data[0]
            print(f" eval_name: {first_entry.eval_name}")
            print(f" full_model: {first_entry.full_model}")
            print(f" results: {first_entry.results}")

            # Convert entries to dicts and build the DataFrame, mirroring
            # get_leaderboard_df() in src/populate.py.
            all_data_json = [v.to_dict() for v in raw_data]
            df = pd.DataFrame.from_records(all_data_json)

            print(f"\nDataFrame shape: {df.shape}")
            print(f"DataFrame columns: {list(df.columns)}")

            if len(df) > 0:
                print("\nFirst row:")
                print(df.iloc[0])

            print(f"\nLooking for '{AutoEvalColumn.average.name}' column...")
            if AutoEvalColumn.average.name in df.columns:
                print(f"Found '{AutoEvalColumn.average.name}' column")
                print(f"Average values: {df[AutoEvalColumn.average.name].tolist()}")
            else:
                print(f"ERROR: '{AutoEvalColumn.average.name}' column not found!")
                print("Available columns:", list(df.columns))
        else:
            print("No raw data found!")

    except Exception as e:  # deliberately broad: this is a diagnostic script
        print(f"Error occurred: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -2,6 +2,6 @@ APScheduler>=3.10
|
|
| 2 |
gradio>=5.43.1,<6
|
| 3 |
gradio_leaderboard==0.0.13
|
| 4 |
huggingface-hub>=0.18.0
|
| 5 |
-
pandas
|
| 6 |
numpy>=1.23
|
| 7 |
|
|
|
|
| 2 |
gradio>=5.43.1,<6
|
| 3 |
gradio_leaderboard==0.0.13
|
| 4 |
huggingface-hub>=0.18.0
|
| 5 |
+
pandas==2.2.3
|
| 6 |
numpy>=1.23
|
| 7 |
|
src/populate.py
CHANGED
|
@@ -14,16 +14,34 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
|
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
|
| 16 |
df = pd.DataFrame.from_records(all_data_json)
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Add ranking based on average score
|
| 20 |
-
df[
|
| 21 |
|
| 22 |
df = df[cols].round(decimals=2)
|
| 23 |
|
| 24 |
# filter out if any of the benchmarks have not been produced
|
| 25 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
| 26 |
-
return df
|
| 27 |
|
| 28 |
|
| 29 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
|
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
|
| 16 |
df = pd.DataFrame.from_records(all_data_json)
|
| 17 |
+
|
| 18 |
+
if df.empty:
|
| 19 |
+
return pd.DataFrame(columns=cols)
|
| 20 |
+
|
| 21 |
+
# Ensure all expected columns exist so downstream operations never fail
|
| 22 |
+
for column in cols:
|
| 23 |
+
if column not in df.columns:
|
| 24 |
+
df[column] = pd.NA
|
| 25 |
+
|
| 26 |
+
average_col = AutoEvalColumn.average.name
|
| 27 |
+
rank_col = AutoEvalColumn.rank.name
|
| 28 |
+
|
| 29 |
+
if average_col not in df.columns:
|
| 30 |
+
df[average_col] = pd.NA
|
| 31 |
+
|
| 32 |
+
df[average_col] = pd.to_numeric(df[average_col], errors="coerce")
|
| 33 |
+
|
| 34 |
+
if df[average_col].notna().any():
|
| 35 |
+
df = df.sort_values(by=[average_col], ascending=False, na_position="last")
|
| 36 |
|
| 37 |
# Add ranking based on average score
|
| 38 |
+
df[rank_col] = range(1, len(df) + 1)
|
| 39 |
|
| 40 |
df = df[cols].round(decimals=2)
|
| 41 |
|
| 42 |
# filter out if any of the benchmarks have not been produced
|
| 43 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
| 44 |
+
return df.reset_index(drop=True)
|
| 45 |
|
| 46 |
|
| 47 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|