shyuli committed · 8ef7e14
Parent(s): 7ea23dc

version v0.1

Files changed:
- debug_df.py +59 -0
- src/populate.py +5 -25
debug_df.py
ADDED
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+import json
+import os
+import pandas as pd
+from src.display.utils import AutoEvalColumn
+from src.leaderboard.read_evals import get_raw_eval_results
+
+# Set paths
+EVAL_RESULTS_PATH = "./eval-results"
+EVAL_REQUESTS_PATH = "./eval-queue"
+
+# Get column definitions
+cols = [c.name for c in AutoEvalColumn.__dict__.values() if hasattr(c, "name") and not getattr(c, "hidden", False)]
+benchmark_cols = ["NQ", "TriviaQA", "PopQA", "HotpotQA", "2wiki", "Musique", "Bamboogle", "FictionalHot"]
+
+print("Expected columns from AutoEvalColumn:")
+for c in AutoEvalColumn.__dict__.values():
+    if hasattr(c, "name"):
+        print(f"  {c.name}")
+
+print("\nTrying to get raw data...")
+try:
+    raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+    print(f"Found {len(raw_data)} raw data entries")
+
+    if raw_data:
+        print("\nFirst entry structure:")
+        first_entry = raw_data[0]
+        print(f"  eval_name: {first_entry.eval_name}")
+        print(f"  full_model: {first_entry.full_model}")
+        print(f"  results: {first_entry.results}")
+
+        # Convert to dicts
+        all_data_json = [v.to_dict() for v in raw_data]
+        df = pd.DataFrame.from_records(all_data_json)
+
+        print(f"\nDataFrame shape: {df.shape}")
+        print(f"DataFrame columns: {list(df.columns)}")
+
+        if len(df) > 0:
+            print("\nFirst row:")
+            print(df.iloc[0])
+
+        print(f"\nLooking for '{AutoEvalColumn.average.name}' column...")
+        if AutoEvalColumn.average.name in df.columns:
+            print(f"Found '{AutoEvalColumn.average.name}' column")
+            print(f"Average values: {df[AutoEvalColumn.average.name].tolist()}")
+        else:
+            print(f"ERROR: '{AutoEvalColumn.average.name}' column not found!")
+            print("Available columns:", list(df.columns))
+    else:
+        print("No raw data found!")
+
+except Exception as e:
+    print(f"Error occurred: {e}")
+    import traceback
+
+    traceback.print_exc()
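For context, the failure mode this script probes can be reproduced on toy data. The sketch below is self-contained (pandas only); the records are hypothetical stand-ins for [v.to_dict() for v in raw_data], and "Average" merely stands in for AutoEvalColumn.average.name:

    import pandas as pd

    # Hypothetical records; the real entries come from get_raw_eval_results()
    # and may or may not carry an average field, which is exactly what
    # debug_df.py checks.
    records = [
        {"model": "model-a", "NQ": 51.0, "TriviaQA": 63.2},
        {"model": "model-b", "NQ": 47.5, "TriviaQA": 58.9},
    ]
    df = pd.DataFrame.from_records(records)

    try:
        # Raises KeyError when the sort column is absent from the records
        df.sort_values(by=["Average"], ascending=False)
    except KeyError as e:
        print(f"Sort failed, column missing: {e}")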
src/populate.py
CHANGED
@@ -14,35 +14,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-
-    #
-
-        return pd.DataFrame(columns=cols)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
-    # Ensure Average exists; compute from available benchmark columns if missing
-    average_col = AutoEvalColumn.average.name
-    present_bench_cols = [c for c in benchmark_cols if c in df.columns]
-    if average_col not in df.columns and len(present_bench_cols) > 0:
-        df[average_col] = df[present_bench_cols].mean(axis=1)
-
-    # Sort if Average present
-    if average_col in df.columns:
-        df = df.sort_values(by=[average_col], ascending=False)
-
-    # Add rank if Average present
-    if average_col in df.columns:
-        df[AutoEvalColumn.rank.name] = range(1, len(df) + 1)
-
-    # Add any missing columns as NA and reorder
-    for c in cols:
-        if c not in df.columns:
-            df[c] = pd.NA
+    # Add ranking based on average score
+    df[AutoEvalColumn.rank.name] = range(1, len(df) + 1)
 
     df = df[cols].round(decimals=2)
 
-    #
-
-    if len(safe_benchmark_cols) > 0:
-        df = df[has_no_nan_values(df, safe_benchmark_cols)]
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
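The filter added at the end of this hunk depends on has_no_nan_values. In the stock Hugging Face leaderboard template that helper returns a boolean row mask; here is a minimal sketch under that assumption (the repo's own definition may differ):

    import pandas as pd

    def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
        # True for rows where every listed benchmark column is populated,
        # so df[has_no_nan_values(df, benchmark_cols)] keeps only models
        # with complete results.
        return df[columns].notna().all(axis=1)

Note that after this change the sort and rank steps run unconditionally, so get_leaderboard_df assumes the average column is always present; that is the scenario debug_df.py was added to verify.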