shyuli committed on
Commit
8ef7e14
·
1 Parent(s): 7ea23dc

version v0.1

Browse files
Files changed (2) hide show
  1. debug_df.py +59 -0
  2. src/populate.py +5 -25
debug_df.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""Debug helper: inspect the raw eval results that feed the leaderboard.

Prints the column names declared on ``AutoEvalColumn``, loads the raw eval
results from the local queue/results folders, converts them into a pandas
DataFrame and reports whether the average column is present.  Useful when
diagnosing an empty or malformed leaderboard table.
"""

import pandas as pd

from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results

# Paths to the locally synced eval results / eval request queue.
EVAL_RESULTS_PATH = "./eval-results"
EVAL_REQUESTS_PATH = "./eval-queue"

# Column definitions: every non-hidden column declared on AutoEvalColumn,
# plus the fixed list of benchmark columns the leaderboard expects.
cols = [c.name for c in AutoEvalColumn.__dict__.values() if hasattr(c, "name") and not getattr(c, "hidden", False)]
benchmark_cols = ["NQ", "TriviaQA", "PopQA", "HotpotQA", "2wiki", "Musique", "Bamboogle", "FictionalHot"]

print("Expected columns from AutoEvalColumn:")
for c in AutoEvalColumn.__dict__.values():
    if hasattr(c, "name"):
        print(f" {c.name}")

print("\nTrying to get raw data...")
try:
    raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
    print(f"Found {len(raw_data)} raw data entries")

    if raw_data:
        print("\nFirst entry structure:")
        first_entry = raw_data[0]
        print(f" eval_name: {first_entry.eval_name}")
        print(f" full_model: {first_entry.full_model}")
        print(f" results: {first_entry.results}")

        # Convert the result objects to dicts and build the DataFrame,
        # mirroring what src/populate.py does for the real leaderboard.
        all_data_json = [v.to_dict() for v in raw_data]
        df = pd.DataFrame.from_records(all_data_json)

        print(f"\nDataFrame shape: {df.shape}")
        print(f"DataFrame columns: {list(df.columns)}")

        if len(df) > 0:
            print("\nFirst row:")
            print(df.iloc[0])

        print(f"\nLooking for '{AutoEvalColumn.average.name}' column...")
        if AutoEvalColumn.average.name in df.columns:
            print(f"Found '{AutoEvalColumn.average.name}' column")
            print(f"Average values: {df[AutoEvalColumn.average.name].tolist()}")
        else:
            print(f"ERROR: '{AutoEvalColumn.average.name}' column not found!")
            print("Available columns:", list(df.columns))
    else:
        print("No raw data found!")

# Broad catch is deliberate: this is a throwaway diagnostic script, and we
# always want the message plus a full traceback rather than a bare crash.
except Exception as e:
    print(f"Error occurred: {e}")
    import traceback

    traceback.print_exc()
src/populate.py CHANGED
@@ -14,35 +14,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
 
17
 
18
- # If empty (e.g., first run on HF Spaces), return empty frame with expected columns
19
- if df.empty:
20
- return pd.DataFrame(columns=cols)
21
 
22
- # Ensure Average exists; compute from available benchmark columns if missing
23
- average_col = AutoEvalColumn.average.name
24
- present_bench_cols = [c for c in benchmark_cols if c in df.columns]
25
- if average_col not in df.columns and len(present_bench_cols) > 0:
26
- df[average_col] = df[present_bench_cols].mean(axis=1)
27
-
28
- # Sort if Average present
29
- if average_col in df.columns:
30
- df = df.sort_values(by=[average_col], ascending=False)
31
-
32
- # Add rank if Average present
33
- if average_col in df.columns:
34
- df[AutoEvalColumn.rank.name] = range(1, len(df) + 1)
35
-
36
- # Add any missing columns as NA and reorder
37
- for c in cols:
38
- if c not in df.columns:
39
- df[c] = pd.NA
40
  df = df[cols].round(decimals=2)
41
 
42
- # Filter NaNs only on existing benchmark columns
43
- safe_benchmark_cols = [c for c in benchmark_cols if c in df.columns]
44
- if len(safe_benchmark_cols) > 0:
45
- df = df[has_no_nan_values(df, safe_benchmark_cols)]
46
  return df
47
 
48
 
 
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
 
19
+ # Add ranking based on average score
20
+ df[AutoEvalColumn.rank.name] = range(1, len(df) + 1)
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  df = df[cols].round(decimals=2)
23
 
24
+ # filter out if any of the benchmarks have not been produced
25
+ df = df[has_no_nan_values(df, benchmark_cols)]
 
 
26
  return df
27
 
28