shyuli committed on
Commit
87688ca
·
1 Parent(s): 8ef7e14

version v0.1

Browse files
Files changed (3) hide show
  1. debug_df.py +0 -59
  2. requirements.txt +1 -1
  3. src/populate.py +21 -3
debug_df.py DELETED
@@ -1,59 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- import json
4
- import os
5
- import pandas as pd
6
- from src.display.utils import AutoEvalColumn
7
- from src.leaderboard.read_evals import get_raw_eval_results
8
-
9
- # 设置路径
10
- EVAL_RESULTS_PATH = "./eval-results"
11
- EVAL_REQUESTS_PATH = "./eval-queue"
12
-
13
- # 获取列定义
14
- cols = [c.name for c in AutoEvalColumn.__dict__.values() if hasattr(c, "name") and not getattr(c, "hidden", False)]
15
- benchmark_cols = ["NQ", "TriviaQA", "PopQA", "HotpotQA", "2wiki", "Musique", "Bamboogle", "FictionalHot"]
16
-
17
- print("Expected columns from AutoEvalColumn:")
18
- for c in AutoEvalColumn.__dict__.values():
19
- if hasattr(c, "name"):
20
- print(f" {c.name}")
21
-
22
- print("\nTrying to get raw data...")
23
- try:
24
- raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
25
- print(f"Found {len(raw_data)} raw data entries")
26
-
27
- if raw_data:
28
- print("\nFirst entry structure:")
29
- first_entry = raw_data[0]
30
- print(f" eval_name: {first_entry.eval_name}")
31
- print(f" full_model: {first_entry.full_model}")
32
- print(f" results: {first_entry.results}")
33
-
34
- # 转换为字典
35
- all_data_json = [v.to_dict() for v in raw_data]
36
- df = pd.DataFrame.from_records(all_data_json)
37
-
38
- print(f"\nDataFrame shape: {df.shape}")
39
- print(f"DataFrame columns: {list(df.columns)}")
40
-
41
- if len(df) > 0:
42
- print("\nFirst row:")
43
- print(df.iloc[0])
44
-
45
- print(f"\nLooking for '{AutoEvalColumn.average.name}' column...")
46
- if AutoEvalColumn.average.name in df.columns:
47
- print(f"Found '{AutoEvalColumn.average.name}' column")
48
- print(f"Average values: {df[AutoEvalColumn.average.name].tolist()}")
49
- else:
50
- print(f"ERROR: '{AutoEvalColumn.average.name}' column not found!")
51
- print("Available columns:", list(df.columns))
52
- else:
53
- print("No raw data found!")
54
-
55
- except Exception as e:
56
- print(f"Error occurred: {e}")
57
- import traceback
58
-
59
- traceback.print_exc()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -2,6 +2,6 @@ APScheduler>=3.10
2
  gradio>=5.43.1,<6
3
  gradio_leaderboard==0.0.13
4
  huggingface-hub>=0.18.0
5
- pandas>=1.5
6
  numpy>=1.23
7
 
 
2
  gradio>=5.43.1,<6
3
  gradio_leaderboard==0.0.13
4
  huggingface-hub>=0.18.0
5
+ pandas==2.3.2
6
  numpy>=1.23
7
 
src/populate.py CHANGED
@@ -14,16 +14,34 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Add ranking based on average score
20
- df[AutoEvalColumn.rank.name] = range(1, len(df) + 1)
21
 
22
  df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
25
  df = df[has_no_nan_values(df, benchmark_cols)]
26
- return df
27
 
28
 
29
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
+
18
+ if df.empty:
19
+ return pd.DataFrame(columns=cols)
20
+
21
+ # Ensure all expected columns exist so downstream operations never fail
22
+ for column in cols:
23
+ if column not in df.columns:
24
+ df[column] = pd.NA
25
+
26
+ average_col = AutoEvalColumn.average.name
27
+ rank_col = AutoEvalColumn.rank.name
28
+
29
+ if average_col not in df.columns:
30
+ df[average_col] = pd.NA
31
+
32
+ df[average_col] = pd.to_numeric(df[average_col], errors="coerce")
33
+
34
+ if df[average_col].notna().any():
35
+ df = df.sort_values(by=[average_col], ascending=False, na_position="last")
36
 
37
  # Add ranking based on average score
38
+ df[rank_col] = range(1, len(df) + 1)
39
 
40
  df = df[cols].round(decimals=2)
41
 
42
  # filter out if any of the benchmarks have not been produced
43
  df = df[has_no_nan_values(df, benchmark_cols)]
44
+ return df.reset_index(drop=True)
45
 
46
 
47
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: