File size: 4,452 Bytes
7800237 654c990 7800237 654c990 781d4b0 2e39b31 781d4b0 974e6f0 781d4b0 0235b45 654c990 fd29588 654c990 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | import json
import os
from dataclasses import fields
import pandas as pd
from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn
def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """
    Build the leaderboard dataframe directly from JSON files in eval_results_path.

    get_raw_eval_results is bypassed entirely because the JSONs already use a
    simple schema:
        - config.model_name  (with model_id / model as fallbacks)
        - results[benchmark_name]["acc"] in [0, 1]

    One row is built per ``*.json`` file:
        - every column in ``cols`` starts as None,
        - the model column becomes a clickable HF link,
        - each benchmark column in ``benchmark_cols`` gets its metric * 100,
        - the average column is the mean of the available benchmark scores.

    Parameters
    ----------
    eval_results_path : str
        Directory containing per-model JSON result files.
    eval_requests_path : str
        Unused here; kept so the caller-facing signature stays unchanged.
    cols : list[str]
        Full ordered list of leaderboard columns.
    benchmark_cols : list[str]
        Subset of ``cols`` holding benchmark scores.

    Returns
    -------
    pd.DataFrame
        Leaderboard rows ordered per ``cols``; an empty frame with those
        columns when the path is missing or no valid rows are found.
    """
    # 1) Collect all .json files under eval_results_path
    if not os.path.isdir(eval_results_path):
        print(f"Results path '{eval_results_path}' does not exist.")
        return pd.DataFrame(columns=cols)

    json_files = [
        f for f in os.listdir(eval_results_path)
        if f.endswith(".json") and not f.startswith(".")
    ]
    if not json_files:
        print(f"No JSON result files found in '{eval_results_path}'.")
        return pd.DataFrame(columns=cols)

    rows = []
    for fname in json_files:
        fpath = os.path.join(eval_results_path, fname)
        try:
            with open(fpath, "r", encoding="utf-8") as fp:
                data = json.load(fp)
        # Best-effort: one unreadable/corrupt file must not kill the board.
        # OSError covers I/O failures; ValueError covers JSONDecodeError
        # and UnicodeDecodeError.
        except (OSError, ValueError) as e:
            print(f"Failed to read '{fpath}': {e}")
            continue

        # Start with all known columns set to None so the DF matches `cols`.
        row = {c: None for c in cols}

        # ---- model column ----
        config = data.get("config", {})
        model_id = (
            config.get("model_name")
            or config.get("model_id")
            or config.get("model")  # just in case
        )
        if model_id is None:
            # Skip weird files without model info.
            print(f"Skipping '{fname}' – no model_name in config.")
            continue
        # Fill the "model" column (clickable markdown link).
        row[AutoEvalColumn.model.name] = make_clickable_model(model_id)

        # ---- metrics ----
        results = data.get("results", {})
        scores = []
        for bench in benchmark_cols:
            bench_result = results.get(bench)
            if not isinstance(bench_result, dict):
                continue
            # Metric key is "acc"; guard against missing or non-numeric
            # values so a single bad entry can't crash the whole build.
            val = bench_result.get("acc")
            if not isinstance(val, (int, float)):
                continue
            # Convert to percentage (e.g. 0.747 -> 74.7).
            score = float(val) * 100.0
            row[bench] = score
            scores.append(score)

        # ---- Average ⬆️ ----
        avg_col = AutoEvalColumn.average.name
        row[avg_col] = sum(scores) / len(scores) if scores else None

        rows.append(row)

    if not rows:
        print("No valid evaluation rows constructed – returning empty leaderboard.")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(rows)

    # Keep column ordering consistent with `cols`.
    df = df[[c for c in cols if c in df.columns]]

    # Round numeric columns for display.
    num_cols = df.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].round(2)

    # Drop rows with NaNs in any benchmark column.
    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
    if existing_benchmarks:
        df = df.dropna(subset=existing_benchmarks, how="any")

    return df
def get_evaluation_queue_df(save_path: str, cols: list):
    """
    Stubbed evaluation queue.

    No requests dataset / eval queue is used, so this simply makes sure
    ``save_path`` exists and hands back three empty dataframes — finished,
    running, pending — each carrying the expected columns.
    """
    os.makedirs(save_path, exist_ok=True)
    # app.py unpacks the result in this exact order:
    #   finished_df, running_df, pending_df = get_evaluation_queue_df(...)
    finished_df = pd.DataFrame(columns=cols)
    running_df = finished_df.copy()
    pending_df = finished_df.copy()
    return finished_df, running_df, pending_df
|