# Atlas / src/populate.py
# Author: Victor Dieguez
# Commit: 974e6f0 – Removing envs variables
import json
import os
from dataclasses import fields
import pandas as pd
from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn
def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """
    Build the leaderboard dataframe directly from JSON files in eval_results_path.

    get_raw_eval_results is completely bypassed because the JSONs already
    follow a simple schema:
        - config.model_name
        - results[benchmark_name]["acc"] in [0, 1]

    One row is created per *.json file: every column in `cols` starts as
    None, then the known ones are overwritten:
        * model -> clickable HF link
        * Average ⬆️ -> mean of all metrics (in percentage)
        * each benchmark col in `benchmark_cols` -> metric * 100

    Args:
        eval_results_path: directory holding one result JSON per model.
        eval_requests_path: unused here; kept so the signature matches callers.
        cols: full ordered list of leaderboard column names.
        benchmark_cols: subset of `cols` holding per-benchmark scores.

    Returns:
        DataFrame ordered per `cols` (columns absent from the data are
        dropped), numeric values rounded to 2 decimals, and rows missing
        any benchmark score removed. Empty DataFrame when nothing usable
        is found.
    """
    # 1) Collect all .json files under eval_results_path
    if not os.path.isdir(eval_results_path):
        print(f"Results path '{eval_results_path}' does not exist.")
        return pd.DataFrame(columns=cols)
    json_files = [
        f for f in os.listdir(eval_results_path)
        if f.endswith(".json") and not f.startswith(".")
    ]
    if not json_files:
        print(f"No JSON result files found in '{eval_results_path}'.")
        return pd.DataFrame(columns=cols)

    # 2) One row per parseable result file
    rows = []
    for fname in json_files:
        row = _build_leaderboard_row(
            os.path.join(eval_results_path, fname), fname, cols, benchmark_cols
        )
        if row is not None:
            rows.append(row)
    if not rows:
        print("No valid evaluation rows constructed – returning empty leaderboard.")
        return pd.DataFrame(columns=cols)

    # 3) Assemble and normalize the frame
    df = pd.DataFrame(rows)
    # Keep column ordering consistent with COLS
    existing_cols = [c for c in cols if c in df.columns]
    df = df[existing_cols]
    # Round numeric columns
    num_cols = df.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].round(2)
    # Drop rows with NaNs in benchmark columns so partial runs don't rank
    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
    if existing_benchmarks:
        df = df.dropna(subset=existing_benchmarks, how="any")
    return df


def _build_leaderboard_row(fpath, fname, cols, benchmark_cols):
    """Parse one result JSON into a leaderboard row dict, or None if unusable.

    Prints a diagnostic and returns None on unreadable files or files
    lacking a model identifier in `config`.
    """
    try:
        with open(fpath, "r", encoding="utf-8") as fp:
            data = json.load(fp)
    except Exception as e:
        print(f"Failed to read '{fpath}': {e}")
        return None

    # Start with all columns set to None so the DF matches the expected schema
    row = {c: None for c in cols}

    # ---- model column ----
    config = data.get("config", {})
    model_id = (
        config.get("model_name")
        or config.get("model_id")
        or config.get("model")  # just in case
    )
    if model_id is None:
        # skip weird files without model info
        print(f"Skipping '{fname}' – no model_name in config.")
        return None
    # Fill the "model" column (clickable markdown link)
    row[AutoEvalColumn.model.name] = make_clickable_model(model_id)

    # ---- metrics ----
    results = data.get("results", {})
    scores = []
    for bench in benchmark_cols:
        bench_result = results.get(bench)
        if not isinstance(bench_result, dict):
            continue
        # Metric key "acc" is the agreed schema for these JSONs
        val = bench_result.get("acc")
        if val is None:
            continue
        # Convert to percentage (e.g. 0.747 -> 74.7)
        score = float(val) * 100.0
        row[bench] = score
        scores.append(score)

    # ---- Average ⬆️ ----
    row[AutoEvalColumn.average.name] = sum(scores) / len(scores) if scores else None
    return row
def get_evaluation_queue_df(save_path: str, cols: list):
    """
    Stubbed evaluation queue.

    No requests dataset / eval queue is used, so this simply:
      - makes sure `save_path` exists on disk, and
      - returns three independent empty dataframes (finished, running,
        pending) carrying the expected columns.
    """
    os.makedirs(save_path, exist_ok=True)
    # Ordering matters: app.py unpacks as
    #   finished_df, running_df, pending_df = get_evaluation_queue_df(...)
    finished_df, running_df, pending_df = (
        pd.DataFrame(columns=cols) for _ in range(3)
    )
    return finished_df, running_df, pending_df