import json
import os
from dataclasses import fields

import pandas as pd

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn


def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """
    Build the leaderboard dataframe directly from JSON files in eval_results_path.

    We completely bypass get_raw_eval_results because our JSONs are already in a
    simple schema:
      - config.model_name
      - results[benchmark_name]["acc"] in [0, 1]

    For each ``*.json`` file we create one row, fill every column in ``cols``
    with None, then overwrite the ones we know:
      * model        -> clickable HF link
      * Average ⬆️   -> mean of all found metrics (in percentage)
      * each benchmark col in ``benchmark_cols`` -> metric * 100

    Args:
        eval_results_path: Directory containing per-model result JSON files.
        eval_requests_path: Unused; kept for signature compatibility with app.py.
        cols: Full ordered list of leaderboard column names.
        benchmark_cols: Subset of ``cols`` holding per-benchmark scores.

    Returns:
        A pandas DataFrame with columns ordered per ``cols`` (restricted to the
        columns actually present), numeric values rounded to 2 decimals, and
        rows missing any benchmark score dropped. Empty DataFrame on error.
    """
    # 1) Collect all .json files under eval_results_path
    if not os.path.isdir(eval_results_path):
        print(f"Results path '{eval_results_path}' does not exist.")
        return pd.DataFrame(columns=cols)

    json_files = [
        f
        for f in os.listdir(eval_results_path)
        if f.endswith(".json") and not f.startswith(".")
    ]
    if not json_files:
        print(f"No JSON result files found in '{eval_results_path}'.")
        return pd.DataFrame(columns=cols)

    rows = []
    for fname in json_files:
        fpath = os.path.join(eval_results_path, fname)
        try:
            with open(fpath, "r", encoding="utf-8") as fp:
                data = json.load(fp)
        except Exception as e:
            # Best-effort: one unreadable file must not kill the leaderboard.
            print(f"Failed to read '{fpath}': {e}")
            continue

        # Start with all columns set to None so the DF matches the expected schema.
        row = {c: None for c in cols}

        # ---- model column ----
        config = data.get("config", {})
        model_id = (
            config.get("model_name")
            or config.get("model_id")
            or config.get("model")  # just in case
        )
        if model_id is None:
            # Skip weird files without model info.
            print(f"Skipping '{fname}' – no model_name in config.")
            continue

        # Fill the "model" column (clickable markdown link).
        row[AutoEvalColumn.model.name] = make_clickable_model(model_id)

        # ---- metrics ----
        results = data.get("results", {})
        scores = []
        for bench in benchmark_cols:
            bench_result = results.get(bench, None)
            if not isinstance(bench_result, dict):
                continue

            # We agreed on metric key "acc" in your JSONs.
            val = bench_result.get("acc", None)
            if val is None:
                continue

            # Convert to percentage (e.g. 0.747 -> 74.7). Skip values that
            # are not numeric (e.g. "N/A") instead of crashing the whole build.
            try:
                score = float(val) * 100.0
            except (TypeError, ValueError):
                print(f"Skipping non-numeric 'acc' for '{bench}' in '{fname}': {val!r}")
                continue
            row[bench] = score
            scores.append(score)

        # ---- Average ⬆️ ----
        avg_col = AutoEvalColumn.average.name
        row[avg_col] = sum(scores) / len(scores) if scores else None

        rows.append(row)

    if not rows:
        print("No valid evaluation rows constructed – returning empty leaderboard.")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(rows)

    # Keep column ordering consistent with COLS.
    existing_cols = [c for c in cols if c in df.columns]
    df = df[existing_cols]

    # Round numeric columns for display.
    num_cols = df.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].round(2)

    # Optional: drop rows with NaNs in benchmark columns.
    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
    if existing_benchmarks:
        df = df.dropna(subset=existing_benchmarks, how="any")

    return df


def get_evaluation_queue_df(save_path: str, cols: list):
    """
    Stubbed evaluation queue.

    You are not using a requests dataset / eval queue, so we just:
      - ensure the directory exists, and
      - return three empty dataframes (finished, running, pending) with the
        expected columns.

    Args:
        save_path: Directory to create if missing.
        cols: Column names for the empty queue dataframes.

    Returns:
        Tuple of three empty DataFrames: (finished_df, running_df, pending_df).
    """
    os.makedirs(save_path, exist_ok=True)
    empty_df = pd.DataFrame(columns=cols)

    # The order here must match how app.py unpacks the result:
    #   finished_df, running_df, pending_df = get_evaluation_queue_df(...)
    return empty_df, empty_df.copy(), empty_df.copy()