| | import json |
| | import os |
| | from dataclasses import fields |
| |
|
| | import pandas as pd |
| |
|
| | from src.display.formatting import make_clickable_model |
| | from src.display.utils import AutoEvalColumn |
| |
|
| |
|
def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """
    Build the leaderboard dataframe directly from JSON files in eval_results_path.

    We completely bypass get_raw_eval_results because our JSONs are already in
    a simple schema:
        - config.model_name
        - results[benchmark_name]["acc"] in [0, 1]

    For each *.json file we create one row, fill all AutoEvalColumn fields with
    None, then overwrite the ones we know:
        * model        -> clickable HF link
        * Average ⬆️   -> mean of the benchmark scores that were found (percent)
        * each benchmark col in `benchmark_cols` -> metric * 100

    Args:
        eval_results_path: directory containing one result JSON per model.
        eval_requests_path: accepted for interface compatibility; not used
            (we do not read a requests/queue dataset).
        cols: full ordered list of leaderboard column names.
        benchmark_cols: subset of `cols` holding per-benchmark scores.

    Returns:
        A pandas DataFrame with the columns from `cols` that could be filled.
        Rows missing any benchmark score are dropped. Empty DataFrame (with
        `cols` as columns) when the path is missing or yields no valid rows.
    """
    if not os.path.isdir(eval_results_path):
        print(f"Results path '{eval_results_path}' does not exist.")
        return pd.DataFrame(columns=cols)

    json_files = [
        f for f in os.listdir(eval_results_path)
        if f.endswith(".json") and not f.startswith(".")
    ]
    if not json_files:
        print(f"No JSON result files found in '{eval_results_path}'.")
        return pd.DataFrame(columns=cols)

    rows = []
    for fname in json_files:
        row = _build_leaderboard_row(
            os.path.join(eval_results_path, fname), cols, benchmark_cols
        )
        if row is not None:
            rows.append(row)

    if not rows:
        print("No valid evaluation rows constructed – returning empty leaderboard.")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(rows)

    # Keep only the requested columns, in the requested order.
    df = df[[c for c in cols if c in df.columns]]

    # Round numeric columns for display.
    num_cols = df.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].round(2)

    # Drop models that are missing any benchmark score.
    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
    if existing_benchmarks:
        df = df.dropna(subset=existing_benchmarks, how="any")

    return df


def _build_leaderboard_row(fpath, cols, benchmark_cols):
    """Parse one result JSON file into a leaderboard row dict, or None if unusable.

    Unreadable files, files without a model name, and non-numeric metric
    values are reported on stdout and skipped rather than aborting the
    whole leaderboard build.
    """
    try:
        with open(fpath, "r", encoding="utf-8") as fp:
            data = json.load(fp)
    except Exception as e:
        # Best-effort: a single corrupt file must not kill the leaderboard.
        print(f"Failed to read '{fpath}': {e}")
        return None

    # Start with every known column blank so the DataFrame stays aligned.
    row = {c: None for c in cols}

    config = data.get("config", {})
    model_id = (
        config.get("model_name")
        or config.get("model_id")
        or config.get("model")
    )
    if model_id is None:
        print(f"Skipping '{os.path.basename(fpath)}' – no model_name in config.")
        return None

    row[AutoEvalColumn.model.name] = make_clickable_model(model_id)

    results = data.get("results", {})
    scores = []
    for bench in benchmark_cols:
        bench_result = results.get(bench)
        if not isinstance(bench_result, dict):
            continue

        val = bench_result.get("acc")
        if val is None:
            continue

        # Guard the conversion: a malformed value (string, list, ...) in one
        # file previously raised and crashed the entire build.
        try:
            score = float(val) * 100.0
        except (TypeError, ValueError):
            print(f"Skipping non-numeric acc for '{bench}' in '{fpath}': {val!r}")
            continue

        row[bench] = score
        scores.append(score)

    # Average over the benchmarks actually present; None when none were found.
    row[AutoEvalColumn.average.name] = sum(scores) / len(scores) if scores else None
    return row
| |
|
| |
|
def get_evaluation_queue_df(save_path: str, cols: list):
    """
    Stubbed evaluation queue.

    No requests dataset / eval queue is in use, so this simply:
        - ensures the directory exists, and
        - returns three empty dataframes (finished, running, pending)
          with the expected columns.
    """
    os.makedirs(save_path, exist_ok=True)

    # Three independent empty frames sharing the same column layout.
    finished_df = pd.DataFrame(columns=cols)
    running_df = finished_df.copy()
    pending_df = finished_df.copy()
    return finished_df, running_df, pending_df
| |
|
| |
|