File size: 4,452 Bytes
7800237 654c990 7800237 654c990 781d4b0 2e39b31 781d4b0 974e6f0 781d4b0 0235b45 654c990 fd29588 654c990 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | import json
import os
from dataclasses import fields
import pandas as pd
from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn
def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """
    Build the leaderboard dataframe directly from JSON files in eval_results_path.

    get_raw_eval_results is bypassed entirely because the JSONs already use a
    simple schema:
        - config.model_name  (with model_id / model as fallbacks)
        - results[benchmark_name]["acc"] in [0, 1]

    One row is built per ``*.json`` file:
        - every column in ``cols`` starts as None,
        - the model column becomes a clickable HF link,
        - each benchmark column in ``benchmark_cols`` gets its metric * 100,
        - the average column is the mean of the available benchmark scores.

    Parameters
    ----------
    eval_results_path : str
        Directory containing per-model JSON result files.
    eval_requests_path : str
        Unused here; kept so the caller-facing signature stays unchanged.
    cols : list[str]
        Full ordered list of leaderboard columns.
    benchmark_cols : list[str]
        Subset of ``cols`` holding benchmark scores.

    Returns
    -------
    pd.DataFrame
        Leaderboard rows ordered per ``cols``; an empty frame with those
        columns when the path is missing or no valid rows are found.
    """
    # 1) Collect all .json files under eval_results_path
    if not os.path.isdir(eval_results_path):
        print(f"Results path '{eval_results_path}' does not exist.")
        return pd.DataFrame(columns=cols)

    json_files = [
        f for f in os.listdir(eval_results_path)
        if f.endswith(".json") and not f.startswith(".")
    ]
    if not json_files:
        print(f"No JSON result files found in '{eval_results_path}'.")
        return pd.DataFrame(columns=cols)

    rows = []
    for fname in json_files:
        fpath = os.path.join(eval_results_path, fname)
        try:
            with open(fpath, "r", encoding="utf-8") as fp:
                data = json.load(fp)
        # Best-effort: one unreadable/corrupt file must not kill the board.
        # OSError covers I/O failures; ValueError covers JSONDecodeError
        # and UnicodeDecodeError.
        except (OSError, ValueError) as e:
            print(f"Failed to read '{fpath}': {e}")
            continue

        # Start with all known columns set to None so the DF matches `cols`.
        row = {c: None for c in cols}

        # ---- model column ----
        config = data.get("config", {})
        model_id = (
            config.get("model_name")
            or config.get("model_id")
            or config.get("model")  # just in case
        )
        if model_id is None:
            # Skip weird files without model info.
            print(f"Skipping '{fname}' – no model_name in config.")
            continue
        # Fill the "model" column (clickable markdown link).
        row[AutoEvalColumn.model.name] = make_clickable_model(model_id)

        # ---- metrics ----
        results = data.get("results", {})
        scores = []
        for bench in benchmark_cols:
            bench_result = results.get(bench)
            if not isinstance(bench_result, dict):
                continue
            # Metric key is "acc"; guard against missing or non-numeric
            # values so a single bad entry can't crash the whole build.
            val = bench_result.get("acc")
            if not isinstance(val, (int, float)):
                continue
            # Convert to percentage (e.g. 0.747 -> 74.7).
            score = float(val) * 100.0
            row[bench] = score
            scores.append(score)

        # ---- Average ⬆️ ----
        avg_col = AutoEvalColumn.average.name
        row[avg_col] = sum(scores) / len(scores) if scores else None

        rows.append(row)

    if not rows:
        print("No valid evaluation rows constructed – returning empty leaderboard.")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(rows)

    # Keep column ordering consistent with `cols`.
    df = df[[c for c in cols if c in df.columns]]

    # Round numeric columns for display.
    num_cols = df.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].round(2)

    # Drop rows with NaNs in any benchmark column.
    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
    if existing_benchmarks:
        df = df.dropna(subset=existing_benchmarks, how="any")

    return df
def get_evaluation_queue_df(save_path: str, cols: list):
    """
    Stubbed evaluation queue.

    No requests dataset / eval queue is used, so this simply makes sure
    ``save_path`` exists and hands back three empty dataframes — finished,
    running, pending — each carrying the expected columns.
    """
    os.makedirs(save_path, exist_ok=True)
    # app.py unpacks the result in this exact order:
    #   finished_df, running_df, pending_df = get_evaluation_queue_df(...)
    finished_df = pd.DataFrame(columns=cols)
    running_df = finished_df.copy()
    pending_df = finished_df.copy()
    return finished_df, running_df, pending_df
|