File size: 4,452 Bytes
7800237
 
654c990
7800237
 
 
654c990
 
781d4b0
2e39b31
781d4b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974e6f0
 
781d4b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0235b45
654c990
fd29588
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654c990
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import json
import os
from dataclasses import fields

import pandas as pd

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn


def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """
    Build the leaderboard dataframe directly from JSON files in eval_results_path.

    Completely bypasses get_raw_eval_results: the result JSONs already use a
    simple schema:
      - config.model_name  (also accepts model_id / model)
      - results[benchmark_name]["acc"] in [0, 1]

    Args:
        eval_results_path: Directory holding one ``*.json`` result file per model.
        eval_requests_path: Unused; kept for signature compatibility with callers.
        cols: Full ordered list of leaderboard column names.
        benchmark_cols: Subset of ``cols`` holding per-benchmark scores.

    Returns:
        A DataFrame with one row per valid result file. Scores are scaled to
        percentages and rounded to 2 decimals; the "Average" column is the mean
        of the available benchmark scores; rows missing any benchmark score are
        dropped. An empty DataFrame (with ``cols``) is returned when no usable
        results exist.
    """
    # 1) Collect all .json files under eval_results_path
    if not os.path.isdir(eval_results_path):
        print(f"Results path '{eval_results_path}' does not exist.")
        return pd.DataFrame(columns=cols)

    json_files = [
        f for f in os.listdir(eval_results_path)
        if f.endswith(".json") and not f.startswith(".")
    ]
    if not json_files:
        print(f"No JSON result files found in '{eval_results_path}'.")
        return pd.DataFrame(columns=cols)

    # 2) One row per readable result file; a single bad file must not kill
    #    the whole leaderboard, so failures are logged and skipped.
    rows = []
    for fname in json_files:
        fpath = os.path.join(eval_results_path, fname)
        try:
            with open(fpath, "r", encoding="utf-8") as fp:
                data = json.load(fp)
        except Exception as e:
            print(f"Failed to read '{fpath}': {e}")
            continue

        row = _build_leaderboard_row(data, fname, cols, benchmark_cols)
        if row is not None:
            rows.append(row)

    if not rows:
        print("No valid evaluation rows constructed – returning empty leaderboard.")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(rows)

    # Keep column ordering consistent with `cols`.
    df = df[[c for c in cols if c in df.columns]]

    # Round numeric columns for display.
    num_cols = df.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].round(2)

    # Drop models that are missing any benchmark score.
    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
    if existing_benchmarks:
        df = df.dropna(subset=existing_benchmarks, how="any")

    return df


def _build_leaderboard_row(data, fname, cols, benchmark_cols):
    """Convert one parsed result JSON into a leaderboard row dict.

    Starts from a row with every column set to None so the DataFrame matches
    the expected schema, then fills the model link, each benchmark score
    (as a percentage) and the average. Returns None when the file carries no
    model identifier.
    """
    row = {c: None for c in cols}

    # ---- model column ----
    config = data.get("config", {})
    # Accept a few key spellings for robustness across result producers.
    model_id = (
        config.get("model_name")
        or config.get("model_id")
        or config.get("model")  # just in case
    )
    if model_id is None:
        # Skip files without model info.
        print(f"Skipping '{fname}' – no model_name in config.")
        return None

    # Fill the "model" column (clickable markdown link).
    row[AutoEvalColumn.model.name] = make_clickable_model(model_id)

    # ---- metrics ----
    results = data.get("results", {})
    scores = []
    for bench in benchmark_cols:
        bench_result = results.get(bench)
        if not isinstance(bench_result, dict):
            continue

        # The agreed-on metric key in these JSONs is "acc".
        val = bench_result.get("acc")
        if val is None:
            continue

        try:
            # Convert to percentage (e.g. 0.747 -> 74.7).
            score = float(val) * 100.0
        except (TypeError, ValueError):
            # Malformed metric value: skip it instead of crashing the build.
            print(f"Skipping non-numeric '{bench}' score in '{fname}': {val!r}")
            continue

        row[bench] = score
        scores.append(score)

    # ---- Average ⬆️ ---- (stays None when no benchmark produced a score)
    if scores:
        row[AutoEvalColumn.average.name] = sum(scores) / len(scores)

    return row


def get_evaluation_queue_df(save_path: str, cols: list):
    """
    Stubbed evaluation queue.

    No requests dataset / eval queue is in use, so this simply:
    - makes sure the target directory exists, and
    - hands back three empty dataframes (finished, running, pending)
      carrying the expected columns.
    """
    os.makedirs(save_path, exist_ok=True)

    finished_df = pd.DataFrame(columns=cols)
    running_df = finished_df.copy()
    pending_df = finished_df.copy()

    # Order matters: app.py unpacks finished_df, running_df, pending_df.
    return finished_df, running_df, pending_df