"""
Data population utilities for leaderboard and evaluation queue management.

This module provides functions to create and populate pandas DataFrames from evaluation
results and submission data. It handles data processing for both the main leaderboard
display and the evaluation queue status tracking.

Key Functions:
    get_leaderboard_df: Creates a sorted leaderboard DataFrame from evaluation results
    get_evaluation_queue_df: Creates separate DataFrames for different evaluation statuses

The module processes JSON files containing evaluation results and submission metadata,
applies formatting transformations, and filters data based on completion status.
"""

import json
import os
from pathlib import Path

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(
    results_versions_dir: Path,
    requests_path: Path,
    *,
    results_version: str,
    cols: list[str],
    benchmark_cols: list[str],
) -> pd.DataFrame:
    """
    Creates a sorted leaderboard DataFrame from evaluation results.

    This function processes raw evaluation data from JSON files and creates a pandas
    DataFrame suitable for leaderboard display. The resulting DataFrame is sorted by
    average performance scores in descending order and filtered to exclude incomplete
    evaluations.

    Args:
        results_versions_dir (Path): Path to the directory containing evaluation result files
        requests_path (Path): Path to the directory containing evaluation request files
        results_version (str): Version identifier selecting which results to load
        cols (list[str]): List of column names to include in the final DataFrame
        benchmark_cols (list[str]): List of benchmark column names used for filtering

    Returns:
        pd.DataFrame: A sorted and filtered DataFrame containing leaderboard data.
            Rows are sorted by average score (descending) and filtered to
            exclude entries with missing benchmark results.

    Note:
        The function automatically truncates numeric values to 1 decimal place and
        filters out any entries that have NaN values in the specified benchmark columns.
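
    Example:
        A minimal usage sketch; the paths, version string, and column names
        below are illustrative assumptions, not values defined by this module:

            leaderboard_df = get_leaderboard_df(
                Path("eval-results"),
                Path("eval-queue"),
                results_version="v1.0",
                cols=["model", "average", "benchmark_a"],
                benchmark_cols=["benchmark_a"],
            )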
    """
    raw_data = get_raw_eval_results(
        results_versions_dir,
        requests_path,
        results_version=results_version,
    )
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df.loc[:, cols]

    # filter out rows where any benchmark result has not been produced
    df = df.loc[has_no_nan_values(df, benchmark_cols), :]
    return df


def get_evaluation_queue_df(save_path: Path, cols: list[str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Creates separate DataFrames for different evaluation queue statuses.

    This function scans a directory for evaluation submission files (both individual
    JSON files and files within subdirectories) and categorizes them by their status.
    It returns three separate DataFrames: finished, running, and pending evaluations.

    Args:
        save_path (Path): Path to the directory containing evaluation submission files
        cols (list[str]): List of column names to include in the final DataFrames

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames in order:
            1. df_finished: Evaluations with status "FINISHED*" or "PENDING_NEW_EVAL"
            2. df_running: Evaluations with status "RUNNING"
            3. df_pending: Evaluations with status "PENDING" or "RERUN"

    Note:
        The function processes both individual JSON files and JSON files within
        subdirectories (excluding markdown files). Model names are automatically
        converted to clickable links, and revision defaults to "main" if not specified.

        Status categorization:
        - FINISHED: Any status starting with "FINISHED" or "PENDING_NEW_EVAL"
        - RUNNING: Status equals "RUNNING"
        - PENDING: Status equals "PENDING" or "RERUN"
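
    Example:
        Each submission file is expected to contain at least the keys read
        below, for instance:

            {"model": "org/model-name", "revision": "main", "status": "PENDING"}

        A minimal usage sketch; the directory and column names are
        illustrative assumptions:

            finished, running, pending = get_evaluation_queue_df(
                Path("eval-queue"), cols=["model", "revision", "status"]
            )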
    """
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            sub_entries = [
                e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished, df_running, df_pending
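

# A hedged smoke-test sketch: every literal below (the queue directory and the
# column names) is an assumption made for illustration, not a value defined by
# this module; run it only against a matching local layout.
if __name__ == "__main__":
    finished_df, running_df, pending_df = get_evaluation_queue_df(
        Path("eval-queue"), cols=["model", "revision", "status"]
    )
    print(f"queue: finished={len(finished_df)} running={len(running_df)} pending={len(pending_df)}")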