File size: 3,827 Bytes
0061e14
243e685
b49d2fb
e00a798
0061e14
416ebf1
e00a798
 
0061e14
e00a798
0061e14
e00a798
 
416ebf1
 
 
0061e14
54e1175
 
 
 
 
 
 
 
 
 
 
b49d2fb
416ebf1
 
 
 
b49d2fb
416ebf1
b49d2fb
416ebf1
54e1175
 
 
 
 
e00a798
 
416ebf1
 
 
 
 
243e685
 
 
 
 
 
 
 
 
 
 
 
e00a798
 
 
 
 
54e1175
e00a798
54e1175
 
e00a798
 
 
 
54e1175
 
 
 
 
 
e00a798
 
 
 
 
6d7c674
a6adcf8
e00a798
 
 
ea641c7
 
e00a798
 
 
ea641c7
 
 
 
 
e00a798
0061e14
e00a798
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
from datasets import DownloadConfig, get_dataset_config_names, load_dataset
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm

from src.display.utils import AutoEvalColumn
from src.envs import TOKEN
from src.logger import get_logger

logger = get_logger(__name__)


def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
    """
    @brief Creates a dataframe from all the individual experiment results.

    Loads every submission config from the results dataset on the Hub,
    validates it, computes overall and per-tier success rates, and keeps
    only the latest entry per (System, Organization) pair.

    @param results_dataset_name Name of the Hub dataset whose configs each
           hold one submission's per-task results.
    @return DataFrame with one row per (System, Organization), sorted by
            overall success rate descending. Returns an empty DataFrame with
            the expected columns when the dataset is missing, has only the
            "default" config, or contains no valid submissions.
    """

    empty_df = pd.DataFrame(
        columns=[
            AutoEvalColumn.system.name,
            AutoEvalColumn.organization.name,
            AutoEvalColumn.success_rate_overall.name,
            AutoEvalColumn.success_rate_tier1.name,
            AutoEvalColumn.success_rate_tier2.name,
            AutoEvalColumn.submitted_on.name,
        ]
    )

    try:
        configs = get_dataset_config_names(
            results_dataset_name,
            token=TOKEN,
        )
    except (DatasetNotFoundError, FileNotFoundError):
        # Dataset does not exist (yet) — return an empty DataFrame with the
        # expected columns so the UI still renders.
        logger.warning("Failed to load configuration", exc_info=True)
        return empty_df

    if configs == ["default"]:
        logger.info("Dataset has only default config — treating as empty")
        return empty_df

    rows = []
    for submission_id in tqdm(
        configs,
        total=len(configs),
        desc="Processing Submission Results",
    ):
        try:
            submission_ds = load_dataset(
                results_dataset_name,
                submission_id,
                split="train",
                token=TOKEN,
                download_config=DownloadConfig(timeout=60),
            )
            submission_df = pd.DataFrame(submission_ds)
        except Exception as e:
            # Best-effort: one broken submission must not sink the whole board.
            logger.warning(f"Failed to load submission {submission_id}: {e}")
            continue

        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df["did_pass"].isna().any():
            logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
            continue

        # Validate tiers explicitly instead of `assert` (asserts are stripped
        # under `python -O`, and a bad submission should be skipped like the
        # other validation failures above, not crash the whole load).
        if "tier" not in submission_df.columns or not submission_df["tier"].isin([1, 2]).all():
            logger.warning(f"Skipping {submission_id} due to invalid tier values")
            continue

        success_rate = 100 * submission_df["did_pass"].mean()
        # Per-tier means are NaN when a tier has no rows; rendered as-is.
        tier1_success_rate = 100 * submission_df[submission_df["tier"] == 1]["did_pass"].mean()
        tier2_success_rate = 100 * submission_df[submission_df["tier"] == 2]["did_pass"].mean()
        first_row = submission_df.iloc[0]

        rows.append(
            {
                AutoEvalColumn.system.name: first_row["system_name"],
                AutoEvalColumn.organization.name: first_row["organization"],
                AutoEvalColumn.success_rate_overall.name: success_rate,
                AutoEvalColumn.success_rate_tier1.name: tier1_success_rate,
                AutoEvalColumn.success_rate_tier2.name: tier2_success_rate,
                AutoEvalColumn.submitted_on.name: pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
            }
        )

    # Guard: with zero valid submissions, pd.DataFrame([]) has no columns and
    # the sort below would raise KeyError.
    if not rows:
        logger.info("No valid submissions found — returning empty leaderboard")
        return empty_df

    full_df = pd.DataFrame(rows)

    logger.info(f"Loaded results df with {len(full_df)} entries")

    # Keep only the latest entry per unique (System, Organization) pair.
    # Use the AutoEvalColumn constant for the sort key (was a hard-coded
    # "Submitted On" string) so renames of the display column stay in sync.
    final_df = (
        full_df.sort_values(AutoEvalColumn.submitted_on.name, ascending=False)
        .drop_duplicates(subset=[AutoEvalColumn.system.name, AutoEvalColumn.organization.name], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate_overall.name], ascending=False)
        .reset_index(drop=True)
    )

    cols_to_round = [
        AutoEvalColumn.success_rate_overall.name,
        AutoEvalColumn.success_rate_tier1.name,
        AutoEvalColumn.success_rate_tier2.name,
    ]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)

    return final_df