File size: 4,092 Bytes
5a5a36e
 
 
 
 
 
e0f982a
5a5a36e
 
 
 
 
 
 
e0f982a
 
 
 
 
5a5a36e
e0f982a
5a5a36e
e0f982a
5a5a36e
 
 
e0f982a
5a5a36e
e0f982a
 
 
 
 
 
5a5a36e
 
 
e0f982a
 
 
5a5a36e
e0f982a
5a5a36e
e0f982a
5a5a36e
 
 
 
 
 
 
 
 
 
 
 
e0f982a
 
5a5a36e
 
e0f982a
 
 
 
 
5a5a36e
e0f982a
 
5a5a36e
e0f982a
5a5a36e
 
 
 
 
e0f982a
 
 
5a5a36e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0f982a
 
 
 
 
 
5a5a36e
 
 
e0f982a
5a5a36e
 
e0f982a
 
5a5a36e
 
 
 
 
 
 
 
 
 
e0f982a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.graph_objs import Figure

from src.leaderboard.filter_models import FLAGGED_MODELS
from src.display.utils import human_baseline_row as HUMAN_BASELINE, auto_eval_cols, Tasks, Task, BENCHMARK_COLS
from src.leaderboard.read_evals import EvalResult


def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Build, for every benchmark column, the history of record-setting scores.

    The evaluation results are sorted by date (when a "date" column exists)
    and walked in that order. For each task a running maximum is tracked,
    starting at 0; every row whose score strictly exceeds it produces a
    ``{"model", "date", "score"}`` record. When two records share the same
    date, the later (higher) one replaces the earlier one so that each date
    appears at most once per task.

    Rows are skipped when the model is no longer on the hub, is flagged, is
    listed in ``FLAGGED_MODELS``, or has no usable date. Missing scores
    (``None``/NaN) never set a record.

    Args:
        raw_data: Evaluation results; each one is converted with ``to_dict()``.

    Returns:
        A dict mapping each name in ``BENCHMARK_COLS`` plus the average column
        name to a DataFrame of its records. A column without any record maps
        to an empty DataFrame.
    """
    results_df = pd.DataFrame([v.to_dict() for v in raw_data])

    if "date" in results_df.columns:
        results_df.sort_values(by="date", inplace=True)

    average_col = auto_eval_cols.average.name
    scores = {k: [] for k in BENCHMARK_COLS + [average_col]}

    # Whether a row is eligible does not depend on the task, so filter the
    # rows once instead of repeating the checks (and iterrows) per task.
    eligible_rows = []
    for _, row in results_df.iterrows():
        current_model = row.get("dummy", "Unknown")

        still_on_hub = row.get(auto_eval_cols.still_on_hub.name, True)
        is_flagged = row.get(auto_eval_cols.flagged.name, False)
        if not still_on_hub or is_flagged or current_model in FLAGGED_MODELS:
            continue

        current_date = row.get("date", "")
        # pd.isna catches NaN/NaT/None, which a plain truthiness test lets
        # through (NaN is truthy) and would otherwise be plotted as a date.
        if pd.isna(current_date) or not current_date:
            continue

        eligible_rows.append((current_model, current_date, row))

    for task in [t.value for t in Tasks] + [Task("Average", "avg", average_col)]:
        column = task.col_name
        score_key = average_col if task.benchmark == "Average" else column
        current_max = 0
        last_date = ""

        for current_model, current_date, row in eligible_rows:
            current_score = row.get(score_key, 0)
            # A missing score cannot be a new maximum; checking explicitly
            # also avoids a TypeError when the value is None.
            if pd.isna(current_score) or current_score <= current_max:
                continue

            record = {"model": current_model, "date": current_date, "score": current_score}
            if current_date == last_date and scores[column]:
                # Same day as the previous record: keep only the best one.
                scores[column][-1] = record
            else:
                scores[column].append(record)
            current_max = current_score
            last_date = current_date

    return {k: pd.DataFrame(v) for k, v in scores.items()}


def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Stack the per-task score frames into one long frame for plotting.

    Every non-empty frame found under a name from ``BENCHMARK_COLS`` or the
    average column is tagged with a "task" column holding that name; the
    tagged frames are concatenated and ordered by "date".

    Args:
        scores_df: Mapping from column name to its DataFrame of records.

    Returns:
        The combined frame with a fresh 0..n-1 index, or an empty frame with
        columns "model", "date", "score", "task" when nothing is available.
    """
    task_names = BENCHMARK_COLS + [auto_eval_cols.average.name]

    # reset_index/assign both return new objects, so the input frames are
    # never modified.
    tagged_frames = [
        scores_df[name].reset_index(drop=True).assign(task=name)
        for name in task_names
        if name in scores_df and not scores_df[name].empty
    ]

    if len(tagged_frames) == 0:
        return pd.DataFrame(columns=["model", "date", "score", "task"])

    stacked = pd.concat(tagged_frames, ignore_index=True)
    return stacked.sort_values(by="date").reset_index(drop=True)


def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
    """
    Draw a line chart of score over time for the selected metrics.

    Args:
        df: Long-format frame with "task", "date", "score" and "model" columns.
        metrics: Task names to keep; rows for other tasks are dropped.
        title: Title of the figure.

    Returns:
        A plotly figure with one line per task, the y axis fixed to 0-100 and
        a dotted horizontal line for every human baseline whose key is in
        ``metrics``. If ``df`` is empty, an empty figure titled
        "No data available" is returned instead.
    """
    if df.empty:
        return px.line(title="No data available")

    selected_rows = df[df["task"].isin(metrics)]
    baselines = {name: level for name, level in HUMAN_BASELINE.items() if name in metrics}

    fig = px.line(
        selected_rows,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    # customdata indices follow the custom_data order above: task, score, model.
    hover_lines = [
        "Model Name: %{customdata[2]}",
        "Metric Name: %{customdata[0]}",
        "Date: %{x}",
        "Metric Value: %{y}",
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    fig.update_layout(yaxis_range=[0, 100])

    # Reuse each trace's line colour so a baseline matches its metric.
    trace_colors = {}
    for trace in fig.data:
        trace_colors[trace.name] = trace.line.color

    for metric, value in baselines.items():
        color = trace_colors.get(metric, "blue")
        # HellaSwag's label goes above its line, all others below
        # (presumably to keep annotations from overlapping - confirm).
        if metric == "HellaSwag":
            location = "top left"
        else:
            location = "bottom left"
        fig.add_hline(
            y=value,
            line_dash="dot",
            annotation_text=f"{metric} human baseline",
            annotation_position=location,
            annotation_font_size=10,
            annotation_font_color=color,
            line_color=color,
        )

    return fig