import pandas as pd
import numpy as np
import plotly.express as px
from plotly.graph_objs import Figure
from src.leaderboard.filter_models import FLAGGED_MODELS
from src.display.utils import human_baseline_row as HUMAN_BASELINE, auto_eval_cols, Tasks, Task, BENCHMARK_COLS
from src.leaderboard.read_evals import EvalResult
def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Build, for each benchmark column, the running-maximum score over time.

    Results are walked in date order; a new entry is recorded every time a
    model beats the previous best score for that column. When several
    improvements share the same date, only the last (highest) one is kept.
    Models that are no longer on the hub or that are flagged are skipped.

    Args:
        raw_data: evaluation results to aggregate.

    Returns:
        Mapping from benchmark column name (plus the "Average" column) to a
        DataFrame with columns ``model``, ``date`` and ``score``. A column's
        DataFrame is empty if no model ever set a new maximum for it.
    """
    results_df = pd.DataFrame([v.to_dict() for v in raw_data])
    if "date" in results_df.columns:
        results_df.sort_values(by="date", inplace=True)

    scores: dict[str, list[dict]] = {k: [] for k in BENCHMARK_COLS + [auto_eval_cols.average.name]}
    # The synthetic "Average" task tracks the overall average column.
    for task in [t.value for t in Tasks] + [Task("Average", "avg", auto_eval_cols.average.name)]:
        current_max = 0
        last_date = ""
        column = task.col_name
        for _, row in results_df.iterrows():
            current_model = row.get("dummy", "Unknown")
            # Skip delisted or flagged models.
            still_on_hub = row.get(auto_eval_cols.still_on_hub.name, True)
            is_flagged = row.get(auto_eval_cols.flagged.name, False)
            if not still_on_hub or is_flagged or current_model in FLAGGED_MODELS:
                continue
            current_date = row.get("date", "")
            if not current_date:
                continue
            if task.benchmark == "Average":
                current_score = row.get(auto_eval_cols.average.name, 0)
            else:
                current_score = row.get(task.col_name, 0)
            if current_score > current_max:
                # Same-date improvement: overwrite the previous entry so each
                # date appears at most once per column.
                if current_date == last_date and scores[column]:
                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
                else:
                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
                current_max = current_score
                last_date = current_date
    return {k: pd.DataFrame(v) for k, v in scores.items()}
def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Flatten per-benchmark score tables into one long-format DataFrame.

    Each non-empty per-column DataFrame is tagged with a ``task`` column and
    concatenated; the result is sorted by date. Returns an empty DataFrame
    with the expected columns when there is nothing to plot.
    """
    frames = []
    for task_col in BENCHMARK_COLS + [auto_eval_cols.average.name]:
        frame = scores_df.get(task_col)
        if frame is None or frame.empty:
            continue
        tagged = frame.reset_index(drop=True)
        tagged["task"] = task_col
        frames.append(tagged)

    if not frames:
        return pd.DataFrame(columns=["model", "date", "score", "task"])

    combined = pd.concat(frames, ignore_index=True)
    return combined.sort_values(by="date").reset_index(drop=True)
def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
    """Plot score-over-time lines for the requested metrics.

    Adds a dotted horizontal line for each metric that has a human baseline,
    colored to match that metric's trace. Returns a placeholder figure when
    the input DataFrame is empty.
    """
    if df.empty:
        return px.line(title="No data available")

    plot_df = df[df["task"].isin(metrics)]
    baselines = {name: score for name, score in HUMAN_BASELINE.items() if name in metrics}

    fig = px.line(
        plot_df,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )
    hover_lines = [
        "Model Name: %{customdata[2]}",
        "Metric Name: %{customdata[0]}",
        "Date: %{x}",
        "Metric Value: %{y}",
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    fig.update_layout(yaxis_range=[0, 100])

    # Color each baseline like its metric's trace; fall back to blue when
    # the metric has no trace in this figure.
    trace_colors = {trace.name: trace.line.color for trace in fig.data}
    for metric_name, baseline in baselines.items():
        line_color = trace_colors.get(metric_name, "blue")
        position = "top left" if metric_name == "HellaSwag" else "bottom left"
        fig.add_hline(
            y=baseline,
            line_dash="dot",
            annotation_text=f"{metric_name} human baseline",
            annotation_position=position,
            annotation_font_size=10,
            annotation_font_color=line_color,
            line_color=line_color,
        )
    return fig