| | import pandas as pd |
| | import numpy as np |
| | import plotly.express as px |
| | from plotly.graph_objs import Figure |
| |
|
| | from src.leaderboard.filter_models import FLAGGED_MODELS |
| | from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS |
| | from src.leaderboard.read_evals import EvalResult |
| |
|
| |
|
| |
|
def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Build, for each benchmark column, the timeline of record-setting scores.

    :param raw_data: Evaluation results; converted to a DataFrame with (at least)
        "date", "full_model", "still_on_hub", "flagged", "status" and "results" fields.
    :return: Mapping from column name (each benchmark plus the average column) to a
        DataFrame of {"model", "date", "score"} rows, one per new record score.
    """
    results_df = pd.DataFrame(raw_data)
    # Chronological order so a single forward pass tracks the running maximum.
    results_df.sort_values(by="date", inplace=True)

    # One record-score timeline per benchmark, plus one for the average column.
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
        current_max = 0
        last_date = ""
        column = task.col_name
        for _, row in results_df.iterrows():
            current_model = row["full_model"]
            # Skip models that were removed from the hub, flagged (either in the
            # row or in the static FLAGGED_MODELS list), or not fully evaluated.
            to_ignore = (
                not row["still_on_hub"]
                or row["flagged"]
                or current_model in FLAGGED_MODELS
                or row["status"] != "Finished"
            )
            if to_ignore:
                continue

            current_date = row["date"]
            if task.benchmark == "Average":
                current_score = np.mean(list(row["results"].values()))
            else:
                current_score = row["results"][task.benchmark]

            # Record only strictly improving scores; when several records land on
            # the same date, keep just the last (highest) one for that date.
            if current_score > current_max:
                if current_date == last_date and len(scores[column]) > 0:
                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
                else:
                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
                current_max = current_score
                last_date = current_date

    return {k: pd.DataFrame(v) for k, v in scores.items()}
| |
|
| |
|
def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Reshape the per-task score timelines into one long-format DataFrame for plotting.

    :param scores_df: Mapping from column name to a DataFrame of
        {"model", "date", "score"} rows (as produced by ``create_scores_df``).
    :return: A single DataFrame with a "task" column identifying the source
        timeline, sorted by "date" with a fresh index.
    """
    dfs = []

    # Tag each per-task timeline with its column name before concatenation.
    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
        d = scores_df[col].reset_index(drop=True)
        d["task"] = col
        dfs.append(d)

    concat_df = pd.concat(dfs, ignore_index=True)

    concat_df.sort_values(by="date", inplace=True)
    concat_df.reset_index(drop=True, inplace=True)
    return concat_df
| |
|
| |
|
def create_metric_plot_obj(
    df: pd.DataFrame, metrics: list[str], title: str
) -> Figure:
    """
    Build a Plotly line chart of metric scores over time, with dotted
    horizontal lines marking the human baseline of each plotted metric.

    :param df: Long-format DataFrame with "date", "score", "task" and
        "model" columns.
    :param metrics: Metric (task) names to keep in the plot.
    :param title: Title displayed on the figure.
    :return: The assembled Plotly figure.
    """
    # Restrict both the data and the baselines to the requested metrics.
    df = df[df["task"].isin(metrics)]
    baselines = {name: score for name, score in HUMAN_BASELINE.items() if name in metrics}

    fig = px.line(
        df,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    hover_lines = [
        "Model Name: %{customdata[2]}",
        "Metric Name: %{customdata[0]}",
        "Date: %{x}",
        "Metric Value: %{y}",
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    fig.update_layout(yaxis_range=[0, 100])

    # Reuse each trace's line color for its matching baseline annotation.
    trace_colors = {trace.name: trace.line.color for trace in fig.data}

    for name, baseline in baselines.items():
        line_color = trace_colors.get(name, "blue")
        # HellaSwag's label is anchored above the line; the rest go below.
        position = "top left" if name == "HellaSwag" else "bottom left"

        fig.add_hline(
            y=baseline,
            line_dash="dot",
            annotation_text=f"{name} human baseline",
            annotation_position=position,
            annotation_font_size=10,
            annotation_font_color=line_color,
            line_color=line_color,
        )

    return fig
| |
|
| |
|
| | |
| | |
| | |
| |
|