| import math |
|
|
| import numpy as np |
| import pandas as pd |
| import plotly.express as px |
|
|
|
|
| |
def _square_pair_counts(frame, models):
    """Count rows of *frame* per (model_a, model_b) pair on a square *models* grid."""
    if frame.empty:
        # Nothing to pivot: every pair has a count of zero.
        return pd.DataFrame(0, index=models, columns=models)
    counts = pd.pivot_table(
        frame,
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # Pairs (or whole models) missing from this subset count as zero, not NaN.
    return counts.reindex(index=models, columns=models, fill_value=0)


def compute_pairwise_win_fraction(battles):
    """Return the fraction of battles each row model wins against each column model.

    Args:
        battles: DataFrame with ``model_a``, ``model_b`` and ``win`` columns.
            A row is a win for the first model when ``win == "model_a"`` and
            for the second model when ``win == "model_b"``; any other value
            counts as a battle that neither side won.

    Returns:
        A square DataFrame indexed by model on both axes, where cell
        ``[r, c]`` is (wins of ``r`` over ``c``) / (battles between ``r`` and
        ``c``), regardless of which side each model was on. Pairs that never
        met (including the diagonal) are NaN. Rows and columns are ordered by
        descending mean win fraction.
    """
    # Every count table is laid out on the same square grid so that the
    # element-wise arithmetic below cannot introduce NaN through label
    # alignment when a model only ever appears on one side.
    models = pd.Index(pd.concat([battles["model_a"], battles["model_b"]]).unique())

    # Times the model_a side won, indexed [model_a, model_b].
    a_win_ptbl = _square_pair_counts(battles[battles["win"] == "model_a"], models)
    # Times the model_b side won, indexed [model_a, model_b].
    b_win_ptbl = _square_pair_counts(battles[battles["win"] == "model_b"], models)
    # All battles, indexed [model_a, model_b].
    num_battles_ptbl = _square_pair_counts(battles, models)

    # Row model's wins = wins as model_a plus wins as model_b (transposed so
    # the winner is on the row axis); the denominator folds both orientations.
    row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (num_battles_ptbl + num_battles_ptbl.T)

    # Order models from strongest to weakest by average win fraction.
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    return row_beats_col_freq.loc[model_names, model_names]
|
|
|
|
def visualize_pairwise_win_fraction(battles, title):
    """Build a heatmap figure of the pairwise win-fraction matrix.

    Args:
        battles: Battle records passed through to
            ``compute_pairwise_win_fraction``.
        title: Figure title.

    Returns:
        The figure produced by ``px.imshow``; rows are labelled "Model A" and
        columns "Model B", and each cell is annotated to two decimals.
    """
    win_fractions = compute_pairwise_win_fraction(battles)

    heatmap = px.imshow(
        win_fractions,
        title=title,
        text_auto=".2f",
        color_continuous_scale="RdBu",
    )

    layout_options = {
        "xaxis_title": "Model B",
        "yaxis_title": "Model A",
        "xaxis_side": "top",
        "title_y": 0.07,
        "title_x": 0.5,
    }
    heatmap.update_layout(**layout_options)

    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    heatmap.update_traces(hovertemplate=hover_text)
    return heatmap
|
|
|
|
| |
def switch_model_a_b(df):
    """Return a copy of *df* with the two sides swapped in a random half of the rows.

    Each row is swapped independently with probability 0.5 (one draw per row
    from NumPy's global random state). In a swapped row ``model_a`` and
    ``model_b`` exchange values and a ``win`` of ``"model_a"`` / ``"model_b"``
    is flipped to the other side; any other ``win`` value is left as is.

    Args:
        df: DataFrame with ``model_a``, ``model_b`` and ``win`` columns.

    Returns:
        A new DataFrame; *df* itself is not modified.
    """
    df_switch = df.copy()

    # One draw per row. The mask is positional, so rows are handled correctly
    # even when the index contains duplicate labels (e.g. after resampling),
    # where label-based ``.at`` assignment would touch every matching row.
    swap = np.random.rand(len(df)) < 0.5

    # Exchange the two model columns in the selected rows.
    df_switch.loc[swap, ["model_a", "model_b"]] = df.loc[swap, ["model_b", "model_a"]].to_numpy()

    # Flip the winner. Both masks are computed from the ORIGINAL column so a
    # row flipped by the first assignment is not flipped back by the second.
    a_won = (df["win"] == "model_a").to_numpy()
    b_won = (df["win"] == "model_b").to_numpy()
    df_switch.loc[swap & a_won, "win"] = "model_b"
    df_switch.loc[swap & b_won, "win"] = "model_a"
    return df_switch
|
|
|
|
def visualize_battle_count(battles, title):
    """Build a heatmap figure of how many battles each pair of models has had.

    Args:
        battles: DataFrame with ``model_a`` and ``model_b`` columns, one row
            per battle.
        title: Figure title.

    Returns:
        The figure produced by ``px.imshow``. Cell ``[r, c]`` is the number of
        battles between ``r`` and ``c`` in either orientation; models are
        ordered by descending total battle count.
    """
    # All models seen on either side, so the table is square.
    models = pd.Index(pd.concat([battles["model_a"], battles["model_b"]]).unique())

    ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
    # Without this, adding the transpose aligns on labels and yields NaN for
    # any model that only ever appeared on one side.
    ptbl = ptbl.reindex(index=models, columns=models, fill_value=0)

    # Fold both orientations (A vs B and B vs A) into one symmetric count.
    battle_counts = ptbl + ptbl.T
    ordering = battle_counts.sum().sort_values(ascending=False).index

    fig = px.imshow(battle_counts.loc[ordering, ordering], title=title, text_auto=True, width=600)
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )
    fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig
|
|
|
|
| |
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Run a rating function on bootstrap resamples of *battles*.

    Args:
        battles: DataFrame of battle records.
        func_compute_elo: Callable applied to each resample; its result
            becomes one row of the output (e.g. a mapping of model to rating).
        num_round: Number of resamples to draw.

    Returns:
        DataFrame with one row per round, columns ordered by descending
        median value.
    """
    per_round = []
    for _ in range(num_round):
        # Same size as the input, drawn with replacement.
        resampled = battles.sample(frac=1.0, replace=True)
        per_round.append(func_compute_elo(resampled))

    ratings = pd.DataFrame(per_round)
    column_order = ratings.median().sort_values(ascending=False).index
    return ratings[column_order]
|
|
|
|
def visualize_bootstrap_scores(df, title):
    """Build a scatter figure of median ratings with 95% quantile error bars.

    Args:
        df: DataFrame with one column per model and one row per bootstrap
            round.
        title: Figure title.

    Returns:
        The figure produced by ``px.scatter``: one point per model at its
        median, with error bars reaching the 2.5% and 97.5% quantiles and the
        median (rounded to two decimals) as the point label.
    """
    quantiles = {
        "lower": df.quantile(0.025),
        "rating": df.quantile(0.5),
        "upper": df.quantile(0.975),
    }
    summary = pd.DataFrame(quantiles).reset_index(names="model")
    summary = summary.sort_values("rating", ascending=False)

    # Plotly takes error bars as distances from the plotted value.
    summary["error_y"] = summary["upper"] - summary["rating"]
    summary["error_y_minus"] = summary["rating"] - summary["lower"]
    summary["rating_rounded"] = np.round(summary["rating"], 2)

    scatter = px.scatter(
        summary,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
    )
    scatter.update_layout(xaxis_title="Model", yaxis_title="Rating")
    return scatter
|
|
|
|
| |
def visualize_rating_count(df, title):
    """Build a bar chart of how often each model appears and save it as HTML.

    Args:
        df: DataFrame with ``model_a`` and ``model_b`` columns; a model is
            counted once for every row it appears in on either side.
        title: Figure title.

    Returns:
        The figure produced by ``px.bar``.

    Side effects:
        Writes the figure to ``model_counts.html`` in the current working
        directory as an HTML fragment that loads plotly.js from the CDN.
    """
    df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
    fig = px.bar(df_all_value_counts, title=title, text_auto=True)

    min_y = df_all_value_counts.min()
    max_y = df_all_value_counts.max()

    # Snap the axis outwards to multiples of 100: the lower bound comes from
    # the smallest count and the upper bound from the largest.
    y_begin = math.floor(min_y / 100) * 100
    y_end = math.ceil(max_y / 100) * 100
    if y_end == y_begin:
        # All counts equal an exact multiple of 100; keep a visible range.
        y_end = y_begin + 100

    fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
    fig.update_yaxes(range=[y_begin, y_end])

    fig.write_html("model_counts.html", full_html=False, include_plotlyjs="cdn")
    return fig
|
|