|
|
import traceback

import altair as alt
import pandas as pd
import polars as pl
import streamlit as st
from huggingface_hub import hf_hub_download
|
|
|
|
|
|
|
|
# --- Page / rendering configuration ---
st.set_page_config(layout="wide")

# "default" transformer embeds data directly into each chart spec; the
# frames handed to Altair below are small aggregates, so this is safe.
alt.data_transformers.enable(
    "default"
)

st.title("Search Arena V1 Dataset Analysis")
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data(ttl=3600)
def load_data():
    """Loads the dataset from Hugging Face Hub.

    Downloads the preference parquet (result cached for an hour by
    Streamlit), reads it with polars, and derives a calendar ``date``
    column from ``timestamp`` for the daily aggregations below.

    Returns:
        pl.DataFrame | None: the dataset, or ``None`` when loading fails
        (the error is surfaced in the UI instead of raising).
    """
    try:
        repo_id = "lmarena-ai/search-arena-v1-7k"
        filename = "data/search-arena-v1-preference-7k.parquet"

        # hf_hub_download caches the file locally and returns its path.
        local_path = hf_hub_download(
            repo_id=repo_id, filename=filename, repo_type="dataset"
        )

        df = pl.read_parquet(local_path)
        df = df.with_columns(pl.col("timestamp").dt.date().alias("date"))
        return df
    except Exception as e:
        # Broad catch is deliberate: any failure (auth, network, parse)
        # should degrade to an in-app message rather than crash the page.
        st.error(f"Error loading data: {e}")
        st.warning(
            "Ensure you have added your Hugging Face token (with read access) as a secret named 'HF_TOKEN' in your Space settings."
        )
        return None
|
|
|
|
|
|
|
|
# Load once (cached); abort rendering the rest of the page on failure —
# load_data() has already shown the error in the UI.
df = load_data()

if df is None:
    st.stop()
|
|
|
|
|
|
|
|
|
|
|
st.header("Dataset Overview")

st.subheader("Data Collection Timeline")
try:
    # Daily sample counts -> one bar per calendar date.
    date_counts = df.group_by("date").agg(pl.len().alias("count")).sort("date")
    chart_dates = (
        alt.Chart(date_counts.to_pandas())
        .mark_bar()
        .encode(
            x=alt.X(
                "date:T", axis=alt.Axis(labelAngle=0, title="Date")
            ),
            y=alt.Y("count:Q", axis=alt.Axis(title="Frequency")),
            tooltip=["date:T", "count:Q"],
        )
        .properties(title="Histogram of Dates")
    )
    st.altair_chart(chart_dates, use_container_width=True)
except Exception as e:
    st.error(f"Error generating date histogram: {e}")
|
|
|
|
|
|
|
|
|
|
|
# --- Win/loss/tie statistics per model ---
st.header("Model Performance")
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data(ttl=3600)
def calculate_model_stats(_df):
    """Calculates wins, losses, ties, and win rate for each model."""
    unique_models = (
        pl.concat([_df["model_a"], _df["model_b"]]).unique().sort().to_list()
    )

    def count_rows(predicate):
        # Number of battles matching a polars boolean expression.
        return _df.filter(predicate).height

    rows = []
    for name in unique_models:
        as_a = pl.col("model_a") == name
        as_b = pl.col("model_b") == name
        winner = pl.col("winner")

        # A win is "this model's side was named the winner" in either seat.
        total_wins = count_rows(as_a & (winner == "model_a")) + count_rows(
            as_b & (winner == "model_b")
        )
        # A loss is "the opposite seat was named the winner".
        total_losses = count_rows(as_a & (winner == "model_b")) + count_rows(
            as_b & (winner == "model_a")
        )
        # str.contains("tie") matches both "tie" and "tie (bothbad)".
        total_ties = count_rows(as_a & winner.str.contains("tie")) + count_rows(
            as_b & winner.str.contains("tie")
        )
        total_matches = count_rows(as_a | as_b)

        win_rate = (
            round(total_wins / total_matches * 100, 2) if total_matches > 0 else 0
        )

        rows.append(
            {
                "model": name,
                "wins": total_wins,
                "losses": total_losses,
                "ties": total_ties,
                "total_matches": total_matches,
                "win_rate (%)": win_rate,
            }
        )

    results_df = pl.DataFrame(rows).sort("win_rate (%)", descending=True)
    return results_df, unique_models
|
|
|
|
|
|
|
|
# unique_models is reused throughout the page as a stable axis ordering.
results_df, unique_models = calculate_model_stats(df)

st.subheader("Overall Win Rates")
st.dataframe(results_df.to_pandas(), use_container_width=True)

st.subheader("Head-to-Head Matchups (Wins-Losses-Ties-BothBad)")
|
|
|
|
|
|
|
|
@st.cache_data(ttl=3600)
def calculate_head_to_head(_df, _unique_models):
    """Calculates head-to-head results."""
    table = []
    for left in _unique_models:
        record = {"model": left}
        for right in _unique_models:
            if left == right:
                # No self-matchups; mark the diagonal explicitly.
                record[right] = "N/A"
                continue

            # Battles in each seating orientation.
            forward = _df.filter(
                (pl.col("model_a") == left) & (pl.col("model_b") == right)
            )
            backward = _df.filter(
                (pl.col("model_a") == right) & (pl.col("model_b") == left)
            )

            def tally(forward_label, backward_label):
                # Count one outcome across both orientations.
                return (
                    forward.filter(pl.col("winner") == forward_label).height
                    + backward.filter(pl.col("winner") == backward_label).height
                )

            left_wins = tally("model_a", "model_b")
            right_wins = tally("model_b", "model_a")
            tie_count = tally("tie", "tie")
            bothbad_count = tally("tie (bothbad)", "tie (bothbad)")

            played = left_wins + right_wins + tie_count + bothbad_count
            record[right] = (
                f"{left_wins}-{right_wins}-{tie_count}-{bothbad_count}"
                if played > 0
                else "0-0-0-0"
            )
        table.append(record)
    return pl.DataFrame(table)
|
|
|
|
|
|
|
|
# Pairwise "W-L-T-BB" strings, one column per opponent.
head_to_head_df = calculate_head_to_head(df, unique_models)
st.dataframe(head_to_head_df.to_pandas(), use_container_width=True)

st.subheader("Head-to-Head Heatmaps")
|
|
|
|
|
|
|
|
@st.cache_data(ttl=3600)
def prepare_heatmap_data(_head_to_head_df):
    """Prepares data for heatmaps."""
    # Long format: one row per (model, opponent) carrying the "W-L-T-BB" cell.
    melted_df = _head_to_head_df.unpivot(index=["model"], variable_name="opponent")

    records = []
    for entry in melted_df.iter_rows(named=True):
        model, opponent, value = entry["model"], entry["opponent"], entry["value"]
        if value == "N/A":
            # Diagonal cells (model vs itself) carry no data.
            continue
        try:
            wins, losses, ties, bothbad = (int(part) for part in value.split("-"))
        except (ValueError, IndexError):
            st.warning(
                f"Could not parse head-to-head value: '{value}' for {model} vs {opponent}"
            )
            continue

        # Emit one record per metric so charts can filter on "metric".
        for metric, amount in (
            ("wins", wins),
            ("losses", losses),
            ("ties", ties),
            ("bothbad", bothbad),
        ):
            records.append(
                {
                    "model": model,
                    "opponent": opponent,
                    "metric": metric,
                    "value": amount,
                }
            )

    return pl.DataFrame(records)
|
|
|
|
|
|
|
|
# Long-format per-metric counts feeding the four heatmaps below.
metrics_df = prepare_heatmap_data(head_to_head_df)
|
|
|
|
|
|
|
|
def create_heatmap(data_pd, metric, color_scheme):
    """Helper function to create an Altair heatmap.

    Args:
        data_pd: pandas DataFrame with "model", "opponent" and "value" columns.
        metric: Metric name used in titles (e.g. "wins").
        color_scheme: Vega color scheme name for the cell fill.

    Returns:
        A layered chart (colored cells + value labels), or a text
        placeholder when ``data_pd`` is empty.
    """
    if data_pd.empty:
        # Bug fix: ``pd`` was never imported at module scope, so reaching
        # this branch raised NameError; it now relies on the top-of-file
        # ``import pandas as pd``.
        return alt.Chart(pd.DataFrame({"x": [], "y": [], "value": []})).mark_text(
            text=f"No data for {metric}"
        )

    # Threshold for flipping the label color so text stays readable on
    # dark cells.
    median_value = data_pd["value"].median()

    heatmap = (
        alt.Chart(data_pd)
        .mark_rect()
        .encode(
            x=alt.X(
                "opponent:N",
                title="Opponent",
                sort=unique_models,
                axis=alt.Axis(labelLimit=0, labelAngle=90),
            ),
            y=alt.Y(
                "model:N", title="Model", sort=unique_models
            ),
            color=alt.Color(
                "value:Q",
                scale=alt.Scale(scheme=color_scheme),
                title=f"{metric.capitalize()}",
            ),
            tooltip=["model", "opponent", "value"],
        )
        .properties(
            title=f"{metric.capitalize()}", width=alt.Step(40), height=alt.Step(40)
        )
    )
    # Overlay the numeric value in each cell; white text above the median,
    # black below.
    text = heatmap.mark_text(baseline="middle").encode(
        text="value:Q",
        color=alt.condition(
            f"datum.value > {median_value}", alt.value("white"), alt.value("black")
        ),
        tooltip=["model", "opponent", "value"],
    )
    return heatmap + text
|
|
|
|
|
|
|
|
|
|
|
def _render_metric_heatmap(metric, color_scheme):
    """Filters metrics_df to one metric and renders its heatmap.

    Shows a notice when the metric has no data, and an error message if
    chart construction fails. Replaces four copy-pasted try blocks.
    """
    try:
        metric_pd = metrics_df.filter(pl.col("metric") == metric).to_pandas()
        if not metric_pd.empty:
            st.altair_chart(
                create_heatmap(metric_pd, metric, color_scheme),
                use_container_width=True,
            )
        else:
            st.write(f"No '{metric}' data for heatmap.")
    except Exception as e:
        st.error(f"Error creating {metric} heatmap: {e}")


# Same layout as before: wins/ties in the left column, losses/bothbad in
# the right.
col1, col2 = st.columns(2)
with col1:
    _render_metric_heatmap("wins", "greens")
    _render_metric_heatmap("ties", "purples")
with col2:
    _render_metric_heatmap("losses", "oranges")
    _render_metric_heatmap("bothbad", "reds")
|
|
|
|
|
|
|
|
|
|
|
# --- Per-model outcome breakdowns (one row per model per battle) ---
st.header("Detailed Outcome Analysis")
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data(ttl=3600)
def reshape_data_for_analysis(_df):
    """Reshapes data for outcome analysis per model.

    Stacks the model_a and model_b sides of every battle vertically,
    producing one row per (model, battle) with a per-side "outcome" label
    plus that side's response metadata.
    """

    def safe_struct_field(col_name, field_name):
        # Null-safe struct field access: yields null instead of a value
        # when the struct entry is missing for a row.
        return (
            pl.when(pl.col(col_name).struct.field(field_name).is_not_null())
            .then(pl.col(col_name).struct.field(field_name))
            .otherwise(None)
        )

    def _side_view(side):
        # Project one side ("a" or "b") of each battle. The side wins when
        # "winner" names its column; loses when it names the other side.
        # Replaces two 60-line duplicated selects.
        model_col = f"model_{side}"
        return _df.select(
            [
                pl.col(model_col).alias("model"),
                pl.when(pl.col("winner") == model_col)
                .then(pl.lit("win"))
                .when(pl.col("winner") == "tie")
                .then(pl.lit("tie"))
                .when(pl.col("winner") == "tie (bothbad)")
                .then(pl.lit("tie (bothbad)"))
                .otherwise(pl.lit("loss"))
                .alias("outcome"),
                "language",
                "turn",
                "date",
                safe_struct_field("conv_metadata", f"response_length_{side}").alias(
                    "response_length"
                ),
                safe_struct_field("conv_metadata", f"num_citations_{side}").alias(
                    "num_citations"
                ),
            ]
        )

    df_models = pl.concat([_side_view("a"), _side_view("b")], how="vertical")
    # Missing metadata values default to 0 (same as the original fill_null).
    df_models = df_models.fill_null(0)

    return df_models
|
|
|
|
|
|
|
|
# One row per (model, battle) with outcome + response metadata.
df_models = reshape_data_for_analysis(df)
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data(ttl=3600)
def calculate_rates(_df_models):
    """Calculates various outcome rates per model."""
    # Raw outcome counts per model.
    counts = _df_models.group_by("model").agg(
        wins=pl.col("outcome").eq("win").sum(),
        losses=pl.col("outcome").eq("loss").sum(),
        ties=pl.col("outcome").eq("tie").sum(),
        tie_bothbad=pl.col("outcome").eq("tie (bothbad)").sum(),
    )

    # Drop models with no recorded battles to avoid division by zero.
    counts = counts.with_columns(
        total=pl.sum_horizontal(["wins", "losses", "ties", "tie_bothbad"]),
    ).filter(pl.col("total") > 0)

    # Normalized rates plus a weighted score (full credit for wins, half
    # for ties, a quarter for both-bad ties).
    df_rates = counts.with_columns(
        win_rate=pl.col("wins") / pl.col("total"),
        loss_rate=pl.col("losses") / pl.col("total"),
        tie_rate=pl.col("ties") / pl.col("total"),
        tie_bothbad_rate=pl.col("tie_bothbad") / pl.col("total"),
        weighted_rate=(
            (pl.col("wins") + 0.5 * pl.col("ties") + 0.25 * pl.col("tie_bothbad"))
            / pl.col("total")
        ),
    )
    return df_rates
|
|
|
|
|
|
|
|
df_rates = calculate_rates(df_models)

st.subheader("Outcome Distribution by Model")
try:
    # Canonical outcome ordering shared by several charts below.
    outcome_order = ["win", "loss", "tie", "tie (bothbad)"]
    winner_bar = (
        alt.Chart(df_models.to_pandas())
        .mark_bar()
        .encode(
            x=alt.X(
                "model:N",
                title="Model",
                sort=unique_models,
                axis=alt.Axis(labelLimit=0),
            ),
            y=alt.Y("count():Q", title="Count"),
            color=alt.Color(
                "outcome:N",
                title="Outcome",
                scale=alt.Scale(
                    domain=outcome_order,
                    range=["#1f77b4", "#d62728", "#2ca02c", "#9467bd"],
                ),
                sort=outcome_order,
            ),
            tooltip=["model", "outcome", "count()"],
            # Keeps stacking order aligned with the legend order.
            order=alt.Order("color_outcome_sort_index:Q"),
        )
        .transform_calculate(
            # Plain string (was a placeholder-free f-string with escaped
            # braces): this is a Vega expression, and the braces are literal.
            color_outcome_sort_index="{'win': 0, 'loss': 1, 'tie': 2, 'tie (bothbad)': 3}[datum.outcome]"
        )
    )
    st.altair_chart(winner_bar, use_container_width=True)
except Exception as e:
    st.error(f"Error generating outcome distribution chart: {e}")
|
|
|
|
|
|
|
|
|
|
|
st.subheader("Outcome Rates by Model")
try:
    # Long format: one row per (model, rate_type) for stacking.
    df_rates_long = (
        df_rates.select(
            ["model", "win_rate", "loss_rate", "tie_rate", "tie_bothbad_rate"]
        )
        .unpivot(
            index=["model"],
            variable_name="rate_type",
            value_name="rate_value",
        )
        .with_columns(
            # "win_rate" -> "Win", "tie_bothbad_rate" -> "Tie Bothbad", etc.
            pl.col("rate_type").str.replace("_rate", "").str.to_titlecase()
        )
    )

    rate_order = ["Win", "Loss", "Tie", "Tie Bothbad"]

    stacked_bar = (
        alt.Chart(df_rates_long.to_pandas())
        .mark_bar()
        .encode(
            x=alt.X(
                "model:N",
                title="Model",
                sort=unique_models,
                axis=alt.Axis(labelLimit=0),
            ),
            y=alt.Y(
                "rate_value:Q",
                title="Rate",
                stack="normalize",
                axis=alt.Axis(format="%"),
            ),
            color=alt.Color("rate_type:N", title="Rate Type", sort=rate_order),
            # Keeps stacking order aligned with the legend order.
            order=alt.Order("color_rate_type_sort_index:Q"),
            tooltip=["model", "rate_type", alt.Tooltip("rate_value:Q", format=".1%")],
        )
        .transform_calculate(
            # Plain string (was a placeholder-free f-string with escaped
            # braces): a Vega expression with literal braces.
            color_rate_type_sort_index="{'Win': 0, 'Loss': 1, 'Tie': 2, 'Tie Bothbad': 3}[datum.rate_type]"
        )
    )

    # Dashed orange overlay showing the weighted rate per model.
    weighted_line = (
        alt.Chart(df_rates.to_pandas())
        .mark_line(point=True, color="orange", strokeDash=[5, 5])
        .encode(
            x=alt.X("model:N", title="Model", sort=unique_models),
            y=alt.Y(
                "weighted_rate:Q", title="Weighted Rate", axis=alt.Axis(format=".1%")
            ),
            tooltip=[
                "model",
                alt.Tooltip("weighted_rate:Q", title="Weighted Rate", format=".1%"),
            ],
        )
    )

    rates_chart = (
        (stacked_bar + weighted_line)
        .properties(title="Stacked Outcome Rates by Model (Weighted Rate Overlay)")
        .resolve_scale(y="independent")
    )

    st.altair_chart(rates_chart, use_container_width=True)
except Exception as e:
    st.error(f"Error generating rates chart: {e}")
|
|
|
|
|
|
|
|
|
|
|
st.header("Multilingual Performance")

st.subheader("Language Distribution")
try:
    # Row counts per language in the per-model frame (each battle
    # contributes two rows, one per side).
    lang_freq_df = (
        df_models["language"].value_counts().rename({"count": "total_samples"})
    )

    language_freq_chart = (
        alt.Chart(lang_freq_df.to_pandas())
        .mark_bar()
        .encode(
            x=alt.X("language:N", sort="-y", title="Language"),
            y=alt.Y("total_samples:Q", title="Number of Comparisons"),
            tooltip=["language", "total_samples"],
        )
    )
    st.altair_chart(language_freq_chart, use_container_width=True)
except Exception as e:
    st.error(f"Error generating language frequency chart: {e}")
|
|
|
|
|
|
|
|
|
|
|
st.subheader("Win Rate by Model and Language")
try:
    # Win rate per (model, language) cell.
    win_rate_lang_df = (
        df_models.group_by(["model", "language"])
        .agg(wins=(pl.col("outcome") == "win").sum(), total=pl.len())
        .filter(pl.col("total") > 0)
        .with_columns(win_rate=pl.col("wins") / pl.col("total"))
    )

    win_rate_language_heatmap = (
        alt.Chart(win_rate_lang_df.to_pandas())
        .mark_rect()
        .encode(
            x=alt.X(
                "model:N",
                title="Model",
                sort=unique_models,
                axis=alt.Axis(labelLimit=0),
            ),
            y=alt.Y("language:N", title="Language"),
            color=alt.Color(
                "win_rate:Q",
                title="Win Rate",
                scale=alt.Scale(scheme="blues"),
                legend=alt.Legend(format=".0%"),
            ),
            tooltip=[
                "model",
                "language",
                alt.Tooltip("win_rate:Q", format=".1%"),
                "total",
            ],
        )
    )
    st.altair_chart(win_rate_language_heatmap, use_container_width=True)
except Exception as e:
    st.error(f"Error generating win rate by language heatmap: {e}")
|
|
|
|
|
|
|
|
|
|
|
st.subheader("Outcome Distribution by Language and Model")
try:

    @st.cache_data(ttl=3600)
    def prepare_data_for_language_facet(_df_models):
        """Attaches each language's total row count so facets can be labeled."""
        df_language_totals = _df_models.group_by("language").agg(total_samples=pl.len())
        df_models_with_totals = _df_models.join(
            df_language_totals, on="language", how="left"
        )
        return df_models_with_totals

    df_models_for_facet = prepare_data_for_language_facet(df_models)

    df_models_pd = df_models_for_facet.to_pandas()

    # Stacked outcome counts per model, faceted by language below.
    bar_chart = (
        alt.Chart(df_models_pd)
        .mark_bar()
        .encode(
            x=alt.X(
                "model:N",
                title="Model",
                axis=alt.Axis(labels=True, labelLimit=0),
            ),
            y=alt.Y("count():Q", title="Count", stack="zero"),
            color=alt.Color(
                "outcome:N",
                title="Outcome",
                sort=outcome_order,
                scale=alt.Scale(
                    domain=outcome_order,
                    range=[
                        "green",
                        "orange",
                        "lightblue",
                        "red",
                    ],
                ),
            ),
            tooltip=[
                "model",
                "language",
                "outcome",
                "count()",
                alt.Tooltip("total_samples:Q", title="Total Samples in Language"),
            ],
            # Keeps stacking order aligned with the legend order.
            order=alt.Order("color_outcome_sort_index:Q"),
        )
        .transform_calculate(
            # Plain string (was a placeholder-free f-string with escaped
            # braces): a Vega expression with literal braces.
            color_outcome_sort_index="{'win': 0, 'loss': 1, 'tie': 2, 'tie (bothbad)': 3}[datum.outcome]"
        )
    )

    # Per-facet total-sample label pinned to the top-left of each panel.
    text_chart = (
        alt.Chart(df_models_pd)
        .mark_text(
            align="left", baseline="middle", dx=5, dy=-5, color="black", fontSize=10
        )
        .encode(
            x=alt.value(5),
            y=alt.value(15),
            text=alt.Text("total_samples:Q", format=",d"),
            color=alt.value("black"),
            tooltip=[alt.Tooltip("total_samples:Q", title="Total Samples in Language")],
        )
        .transform_aggregate(
            total_samples="max(total_samples)",
            groupby=["language"],
        )
    )

    language_outcome_chart = (
        (bar_chart + text_chart)
        .facet(
            facet=alt.Facet(
                "language:N",
                title="Language",
                header=alt.Header(titleOrient="top", labelOrient="top"),
            ),
            columns=5,
        )
        .resolve_scale(
            y="independent"
        )
    )

    st.altair_chart(language_outcome_chart, use_container_width=True)

except Exception as e:
    # `traceback` is imported at module scope; the redundant local import
    # was removed.
    st.error(
        f"Error generating faceted outcome distribution by language: {e}\n{traceback.format_exc()}"
    )
|
|
|
|
|
|
|
|
|
|
|
st.header("Performance Dynamics")

st.subheader("Win Rate by Turn")
try:
    # Win rate per (model, conversation turn) cell.
    win_rate_turn_df = (
        df_models.group_by(["model", "turn"])
        .agg(wins=(pl.col("outcome") == "win").sum(), total=pl.len())
        .filter(pl.col("total") > 0)
        .with_columns(win_rate=pl.col("wins") / pl.col("total"))
    )
    win_rate_turn_heatmap = (
        alt.Chart(win_rate_turn_df.to_pandas())
        .mark_rect()
        .encode(
            x=alt.X("turn:O", title="Turn"),
            y=alt.Y(
                "model:N",
                title="Model",
                sort=unique_models,
                axis=alt.Axis(labelLimit=0),
            ),
            color=alt.Color(
                "win_rate:Q",
                title="Win Rate",
                scale=alt.Scale(scheme="blues"),
                legend=alt.Legend(format=".0%"),
            ),
            tooltip=[
                "model",
                "turn",
                alt.Tooltip("win_rate:Q", format=".1%"),
                "total",
            ],
        )
        .properties(title="Win Rate by Model and Turn")
    )
    st.altair_chart(win_rate_turn_heatmap, use_container_width=True)
except Exception as e:
    st.error(f"Error generating win rate by turn heatmap: {e}")
|
|
|
|
|
|
|
|
st.subheader("Wins Over Time")
try:
    # Daily win counts per model.
    wins_time_df = (
        df_models.filter(pl.col("outcome") == "win")
        .group_by(["date", "model"])
        .agg(win_count=pl.len())
        .sort("date")
    )
    time_line = (
        alt.Chart(wins_time_df.to_pandas())
        .mark_line(point=True)
        .encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("win_count:Q", title="Daily Win Count"),
            color=alt.Color("model:N", title="Model"),
            tooltip=["model", "date", "win_count"],
        )
        .properties(title="Wins by Model Over Time")
    )
    st.altair_chart(time_line, use_container_width=True)
except Exception as e:
    st.error(f"Error generating wins over time chart: {e}")
|
|
|
|
|
|
|
|
|
|
|
st.header("Response Characteristics vs. Outcome")

st.subheader(
    "Response Length Distribution by Outcome (Box Plot - Adjusted Scale, No Outliers)"
)
try:
    # Keep only rows with a usable (non-null, non-negative) response length.
    response_length_df = df_models.filter(
        pl.col("response_length").is_not_null().and_(pl.col("response_length") >= 0)
    )

    response_length_pd = response_length_df.to_pandas()

    # Cap the y-axis at the 99th percentile (at least 100) so extreme
    # outliers don't flatten the boxes; fall back to 1000 with no data.
    valid_lengths = response_length_pd["response_length"].dropna()
    valid_lengths = valid_lengths[valid_lengths >= 0]
    if not valid_lengths.empty:
        upper_limit = valid_lengths.quantile(0.99)
        upper_limit = max(upper_limit, 100)
    else:
        upper_limit = 1000

    st.write(
        f"(Note: Y-axis for Response Length capped at ~{int(upper_limit)} [99th percentile] to improve visibility)"
    )

    response_facet_boxplot = (
        alt.Chart(response_length_pd)
        .mark_boxplot(
            extent="min-max",
            outliers=False,
        )
        .encode(
            x=alt.X(
                "model:N",
                title="Model",
                sort=unique_models,
                axis=alt.Axis(labelLimit=0),
            ),
            y=alt.Y(
                "response_length:Q",
                title="Response Length",
                scale=alt.Scale(domain=[0, upper_limit], clamp=True),
            ),
            color=alt.Color(
                "model:N", title="Model", legend=None
            ),
        )
        .facet(
            column=alt.Column("outcome:N", title="Outcome", sort=outcome_order)
        )
        .properties(
            title="Response Length Distribution by Model per Outcome (Box Plot, Adjusted Scale, No Outliers)"
        )
    )
    st.altair_chart(response_facet_boxplot, use_container_width=True)

except Exception as e:
    # `traceback` is imported at module scope; the redundant local import
    # was removed.
    st.error(f"Error generating response length boxplot: {e}\n{traceback.format_exc()}")
|
|
|
|
|
|
|
|
|
|
|
st.subheader("Number of Citations by Outcome")
try:
    # Rows with a usable (non-null, non-negative) citation count.
    citation_df = df_models.filter(
        pl.col("num_citations").is_not_null().and_(pl.col("num_citations") >= 0)
    )

    # Aggregate to (model, outcome, num_citations) counts so circle size
    # reflects how many responses share a citation count.
    citation_agg = citation_df.group_by(["model", "outcome", "num_citations"]).agg(
        count=pl.len()
    )

    citation_agg_pd = citation_agg.to_pandas()

    citation_facet = (
        alt.Chart(citation_agg_pd)
        .mark_circle()
        .encode(
            x=alt.X(
                "model:N",
                title="Model",
                sort=unique_models,
                axis=alt.Axis(labelLimit=0),
            ),
            y=alt.Y(
                "num_citations:Q",
                title="Number of Citations",
                axis=alt.Axis(tickMinStep=1),
            ),
            size=alt.Size(
                "count:Q",
                title="Number of Responses",
                legend=alt.Legend(
                    symbolFillColor="lightblue",
                    symbolStrokeColor="lightblue",
                ),
            ),
            color=alt.Color(
                "model:N", title="Model", legend=None
            ),
            tooltip=["model", "outcome", "num_citations", "count"],
        )
        .facet(column=alt.Column("outcome:N", title="Outcome", sort=outcome_order))
        .properties(title="Citations by Model per Outcome")
    )
    st.altair_chart(citation_facet, use_container_width=True)
except Exception as e:
    # `traceback` is imported at module scope; the redundant local import
    # was removed.
    st.error(f"Error generating citation facet plot: {e}\n{traceback.format_exc()}")
|
|
|
|
|
|
|
|
|
|
|
st.header("Top/Worst Models per Language")


@st.cache_data(ttl=3600)
def calculate_language_ranks(_df_models):
    """Calculates top/worst models based on win rate per language.

    Returns a table with the two best and two worst models per language
    (by win rate), or an empty DataFrame when no pair clears the sample
    threshold.
    """
    # NOTE(review): `total > 5` keeps only (language, model) pairs with at
    # least 6 comparisons, while the UI copy says "Min 5 Comparisons" —
    # confirm which threshold is intended.
    df_win_rates = (
        _df_models.group_by(["language", "model"])
        .agg(wins=pl.col("outcome").eq("win").sum(), total=pl.len())
        .filter(pl.col("total") > 5)
        .with_columns(win_rate=pl.col("wins") / pl.col("total"))
    )

    if df_win_rates.is_empty():
        return pl.DataFrame()

    # Rank within each language: rank 1 is best for rank_top and worst for
    # rank_worst; method="min" gives ties the same rank.
    df_ranked = df_win_rates.with_columns(
        rank_top=pl.col("win_rate")
        .rank(method="min", descending=True)
        .over("language"),
        rank_worst=pl.col("win_rate")
        .rank(method="min", descending=False)
        .over("language"),
    )

    # When several models tie at a rank, .first() picks one of them.
    df_top_1 = (
        df_ranked.filter(pl.col("rank_top") == 1)
        .group_by("language")
        .agg(
            top_model_1=pl.col("model").first(),
            win_rate_1=pl.col("win_rate").first(),
            total_samples=pl.col("total").sum(),
        )
    )
    df_top_2 = (
        df_ranked.filter(pl.col("rank_top") == 2)
        .group_by("language")
        .agg(top_model_2=pl.col("model").first(), win_rate_2=pl.col("win_rate").first())
    )
    df_worst_1 = (
        df_ranked.filter(pl.col("rank_worst") == 1)
        .group_by("language")
        .agg(
            worst_model_1=pl.col("model").first(),
            worst_win_rate_1=pl.col("win_rate").first(),
        )
    )
    df_worst_2 = (
        df_ranked.filter(pl.col("rank_worst") == 2)
        .group_by("language")
        .agg(
            worst_model_2=pl.col("model").first(),
            worst_win_rate_2=pl.col("win_rate").first(),
        )
    )

    # Left joins because rank-2 / worst entries can be absent for languages
    # with few qualifying models; missing cells become "N/A" below.
    df_table = (
        df_top_1.join(df_top_2, on="language", how="left")
        .join(df_worst_1, on="language", how="left")
        .join(df_worst_2, on="language", how="left")
        .select(
            [
                "language",
                "total_samples",
                "top_model_1",
                pl.col("win_rate_1").round(3).alias("WR #1"),
                "top_model_2",
                pl.col("win_rate_2").round(3).alias("WR #2"),
                "worst_model_1",
                pl.col("worst_win_rate_1").round(3).alias("WR Worst #1"),
                "worst_model_2",
                pl.col("worst_win_rate_2").round(3).alias("WR Worst #2"),
            ]
        )
        .sort("language")
        .fill_null("N/A")
    )
    return df_table
|
|
|
|
|
|
|
|
df_language_table = calculate_language_ranks(df_models)

# NOTE(review): the "Min 5" wording below does not match the `total > 5`
# filter in calculate_language_ranks (which requires at least 6) — verify.
if not df_language_table.is_empty():
    st.subheader("Top & Bottom 2 Models by Win Rate per Language (Min 5 Comparisons)")
    st.dataframe(df_language_table.to_pandas(), use_container_width=True)
else:
    st.subheader("Top & Bottom 2 Models by Win Rate per Language")
    st.write(
        "Insufficient data (fewer than 5 comparisons) for one or more languages to generate rankings."
    )
|
|
|
|
|
|
|
|
|
|
|
if not df_language_table.is_empty():
    st.subheader("How Often Models Rank Top/Worst Across Languages")
    try:
        # How many languages each model tops (rank #1) or bottoms.
        df_top_1_counts = (
            df_language_table["top_model_1"]
            .value_counts()
            .rename({"top_model_1": "model", "count": "Rank 1 Count"})
            .sort("Rank 1 Count", descending=True)
        )
        df_worst_1_counts = (
            df_language_table["worst_model_1"]
            .value_counts()
            .rename({"worst_model_1": "model", "count": "Worst Rank Count"})
            .sort("Worst Rank Count", descending=True)
        )

        col1, col2 = st.columns(2)
        with col1:
            st.write("**Times Ranked #1**")
            st.dataframe(df_top_1_counts.to_pandas(), use_container_width=True)
        with col2:
            st.write("**Times Ranked Worst**")
            st.dataframe(df_worst_1_counts.to_pandas(), use_container_width=True)
    except Exception as e:
        st.error(f"Error generating model ranking counts: {e}")
|
|
|
|
|
|
|
|
|
|
|
# --- Footer ---
st.markdown("---")
st.markdown(
    "Analysis based on the `lmarena-ai/search-arena-v1-7k` dataset on Hugging Face."
)
|
|
|