import streamlit as st
import polars as pl
import pandas as pd  # Altair charts below receive pandas DataFrames
import altair as alt
from huggingface_hub import hf_hub_download
import traceback
# --- Configuration and Setup ---
st.set_page_config(layout="wide")
alt.data_transformers.enable(
"default"
) # Use default, vegafusion might need server extension
# alt.data_transformers.enable("vegafusion") # Optional: if vegafusion works in your env
st.title("Search Arena V1 Dataset Analysis")
# --- Hugging Face Authentication & Data Loading ---
@st.cache_data(ttl=3600) # Cache data for 1 hour
def load_data():
"""Loads the dataset from Hugging Face Hub."""
try:
repo_id = "lmarena-ai/search-arena-v1-7k"
filename = "data/search-arena-v1-preference-7k.parquet"
# Download the parquet file; hf_hub_download picks up the HF_TOKEN secret from the environment if one is set
# hf_hub_download handles caching locally within the Space's container
local_path = hf_hub_download(
repo_id=repo_id, filename=filename, repo_type="dataset"
)
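# If the repo ever required explicit authentication, the token could also be passed
# directly (sketch only; assumes the HF_TOKEN secret is exposed as an environment
# variable and that `import os` is added at the top):
#   local_path = hf_hub_download(
#       repo_id=repo_id, filename=filename, repo_type="dataset",
#       token=os.environ.get("HF_TOKEN"),
#   )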
df = pl.read_parquet(local_path)
df = df.with_columns(pl.col("timestamp").dt.date().alias("date"))
return df
except Exception as e:
st.error(f"Error loading data: {e}")
st.warning(
"Ensure you have added your Hugging Face token (with read access) as a secret named 'HF_TOKEN' in your Space settings."
)
return None
df = load_data()
if df is None:
st.stop() # Stop execution if data loading failed
# --- Data Processing and Visualization ---
st.header("Dataset Overview")
# **Date Histogram**
st.subheader("Data Collection Timeline")
try:
date_counts = df.group_by("date").agg(pl.len().alias("count")).sort("date")
chart_dates = (
alt.Chart(date_counts.to_pandas()) # Convert to Pandas for Altair/Streamlit
.mark_bar()
.encode(
x=alt.X(
"date:T", axis=alt.Axis(labelAngle=0, title="Date")
), # Adjusted angle
y=alt.Y("count:Q", axis=alt.Axis(title="Frequency")),
tooltip=["date:T", "count:Q"],
)
.properties(title="Histogram of Dates")
)
st.altair_chart(chart_dates, use_container_width=True)
except Exception as e:
st.error(f"Error generating date histogram: {e}")
# --- Model Performance Analysis ---
st.header("Model Performance")
# **Calculate Model Statistics**
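# Note: the "winner" column is assumed to take one of "model_a", "model_b",
# "tie", or "tie (bothbad)"; the win/loss/tie counts below depend on these exact labels.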
@st.cache_data(ttl=3600)
def calculate_model_stats(_df):
"""Calculates wins, losses, ties, and win rate for each model."""
unique_models = (
pl.concat([_df["model_a"], _df["model_b"]]).unique().sort().to_list()
)
results = []
for model in unique_models:
# Wins
wins_as_a = _df.filter(
(pl.col("model_a") == model) & (pl.col("winner") == "model_a")
).height
wins_as_b = _df.filter(
(pl.col("model_b") == model) & (pl.col("winner") == "model_b")
).height
total_wins = wins_as_a + wins_as_b
# Losses
losses_as_a = _df.filter(
(pl.col("model_a") == model) & (pl.col("winner") == "model_b")
).height
losses_as_b = _df.filter(
(pl.col("model_b") == model) & (pl.col("winner") == "model_a")
).height
total_losses = losses_as_a + losses_as_b
# Ties (including bothbad)
ties_as_a = _df.filter(
(pl.col("model_a") == model) & (pl.col("winner").str.contains("tie"))
).height
ties_as_b = _df.filter(
(pl.col("model_b") == model) & (pl.col("winner").str.contains("tie"))
).height
total_ties = ties_as_a + ties_as_b
# Total Matches
total_matches = _df.filter(
(pl.col("model_a") == model) | (pl.col("model_b") == model)
).height
# Win Rate
win_rate = (
round(total_wins / total_matches * 100, 2) if total_matches > 0 else 0
)
results.append(
{
"model": model,
"wins": total_wins,
"losses": total_losses,
"ties": total_ties,
"total_matches": total_matches,
"win_rate (%)": win_rate,
}
)
results_df = pl.DataFrame(results).sort("win_rate (%)", descending=True)
return results_df, unique_models
results_df, unique_models = calculate_model_stats(df)
st.subheader("Overall Win Rates")
st.dataframe(results_df.to_pandas(), use_container_width=True) # Display as dataframe
# **Head-to-Head Analysis**
st.subheader("Head-to-Head Matchups (Wins-Losses-Ties-BothBad)")
@st.cache_data(ttl=3600)
def calculate_head_to_head(_df, _unique_models):
"""Calculates head-to-head results."""
head_to_head = []
for model_1 in _unique_models:
row = {"model": model_1}
for model_2 in _unique_models:
if model_1 == model_2:
row[model_2] = "N/A"
continue
matches_ab = _df.filter(
(pl.col("model_a") == model_1) & (pl.col("model_b") == model_2)
)
matches_ba = _df.filter(
(pl.col("model_a") == model_2) & (pl.col("model_b") == model_1)
)
wins_1 = (
matches_ab.filter(pl.col("winner") == "model_a").height
+ matches_ba.filter(pl.col("winner") == "model_b").height
)
wins_2 = (
matches_ab.filter(pl.col("winner") == "model_b").height
+ matches_ba.filter(pl.col("winner") == "model_a").height
)
ties = (
matches_ab.filter(pl.col("winner") == "tie").height
+ matches_ba.filter(pl.col("winner") == "tie").height
)
bothbad = (
matches_ab.filter(pl.col("winner") == "tie (bothbad)").height
+ matches_ba.filter(pl.col("winner") == "tie (bothbad)").height
)
total = wins_1 + wins_2 + ties + bothbad
row[model_2] = (
f"{wins_1}-{wins_2}-{ties}-{bothbad}" if total > 0 else "0-0-0-0"
)
head_to_head.append(row)
return pl.DataFrame(head_to_head)
head_to_head_df = calculate_head_to_head(df, unique_models)
st.dataframe(head_to_head_df.to_pandas(), use_container_width=True)
# **Heatmaps**
st.subheader("Head-to-Head Heatmaps")
@st.cache_data(ttl=3600)
def prepare_heatmap_data(_head_to_head_df):
"""Prepares data for heatmaps."""
melted_df = _head_to_head_df.unpivot(index=["model"], variable_name="opponent")
parsed_data = []
for row in melted_df.iter_rows(named=True):
model, opponent, value = row["model"], row["opponent"], row["value"]
if value != "N/A":
try:
parts = value.split("-")
wins, losses, ties, bothbad = map(int, parts)
parsed_data.extend(
[
{
"model": model,
"opponent": opponent,
"metric": "wins",
"value": wins,
},
{
"model": model,
"opponent": opponent,
"metric": "losses",
"value": losses,
},
{
"model": model,
"opponent": opponent,
"metric": "ties",
"value": ties,
},
{
"model": model,
"opponent": opponent,
"metric": "bothbad",
"value": bothbad,
},
]
)
except (ValueError, IndexError):
st.warning(
f"Could not parse head-to-head value: '{value}' for {model} vs {opponent}"
)
continue # Skip malformed entries
return pl.DataFrame(parsed_data)
metrics_df = prepare_heatmap_data(head_to_head_df)
def create_heatmap(data_pd, metric, color_scheme):
"""Helper function to create an Altair heatmap."""
if data_pd.empty:
return alt.Chart(pd.DataFrame({"x": [], "y": [], "value": []})).mark_text(
text=f"No data for {metric}"
)
median_value = data_pd["value"].median()
heatmap = (
alt.Chart(data_pd)
.mark_rect()
.encode(
x=alt.X(
"opponent:N",
title="Opponent",
sort=unique_models,
axis=alt.Axis(labelLimit=0, labelAngle=90),
), # Ensure consistent sorting
y=alt.Y(
"model:N", title="Model", sort=unique_models
), # Ensure consistent sorting
color=alt.Color(
"value:Q",
scale=alt.Scale(scheme=color_scheme),
title=f"{metric.capitalize()}",
),
tooltip=["model", "opponent", "value"],
)
.properties(
title=f"{metric.capitalize()}", width=alt.Step(40), height=alt.Step(40)
) # Adjust step for size
)
text = heatmap.mark_text(baseline="middle").encode(
text="value:Q",
color=alt.condition(
f"datum.value > {median_value}", alt.value("white"), alt.value("black")
),
tooltip=["model", "opponent", "value"], # Ensure tooltip is on text too
)
return heatmap + text
# Filter and convert data for each heatmap within Streamlit rendering flow
col1, col2 = st.columns(2)
with col1:
try:
wins_pd = metrics_df.filter(pl.col("metric") == "wins").to_pandas()
if not wins_pd.empty:
wins_heatmap = create_heatmap(wins_pd, "wins", "greens")
st.altair_chart(wins_heatmap, use_container_width=True)
else:
st.write("No 'wins' data for heatmap.")
except Exception as e:
st.error(f"Error creating wins heatmap: {e}")
try:
ties_pd = metrics_df.filter(pl.col("metric") == "ties").to_pandas()
if not ties_pd.empty:
ties_heatmap = create_heatmap(ties_pd, "ties", "purples")
st.altair_chart(ties_heatmap, use_container_width=True)
else:
st.write("No 'ties' data for heatmap.")
except Exception as e:
st.error(f"Error creating ties heatmap: {e}")
with col2:
try:
losses_pd = metrics_df.filter(pl.col("metric") == "losses").to_pandas()
if not losses_pd.empty:
losses_heatmap = create_heatmap(losses_pd, "losses", "oranges")
st.altair_chart(losses_heatmap, use_container_width=True)
else:
st.write("No 'losses' data for heatmap.")
except Exception as e:
st.error(f"Error creating losses heatmap: {e}")
try:
bothbad_pd = metrics_df.filter(pl.col("metric") == "bothbad").to_pandas()
if not bothbad_pd.empty:
bothbad_heatmap = create_heatmap(bothbad_pd, "bothbad", "reds")
st.altair_chart(bothbad_heatmap, use_container_width=True)
else:
st.write("No 'bothbad' data for heatmap.")
except Exception as e:
st.error(f"Error creating bothbad heatmap: {e}")
# --- Detailed Outcome Analysis ---
st.header("Detailed Outcome Analysis")
# **Reshape Data for Detailed Analysis**
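# Each battle is split into two rows (one per participating model), carrying that
# model's outcome plus its own response_length and num_citations from conv_metadata.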
@st.cache_data(ttl=3600)
def reshape_data_for_analysis(_df):
"""Reshapes data for outcome analysis per model."""
# Helper to safely extract struct fields, returning None if field doesn't exist or is null
def safe_struct_field(col_name, field_name):
return (
pl.when(pl.col(col_name).struct.field(field_name).is_not_null())
.then(pl.col(col_name).struct.field(field_name))
.otherwise(None)
) # Or pl.lit(0) or appropriate default
df_model_a = _df.select(
[
pl.col("model_a").alias("model"),
pl.when(pl.col("winner") == "model_a")
.then(pl.lit("win"))
.when(pl.col("winner") == "tie")
.then(pl.lit("tie"))
.when(pl.col("winner") == "tie (bothbad)")
.then(pl.lit("tie (bothbad)"))
.otherwise(pl.lit("loss"))
.alias("outcome"),
"language",
"turn",
"date",
safe_struct_field("conv_metadata", "response_length_a").alias(
"response_length"
),
safe_struct_field("conv_metadata", "num_citations_a").alias(
"num_citations"
),
]
)
df_model_b = _df.select(
[
pl.col("model_b").alias("model"),
pl.when(pl.col("winner") == "model_b")
.then(pl.lit("win"))
.when(pl.col("winner") == "tie")
.then(pl.lit("tie"))
.when(pl.col("winner") == "tie (bothbad)")
.then(pl.lit("tie (bothbad)"))
.otherwise(pl.lit("loss"))
.alias("outcome"),
"language",
"turn",
"date",
safe_struct_field("conv_metadata", "response_length_b").alias(
"response_length"
),
safe_struct_field("conv_metadata", "num_citations_b").alias(
"num_citations"
),
]
)
# Combine the per-model views and fill missing metadata values
df_models = pl.concat([df_model_a, df_model_b], how="vertical")
df_models = df_models.fill_null(0)  # e.g. absent response_length / num_citations become 0
return df_models
df_models = reshape_data_for_analysis(df)
# **Calculate Rates**
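# weighted_rate scores each match as 1 for a win, 0.5 for a tie, and 0.25 for a
# "tie (bothbad)", normalized by the model's total number of matches.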
@st.cache_data(ttl=3600)
def calculate_rates(_df_models):
"""Calculates various outcome rates per model."""
df_rates = (
_df_models.group_by("model")
.agg(
wins=pl.col("outcome").eq("win").sum(),
losses=pl.col("outcome").eq("loss").sum(),
ties=pl.col("outcome").eq("tie").sum(),
tie_bothbad=pl.col("outcome").eq("tie (bothbad)").sum(),
)
.with_columns(
total=pl.sum_horizontal(["wins", "losses", "ties", "tie_bothbad"]),
)
.filter(pl.col("total") > 0) # Avoid division by zero
.with_columns(
win_rate=pl.col("wins") / pl.col("total"),
loss_rate=pl.col("losses") / pl.col("total"),
tie_rate=pl.col("ties") / pl.col("total"),
tie_bothbad_rate=pl.col("tie_bothbad") / pl.col("total"),
weighted_rate=(
(pl.col("wins") + 0.5 * pl.col("ties") + 0.25 * pl.col("tie_bothbad"))
/ pl.col("total")
),
)
)
return df_rates
df_rates = calculate_rates(df_models)
# **Outcome Distribution Chart**
st.subheader("Outcome Distribution by Model")
outcome_order = ["win", "loss", "tie", "tie (bothbad)"]  # Reused by later facet/sort encodings
try:
winner_bar = (
alt.Chart(df_models.to_pandas())
.mark_bar()
.encode(
x=alt.X(
"model:N",
title="Model",
sort=unique_models,
axis=alt.Axis(labelLimit=0),
),
y=alt.Y("count():Q", title="Count"),
color=alt.Color(
"outcome:N",
title="Outcome",
scale=alt.Scale(
domain=outcome_order,
range=["#1f77b4", "#d62728", "#2ca02c", "#9467bd"],
),
sort=outcome_order,
), # Explicit colors/order
tooltip=["model", "outcome", "count()"],
order=alt.Order(
"color_outcome_sort_index:Q"
), # Ensure stack order matches legend
)
.transform_calculate(
# Create a field for sorting based on the domain order
color_outcome_sort_index=f"{{'win': 0, 'loss': 1, 'tie': 2, 'tie (bothbad)': 3}}[datum.outcome]"
)
)
st.altair_chart(winner_bar, use_container_width=True)
except Exception as e:
st.error(f"Error generating outcome distribution chart: {e}")
# **Rates Chart**
st.subheader("Outcome Rates by Model")
try:
df_rates_long = (
df_rates.select(
["model", "win_rate", "loss_rate", "tie_rate", "tie_bothbad_rate"]
)
.unpivot(
index=["model"],
variable_name="rate_type",
value_name="rate_value",
)
.with_columns(
# Clean up rate type names for display (e.g. "tie_bothbad_rate" -> "Tie Bothbad")
pl.col("rate_type")
.str.replace("_rate", "")
.str.replace("_", " ")
.str.to_titlecase()
)
)
rate_order = ["Win", "Loss", "Tie", "Tie Bothbad"] # Order for stacking and legend
stacked_bar = (
alt.Chart(df_rates_long.to_pandas())
.mark_bar()
.encode(
x=alt.X(
"model:N",
title="Model",
sort=unique_models,
axis=alt.Axis(labelLimit=0),
),
y=alt.Y(
"rate_value:Q",
title="Rate",
stack="normalize",
axis=alt.Axis(format="%"),
), # Normalize stack
color=alt.Color("rate_type:N", title="Rate Type", sort=rate_order),
order=alt.Order(
"color_rate_type_sort_index:Q"
), # Use calculation for stack order
tooltip=["model", "rate_type", alt.Tooltip("rate_value:Q", format=".1%")],
)
.transform_calculate(
# Create a field for sorting based on the domain order
color_rate_type_sort_index=f"{{'Win': 0, 'Loss': 1, 'Tie': 2, 'Tie Bothbad': 3}}[datum.rate_type]"
)
)
weighted_line = (
alt.Chart(df_rates.to_pandas())
.mark_line(point=True, color="orange", strokeDash=[5, 5]) # Dashed line
.encode(
x=alt.X("model:N", title="Model", sort=unique_models),
y=alt.Y(
"weighted_rate:Q", title="Weighted Rate", axis=alt.Axis(format=".1%")
),
tooltip=[
"model",
alt.Tooltip("weighted_rate:Q", title="Weighted Rate", format=".1%"),
],
)
)
rates_chart = (
(stacked_bar + weighted_line)
.properties(title="Stacked Outcome Rates by Model (Weighted Rate Overlay)")
.resolve_scale(y="independent")
) # Independent Y-axis for line vs bars
st.altair_chart(rates_chart, use_container_width=True)
except Exception as e:
st.error(f"Error generating rates chart: {e}")
# --- Multilingual Performance ---
st.header("Multilingual Performance")
# **Language Frequency**
st.subheader("Language Distribution")
try:
# Calculate language frequency from df_models which has one row per model appearance
lang_freq_df = (
df_models["language"].value_counts().rename({"count": "total_samples"})
)
language_freq_chart = (
alt.Chart(lang_freq_df.to_pandas())
.mark_bar()
.encode(
x=alt.X("language:N", sort="-y", title="Language"),
y=alt.Y("total_samples:Q", title="Number of Comparisons"),
tooltip=["language", "total_samples"],
)
)
st.altair_chart(language_freq_chart, use_container_width=True)
except Exception as e:
st.error(f"Error generating language frequency chart: {e}")
# **Win Rate by Language Heatmap**
st.subheader("Win Rate by Model and Language")
try:
win_rate_lang_df = (
df_models.group_by(["model", "language"])
.agg(wins=(pl.col("outcome") == "win").sum(), total=pl.len())
.filter(pl.col("total") > 0) # Avoid division by zero
.with_columns(win_rate=pl.col("wins") / pl.col("total"))
)
win_rate_language_heatmap = (
alt.Chart(win_rate_lang_df.to_pandas())
.mark_rect()
.encode(
x=alt.X(
"model:N",
title="Model",
sort=unique_models,
axis=alt.Axis(labelLimit=0),
),
y=alt.Y("language:N", title="Language"),
color=alt.Color(
"win_rate:Q",
title="Win Rate",
scale=alt.Scale(scheme="blues"),
legend=alt.Legend(format=".0%"),
),
tooltip=[
"model",
"language",
alt.Tooltip("win_rate:Q", format=".1%"),
"total",
],
)
)
st.altair_chart(win_rate_language_heatmap, use_container_width=True)
except Exception as e:
st.error(f"Error generating win rate by language heatmap: {e}")
# **Outcome Distribution by Language and Model (Faceted & Wrapped)**
st.subheader("Outcome Distribution by Language and Model")
try:
# --- Data Prep for Annotation ---
@st.cache_data(ttl=3600)
def prepare_data_for_language_facet(_df_models):
df_language_totals = _df_models.group_by("language").agg(total_samples=pl.len())
df_models_with_totals = _df_models.join(
df_language_totals, on="language", how="left"
)
return df_models_with_totals
df_models_for_facet = prepare_data_for_language_facet(df_models)
# Convert to Pandas for Altair
df_models_pd = df_models_for_facet.to_pandas()
# --- Base Bar Chart (using the full dataset) ---
bar_chart = (
alt.Chart(df_models_pd) # Use the common DataFrame
.mark_bar()
.encode(
x=alt.X(
"model:N",
title="Model",
axis=alt.Axis(labels=True, labelLimit=0),
), # Show model labels without truncation
# --- Choose one Y encoding ---
# For absolute counts (like example):
y=alt.Y("count():Q", title="Count", stack="zero"),
# For normalized bars:
# y=alt.Y("count():Q", title="%", stack="normalize", axis=alt.Axis(format="%")),
# -----------------------------
color=alt.Color(
"outcome:N",
title="Outcome",
sort=outcome_order, # Use the existing outcome_order list
scale=alt.Scale(
domain=outcome_order,
range=[
"green",
"orange",
"lightblue",
"red",
],
),
),
tooltip=[
"model",
"language",
"outcome",
"count()",
alt.Tooltip("total_samples:Q", title="Total Samples in Language"),
],
order=alt.Order( # Important for consistent stacking, esp. if normalized
"color_outcome_sort_index:Q"
),
)
.transform_calculate(
# Field for sorting stack order
color_outcome_sort_index=f"{{'win': 0, 'loss': 1, 'tie': 2, 'tie (bothbad)': 3}}[datum.outcome]"
)
)
# --- Text Annotation (using the full dataset with transform_aggregate) ---
text_chart = (
alt.Chart(df_models_pd) # Use the *same* common DataFrame
.mark_text(
align="left", baseline="middle", dx=5, dy=-5, color="black", fontSize=10
) # Adjusted position/appearance
.encode(
# Position text relative to the top-left corner of the facet using values
x=alt.value(5), # Small offset from the left edge
y=alt.value(15), # Small offset from the top edge
text=alt.Text("total_samples:Q", format=",d"),
color=alt.value("black"), # Explicit text color
# Tooltip for the text itself (optional)
tooltip=[alt.Tooltip("total_samples:Q", title="Total Samples in Language")],
)
.transform_aggregate(
# Aggregate within Altair to get max total_samples per language
# Note: total_samples is constant per language, so max() is just a way to get it once
total_samples="max(total_samples)",
groupby=["language"], # Group by the faceting variable
)
)
# --- Layer the charts, then facet ---
language_outcome_chart = (
# Layer first using '+'
(bar_chart + text_chart)
.facet(
# Then facet the layered chart
facet=alt.Facet(
"language:N",
title="Language",
header=alt.Header(titleOrient="top", labelOrient="top"),
),
columns=5, # Wrap facets into 5 columns
)
.resolve_scale(
y="independent" # Resolve Y scale because text has different positioning logic
)
)
st.altair_chart(language_outcome_chart, use_container_width=True)
except Exception as e:
st.error(
f"Error generating faceted outcome distribution by language: {e}\n{traceback.format_exc()}"
)
# --- Performance Over Turns and Time ---
st.header("Performance Dynamics")
# **Win Rate by Turn Heatmap**
st.subheader("Win Rate by Turn")
try:
win_rate_turn_df = (
df_models.group_by(["model", "turn"])
.agg(wins=(pl.col("outcome") == "win").sum(), total=pl.len())
.filter(pl.col("total") > 0)
.with_columns(win_rate=pl.col("wins") / pl.col("total"))
)
win_rate_turn_heatmap = (
alt.Chart(win_rate_turn_df.to_pandas())
.mark_rect()
.encode(
x=alt.X("turn:O", title="Turn"), # Treat turn as ordinal
y=alt.Y(
"model:N",
title="Model",
sort=unique_models,
axis=alt.Axis(labelLimit=0),
),
color=alt.Color(
"win_rate:Q",
title="Win Rate",
scale=alt.Scale(scheme="blues"),
legend=alt.Legend(format=".0%"),
),
tooltip=[
"model",
"turn",
alt.Tooltip("win_rate:Q", format=".1%"),
"total",
],
)
.properties(title="Win Rate by Model and Turn")
)
st.altair_chart(win_rate_turn_heatmap, use_container_width=True)
except Exception as e:
st.error(f"Error generating win rate by turn heatmap: {e}")
# **Wins Over Time Line Chart**
st.subheader("Wins Over Time")
try:
wins_time_df = (
df_models.filter(pl.col("outcome") == "win")
.group_by(["date", "model"])
.agg(win_count=pl.len())
.sort("date")
)
time_line = (
alt.Chart(wins_time_df.to_pandas())
.mark_line(point=True)
.encode(
x=alt.X("date:T", title="Date"),
y=alt.Y("win_count:Q", title="Daily Win Count"),
color=alt.Color("model:N", title="Model"),
tooltip=["model", "date", "win_count"],
)
.properties(title="Wins by Model Over Time")
)
st.altair_chart(time_line, use_container_width=True)
except Exception as e:
st.error(f"Error generating wins over time chart: {e}")
# --- Response Characteristics ---
st.header("Response Characteristics vs. Outcome")
# **Response Length Boxplot (Faceted, No Outliers, Adjusted Y-Axis, Detailed Tooltip)**
st.subheader(
"Response Length Distribution by Outcome (Box Plot - Adjusted Scale, No Outliers)"
)
try:
# Filter using .and_() for logical AND
response_length_df = df_models.filter(
pl.col("response_length").is_not_null().and_(pl.col("response_length") >= 0)
)
# Convert to Pandas for Altair and percentile calculation
response_length_pd = response_length_df.to_pandas()
# --- Calculate a reasonable upper limit for the Y-axis ---
valid_lengths = response_length_pd["response_length"].dropna()
valid_lengths = valid_lengths[valid_lengths >= 0]
if not valid_lengths.empty:
upper_limit = valid_lengths.quantile(0.99)
upper_limit = max(upper_limit, 100) # Example minimum range
else:
upper_limit = 1000 # Default if no valid data
st.write(
f"(Note: Y-axis for Response Length capped at ~{int(upper_limit)} [99th percentile] to improve visibility)"
)
response_facet_boxplot = (
alt.Chart(response_length_pd)
.mark_boxplot(
extent="min-max", # Keep whiskers extending to min/max within 1.5*IQR
outliers=False, # Keep outliers hidden
)
.encode(
x=alt.X(
"model:N",
title="Model",
sort=unique_models,
axis=alt.Axis(labelLimit=0),
), # Keep axis formatting
y=alt.Y(
"response_length:Q",
title="Response Length",
scale=alt.Scale(domain=[0, upper_limit], clamp=True), # Keep clamping
),
color=alt.Color(
"model:N", title="Model", legend=None
), # Color by model for clarity within facet
)
.facet(
# Keep the same faceting by outcome
column=alt.Column("outcome:N", title="Outcome", sort=outcome_order)
)
.properties(
title="Response Length Distribution by Model per Outcome (Box Plot, Adjusted Scale, No Outliers)"
)
)
st.altair_chart(response_facet_boxplot, use_container_width=True)
except Exception as e:
st.error(f"Error generating response length boxplot: {e}\n{traceback.format_exc()}")
# --- Citations Scatter Plot (Faceted) ---
st.subheader("Number of Citations by Outcome")
try:
# Corrected filter using .and_() for logical AND
citation_df = df_models.filter(
pl.col("num_citations").is_not_null().and_(pl.col("num_citations") >= 0)
)
# Aggregate counts for bubble size
citation_agg = citation_df.group_by(["model", "outcome", "num_citations"]).agg(
count=pl.len()
)
# Convert to Pandas if needed
citation_agg_pd = citation_agg.to_pandas()
citation_facet = (
alt.Chart(citation_agg_pd)
.mark_circle()
.encode(
x=alt.X(
"model:N",
title="Model",
sort=unique_models,
axis=alt.Axis(labelLimit=0),
), # Keep axis formatting
y=alt.Y(
"num_citations:Q",
title="Number of Citations",
axis=alt.Axis(tickMinStep=1),
), # Ensure integer ticks
# --- Modify the legend within alt.Size ---
size=alt.Size(
"count:Q",
title="Number of Responses",
legend=alt.Legend(
symbolFillColor="lightblue", # Light blue legend symbols for visibility
symbolStrokeColor="lightblue", # Matching symbol outline
),
),
# ------------------------------------------
color=alt.Color(
"model:N", title="Model", legend=None
), # Keep model legend hidden
tooltip=["model", "outcome", "num_citations", "count"],
)
.facet(column=alt.Column("outcome:N", title="Outcome", sort=outcome_order))
.properties(title="Citations by Model per Outcome")
)
st.altair_chart(citation_facet, use_container_width=True)
except Exception as e:
st.error(f"Error generating citation facet plot: {e}\n{traceback.format_exc()}")
# --- Language Leaderboard ---
st.header("Top/Worst Models per Language")
@st.cache_data(ttl=3600)
def calculate_language_ranks(_df_models):
"""Calculates top/worst models based on win rate per language."""
df_win_rates = (
_df_models.group_by(["language", "model"])
.agg(wins=pl.col("outcome").eq("win").sum(), total=pl.len())
.filter(pl.col("total") > 5) # Require minimum samples for ranking
.with_columns(win_rate=pl.col("wins") / pl.col("total"))
)
if df_win_rates.is_empty():
return pl.DataFrame() # Return empty if no language meets criteria
df_ranked = df_win_rates.with_columns(
rank_top=pl.col("win_rate")
.rank(method="min", descending=True)
.over("language"),
rank_worst=pl.col("win_rate")
.rank(method="min", descending=False)
.over("language"),
)
df_top_1 = (
df_ranked.filter(pl.col("rank_top") == 1)
.group_by("language")
.agg(
top_model_1=pl.col("model").first(),
win_rate_1=pl.col("win_rate").first(),
total_samples=pl.col("total").sum(), # Comparisons for the top-ranked model(s)
)
)
df_top_2 = (
df_ranked.filter(pl.col("rank_top") == 2)
.group_by("language")
.agg(top_model_2=pl.col("model").first(), win_rate_2=pl.col("win_rate").first())
)
df_worst_1 = (
df_ranked.filter(pl.col("rank_worst") == 1)
.group_by("language")
.agg(
worst_model_1=pl.col("model").first(),
worst_win_rate_1=pl.col("win_rate").first(),
)
)
df_worst_2 = (
df_ranked.filter(pl.col("rank_worst") == 2)
.group_by("language")
.agg(
worst_model_2=pl.col("model").first(),
worst_win_rate_2=pl.col("win_rate").first(),
)
)
# Combine using left joins to handle languages where some ranks don't exist (e.g., too few models)
df_table = (
df_top_1.join(df_top_2, on="language", how="left")
.join(df_worst_1, on="language", how="left")
.join(df_worst_2, on="language", how="left")
.select(
[
"language",
"total_samples",
"top_model_1",
pl.col("win_rate_1").round(3).alias("WR #1"),
"top_model_2",
pl.col("win_rate_2").round(3).alias("WR #2"),
"worst_model_1",
pl.col("worst_win_rate_1").round(3).alias("WR Worst #1"),
"worst_model_2",
pl.col("worst_win_rate_2").round(3).alias("WR Worst #2"),
]
)
.sort("language")
.fill_null("N/A")
) # Fill missing ranks with N/A
return df_table
df_language_table = calculate_language_ranks(df_models)
if not df_language_table.is_empty():
st.subheader("Top & Bottom 2 Models by Win Rate per Language (Min 5 Comparisons)")
st.dataframe(df_language_table.to_pandas(), use_container_width=True)
else:
st.subheader("Top & Bottom 2 Models by Win Rate per Language")
st.write(
"Insufficient data: no model has at least 5 comparisons in any language, so rankings cannot be generated."
)
# **Model Ranking Counts**
if not df_language_table.is_empty():
st.subheader("How Often Models Rank Top/Worst Across Languages")
try:
df_top_1_counts = (
df_language_table["top_model_1"]
.value_counts()
.rename({"top_model_1": "model", "count": "Rank 1 Count"})
.sort("Rank 1 Count", descending=True)
)
df_worst_1_counts = (
df_language_table["worst_model_1"]
.value_counts()
.rename({"worst_model_1": "model", "count": "Worst Rank Count"})
.sort("Worst Rank Count", descending=True)
)
col1, col2 = st.columns(2)
with col1:
st.write("**Times Ranked #1**")
st.dataframe(df_top_1_counts.to_pandas(), use_container_width=True)
with col2:
st.write("**Times Ranked Worst**")
st.dataframe(df_worst_1_counts.to_pandas(), use_container_width=True)
except Exception as e:
st.error(f"Error generating model ranking counts: {e}")
# --- Footer ---
st.markdown("---")
st.markdown(
"Analysis based on the `lmarena-ai/search-arena-v1-7k` dataset on Hugging Face."
)