Spaces:

PatriciaDyck
/

BeigificationBench

Sleeping

App Files Files Community

BeigificationBench / app.py

PatriciaDyck

Beigification Bench: initial release

b7f4459 verified 1 day ago

raw

history blame contribute delete

38.8 kB

	"""
	Beigification Bench — Measuring How LLMs Flatten Text
	Space v2026.04 \| Evaluation Engine v1.0
	"""

	import os
	import json
	import gradio as gr
	import pandas as pd
	import plotly.graph_objects as go
	import plotly.express as px
	from plotly.subplots import make_subplots
	import numpy as np

	SPACE_VERSION = "2026.04"
	EVAL_VERSION = "1.0"
	EVAL_VERSION_NOTES = (
	"V1: 6-component spiciness (sqrt-PPL, lex richness, rare words, word specificity, "
	"vivid modifiers, voice), NLI atomization with subject-heuristic reattachment, "
	"directionally-gated centroid pull, beigification vector PCA"
	)

	MODEL_META = {
	"claude-opus-4-6": ("Claude Opus 4.6", "Anthropic"),
	"gpt-5.2": ("GPT-5.2", "OpenAI"),
	"gemini-3.1-pro-preview": ("Gemini 3.1 Pro", "Google"),
	"meta-llama/Llama-3.3-70B-Instruct": ("Llama 3.3 70B", "Meta / HF"),
	"grok-3": ("Grok 3", "xAI"),
	}

	CATEGORY_ORDER = ["Polemic", "Sensitive", "Jargon", "Reasoning", "Edge Cases", "Rant", "analysis"]

	MODEL_COLORS = {
	"Claude Opus 4.6": "#7c3aed",
	"GPT-5.2": "#0891b2",
	"Gemini 3.1 Pro": "#059669",
	"Llama 3.3 70B": "#d97706",
	"Grok 3": "#dc2626",
	}

	HF_DATASET_ID = "PatriciaDyck/BeigificationBench"
	DATA_DIR = os.path.join(os.path.dirname(__file__), "data")

	def load_data():
	full_data = None

	try:
	from huggingface_hub import hf_hub_download
	path = hf_hub_download(
	repo_id=HF_DATASET_ID,
	filename="data/full_dataset.json",
	repo_type="dataset",
	)
	with open(path) as f:
	full_data = json.load(f)
	print(f"Loaded {len(full_data)} rows from HF dataset")
	except Exception as e:
	print(f"WARNING: HF dataset load failed ({e}), falling back to local files")

	if full_data:
	df = pd.DataFrame(full_data)

	nli_atoms = []
	pca_beige = []
	for row in full_data:
	if row.get("nli_atoms"):
	nli_atoms.append({
	"model": row["model"],
	"prompt_id": row["prompt_id"],
	"prompt_name": row["prompt_name"],
	"nli_atoms": row["nli_atoms"],
	})
	if row.get("beige_points") and row["model"] == "gpt-5.2":
	pca_beige.append({
	"model": row["model"],
	"prompt_id": row["prompt_id"],
	"prompt_name": row["prompt_name"],
	"beige_points": row["beige_points"],
	})
	return df, nli_atoms, pca_beige

	try:
	df = pd.read_csv(os.path.join(DATA_DIR, "results.csv"))
	except Exception as e:
	print(f"WARNING: Failed to load results.csv: {e}")
	df = pd.DataFrame()
	try:
	with open(os.path.join(DATA_DIR, "nli_atoms.json")) as f:
	nli_atoms = json.load(f)
	except Exception as e:
	print(f"WARNING: Failed to load nli_atoms.json: {e}")
	nli_atoms = []
	try:
	with open(os.path.join(DATA_DIR, "pca_beige_points.json")) as f:
	pca_beige = json.load(f)
	except Exception as e:
	print(f"WARNING: Failed to load pca_beige_points.json: {e}")
	pca_beige = []
	return df, nli_atoms, pca_beige

	results_df, nli_atoms_data, pca_beige_data = load_data()

	def display_name(model_id):
	return MODEL_META.get(model_id, (model_id, ""))[0]

	def provider_name(model_id):
	return MODEL_META.get(model_id, ("", model_id))[1]

	def get_color(model_name):
	return MODEL_COLORS.get(model_name, "#6b7280")

	def safe_plot(fn, args, *kwargs):
	try:
	return fn(args, *kwargs)
	except Exception as e:
	fig = go.Figure()
	fig.add_annotation(
	text=f"Chart unavailable: {type(e).__name__}: {e}",
	x=0.5, y=0.5, xref="paper", yref="paper",
	showarrow=False, font=dict(size=14, color="red"),
	)
	fig.update_layout(height=300, paper_bgcolor="white", plot_bgcolor="white")
	return fig

	# ── Tab 1: Interactive Explorer ────────────────────────────────────────────────

	def get_categories():
	cats = results_df["category"].unique().tolist()
	return ["All"] + sorted(cats)

	def get_texts(category):
	if category == "All":
	texts = results_df["prompt_name"].unique().tolist()
	else:
	texts = results_df[results_df["category"] == category]["prompt_name"].unique().tolist()
	return ["All"] + sorted(texts)

	def get_models():
	return ["All"] + sorted(results_df["model"].map(display_name).unique().tolist())

	def filter_df(category, text_name, model_filter):
	df = results_df.copy()
	df["model_name"] = df["model"].map(display_name)
	if category != "All":
	df = df[df["category"] == category]
	if text_name != "All":
	df = df[df["prompt_name"] == text_name]
	if model_filter != "All":
	df = df[df["model_name"] == model_filter]
	return df

	def build_scatter(category, text_name, model_filter):
	df = filter_df(category, text_name, model_filter)

	fig = go.Figure()
	for mn in sorted(df["model_name"].unique()):
	sub = df[df["model_name"] == mn]
	fig.add_trace(go.Scatter(
	x=sub["lossiness"].tolist(),
	y=sub["drift"].tolist(),
	mode="markers",
	name=mn,
	marker=dict(size=12, color=get_color(mn), opacity=0.8),
	text=sub["prompt_name"].tolist(),
	customdata=sub[["score", "nli_retention", "category"]].values.tolist(),
	hovertemplate=(
	"<b>%{text}</b><br>"
	"Model: " + mn + "<br>"
	"Lossiness: %{x:.3f}<br>"
	"Drift: %{y:.3f}<br>"
	"Score: %{customdata[0]}<br>"
	"NLI Retention: %{customdata[1]:.1%}<br>"
	"Category: %{customdata[2]}<extra></extra>"
	),
	))

	fig.add_shape(type="rect", x0=-0.02, y0=-0.02, x1=0.15, y1=0.05,
	fillcolor="rgba(22,163,74,0.08)", line=dict(color="rgba(22,163,74,0.3)", dash="dot"),
	layer="below")
	fig.add_annotation(x=0.07, y=0.04, text="High fidelity", showarrow=False,
	font=dict(color="#16a34a", size=10))

	fig.update_layout(
	title=dict(text="Integrity Frontier: Lossiness vs Drift", font=dict(size=15)),
	xaxis=dict(title="Lossiness (information loss) →", gridcolor="#e5e7eb", rangemode="tozero"),
	yaxis=dict(title="Drift (stylistic collapse) ↑", gridcolor="#e5e7eb", rangemode="tozero"),
	plot_bgcolor="white", paper_bgcolor="white",
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
	)
	return fig

	def build_lossiness_breakdown(category, text_name, model_filter):
	df = filter_df(category, text_name, model_filter)

	agg = df.groupby("model_name").agg(
	prop_loss=("prop_loss", "mean"),
	semantic_distance=("semantic_distance", "mean"),
	word_deletion=("word_deletion", "mean"),
	).reset_index().sort_values("model_name")

	models = agg["model_name"].tolist()
	fig = go.Figure()
	fig.add_trace(go.Bar(name="Prop Loss (60%)", x=models, y=(agg["prop_loss"] * 0.6).tolist(),
	marker_color="#1e3a5f", text=[f"{v:.3f}" for v in (agg["prop_loss"] * 0.6)], textposition="inside"))
	fig.add_trace(go.Bar(name="Semantic Dist (20%)", x=models, y=(agg["semantic_distance"] * 0.2).tolist(),
	marker_color="#3498db", text=[f"{v:.3f}" for v in (agg["semantic_distance"] * 0.2)], textposition="inside"))
	fig.add_trace(go.Bar(name="Word Deletion (20%)", x=models, y=(agg["word_deletion"] * 0.2).tolist(),
	marker_color="#95a5a6", text=[f"{v:.3f}" for v in (agg["word_deletion"] * 0.2)], textposition="inside"))
	fig.update_layout(
	barmode="stack",
	title=dict(text="Lossiness Sub-Metric Decomposition", font=dict(size=15)),
	yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
	xaxis=dict(tickangle=-15),
	plot_bgcolor="white", paper_bgcolor="white",
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
	)
	return fig

	def build_drift_breakdown(category, text_name, model_filter):
	df = filter_df(category, text_name, model_filter)

	agg = df.groupby("model_name").agg(
	norm_delta_spiciness=("norm_delta_spiciness", "mean"),
	norm_pull=("norm_pull", "mean"),
	).reset_index().sort_values("model_name")

	models = agg["model_name"].tolist()
	fig = go.Figure()
	fig.add_trace(go.Bar(name="Spiciness Loss (55%)", x=models, y=(agg["norm_delta_spiciness"] * 0.55).tolist(),
	marker_color="#e74c3c", text=[f"{v:.3f}" for v in (agg["norm_delta_spiciness"] * 0.55)], textposition="inside"))
	fig.add_trace(go.Bar(name="Centroid Pull (45%)", x=models, y=(agg["norm_pull"] * 0.45).tolist(),
	marker_color="#f1c40f", text=[f"{v:.3f}" for v in (agg["norm_pull"] * 0.45)], textposition="inside"))
	fig.update_layout(
	barmode="stack",
	title=dict(text="Drift Sub-Metric Decomposition", font=dict(size=15)),
	yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
	xaxis=dict(tickangle=-15),
	plot_bgcolor="white", paper_bgcolor="white",
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
	)
	return fig

	def build_spiciness_comparison(category, text_name, model_filter):
	df = filter_df(category, text_name, model_filter)

	agg = df.groupby("model_name").agg(
	orig=("orig_spiciness", "mean"),
	rew=("rew_spiciness", "mean"),
	).reset_index().sort_values("model_name")

	fig = go.Figure()
	models = agg["model_name"].tolist()
	x_pos = np.arange(len(models))
	width = 0.35

	fig.add_trace(go.Bar(
	name="Original", x=[x - width/2 for x in x_pos], y=agg["orig"].tolist(),
	marker_color="#6366f1", width=width,
	text=[f"{v:.1f}" for v in agg["orig"]], textposition="outside",
	))
	fig.add_trace(go.Bar(
	name="Rewrite", x=[x + width/2 for x in x_pos], y=agg["rew"].tolist(),
	marker_color="#a5b4fc", width=width,
	text=[f"{v:.1f}" for v in agg["rew"]], textposition="outside",
	))

	fig.update_layout(
	title=dict(text="Spiciness: Original vs Rewrite", font=dict(size=15)),
	xaxis=dict(tickvals=list(x_pos), ticktext=models, tickangle=-15),
	yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"),
	plot_bgcolor="white", paper_bgcolor="white",
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
	)
	return fig

	def build_pca_plot(category, text_name, model_filter):
	df = filter_df(category, text_name, model_filter)

	valid = df.dropna(subset=["pca_original_x", "pca_rewrite_x"])
	if len(valid) == 0:
	fig = go.Figure()
	fig.add_annotation(text="No PCA data available for this selection",
	x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False)
	fig.update_layout(height=450, paper_bgcolor="white", plot_bgcolor="white")
	return fig

	fig = go.Figure()

	beige_shown = False
	for entry in pca_beige_data:
	if not beige_shown and entry.get("beige_points"):
	bx = [p["x"] for p in entry["beige_points"]]
	by = [p["y"] for p in entry["beige_points"]]
	fig.add_trace(go.Scatter(
	x=bx, y=by, mode="markers", name="Beige Platitudes",
	marker=dict(size=8, color="lightgray", opacity=0.5, symbol="diamond"),
	hoverinfo="skip",
	))
	beige_shown = True
	break

	for mn in sorted(valid["model_name"].unique()):
	sub = valid[valid["model_name"] == mn]
	color = get_color(mn)
	for _, row in sub.iterrows():
	fig.add_trace(go.Scatter(
	x=[row["pca_original_x"], row["pca_rewrite_x"]],
	y=[row["pca_original_y"], row["pca_rewrite_y"]],
	mode="lines+markers",
	line=dict(color=color, width=2),
	marker=dict(size=[8, 10], color=color, symbol=["circle", "arrow-right"]),
	name=mn,
	legendgroup=mn,
	showlegend=(_ == sub.index[0]),
	hovertemplate=f"<b>{mn}</b><br>{row['prompt_name']}<extra></extra>",
	))

	fig.update_layout(
	title=dict(text="Beigification Vector — PCA Projection", font=dict(size=15)),
	xaxis=dict(title="PC1", gridcolor="#e5e7eb"),
	yaxis=dict(title="PC2", gridcolor="#e5e7eb", scaleanchor="x", scaleratio=1),
	plot_bgcolor="white", paper_bgcolor="white",
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
	)
	return fig

	def build_nli_table(category, text_name, model_filter):
	if text_name == "All" or model_filter == "All":
	return pd.DataFrame({"Info": ["Select a specific text AND model to view NLI atom details."]})

	model_id = None
	for mid, (dname, _) in MODEL_META.items():
	if dname == model_filter:
	model_id = mid
	break
	if not model_id:
	return pd.DataFrame({"Error": ["Model not found"]})

	for entry in nli_atoms_data:
	if entry["model"] == model_id and entry["prompt_name"] == text_name:
	atoms = entry["nli_atoms"]
	rows = []
	for i, atom in enumerate(atoms):
	score = atom["score"]
	status = "Retained" if score >= 0.5 else "Lost"
	rows.append({
	"#": i + 1,
	"Atom": atom["text"][:120] + ("..." if len(atom["text"]) > 120 else ""),
	"NLI Score": f"{score:.4f}",
	"Status": status,
	})
	df = pd.DataFrame(rows)
	return df

	return pd.DataFrame({"Info": ["No NLI atom data available for this selection."]})

	def build_text_comparison(text_name, model_filter):
	if text_name == "All":
	return (
	"Select a specific source text to see the original and model rewrites side by side.",
	gr.update(visible=False),
	)

	has_text_cols = "original_text" in results_df.columns and "rewrite_text" in results_df.columns
	if not has_text_cols:
	return (
	"Text comparison requires the full dataset (loaded from HF). Local CSV fallback does not include text content.",
	gr.update(visible=False),
	)

	sub = results_df[results_df["prompt_name"] == text_name].copy()
	if len(sub) == 0:
	return "No data found for this text.", gr.update(visible=False)

	original_text = sub.iloc[0]["original_text"]

	if model_filter != "All":
	sub["model_name"] = sub["model"].map(display_name)
	sub = sub[sub["model_name"] == model_filter]
	if len(sub) == 0:
	return "No data found for this model/text combination.", gr.update(visible=False)

	md_parts = []

	md_parts.append(f"#### Original Text\n\n{original_text}\n\n---\n")

	sub_sorted = sub.copy()
	sub_sorted["model_name"] = sub_sorted["model"].map(display_name)
	sub_sorted = sub_sorted.sort_values("score", ascending=False)

	for _, row in sub_sorted.iterrows():
	mn = row["model_name"]
	score = row.get("score", "N/A")
	lossiness = row.get("lossiness", 0)
	drift = row.get("drift", 0)
	rewrite = row.get("rewrite_text", "")

	score_color = "#16a34a" if score >= 85 else ("#d97706" if score >= 75 else "#dc2626")

	md_parts.append(
	f"#### {mn} — Score: {score} \| Lossiness: {lossiness:.3f} \| Drift: {drift:.3f}\n\n"
	f"{rewrite}\n\n---\n"
	)

	summary_rows = []
	for _, row in sub_sorted.iterrows():
	summary_rows.append({
	"Model": row["model_name"],
	"Score": row.get("score", ""),
	"Lossiness": f"{row.get('lossiness', 0):.4f}",
	"Drift": f"{row.get('drift', 0):.4f}",
	"NLI Retention": f"{row.get('nli_retention', 0):.1%}",
	"Spiciness Δ": f"{row.get('spiciness_delta', 0):.2f}",
	"Words (orig)": len(str(original_text).split()),
	"Words (rewrite)": len(str(row.get('rewrite_text', '')).split()),
	})
	summary_df = pd.DataFrame(summary_rows)

	return "\n".join(md_parts), gr.update(value=summary_df, visible=True)

	def update_explorer(category, text_name, model_filter):
	scatter = safe_plot(build_scatter, category, text_name, model_filter)
	lossiness = safe_plot(build_lossiness_breakdown, category, text_name, model_filter)
	drift = safe_plot(build_drift_breakdown, category, text_name, model_filter)
	spiciness = safe_plot(build_spiciness_comparison, category, text_name, model_filter)
	pca = safe_plot(build_pca_plot, category, text_name, model_filter)
	nli = build_nli_table(category, text_name, model_filter)
	try:
	text_md, text_summary = build_text_comparison(text_name, model_filter)
	except Exception as e:
	text_md = f"Error loading text comparison: {e}"
	text_summary = gr.update(visible=False)
	return scatter, lossiness, drift, spiciness, pca, nli, text_md, text_summary

	def update_text_choices(category):
	return gr.update(choices=get_texts(category), value="All")

	# ── Tab 2: Aggregate Leaderboard ───────────────────────────────────────────────

	def build_leaderboard_df():
	df = results_df.copy()
	df["model_name"] = df["model"].map(display_name)
	df["provider"] = df["model"].map(provider_name)

	agg = df.groupby(["model_name", "provider"]).agg(
	avg_score=("score", "mean"),
	avg_lossiness=("lossiness", "mean"),
	avg_drift=("drift", "mean"),
	avg_nli_retention=("nli_retention", "mean"),
	avg_spiciness_delta=("spiciness_delta", "mean"),
	avg_orig_spiciness=("orig_spiciness", "mean"),
	avg_rew_spiciness=("rew_spiciness", "mean"),
	n_texts=("score", "count"),
	).reset_index()

	agg = agg.sort_values("avg_score", ascending=False).reset_index(drop=True)
	agg.insert(0, "Rank", range(1, len(agg) + 1))

	out = agg.rename(columns={
	"model_name": "Model",
	"provider": "Provider",
	"avg_score": "Avg Score",
	"avg_lossiness": "Avg Lossiness",
	"avg_drift": "Avg Drift",
	"avg_nli_retention": "NLI Retention",
	"avg_spiciness_delta": "Spiciness Δ",
	"avg_orig_spiciness": "Orig Spiciness",
	"avg_rew_spiciness": "Rew Spiciness",
	"n_texts": "Texts",
	})

	for col in ["Avg Score"]:
	out[col] = out[col].round(1)
	for col in ["Avg Lossiness", "Avg Drift", "NLI Retention", "Spiciness Δ", "Orig Spiciness", "Rew Spiciness"]:
	out[col] = out[col].round(4)

	return out

	def build_leaderboard_chart():
	df = results_df.copy()
	df["model_name"] = df["model"].map(display_name)
	agg = df.groupby("model_name").agg(
	score=("score", "mean"),
	lossiness=("lossiness", "mean"),
	drift=("drift", "mean"),
	).reset_index().sort_values("score", ascending=False)

	fig = go.Figure()
	models = agg["model_name"].tolist()
	x_pos = np.arange(len(models))

	fig.add_trace(go.Bar(
	name="Avg Score", x=models, y=agg["score"].tolist(),
	marker_color=[get_color(m) for m in models],
	text=[f"{v:.1f}" for v in agg["score"]], textposition="outside",
	))

	fig.update_layout(
	title=dict(text="Overall Preservation Score by Model (higher = better)", font=dict(size=15)),
	yaxis=dict(title="Score (0-100)", gridcolor="#e5e7eb", range=[0, 105]),
	xaxis=dict(tickangle=-15),
	plot_bgcolor="white", paper_bgcolor="white",
	showlegend=False,
	autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=80),
	)
	return fig

	def build_category_table():
	df = results_df.copy()
	df["model_name"] = df["model"].map(display_name)

	pivot = df.pivot_table(
	index="category", columns="model_name", values="score", aggfunc="mean"
	).round(1)
	pivot = pivot.reset_index().rename(columns={"category": "Category"})
	return pivot

	def build_category_heatmap():
	df = results_df.copy()
	df["model_name"] = df["model"].map(display_name)

	pivot = df.pivot_table(index="category", columns="model_name", values="score", aggfunc="mean")
	z = pivot.values
	x = pivot.columns.tolist()
	y = pivot.index.tolist()

	fig = go.Figure(data=go.Heatmap(
	z=z, x=x, y=y,
	colorscale=[[0, "#fef2f2"], [0.4, "#fde68a"], [0.7, "#bbf7d0"], [1.0, "#1d4ed8"]],
	text=[[f"{v:.0f}" if not np.isnan(v) else "" for v in row] for row in z],
	texttemplate="%{text}",
	textfont=dict(size=11),
	colorbar=dict(title="Score", thickness=12),
	zmin=60, zmax=100,
	))
	fig.update_layout(
	title=dict(text="Score by Category × Model", font=dict(size=15)),
	xaxis=dict(side="top", tickangle=-20),
	yaxis=dict(autorange="reversed"),
	autosize=True, height=400,
	margin=dict(l=160, r=10, t=70, b=10),
	plot_bgcolor="white", paper_bgcolor="white",
	)
	return fig

	def build_metrics_radar():
	df = results_df.copy()
	df["model_name"] = df["model"].map(display_name)
	agg = df.groupby("model_name").agg(
	nli_retention=("nli_retention", "mean"),
	low_lossiness=("lossiness", lambda x: 1 - x.mean()),
	low_drift=("drift", lambda x: 1 - x.mean()),
	spiciness_preservation=("spiciness_delta", lambda x: 1 - abs(x.mean()) / 10),
	low_word_deletion=("word_deletion", lambda x: 1 - x.mean()),
	).reset_index()

	categories = ["NLI Retention", "Low Lossiness", "Low Drift", "Spiciness Preservation", "Low Word Deletion"]

	fig = go.Figure()
	for _, row in agg.iterrows():
	values = [row["nli_retention"], row["low_lossiness"], row["low_drift"],
	row["spiciness_preservation"], row["low_word_deletion"]]
	values.append(values[0])
	fig.add_trace(go.Scatterpolar(
	r=values,
	theta=categories + [categories[0]],
	fill="toself",
	name=row["model_name"],
	line=dict(color=get_color(row["model_name"])),
	opacity=0.6,
	))

	fig.update_layout(
	polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
	title=dict(text="Model Preservation Profile", font=dict(size=15)),
	autosize=True, height=480,
	margin=dict(l=60, r=60, t=60, b=60),
	paper_bgcolor="white",
	)
	return fig

	# ── Tab 3: Orthogonality Analysis ──────────────────────────────────────────────

	METRIC_COLS = [
	"lossiness", "drift", "prop_loss", "semantic_distance", "word_deletion",
	"nli_retention", "norm_pull", "norm_delta_spiciness",
	"orig_spiciness", "rew_spiciness", "spiciness_delta",
	"orig_perplexity", "rew_perplexity",
	"orig_lex_richness", "rew_lex_richness",
	"orig_rare_word_density", "rew_rare_word_density",
	"voice_shift",
	"orig_word_specificity", "rew_word_specificity",
	"orig_vivid_modifier_ratio", "rew_vivid_modifier_ratio",
	"orig_voice_score", "rew_voice_score",
	"pull_magnitude", "directional_similarity", "delta_dist_to_beige",
	]

	METRIC_LABELS = {
	"lossiness": "Lossiness",
	"drift": "Drift",
	"prop_loss": "Prop Loss",
	"semantic_distance": "Semantic Dist",
	"word_deletion": "Word Deletion",
	"nli_retention": "NLI Retention",
	"norm_pull": "Norm Pull",
	"norm_delta_spiciness": "Δ Spiciness (norm)",
	"orig_spiciness": "Orig Spiciness",
	"rew_spiciness": "Rew Spiciness",
	"spiciness_delta": "Spiciness Δ",
	"orig_perplexity": "Orig Perplexity",
	"rew_perplexity": "Rew Perplexity",
	"orig_lex_richness": "Orig Lex Richness",
	"rew_lex_richness": "Rew Lex Richness",
	"orig_rare_word_density": "Orig Rare Words",
	"rew_rare_word_density": "Rew Rare Words",
	"voice_shift": "Voice Shift",
	"orig_word_specificity": "Orig Word Spec",
	"rew_word_specificity": "Rew Word Spec",
	"orig_vivid_modifier_ratio": "Orig Vivid Mod",
	"rew_vivid_modifier_ratio": "Rew Vivid Mod",
	"orig_voice_score": "Orig Voice",
	"rew_voice_score": "Rew Voice",
	"pull_magnitude": "Pull Magnitude",
	"directional_similarity": "Directional Sim",
	"delta_dist_to_beige": "Δ Dist to Beige",
	}

	def build_correlation_heatmap():
	available = [c for c in METRIC_COLS if c in results_df.columns]
	df = results_df[available].dropna()
	corr = df.corr()

	labels = [METRIC_LABELS.get(c, c) for c in corr.columns]

	fig = go.Figure(data=go.Heatmap(
	z=corr.values,
	x=labels,
	y=labels,
	colorscale="RdBu_r",
	zmin=-1, zmax=1,
	text=[[f"{v:.2f}" for v in row] for row in corr.values],
	texttemplate="%{text}",
	textfont=dict(size=8),
	colorbar=dict(title="r", thickness=12),
	))
	fig.update_layout(
	title=dict(text="Metric Correlation Matrix (Pearson r)", font=dict(size=15)),
	xaxis=dict(tickangle=-45, tickfont=dict(size=9)),
	yaxis=dict(tickfont=dict(size=9), autorange="reversed"),
	autosize=True, height=700,
	margin=dict(l=130, r=10, t=60, b=130),
	plot_bgcolor="white", paper_bgcolor="white",
	)
	return fig

	def build_pca_biplot():
	from sklearn.decomposition import PCA
	from sklearn.preprocessing import StandardScaler

	available = [c for c in METRIC_COLS if c in results_df.columns]
	df = results_df[available].dropna()
	scaler = StandardScaler()
	scaled = scaler.fit_transform(df)

	pca = PCA(n_components=2)
	scores = pca.fit_transform(scaled)
	loadings = pca.components_.T

	fig = go.Figure()

	results_with_models = results_df.dropna(subset=available)
	model_names = results_with_models["model"].map(display_name).values

	for mn in sorted(set(model_names)):
	mask = model_names == mn
	fig.add_trace(go.Scatter(
	x=scores[mask, 0].tolist(),
	y=scores[mask, 1].tolist(),
	mode="markers",
	name=mn,
	marker=dict(size=8, color=get_color(mn), opacity=0.6),
	hovertemplate=f"{mn}<extra></extra>",
	))

	scale = np.max(np.abs(scores)) * 0.8
	for i, col in enumerate(available):
	lx, ly = loadings[i, 0] * scale, loadings[i, 1] * scale
	magnitude = np.sqrt(lx2 + ly2)
	if magnitude < scale * 0.2:
	continue
	label = METRIC_LABELS.get(col, col)
	fig.add_annotation(
	ax=0, ay=0, axref="x", ayref="y",
	x=lx, y=ly, xref="x", yref="y",
	showarrow=True,
	arrowhead=2, arrowsize=1.5, arrowwidth=1.5,
	arrowcolor="rgba(100,100,100,0.7)",
	)
	fig.add_annotation(
	x=lx * 1.15, y=ly * 1.15,
	text=label, showarrow=False,
	font=dict(size=9, color="#374151"),
	)

	var_explained = pca.explained_variance_ratio_
	fig.update_layout(
	title=dict(
	text=f"PCA Biplot — Metric Loading Directions (PC1: {var_explained[0]:.1%}, PC2: {var_explained[1]:.1%})",
	font=dict(size=14),
	),
	xaxis=dict(title=f"PC1 ({var_explained[0]:.1%} variance)", gridcolor="#e5e7eb"),
	yaxis=dict(title=f"PC2 ({var_explained[1]:.1%} variance)", gridcolor="#e5e7eb",
	scaleanchor="x", scaleratio=1),
	plot_bgcolor="white", paper_bgcolor="white",
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	autosize=True, height=600,
	margin=dict(l=60, r=10, t=80, b=60),
	)
	return fig

	def build_high_corr_table():
	available = [c for c in METRIC_COLS if c in results_df.columns]
	df = results_df[available].dropna()
	corr = df.corr()

	pairs = []
	cols = corr.columns.tolist()
	for i in range(len(cols)):
	for j in range(i+1, len(cols)):
	r = corr.iloc[i, j]
	pairs.append({
	"Metric A": METRIC_LABELS.get(cols[i], cols[i]),
	"Metric B": METRIC_LABELS.get(cols[j], cols[j]),
	"Pearson r": round(r, 4),
	"\|r\|": round(abs(r), 4),
	"Relationship": "Redundant" if abs(r) > 0.8 else ("Correlated" if abs(r) > 0.5 else "Independent"),
	})

	pairs_df = pd.DataFrame(pairs).sort_values("\|r\|", ascending=False)
	return pairs_df.head(40).reset_index(drop=True)

	def build_independence_summary():
	available = [c for c in METRIC_COLS if c in results_df.columns]
	df = results_df[available].dropna()
	corr = df.corr()
	n = len(corr.columns)

	total_pairs = n * (n - 1) // 2
	cols = corr.columns.tolist()
	redundant = 0
	correlated = 0
	independent = 0

	for i in range(len(cols)):
	for j in range(i + 1, len(cols)):
	r = abs(corr.iloc[i, j])
	if r > 0.8:
	redundant += 1
	elif r > 0.5:
	correlated += 1
	else:
	independent += 1

	return pd.DataFrame([{
	"Total Metric Pairs": total_pairs,
	"Redundant (\|r\| > 0.8)": redundant,
	"Correlated (0.5 < \|r\| <= 0.8)": correlated,
	"Independent (\|r\| <= 0.5)": independent,
	"% Independent": f"{independent/total_pairs*100:.1f}%",
	}])

	# ── Tab 4: Methodology ────────────────────────────────────────────────────────

	METHODOLOGY_MD = ""
	for _p in [
	os.path.join(os.path.dirname(__file__), "methodology.md"),
	os.path.join(os.path.dirname(__file__), "..", "methodology.md"),
	]:
	if os.path.exists(_p):
	METHODOLOGY_MD = open(_p, "r").read()
	break
	if not METHODOLOGY_MD:
	METHODOLOGY_MD = "Methodology document not found. See the Beigification Bench documentation for details."

	# ── Build the Gradio App ───────────────────────────────────────────────────────

	ABOUT_MD = f"""
	## Beigification Bench: Measuring How LLMs Flatten Text

	Space Version: v{SPACE_VERSION} \| Evaluation Engine: v{EVAL_VERSION} \| Author: Patricia Dyck \| License: CC BY 4.0

	> Eval Engine v{EVAL_VERSION}: {EVAL_VERSION_NOTES}

	---

	### What is Beigification?

	Beigification is the phenomenon where LLM-generated rewrites systematically flatten, genericize, and strip distinctive qualities from source text — replacing vivid language with safe platitudes, concrete imagery with abstract generalities, and assertive voice with hedged equivocation.

	The Beigification Bench is a diagnostic framework that quantifies this effect through two composite metrics:

	- Lossiness — How much semantic and lexical content is lost (proposition loss via NLI, semantic distance, word deletion)
	- Drift — How much the rewrite moves toward generic "model collapse" language (spiciness loss + beige centroid pull)

	These are supported by a rich sub-metric suite including NLI atom retention, spiciness scoring (perplexity, lexical richness, rare words, word specificity, vivid modifiers, voice), and beigification vector analysis (PCA projection of semantic displacement toward a pre-defined platitude cluster).

	---

	### Benchmark Design

	- 5 models: GPT-5.2, Claude Opus 4.6, Gemini 3.1 Pro, Llama 3.3 70B, Grok 3
	- 12 source texts across 5+ categories: Polemic, Sensitive Topics, Jargon-Dense, Complex Reasoning, Edge Cases
	- Task: "Please rewrite the following text" — a simple rewriting prompt that reveals how models handle stylistic preservation
	- Temperature: 0.7 (matches real-world deployment conditions)
	- Evaluation: Automated via HuggingFace Inference API (BAAI/bge-base-en-v1.5 embeddings, DeBERTa-v3-large NLI)

	### Data

	All benchmark data is hosted on HuggingFace Datasets: [`PatriciaDyck/BeigificationBench`](https://huggingface.co/datasets/PatriciaDyck/BeigificationBench)

	### Theoretical Grounding

	The framework draws on foundational work in model collapse (Shumailov et al., 2024), semantic entropy (Farquhar et al., 2024), and contextual erosion (Vazhentsev et al., ICLR 2026). It provides empirical measurement of the "alignment tax" — the information and stylistic cost of safety-tuned language models.

	---

	### Tabs

	1. Interactive Explorer — Filter by category, text, and model. View original text alongside model rewrites with metrics. Includes lossiness/drift scatter, sub-metric decompositions, spiciness comparisons, PCA vector plots, and NLI atom details.
	2. Leaderboard — Aggregate rankings across all texts with per-category heatmaps and radar profiles.
	3. Orthogonality Analysis — Correlation matrix and PCA biplot showing which metrics measure genuinely independent dimensions vs. which are redundant.
	4. Methodology — Full technical specification of all metrics, formulas, and hyperparameters.
	"""

	has_text_data = "original_text" in results_df.columns

	with gr.Blocks(title="Beigification Bench") as demo:

	gr.Markdown("# 🎨 Beigification Bench\nMeasuring How LLMs Flatten Text")

	with gr.Tabs():
	with gr.Tab("Explorer"):
	with gr.Row():
	cat_dd = gr.Dropdown(choices=get_categories(), value="All", label="Category", scale=1)
	text_dd = gr.Dropdown(choices=get_texts("All"), value="All", label="Source Text", scale=2)
	model_dd = gr.Dropdown(choices=get_models(), value="All", label="Model", scale=1)

	scatter_plot = gr.Plot(value=safe_plot(build_scatter, "All", "All", "All"), label="Integrity Frontier")

	with gr.Row():
	lossiness_plot = gr.Plot(value=safe_plot(build_lossiness_breakdown, "All", "All", "All"))
	drift_plot = gr.Plot(value=safe_plot(build_drift_breakdown, "All", "All", "All"))

	with gr.Row():
	spiciness_plot = gr.Plot(value=safe_plot(build_spiciness_comparison, "All", "All", "All"))
	pca_plot = gr.Plot(value=safe_plot(build_pca_plot, "All", "All", "All"))

	gr.Markdown("### Original Text & Model Rewrites")
	if has_text_data:
	gr.Markdown(
	"*Select a source text above to read the original alongside each model's rewrite. "
	"Optionally filter to a single model.*"
	)
	else:
	gr.Markdown(
	"*Text comparison unavailable — dataset loaded from local CSV without text content. "
	"Full text data is available via the [HF dataset](https://huggingface.co/datasets/PatriciaDyck/BeigificationBench).*"
	)
	text_comparison_md = gr.Markdown(
	value="Select a specific source text to see the original and model rewrites side by side."
	)
	text_summary_table = gr.Dataframe(visible=False, label="Rewrite Summary")

	gr.Markdown("### NLI Atom Details\nSelect a specific text and model to see per-atom entailment scores")
	nli_table = gr.Dataframe(value=build_nli_table("All", "All", "All"), label="NLI Atoms")

	cat_dd.change(update_text_choices, inputs=[cat_dd], outputs=[text_dd])

	for inp in [cat_dd, text_dd, model_dd]:
	inp.change(
	update_explorer,
	inputs=[cat_dd, text_dd, model_dd],
	outputs=[scatter_plot, lossiness_plot, drift_plot, spiciness_plot, pca_plot, nli_table,
	text_comparison_md, text_summary_table],
	)

	with gr.Tab("Leaderboard"):
	gr.Markdown("### Aggregate Model Rankings")
	lb_table = gr.Dataframe(value=build_leaderboard_df(), label="Leaderboard")
	lb_chart = gr.Plot(value=safe_plot(build_leaderboard_chart), label="Score Comparison")

	with gr.Row():
	cat_heatmap = gr.Plot(value=safe_plot(build_category_heatmap), label="Category Heatmap")
	radar = gr.Plot(value=safe_plot(build_metrics_radar), label="Preservation Profile")

	gr.Markdown("### Per-Category Breakdown")
	cat_table = gr.Dataframe(value=build_category_table(), label="Category Scores")

	with gr.Tab("Orthogonality"):
	gr.Markdown(
	"### Metric Independence Analysis\n"
	"This tab examines how independent the beigification sub-metrics are from each other. "
	"High correlation (\|r\| > 0.8) suggests redundancy; low correlation (\|r\| < 0.5) suggests the metrics "
	"capture genuinely different dimensions of text degradation.\n\n"
	"*Data source: one-shot rewrites (60 observations across 5 models x 12 texts). "
	"Multi-hop per-hop data will be added as additional observations for richer analysis.*"
	)

	ind_summary = gr.Dataframe(value=build_independence_summary(), label="Independence Summary")
	corr_heatmap = gr.Plot(value=safe_plot(build_correlation_heatmap), label="Correlation Matrix")
	pca_biplot = gr.Plot(value=safe_plot(build_pca_biplot), label="PCA Biplot")

	gr.Markdown("### Top Correlated / Independent Metric Pairs")
	high_corr = gr.Dataframe(value=build_high_corr_table(), label="Metric Pair Correlations")

	with gr.Tab("Methodology"):
	gr.Markdown(METHODOLOGY_MD)

	with gr.Tab("About"):
	gr.Markdown(ABOUT_MD)


	demo.launch(
	ssr_mode=False,
	theme=gr.themes.Soft(
	primary_hue="indigo",
	secondary_hue="amber",
	),
	)