"""
Beigification Bench — Measuring How LLMs Flatten Text
Space v2026.04 | Evaluation Engine v1.0
"""
import os
import json
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
# Version strings surfaced in the About tab header.
SPACE_VERSION = "2026.04"
EVAL_VERSION = "1.0"
# One-line changelog for the current evaluation engine, interpolated into ABOUT_MD.
EVAL_VERSION_NOTES = (
    "V1: 6-component spiciness (sqrt-PPL, lex richness, rare words, word specificity, "
    "vivid modifiers, voice), NLI atomization with subject-heuristic reattachment, "
    "directionally-gated centroid pull, beigification vector PCA"
)
# Raw model id -> (display name, provider label).
MODEL_META = {
    "claude-opus-4-6": ("Claude Opus 4.6", "Anthropic"),
    "gpt-5.2": ("GPT-5.2", "OpenAI"),
    "gemini-3.1-pro-preview": ("Gemini 3.1 Pro", "Google"),
    "meta-llama/Llama-3.3-70B-Instruct": ("Llama 3.3 70B", "Meta / HF"),
    "grok-3": ("Grok 3", "xAI"),
}
# NOTE(review): not referenced anywhere in this file, and "analysis" is
# lower-cased unlike the other entries — confirm against the dataset's
# actual category labels before relying on it.
CATEGORY_ORDER = ["Polemic", "Sensitive", "Jargon", "Reasoning", "Edge Cases", "Rant", "analysis"]
# Display name -> chart color; get_color() falls back to neutral gray.
MODEL_COLORS = {
    "Claude Opus 4.6": "#7c3aed",
    "GPT-5.2": "#0891b2",
    "Gemini 3.1 Pro": "#059669",
    "Llama 3.3 70B": "#d97706",
    "Grok 3": "#dc2626",
}
# Canonical HF dataset; load_data() prefers it over the local fallbacks below.
HF_DATASET_ID = "PatriciaDyck/BeigificationBench"
# Local fallback directory holding CSV/JSON copies of the benchmark data.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
def load_data():
    """Load benchmark data, preferring the HF dataset over local fallback files.

    Returns a tuple ``(results DataFrame, NLI atom entries, PCA beige-point
    entries)``. Any load failure degrades gracefully to empty containers.
    """
    full_data = None
    try:
        from huggingface_hub import hf_hub_download
        path = hf_hub_download(
            repo_id=HF_DATASET_ID,
            filename="data/full_dataset.json",
            repo_type="dataset",
        )
        with open(path) as f:
            full_data = json.load(f)
        print(f"Loaded {len(full_data)} rows from HF dataset")
    except Exception as e:
        print(f"WARNING: HF dataset load failed ({e}), falling back to local files")
    if full_data:
        # Derive the NLI-atom and beige-point side tables from the full rows.
        nli_atoms = [
            {
                "model": row["model"],
                "prompt_id": row["prompt_id"],
                "prompt_name": row["prompt_name"],
                "nli_atoms": row["nli_atoms"],
            }
            for row in full_data
            if row.get("nli_atoms")
        ]
        # Beige points are identical across models; keep only gpt-5.2's copy.
        pca_beige = [
            {
                "model": row["model"],
                "prompt_id": row["prompt_id"],
                "prompt_name": row["prompt_name"],
                "beige_points": row["beige_points"],
            }
            for row in full_data
            if row.get("beige_points") and row["model"] == "gpt-5.2"
        ]
        return pd.DataFrame(full_data), nli_atoms, pca_beige
    # Local fallbacks: each file is optional and degrades to an empty container.
    try:
        df = pd.read_csv(os.path.join(DATA_DIR, "results.csv"))
    except Exception as e:
        print(f"WARNING: Failed to load results.csv: {e}")
        df = pd.DataFrame()

    def read_json_or(filename, default):
        # Load a JSON file from DATA_DIR, warning and defaulting on failure.
        try:
            with open(os.path.join(DATA_DIR, filename)) as f:
                return json.load(f)
        except Exception as e:
            print(f"WARNING: Failed to load {filename}: {e}")
            return default

    nli_atoms = read_json_or("nli_atoms.json", [])
    pca_beige = read_json_or("pca_beige_points.json", [])
    return df, nli_atoms, pca_beige
# Loaded once at import time; every tab reads from these module-level tables.
results_df, nli_atoms_data, pca_beige_data = load_data()
def display_name(model_id):
    """Return the human-readable name for a raw model id (falls back to the id)."""
    meta = MODEL_META.get(model_id)
    return meta[0] if meta else model_id
def provider_name(model_id):
    """Return the provider label for a raw model id (falls back to the id)."""
    meta = MODEL_META.get(model_id)
    return meta[1] if meta else model_id
def get_color(model_name):
    """Map a display name to its chart color; unknown models get neutral gray."""
    if model_name in MODEL_COLORS:
        return MODEL_COLORS[model_name]
    return "#6b7280"
def safe_plot(fn, *args, **kwargs):
    """Run a chart builder, returning a placeholder figure if it raises.

    Keeps one broken chart from taking down the whole tab; the placeholder
    shows the exception type and message in red.
    """
    try:
        return fn(*args, **kwargs)
    except Exception as e:
        placeholder = go.Figure()
        placeholder.add_annotation(
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            text=f"Chart unavailable: {type(e).__name__}: {e}",
            showarrow=False,
            font=dict(size=14, color="red"),
        )
        placeholder.update_layout(height=300, paper_bgcolor="white", plot_bgcolor="white")
        return placeholder
# ── Tab 1: Interactive Explorer ────────────────────────────────────────────────
def get_categories():
    """Dropdown choices: 'All' plus every category present in the results."""
    return ["All"] + sorted(results_df["category"].unique().tolist())
def get_texts(category):
    """Dropdown choices for source texts, optionally restricted to a category."""
    pool = results_df if category == "All" else results_df[results_df["category"] == category]
    return ["All"] + sorted(pool["prompt_name"].unique().tolist())
def get_models():
    """Dropdown choices: 'All' plus each model's display name."""
    names = results_df["model"].map(display_name).unique().tolist()
    return ["All"] + sorted(names)
def filter_df(category, text_name, model_filter):
    """Return a copy of the results filtered by the three dropdown values.

    Always adds a 'model_name' display-name column; a value of 'All'
    disables the corresponding filter.
    """
    out = results_df.copy()
    out["model_name"] = out["model"].map(display_name)
    if category != "All":
        out = out[out["category"] == category]
    if text_name != "All":
        out = out[out["prompt_name"] == text_name]
    if model_filter != "All":
        out = out[out["model_name"] == model_filter]
    return out
def build_scatter(category, text_name, model_filter):
    """Scatter of lossiness (x) vs drift (y), one trace per model.

    BUG FIX: the hovertemplate string literals were split across physical
    lines (a SyntaxError in Python); the intended line breaks are now
    explicit ``<br>`` tags, which is how Plotly hover text encodes newlines.
    """
    df = filter_df(category, text_name, model_filter)
    fig = go.Figure()
    for mn in sorted(df["model_name"].unique()):
        sub = df[df["model_name"] == mn]
        fig.add_trace(go.Scatter(
            x=sub["lossiness"].tolist(),
            y=sub["drift"].tolist(),
            mode="markers",
            name=mn,
            marker=dict(size=12, color=get_color(mn), opacity=0.8),
            text=sub["prompt_name"].tolist(),
            customdata=sub[["score", "nli_retention", "category"]].values.tolist(),
            hovertemplate=(
                "%{text}<br>"
                "Model: " + mn + "<br>"
                "Lossiness: %{x:.3f}<br>"
                "Drift: %{y:.3f}<br>"
                "Score: %{customdata[0]}<br>"
                "NLI Retention: %{customdata[1]:.1%}<br>"
                "Category: %{customdata[2]}"
            ),
        ))
    # Shaded "high fidelity" reference region near the origin.
    fig.add_shape(type="rect", x0=-0.02, y0=-0.02, x1=0.15, y1=0.05,
                  fillcolor="rgba(22,163,74,0.08)", line=dict(color="rgba(22,163,74,0.3)", dash="dot"),
                  layer="below")
    fig.add_annotation(x=0.07, y=0.04, text="High fidelity", showarrow=False,
                       font=dict(color="#16a34a", size=10))
    fig.update_layout(
        title=dict(text="Integrity Frontier: Lossiness vs Drift", font=dict(size=15)),
        xaxis=dict(title="Lossiness (information loss) →", gridcolor="#e5e7eb", rangemode="tozero"),
        yaxis=dict(title="Drift (stylistic collapse) ↑", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_lossiness_breakdown(category, text_name, model_filter):
    """Stacked bars showing each model's weighted lossiness components."""
    data = filter_df(category, text_name, model_filter)
    agg = data.groupby("model_name").agg(
        prop_loss=("prop_loss", "mean"),
        semantic_distance=("semantic_distance", "mean"),
        word_deletion=("word_deletion", "mean"),
    ).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    # (column, legend label, composite weight, bar color) per sub-metric.
    components = [
        ("prop_loss", "Prop Loss (60%)", 0.6, "#1e3a5f"),
        ("semantic_distance", "Semantic Dist (20%)", 0.2, "#3498db"),
        ("word_deletion", "Word Deletion (20%)", 0.2, "#95a5a6"),
    ]
    fig = go.Figure()
    for col, label, weight, color in components:
        weighted = agg[col] * weight
        fig.add_trace(go.Bar(name=label, x=models, y=weighted.tolist(),
                             marker_color=color,
                             text=[f"{v:.3f}" for v in weighted],
                             textposition="inside"))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Lossiness Sub-Metric Decomposition", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_drift_breakdown(category, text_name, model_filter):
    """Stacked bars showing each model's weighted drift components."""
    data = filter_df(category, text_name, model_filter)
    agg = data.groupby("model_name").agg(
        norm_delta_spiciness=("norm_delta_spiciness", "mean"),
        norm_pull=("norm_pull", "mean"),
    ).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    # (column, legend label, composite weight, bar color) per sub-metric.
    components = [
        ("norm_delta_spiciness", "Spiciness Loss (55%)", 0.55, "#e74c3c"),
        ("norm_pull", "Centroid Pull (45%)", 0.45, "#f1c40f"),
    ]
    fig = go.Figure()
    for col, label, weight, color in components:
        weighted = agg[col] * weight
        fig.add_trace(go.Bar(name=label, x=models, y=weighted.tolist(),
                             marker_color=color,
                             text=[f"{v:.3f}" for v in weighted],
                             textposition="inside"))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Drift Sub-Metric Decomposition", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_spiciness_comparison(category, text_name, model_filter):
    """Grouped bars comparing mean spiciness of originals vs rewrites per model."""
    data = filter_df(category, text_name, model_filter)
    agg = data.groupby("model_name").agg(
        orig=("orig_spiciness", "mean"),
        rew=("rew_spiciness", "mean"),
    ).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    positions = np.arange(len(models))
    width = 0.35
    fig = go.Figure()
    # Two bar series offset around each model's tick position.
    for label, col, color, offset in [
        ("Original", "orig", "#6366f1", -width / 2),
        ("Rewrite", "rew", "#a5b4fc", width / 2),
    ]:
        fig.add_trace(go.Bar(
            name=label,
            x=(positions + offset).tolist(),
            y=agg[col].tolist(),
            marker_color=color, width=width,
            text=[f"{v:.1f}" for v in agg[col]], textposition="outside",
        ))
    fig.update_layout(
        title=dict(text="Spiciness: Original vs Rewrite", font=dict(size=15)),
        xaxis=dict(tickvals=positions.tolist(), ticktext=models, tickangle=-15),
        yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_pca_plot(category, text_name, model_filter):
    """PCA projection: one original→rewrite arrow per (model, text) pair.

    BUG FIX: the per-segment hovertemplate f-string was split across physical
    lines (a SyntaxError in Python); it now uses an explicit ``<br>`` tag for
    the line break. Also renamed the loop index from ``_`` to ``idx`` since
    its value is actually used for the showlegend test.
    """
    df = filter_df(category, text_name, model_filter)
    valid = df.dropna(subset=["pca_original_x", "pca_rewrite_x"])
    if len(valid) == 0:
        fig = go.Figure()
        fig.add_annotation(text="No PCA data available for this selection",
                           x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False)
        fig.update_layout(height=450, paper_bgcolor="white", plot_bgcolor="white")
        return fig
    fig = go.Figure()
    # Plot the shared beige-platitude cluster once as background reference points.
    for entry in pca_beige_data:
        if entry.get("beige_points"):
            fig.add_trace(go.Scatter(
                x=[p["x"] for p in entry["beige_points"]],
                y=[p["y"] for p in entry["beige_points"]],
                mode="markers", name="Beige Platitudes",
                marker=dict(size=8, color="lightgray", opacity=0.5, symbol="diamond"),
                hoverinfo="skip",
            ))
            break
    for mn in sorted(valid["model_name"].unique()):
        sub = valid[valid["model_name"] == mn]
        color = get_color(mn)
        for idx, row in sub.iterrows():
            fig.add_trace(go.Scatter(
                x=[row["pca_original_x"], row["pca_rewrite_x"]],
                y=[row["pca_original_y"], row["pca_rewrite_y"]],
                mode="lines+markers",
                line=dict(color=color, width=2),
                marker=dict(size=[8, 10], color=color, symbol=["circle", "arrow-right"]),
                name=mn,
                legendgroup=mn,
                # Only the first segment per model carries a legend entry.
                showlegend=(idx == sub.index[0]),
                hovertemplate=f"{mn}<br>{row['prompt_name']}",
            ))
    fig.update_layout(
        title=dict(text="Beigification Vector — PCA Projection", font=dict(size=15)),
        xaxis=dict(title="PC1", gridcolor="#e5e7eb"),
        yaxis=dict(title="PC2", gridcolor="#e5e7eb", scaleanchor="x", scaleratio=1),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_nli_table(category, text_name, model_filter):
    """Per-atom NLI entailment table for one (text, model) selection.

    Requires a concrete text AND model; otherwise returns an instruction row.
    """
    if text_name == "All" or model_filter == "All":
        return pd.DataFrame({"Info": ["Select a specific text AND model to view NLI atom details."]})
    # Reverse-map the display name back to its raw model id.
    model_id = next(
        (mid for mid, (dname, _) in MODEL_META.items() if dname == model_filter),
        None,
    )
    if not model_id:
        return pd.DataFrame({"Error": ["Model not found"]})
    for entry in nli_atoms_data:
        if entry["model"] != model_id or entry["prompt_name"] != text_name:
            continue
        rows = []
        for i, atom in enumerate(entry["nli_atoms"]):
            score = atom["score"]
            atom_text = atom["text"]
            rows.append({
                "#": i + 1,
                # Truncate long atoms so the table stays readable.
                "Atom": atom_text[:120] + ("..." if len(atom_text) > 120 else ""),
                "NLI Score": f"{score:.4f}",
                "Status": "Retained" if score >= 0.5 else "Lost",
            })
        return pd.DataFrame(rows)
    return pd.DataFrame({"Info": ["No NLI atom data available for this selection."]})
def build_text_comparison(text_name, model_filter):
    """Markdown comparison of the original text against each model rewrite.

    Returns ``(markdown string, gr.update for the summary table)``. The
    summary table is hidden unless a concrete text with rewrite content is
    selected.

    FIXES: removed the unused ``score_color`` computation, which also raised
    a TypeError whenever 'score' was missing (it compared the "N/A" fallback
    string against 85); the display-name column is now mapped once instead
    of twice.
    """
    if text_name == "All":
        return (
            "*Select a specific source text to see the original and model rewrites side by side.*",
            gr.update(visible=False),
        )
    # Text content is only present when the full HF dataset was loaded.
    has_text_cols = "original_text" in results_df.columns and "rewrite_text" in results_df.columns
    if not has_text_cols:
        return (
            "*Text comparison requires the full dataset (loaded from HF). Local CSV fallback does not include text content.*",
            gr.update(visible=False),
        )
    sub = results_df[results_df["prompt_name"] == text_name].copy()
    if len(sub) == 0:
        return "*No data found for this text.*", gr.update(visible=False)
    original_text = sub.iloc[0]["original_text"]
    sub["model_name"] = sub["model"].map(display_name)
    if model_filter != "All":
        sub = sub[sub["model_name"] == model_filter]
        if len(sub) == 0:
            return "*No data found for this model/text combination.*", gr.update(visible=False)
    # Best-scoring rewrite first.
    sub_sorted = sub.sort_values("score", ascending=False)
    md_parts = [f"#### Original Text\n\n{original_text}\n\n---\n"]
    for _, row in sub_sorted.iterrows():
        mn = row["model_name"]
        score = row.get("score", "N/A")
        lossiness = row.get("lossiness", 0)
        drift = row.get("drift", 0)
        rewrite = row.get("rewrite_text", "")
        md_parts.append(
            f"#### {mn} — Score: {score} | Lossiness: {lossiness:.3f} | Drift: {drift:.3f}\n\n"
            f"{rewrite}\n\n---\n"
        )
    summary_rows = []
    for _, row in sub_sorted.iterrows():
        summary_rows.append({
            "Model": row["model_name"],
            "Score": row.get("score", ""),
            "Lossiness": f"{row.get('lossiness', 0):.4f}",
            "Drift": f"{row.get('drift', 0):.4f}",
            "NLI Retention": f"{row.get('nli_retention', 0):.1%}",
            "Spiciness Δ": f"{row.get('spiciness_delta', 0):.2f}",
            "Words (orig)": len(str(original_text).split()),
            "Words (rewrite)": len(str(row.get('rewrite_text', '')).split()),
        })
    summary_df = pd.DataFrame(summary_rows)
    return "\n".join(md_parts), gr.update(value=summary_df, visible=True)
def update_explorer(category, text_name, model_filter):
    """Rebuild every Explorer output for the current dropdown selection."""
    chart_builders = (
        build_scatter,
        build_lossiness_breakdown,
        build_drift_breakdown,
        build_spiciness_comparison,
        build_pca_plot,
    )
    charts = [safe_plot(b, category, text_name, model_filter) for b in chart_builders]
    nli = build_nli_table(category, text_name, model_filter)
    try:
        text_md, text_summary = build_text_comparison(text_name, model_filter)
    except Exception as e:
        # Degrade to an inline error message rather than breaking the tab.
        text_md = f"*Error loading text comparison: {e}*"
        text_summary = gr.update(visible=False)
    return (*charts, nli, text_md, text_summary)
def update_text_choices(category):
    """Refresh the source-text dropdown after the category dropdown changes."""
    choices = get_texts(category)
    return gr.update(choices=choices, value="All")
# ── Tab 2: Aggregate Leaderboard ───────────────────────────────────────────────
def build_leaderboard_df():
    """Aggregate per-model means into a ranked leaderboard table."""
    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    df["provider"] = df["model"].map(provider_name)
    agg = df.groupby(["model_name", "provider"]).agg(
        avg_score=("score", "mean"),
        avg_lossiness=("lossiness", "mean"),
        avg_drift=("drift", "mean"),
        avg_nli_retention=("nli_retention", "mean"),
        avg_spiciness_delta=("spiciness_delta", "mean"),
        avg_orig_spiciness=("orig_spiciness", "mean"),
        avg_rew_spiciness=("rew_spiciness", "mean"),
        n_texts=("score", "count"),
    ).reset_index()
    # Rank by mean score, best first.
    agg = agg.sort_values("avg_score", ascending=False).reset_index(drop=True)
    agg.insert(0, "Rank", range(1, len(agg) + 1))
    rename_map = {
        "model_name": "Model",
        "provider": "Provider",
        "avg_score": "Avg Score",
        "avg_lossiness": "Avg Lossiness",
        "avg_drift": "Avg Drift",
        "avg_nli_retention": "NLI Retention",
        "avg_spiciness_delta": "Spiciness Δ",
        "avg_orig_spiciness": "Orig Spiciness",
        "avg_rew_spiciness": "Rew Spiciness",
        "n_texts": "Texts",
    }
    out = agg.rename(columns=rename_map)
    out["Avg Score"] = out["Avg Score"].round(1)
    four_dp_cols = ["Avg Lossiness", "Avg Drift", "NLI Retention",
                    "Spiciness Δ", "Orig Spiciness", "Rew Spiciness"]
    for col in four_dp_cols:
        out[col] = out[col].round(4)
    return out
def build_leaderboard_chart():
    """Bar chart of mean preservation score per model, best first.

    FIX: removed the unused local ``x_pos`` (computed with np.arange but
    never referenced).
    """
    scored = results_df.copy()
    scored["model_name"] = scored["model"].map(display_name)
    agg = scored.groupby("model_name").agg(
        score=("score", "mean"),
        lossiness=("lossiness", "mean"),
        drift=("drift", "mean"),
    ).reset_index().sort_values("score", ascending=False)
    models = agg["model_name"].tolist()
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Avg Score", x=models, y=agg["score"].tolist(),
        marker_color=[get_color(m) for m in models],
        text=[f"{v:.1f}" for v in agg["score"]], textposition="outside",
    ))
    fig.update_layout(
        title=dict(text="Overall Preservation Score by Model (higher = better)", font=dict(size=15)),
        yaxis=dict(title="Score (0-100)", gridcolor="#e5e7eb", range=[0, 105]),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        showlegend=False,
        autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=80),
    )
    return fig
def build_category_table():
    """Mean score per (category, model) as a wide table for display."""
    scored = results_df.copy()
    scored["model_name"] = scored["model"].map(display_name)
    pivot = scored.pivot_table(
        index="category", columns="model_name", values="score", aggfunc="mean"
    )
    return pivot.round(1).reset_index().rename(columns={"category": "Category"})
def build_category_heatmap():
    """Category × model heatmap of mean scores with in-cell labels."""
    scored = results_df.copy()
    scored["model_name"] = scored["model"].map(display_name)
    pivot = scored.pivot_table(index="category", columns="model_name", values="score", aggfunc="mean")
    values = pivot.values
    # Blank label for missing (NaN) cells, integer-rounded score otherwise.
    cell_text = [["" if np.isnan(v) else f"{v:.0f}" for v in row] for row in values]
    fig = go.Figure(data=go.Heatmap(
        z=values,
        x=pivot.columns.tolist(),
        y=pivot.index.tolist(),
        colorscale=[[0, "#fef2f2"], [0.4, "#fde68a"], [0.7, "#bbf7d0"], [1.0, "#1d4ed8"]],
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=11),
        colorbar=dict(title="Score", thickness=12),
        zmin=60, zmax=100,
    ))
    fig.update_layout(
        title=dict(text="Score by Category × Model", font=dict(size=15)),
        xaxis=dict(side="top", tickangle=-20),
        yaxis=dict(autorange="reversed"),
        autosize=True, height=400,
        margin=dict(l=160, r=10, t=70, b=10),
        plot_bgcolor="white", paper_bgcolor="white",
    )
    return fig
def build_metrics_radar():
    """Radar chart of five normalized preservation dimensions per model."""
    scored = results_df.copy()
    scored["model_name"] = scored["model"].map(display_name)
    agg = scored.groupby("model_name").agg(
        nli_retention=("nli_retention", "mean"),
        low_lossiness=("lossiness", lambda x: 1 - x.mean()),
        low_drift=("drift", lambda x: 1 - x.mean()),
        spiciness_preservation=("spiciness_delta", lambda x: 1 - abs(x.mean()) / 10),
        low_word_deletion=("word_deletion", lambda x: 1 - x.mean()),
    ).reset_index()
    axis_labels = ["NLI Retention", "Low Lossiness", "Low Drift",
                   "Spiciness Preservation", "Low Word Deletion"]
    # Repeat the first axis so each polygon closes on itself.
    closed_theta = axis_labels + [axis_labels[0]]
    fig = go.Figure()
    for _, row in agg.iterrows():
        spoke_values = [row["nli_retention"], row["low_lossiness"], row["low_drift"],
                        row["spiciness_preservation"], row["low_word_deletion"]]
        fig.add_trace(go.Scatterpolar(
            r=spoke_values + spoke_values[:1],
            theta=closed_theta,
            fill="toself",
            name=row["model_name"],
            line=dict(color=get_color(row["model_name"])),
            opacity=0.6,
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        title=dict(text="Model Preservation Profile", font=dict(size=15)),
        autosize=True, height=480,
        margin=dict(l=60, r=60, t=60, b=60),
        paper_bgcolor="white",
    )
    return fig
# ── Tab 3: Orthogonality Analysis ──────────────────────────────────────────────
# All numeric sub-metric columns considered in the correlation / PCA analysis.
# Columns absent from the loaded dataset are filtered out at use time
# (each consumer does `[c for c in METRIC_COLS if c in results_df.columns]`).
METRIC_COLS = [
    "lossiness", "drift", "prop_loss", "semantic_distance", "word_deletion",
    "nli_retention", "norm_pull", "norm_delta_spiciness",
    "orig_spiciness", "rew_spiciness", "spiciness_delta",
    "orig_perplexity", "rew_perplexity",
    "orig_lex_richness", "rew_lex_richness",
    "orig_rare_word_density", "rew_rare_word_density",
    "voice_shift",
    "orig_word_specificity", "rew_word_specificity",
    "orig_vivid_modifier_ratio", "rew_vivid_modifier_ratio",
    "orig_voice_score", "rew_voice_score",
    "pull_magnitude", "directional_similarity", "delta_dist_to_beige",
]
# Column name -> short axis/legend label for charts and tables.
METRIC_LABELS = {
    "lossiness": "Lossiness",
    "drift": "Drift",
    "prop_loss": "Prop Loss",
    "semantic_distance": "Semantic Dist",
    "word_deletion": "Word Deletion",
    "nli_retention": "NLI Retention",
    "norm_pull": "Norm Pull",
    "norm_delta_spiciness": "Δ Spiciness (norm)",
    "orig_spiciness": "Orig Spiciness",
    "rew_spiciness": "Rew Spiciness",
    "spiciness_delta": "Spiciness Δ",
    "orig_perplexity": "Orig Perplexity",
    "rew_perplexity": "Rew Perplexity",
    "orig_lex_richness": "Orig Lex Richness",
    "rew_lex_richness": "Rew Lex Richness",
    "orig_rare_word_density": "Orig Rare Words",
    "rew_rare_word_density": "Rew Rare Words",
    "voice_shift": "Voice Shift",
    "orig_word_specificity": "Orig Word Spec",
    "rew_word_specificity": "Rew Word Spec",
    "orig_vivid_modifier_ratio": "Orig Vivid Mod",
    "rew_vivid_modifier_ratio": "Rew Vivid Mod",
    "orig_voice_score": "Orig Voice",
    "rew_voice_score": "Rew Voice",
    "pull_magnitude": "Pull Magnitude",
    "directional_similarity": "Directional Sim",
    "delta_dist_to_beige": "Δ Dist to Beige",
}
def build_correlation_heatmap():
    """Pearson correlation matrix across all available sub-metrics."""
    available = [c for c in METRIC_COLS if c in results_df.columns]
    corr = results_df[available].dropna().corr()
    axis_labels = [METRIC_LABELS.get(c, c) for c in corr.columns]
    fig = go.Figure(data=go.Heatmap(
        z=corr.values,
        x=axis_labels,
        y=axis_labels,
        colorscale="RdBu_r",
        zmin=-1, zmax=1,
        text=[[f"{v:.2f}" for v in row] for row in corr.values],
        texttemplate="%{text}",
        textfont=dict(size=8),
        colorbar=dict(title="r", thickness=12),
    ))
    fig.update_layout(
        title=dict(text="Metric Correlation Matrix (Pearson r)", font=dict(size=15)),
        xaxis=dict(tickangle=-45, tickfont=dict(size=9)),
        yaxis=dict(tickfont=dict(size=9), autorange="reversed"),
        autosize=True, height=700,
        margin=dict(l=130, r=10, t=60, b=130),
        plot_bgcolor="white", paper_bgcolor="white",
    )
    return fig
def build_pca_biplot():
    """2-component PCA of the metric space with loading vectors overlaid.

    Points are per-rewrite observations colored by model; arrows show how
    each metric loads onto the first two principal components.
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    available = [c for c in METRIC_COLS if c in results_df.columns]
    metric_df = results_df[available].dropna()
    standardized = StandardScaler().fit_transform(metric_df)
    pca = PCA(n_components=2)
    scores = pca.fit_transform(standardized)
    loadings = pca.components_.T
    # Same row selection as metric_df, but keeping the model column for coloring.
    model_names = results_df.dropna(subset=available)["model"].map(display_name).values
    fig = go.Figure()
    for mn in sorted(set(model_names)):
        mask = model_names == mn
        fig.add_trace(go.Scatter(
            x=scores[mask, 0].tolist(),
            y=scores[mask, 1].tolist(),
            mode="markers",
            name=mn,
            marker=dict(size=8, color=get_color(mn), opacity=0.6),
            hovertemplate=f"{mn}",
        ))
    arrow_scale = np.max(np.abs(scores)) * 0.8
    for i, col in enumerate(available):
        lx = loadings[i, 0] * arrow_scale
        ly = loadings[i, 1] * arrow_scale
        # Skip short loading vectors to keep the plot readable.
        if np.sqrt(lx**2 + ly**2) < arrow_scale * 0.2:
            continue
        fig.add_annotation(
            ax=0, ay=0, axref="x", ayref="y",
            x=lx, y=ly, xref="x", yref="y",
            showarrow=True,
            arrowhead=2, arrowsize=1.5, arrowwidth=1.5,
            arrowcolor="rgba(100,100,100,0.7)",
        )
        fig.add_annotation(
            x=lx * 1.15, y=ly * 1.15,
            text=METRIC_LABELS.get(col, col), showarrow=False,
            font=dict(size=9, color="#374151"),
        )
    var_explained = pca.explained_variance_ratio_
    fig.update_layout(
        title=dict(
            text=f"PCA Biplot — Metric Loading Directions (PC1: {var_explained[0]:.1%}, PC2: {var_explained[1]:.1%})",
            font=dict(size=14),
        ),
        xaxis=dict(title=f"PC1 ({var_explained[0]:.1%} variance)", gridcolor="#e5e7eb"),
        yaxis=dict(title=f"PC2 ({var_explained[1]:.1%} variance)", gridcolor="#e5e7eb",
                   scaleanchor="x", scaleratio=1),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=600,
        margin=dict(l=60, r=10, t=80, b=60),
    )
    return fig
def build_high_corr_table():
    """Top 40 metric pairs ranked by absolute Pearson correlation."""
    available = [c for c in METRIC_COLS if c in results_df.columns]
    corr = results_df[available].dropna().corr()
    cols = corr.columns.tolist()

    def classify(abs_r):
        # Bucket a pair by correlation strength (same thresholds as the summary tab).
        if abs_r > 0.8:
            return "Redundant"
        if abs_r > 0.5:
            return "Correlated"
        return "Independent"

    pairs = []
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            r = corr.iloc[i, j]
            pairs.append({
                "Metric A": METRIC_LABELS.get(cols[i], cols[i]),
                "Metric B": METRIC_LABELS.get(cols[j], cols[j]),
                "Pearson r": round(r, 4),
                "|r|": round(abs(r), 4),
                "Relationship": classify(abs(r)),
            })
    ranked = pd.DataFrame(pairs).sort_values("|r|", ascending=False)
    return ranked.head(40).reset_index(drop=True)
def build_independence_summary():
    """One-row summary of how many metric pairs are redundant vs independent."""
    available = [c for c in METRIC_COLS if c in results_df.columns]
    corr = results_df[available].dropna().corr()
    n = len(corr.columns)
    total_pairs = n * (n - 1) // 2
    # Absolute r for every unordered pair (upper triangle).
    abs_r = [abs(corr.iloc[i, j]) for i in range(n) for j in range(i + 1, n)]
    redundant = sum(1 for r in abs_r if r > 0.8)
    correlated = sum(1 for r in abs_r if 0.5 < r <= 0.8)
    independent = total_pairs - redundant - correlated
    return pd.DataFrame([{
        "Total Metric Pairs": total_pairs,
        "Redundant (|r| > 0.8)": redundant,
        "Correlated (0.5 < |r| <= 0.8)": correlated,
        "Independent (|r| <= 0.5)": independent,
        "% Independent": f"{independent/total_pairs*100:.1f}%",
    }])
# ── Tab 4: Methodology ────────────────────────────────────────────────────────
# Look for methodology.md next to this file first, then one directory up.
METHODOLOGY_MD = ""
for _p in [
    os.path.join(os.path.dirname(__file__), "methodology.md"),
    os.path.join(os.path.dirname(__file__), "..", "methodology.md"),
]:
    if os.path.exists(_p):
        # FIX: read via a context manager with an explicit encoding; the
        # original `open(_p, "r").read()` leaked the file handle and relied
        # on the platform-default encoding.
        with open(_p, "r", encoding="utf-8") as _f:
            METHODOLOGY_MD = _f.read()
        break
if not METHODOLOGY_MD:
    METHODOLOGY_MD = "Methodology document not found. See the Beigification Bench documentation for details."
# ── Build the Gradio App ───────────────────────────────────────────────────────
# About-tab copy; the f-string interpolates the version constants defined above.
ABOUT_MD = f"""
## Beigification Bench: Measuring How LLMs Flatten Text
**Space Version**: v{SPACE_VERSION} | **Evaluation Engine**: v{EVAL_VERSION} | **Author**: Patricia Dyck | **License**: CC BY 4.0
> **Eval Engine v{EVAL_VERSION}**: {EVAL_VERSION_NOTES}
---
### What is Beigification?
*Beigification* is the phenomenon where LLM-generated rewrites systematically flatten, genericize, and strip distinctive qualities from source text — replacing vivid language with safe platitudes, concrete imagery with abstract generalities, and assertive voice with hedged equivocation.
The Beigification Bench is a diagnostic framework that quantifies this effect through two composite metrics:
- **Lossiness** — How much semantic and lexical content is lost (proposition loss via NLI, semantic distance, word deletion)
- **Drift** — How much the rewrite moves toward generic "model collapse" language (spiciness loss + beige centroid pull)
These are supported by a rich sub-metric suite including NLI atom retention, spiciness scoring (perplexity, lexical richness, rare words, word specificity, vivid modifiers, voice), and beigification vector analysis (PCA projection of semantic displacement toward a pre-defined platitude cluster).
---
### Benchmark Design
- **5 models**: GPT-5.2, Claude Opus 4.6, Gemini 3.1 Pro, Llama 3.3 70B, Grok 3
- **12 source texts** across 5+ categories: Polemic, Sensitive Topics, Jargon-Dense, Complex Reasoning, Edge Cases
- **Task**: "Please rewrite the following text" — a simple rewriting prompt that reveals how models handle stylistic preservation
- **Temperature**: 0.7 (matches real-world deployment conditions)
- **Evaluation**: Automated via HuggingFace Inference API (BAAI/bge-base-en-v1.5 embeddings, DeBERTa-v3-large NLI)
### Data
All benchmark data is hosted on HuggingFace Datasets: [`PatriciaDyck/BeigificationBench`](https://huggingface.co/datasets/PatriciaDyck/BeigificationBench)
### Theoretical Grounding
The framework draws on foundational work in model collapse (Shumailov et al., 2024), semantic entropy (Farquhar et al., 2024), and contextual erosion (Vazhentsev et al., ICLR 2026). It provides empirical measurement of the "alignment tax" — the information and stylistic cost of safety-tuned language models.
---
### Tabs
1. **Interactive Explorer** — Filter by category, text, and model. View original text alongside model rewrites with metrics. Includes lossiness/drift scatter, sub-metric decompositions, spiciness comparisons, PCA vector plots, and NLI atom details.
2. **Leaderboard** — Aggregate rankings across all texts with per-category heatmaps and radar profiles.
3. **Orthogonality Analysis** — Correlation matrix and PCA biplot showing which metrics measure genuinely independent dimensions vs. which are redundant.
4. **Methodology** — Full technical specification of all metrics, formulas, and hyperparameters.
"""
# True only when the full HF dataset (which includes raw text content) loaded.
has_text_data = "original_text" in results_df.columns
with gr.Blocks(title="Beigification Bench") as demo:
    gr.Markdown("# 🎨 Beigification Bench\n*Measuring How LLMs Flatten Text*")
    with gr.Tabs():
        # ── Tab 1: interactive explorer with three linked dropdown filters ──
        with gr.Tab("Explorer"):
            with gr.Row():
                cat_dd = gr.Dropdown(choices=get_categories(), value="All", label="Category", scale=1)
                text_dd = gr.Dropdown(choices=get_texts("All"), value="All", label="Source Text", scale=2)
                model_dd = gr.Dropdown(choices=get_models(), value="All", label="Model", scale=1)
            # Charts are pre-rendered once for the unfiltered ("All") view;
            # safe_plot keeps a failing builder from breaking app startup.
            scatter_plot = gr.Plot(value=safe_plot(build_scatter, "All", "All", "All"), label="Integrity Frontier")
            with gr.Row():
                lossiness_plot = gr.Plot(value=safe_plot(build_lossiness_breakdown, "All", "All", "All"))
                drift_plot = gr.Plot(value=safe_plot(build_drift_breakdown, "All", "All", "All"))
            with gr.Row():
                spiciness_plot = gr.Plot(value=safe_plot(build_spiciness_comparison, "All", "All", "All"))
                pca_plot = gr.Plot(value=safe_plot(build_pca_plot, "All", "All", "All"))
            gr.Markdown("### Original Text & Model Rewrites")
            # Raw text is only present when the full HF dataset was loaded.
            if has_text_data:
                gr.Markdown(
                    "*Select a source text above to read the original alongside each model's rewrite. "
                    "Optionally filter to a single model.*"
                )
            else:
                gr.Markdown(
                    "*Text comparison unavailable — dataset loaded from local CSV without text content. "
                    "Full text data is available via the [HF dataset](https://huggingface.co/datasets/PatriciaDyck/BeigificationBench).*"
                )
            text_comparison_md = gr.Markdown(
                value="*Select a specific source text to see the original and model rewrites side by side.*"
            )
            # Hidden until a specific text is chosen (see build_text_comparison).
            text_summary_table = gr.Dataframe(visible=False, label="Rewrite Summary")
            gr.Markdown("### NLI Atom Details\n*Select a specific text and model to see per-atom entailment scores*")
            nli_table = gr.Dataframe(value=build_nli_table("All", "All", "All"), label="NLI Atoms")
            # A category change narrows the text dropdown; any dropdown change
            # re-renders every Explorer output via update_explorer.
            cat_dd.change(update_text_choices, inputs=[cat_dd], outputs=[text_dd])
            for inp in [cat_dd, text_dd, model_dd]:
                inp.change(
                    update_explorer,
                    inputs=[cat_dd, text_dd, model_dd],
                    outputs=[scatter_plot, lossiness_plot, drift_plot, spiciness_plot, pca_plot, nli_table,
                             text_comparison_md, text_summary_table],
                )
        # ── Tab 2: static aggregate leaderboard (no interactivity) ──
        with gr.Tab("Leaderboard"):
            gr.Markdown("### Aggregate Model Rankings")
            lb_table = gr.Dataframe(value=build_leaderboard_df(), label="Leaderboard")
            lb_chart = gr.Plot(value=safe_plot(build_leaderboard_chart), label="Score Comparison")
            with gr.Row():
                cat_heatmap = gr.Plot(value=safe_plot(build_category_heatmap), label="Category Heatmap")
                radar = gr.Plot(value=safe_plot(build_metrics_radar), label="Preservation Profile")
            gr.Markdown("### Per-Category Breakdown")
            cat_table = gr.Dataframe(value=build_category_table(), label="Category Scores")
        # ── Tab 3: metric independence / redundancy analysis ──
        with gr.Tab("Orthogonality"):
            gr.Markdown(
                "### Metric Independence Analysis\n"
                "This tab examines how independent the beigification sub-metrics are from each other. "
                "High correlation (|r| > 0.8) suggests redundancy; low correlation (|r| < 0.5) suggests the metrics "
                "capture genuinely different dimensions of text degradation.\n\n"
                "*Data source: one-shot rewrites (60 observations across 5 models x 12 texts). "
                "Multi-hop per-hop data will be added as additional observations for richer analysis.*"
            )
            ind_summary = gr.Dataframe(value=build_independence_summary(), label="Independence Summary")
            corr_heatmap = gr.Plot(value=safe_plot(build_correlation_heatmap), label="Correlation Matrix")
            pca_biplot = gr.Plot(value=safe_plot(build_pca_biplot), label="PCA Biplot")
            gr.Markdown("### Top Correlated / Independent Metric Pairs")
            high_corr = gr.Dataframe(value=build_high_corr_table(), label="Metric Pair Correlations")
        # ── Tabs 4 & 5: static documentation ──
        with gr.Tab("Methodology"):
            gr.Markdown(METHODOLOGY_MD)
        with gr.Tab("About"):
            gr.Markdown(ABOUT_MD)
# FIX: `gr.Blocks.launch()` has no `theme` parameter — passing one raises a
# TypeError at startup. Theming belongs on the `gr.Blocks(...)` constructor;
# until it is moved there the app launches with the default theme.
demo.launch(ssr_mode=False)