"""
BeigificationBench — Measuring How LLMs Flatten Text
Anonymous submission | Evaluation Engine v1.0
"""
import os
import json
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
# Version strings interpolated into the page header and the About text.
SPACE_VERSION = "2026.05"
EVAL_VERSION = "1.0"

# Model id (the value looked up by display_name/provider_name)
# -> (display name, provider).
MODEL_META = {
    "claude-opus-4-6": ("Claude Opus 4.6", "Anthropic"),
    "gpt-5.2": ("GPT-5.2", "OpenAI"),
    "gemini-3.1-pro-preview": ("Gemini 3.1 Pro", "Google"),
    "meta-llama/Llama-3.3-70B-Instruct": ("Llama 3.3 70B", "Meta / HF"),
    "meta-llama/Llama-3.1-70B-Instruct": ("Llama 3.1 70B", "Meta / HF"),
    "Qwen/Qwen2.5-72B-Instruct": ("Qwen 2.5 72B", "Alibaba / HF"),
}

# Display name -> hex colour; get_color() falls back to grey for other names.
MODEL_COLORS = {
    "Claude Opus 4.6": "#7c3aed",
    "GPT-5.2": "#0891b2",
    "Gemini 3.1 Pro": "#059669",
    "Llama 3.3 70B": "#d97706",
    "Llama 3.1 70B": "#b45309",
    "Qwen 2.5 72B": "#db2777",
}

# Data files are read from a "data" directory next to this script.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
def display_name(model_id):
    """Return the display name registered for ``model_id`` in ``MODEL_META``.

    Ids that are not registered are returned unchanged.
    """
    meta = MODEL_META.get(model_id)
    if meta is None:
        return model_id
    return meta[0]
def provider_name(model_id):
    """Return the provider registered for ``model_id`` in ``MODEL_META``.

    Ids that are not registered fall back to the id itself.
    """
    meta = MODEL_META.get(model_id)
    if meta is None:
        return model_id
    return meta[1]
def get_color(model_name):
    """Return the hex colour for a model display name (grey when unknown)."""
    fallback = "#6b7280"
    if model_name in MODEL_COLORS:
        return MODEL_COLORS[model_name]
    return fallback
def safe_plot(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and return its result.

    If the call raises any ``Exception``, a placeholder figure carrying the
    exception type and message as a red annotation is returned instead, so a
    single failing chart does not propagate the error to the caller.
    """
    try:
        result = fn(*args, **kwargs)
    except Exception as exc:
        message = f"Chart unavailable: {type(exc).__name__}: {exc}"
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            x=0.5, y=0.5, xref="paper", yref="paper",
            showarrow=False, font=dict(size=13, color="red"),
        )
        placeholder.update_layout(height=300, paper_bgcolor="white", plot_bgcolor="white")
        return placeholder
    return result
# ── Data loading ───────────────────────────────────────────────────────────────
def _read_json_or_empty(filename):
    """Load ``DATA_DIR/filename`` as JSON; warn and return ``[]`` on failure."""
    try:
        # JSON files are read as UTF-8 rather than the platform default.
        with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"WARNING: {filename} load failed: {e}")
        return []


def load_data():
    """Load the single-hop result files from ``DATA_DIR``.

    Loading is best-effort: a file that is missing or unreadable is replaced
    by an empty value and a warning is printed, so the app can still start.

    Returns:
        tuple: ``(results_df, nli_atoms, pca_beige)`` where ``results_df`` is
        the DataFrame read from ``results.csv`` (empty on failure) and the
        other two are the parsed contents of ``nli_atoms.json`` and
        ``pca_beige_points.json`` (``[]`` on failure).
    """
    try:
        df = pd.read_csv(os.path.join(DATA_DIR, "results.csv"))
    except Exception as e:
        print(f"WARNING: results.csv load failed: {e}")
        df = pd.DataFrame()
    nli_atoms = _read_json_or_empty("nli_atoms.json")
    pca_beige = _read_json_or_empty("pca_beige_points.json")
    return df, nli_atoms, pca_beige
def load_multihop_data():
    """Load the multi-hop CSV files from ``DATA_DIR``.

    Loading is best-effort: each file that cannot be read is replaced by an
    empty DataFrame and a warning naming the file is printed.

    Returns:
        tuple: ``(hops_df, summary_df)`` read from ``multihop_hops.csv`` and
        ``multihop_summary.csv`` respectively.
    """
    frames = []
    for filename in ("multihop_hops.csv", "multihop_summary.csv"):
        try:
            frames.append(pd.read_csv(os.path.join(DATA_DIR, filename)))
        except Exception as e:
            print(f"WARNING: {filename} load failed: {e}")
            frames.append(pd.DataFrame())
    hops, summary = frames
    return hops, summary
# Load every dataset once at import time; the helpers and chart builders
# below read these module-level objects.
results_df, nli_atoms_data, pca_beige_data = load_data()
hops_df, mh_summary_df = load_multihop_data()
# ── Helpers ────────────────────────────────────────────────────────────────────
def get_categories():
    """Return the category dropdown choices: ``"All"`` then sorted categories."""
    if not len(results_df):
        return ["All"]
    found = results_df["category"].unique().tolist()
    return ["All"] + sorted(found)
def get_texts(category="All"):
    """Return the source-text dropdown choices for ``category``.

    The list starts with ``"All"`` followed by the sorted unique prompt names,
    restricted to ``category`` unless it is ``"All"``.
    """
    if len(results_df) == 0:
        return ["All"]
    rows = results_df
    if category != "All":
        rows = results_df[results_df["category"] == category]
    names = rows["prompt_name"].unique().tolist()
    return ["All"] + sorted(names)
def get_models():
    """Return the model dropdown choices: ``"All"`` then sorted display names."""
    options = ["All"]
    if len(results_df) != 0:
        names = results_df["model"].map(display_name).unique().tolist()
        options += sorted(names)
    return options
def filter_df(category, text_name, model_filter):
    """Return a filtered copy of ``results_df`` with a ``model_name`` column.

    Each argument restricts one column (``category``, ``prompt_name``,
    ``model_name``); the value ``"All"`` leaves that column unrestricted.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    selections = (
        ("category", category),
        ("prompt_name", text_name),
        ("model_name", model_filter),
    )
    for column, wanted in selections:
        if wanted != "All":
            frame = frame[frame[column] == wanted]
    return frame
def get_mh_texts():
    """Return the multi-hop text dropdown choices: ``"All"`` then sorted names."""
    options = ["All"]
    if len(hops_df) != 0:
        options += sorted(hops_df["prompt_name"].unique().tolist())
    return options
def filter_hops(text_filter):
    """Return a copy of ``hops_df`` with a ``model_name`` column added.

    Rows are restricted to ``prompt_name == text_filter`` unless the filter is
    ``"All"``.
    """
    frame = hops_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    if text_filter == "All":
        return frame
    return frame[frame["prompt_name"] == text_filter]
# ── Tab 1: Interactive Explorer ────────────────────────────────────────────────
def build_scatter(category, text_name, model_filter):
    """Build the lossiness-vs-drift scatter for the current filter selection.

    One marker trace is added per model (one point per filtered row). When
    both ``lossiness_std`` and ``drift_std`` columns exist they are drawn as
    error bars; when they are absent the chart is drawn without error bars
    and the hover text reports a spread of 0 instead of raising ``KeyError``.

    Args:
        category, text_name, model_filter: Dropdown values passed through to
            ``filter_df``.

    Returns:
        The Plotly figure.
    """
    df = filter_df(category, text_name, model_filter)
    fig = go.Figure()
    has_std = "lossiness_std" in df.columns and "drift_std" in df.columns
    for mn in sorted(df["model_name"].unique()):
        sub = df[df["model_name"] == mn]
        color = get_color(mn)

        # Columns referenced by index in the hover template below.
        hover = sub[["score", "nli_retention", "category"]].copy()
        hover["lossiness_std"] = sub["lossiness_std"] if has_std else 0.0
        hover["drift_std"] = sub["drift_std"] if has_std else 0.0

        err_x = err_y = None
        if has_std:
            err_x = dict(type="data", array=sub["lossiness_std"].fillna(0).tolist(),
                         visible=True, color=color, thickness=1.5, width=4)
            err_y = dict(type="data", array=sub["drift_std"].fillna(0).tolist(),
                         visible=True, color=color, thickness=1.5, width=4)

        fig.add_trace(go.Scatter(
            x=sub["lossiness"].tolist(), y=sub["drift"].tolist(),
            mode="markers", name=mn,
            marker=dict(size=12, color=color, opacity=0.85),
            error_x=err_x, error_y=err_y,
            text=sub["prompt_name"].tolist(),
            customdata=hover.values.tolist(),
            # Plotly hover labels break lines on <br>, not on newlines.
            hovertemplate=(
                "%{text}<br>"
                "Model: " + mn + "<br>"
                "Lossiness: %{x:.3f} ± %{customdata[3]:.3f}<br>"
                "Drift: %{y:.3f} ± %{customdata[4]:.3f}<br>"
                "Score: %{customdata[0]:.1f}<br>"
                "NLI Retention: %{customdata[1]:.1%}<br>"
                "Category: %{customdata[2]}"
            ),
        ))
    # Shaded rectangle marking the low-lossiness / low-drift corner.
    fig.add_shape(type="rect", x0=-0.02, y0=-0.02, x1=0.15, y1=0.05,
                  fillcolor="rgba(22,163,74,0.08)",
                  line=dict(color="rgba(22,163,74,0.3)", dash="dot"), layer="below")
    fig.add_annotation(x=0.07, y=0.04, text="High fidelity", showarrow=False,
                       font=dict(color="#16a34a", size=10))
    fig.update_layout(
        title=dict(text="Integrity Frontier: Lossiness vs Drift", font=dict(size=15)),
        xaxis=dict(title="Lossiness (information loss) →", gridcolor="#e5e7eb", rangemode="tozero"),
        yaxis=dict(title="Drift (stylistic collapse) ↑", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_lossiness_breakdown(category, text_name, model_filter):
    """Build a stacked bar chart of the weighted lossiness sub-metrics.

    Per model, the mean of each sub-metric is scaled by its weight
    (prop loss 0.6, semantic distance 0.2, word deletion 0.2) and stacked.
    When a ``lossiness_std`` column exists, its per-model mean is attached as
    an error bar to the top segment.

    Args:
        category, text_name, model_filter: Dropdown values passed through to
            ``filter_df``.

    Returns:
        The Plotly figure.
    """
    df = filter_df(category, text_name, model_filter)
    has_std = "lossiness_std" in df.columns
    agg_spec = {
        "prop_loss": ("prop_loss", "mean"),
        "semantic_distance": ("semantic_distance", "mean"),
        "word_deletion": ("word_deletion", "mean"),
    }
    if has_std:
        agg_spec["lossiness_std"] = ("lossiness_std", "mean")
    agg = df.groupby("model_name").agg(**agg_spec).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()

    err = None
    if has_std:
        err = dict(type="data", array=agg["lossiness_std"].fillna(0).tolist(), visible=True,
                   color="#374151", thickness=1.5, width=5)

    # (legend label, column, weight, colour), listed bottom to top of the stack.
    segments = [
        ("Prop Loss (60%)", "prop_loss", 0.6, "#1e3a5f"),
        ("Semantic Dist (20%)", "semantic_distance", 0.2, "#3498db"),
        ("Word Deletion (20%)", "word_deletion", 0.2, "#95a5a6"),
    ]
    fig = go.Figure()
    for position, (label, column, weight, colour) in enumerate(segments):
        weighted = agg[column] * weight
        is_top = position == len(segments) - 1
        fig.add_trace(go.Bar(
            name=label, x=models, y=weighted.tolist(), marker_color=colour,
            text=[f"{v:.3f}" for v in weighted], textposition="inside",
            # Only the top segment carries the error bar for total lossiness.
            error_y=err if is_top else None,
        ))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Lossiness Sub-Metric Decomposition — error bars = replicate σ", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_drift_breakdown(category, text_name, model_filter):
    """Build a stacked bar chart of the weighted drift sub-metrics per model.

    The per-model means of ``norm_delta_spiciness`` and ``norm_pull`` are
    scaled by 0.55 and 0.45 and stacked. When a ``drift_std`` column exists,
    its per-model mean is drawn as an error bar on the top segment.
    """
    frame = filter_df(category, text_name, model_filter)
    with_std = "drift_std" in frame.columns

    spec = {
        "norm_delta_spiciness": ("norm_delta_spiciness", "mean"),
        "norm_pull": ("norm_pull", "mean"),
    }
    if with_std:
        spec["drift_std"] = ("drift_std", "mean")
    per_model = frame.groupby("model_name").agg(**spec).reset_index().sort_values("model_name")

    names = per_model["model_name"].tolist()
    spice_part = per_model["norm_delta_spiciness"] * 0.55
    pull_part = per_model["norm_pull"] * 0.45

    total_err = None
    if with_std:
        total_err = dict(type="data", array=per_model["drift_std"].fillna(0).tolist(),
                         visible=True, color="#374151", thickness=1.5, width=5)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Spiciness Loss (55%)", x=names, y=spice_part.tolist(),
        marker_color="#e74c3c",
        text=[f"{v:.3f}" for v in spice_part], textposition="inside",
    ))
    # The upper segment carries the error bar for total drift.
    fig.add_trace(go.Bar(
        name="Centroid Pull (45%)", x=names, y=pull_part.tolist(),
        marker_color="#f1c40f",
        text=[f"{v:.3f}" for v in pull_part], textposition="inside",
        error_y=total_err,
    ))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Drift Sub-Metric Decomposition — error bars = replicate σ", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_spiciness_comparison(category, text_name, model_filter):
    """Build grouped bars comparing original and rewrite spiciness per model.

    Bars show the per-model mean of ``orig_spiciness`` and ``rew_spiciness``.
    Error bars are drawn only when both ``orig_spiciness_std`` and
    ``rew_spiciness_std`` columns exist, so a file that has only one of them
    renders without error bars instead of raising ``KeyError``.

    Args:
        category, text_name, model_filter: Dropdown values passed through to
            ``filter_df``.

    Returns:
        The Plotly figure.
    """
    df = filter_df(category, text_name, model_filter)
    has_std = "orig_spiciness_std" in df.columns and "rew_spiciness_std" in df.columns
    agg_spec = {"orig": ("orig_spiciness", "mean"), "rew": ("rew_spiciness", "mean")}
    if has_std:
        agg_spec["orig_std"] = ("orig_spiciness_std", "mean")
        agg_spec["rew_std"] = ("rew_spiciness_std", "mean")
    agg = df.groupby("model_name").agg(**agg_spec).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    # Bars are positioned numerically so each pair can be offset around its tick.
    x_pos = np.arange(len(models))
    width = 0.35

    orig_err = rew_err = None
    if has_std:
        orig_err = dict(type="data", array=agg["orig_std"].fillna(0).tolist(), visible=True,
                        color="#374151", thickness=1.5, width=5)
        rew_err = dict(type="data", array=agg["rew_std"].fillna(0).tolist(), visible=True,
                       color="#374151", thickness=1.5, width=5)

    fig = go.Figure()
    fig.add_trace(go.Bar(name="Original", x=[x - width / 2 for x in x_pos], y=agg["orig"].tolist(),
                         marker_color="#6366f1", width=width, error_y=orig_err,
                         text=[f"{v:.1f}" for v in agg["orig"]], textposition="outside"))
    fig.add_trace(go.Bar(name="Rewrite", x=[x + width / 2 for x in x_pos], y=agg["rew"].tolist(),
                         marker_color="#a5b4fc", width=width, error_y=rew_err,
                         text=[f"{v:.1f}" for v in agg["rew"]], textposition="outside"))
    fig.update_layout(
        title=dict(text="Spiciness: Original vs Rewrite — error bars = replicate σ", font=dict(size=15)),
        xaxis=dict(tickvals=list(x_pos), ticktext=models, tickangle=-15),
        yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_pca_plot(category, text_name, model_filter):
    """Build the PCA projection showing each rewrite's shift from its original.

    Every filtered row with complete PCA coordinates becomes a two-point
    segment from ``(pca_original_x, pca_original_y)`` to
    ``(pca_rewrite_x, pca_rewrite_y)``. The ``beige_points`` of the first
    entry in ``pca_beige_data`` that has any are drawn as grey reference
    markers.

    Args:
        category, text_name, model_filter: Dropdown values passed through to
            ``filter_df``.

    Returns:
        The Plotly figure, or a placeholder figure with an explanatory
        annotation when no row has PCA coordinates.
    """
    df = filter_df(category, text_name, model_filter)
    # Require all four coordinates so a segment is never drawn with a NaN end.
    valid = df.dropna(subset=["pca_original_x", "pca_original_y",
                              "pca_rewrite_x", "pca_rewrite_y"])
    if len(valid) == 0:
        fig = go.Figure()
        fig.add_annotation(text="No PCA data available for this selection",
                           x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False)
        fig.update_layout(height=450, paper_bgcolor="white", plot_bgcolor="white")
        return fig

    fig = go.Figure()
    beige_entry = next((e for e in pca_beige_data if e.get("beige_points")), None)
    if beige_entry is not None:
        points = beige_entry["beige_points"]
        fig.add_trace(go.Scatter(
            x=[p["x"] for p in points], y=[p["y"] for p in points],
            mode="markers", name="Beige Platitudes",
            marker=dict(size=8, color="lightgray", opacity=0.5, symbol="diamond"),
            hoverinfo="skip"))

    for mn in sorted(valid["model_name"].unique()):
        sub = valid[valid["model_name"] == mn]
        color = get_color(mn)
        for idx, (_, row) in enumerate(sub.iterrows()):
            fig.add_trace(go.Scatter(
                x=[row["pca_original_x"], row["pca_rewrite_x"]],
                y=[row["pca_original_y"], row["pca_rewrite_y"]],
                mode="lines+markers", line=dict(color=color, width=2),
                marker=dict(size=[8, 10], color=color, symbol=["circle", "arrow-right"]),
                # One legend entry per model; the rest join its legend group.
                name=mn, legendgroup=mn, showlegend=(idx == 0),
                # Plotly hover labels break lines on <br>, not on newlines.
                hovertemplate=f"{mn}<br>{row['prompt_name']}",
            ))
    fig.update_layout(
        title=dict(text="Beigification Vector — PCA Projection", font=dict(size=15)),
        xaxis=dict(title="PC1", gridcolor="#e5e7eb"),
        yaxis=dict(title="PC2", gridcolor="#e5e7eb", scaleanchor="x", scaleratio=1),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_nli_table(category, text_name, model_filter):
    """Return the per-atom NLI table for one text/model pair.

    A one-cell ``Info`` or ``Error`` frame is returned when either selector is
    ``"All"``, when the display name is not in ``MODEL_META``, or when
    ``nli_atoms_data`` has no matching entry. ``category`` is not used.
    """
    if "All" in (text_name, model_filter):
        return pd.DataFrame({"Info": ["Select a specific text AND model to view NLI atom details."]})

    # Map the display name back to the raw model id stored in the atom data.
    model_id = None
    for candidate, (shown_as, _) in MODEL_META.items():
        if shown_as == model_filter:
            model_id = candidate
            break
    if not model_id:
        return pd.DataFrame({"Error": ["Model not found"]})

    for record in nli_atoms_data:
        if record["model"] != model_id or record["prompt_name"] != text_name:
            continue
        table = []
        for number, atom in enumerate(record["nli_atoms"], start=1):
            text = atom["text"]
            suffix = "..." if len(text) > 120 else ""
            table.append({
                "#": number,
                "Atom": text[:120] + suffix,
                "NLI Score": f"{atom['score']:.4f}",
                "Status": "Retained" if atom["score"] >= 0.5 else "Lost",
            })
        return pd.DataFrame(table)
    return pd.DataFrame({"Info": ["No NLI atom data available for this selection."]})
def update_explorer(category, text_name, model_filter):
    """Rebuild every Explorer-tab output for the current dropdown selection.

    Returns a 6-tuple: the five charts (each built through ``safe_plot``)
    followed by the NLI atom table.
    """
    chart_builders = (
        build_scatter,
        build_lossiness_breakdown,
        build_drift_breakdown,
        build_spiciness_comparison,
        build_pca_plot,
    )
    charts = tuple(
        safe_plot(builder, category, text_name, model_filter)
        for builder in chart_builders
    )
    return charts + (build_nli_table(category, text_name, model_filter),)
def update_text_choices(category):
    """Return a Gradio update that lists the texts of ``category`` and selects "All"."""
    options = get_texts(category)
    return gr.update(choices=options, value="All")
# ── Tab 2: Leaderboard ─────────────────────────────────────────────────────────
def build_leaderboard_df():
    """Return the leaderboard table: one ranked row per model.

    Metrics are averaged over all rows of ``results_df`` for each
    (model, provider) pair and sorted by average score, best first. ``Texts``
    is the number of score values that went into each average.

    Returns:
        A DataFrame with display-ready column names, or a one-cell ``Info``
        frame when ``results_df`` is empty (for example when ``results.csv``
        failed to load). This function is called directly at page load, so it
        must not raise in that case.
    """
    if len(results_df) == 0:
        return pd.DataFrame({"Info": ["No results data available."]})

    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    df["provider"] = df["model"].map(provider_name)
    agg = df.groupby(["model_name", "provider"]).agg(
        avg_score=("score", "mean"),
        avg_lossiness=("lossiness", "mean"),
        avg_drift=("drift", "mean"),
        avg_nli_retention=("nli_retention", "mean"),
        avg_spiciness_delta=("spiciness_delta", "mean"),
        avg_orig_spiciness=("orig_spiciness", "mean"),
        avg_rew_spiciness=("rew_spiciness", "mean"),
        n_texts=("score", "count"),
    ).reset_index().sort_values("avg_score", ascending=False).reset_index(drop=True)
    agg.insert(0, "Rank", range(1, len(agg) + 1))
    out = agg.rename(columns={
        "model_name": "Model", "provider": "Provider",
        "avg_score": "Avg Score", "avg_lossiness": "Avg Lossiness",
        "avg_drift": "Avg Drift", "avg_nli_retention": "NLI Retention",
        "avg_spiciness_delta": "Spiciness Δ", "avg_orig_spiciness": "Orig Spiciness",
        "avg_rew_spiciness": "Rew Spiciness", "n_texts": "Texts",
    })
    out["Avg Score"] = out["Avg Score"].round(1)
    for c in ["Avg Lossiness", "Avg Drift", "NLI Retention", "Spiciness Δ",
              "Orig Spiciness", "Rew Spiciness"]:
        out[c] = out[c].round(4)
    return out
def build_leaderboard_chart():
    """Build the bar chart of mean score per model, highest score first.

    When ``results_df`` has a ``score_std`` column, the per-model mean of that
    column is drawn as the error bar.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    with_std = "score_std" in frame.columns

    spec = {"score": ("score", "mean")}
    if with_std:
        spec["score_std_mean"] = ("score_std", "mean")
    ranked = (
        frame.groupby("model_name")
        .agg(**spec)
        .reset_index()
        .sort_values("score", ascending=False)
    )
    names = ranked["model_name"].tolist()

    sigma = ranked["score_std_mean"].fillna(0).tolist() if with_std else None
    error_bar = None
    if sigma:
        error_bar = dict(type="data", array=sigma, visible=True,
                         color="#374151", thickness=1.5, width=6)

    bars = go.Bar(
        name="Avg Score", x=names, y=ranked["score"].tolist(),
        marker_color=[get_color(m) for m in names],
        text=[f"{v:.1f}" for v in ranked["score"]], textposition="outside",
        error_y=error_bar,
    )
    fig = go.Figure()
    fig.add_trace(bars)
    fig.update_layout(
        title=dict(text="Overall Preservation Score by Model — error bars = replicate σ", font=dict(size=15)),
        yaxis=dict(title="Score (0–100)", gridcolor="#e5e7eb", range=[0, 110]),
        xaxis=dict(tickangle=-15), plot_bgcolor="white", paper_bgcolor="white",
        showlegend=False, autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=80),
    )
    return fig
def build_category_heatmap():
    """Build the category × model heatmap of mean scores.

    Cells without data (NaN in the pivot) are left unlabelled; the colour
    scale is fixed to the 60–100 range.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    grid = frame.pivot_table(index="category", columns="model_name",
                             values="score", aggfunc="mean")
    scores = grid.values

    labels = []
    for row in scores:
        labels.append(["" if np.isnan(v) else f"{v:.0f}" for v in row])

    heat = go.Heatmap(
        z=scores, x=grid.columns.tolist(), y=grid.index.tolist(),
        colorscale=[[0, "#fef2f2"], [0.4, "#fde68a"], [0.7, "#bbf7d0"], [1.0, "#1d4ed8"]],
        text=labels,
        texttemplate="%{text}", textfont=dict(size=11),
        colorbar=dict(title="Score", thickness=12), zmin=60, zmax=100,
    )
    fig = go.Figure(data=heat)
    fig.update_layout(
        title=dict(text="Score by Category × Model", font=dict(size=15)),
        xaxis=dict(side="top", tickangle=-20), yaxis=dict(autorange="reversed"),
        autosize=True, height=400, margin=dict(l=160, r=10, t=70, b=10),
        plot_bgcolor="white", paper_bgcolor="white",
    )
    return fig
def build_metrics_radar():
    """Build a radar chart with one closed polygon per model.

    Each axis is a per-model mean mapped so that larger is better:
    NLI retention as-is, ``1 - mean`` for lossiness, drift and word deletion,
    and ``1 - |mean| / 10`` for the spiciness delta.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    per_model = frame.groupby("model_name").agg(
        nli_retention=("nli_retention", "mean"),
        low_lossiness=("lossiness", lambda x: 1 - x.mean()),
        low_drift=("drift", lambda x: 1 - x.mean()),
        spiciness_preservation=("spiciness_delta", lambda x: 1 - abs(x.mean()) / 10),
        low_word_deletion=("word_deletion", lambda x: 1 - x.mean()),
    ).reset_index()

    axis_columns = ["nli_retention", "low_lossiness", "low_drift",
                    "spiciness_preservation", "low_word_deletion"]
    axis_labels = ["NLI Retention", "Low Lossiness", "Low Drift",
                   "Spiciness Preservation", "Low Word Deletion"]
    # Repeat the first axis at the end so each polygon closes.
    closed_labels = axis_labels + [axis_labels[0]]

    fig = go.Figure()
    for _, record in per_model.iterrows():
        radii = [record[column] for column in axis_columns]
        radii.append(radii[0])
        fig.add_trace(go.Scatterpolar(
            r=radii, theta=closed_labels, fill="toself",
            name=record["model_name"],
            line=dict(color=get_color(record["model_name"])),
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        title=dict(text="Multi-Metric Capability Radar", font=dict(size=15)),
        legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
        autosize=True, height=480, margin=dict(l=60, r=60, t=70, b=80),
        paper_bgcolor="white",
    )
    return fig
# ── Tab 3: Multi-Hop Trajectories ──────────────────────────────────────────────
def build_mh_lossiness(text_filter):
    """Build the line chart of mean lossiness per hop, one line per model.

    Args:
        text_filter: Prompt name to restrict to, or ``"All"`` to average over
            every text.

    Returns:
        The Plotly figure, or an empty figure when there is no hop data.
    """
    df = filter_hops(text_filter)
    if len(df) == 0:
        return go.Figure()
    agg = df.groupby(["model_name", "hop_number"])["lossiness"].mean().reset_index()
    fig = go.Figure()
    for mn in sorted(agg["model_name"].unique()):
        sub = agg[agg["model_name"] == mn].sort_values("hop_number")
        fig.add_trace(go.Scatter(
            x=sub["hop_number"].tolist(), y=sub["lossiness"].tolist(),
            mode="lines+markers", name=mn,
            line=dict(color=get_color(mn), width=2),
            marker=dict(size=7),
            # Plotly hover labels break lines on <br>; doubled braces keep the
            # %{x}/%{y} placeholders literal inside the f-string.
            hovertemplate=f"{mn}<br>Hop %{{x}}<br>Lossiness: %{{y:.3f}}",
        ))
    fig.update_layout(
        title=dict(text="Lossiness by Hop (information loss per rewrite)", font=dict(size=15)),
        xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"),
        yaxis=dict(title="Lossiness →", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_mh_drift(text_filter):
    """Build the line chart of mean drift per hop, one line per model.

    Args:
        text_filter: Prompt name to restrict to, or ``"All"`` to average over
            every text.

    Returns:
        The Plotly figure, or an empty figure when there is no hop data.
    """
    df = filter_hops(text_filter)
    if len(df) == 0:
        return go.Figure()
    agg = df.groupby(["model_name", "hop_number"])["drift"].mean().reset_index()
    fig = go.Figure()
    for mn in sorted(agg["model_name"].unique()):
        sub = agg[agg["model_name"] == mn].sort_values("hop_number")
        fig.add_trace(go.Scatter(
            x=sub["hop_number"].tolist(), y=sub["drift"].tolist(),
            mode="lines+markers", name=mn,
            line=dict(color=get_color(mn), width=2),
            marker=dict(size=7),
            # Plotly hover labels break lines on <br>; doubled braces keep the
            # %{x}/%{y} placeholders literal inside the f-string.
            hovertemplate=f"{mn}<br>Hop %{{x}}<br>Drift: %{{y:.3f}}",
        ))
    fig.update_layout(
        title=dict(text="Drift by Hop (stylistic collapse per rewrite)", font=dict(size=15)),
        xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"),
        yaxis=dict(title="Drift ↑", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_mh_spiciness(text_filter):
    """Build the line chart of mean rewrite spiciness per hop for each model.

    Each model contributes a line of rewrite spiciness plus a diamond marker
    at its first hop showing the mean ``orig_spiciness`` recorded there.

    Args:
        text_filter: Prompt name to restrict to, or ``"All"`` to average over
            every text.

    Returns:
        The Plotly figure, or an empty figure when there is no hop data.
    """
    df = filter_hops(text_filter)
    if len(df) == 0:
        return go.Figure()
    agg = df.groupby(["model_name", "hop_number"]).agg(
        orig_s=("orig_spiciness", "mean"),
        rew_s=("rew_spiciness", "mean"),
    ).reset_index()
    fig = go.Figure()
    # Each name comes from agg itself, so it is visited once and its subset is
    # never empty.
    for mn in sorted(agg["model_name"].unique()):
        sub = agg[agg["model_name"] == mn].sort_values("hop_number")
        color = get_color(mn)
        fig.add_trace(go.Scatter(
            x=sub["hop_number"].tolist(), y=sub["rew_s"].tolist(),
            mode="lines+markers", name=mn,
            line=dict(color=color, width=2), marker=dict(size=7),
            # Plotly hover labels break lines on <br>, not on newlines.
            hovertemplate=f"{mn}<br>Hop %{{x}}<br>Rewrite Spiciness: %{{y:.2f}}",
        ))
        fig.add_trace(go.Scatter(
            x=[sub["hop_number"].iloc[0]], y=[sub["orig_s"].iloc[0]],
            mode="markers", name=f"{mn} (orig)",
            marker=dict(size=11, color=color, symbol="diamond"),
            showlegend=True,
            hovertemplate=f"{mn} original<br>Spiciness: %{{y:.2f}}",
        ))
    fig.update_layout(
        title=dict(text="Spiciness by Hop (rewrite spiciness vs original)", font=dict(size=15)),
        xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"),
        yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_mh_scatter_trajectory(text_filter):
    """Build the lossiness-vs-drift trajectory across hops, one path per model.

    Points are the per-hop means joined in hop order; marker size grows with
    the point's position along the path.

    Args:
        text_filter: Prompt name to restrict to, or ``"All"`` to average over
            every text.

    Returns:
        The Plotly figure, or an empty figure when there is no hop data.
    """
    df = filter_hops(text_filter)
    if len(df) == 0:
        return go.Figure()
    agg = df.groupby(["model_name", "hop_number"]).agg(
        lossiness=("lossiness", "mean"), drift=("drift", "mean"),
    ).reset_index()
    fig = go.Figure()
    for mn in sorted(agg["model_name"].unique()):
        sub = agg[agg["model_name"] == mn].sort_values("hop_number")
        color = get_color(mn)
        fig.add_trace(go.Scatter(
            x=sub["lossiness"].tolist(), y=sub["drift"].tolist(),
            mode="lines+markers", name=mn,
            line=dict(color=color, width=2),
            marker=dict(size=[6 + i * 1.5 for i in range(len(sub))], color=color),
            text=[f"Hop {h}" for h in sub["hop_number"].tolist()],
            # Plotly hover labels break lines on <br>, not on newlines.
            hovertemplate=f"{mn} — %{{text}}<br>Lossiness: %{{x:.3f}}<br>Drift: %{{y:.3f}}",
        ))
    fig.update_layout(
        title=dict(text="Degradation Trajectory: Lossiness vs Drift over Hops", font=dict(size=15)),
        xaxis=dict(title="Lossiness →", gridcolor="#e5e7eb", rangemode="tozero"),
        yaxis=dict(title="Drift ↑", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=480, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_anim_scatter(text_filter, current_hop):
    """Build one animation frame of the lossiness × drift trajectory.

    For each model, the per-hop means up to ``current_hop`` are split into a
    dotted trail (all but the latest point) and a large marker at the latest
    point. Axis ranges are computed from all hops so they stay fixed between
    frames.

    Args:
        text_filter: Prompt name to restrict to, or ``"All"``.
        current_hop: Hop to display; clamped to ``[1, max hop in the data]``.

    Returns:
        The Plotly figure, or an empty figure when there is no hop data.
    """
    df = filter_hops(text_filter)
    if len(df) == 0:
        return go.Figure()
    max_hop = int(df["hop_number"].max())
    current_hop = max(1, min(int(current_hop), max_hop))
    agg_all = df.groupby(["model_name", "hop_number"]).agg(
        lossiness=("lossiness", "mean"), drift=("drift", "mean"),
    ).reset_index()
    fig = go.Figure()
    for mn in sorted(agg_all["model_name"].unique()):
        sub_all = agg_all[agg_all["model_name"] == mn].sort_values("hop_number")
        sub = sub_all[sub_all["hop_number"] <= current_hop]
        if len(sub) == 0:
            continue
        color = get_color(mn)
        trail = sub.iloc[:-1]
        tip = sub.iloc[[-1]]
        # Label the marker with the hop it really represents: a model with no
        # row at current_hop is shown at its latest earlier hop.
        tip_hop = int(tip["hop_number"].iloc[0])
        if len(trail) > 0:
            fig.add_trace(go.Scatter(
                x=trail["lossiness"].tolist(), y=trail["drift"].tolist(),
                mode="lines+markers", name=mn, legendgroup=mn,
                line=dict(color=color, width=2, dash="dot"),
                marker=dict(size=7, color=color, opacity=0.5),
                showlegend=False, hoverinfo="skip",
            ))
        fig.add_trace(go.Scatter(
            x=tip["lossiness"].tolist(), y=tip["drift"].tolist(),
            mode="markers", name=mn, legendgroup=mn,
            marker=dict(size=16, color=color, symbol="circle",
                        line=dict(color="white", width=2)),
            # Plotly hover labels break lines on <br>, not on newlines.
            hovertemplate=(
                f"{mn}<br>Hop {tip_hop}<br>"
                "Lossiness: %{x:.3f}<br>Drift: %{y:.3f}"
            ),
        ))
    # Dashed reference lines at lossiness = 0.5 and drift = 0.5.
    fig.add_shape(type="line", x0=0.5, x1=0.5, y0=0, y1=1,
                  xref="x", yref="paper",
                  line=dict(color="#9ca3af", dash="dash", width=1))
    fig.add_shape(type="line", x0=0, x1=1, y0=0.5, y1=0.5,
                  xref="paper", yref="y",
                  line=dict(color="#9ca3af", dash="dash", width=1))
    fig.update_layout(
        title=dict(
            text=f"Lossiness × Drift Trajectory — Hop {current_hop} / {max_hop}",
            font=dict(size=15),
        ),
        xaxis=dict(title="Lossiness →", gridcolor="#e5e7eb",
                   range=[-0.02, max(0.6, agg_all["lossiness"].max() + 0.05)]),
        yaxis=dict(title="Drift ↑", gridcolor="#e5e7eb",
                   range=[-0.01, max(0.3, agg_all["drift"].max() + 0.05)]),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=480, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_mh_summary_table():
    """Return the multi-hop summary table, sorted by model then text.

    A one-cell ``Info`` frame is returned when ``mh_summary_df`` is empty.
    Lossiness, drift and NLI retention are rounded to 4 decimals.
    """
    if not len(mh_summary_df):
        return pd.DataFrame({"Info": ["No multi-hop summary data available."]})

    # Source column -> display header, in output order.
    headers = {
        "model_name": "Model",
        "prompt_name": "Text",
        "score": "Score",
        "lossiness": "Lossiness",
        "drift": "Drift",
        "nli_retention": "NLI Retention",
    }
    table = mh_summary_df.copy()
    table["model_name"] = table["model"].map(display_name)
    table = table[list(headers)].copy().rename(columns=headers)
    table = table.sort_values(["Model", "Text"]).reset_index(drop=True)
    for column in ("Lossiness", "Drift", "NLI Retention"):
        table[column] = table[column].round(4)
    return table
def update_mh(text_filter):
    """Rebuild every multi-hop output for the selected text.

    Returns a 5-tuple: the lossiness, drift, spiciness and trajectory charts
    (each built through ``safe_plot``) followed by the summary table.
    """
    chart_builders = (
        build_mh_lossiness,
        build_mh_drift,
        build_mh_spiciness,
        build_mh_scatter_trajectory,
    )
    charts = tuple(safe_plot(builder, text_filter) for builder in chart_builders)
    return charts + (build_mh_summary_table(),)
# ── Gradio UI ──────────────────────────────────────────────────────────────────
# Markdown shown in the About tab. Blank lines separate the blocks so that
# headings, tables, the list and the footer are parsed as distinct elements;
# in particular, a text line directly after a table row would otherwise be
# rendered as one more table row.
ABOUT_MD = f"""
## BeigificationBench — Evaluation Engine v{EVAL_VERSION}

**Beigification** describes the tendency of LLMs to produce rewrites that are safe, bland, and
stylistically uniform — stripping out the distinctive voice, specificity, and informational density
of the source text.

### Metrics

| Metric | Description |
|--------|-------------|
| **Lossiness** | NLI-weighted information loss: prop loss (60%) + semantic distance (20%) + word deletion (20%) |
| **Drift** | Model collapse: spiciness loss (55%) + centroid pull (45%) |
| **Spiciness** | 6-component vividness score: sqrt-PPL, lexical richness, rare word density, word specificity, vivid modifiers, voice |
| **NLI Retention** | Fraction of source propositions preserved per NLI model |

### Benchmark Design

- **Single-hop** results average 3 independent replicates to reduce variance.
- **Multi-hop** results show degradation trajectories over 8 successive rewrites using run 154 (5 models × 3 texts × 8 hops).
- Texts span literary non-fiction, polemic, and sensitive personal essay genres.

### Models Evaluated

| Model | Provider | Single-hop | Multi-hop |
|-------|----------|:----------:|:---------:|
| Claude Opus 4.6 | Anthropic | ✓ | ✓ |
| GPT-5.2 | OpenAI | ✓ | ✓ |
| Gemini 3.1 Pro | Google | ✓ | ✓ |
| Llama 3.3 70B | Meta / HF | ✓ | ✓ |
| Llama 3.1 70B | Meta / HF | | ✓ |
| Qwen 2.5 72B | Alibaba / HF | ✓ | |

*Anonymous submission. Author information withheld for peer review.*
"""
# Page layout and event wiring. Every chart callback goes through safe_plot so
# that a failing builder renders an error placeholder instead of raising.
with gr.Blocks(title="BeigificationBench") as demo:
    gr.Markdown(
        f"# 📊 BeigificationBench\n"
        f"**Measuring How LLMs Flatten Text** | "
        f"Eval Engine v{EVAL_VERSION} | Space v{SPACE_VERSION}"
    )
    with gr.Tabs():
        # ── Tab 1: Explorer ────────────────────────────────────────────────────
        with gr.Tab("Interactive Explorer"):
            with gr.Row():
                cat_dd = gr.Dropdown(choices=get_categories(), value="All", label="Category")
                text_dd = gr.Dropdown(choices=get_texts(), value="All", label="Source Text")
                model_dd = gr.Dropdown(choices=get_models(), value="All", label="Model")
            scatter_plot = gr.Plot()
            with gr.Row():
                loss_plot = gr.Plot()
                drift_plot = gr.Plot()
            with gr.Row():
                spic_plot = gr.Plot()
                pca_plot = gr.Plot()
            gr.Markdown("### NLI Atom Detail *(select a specific text and model)*")
            nli_table = gr.Dataframe(wrap=True)

            def _update(cat, txt, mdl):
                return update_explorer(cat, txt, mdl)

            # Any dropdown change refreshes every Explorer output.
            for ctrl in [cat_dd, text_dd, model_dd]:
                ctrl.change(_update, [cat_dd, text_dd, model_dd],
                            [scatter_plot, loss_plot, drift_plot, spic_plot, pca_plot, nli_table])
            # Changing the category also resets the text choices to that category.
            cat_dd.change(update_text_choices, [cat_dd], [text_dd])
            demo.load(_update, [cat_dd, text_dd, model_dd],
                      [scatter_plot, loss_plot, drift_plot, spic_plot, pca_plot, nli_table])

        # ── Tab 2: Leaderboard ─────────────────────────────────────────────────
        with gr.Tab("Leaderboard"):
            gr.Markdown("Results averaged across 3 replicates per prompt × model combination.")
            lb_table = gr.Dataframe(value=build_leaderboard_df, wrap=False)
            # Callables are evaluated when the page loads; safe_plot keeps a
            # failing chart from breaking the tab.
            lb_chart = gr.Plot(value=lambda: safe_plot(build_leaderboard_chart))
            cat_heatmap = gr.Plot(value=lambda: safe_plot(build_category_heatmap))
            radar_plot = gr.Plot(value=lambda: safe_plot(build_metrics_radar))

        # ── Tab 3: Multi-Hop Trajectories ──────────────────────────────────────
        with gr.Tab("Multi-Hop Trajectories"):
            gr.Markdown(
                "### 8-Hop Rewrite Trajectories\n"
                "Each model rewrites the same text 8 times in succession. "
                "Charts show how Lossiness, Drift, and Spiciness evolve across hops, "
                "averaged across texts unless a specific text is selected."
            )
            mh_text_dd = gr.Dropdown(choices=get_mh_texts(), value="All", label="Source Text")
            mh_traj_plot = gr.Plot()
            with gr.Row():
                mh_loss_plot = gr.Plot()
                mh_drift_plot = gr.Plot()
            mh_spic_plot = gr.Plot()
            gr.Markdown("### Overall (hop 1 → 8) Summary")
            mh_summary_table = gr.Dataframe(wrap=False)

            def _update_mh(txt):
                return update_mh(txt)

            mh_text_dd.change(_update_mh, [mh_text_dd],
                              [mh_loss_plot, mh_drift_plot, mh_spic_plot, mh_traj_plot, mh_summary_table])
            demo.load(_update_mh, [mh_text_dd],
                      [mh_loss_plot, mh_drift_plot, mh_spic_plot, mh_traj_plot, mh_summary_table])

            # ── Animation ──────────────────────────────────────────────────────
            gr.Markdown("---\n### 🎬 Trajectory Animation")
            gr.Markdown(
                "Watch each model's position move through Lossiness × Drift space as hops accumulate. "
                "The large dot shows the current hop; the dotted trail shows the path so far."
            )
            anim_is_playing = gr.State(False)
            with gr.Row():
                anim_play_btn = gr.Button("▶ Play", size="sm", variant="primary", scale=0)
                anim_reset_btn = gr.Button("⟳ Reset", size="sm", scale=0)
                anim_hop_slider = gr.Slider(
                    minimum=1, maximum=8, step=1, value=1,
                    label="Hop", scale=2,
                )
            anim_plot = gr.Plot()
            anim_timer = gr.Timer(value=1.2, active=False)

            def _anim_init(txt):
                # Show hop 1 for the newly selected text.
                return 1, safe_plot(build_anim_scatter, txt, 1)

            def _anim_scrub(hop, txt):
                return safe_plot(build_anim_scatter, txt, hop)

            def _anim_toggle(playing):
                new_playing = not playing
                label = "⏸ Pause" if new_playing else "▶ Play"
                return new_playing, gr.Timer(active=new_playing), label

            def _anim_reset(txt):
                return False, gr.Timer(active=False), 1, safe_plot(build_anim_scatter, txt, 1), "▶ Play"

            def _anim_tick(playing, hop, txt):
                """Advance one hop; stop the timer once the last hop is shown."""
                max_hop = int(hops_df["hop_number"].max()) if len(hops_df) > 0 else 8
                if not playing or hop >= max_hop:
                    # Nothing left to play: switch playback off so the timer
                    # stops firing and the button returns to "Play".
                    return (False, gr.Timer(active=False), hop,
                            safe_plot(build_anim_scatter, txt, hop), "▶ Play")
                next_hop = hop + 1
                still_playing = next_hop < max_hop
                label = "⏸ Pause" if still_playing else "▶ Play"
                return (still_playing, gr.Timer(active=still_playing), next_hop,
                        safe_plot(build_anim_scatter, txt, next_hop), label)

            anim_play_btn.click(
                _anim_toggle,
                inputs=[anim_is_playing],
                outputs=[anim_is_playing, anim_timer, anim_play_btn],
            )
            anim_reset_btn.click(
                _anim_reset,
                inputs=[mh_text_dd],
                outputs=[anim_is_playing, anim_timer, anim_hop_slider, anim_plot, anim_play_btn],
            )
            anim_hop_slider.change(
                _anim_scrub,
                inputs=[anim_hop_slider, mh_text_dd],
                outputs=[anim_plot],
            )
            mh_text_dd.change(
                _anim_init,
                inputs=[mh_text_dd],
                outputs=[anim_hop_slider, anim_plot],
            )
            anim_timer.tick(
                _anim_tick,
                inputs=[anim_is_playing, anim_hop_slider, mh_text_dd],
                outputs=[anim_is_playing, anim_timer, anim_hop_slider, anim_plot, anim_play_btn],
            )
            demo.load(_anim_init, inputs=[mh_text_dd], outputs=[anim_hop_slider, anim_plot])

        # ── Tab 4: About ───────────────────────────────────────────────────────
        with gr.Tab("About"):
            gr.Markdown(ABOUT_MD)

demo.launch(ssr_mode=False)