""" BeigificationBench — Measuring How LLMs Flatten Text Anonymous submission | Evaluation Engine v1.0 """ import os import json import gradio as gr import pandas as pd import plotly.graph_objects as go from plotly.subplots import make_subplots import numpy as np SPACE_VERSION = "2026.05" EVAL_VERSION = "1.0" MODEL_META = { "claude-opus-4-6": ("Claude Opus 4.6", "Anthropic"), "gpt-5.2": ("GPT-5.2", "OpenAI"), "gemini-3.1-pro-preview": ("Gemini 3.1 Pro", "Google"), "meta-llama/Llama-3.3-70B-Instruct": ("Llama 3.3 70B", "Meta / HF"), "meta-llama/Llama-3.1-70B-Instruct": ("Llama 3.1 70B", "Meta / HF"), "Qwen/Qwen2.5-72B-Instruct": ("Qwen 2.5 72B", "Alibaba / HF"), } MODEL_COLORS = { "Claude Opus 4.6": "#7c3aed", "GPT-5.2": "#0891b2", "Gemini 3.1 Pro": "#059669", "Llama 3.3 70B": "#d97706", "Llama 3.1 70B": "#b45309", "Qwen 2.5 72B": "#db2777", } DATA_DIR = os.path.join(os.path.dirname(__file__), "data") def display_name(model_id): return MODEL_META.get(model_id, (model_id, ""))[0] def provider_name(model_id): return MODEL_META.get(model_id, ("", model_id))[1] def get_color(model_name): return MODEL_COLORS.get(model_name, "#6b7280") def safe_plot(fn, *args, **kwargs): try: return fn(*args, **kwargs) except Exception as e: fig = go.Figure() fig.add_annotation( text=f"Chart unavailable: {type(e).__name__}: {e}", x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False, font=dict(size=13, color="red"), ) fig.update_layout(height=300, paper_bgcolor="white", plot_bgcolor="white") return fig # ── Data loading ─────────────────────────────────────────────────────────────── def load_data(): try: df = pd.read_csv(os.path.join(DATA_DIR, "results.csv")) except Exception as e: print(f"WARNING: results.csv load failed: {e}") df = pd.DataFrame() try: with open(os.path.join(DATA_DIR, "nli_atoms.json")) as f: nli_atoms = json.load(f) except Exception: nli_atoms = [] try: with open(os.path.join(DATA_DIR, "pca_beige_points.json")) as f: pca_beige = json.load(f) except Exception: pca_beige = [] return df, nli_atoms, pca_beige def load_multihop_data(): try: hops_df = pd.read_csv(os.path.join(DATA_DIR, "multihop_hops.csv")) except Exception as e: print(f"WARNING: multihop_hops.csv load failed: {e}") hops_df = pd.DataFrame() try: summary_df = pd.read_csv(os.path.join(DATA_DIR, "multihop_summary.csv")) except Exception: summary_df = pd.DataFrame() return hops_df, summary_df results_df, nli_atoms_data, pca_beige_data = load_data() hops_df, mh_summary_df = load_multihop_data() # ── Helpers ──────────────────────────────────────────────────────────────────── def get_categories(): cats = results_df["category"].unique().tolist() if len(results_df) else [] return ["All"] + sorted(cats) def get_texts(category="All"): if len(results_df) == 0: return ["All"] if category == "All": texts = results_df["prompt_name"].unique().tolist() else: texts = results_df[results_df["category"] == category]["prompt_name"].unique().tolist() return ["All"] + sorted(texts) def get_models(): if len(results_df) == 0: return ["All"] return ["All"] + sorted(results_df["model"].map(display_name).unique().tolist()) def filter_df(category, text_name, model_filter): df = results_df.copy() df["model_name"] = df["model"].map(display_name) if category != "All": df = df[df["category"] == category] if text_name != "All": df = df[df["prompt_name"] == text_name] if model_filter != "All": df = df[df["model_name"] == model_filter] return df def get_mh_texts(): if len(hops_df) == 0: return ["All"] return ["All"] + sorted(hops_df["prompt_name"].unique().tolist()) def filter_hops(text_filter): df = hops_df.copy() df["model_name"] = df["model"].map(display_name) if text_filter != "All": df = df[df["prompt_name"] == text_filter] return df # ── Tab 1: Interactive Explorer ──────────────────────────────────────────────── def build_scatter(category, text_name, model_filter): df = filter_df(category, text_name, model_filter) fig = go.Figure() has_std = "lossiness_std" in df.columns and "drift_std" in df.columns for mn in sorted(df["model_name"].unique()): sub = df[df["model_name"] == mn] err_x = dict(type="data", array=sub["lossiness_std"].fillna(0).tolist(), visible=True, color=get_color(mn), thickness=1.5, width=4) if has_std else None err_y = dict(type="data", array=sub["drift_std"].fillna(0).tolist(), visible=True, color=get_color(mn), thickness=1.5, width=4) if has_std else None fig.add_trace(go.Scatter( x=sub["lossiness"].tolist(), y=sub["drift"].tolist(), mode="markers", name=mn, marker=dict(size=12, color=get_color(mn), opacity=0.85), error_x=err_x, error_y=err_y, text=sub["prompt_name"].tolist(), customdata=sub[["score", "nli_retention", "category", "lossiness_std", "drift_std"]].values.tolist(), hovertemplate=( "%{text}
" "Model: " + mn + "
" "Lossiness: %{x:.3f} ± %{customdata[3]:.3f}
" "Drift: %{y:.3f} ± %{customdata[4]:.3f}
" "Score: %{customdata[0]:.1f}
" "NLI Retention: %{customdata[1]:.1%}
" "Category: %{customdata[2]}" ), )) fig.add_shape(type="rect", x0=-0.02, y0=-0.02, x1=0.15, y1=0.05, fillcolor="rgba(22,163,74,0.08)", line=dict(color="rgba(22,163,74,0.3)", dash="dot"), layer="below") fig.add_annotation(x=0.07, y=0.04, text="High fidelity", showarrow=False, font=dict(color="#16a34a", size=10)) fig.update_layout( title=dict(text="Integrity Frontier: Lossiness vs Drift", font=dict(size=15)), xaxis=dict(title="Lossiness (information loss) →", gridcolor="#e5e7eb", rangemode="tozero"), yaxis=dict(title="Drift (stylistic collapse) ↑", gridcolor="#e5e7eb", rangemode="tozero"), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60), ) return fig def build_lossiness_breakdown(category, text_name, model_filter): df = filter_df(category, text_name, model_filter) has_std = "lossiness_std" in df.columns agg_spec = { "prop_loss": ("prop_loss", "mean"), "semantic_distance":("semantic_distance", "mean"), "word_deletion": ("word_deletion", "mean"), } if has_std: agg_spec["lossiness_std"] = ("lossiness_std", "mean") agg = df.groupby("model_name").agg(**agg_spec).reset_index().sort_values("model_name") models = agg["model_name"].tolist() # Total lossiness = weighted sum of sub-metrics totals = (agg["prop_loss"] * 0.6 + agg["semantic_distance"] * 0.2 + agg["word_deletion"] * 0.2).tolist() err = dict(type="data", array=agg["lossiness_std"].fillna(0).tolist(), visible=True, color="#374151", thickness=1.5, width=5) if has_std else None fig = go.Figure() fig.add_trace(go.Bar(name="Prop Loss (60%)", x=models, y=(agg["prop_loss"] * 0.6).tolist(), marker_color="#1e3a5f", text=[f"{v:.3f}" for v in (agg["prop_loss"] * 0.6)], textposition="inside")) fig.add_trace(go.Bar(name="Semantic Dist (20%)", x=models, y=(agg["semantic_distance"] * 0.2).tolist(), marker_color="#3498db", text=[f"{v:.3f}" for v in (agg["semantic_distance"] * 0.2)], textposition="inside")) # Top segment carries the error bar for total lossiness fig.add_trace(go.Bar(name="Word Deletion (20%)", x=models, y=(agg["word_deletion"] * 0.2).tolist(), marker_color="#95a5a6", text=[f"{v:.3f}" for v in (agg["word_deletion"] * 0.2)], textposition="inside", error_y=err)) fig.update_layout( barmode="stack", title=dict(text="Lossiness Sub-Metric Decomposition — error bars = replicate σ", font=dict(size=15)), yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"), xaxis=dict(tickangle=-15), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80), ) return fig def build_drift_breakdown(category, text_name, model_filter): df = filter_df(category, text_name, model_filter) has_std = "drift_std" in df.columns agg_spec = { "norm_delta_spiciness": ("norm_delta_spiciness", "mean"), "norm_pull": ("norm_pull", "mean"), } if has_std: agg_spec["drift_std"] = ("drift_std", "mean") agg = df.groupby("model_name").agg(**agg_spec).reset_index().sort_values("model_name") models = agg["model_name"].tolist() err = dict(type="data", array=agg["drift_std"].fillna(0).tolist(), visible=True, color="#374151", thickness=1.5, width=5) if has_std else None fig = go.Figure() fig.add_trace(go.Bar(name="Spiciness Loss (55%)", x=models, y=(agg["norm_delta_spiciness"] * 0.55).tolist(), marker_color="#e74c3c", text=[f"{v:.3f}" for v in (agg["norm_delta_spiciness"] * 0.55)], textposition="inside")) # Top segment carries the error bar for total drift fig.add_trace(go.Bar(name="Centroid Pull (45%)", x=models, y=(agg["norm_pull"] * 0.45).tolist(), marker_color="#f1c40f", text=[f"{v:.3f}" for v in (agg["norm_pull"] * 0.45)], textposition="inside", error_y=err)) fig.update_layout( barmode="stack", title=dict(text="Drift Sub-Metric Decomposition — error bars = replicate σ", font=dict(size=15)), yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"), xaxis=dict(tickangle=-15), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80), ) return fig def build_spiciness_comparison(category, text_name, model_filter): df = filter_df(category, text_name, model_filter) has_std = "orig_spiciness_std" in df.columns agg_spec = {"orig": ("orig_spiciness", "mean"), "rew": ("rew_spiciness", "mean")} if has_std: agg_spec["orig_std"] = ("orig_spiciness_std", "mean") agg_spec["rew_std"] = ("rew_spiciness_std", "mean") agg = df.groupby("model_name").agg(**agg_spec).reset_index().sort_values("model_name") models = agg["model_name"].tolist() x_pos = np.arange(len(models)) width = 0.35 orig_err = dict(type="data", array=agg["orig_std"].fillna(0).tolist(), visible=True, color="#374151", thickness=1.5, width=5) if has_std else None rew_err = dict(type="data", array=agg["rew_std"].fillna(0).tolist(), visible=True, color="#374151", thickness=1.5, width=5) if has_std else None fig = go.Figure() fig.add_trace(go.Bar(name="Original", x=[x - width/2 for x in x_pos], y=agg["orig"].tolist(), marker_color="#6366f1", width=width, error_y=orig_err, text=[f"{v:.1f}" for v in agg["orig"]], textposition="outside")) fig.add_trace(go.Bar(name="Rewrite", x=[x + width/2 for x in x_pos], y=agg["rew"].tolist(), marker_color="#a5b4fc", width=width, error_y=rew_err, text=[f"{v:.1f}" for v in agg["rew"]], textposition="outside")) fig.update_layout( title=dict(text="Spiciness: Original vs Rewrite — error bars = replicate σ", font=dict(size=15)), xaxis=dict(tickvals=list(x_pos), ticktext=models, tickangle=-15), yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80), ) return fig def build_pca_plot(category, text_name, model_filter): df = filter_df(category, text_name, model_filter) valid = df.dropna(subset=["pca_original_x", "pca_rewrite_x"]) if len(valid) == 0: fig = go.Figure() fig.add_annotation(text="No PCA data available for this selection", x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False) fig.update_layout(height=450, paper_bgcolor="white", plot_bgcolor="white") return fig fig = go.Figure() beige_shown = False for entry in pca_beige_data: if not beige_shown and entry.get("beige_points"): bx = [p["x"] for p in entry["beige_points"]] by = [p["y"] for p in entry["beige_points"]] fig.add_trace(go.Scatter(x=bx, y=by, mode="markers", name="Beige Platitudes", marker=dict(size=8, color="lightgray", opacity=0.5, symbol="diamond"), hoverinfo="skip")) beige_shown = True break for mn in sorted(valid["model_name"].unique()): sub = valid[valid["model_name"] == mn] color = get_color(mn) for idx, (_, row) in enumerate(sub.iterrows()): fig.add_trace(go.Scatter( x=[row["pca_original_x"], row["pca_rewrite_x"]], y=[row["pca_original_y"], row["pca_rewrite_y"]], mode="lines+markers", line=dict(color=color, width=2), marker=dict(size=[8, 10], color=color, symbol=["circle", "arrow-right"]), name=mn, legendgroup=mn, showlegend=(idx == 0), hovertemplate=f"{mn}
{row['prompt_name']}", )) fig.update_layout( title=dict(text="Beigification Vector — PCA Projection", font=dict(size=15)), xaxis=dict(title="PC1", gridcolor="#e5e7eb"), yaxis=dict(title="PC2", gridcolor="#e5e7eb", scaleanchor="x", scaleratio=1), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60), ) return fig def build_nli_table(category, text_name, model_filter): if text_name == "All" or model_filter == "All": return pd.DataFrame({"Info": ["Select a specific text AND model to view NLI atom details."]}) model_id = next((mid for mid, (dn, _) in MODEL_META.items() if dn == model_filter), None) if not model_id: return pd.DataFrame({"Error": ["Model not found"]}) for entry in nli_atoms_data: if entry["model"] == model_id and entry["prompt_name"] == text_name: atoms = entry["nli_atoms"] rows = [{"#": i+1, "Atom": a["text"][:120] + ("..." if len(a["text"]) > 120 else ""), "NLI Score": f"{a['score']:.4f}", "Status": "Retained" if a["score"] >= 0.5 else "Lost"} for i, a in enumerate(atoms)] return pd.DataFrame(rows) return pd.DataFrame({"Info": ["No NLI atom data available for this selection."]}) def update_explorer(category, text_name, model_filter): return ( safe_plot(build_scatter, category, text_name, model_filter), safe_plot(build_lossiness_breakdown, category, text_name, model_filter), safe_plot(build_drift_breakdown, category, text_name, model_filter), safe_plot(build_spiciness_comparison, category, text_name, model_filter), safe_plot(build_pca_plot, category, text_name, model_filter), build_nli_table(category, text_name, model_filter), ) def update_text_choices(category): return gr.update(choices=get_texts(category), value="All") # ── Tab 2: Leaderboard ───────────────────────────────────────────────────────── def build_leaderboard_df(): df = results_df.copy() df["model_name"] = df["model"].map(display_name) df["provider"] = df["model"].map(provider_name) agg = df.groupby(["model_name", "provider"]).agg( avg_score=("score", "mean"), avg_lossiness=("lossiness", "mean"), avg_drift=("drift", "mean"), avg_nli_retention=("nli_retention", "mean"), avg_spiciness_delta=("spiciness_delta", "mean"), avg_orig_spiciness=("orig_spiciness", "mean"), avg_rew_spiciness=("rew_spiciness", "mean"), n_texts=("score", "count"), ).reset_index().sort_values("avg_score", ascending=False).reset_index(drop=True) agg.insert(0, "Rank", range(1, len(agg) + 1)) out = agg.rename(columns={ "model_name": "Model", "provider": "Provider", "avg_score": "Avg Score", "avg_lossiness": "Avg Lossiness", "avg_drift": "Avg Drift", "avg_nli_retention": "NLI Retention", "avg_spiciness_delta": "Spiciness Δ", "avg_orig_spiciness": "Orig Spiciness", "avg_rew_spiciness": "Rew Spiciness", "n_texts": "Texts", }) out["Avg Score"] = out["Avg Score"].round(1) for c in ["Avg Lossiness", "Avg Drift", "NLI Retention", "Spiciness Δ", "Orig Spiciness", "Rew Spiciness"]: out[c] = out[c].round(4) return out def build_leaderboard_chart(): df = results_df.copy() df["model_name"] = df["model"].map(display_name) has_std = "score_std" in df.columns # Per-model: mean score and mean of per-replicate std (avg replication noise) agg_cols = {"score": ("score", "mean")} if has_std: agg_cols["score_std_mean"] = ("score_std", "mean") agg = df.groupby("model_name").agg(**agg_cols).reset_index().sort_values("score", ascending=False) models = agg["model_name"].tolist() err = agg["score_std_mean"].fillna(0).tolist() if has_std else None fig = go.Figure() fig.add_trace(go.Bar( name="Avg Score", x=models, y=agg["score"].tolist(), marker_color=[get_color(m) for m in models], text=[f"{v:.1f}" for v in agg["score"]], textposition="outside", error_y=dict(type="data", array=err, visible=True, color="#374151", thickness=1.5, width=6) if err else None, )) fig.update_layout( title=dict(text="Overall Preservation Score by Model — error bars = replicate σ", font=dict(size=15)), yaxis=dict(title="Score (0–100)", gridcolor="#e5e7eb", range=[0, 110]), xaxis=dict(tickangle=-15), plot_bgcolor="white", paper_bgcolor="white", showlegend=False, autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=80), ) return fig def build_category_heatmap(): df = results_df.copy() df["model_name"] = df["model"].map(display_name) pivot = df.pivot_table(index="category", columns="model_name", values="score", aggfunc="mean") z, x, y = pivot.values, pivot.columns.tolist(), pivot.index.tolist() fig = go.Figure(data=go.Heatmap( z=z, x=x, y=y, colorscale=[[0,"#fef2f2"],[0.4,"#fde68a"],[0.7,"#bbf7d0"],[1.0,"#1d4ed8"]], text=[[f"{v:.0f}" if not np.isnan(v) else "" for v in row] for row in z], texttemplate="%{text}", textfont=dict(size=11), colorbar=dict(title="Score", thickness=12), zmin=60, zmax=100, )) fig.update_layout( title=dict(text="Score by Category × Model", font=dict(size=15)), xaxis=dict(side="top", tickangle=-20), yaxis=dict(autorange="reversed"), autosize=True, height=400, margin=dict(l=160, r=10, t=70, b=10), plot_bgcolor="white", paper_bgcolor="white", ) return fig def build_metrics_radar(): df = results_df.copy() df["model_name"] = df["model"].map(display_name) agg = df.groupby("model_name").agg( nli_retention=("nli_retention", "mean"), low_lossiness=("lossiness", lambda x: 1 - x.mean()), low_drift=("drift", lambda x: 1 - x.mean()), spiciness_preservation=("spiciness_delta", lambda x: 1 - abs(x.mean()) / 10), low_word_deletion=("word_deletion", lambda x: 1 - x.mean()), ).reset_index() cats = ["NLI Retention", "Low Lossiness", "Low Drift", "Spiciness Preservation", "Low Word Deletion"] fig = go.Figure() for _, row in agg.iterrows(): vals = [row["nli_retention"], row["low_lossiness"], row["low_drift"], row["spiciness_preservation"], row["low_word_deletion"]] vals += [vals[0]] fig.add_trace(go.Scatterpolar(r=vals, theta=cats + [cats[0]], fill="toself", name=row["model_name"], line=dict(color=get_color(row["model_name"])))) fig.update_layout( polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title=dict(text="Multi-Metric Capability Radar", font=dict(size=15)), legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5), autosize=True, height=480, margin=dict(l=60, r=60, t=70, b=80), paper_bgcolor="white", ) return fig # ── Tab 3: Multi-Hop Trajectories ────────────────────────────────────────────── def build_mh_lossiness(text_filter): df = filter_hops(text_filter) if len(df) == 0: return go.Figure() agg = df.groupby(["model_name", "hop_number"])["lossiness"].mean().reset_index() fig = go.Figure() for mn in sorted(agg["model_name"].unique()): sub = agg[agg["model_name"] == mn].sort_values("hop_number") fig.add_trace(go.Scatter( x=sub["hop_number"].tolist(), y=sub["lossiness"].tolist(), mode="lines+markers", name=mn, line=dict(color=get_color(mn), width=2), marker=dict(size=7), hovertemplate=f"{mn}
Hop %{{x}}
Lossiness: %{{y:.3f}}", )) fig.update_layout( title=dict(text="Lossiness by Hop (information loss per rewrite)", font=dict(size=15)), xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"), yaxis=dict(title="Lossiness →", gridcolor="#e5e7eb", rangemode="tozero"), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=60), ) return fig def build_mh_drift(text_filter): df = filter_hops(text_filter) if len(df) == 0: return go.Figure() agg = df.groupby(["model_name", "hop_number"])["drift"].mean().reset_index() fig = go.Figure() for mn in sorted(agg["model_name"].unique()): sub = agg[agg["model_name"] == mn].sort_values("hop_number") fig.add_trace(go.Scatter( x=sub["hop_number"].tolist(), y=sub["drift"].tolist(), mode="lines+markers", name=mn, line=dict(color=get_color(mn), width=2), marker=dict(size=7), hovertemplate=f"{mn}
Hop %{{x}}
Drift: %{{y:.3f}}", )) fig.update_layout( title=dict(text="Drift by Hop (stylistic collapse per rewrite)", font=dict(size=15)), xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"), yaxis=dict(title="Drift ↑", gridcolor="#e5e7eb", rangemode="tozero"), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=60), ) return fig def build_mh_spiciness(text_filter): df = filter_hops(text_filter) if len(df) == 0: return go.Figure() agg = df.groupby(["model_name", "hop_number"]).agg( orig_s=("orig_spiciness", "mean"), rew_s=("rew_spiciness", "mean"), ).reset_index() fig = go.Figure() shown = set() for mn in sorted(agg["model_name"].unique()): sub = agg[agg["model_name"] == mn].sort_values("hop_number") color = get_color(mn) fig.add_trace(go.Scatter( x=sub["hop_number"].tolist(), y=sub["rew_s"].tolist(), mode="lines+markers", name=mn, line=dict(color=color, width=2), marker=dict(size=7), hovertemplate=f"{mn}
Hop %{{x}}
Rewrite Spiciness: %{{y:.2f}}", )) if mn not in shown: if not sub.empty: fig.add_trace(go.Scatter( x=[sub["hop_number"].iloc[0]], y=[sub["orig_s"].iloc[0]], mode="markers", name=f"{mn} (orig)", marker=dict(size=11, color=color, symbol="diamond"), showlegend=True, hovertemplate=f"{mn} original
Spiciness: %{{y:.2f}}", )) shown.add(mn) fig.update_layout( title=dict(text="Spiciness by Hop (rewrite spiciness vs original)", font=dict(size=15)), xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"), yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=60), ) return fig def build_mh_scatter_trajectory(text_filter): df = filter_hops(text_filter) if len(df) == 0: return go.Figure() agg = df.groupby(["model_name", "hop_number"]).agg( lossiness=("lossiness", "mean"), drift=("drift", "mean"), ).reset_index() fig = go.Figure() for mn in sorted(agg["model_name"].unique()): sub = agg[agg["model_name"] == mn].sort_values("hop_number") color = get_color(mn) fig.add_trace(go.Scatter( x=sub["lossiness"].tolist(), y=sub["drift"].tolist(), mode="lines+markers", name=mn, line=dict(color=color, width=2), marker=dict(size=[6 + i * 1.5 for i in range(len(sub))], color=color), text=[f"Hop {h}" for h in sub["hop_number"].tolist()], hovertemplate=f"{mn} — %{{text}}
Lossiness: %{{x:.3f}}
Drift: %{{y:.3f}}", )) fig.update_layout( title=dict(text="Degradation Trajectory: Lossiness vs Drift over Hops", font=dict(size=15)), xaxis=dict(title="Lossiness →", gridcolor="#e5e7eb", rangemode="tozero"), yaxis=dict(title="Drift ↑", gridcolor="#e5e7eb", rangemode="tozero"), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=480, margin=dict(l=50, r=10, t=60, b=60), ) return fig def build_anim_scatter(text_filter, current_hop): df = filter_hops(text_filter) if len(df) == 0: return go.Figure() max_hop = int(df["hop_number"].max()) current_hop = max(1, min(int(current_hop), max_hop)) agg_all = df.groupby(["model_name", "hop_number"]).agg( lossiness=("lossiness", "mean"), drift=("drift", "mean"), ).reset_index() fig = go.Figure() for mn in sorted(agg_all["model_name"].unique()): sub_all = agg_all[agg_all["model_name"] == mn].sort_values("hop_number") sub = sub_all[sub_all["hop_number"] <= current_hop] if len(sub) == 0: continue color = get_color(mn) trail = sub.iloc[:-1] tip = sub.iloc[[-1]] if len(trail) > 0: fig.add_trace(go.Scatter( x=trail["lossiness"].tolist(), y=trail["drift"].tolist(), mode="lines+markers", name=mn, legendgroup=mn, line=dict(color=color, width=2, dash="dot"), marker=dict(size=7, color=color, opacity=0.5), showlegend=False, hoverinfo="skip", )) fig.add_trace(go.Scatter( x=tip["lossiness"].tolist(), y=tip["drift"].tolist(), mode="markers", name=mn, legendgroup=mn, marker=dict(size=16, color=color, symbol="circle", line=dict(color="white", width=2)), hovertemplate=( f"{mn}
Hop {current_hop}
" "Lossiness: %{x:.3f}
Drift: %{y:.3f}" ), )) fig.add_shape(type="line", x0=0.5, x1=0.5, y0=0, y1=1, xref="x", yref="paper", line=dict(color="#9ca3af", dash="dash", width=1)) fig.add_shape(type="line", x0=0, x1=1, y0=0.5, y1=0.5, xref="paper", yref="y", line=dict(color="#9ca3af", dash="dash", width=1)) fig.update_layout( title=dict( text=f"Lossiness × Drift Trajectory — Hop {current_hop} / {max_hop}", font=dict(size=15), ), xaxis=dict(title="Lossiness →", gridcolor="#e5e7eb", range=[-0.02, max(0.6, agg_all["lossiness"].max() + 0.05)]), yaxis=dict(title="Drift ↑", gridcolor="#e5e7eb", range=[-0.01, max(0.3, agg_all["drift"].max() + 0.05)]), plot_bgcolor="white", paper_bgcolor="white", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), autosize=True, height=480, margin=dict(l=50, r=10, t=60, b=60), ) return fig def build_mh_summary_table(): if len(mh_summary_df) == 0: return pd.DataFrame({"Info": ["No multi-hop summary data available."]}) df = mh_summary_df.copy() df["model_name"] = df["model"].map(display_name) out = df[["model_name", "prompt_name", "score", "lossiness", "drift", "nli_retention"]].copy() out = out.rename(columns={"model_name": "Model", "prompt_name": "Text", "score": "Score", "lossiness": "Lossiness", "drift": "Drift", "nli_retention": "NLI Retention"}) out = out.sort_values(["Model", "Text"]).reset_index(drop=True) for c in ["Lossiness", "Drift", "NLI Retention"]: out[c] = out[c].round(4) return out def update_mh(text_filter): return ( safe_plot(build_mh_lossiness, text_filter), safe_plot(build_mh_drift, text_filter), safe_plot(build_mh_spiciness, text_filter), safe_plot(build_mh_scatter_trajectory, text_filter), build_mh_summary_table(), ) # ── Gradio UI ────────────────────────────────────────────────────────────────── ABOUT_MD = f""" ## BeigificationBench — Evaluation Engine v{EVAL_VERSION} **Beigification** describes the tendency of LLMs to produce rewrites that are safe, bland, and stylistically uniform — stripping out the distinctive voice, specificity, and informational density of the source text. ### Metrics | Metric | Description | |--------|-------------| | **Lossiness** | NLI-weighted information loss: prop loss (60%) + semantic distance (20%) + word deletion (20%) | | **Drift** | Model collapse: spiciness loss (55%) + centroid pull (45%) | | **Spiciness** | 6-component vividness score: sqrt-PPL, lexical richness, rare word density, word specificity, vivid modifiers, voice | | **NLI Retention** | Fraction of source propositions preserved per NLI model | ### Benchmark Design - **Single-hop** results average 3 independent replicates to reduce variance. - **Multi-hop** results show degradation trajectories over 8 successive rewrites using run 154 (5 models × 3 texts × 8 hops). - Texts span literary non-fiction, polemic, and sensitive personal essay genres. ### Models Evaluated | Model | Provider | Single-hop | Multi-hop | |-------|----------|:----------:|:---------:| | Claude Opus 4.6 | Anthropic | ✓ | ✓ | | GPT-5.2 | OpenAI | ✓ | ✓ | | Gemini 3.1 Pro | Google | ✓ | ✓ | | Llama 3.3 70B | Meta / HF | ✓ | ✓ | | Llama 3.1 70B | Meta / HF | | ✓ | | Qwen 2.5 72B | Alibaba / HF | ✓ | | *Anonymous submission. Author information withheld for peer review.* """ with gr.Blocks(title="BeigificationBench") as demo: gr.Markdown( f"# 📊 BeigificationBench\n" f"**Measuring How LLMs Flatten Text**  |  " f"Eval Engine v{EVAL_VERSION}  |  Space v{SPACE_VERSION}" ) with gr.Tabs(): # ── Tab 1: Explorer ──────────────────────────────────────────────────── with gr.Tab("Interactive Explorer"): with gr.Row(): cat_dd = gr.Dropdown(choices=get_categories(), value="All", label="Category") text_dd = gr.Dropdown(choices=get_texts(), value="All", label="Source Text") model_dd = gr.Dropdown(choices=get_models(), value="All", label="Model") scatter_plot = gr.Plot() with gr.Row(): loss_plot = gr.Plot() drift_plot = gr.Plot() with gr.Row(): spic_plot = gr.Plot() pca_plot = gr.Plot() gr.Markdown("### NLI Atom Detail *(select a specific text and model)*") nli_table = gr.Dataframe(wrap=True) def _update(cat, txt, mdl): return update_explorer(cat, txt, mdl) for ctrl in [cat_dd, text_dd, model_dd]: ctrl.change(_update, [cat_dd, text_dd, model_dd], [scatter_plot, loss_plot, drift_plot, spic_plot, pca_plot, nli_table]) cat_dd.change(update_text_choices, [cat_dd], [text_dd]) demo.load(_update, [cat_dd, text_dd, model_dd], [scatter_plot, loss_plot, drift_plot, spic_plot, pca_plot, nli_table]) # ── Tab 2: Leaderboard ───────────────────────────────────────────────── with gr.Tab("Leaderboard"): gr.Markdown("Results averaged across 3 replicates per prompt × model combination.") lb_table = gr.Dataframe(value=build_leaderboard_df, wrap=False) lb_chart = gr.Plot(value=build_leaderboard_chart) cat_heatmap = gr.Plot(value=build_category_heatmap) radar_plot = gr.Plot(value=build_metrics_radar) # ── Tab 3: Multi-Hop Trajectories ────────────────────────────────────── with gr.Tab("Multi-Hop Trajectories"): gr.Markdown( "### 8-Hop Rewrite Trajectories\n" "Each model rewrites the same text 8 times in succession. " "Charts show how Lossiness, Drift, and Spiciness evolve across hops, " "averaged across texts unless a specific text is selected." ) mh_text_dd = gr.Dropdown(choices=get_mh_texts(), value="All", label="Source Text") mh_traj_plot = gr.Plot() with gr.Row(): mh_loss_plot = gr.Plot() mh_drift_plot = gr.Plot() mh_spic_plot = gr.Plot() gr.Markdown("### Overall (hop 1 → 8) Summary") mh_summary_table = gr.Dataframe(wrap=False) def _update_mh(txt): return update_mh(txt) mh_text_dd.change(_update_mh, [mh_text_dd], [mh_loss_plot, mh_drift_plot, mh_spic_plot, mh_traj_plot, mh_summary_table]) demo.load(_update_mh, [mh_text_dd], [mh_loss_plot, mh_drift_plot, mh_spic_plot, mh_traj_plot, mh_summary_table]) # ── Animation ────────────────────────────────────────────────────── gr.Markdown("---\n### 🎬 Trajectory Animation") gr.Markdown( "Watch each model's position move through Lossiness × Drift space as hops accumulate. " "The large dot shows the current hop; the dotted trail shows the path so far." ) anim_is_playing = gr.State(False) with gr.Row(): anim_play_btn = gr.Button("▶ Play", size="sm", variant="primary", scale=0) anim_reset_btn = gr.Button("⟳ Reset", size="sm", scale=0) anim_hop_slider = gr.Slider( minimum=1, maximum=8, step=1, value=1, label="Hop", scale=2, ) anim_plot = gr.Plot() anim_timer = gr.Timer(value=1.2, active=False) def _anim_init(txt): return 1, build_anim_scatter(txt, 1) def _anim_scrub(hop, txt): return safe_plot(build_anim_scatter, txt, hop) def _anim_toggle(playing): new_playing = not playing label = "⏸ Pause" if new_playing else "▶ Play" return new_playing, gr.Timer(active=new_playing), label def _anim_reset(txt): return False, gr.Timer(active=False), 1, safe_plot(build_anim_scatter, txt, 1), "▶ Play" def _anim_tick(playing, hop, txt): if not playing: return hop, safe_plot(build_anim_scatter, txt, hop) max_hop = int(hops_df["hop_number"].max()) if len(hops_df) > 0 else 8 if hop >= max_hop: return hop, safe_plot(build_anim_scatter, txt, hop) next_hop = hop + 1 return next_hop, safe_plot(build_anim_scatter, txt, next_hop) anim_play_btn.click( _anim_toggle, inputs=[anim_is_playing], outputs=[anim_is_playing, anim_timer, anim_play_btn], ) anim_reset_btn.click( _anim_reset, inputs=[mh_text_dd], outputs=[anim_is_playing, anim_timer, anim_hop_slider, anim_plot, anim_play_btn], ) anim_hop_slider.change( _anim_scrub, inputs=[anim_hop_slider, mh_text_dd], outputs=[anim_plot], ) mh_text_dd.change( _anim_init, inputs=[mh_text_dd], outputs=[anim_hop_slider, anim_plot], ) anim_timer.tick( _anim_tick, inputs=[anim_is_playing, anim_hop_slider, mh_text_dd], outputs=[anim_hop_slider, anim_plot], ) demo.load(_anim_init, inputs=[mh_text_dd], outputs=[anim_hop_slider, anim_plot]) # ── Tab 4: About ─────────────────────────────────────────────────────── with gr.Tab("About"): gr.Markdown(ABOUT_MD) demo.launch(ssr_mode=False)