| """ |
| BeigificationBench — Measuring How LLMs Flatten Text |
| Anonymous submission | Evaluation Engine v1.0 |
| """ |
|
|
| import os |
| import json |
| import gradio as gr |
| import pandas as pd |
| import plotly.graph_objects as go |
| from plotly.subplots import make_subplots |
| import numpy as np |
|
|
# Version stamps interpolated into the page header and the About text.
SPACE_VERSION = "2026.05"
EVAL_VERSION = "1.0"


# Maps the raw model identifier stored in the data files' "model" column to a
# (display name, provider) pair.
MODEL_META = {
    "claude-opus-4-6": ("Claude Opus 4.6", "Anthropic"),
    "gpt-5.2": ("GPT-5.2", "OpenAI"),
    "gemini-3.1-pro-preview": ("Gemini 3.1 Pro", "Google"),
    "meta-llama/Llama-3.3-70B-Instruct": ("Llama 3.3 70B", "Meta / HF"),
    "meta-llama/Llama-3.1-70B-Instruct": ("Llama 3.1 70B", "Meta / HF"),
    "Qwen/Qwen2.5-72B-Instruct": ("Qwen 2.5 72B", "Alibaba / HF"),
}


# Trace colour per display name (the keys mirror the display names in
# MODEL_META); get_color() falls back to grey for any other name.
MODEL_COLORS = {
    "Claude Opus 4.6": "#7c3aed",
    "GPT-5.2": "#0891b2",
    "Gemini 3.1 Pro": "#059669",
    "Llama 3.3 70B": "#d97706",
    "Llama 3.1 70B": "#b45309",
    "Qwen 2.5 72B": "#db2777",
}


# Directory of the CSV / JSON result files, resolved next to this module.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
|
def display_name(model_id):
    """Return the display name registered for *model_id* in ``MODEL_META``.

    An identifier that is not registered is returned unchanged.
    """
    meta = MODEL_META.get(model_id)
    if meta is None:
        return model_id
    return meta[0]
|
|
def provider_name(model_id):
    """Return the provider registered for *model_id* in ``MODEL_META``.

    For an identifier that is not registered, the identifier itself is
    returned in place of a provider.
    """
    meta = MODEL_META.get(model_id)
    if meta is None:
        return model_id
    return meta[1]
|
|
def get_color(model_name):
    """Return the hex colour assigned to *model_name*, or grey if none is."""
    try:
        return MODEL_COLORS[model_name]
    except KeyError:
        return "#6b7280"
|
|
def safe_plot(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and return whatever it returns.

    If the call raises an ``Exception``, a placeholder figure is returned
    instead: an empty white figure whose only content is a red annotation
    giving the exception type and message.
    """
    try:
        result = fn(*args, **kwargs)
    except Exception as exc:
        message = f"Chart unavailable: {type(exc).__name__}: {exc}"
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            showarrow=False,
            font=dict(size=13, color="red"),
        )
        placeholder.update_layout(
            height=300,
            paper_bgcolor="white",
            plot_bgcolor="white",
        )
        return placeholder
    return result
|
|
| |
|
|
def load_data():
    """Load the single-hop result files from ``DATA_DIR``.

    Returns:
        A ``(df, nli_atoms, pca_beige)`` tuple holding the contents of
        ``results.csv`` (as a DataFrame), ``nli_atoms.json`` and
        ``pca_beige_points.json``.  Loading is best-effort: a file that cannot
        be read is reported on stdout and replaced by an empty DataFrame or an
        empty list, so all three values are always returned.
    """
    try:
        df = pd.read_csv(os.path.join(DATA_DIR, "results.csv"))
    except Exception as e:
        print(f"WARNING: results.csv load failed: {e}")
        df = pd.DataFrame()

    def _load_json_or_empty(filename):
        # Same best-effort policy as the CSV above, but the failure is now
        # reported instead of being swallowed silently.
        try:
            with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            print(f"WARNING: {filename} load failed: {e}")
            return []

    nli_atoms = _load_json_or_empty("nli_atoms.json")
    pca_beige = _load_json_or_empty("pca_beige_points.json")
    return df, nli_atoms, pca_beige
|
|
def load_multihop_data():
    """Load the multi-hop result files from ``DATA_DIR``.

    Returns:
        A ``(hops_df, summary_df)`` tuple read from ``multihop_hops.csv`` and
        ``multihop_summary.csv``.  Loading is best-effort: a file that cannot
        be read is reported on stdout and replaced by an empty DataFrame.
    """
    try:
        hops_df = pd.read_csv(os.path.join(DATA_DIR, "multihop_hops.csv"))
    except Exception as e:
        print(f"WARNING: multihop_hops.csv load failed: {e}")
        hops_df = pd.DataFrame()
    try:
        summary_df = pd.read_csv(os.path.join(DATA_DIR, "multihop_summary.csv"))
    except Exception as e:
        # Previously swallowed silently; report it like the hops file.
        print(f"WARNING: multihop_summary.csv load failed: {e}")
        summary_df = pd.DataFrame()
    return hops_df, summary_df
|
|
# Load every data file once at import time. The filter and chart-builder
# functions below read these module-level objects directly.
results_df, nli_atoms_data, pca_beige_data = load_data()
hops_df, mh_summary_df = load_multihop_data()
|
|
| |
|
|
def get_categories():
    """Return the Category dropdown choices.

    The list is ``"All"`` followed by the distinct values of the ``category``
    column of ``results_df`` in sorted order.  Missing (NaN) categories are
    skipped, because mixing them with strings would make ``sorted`` raise.
    With no results loaded, only ``["All"]`` is returned.
    """
    if len(results_df) == 0:
        return ["All"]
    cats = results_df["category"].dropna().unique().tolist()
    return ["All"] + sorted(cats)
|
|
def get_texts(category="All"):
    """Return the Source Text dropdown choices for *category*.

    The list is ``"All"`` followed by the sorted distinct ``prompt_name``
    values of ``results_df``, restricted to rows of *category* unless it is
    ``"All"``.  Missing (NaN) names are skipped, because mixing them with
    strings would make ``sorted`` raise.  With no results loaded, only
    ``["All"]`` is returned.
    """
    if len(results_df) == 0:
        return ["All"]
    if category == "All":
        rows = results_df
    else:
        rows = results_df[results_df["category"] == category]
    texts = rows["prompt_name"].dropna().unique().tolist()
    return ["All"] + sorted(texts)
|
|
def get_models():
    """Return the Model dropdown choices.

    The list is ``"All"`` followed by the sorted distinct display names of the
    ``model`` column of ``results_df``.  Missing (NaN) model ids are skipped,
    because mixing them with strings would make ``sorted`` raise.  With no
    results loaded, only ``["All"]`` is returned.
    """
    if len(results_df) == 0:
        return ["All"]
    names = results_df["model"].dropna().map(display_name).unique().tolist()
    return ["All"] + sorted(names)
|
|
def filter_df(category, text_name, model_filter):
    """Return a filtered copy of ``results_df`` with a ``model_name`` column.

    ``model_name`` holds the display name of each row's ``model``.  Each
    argument is either ``"All"`` (no restriction) or the value that the
    ``category`` / ``prompt_name`` / ``model_name`` column must equal.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    constraints = (
        ("category", category),
        ("prompt_name", text_name),
        ("model_name", model_filter),
    )
    for column, wanted in constraints:
        if wanted != "All":
            frame = frame[frame[column] == wanted]
    return frame
|
|
def get_mh_texts():
    """Return the multi-hop Source Text dropdown choices.

    The list is ``"All"`` followed by the sorted distinct ``prompt_name``
    values of ``hops_df``.  Missing (NaN) names are skipped, because mixing
    them with strings would make ``sorted`` raise.  With no multi-hop data
    loaded, only ``["All"]`` is returned.
    """
    if len(hops_df) == 0:
        return ["All"]
    texts = hops_df["prompt_name"].dropna().unique().tolist()
    return ["All"] + sorted(texts)
|
|
def filter_hops(text_filter):
    """Return a filtered copy of ``hops_df`` with a ``model_name`` column.

    ``model_name`` holds the display name of each row's ``model``.  Rows are
    restricted to ``prompt_name == text_filter`` unless *text_filter* is
    ``"All"``.

    When no multi-hop data is loaded the empty frame is returned as-is.
    ``load_multihop_data`` falls back to a column-less DataFrame, so indexing
    ``"model"`` would raise before callers reach their own ``len(df) == 0``
    checks.
    """
    df = hops_df.copy()
    if df.empty:
        return df
    df["model_name"] = df["model"].map(display_name)
    if text_filter != "All":
        df = df[df["prompt_name"] == text_filter]
    return df
|
|
| |
|
|
def build_scatter(category, text_name, model_filter):
    """Build the lossiness-vs-drift scatter for the current filter selection.

    One marker trace is added per model (sorted by display name), with one
    point per row of the filtered results.  When both ``lossiness_std`` and
    ``drift_std`` columns exist they are drawn as error bars (NaN treated as
    0) and shown in the hover text.  When either is missing, the chart is
    drawn without them instead of failing on the missing column.

    Args:
        category, text_name, model_filter: filter values passed to
            ``filter_df``; ``"All"`` disables the corresponding filter.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    df = filter_df(category, text_name, model_filter)
    has_std = "lossiness_std" in df.columns and "drift_std" in df.columns

    # The std columns are optional, so customdata and the hover template only
    # refer to them when both are present.
    custom_cols = ["score", "nli_retention", "category"]
    if has_std:
        custom_cols += ["lossiness_std", "drift_std"]
        loss_line = "Lossiness: %{x:.3f} Β± %{customdata[3]:.3f}<br>"
        drift_line = "Drift: %{y:.3f} Β± %{customdata[4]:.3f}<br>"
    else:
        loss_line = "Lossiness: %{x:.3f}<br>"
        drift_line = "Drift: %{y:.3f}<br>"

    def _error_bars(values, color):
        return dict(type="data", array=values.fillna(0).tolist(), visible=True,
                    color=color, thickness=1.5, width=4)

    fig = go.Figure()
    for mn in sorted(df["model_name"].unique()):
        sub = df[df["model_name"] == mn]
        color = get_color(mn)
        err_x = _error_bars(sub["lossiness_std"], color) if has_std else None
        err_y = _error_bars(sub["drift_std"], color) if has_std else None
        fig.add_trace(go.Scatter(
            x=sub["lossiness"].tolist(), y=sub["drift"].tolist(),
            mode="markers", name=mn,
            marker=dict(size=12, color=color, opacity=0.85),
            error_x=err_x, error_y=err_y,
            text=sub["prompt_name"].tolist(),
            customdata=sub[custom_cols].values.tolist(),
            hovertemplate=(
                "<b>%{text}</b><br>"
                "Model: " + mn + "<br>"
                + loss_line
                + drift_line
                + "Score: %{customdata[0]:.1f}<br>"
                "NLI Retention: %{customdata[1]:.1%}<br>"
                "Category: %{customdata[2]}<extra></extra>"
            ),
        ))

    # Shaded rectangle near the origin, labelled "High fidelity".
    fig.add_shape(type="rect", x0=-0.02, y0=-0.02, x1=0.15, y1=0.05,
                  fillcolor="rgba(22,163,74,0.08)",
                  line=dict(color="rgba(22,163,74,0.3)", dash="dot"), layer="below")
    fig.add_annotation(x=0.07, y=0.04, text="High fidelity", showarrow=False,
                       font=dict(color="#16a34a", size=10))
    fig.update_layout(
        title=dict(text="Integrity Frontier: Lossiness vs Drift", font=dict(size=15)),
        xaxis=dict(title="Lossiness (information loss) β", gridcolor="#e5e7eb", rangemode="tozero"),
        yaxis=dict(title="Drift (stylistic collapse) β", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
|
|
def build_lossiness_breakdown(category, text_name, model_filter):
    """Build the stacked bar chart of weighted lossiness components per model.

    For each model the mean ``prop_loss``, ``semantic_distance`` and
    ``word_deletion`` of the filtered results are scaled by 0.6 / 0.2 / 0.2
    and stacked.  If a ``lossiness_std`` column exists, its per-model mean
    (NaN treated as 0) is drawn as an error bar.

    Args:
        category, text_name, model_filter: filter values passed to
            ``filter_df``; ``"All"`` disables the corresponding filter.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    df = filter_df(category, text_name, model_filter)
    has_std = "lossiness_std" in df.columns

    # (legend label, source column, weight, bar colour), in stacking order.
    components = [
        ("Prop Loss (60%)", "prop_loss", 0.6, "#1e3a5f"),
        ("Semantic Dist (20%)", "semantic_distance", 0.2, "#3498db"),
        ("Word Deletion (20%)", "word_deletion", 0.2, "#95a5a6"),
    ]
    agg_spec = {col: (col, "mean") for _, col, _, _ in components}
    if has_std:
        agg_spec["lossiness_std"] = ("lossiness_std", "mean")
    agg = df.groupby("model_name").agg(**agg_spec).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()

    err = dict(type="data", array=agg["lossiness_std"].fillna(0).tolist(), visible=True,
               color="#374151", thickness=1.5, width=5) if has_std else None

    fig = go.Figure()
    last = len(components) - 1
    for i, (label, col, weight, color) in enumerate(components):
        weighted = agg[col] * weight
        fig.add_trace(go.Bar(
            name=label, x=models, y=weighted.tolist(), marker_color=color,
            text=[f"{v:.3f}" for v in weighted], textposition="inside",
            # Only the last trace carries the error bar, presumably so that it
            # is drawn at the top of the finished stack.
            error_y=err if i == last else None,
        ))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Lossiness Sub-Metric Decomposition β error bars = replicate Ο", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
|
|
def build_drift_breakdown(category, text_name, model_filter):
    """Build the stacked bar chart of weighted drift components per model.

    Per model, the mean ``norm_delta_spiciness`` and ``norm_pull`` of the
    filtered results are scaled by 0.55 and 0.45 and stacked.  If a
    ``drift_std`` column exists, its per-model mean (NaN treated as 0) is
    attached to the second trace as an error bar.
    """
    frame = filter_df(category, text_name, model_filter)
    with_std = "drift_std" in frame.columns

    aggregations = dict(
        norm_delta_spiciness=("norm_delta_spiciness", "mean"),
        norm_pull=("norm_pull", "mean"),
    )
    if with_std:
        aggregations["drift_std"] = ("drift_std", "mean")
    per_model = (
        frame.groupby("model_name")
        .agg(**aggregations)
        .reset_index()
        .sort_values("model_name")
    )
    model_names = per_model["model_name"].tolist()

    spiciness_part = per_model["norm_delta_spiciness"] * 0.55
    pull_part = per_model["norm_pull"] * 0.45

    error_bars = None
    if with_std:
        error_bars = dict(
            type="data",
            array=per_model["drift_std"].fillna(0).tolist(),
            visible=True,
            color="#374151",
            thickness=1.5,
            width=5,
        )

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Spiciness Loss (55%)",
        x=model_names,
        y=spiciness_part.tolist(),
        marker_color="#e74c3c",
        text=[f"{value:.3f}" for value in spiciness_part],
        textposition="inside",
    ))
    fig.add_trace(go.Bar(
        name="Centroid Pull (45%)",
        x=model_names,
        y=pull_part.tolist(),
        marker_color="#f1c40f",
        text=[f"{value:.3f}" for value in pull_part],
        textposition="inside",
        error_y=error_bars,
    ))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Drift Sub-Metric Decomposition β error bars = replicate Ο", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True,
        height=420,
        margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
|
|
def build_spiciness_comparison(category, text_name, model_filter):
    """Build the grouped bar chart of original vs rewrite spiciness per model.

    Bars show the per-model mean of ``orig_spiciness`` and ``rew_spiciness``
    for the filtered results.  The two bars of a model are placed by hand at
    ``x - width/2`` and ``x + width/2`` on a numeric axis whose ticks are
    relabelled with the model names.  Error bars (per-model mean of the std
    columns, NaN treated as 0) are drawn only when BOTH
    ``orig_spiciness_std`` and ``rew_spiciness_std`` exist, since both are
    read.

    Args:
        category, text_name, model_filter: filter values passed to
            ``filter_df``; ``"All"`` disables the corresponding filter.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    df = filter_df(category, text_name, model_filter)
    has_std = "orig_spiciness_std" in df.columns and "rew_spiciness_std" in df.columns
    agg_spec = {"orig": ("orig_spiciness", "mean"), "rew": ("rew_spiciness", "mean")}
    if has_std:
        agg_spec["orig_std"] = ("orig_spiciness_std", "mean")
        agg_spec["rew_std"] = ("rew_spiciness_std", "mean")
    agg = df.groupby("model_name").agg(**agg_spec).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    x_pos = np.arange(len(models))
    width = 0.35

    def _error_bars(column):
        if not has_std:
            return None
        return dict(type="data", array=agg[column].fillna(0).tolist(), visible=True,
                    color="#374151", thickness=1.5, width=5)

    fig = go.Figure()
    fig.add_trace(go.Bar(name="Original", x=[x - width/2 for x in x_pos], y=agg["orig"].tolist(),
                         marker_color="#6366f1", width=width, error_y=_error_bars("orig_std"),
                         text=[f"{v:.1f}" for v in agg["orig"]], textposition="outside"))
    fig.add_trace(go.Bar(name="Rewrite", x=[x + width/2 for x in x_pos], y=agg["rew"].tolist(),
                         marker_color="#a5b4fc", width=width, error_y=_error_bars("rew_std"),
                         text=[f"{v:.1f}" for v in agg["rew"]], textposition="outside"))
    fig.update_layout(
        title=dict(text="Spiciness: Original vs Rewrite β error bars = replicate Ο", font=dict(size=15)),
        xaxis=dict(tickvals=list(x_pos), ticktext=models, tickangle=-15),
        yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
|
|
def build_pca_plot(category, text_name, model_filter):
    """Build the PCA plot of original-to-rewrite movement.

    Each filtered result row becomes a two-point line from
    (``pca_original_x``, ``pca_original_y``) to (``pca_rewrite_x``,
    ``pca_rewrite_y``), coloured by model, with one legend entry per model.
    Rows missing any of the four coordinates are skipped.  The
    ``beige_points`` of the first entry in ``pca_beige_data`` that has any are
    drawn as grey background markers.  If no row has complete coordinates, a
    figure containing only an explanatory annotation is returned.

    Args:
        category, text_name, model_filter: filter values passed to
            ``filter_df``; ``"All"`` disables the corresponding filter.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    df = filter_df(category, text_name, model_filter)
    # All four coordinates are plotted, so require all four (the original
    # check covered only the x values).
    coord_cols = ["pca_original_x", "pca_original_y", "pca_rewrite_x", "pca_rewrite_y"]
    valid = df.dropna(subset=coord_cols)
    fig = go.Figure()
    if len(valid) == 0:
        fig.add_annotation(text="No PCA data available for this selection",
                           x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False)
        fig.update_layout(height=450, paper_bgcolor="white", plot_bgcolor="white")
        return fig

    # Only the first entry that carries beige points is shown.
    beige_entry = next((e for e in pca_beige_data if e.get("beige_points")), None)
    if beige_entry is not None:
        points = beige_entry["beige_points"]
        fig.add_trace(go.Scatter(x=[p["x"] for p in points], y=[p["y"] for p in points],
                                 mode="markers", name="Beige Platitudes",
                                 marker=dict(size=8, color="lightgray", opacity=0.5, symbol="diamond"),
                                 hoverinfo="skip"))

    for mn in sorted(valid["model_name"].unique()):
        sub = valid[valid["model_name"] == mn]
        color = get_color(mn)
        for idx, (_, row) in enumerate(sub.iterrows()):
            fig.add_trace(go.Scatter(
                x=[row["pca_original_x"], row["pca_rewrite_x"]],
                y=[row["pca_original_y"], row["pca_rewrite_y"]],
                mode="lines+markers", line=dict(color=color, width=2),
                marker=dict(size=[8, 10], color=color, symbol=["circle", "arrow-right"]),
                # All rows of a model share one legend entry.
                name=mn, legendgroup=mn, showlegend=(idx == 0),
                hovertemplate=f"<b>{mn}</b><br>{row['prompt_name']}<extra></extra>",
            ))
    fig.update_layout(
        title=dict(text="Beigification Vector β PCA Projection", font=dict(size=15)),
        xaxis=dict(title="PC1", gridcolor="#e5e7eb"),
        yaxis=dict(title="PC2", gridcolor="#e5e7eb", scaleanchor="x", scaleratio=1),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
|
|
def build_nli_table(category, text_name, model_filter):
    """Build the per-atom NLI detail table for one text and one model.

    Args:
        category: unused; accepted so the signature matches the chart builders.
        text_name: ``prompt_name`` to look up; ``"All"`` yields an Info row.
        model_filter: model display name; ``"All"`` yields an Info row.

    Returns:
        A DataFrame with one row per atom (``#``, ``Atom`` truncated to 120
        characters, ``NLI Score`` with 4 decimals, and ``Status`` which is
        "Retained" for scores of at least 0.5 and "Lost" otherwise), or a
        single-row ``Info`` frame when no specific text/model is selected or
        no atom data matches.
    """
    if text_name == "All" or model_filter == "All":
        return pd.DataFrame({"Info": ["Select a specific text AND model to view NLI atom details."]})
    # Models missing from MODEL_META are listed under their raw id (that is
    # what display_name() falls back to), so fall back to the selected value
    # instead of reporting "Model not found" for them.
    model_id = next((mid for mid, (dn, _) in MODEL_META.items() if dn == model_filter), model_filter)
    for entry in nli_atoms_data:
        if entry.get("model") == model_id and entry.get("prompt_name") == text_name:
            rows = [{"#": i,
                     "Atom": a["text"][:120] + ("..." if len(a["text"]) > 120 else ""),
                     "NLI Score": f"{a['score']:.4f}",
                     "Status": "Retained" if a["score"] >= 0.5 else "Lost"}
                    for i, a in enumerate(entry["nli_atoms"], start=1)]
            return pd.DataFrame(rows)
    return pd.DataFrame({"Info": ["No NLI atom data available for this selection."]})
|
|
def update_explorer(category, text_name, model_filter):
    """Rebuild every Explorer output for the given filter selection.

    Returns a 6-tuple: the scatter, lossiness, drift, spiciness and PCA
    figures (each built through ``safe_plot``) followed by the NLI atom table.
    """
    selection = (category, text_name, model_filter)
    chart_builders = (
        build_scatter,
        build_lossiness_breakdown,
        build_drift_breakdown,
        build_spiciness_comparison,
        build_pca_plot,
    )
    figures = [safe_plot(builder, *selection) for builder in chart_builders]
    return (*figures, build_nli_table(*selection))
|
|
def update_text_choices(category):
    """Return a Gradio update that lists the texts of *category* and resets the value to "All"."""
    available_texts = get_texts(category)
    return gr.update(choices=available_texts, value="All")
|
|
| |
|
|
def build_leaderboard_df():
    """Build the leaderboard table: one row per model, ranked by mean score.

    Rows of ``results_df`` are grouped by model display name and provider and
    averaged.  The result is sorted by average score (descending) and given a
    1-based ``Rank`` column.  ``Avg Score`` is rounded to 1 decimal, the other
    averages to 4, and ``Texts`` counts the non-null scores per model.

    Returns:
        The leaderboard DataFrame, or a single-row ``Info`` frame when no
        results are loaded (the same convention as ``build_mh_summary_table``)
        rather than raising on the missing columns.
    """
    if len(results_df) == 0:
        return pd.DataFrame({"Info": ["No leaderboard data available."]})
    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    df["provider"] = df["model"].map(provider_name)
    agg = df.groupby(["model_name", "provider"]).agg(
        avg_score=("score", "mean"),
        avg_lossiness=("lossiness", "mean"),
        avg_drift=("drift", "mean"),
        avg_nli_retention=("nli_retention", "mean"),
        avg_spiciness_delta=("spiciness_delta", "mean"),
        avg_orig_spiciness=("orig_spiciness", "mean"),
        avg_rew_spiciness=("rew_spiciness", "mean"),
        n_texts=("score", "count"),
    ).reset_index().sort_values("avg_score", ascending=False).reset_index(drop=True)
    agg.insert(0, "Rank", range(1, len(agg) + 1))
    out = agg.rename(columns={
        "model_name": "Model", "provider": "Provider",
        "avg_score": "Avg Score", "avg_lossiness": "Avg Lossiness",
        "avg_drift": "Avg Drift", "avg_nli_retention": "NLI Retention",
        "avg_spiciness_delta": "Spiciness Ξ", "avg_orig_spiciness": "Orig Spiciness",
        "avg_rew_spiciness": "Rew Spiciness", "n_texts": "Texts",
    })
    out["Avg Score"] = out["Avg Score"].round(1)
    four_decimals = ["Avg Lossiness", "Avg Drift", "NLI Retention", "Spiciness Ξ",
                     "Orig Spiciness", "Rew Spiciness"]
    out[four_decimals] = out[four_decimals].round(4)
    return out
|
|
def build_leaderboard_chart():
    """Build the bar chart of mean score per model, highest score first.

    Bars are coloured per model.  When ``results_df`` has a ``score_std``
    column, its per-model mean (NaN treated as 0) is drawn as an error bar.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    with_std = "score_std" in frame.columns

    aggregations = {"score": ("score", "mean")}
    if with_std:
        aggregations["score_std_mean"] = ("score_std", "mean")
    ranked = (
        frame.groupby("model_name")
        .agg(**aggregations)
        .reset_index()
        .sort_values("score", ascending=False)
    )
    model_names = ranked["model_name"].tolist()
    mean_scores = ranked["score"].tolist()

    error_bars = None
    if with_std:
        std_values = ranked["score_std_mean"].fillna(0).tolist()
        if std_values:
            error_bars = dict(
                type="data",
                array=std_values,
                visible=True,
                color="#374151",
                thickness=1.5,
                width=6,
            )

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Avg Score",
        x=model_names,
        y=mean_scores,
        marker_color=[get_color(name) for name in model_names],
        text=[f"{value:.1f}" for value in ranked["score"]],
        textposition="outside",
        error_y=error_bars,
    ))
    fig.update_layout(
        title=dict(text="Overall Preservation Score by Model β error bars = replicate Ο", font=dict(size=15)),
        yaxis=dict(title="Score (0β100)", gridcolor="#e5e7eb", range=[0, 110]),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white",
        paper_bgcolor="white",
        showlegend=False,
        autosize=True,
        height=420,
        margin=dict(l=50, r=10, t=60, b=80),
    )
    return fig
|
|
def build_category_heatmap():
    """Build the heatmap of mean score per (category, model) pair.

    Rows are categories and columns are model display names.  Cells without
    data are left unlabelled, and the colour scale is pinned to 60-100.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    pivot = frame.pivot_table(
        index="category",
        columns="model_name",
        values="score",
        aggfunc="mean",
    )
    scores = pivot.values
    model_names = pivot.columns.tolist()
    categories = pivot.index.tolist()
    cell_labels = [
        ["" if np.isnan(value) else f"{value:.0f}" for value in row]
        for row in scores
    ]

    heatmap = go.Heatmap(
        z=scores,
        x=model_names,
        y=categories,
        colorscale=[[0, "#fef2f2"], [0.4, "#fde68a"], [0.7, "#bbf7d0"], [1.0, "#1d4ed8"]],
        text=cell_labels,
        texttemplate="%{text}",
        textfont=dict(size=11),
        colorbar=dict(title="Score", thickness=12),
        zmin=60,
        zmax=100,
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title=dict(text="Score by Category Γ Model", font=dict(size=15)),
        xaxis=dict(side="top", tickangle=-20),
        yaxis=dict(autorange="reversed"),
        autosize=True,
        height=400,
        margin=dict(l=160, r=10, t=70, b=10),
        plot_bgcolor="white",
        paper_bgcolor="white",
    )
    return fig
|
|
def build_metrics_radar():
    """Build the radar chart with one closed polygon per model.

    The five axes come from per-model means over ``results_df``:
    ``nli_retention`` as is, ``1 - mean`` for ``lossiness``, ``drift`` and
    ``word_deletion``, and ``1 - |mean| / 10`` for ``spiciness_delta``.
    """
    frame = results_df.copy()
    frame["model_name"] = frame["model"].map(display_name)
    metric_cols = ["nli_retention", "lossiness", "drift", "spiciness_delta", "word_deletion"]
    means = frame.groupby("model_name")[metric_cols].mean()

    axis_labels = ["NLI Retention", "Low Lossiness", "Low Drift",
                   "Spiciness Preservation", "Low Word Deletion"]
    # Repeat the first axis so each polygon is closed.
    theta = axis_labels + [axis_labels[0]]

    fig = go.Figure()
    for model, avg in means.iterrows():
        radii = [
            avg["nli_retention"],
            1 - avg["lossiness"],
            1 - avg["drift"],
            1 - abs(avg["spiciness_delta"]) / 10,
            1 - avg["word_deletion"],
        ]
        radii.append(radii[0])
        fig.add_trace(go.Scatterpolar(
            r=radii,
            theta=theta,
            fill="toself",
            name=model,
            line=dict(color=get_color(model)),
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        title=dict(text="Multi-Metric Capability Radar", font=dict(size=15)),
        legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
        autosize=True,
        height=480,
        margin=dict(l=60, r=60, t=70, b=80),
        paper_bgcolor="white",
    )
    return fig
|
|
| |
|
|
def build_mh_lossiness(text_filter):
    """Build the line chart of mean lossiness per hop, one line per model.

    Returns an empty figure when the filtered multi-hop data has no rows.
    """
    hops = filter_hops(text_filter)
    if len(hops) == 0:
        return go.Figure()

    per_hop = (
        hops.groupby(["model_name", "hop_number"])["lossiness"]
        .mean()
        .reset_index()
    )
    fig = go.Figure()
    for mn in sorted(per_hop["model_name"].unique()):
        series = per_hop[per_hop["model_name"] == mn].sort_values("hop_number")
        hover = f"<b>{mn}</b><br>Hop %{{x}}<br>Lossiness: %{{y:.3f}}<extra></extra>"
        fig.add_trace(go.Scatter(
            x=series["hop_number"].tolist(),
            y=series["lossiness"].tolist(),
            mode="lines+markers",
            name=mn,
            line=dict(color=get_color(mn), width=2),
            marker=dict(size=7),
            hovertemplate=hover,
        ))
    fig.update_layout(
        title=dict(text="Lossiness by Hop (information loss per rewrite)", font=dict(size=15)),
        xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"),
        yaxis=dict(title="Lossiness β", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True,
        height=420,
        margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
|
|
def build_mh_drift(text_filter):
    """Build the line chart of mean drift per hop, one line per model.

    Returns an empty figure when the filtered multi-hop data has no rows.
    """
    hops = filter_hops(text_filter)
    if len(hops) == 0:
        return go.Figure()

    per_hop = (
        hops.groupby(["model_name", "hop_number"])["drift"]
        .mean()
        .reset_index()
    )
    fig = go.Figure()
    for mn in sorted(per_hop["model_name"].unique()):
        series = per_hop[per_hop["model_name"] == mn].sort_values("hop_number")
        hover = f"<b>{mn}</b><br>Hop %{{x}}<br>Drift: %{{y:.3f}}<extra></extra>"
        fig.add_trace(go.Scatter(
            x=series["hop_number"].tolist(),
            y=series["drift"].tolist(),
            mode="lines+markers",
            name=mn,
            line=dict(color=get_color(mn), width=2),
            marker=dict(size=7),
            hovertemplate=hover,
        ))
    fig.update_layout(
        title=dict(text="Drift by Hop (stylistic collapse per rewrite)", font=dict(size=15)),
        xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"),
        yaxis=dict(title="Drift β", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True,
        height=420,
        margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
|
|
def build_mh_spiciness(text_filter):
    """Build the line chart of mean rewrite spiciness per hop, per model.

    Each model gets a line of mean ``rew_spiciness`` per hop, plus a diamond
    marker at its first hop showing the mean ``orig_spiciness`` there.

    Args:
        text_filter: ``prompt_name`` to restrict to, or ``"All"``.

    Returns:
        A ``plotly.graph_objects.Figure`` (empty when there is no data).
    """
    df = filter_hops(text_filter)
    if len(df) == 0:
        return go.Figure()
    agg = df.groupby(["model_name", "hop_number"]).agg(
        orig_s=("orig_spiciness", "mean"),
        rew_s=("rew_spiciness", "mean"),
    ).reset_index()
    fig = go.Figure()
    # The loop visits each model name once and every group has at least one
    # row, so no "already shown" bookkeeping or emptiness check is needed.
    for mn in sorted(agg["model_name"].unique()):
        sub = agg[agg["model_name"] == mn].sort_values("hop_number")
        color = get_color(mn)
        fig.add_trace(go.Scatter(
            x=sub["hop_number"].tolist(), y=sub["rew_s"].tolist(),
            mode="lines+markers", name=mn,
            line=dict(color=color, width=2), marker=dict(size=7),
            hovertemplate=f"<b>{mn}</b><br>Hop %{{x}}<br>Rewrite Spiciness: %{{y:.2f}}<extra></extra>",
        ))
        fig.add_trace(go.Scatter(
            x=[sub["hop_number"].iloc[0]], y=[sub["orig_s"].iloc[0]],
            mode="markers", name=f"{mn} (orig)",
            marker=dict(size=11, color=color, symbol="diamond"),
            showlegend=True,
            hovertemplate=f"<b>{mn} original</b><br>Spiciness: %{{y:.2f}}<extra></extra>",
        ))
    fig.update_layout(
        title=dict(text="Spiciness by Hop (rewrite spiciness vs original)", font=dict(size=15)),
        xaxis=dict(title="Hop Number", dtick=1, gridcolor="#e5e7eb"),
        yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
|
|
def build_mh_scatter_trajectory(text_filter):
    """Build the lossiness-vs-drift trajectory plot, one path per model.

    Each path joins the per-hop means in hop order.  Marker size grows by 1.5
    per step, starting at 6.  Returns an empty figure when the filtered
    multi-hop data has no rows.
    """
    hops = filter_hops(text_filter)
    if len(hops) == 0:
        return go.Figure()

    per_hop = (
        hops.groupby(["model_name", "hop_number"])
        .agg(lossiness=("lossiness", "mean"), drift=("drift", "mean"))
        .reset_index()
    )
    fig = go.Figure()
    for mn in sorted(per_hop["model_name"].unique()):
        path = per_hop[per_hop["model_name"] == mn].sort_values("hop_number")
        tint = get_color(mn)
        marker_sizes = [6 + step * 1.5 for step in range(len(path))]
        hop_labels = [f"Hop {h}" for h in path["hop_number"].tolist()]
        fig.add_trace(go.Scatter(
            x=path["lossiness"].tolist(),
            y=path["drift"].tolist(),
            mode="lines+markers",
            name=mn,
            line=dict(color=tint, width=2),
            marker=dict(size=marker_sizes, color=tint),
            text=hop_labels,
            hovertemplate=f"<b>{mn}</b> β %{{text}}<br>Lossiness: %{{x:.3f}}<br>Drift: %{{y:.3f}}<extra></extra>",
        ))
    fig.update_layout(
        title=dict(text="Degradation Trajectory: Lossiness vs Drift over Hops", font=dict(size=15)),
        xaxis=dict(title="Lossiness β", gridcolor="#e5e7eb", rangemode="tozero"),
        yaxis=dict(title="Drift β", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True,
        height=480,
        margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
|
|
def build_anim_scatter(text_filter, current_hop):
    """Build one animation frame of the lossiness-vs-drift trajectory.

    *current_hop* is clamped to ``1..max hop in the data``.  For each model
    the per-hop means up to that hop are drawn: a large marker for the latest
    point, and a faded dotted trail through the earlier ones.  The axis
    ranges are computed from all hops, not just the ones shown.  Returns an
    empty figure when the filtered multi-hop data has no rows.
    """
    hops = filter_hops(text_filter)
    if len(hops) == 0:
        return go.Figure()

    max_hop = int(hops["hop_number"].max())
    current_hop = max(1, min(int(current_hop), max_hop))
    per_hop = (
        hops.groupby(["model_name", "hop_number"])
        .agg(lossiness=("lossiness", "mean"), drift=("drift", "mean"))
        .reset_index()
    )

    fig = go.Figure()
    for mn in sorted(per_hop["model_name"].unique()):
        full_path = per_hop[per_hop["model_name"] == mn].sort_values("hop_number")
        visible = full_path[full_path["hop_number"] <= current_hop]
        if len(visible) == 0:
            continue
        tint = get_color(mn)
        earlier = visible.iloc[:-1]
        latest = visible.iloc[[-1]]
        if len(earlier) > 0:
            fig.add_trace(go.Scatter(
                x=earlier["lossiness"].tolist(),
                y=earlier["drift"].tolist(),
                mode="lines+markers",
                name=mn,
                legendgroup=mn,
                line=dict(color=tint, width=2, dash="dot"),
                marker=dict(size=7, color=tint, opacity=0.5),
                showlegend=False,
                hoverinfo="skip",
            ))
        hover = (
            f"<b>{mn}</b><br>Hop {current_hop}<br>"
            "Lossiness: %{x:.3f}<br>Drift: %{y:.3f}<extra></extra>"
        )
        fig.add_trace(go.Scatter(
            x=latest["lossiness"].tolist(),
            y=latest["drift"].tolist(),
            mode="markers",
            name=mn,
            legendgroup=mn,
            marker=dict(size=16, color=tint, symbol="circle",
                        line=dict(color="white", width=2)),
            hovertemplate=hover,
        ))

    # Dashed guide lines at lossiness = 0.5 and drift = 0.5.
    guide_style = dict(color="#9ca3af", dash="dash", width=1)
    fig.add_shape(type="line", x0=0.5, x1=0.5, y0=0, y1=1,
                  xref="x", yref="paper", line=guide_style)
    fig.add_shape(type="line", x0=0, x1=1, y0=0.5, y1=0.5,
                  xref="paper", yref="y", line=guide_style)

    x_upper = max(0.6, per_hop["lossiness"].max() + 0.05)
    y_upper = max(0.3, per_hop["drift"].max() + 0.05)
    fig.update_layout(
        title=dict(
            text=f"Lossiness Γ Drift Trajectory β Hop {current_hop} / {max_hop}",
            font=dict(size=15),
        ),
        xaxis=dict(title="Lossiness β", gridcolor="#e5e7eb", range=[-0.02, x_upper]),
        yaxis=dict(title="Drift β", gridcolor="#e5e7eb", range=[-0.01, y_upper]),
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True,
        height=480,
        margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
|
|
def build_mh_summary_table():
    """Build the multi-hop summary table from ``mh_summary_df``.

    Returns one row per summary record with columns Model, Text, Score,
    Lossiness, Drift and NLI Retention (the last three rounded to 4
    decimals), sorted by Model then Text.  A single-row ``Info`` frame is
    returned when no summary data is loaded.
    """
    if len(mh_summary_df) == 0:
        return pd.DataFrame({"Info": ["No multi-hop summary data available."]})

    # Source column -> displayed header, in output order.
    column_labels = {
        "model_name": "Model",
        "prompt_name": "Text",
        "score": "Score",
        "lossiness": "Lossiness",
        "drift": "Drift",
        "nli_retention": "NLI Retention",
    }
    summary = mh_summary_df.copy()
    summary["model_name"] = summary["model"].map(display_name)
    table = summary[list(column_labels)].copy().rename(columns=column_labels)
    table = table.sort_values(["Model", "Text"]).reset_index(drop=True)
    rounded = ["Lossiness", "Drift", "NLI Retention"]
    table[rounded] = table[rounded].round(4)
    return table
|
|
def update_mh(text_filter):
    """Rebuild every Multi-Hop output for the given text filter.

    Returns a 5-tuple: the lossiness, drift, spiciness and trajectory figures
    (each built through ``safe_plot``) followed by the summary table.
    """
    chart_builders = (
        build_mh_lossiness,
        build_mh_drift,
        build_mh_spiciness,
        build_mh_scatter_trajectory,
    )
    figures = [safe_plot(builder, text_filter) for builder in chart_builders]
    return (*figures, build_mh_summary_table())
|
|
| |
|
|
# Markdown shown in the About tab. The mis-encoded characters in the original
# text (dash, multiplication sign, check mark) are restored to the characters
# the context calls for.
ABOUT_MD = f"""
## BeigificationBench — Evaluation Engine v{EVAL_VERSION}

**Beigification** describes the tendency of LLMs to produce rewrites that are safe, bland, and
stylistically uniform — stripping out the distinctive voice, specificity, and informational density
of the source text.

### Metrics

| Metric | Description |
|--------|-------------|
| **Lossiness** | NLI-weighted information loss: prop loss (60%) + semantic distance (20%) + word deletion (20%) |
| **Drift** | Model collapse: spiciness loss (55%) + centroid pull (45%) |
| **Spiciness** | 6-component vividness score: sqrt-PPL, lexical richness, rare word density, word specificity, vivid modifiers, voice |
| **NLI Retention** | Fraction of source propositions preserved per NLI model |

### Benchmark Design

- **Single-hop** results average 3 independent replicates to reduce variance.
- **Multi-hop** results show degradation trajectories over 8 successive rewrites using run 154 (5 models × 3 texts × 8 hops).
- Texts span literary non-fiction, polemic, and sensitive personal essay genres.

### Models Evaluated

| Model | Provider | Single-hop | Multi-hop |
|-------|----------|:----------:|:---------:|
| Claude Opus 4.6 | Anthropic | ✓ | ✓ |
| GPT-5.2 | OpenAI | ✓ | ✓ |
| Gemini 3.1 Pro | Google | ✓ | ✓ |
| Llama 3.3 70B | Meta / HF | ✓ | ✓ |
| Llama 3.1 70B | Meta / HF | | ✓ |
| Qwen 2.5 72B | Alibaba / HF | ✓ | |

*Anonymous submission. Author information withheld for peer review.*
"""
|
|
# Gradio UI: four tabs (Interactive Explorer, Leaderboard, Multi-Hop
# Trajectories with an animation section, About).
# NOTE(review): the source reached review with its indentation flattened, so
# the nesting below was reconstructed from the code's meaning. In particular,
# whether mh_spic_plot sits inside or after the gr.Row() holding the
# lossiness/drift plots could not be determined — confirm against the
# intended layout.
with gr.Blocks(title="BeigificationBench") as demo:
    gr.Markdown(
        f"# π BeigificationBench\n"
        f"**Measuring How LLMs Flatten Text** | "
        f"Eval Engine v{EVAL_VERSION} | Space v{SPACE_VERSION}"
    )

    with gr.Tabs():

        # --- Tab: Interactive Explorer ---
        with gr.Tab("Interactive Explorer"):
            with gr.Row():
                cat_dd = gr.Dropdown(choices=get_categories(), value="All", label="Category")
                text_dd = gr.Dropdown(choices=get_texts(), value="All", label="Source Text")
                model_dd = gr.Dropdown(choices=get_models(), value="All", label="Model")

            scatter_plot = gr.Plot()
            with gr.Row():
                loss_plot = gr.Plot()
                drift_plot = gr.Plot()
            with gr.Row():
                spic_plot = gr.Plot()
                pca_plot = gr.Plot()
            gr.Markdown("### NLI Atom Detail *(select a specific text and model)*")
            nli_table = gr.Dataframe(wrap=True)

            def _update(cat, txt, mdl):
                return update_explorer(cat, txt, mdl)

            # Any of the three dropdowns changing rebuilds all explorer outputs.
            for ctrl in [cat_dd, text_dd, model_dd]:
                ctrl.change(_update, [cat_dd, text_dd, model_dd],
                            [scatter_plot, loss_plot, drift_plot, spic_plot, pca_plot, nli_table])
            # A category change also refreshes the text choices and resets the
            # selected text to "All".
            cat_dd.change(update_text_choices, [cat_dd], [text_dd])

            demo.load(_update, [cat_dd, text_dd, model_dd],
                      [scatter_plot, loss_plot, drift_plot, spic_plot, pca_plot, nli_table])

        # --- Tab: Leaderboard ---
        with gr.Tab("Leaderboard"):
            gr.Markdown("Results averaged across 3 replicates per prompt Γ model combination.")
            lb_table = gr.Dataframe(value=build_leaderboard_df, wrap=False)
            # Build the charts through safe_plot, as the other tabs do, so a
            # data problem shows a placeholder figure instead of an error on
            # page load.
            lb_chart = gr.Plot(value=lambda: safe_plot(build_leaderboard_chart))
            cat_heatmap = gr.Plot(value=lambda: safe_plot(build_category_heatmap))
            radar_plot = gr.Plot(value=lambda: safe_plot(build_metrics_radar))

        # --- Tab: Multi-Hop Trajectories ---
        with gr.Tab("Multi-Hop Trajectories"):
            gr.Markdown(
                "### 8-Hop Rewrite Trajectories\n"
                "Each model rewrites the same text 8 times in succession. "
                "Charts show how Lossiness, Drift, and Spiciness evolve across hops, "
                "averaged across texts unless a specific text is selected."
            )
            mh_text_dd = gr.Dropdown(choices=get_mh_texts(), value="All", label="Source Text")

            mh_traj_plot = gr.Plot()
            with gr.Row():
                mh_loss_plot = gr.Plot()
                mh_drift_plot = gr.Plot()
            mh_spic_plot = gr.Plot()
            gr.Markdown("### Overall (hop 1 β 8) Summary")
            mh_summary_table = gr.Dataframe(wrap=False)

            def _update_mh(txt):
                return update_mh(txt)

            mh_text_dd.change(_update_mh, [mh_text_dd],
                              [mh_loss_plot, mh_drift_plot, mh_spic_plot, mh_traj_plot, mh_summary_table])
            demo.load(_update_mh, [mh_text_dd],
                      [mh_loss_plot, mh_drift_plot, mh_spic_plot, mh_traj_plot, mh_summary_table])

            # --- Trajectory animation ---
            gr.Markdown("---\n### π¬ Trajectory Animation")
            gr.Markdown(
                "Watch each model's position move through Lossiness Γ Drift space as hops accumulate. "
                "The large dot shows the current hop; the dotted trail shows the path so far."
            )

            anim_is_playing = gr.State(False)
            with gr.Row():
                anim_play_btn = gr.Button("βΆ Play", size="sm", variant="primary", scale=0)
                anim_reset_btn = gr.Button("β³ Reset", size="sm", scale=0)
                anim_hop_slider = gr.Slider(
                    minimum=1, maximum=8, step=1, value=1,
                    label="Hop", scale=2,
                )
            anim_plot = gr.Plot()
            anim_timer = gr.Timer(value=1.2, active=False)

            def _anim_init(txt):
                # Runs on page load and on text change. Built through
                # safe_plot like every other animation callback, so missing
                # data cannot break it.
                return 1, safe_plot(build_anim_scatter, txt, 1)

            def _anim_scrub(hop, txt):
                return safe_plot(build_anim_scatter, txt, hop)

            def _anim_toggle(playing):
                # Flip play/pause: the timer follows the new state and the
                # button label shows the opposite action.
                new_playing = not playing
                label = "βΈ Pause" if new_playing else "βΆ Play"
                return new_playing, gr.Timer(active=new_playing), label

            def _anim_reset(txt):
                return False, gr.Timer(active=False), 1, safe_plot(build_anim_scatter, txt, 1), "βΆ Play"

            def _anim_tick(playing, hop, txt):
                # Advance one hop per tick while playing; hold position once
                # the last hop in the data is reached.
                if not playing:
                    return hop, safe_plot(build_anim_scatter, txt, hop)
                max_hop = int(hops_df["hop_number"].max()) if len(hops_df) > 0 else 8
                if hop >= max_hop:
                    return hop, safe_plot(build_anim_scatter, txt, hop)
                next_hop = hop + 1
                return next_hop, safe_plot(build_anim_scatter, txt, next_hop)

            anim_play_btn.click(
                _anim_toggle,
                inputs=[anim_is_playing],
                outputs=[anim_is_playing, anim_timer, anim_play_btn],
            )
            anim_reset_btn.click(
                _anim_reset,
                inputs=[mh_text_dd],
                outputs=[anim_is_playing, anim_timer, anim_hop_slider, anim_plot, anim_play_btn],
            )
            anim_hop_slider.change(
                _anim_scrub,
                inputs=[anim_hop_slider, mh_text_dd],
                outputs=[anim_plot],
            )
            mh_text_dd.change(
                _anim_init,
                inputs=[mh_text_dd],
                outputs=[anim_hop_slider, anim_plot],
            )
            anim_timer.tick(
                _anim_tick,
                inputs=[anim_is_playing, anim_hop_slider, mh_text_dd],
                outputs=[anim_hop_slider, anim_plot],
            )
            demo.load(_anim_init, inputs=[mh_text_dd], outputs=[anim_hop_slider, anim_plot])

        # --- Tab: About ---
        with gr.Tab("About"):
            gr.Markdown(ABOUT_MD)
|
|
# Start the server only when this file is run as a script, so that importing
# the module (for tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)
|
|