# NOTE: stray "Spaces / Sleeping" HF page-scrape artifact removed (not part of the app).
| """ | |
| Beigification Bench β Measuring How LLMs Flatten Text | |
| Space v2026.04 | Evaluation Engine v1.0 | |
| """ | |
import json
import os

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
| SPACE_VERSION = "2026.04" | |
| EVAL_VERSION = "1.0" | |
| EVAL_VERSION_NOTES = ( | |
| "V1: 6-component spiciness (sqrt-PPL, lex richness, rare words, word specificity, " | |
| "vivid modifiers, voice), NLI atomization with subject-heuristic reattachment, " | |
| "directionally-gated centroid pull, beigification vector PCA" | |
| ) | |
| MODEL_META = { | |
| "claude-opus-4-6": ("Claude Opus 4.6", "Anthropic"), | |
| "gpt-5.2": ("GPT-5.2", "OpenAI"), | |
| "gemini-3.1-pro-preview": ("Gemini 3.1 Pro", "Google"), | |
| "meta-llama/Llama-3.3-70B-Instruct": ("Llama 3.3 70B", "Meta / HF"), | |
| "grok-3": ("Grok 3", "xAI"), | |
| } | |
| CATEGORY_ORDER = ["Polemic", "Sensitive", "Jargon", "Reasoning", "Edge Cases", "Rant", "analysis"] | |
| MODEL_COLORS = { | |
| "Claude Opus 4.6": "#7c3aed", | |
| "GPT-5.2": "#0891b2", | |
| "Gemini 3.1 Pro": "#059669", | |
| "Llama 3.3 70B": "#d97706", | |
| "Grok 3": "#dc2626", | |
| } | |
| HF_DATASET_ID = "PatriciaDyck/BeigificationBench" | |
| DATA_DIR = os.path.join(os.path.dirname(__file__), "data") | |
def load_data():
    """Load benchmark data, preferring the HF dataset over bundled local files.

    Returns:
        tuple: (results DataFrame, list of per-rewrite NLI atom records,
        list of beige-cluster PCA point records). Each part degrades
        independently on failure (empty DataFrame / empty lists) so the
        Space can still start.
    """
    def _read_json(path):
        # Data files are UTF-8; be explicit instead of relying on the
        # platform default encoding (the original code omitted it).
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    full_data = None
    try:
        from huggingface_hub import hf_hub_download
        path = hf_hub_download(
            repo_id=HF_DATASET_ID,
            filename="data/full_dataset.json",
            repo_type="dataset",
        )
        full_data = _read_json(path)
        print(f"Loaded {len(full_data)} rows from HF dataset")
    except Exception as e:
        # Broad catch is deliberate: any network/auth/schema problem should
        # fall back to the local files rather than crash at import time.
        print(f"WARNING: HF dataset load failed ({e}), falling back to local files")
    if full_data:
        df = pd.DataFrame(full_data)
        nli_atoms = []
        pca_beige = []
        for row in full_data:
            if row.get("nli_atoms"):
                nli_atoms.append({
                    "model": row["model"],
                    "prompt_id": row["prompt_id"],
                    "prompt_name": row["prompt_name"],
                    "nli_atoms": row["nli_atoms"],
                })
            # Keep a single model's copy of the beige cluster (assumes the
            # points are identical across models — TODO confirm).
            if row.get("beige_points") and row["model"] == "gpt-5.2":
                pca_beige.append({
                    "model": row["model"],
                    "prompt_id": row["prompt_id"],
                    "prompt_name": row["prompt_name"],
                    "beige_points": row["beige_points"],
                })
        return df, nli_atoms, pca_beige
    # Local fallbacks: each file is optional and fails independently.
    try:
        df = pd.read_csv(os.path.join(DATA_DIR, "results.csv"))
    except Exception as e:
        print(f"WARNING: Failed to load results.csv: {e}")
        df = pd.DataFrame()
    try:
        nli_atoms = _read_json(os.path.join(DATA_DIR, "nli_atoms.json"))
    except Exception as e:
        print(f"WARNING: Failed to load nli_atoms.json: {e}")
        nli_atoms = []
    try:
        pca_beige = _read_json(os.path.join(DATA_DIR, "pca_beige_points.json"))
    except Exception as e:
        print(f"WARNING: Failed to load pca_beige_points.json: {e}")
        pca_beige = []
    return df, nli_atoms, pca_beige
# Loaded once at import time; every tab builder reads these module-level globals.
results_df, nli_atoms_data, pca_beige_data = load_data()
def display_name(model_id):
    """Return the human-readable name for a raw model id (the id itself if unknown)."""
    name, _provider = MODEL_META.get(model_id, (model_id, ""))
    return name
def provider_name(model_id):
    """Return the provider label for a raw model id (falls back to the id itself)."""
    _name, provider = MODEL_META.get(model_id, ("", model_id))
    return provider
def get_color(model_name):
    """Chart color for a display model name; neutral gray for unknown models."""
    try:
        return MODEL_COLORS[model_name]
    except KeyError:
        return "#6b7280"
def safe_plot(fn, *args, **kwargs):
    """Call a figure builder; on any error return a placeholder figure instead.

    Keeps one broken chart from taking down the whole tab.
    """
    try:
        return fn(*args, **kwargs)
    except Exception as exc:
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=f"Chart unavailable: {type(exc).__name__}: {exc}",
            x=0.5, y=0.5, xref="paper", yref="paper",
            showarrow=False, font=dict(size=14, color="red"),
        )
        placeholder.update_layout(height=300, paper_bgcolor="white", plot_bgcolor="white")
        return placeholder
# ── Tab 1: Interactive Explorer ────────────────────────────────────────────────
def get_categories():
    """Dropdown choices: "All" plus every category present in the results."""
    return ["All"] + sorted(set(results_df["category"]))
def get_texts(category):
    """Dropdown choices for source texts, optionally narrowed to one category."""
    if category == "All":
        pool = results_df
    else:
        pool = results_df[results_df["category"] == category]
    return ["All"] + sorted(pool["prompt_name"].unique().tolist())
def get_models():
    """Dropdown choices: "All" plus the sorted display names of every model."""
    names = results_df["model"].map(display_name).unique().tolist()
    return ["All"] + sorted(names)
def filter_df(category, text_name, model_filter):
    """Return a copy of the results filtered by the three dropdown values.

    Adds a "model_name" display column; a value of "All" disables that filter.
    """
    out = results_df.copy()
    out["model_name"] = out["model"].map(display_name)
    for column, wanted in (
        ("category", category),
        ("prompt_name", text_name),
        ("model_name", model_filter),
    ):
        if wanted != "All":
            out = out[out[column] == wanted]
    return out
def build_scatter(category, text_name, model_filter):
    """Scatter of lossiness (x) vs drift (y), one marker per (model, text) rewrite.

    A dotted green rectangle near the origin marks the "high fidelity" zone.
    """
    df = filter_df(category, text_name, model_filter)
    fig = go.Figure()
    for mn in sorted(df["model_name"].unique()):
        sub = df[df["model_name"] == mn]
        fig.add_trace(go.Scatter(
            x=sub["lossiness"].tolist(),
            y=sub["drift"].tolist(),
            mode="markers",
            name=mn,
            marker=dict(size=12, color=get_color(mn), opacity=0.8),
            text=sub["prompt_name"].tolist(),
            # Column order here must match the %{customdata[i]} indices in
            # the hovertemplate below.
            customdata=sub[["score", "nli_retention", "category"]].values.tolist(),
            hovertemplate=(
                "<b>%{text}</b><br>"
                "Model: " + mn + "<br>"
                "Lossiness: %{x:.3f}<br>"
                "Drift: %{y:.3f}<br>"
                "Score: %{customdata[0]}<br>"
                "NLI Retention: %{customdata[1]:.1%}<br>"
                "Category: %{customdata[2]}<extra></extra>"
            ),
        ))
    # Shaded "high fidelity" reference region near the origin.
    fig.add_shape(type="rect", x0=-0.02, y0=-0.02, x1=0.15, y1=0.05,
        fillcolor="rgba(22,163,74,0.08)", line=dict(color="rgba(22,163,74,0.3)", dash="dot"),
        layer="below")
    fig.add_annotation(x=0.07, y=0.04, text="High fidelity", showarrow=False,
        font=dict(color="#16a34a", size=10))
    fig.update_layout(
        title=dict(text="Integrity Frontier: Lossiness vs Drift", font=dict(size=15)),
        # NOTE(review): the trailing "β" in both axis titles looks like mojibake
        # for an arrow glyph — confirm the intended character.
        xaxis=dict(title="Lossiness (information loss) β", gridcolor="#e5e7eb", rangemode="tozero"),
        yaxis=dict(title="Drift (stylistic collapse) β", gridcolor="#e5e7eb", rangemode="tozero"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_lossiness_breakdown(category, text_name, model_filter):
    """Stacked bars decomposing mean lossiness into its weighted sub-metrics.

    The weights (0.6 / 0.2 / 0.2) mirror the eval engine's lossiness formula,
    so each model's stacked bar height sums to its composite lossiness.
    """
    df = filter_df(category, text_name, model_filter)
    agg = df.groupby("model_name").agg(
        prop_loss=("prop_loss", "mean"),
        semantic_distance=("semantic_distance", "mean"),
        word_deletion=("word_deletion", "mean"),
    ).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    fig = go.Figure()
    # (trace label, aggregated column, weight, bar color) — one stacked layer
    # each; the original triplicated this trace-building code.
    components = [
        ("Prop Loss (60%)", "prop_loss", 0.6, "#1e3a5f"),
        ("Semantic Dist (20%)", "semantic_distance", 0.2, "#3498db"),
        ("Word Deletion (20%)", "word_deletion", 0.2, "#95a5a6"),
    ]
    for label, col, weight, color in components:
        weighted = agg[col] * weight
        fig.add_trace(go.Bar(
            name=label, x=models, y=weighted.tolist(),
            marker_color=color,
            text=[f"{v:.3f}" for v in weighted], textposition="inside",
        ))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Lossiness Sub-Metric Decomposition", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_drift_breakdown(category, text_name, model_filter):
    """Stacked bars decomposing mean drift into its weighted sub-metrics.

    The weights (0.55 / 0.45) mirror the eval engine's drift formula, so each
    model's stacked bar height sums to its composite drift.
    """
    df = filter_df(category, text_name, model_filter)
    agg = df.groupby("model_name").agg(
        norm_delta_spiciness=("norm_delta_spiciness", "mean"),
        norm_pull=("norm_pull", "mean"),
    ).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    fig = go.Figure()
    # (trace label, aggregated column, weight, bar color) — one stacked layer
    # each; the original duplicated this trace-building code.
    components = [
        ("Spiciness Loss (55%)", "norm_delta_spiciness", 0.55, "#e74c3c"),
        ("Centroid Pull (45%)", "norm_pull", 0.45, "#f1c40f"),
    ]
    for label, col, weight, color in components:
        weighted = agg[col] * weight
        fig.add_trace(go.Bar(
            name=label, x=models, y=weighted.tolist(),
            marker_color=color,
            text=[f"{v:.3f}" for v in weighted], textposition="inside",
        ))
    fig.update_layout(
        barmode="stack",
        title=dict(text="Drift Sub-Metric Decomposition", font=dict(size=15)),
        yaxis=dict(title="Weighted Contribution", gridcolor="#e5e7eb"),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_spiciness_comparison(category, text_name, model_filter):
    """Grouped bars comparing mean spiciness of originals vs rewrites per model."""
    df = filter_df(category, text_name, model_filter)
    agg = df.groupby("model_name").agg(
        orig=("orig_spiciness", "mean"),
        rew=("rew_spiciness", "mean"),
    ).reset_index().sort_values("model_name")
    models = agg["model_name"].tolist()
    positions = np.arange(len(models))
    width = 0.35
    fig = go.Figure()
    # Two side-by-side series: originals shifted left, rewrites shifted right.
    series = (
        ("Original", agg["orig"], -width / 2, "#6366f1"),
        ("Rewrite", agg["rew"], width / 2, "#a5b4fc"),
    )
    for label, values, offset, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=[p + offset for p in positions],
            y=values.tolist(),
            marker_color=color, width=width,
            text=[f"{v:.1f}" for v in values], textposition="outside",
        ))
    fig.update_layout(
        title=dict(text="Spiciness: Original vs Rewrite", font=dict(size=15)),
        xaxis=dict(tickvals=list(positions), ticktext=models, tickangle=-15),
        yaxis=dict(title="Spiciness Score", gridcolor="#e5e7eb"),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=420, margin=dict(l=50, r=10, t=70, b=80),
    )
    return fig
def build_pca_plot(category, text_name, model_filter):
    """Plot original→rewrite displacement vectors in the shared PCA plane.

    Each rewrite is drawn as a segment from the original's projection (circle)
    to the rewrite's projection (arrow head). A single static cloud of "beige
    platitude" points is drawn once as background context.
    """
    df = filter_df(category, text_name, model_filter)
    valid = df.dropna(subset=["pca_original_x", "pca_rewrite_x"])
    if valid.empty:
        fig = go.Figure()
        fig.add_annotation(text="No PCA data available for this selection",
            x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False)
        fig.update_layout(height=450, paper_bgcolor="white", plot_bgcolor="white")
        return fig
    fig = go.Figure()
    # Background beige cluster: the first entry that actually has points
    # (the original expressed this with a flag + break loop).
    beige_entry = next((e for e in pca_beige_data if e.get("beige_points")), None)
    if beige_entry is not None:
        points = beige_entry["beige_points"]
        fig.add_trace(go.Scatter(
            x=[p["x"] for p in points],
            y=[p["y"] for p in points],
            mode="markers", name="Beige Platitudes",
            marker=dict(size=8, color="lightgray", opacity=0.5, symbol="diamond"),
            hoverinfo="skip",
        ))
    for mn in sorted(valid["model_name"].unique()):
        sub = valid[valid["model_name"] == mn]
        color = get_color(mn)
        first_idx = sub.index[0]
        for idx, row in sub.iterrows():
            fig.add_trace(go.Scatter(
                x=[row["pca_original_x"], row["pca_rewrite_x"]],
                y=[row["pca_original_y"], row["pca_rewrite_y"]],
                mode="lines+markers",
                line=dict(color=color, width=2),
                marker=dict(size=[8, 10], color=color, symbol=["circle", "arrow-right"]),
                name=mn,
                legendgroup=mn,
                # Only the first segment of each model appears in the legend
                # (the original abused "_" as a meaningful loop variable here).
                showlegend=(idx == first_idx),
                hovertemplate=f"<b>{mn}</b><br>{row['prompt_name']}<extra></extra>",
            ))
    fig.update_layout(
        title=dict(text="Beigification Vector β PCA Projection", font=dict(size=15)),
        xaxis=dict(title="PC1", gridcolor="#e5e7eb"),
        yaxis=dict(title="PC2", gridcolor="#e5e7eb", scaleanchor="x", scaleratio=1),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=500, margin=dict(l=50, r=10, t=60, b=60),
    )
    return fig
def build_nli_table(category, text_name, model_filter):
    """Per-atom NLI retention table for one (text, model) pair.

    `category` is accepted for a uniform callback signature but is unused.
    A specific text AND model must be selected; otherwise an informational
    single-cell frame is returned.
    """
    if "All" in (text_name, model_filter):
        return pd.DataFrame({"Info": ["Select a specific text AND model to view NLI atom details."]})
    model_id = next(
        (mid for mid, (dname, _) in MODEL_META.items() if dname == model_filter),
        None,
    )
    if not model_id:
        return pd.DataFrame({"Error": ["Model not found"]})
    for entry in nli_atoms_data:
        if entry["model"] != model_id or entry["prompt_name"] != text_name:
            continue
        rows = []
        for position, atom in enumerate(entry["nli_atoms"], start=1):
            text = atom["text"]
            snippet = text[:120] + "..." if len(text) > 120 else text
            rows.append({
                "#": position,
                "Atom": snippet,
                "NLI Score": f"{atom['score']:.4f}",
                "Status": "Retained" if atom["score"] >= 0.5 else "Lost",
            })
        return pd.DataFrame(rows)
    return pd.DataFrame({"Info": ["No NLI atom data available for this selection."]})
def build_text_comparison(text_name, model_filter):
    """Markdown view of the original text alongside each model's rewrite.

    Returns:
        tuple: (markdown string, gr.update for the summary DataFrame). The
        summary table is hidden unless a specific text with text columns is
        selected.
    """
    if text_name == "All":
        return (
            "*Select a specific source text to see the original and model rewrites side by side.*",
            gr.update(visible=False),
        )
    has_text_cols = "original_text" in results_df.columns and "rewrite_text" in results_df.columns
    if not has_text_cols:
        return (
            "*Text comparison requires the full dataset (loaded from HF). Local CSV fallback does not include text content.*",
            gr.update(visible=False),
        )
    sub = results_df[results_df["prompt_name"] == text_name].copy()
    if sub.empty:
        return "*No data found for this text.*", gr.update(visible=False)
    original_text = sub.iloc[0]["original_text"]
    # Map display names once (the original mapped twice on different copies).
    sub["model_name"] = sub["model"].map(display_name)
    if model_filter != "All":
        sub = sub[sub["model_name"] == model_filter]
        if sub.empty:
            return "*No data found for this model/text combination.*", gr.update(visible=False)
    sub_sorted = sub.sort_values("score", ascending=False)
    md_parts = [f"#### Original Text\n\n{original_text}\n\n---\n"]
    summary_rows = []
    orig_word_count = len(str(original_text).split())
    # One pass builds both outputs (the original iterated sub_sorted twice).
    # Removed: an unused `score_color` whose `score >= 85` comparison could
    # raise TypeError whenever `score` fell back to the string "N/A".
    # NOTE(review): the "β" / "Ξ" glyphs in the labels below look like
    # mojibake for "—" / "Δ" — confirm the intended characters.
    for _, row in sub_sorted.iterrows():
        mn = row["model_name"]
        score = row.get("score", "N/A")
        lossiness = row.get("lossiness", 0)
        drift = row.get("drift", 0)
        rewrite = row.get("rewrite_text", "")
        md_parts.append(
            f"#### {mn} β Score: {score} | Lossiness: {lossiness:.3f} | Drift: {drift:.3f}\n\n"
            f"{rewrite}\n\n---\n"
        )
        summary_rows.append({
            "Model": mn,
            "Score": row.get("score", ""),
            "Lossiness": f"{row.get('lossiness', 0):.4f}",
            "Drift": f"{row.get('drift', 0):.4f}",
            "NLI Retention": f"{row.get('nli_retention', 0):.1%}",
            "Spiciness Ξ": f"{row.get('spiciness_delta', 0):.2f}",
            "Words (orig)": orig_word_count,
            "Words (rewrite)": len(str(row.get('rewrite_text', '')).split()),
        })
    summary_df = pd.DataFrame(summary_rows)
    return "\n".join(md_parts), gr.update(value=summary_df, visible=True)
def update_explorer(category, text_name, model_filter):
    """Recompute every Explorer output for the current dropdown selection."""
    selection = (category, text_name, model_filter)
    plots = [
        safe_plot(builder, *selection)
        for builder in (
            build_scatter,
            build_lossiness_breakdown,
            build_drift_breakdown,
            build_spiciness_comparison,
            build_pca_plot,
        )
    ]
    nli = build_nli_table(*selection)
    try:
        text_md, text_summary = build_text_comparison(text_name, model_filter)
    except Exception as exc:
        text_md = f"*Error loading text comparison: {exc}*"
        text_summary = gr.update(visible=False)
    return (*plots, nli, text_md, text_summary)
def update_text_choices(category):
    """Refresh the Source Text dropdown after a category change, resetting to "All"."""
    choices = get_texts(category)
    return gr.update(choices=choices, value="All")
# ── Tab 2: Aggregate Leaderboard ───────────────────────────────────────────────
def build_leaderboard_df():
    """Aggregate per-model means across all texts, ranked by Avg Score (desc).

    Returns a display-ready DataFrame with a 1-based Rank column; the score is
    rounded to 1 dp and all ratio-style metrics to 4 dp.
    """
    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    df["provider"] = df["model"].map(provider_name)
    agg = df.groupby(["model_name", "provider"]).agg(
        avg_score=("score", "mean"),
        avg_lossiness=("lossiness", "mean"),
        avg_drift=("drift", "mean"),
        avg_nli_retention=("nli_retention", "mean"),
        avg_spiciness_delta=("spiciness_delta", "mean"),
        avg_orig_spiciness=("orig_spiciness", "mean"),
        avg_rew_spiciness=("rew_spiciness", "mean"),
        n_texts=("score", "count"),
    ).reset_index()
    agg = agg.sort_values("avg_score", ascending=False).reset_index(drop=True)
    agg.insert(0, "Rank", range(1, len(agg) + 1))
    out = agg.rename(columns={
        "model_name": "Model",
        "provider": "Provider",
        "avg_score": "Avg Score",
        "avg_lossiness": "Avg Lossiness",
        "avg_drift": "Avg Drift",
        "avg_nli_retention": "NLI Retention",
        "avg_spiciness_delta": "Spiciness Ξ",
        "avg_orig_spiciness": "Orig Spiciness",
        "avg_rew_spiciness": "Rew Spiciness",
        "n_texts": "Texts",
    })
    # One round() call with a per-column decimals mapping replaces the
    # original's two loops (one of which iterated a single-element list).
    out = out.round({
        "Avg Score": 1,
        "Avg Lossiness": 4,
        "Avg Drift": 4,
        "NLI Retention": 4,
        "Spiciness Ξ": 4,
        "Orig Spiciness": 4,
        "Rew Spiciness": 4,
    })
    return out
def build_leaderboard_chart():
    """One bar per model: mean preservation score, sorted descending.

    Bars reuse each model's fixed color and are annotated with the value.
    """
    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    # Only the mean score is plotted; the original also aggregated lossiness
    # and drift (and built an unused `x_pos` array) without using them.
    agg = df.groupby("model_name").agg(
        score=("score", "mean"),
    ).reset_index().sort_values("score", ascending=False)
    models = agg["model_name"].tolist()
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Avg Score", x=models, y=agg["score"].tolist(),
        marker_color=[get_color(m) for m in models],
        text=[f"{v:.1f}" for v in agg["score"]], textposition="outside",
    ))
    fig.update_layout(
        title=dict(text="Overall Preservation Score by Model (higher = better)", font=dict(size=15)),
        yaxis=dict(title="Score (0-100)", gridcolor="#e5e7eb", range=[0, 105]),
        xaxis=dict(tickangle=-15),
        plot_bgcolor="white", paper_bgcolor="white",
        showlegend=False,
        autosize=True, height=420, margin=dict(l=50, r=10, t=60, b=80),
    )
    return fig
def build_category_table():
    """Pivot of mean score per (category, model), one display row per category."""
    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    table = (
        df.pivot_table(index="category", columns="model_name", values="score", aggfunc="mean")
        .round(1)
        .reset_index()
        .rename(columns={"category": "Category"})
    )
    return table
def build_category_heatmap():
    """Category-by-model heatmap of mean scores (color range pinned to 60-100)."""
    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    pivot = df.pivot_table(index="category", columns="model_name", values="score", aggfunc="mean")
    # Blank labels for missing cells instead of "nan".
    cell_labels = [
        ["" if np.isnan(v) else f"{v:.0f}" for v in row]
        for row in pivot.values
    ]
    fig = go.Figure(data=go.Heatmap(
        z=pivot.values,
        x=pivot.columns.tolist(),
        y=pivot.index.tolist(),
        colorscale=[[0, "#fef2f2"], [0.4, "#fde68a"], [0.7, "#bbf7d0"], [1.0, "#1d4ed8"]],
        text=cell_labels,
        texttemplate="%{text}",
        textfont=dict(size=11),
        colorbar=dict(title="Score", thickness=12),
        zmin=60, zmax=100,
    ))
    fig.update_layout(
        title=dict(text="Score by Category Γ Model", font=dict(size=15)),
        xaxis=dict(side="top", tickangle=-20),
        yaxis=dict(autorange="reversed"),
        autosize=True, height=400,
        margin=dict(l=160, r=10, t=70, b=10),
        plot_bgcolor="white", paper_bgcolor="white",
    )
    return fig
def build_metrics_radar():
    """Radar chart: five preservation dimensions per model, each on a [0, 1] axis.

    All dimensions are oriented so that larger = better preservation:
    lossiness/drift/word-deletion are inverted (1 - mean) and the mean
    spiciness delta is folded via 1 - |mean| / 10.
    """
    df = results_df.copy()
    df["model_name"] = df["model"].map(display_name)
    agg = df.groupby("model_name").agg(
        nli_retention=("nli_retention", "mean"),
        low_lossiness=("lossiness", lambda x: 1 - x.mean()),
        low_drift=("drift", lambda x: 1 - x.mean()),
        # NOTE(review): the /10 assumes spiciness deltas live roughly in
        # [-10, 10]; larger magnitudes push this below 0 and get clipped by
        # the fixed [0, 1] radial axis — confirm the intended scale.
        spiciness_preservation=("spiciness_delta", lambda x: 1 - abs(x.mean()) / 10),
        low_word_deletion=("word_deletion", lambda x: 1 - x.mean()),
    ).reset_index()
    categories = ["NLI Retention", "Low Lossiness", "Low Drift", "Spiciness Preservation", "Low Word Deletion"]
    fig = go.Figure()
    for _, row in agg.iterrows():
        values = [row["nli_retention"], row["low_lossiness"], row["low_drift"],
            row["spiciness_preservation"], row["low_word_deletion"]]
        # Close the polygon by repeating the first vertex (and first theta).
        values.append(values[0])
        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories + [categories[0]],
            fill="toself",
            name=row["model_name"],
            line=dict(color=get_color(row["model_name"])),
            opacity=0.6,
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        title=dict(text="Model Preservation Profile", font=dict(size=15)),
        autosize=True, height=480,
        margin=dict(l=60, r=60, t=60, b=60),
        paper_bgcolor="white",
    )
    return fig
# ── Tab 3: Orthogonality Analysis ──────────────────────────────────────────────
# Every metric column considered for correlation / PCA; columns missing from
# the loaded results are silently skipped by the builders below.
METRIC_COLS = [
    "lossiness", "drift", "prop_loss", "semantic_distance", "word_deletion",
    "nli_retention", "norm_pull", "norm_delta_spiciness",
    "orig_spiciness", "rew_spiciness", "spiciness_delta",
    "orig_perplexity", "rew_perplexity",
    "orig_lex_richness", "rew_lex_richness",
    "orig_rare_word_density", "rew_rare_word_density",
    "voice_shift",
    "orig_word_specificity", "rew_word_specificity",
    "orig_vivid_modifier_ratio", "rew_vivid_modifier_ratio",
    "orig_voice_score", "rew_voice_score",
    "pull_magnitude", "directional_similarity", "delta_dist_to_beige",
]
# Short display labels for chart axes and tables, keyed by raw column name.
# NOTE(review): "Ξ" here looks like mojibake for "Δ" — confirm the glyph.
METRIC_LABELS = {
    "lossiness": "Lossiness",
    "drift": "Drift",
    "prop_loss": "Prop Loss",
    "semantic_distance": "Semantic Dist",
    "word_deletion": "Word Deletion",
    "nli_retention": "NLI Retention",
    "norm_pull": "Norm Pull",
    "norm_delta_spiciness": "Ξ Spiciness (norm)",
    "orig_spiciness": "Orig Spiciness",
    "rew_spiciness": "Rew Spiciness",
    "spiciness_delta": "Spiciness Ξ",
    "orig_perplexity": "Orig Perplexity",
    "rew_perplexity": "Rew Perplexity",
    "orig_lex_richness": "Orig Lex Richness",
    "rew_lex_richness": "Rew Lex Richness",
    "orig_rare_word_density": "Orig Rare Words",
    "rew_rare_word_density": "Rew Rare Words",
    "voice_shift": "Voice Shift",
    "orig_word_specificity": "Orig Word Spec",
    "rew_word_specificity": "Rew Word Spec",
    "orig_vivid_modifier_ratio": "Orig Vivid Mod",
    "rew_vivid_modifier_ratio": "Rew Vivid Mod",
    "orig_voice_score": "Orig Voice",
    "rew_voice_score": "Rew Voice",
    "pull_magnitude": "Pull Magnitude",
    "directional_similarity": "Directional Sim",
    "delta_dist_to_beige": "Ξ Dist to Beige",
}
def build_correlation_heatmap():
    """Pearson correlation heatmap over all available metric columns."""
    available = [c for c in METRIC_COLS if c in results_df.columns]
    corr = results_df[available].dropna().corr()
    labels = [METRIC_LABELS.get(c, c) for c in corr.columns]
    annotations = [[f"{v:.2f}" for v in row] for row in corr.values]
    fig = go.Figure(data=go.Heatmap(
        z=corr.values,
        x=labels,
        y=labels,
        colorscale="RdBu_r",
        zmin=-1, zmax=1,
        text=annotations,
        texttemplate="%{text}",
        textfont=dict(size=8),
        colorbar=dict(title="r", thickness=12),
    ))
    fig.update_layout(
        title=dict(text="Metric Correlation Matrix (Pearson r)", font=dict(size=15)),
        xaxis=dict(tickangle=-45, tickfont=dict(size=9)),
        yaxis=dict(tickfont=dict(size=9), autorange="reversed"),
        autosize=True, height=700,
        margin=dict(l=130, r=10, t=60, b=130),
        plot_bgcolor="white", paper_bgcolor="white",
    )
    return fig
def build_pca_biplot():
    """2-component PCA of all metric columns with loading-vector arrows.

    Points are individual (model, text) evaluations colored by model; arrows
    show how each metric loads onto PC1/PC2 (weak loadings are suppressed).
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    available = [c for c in METRIC_COLS if c in results_df.columns]
    df = results_df[available].dropna()
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df)
    pca = PCA(n_components=2)
    scores = pca.fit_transform(scaled)
    loadings = pca.components_.T
    fig = go.Figure()
    # NOTE(review): this relies on dropna(subset=available) selecting exactly
    # the same rows, in the same order, as results_df[available].dropna()
    # above — true for per-row NaN filtering, but fragile if either changes.
    results_with_models = results_df.dropna(subset=available)
    model_names = results_with_models["model"].map(display_name).values
    for mn in sorted(set(model_names)):
        mask = model_names == mn
        fig.add_trace(go.Scatter(
            x=scores[mask, 0].tolist(),
            y=scores[mask, 1].tolist(),
            mode="markers",
            name=mn,
            marker=dict(size=8, color=get_color(mn), opacity=0.6),
            hovertemplate=f"{mn}<extra></extra>",
        ))
    # Scale loading vectors to ~80% of the score cloud's extent so arrows
    # share the same axes as the points.
    scale = np.max(np.abs(scores)) * 0.8
    for i, col in enumerate(available):
        lx, ly = loadings[i, 0] * scale, loadings[i, 1] * scale
        magnitude = np.sqrt(lx**2 + ly**2)
        # Skip weakly-loading metrics to reduce arrow clutter.
        if magnitude < scale * 0.2:
            continue
        label = METRIC_LABELS.get(col, col)
        fig.add_annotation(
            ax=0, ay=0, axref="x", ayref="y",
            x=lx, y=ly, xref="x", yref="y",
            showarrow=True,
            arrowhead=2, arrowsize=1.5, arrowwidth=1.5,
            arrowcolor="rgba(100,100,100,0.7)",
        )
        # Label sits just past the arrow tip.
        fig.add_annotation(
            x=lx * 1.15, y=ly * 1.15,
            text=label, showarrow=False,
            font=dict(size=9, color="#374151"),
        )
    var_explained = pca.explained_variance_ratio_
    fig.update_layout(
        title=dict(
            text=f"PCA Biplot β Metric Loading Directions (PC1: {var_explained[0]:.1%}, PC2: {var_explained[1]:.1%})",
            font=dict(size=14),
        ),
        xaxis=dict(title=f"PC1 ({var_explained[0]:.1%} variance)", gridcolor="#e5e7eb"),
        yaxis=dict(title=f"PC2 ({var_explained[1]:.1%} variance)", gridcolor="#e5e7eb",
            scaleanchor="x", scaleratio=1),
        plot_bgcolor="white", paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        autosize=True, height=600,
        margin=dict(l=60, r=10, t=80, b=60),
    )
    return fig
def build_high_corr_table():
    """Top-40 metric pairs ranked by absolute Pearson correlation.

    Each pair is labeled Redundant (|r| > 0.8), Correlated (|r| > 0.5) or
    Independent — mirroring build_independence_summary's thresholds.
    """
    from itertools import combinations
    available = [c for c in METRIC_COLS if c in results_df.columns]
    corr = results_df[available].dropna().corr()
    pairs = []
    # combinations() replaces the original's hand-rolled nested index loops.
    for a, b in combinations(corr.columns, 2):
        r = corr.loc[a, b]
        pairs.append({
            "Metric A": METRIC_LABELS.get(a, a),
            "Metric B": METRIC_LABELS.get(b, b),
            "Pearson r": round(r, 4),
            "|r|": round(abs(r), 4),
            "Relationship": "Redundant" if abs(r) > 0.8 else ("Correlated" if abs(r) > 0.5 else "Independent"),
        })
    pairs_df = pd.DataFrame(pairs).sort_values("|r|", ascending=False)
    return pairs_df.head(40).reset_index(drop=True)
def build_independence_summary():
    """One-row summary of how many metric pairs are redundant / correlated / independent.

    Thresholds match build_high_corr_table: |r| > 0.8 redundant, > 0.5
    correlated, otherwise independent.
    """
    available = [c for c in METRIC_COLS if c in results_df.columns]
    df = results_df[available].dropna()
    corr = df.corr()
    cols = corr.columns.tolist()
    n = len(cols)
    total_pairs = n * (n - 1) // 2
    if total_pairs == 0:
        # Fewer than two usable metric columns: the original divided by
        # total_pairs unconditionally and would raise ZeroDivisionError here.
        return pd.DataFrame([{
            "Total Metric Pairs": 0,
            "Redundant (|r| > 0.8)": 0,
            "Correlated (0.5 < |r| <= 0.8)": 0,
            "Independent (|r| <= 0.5)": 0,
            "% Independent": "n/a",
        }])
    redundant = 0
    correlated = 0
    independent = 0
    for i in range(n):
        for j in range(i + 1, n):
            r = abs(corr.iloc[i, j])
            if r > 0.8:
                redundant += 1
            elif r > 0.5:
                correlated += 1
            else:
                independent += 1
    return pd.DataFrame([{
        "Total Metric Pairs": total_pairs,
        "Redundant (|r| > 0.8)": redundant,
        "Correlated (0.5 < |r| <= 0.8)": correlated,
        "Independent (|r| <= 0.5)": independent,
        "% Independent": f"{independent/total_pairs*100:.1f}%",
    }])
# ── Tab 4: Methodology ─────────────────────────────────────────────────────────
# Load the methodology document from the first location that exists, with a
# plain-text fallback so the tab never renders empty.
METHODOLOGY_MD = ""
for _p in [
    os.path.join(os.path.dirname(__file__), "methodology.md"),
    os.path.join(os.path.dirname(__file__), "..", "methodology.md"),
]:
    if os.path.exists(_p):
        # Context manager + explicit encoding: the original `open(_p).read()`
        # leaked the file handle and relied on the platform default encoding.
        with open(_p, "r", encoding="utf-8") as _f:
            METHODOLOGY_MD = _f.read()
        break
if not METHODOLOGY_MD:
    METHODOLOGY_MD = "Methodology document not found. See the Beigification Bench documentation for details."
| # ββ Build the Gradio App βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| ABOUT_MD = f""" | |
| ## Beigification Bench: Measuring How LLMs Flatten Text | |
| **Space Version**: v{SPACE_VERSION} | **Evaluation Engine**: v{EVAL_VERSION} | **Author**: Patricia Dyck | **License**: CC BY 4.0 | |
| > **Eval Engine v{EVAL_VERSION}**: {EVAL_VERSION_NOTES} | |
| --- | |
| ### What is Beigification? | |
| *Beigification* is the phenomenon where LLM-generated rewrites systematically flatten, genericize, and strip distinctive qualities from source text β replacing vivid language with safe platitudes, concrete imagery with abstract generalities, and assertive voice with hedged equivocation. | |
| The Beigification Bench is a diagnostic framework that quantifies this effect through two composite metrics: | |
| - **Lossiness** β How much semantic and lexical content is lost (proposition loss via NLI, semantic distance, word deletion) | |
| - **Drift** β How much the rewrite moves toward generic "model collapse" language (spiciness loss + beige centroid pull) | |
| These are supported by a rich sub-metric suite including NLI atom retention, spiciness scoring (perplexity, lexical richness, rare words, word specificity, vivid modifiers, voice), and beigification vector analysis (PCA projection of semantic displacement toward a pre-defined platitude cluster). | |
| --- | |
| ### Benchmark Design | |
| - **5 models**: GPT-5.2, Claude Opus 4.6, Gemini 3.1 Pro, Llama 3.3 70B, Grok 3 | |
| - **12 source texts** across 5+ categories: Polemic, Sensitive Topics, Jargon-Dense, Complex Reasoning, Edge Cases | |
| - **Task**: "Please rewrite the following text" β a simple rewriting prompt that reveals how models handle stylistic preservation | |
| - **Temperature**: 0.7 (matches real-world deployment conditions) | |
| - **Evaluation**: Automated via HuggingFace Inference API (BAAI/bge-base-en-v1.5 embeddings, DeBERTa-v3-large NLI) | |
| ### Data | |
| All benchmark data is hosted on HuggingFace Datasets: [`PatriciaDyck/BeigificationBench`](https://huggingface.co/datasets/PatriciaDyck/BeigificationBench) | |
| ### Theoretical Grounding | |
| The framework draws on foundational work in model collapse (Shumailov et al., 2024), semantic entropy (Farquhar et al., 2024), and contextual erosion (Vazhentsev et al., ICLR 2026). It provides empirical measurement of the "alignment tax" β the information and stylistic cost of safety-tuned language models. | |
| --- | |
| ### Tabs | |
| 1. **Interactive Explorer** — Filter by category, text, and model. View original text alongside model rewrites with metrics. Includes lossiness/drift scatter, sub-metric decompositions, spiciness comparisons, PCA vector plots, and NLI atom details. | |
| 2. **Leaderboard** — Aggregate rankings across all texts with per-category heatmaps and radar profiles. | |
| 3. **Orthogonality Analysis** — Correlation matrix and PCA biplot showing which metrics measure genuinely independent dimensions vs. which are redundant. | |
| 4. **Methodology** — Full technical specification of all metrics, formulas, and hyperparameters. | |
| """ | |
# True only when the loaded dataset carries the raw source texts (the HF
# dataset path in load_data()); the local-CSV fallback is metrics-only, so
# the side-by-side text panels are replaced with an explanatory note below.
has_text_data = "original_text" in results_df.columns

# FIX: `theme=` is a constructor argument of gr.Blocks, not of
# Blocks.launch() — launch() has no `theme` parameter and raises TypeError.
# The theme was moved here from the demo.launch() call at the bottom.
with gr.Blocks(
    title="Beigification Bench",
    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="amber"),
) as demo:
    # (mojibake repaired: header emoji was garbled as "π¨" — presumably the
    # palette emoji; confirm against the original README if it matters)
    gr.Markdown("# 🎨 Beigification Bench\n*Measuring How LLMs Flatten Text*")

    with gr.Tabs():
        # --- Tab 1: interactive per-category / per-text / per-model explorer
        with gr.Tab("Explorer"):
            with gr.Row():
                cat_dd = gr.Dropdown(choices=get_categories(), value="All", label="Category", scale=1)
                text_dd = gr.Dropdown(choices=get_texts("All"), value="All", label="Source Text", scale=2)
                model_dd = gr.Dropdown(choices=get_models(), value="All", label="Model", scale=1)

            # All plots are initialized with the unfiltered ("All") view and
            # refreshed by update_explorer when any dropdown changes.
            scatter_plot = gr.Plot(value=safe_plot(build_scatter, "All", "All", "All"), label="Integrity Frontier")
            with gr.Row():
                lossiness_plot = gr.Plot(value=safe_plot(build_lossiness_breakdown, "All", "All", "All"))
                drift_plot = gr.Plot(value=safe_plot(build_drift_breakdown, "All", "All", "All"))
            with gr.Row():
                spiciness_plot = gr.Plot(value=safe_plot(build_spiciness_comparison, "All", "All", "All"))
                pca_plot = gr.Plot(value=safe_plot(build_pca_plot, "All", "All", "All"))

            gr.Markdown("### Original Text & Model Rewrites")
            if has_text_data:
                gr.Markdown(
                    "*Select a source text above to read the original alongside each model's rewrite. "
                    "Optionally filter to a single model.*"
                )
            else:
                # (mojibake repaired: "β" → em dash)
                gr.Markdown(
                    "*Text comparison unavailable — dataset loaded from local CSV without text content. "
                    "Full text data is available via the [HF dataset](https://huggingface.co/datasets/PatriciaDyck/BeigificationBench).*"
                )
            text_comparison_md = gr.Markdown(
                value="*Select a specific source text to see the original and model rewrites side by side.*"
            )
            # Hidden placeholder; presumably toggled visible by
            # update_explorer once a specific text is chosen.
            text_summary_table = gr.Dataframe(visible=False, label="Rewrite Summary")

            gr.Markdown("### NLI Atom Details\n*Select a specific text and model to see per-atom entailment scores*")
            nli_table = gr.Dataframe(value=build_nli_table("All", "All", "All"), label="NLI Atoms")

            # Category change first narrows the Source Text choices...
            cat_dd.change(update_text_choices, inputs=[cat_dd], outputs=[text_dd])
            # ...and any filter change (including the same category change)
            # refreshes every Explorer output in one pass.
            for inp in [cat_dd, text_dd, model_dd]:
                inp.change(
                    update_explorer,
                    inputs=[cat_dd, text_dd, model_dd],
                    outputs=[scatter_plot, lossiness_plot, drift_plot, spiciness_plot, pca_plot, nli_table,
                             text_comparison_md, text_summary_table],
                )

        # --- Tab 2: aggregate rankings across all texts ---------------------
        with gr.Tab("Leaderboard"):
            gr.Markdown("### Aggregate Model Rankings")
            lb_table = gr.Dataframe(value=build_leaderboard_df(), label="Leaderboard")
            lb_chart = gr.Plot(value=safe_plot(build_leaderboard_chart), label="Score Comparison")
            with gr.Row():
                cat_heatmap = gr.Plot(value=safe_plot(build_category_heatmap), label="Category Heatmap")
                radar = gr.Plot(value=safe_plot(build_metrics_radar), label="Preservation Profile")
            gr.Markdown("### Per-Category Breakdown")
            cat_table = gr.Dataframe(value=build_category_table(), label="Category Scores")

        # --- Tab 3: metric-independence diagnostics -------------------------
        with gr.Tab("Orthogonality"):
            gr.Markdown(
                "### Metric Independence Analysis\n"
                "This tab examines how independent the beigification sub-metrics are from each other. "
                "High correlation (|r| > 0.8) suggests redundancy; low correlation (|r| < 0.5) suggests the metrics "
                "capture genuinely different dimensions of text degradation.\n\n"
                "*Data source: one-shot rewrites (60 observations across 5 models x 12 texts). "
                "Multi-hop per-hop data will be added as additional observations for richer analysis.*"
            )
            ind_summary = gr.Dataframe(value=build_independence_summary(), label="Independence Summary")
            corr_heatmap = gr.Plot(value=safe_plot(build_correlation_heatmap), label="Correlation Matrix")
            pca_biplot = gr.Plot(value=safe_plot(build_pca_biplot), label="PCA Biplot")
            gr.Markdown("### Top Correlated / Independent Metric Pairs")
            high_corr = gr.Dataframe(value=build_high_corr_table(), label="Metric Pair Correlations")

        # --- Static documentation tabs --------------------------------------
        with gr.Tab("Methodology"):
            gr.Markdown(METHODOLOGY_MD)
        with gr.Tab("About"):
            gr.Markdown(ABOUT_MD)

# ssr_mode=False disables server-side rendering (supported launch() kwarg);
# theme was moved to the gr.Blocks constructor above, where it belongs.
demo.launch(ssr_mode=False)