import os from pathlib import Path from base64 import b64encode import streamlit as st import pandas as pd import altair as alt from datasets import load_dataset # --- Page setup --- st.set_page_config( page_title="RAT-Bench Leaderboard", page_icon="π", layout="centered", ) # --- Global CSS --- st.markdown(""" """, unsafe_allow_html=True) # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ # β 1. Hero Banner β # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ # Embed the Rat_Bench logo with open(Path(__file__).parent / "images" / "Rat_Bench.png", "rb") as f: logo_b64 = b64encode(f.read()).decode("utf-8") st.markdown(f"""
RAT-Bench is a synthetic benchmark for evaluating how well anonymization tools
prevent re-identification of individuals in text.
Using U.S. demographic statistics, we generate text with direct and indirect identifiers,
anonymize it, and measure how easily an LLM-based attacker can still re-identify people.
Curious how your tool compares? Follow the instructions in our repo and send us your results!
Toggle which results to display. The No anonymization baseline is pinned on top (not ranked). Tools are ranked by Average Risk (lower is better).
", unsafe_allow_html=True, ) _, c1, c2, _ = st.columns([1, 2, 2, 1], gap="medium") with c1: language = st.selectbox("Language", languages) with c2: st.write("") # vertical spacer show_levels = st.checkbox("Show difficulty levels", value=True, key="levels_cb") # --- Build display table --- work = df.copy() work["Average Risk (Explicit)"] = work[f"{language} Avg"] work = work.dropna(subset=[f"{language} Avg"]) baseline_mask = work[tool_col].str.strip().str.lower() == baseline_name.lower() others = work[~baseline_mask].sort_values(f"{language} Avg").reset_index(drop=True) others["Rank"] = (others.index + 1).astype(str) baselines = work[baseline_mask].copy() baselines["Rank"] = "β" final = pd.concat([baselines, others], ignore_index=True) cols = ["Rank", tool_col, "Type"] if not show_levels: cols += ["Average Risk (Explicit)"] elif language == "English": cols += [ f"{language} Explicit (easy)", f"{language} Explicit (hard)", "Average Risk (Explicit)", f"{language} Implicit", ] else: cols += [f"{language} Explicit (easy)", "Average Risk (Explicit)"] if language == "English": cols += [f"BLEU score ({language}, Explicit avg)"] rename_map = { f"{language} Explicit (easy)": "Explicit (easy)", f"{language} Explicit (hard)": "Explicit (hard)", f"{language} Implicit": "Implicit", f"BLEU score ({language}, Explicit avg)": "Avg BLEU (Explicit)", } display = final[cols].rename(columns=rename_map) # --- Badge helper --- BADGE_CLS = { "NER-based": "badge-ner", "LLM-based": "badge-llm", "Perturbation": "badge-perturb", "Baseline": "badge-baseline", } def _badge(typ: str) -> str: cls = BADGE_CLS.get(typ, "badge-baseline") return f'{typ}' # --- Risk heatmap color (greenβyellowβred) --- def _risk_color(val, lo=0, hi=100): """Return a CSS background for risk values: green(0) -> yellow(50) -> red(100).""" try: v = float(val) except (ValueError, TypeError): return "" t = max(0.0, min(1.0, (v - lo) / (hi - lo))) if t <= 0.5: r = int(76 + (t / 0.5) * (234 - 76)) g = int(175 + (t / 0.5) * 
(179 - 175)) b = int(80 + (t / 0.5) * (8 - 80)) else: r = int(234 + ((t - 0.5) / 0.5) * (220 - 234)) g = int(179 - ((t - 0.5) / 0.5) * (179 - 53)) b = int(8 + ((t - 0.5) / 0.5) * (69 - 8)) return f"background:rgba({r},{g},{b},0.22); font-weight:600;" # --- BLEU heatmap color (redβyellowβgreen, higher=better) --- def _bleu_color(val, lo=0.5, hi=1.0): """Return a CSS background for BLEU values: red(low) -> yellow(mid) -> green(high).""" try: v = float(val) except (ValueError, TypeError): return "" t = max(0.0, min(1.0, (v - lo) / (hi - lo))) if t <= 0.5: # red to yellow r = int(220 + (t / 0.5) * (234 - 220)) g = int(53 + (t / 0.5) * (179 - 53)) b = int(69 + (t / 0.5) * (8 - 69)) else: # yellow to green r = int(234 - ((t - 0.5) / 0.5) * (234 - 76)) g = int(179 - ((t - 0.5) / 0.5) * (179 - 175)) b = int(8 + ((t - 0.5) / 0.5) * (80 - 8)) return f"background:rgba({r},{g},{b},0.22); font-weight:600;" # Risk value columns in the display table risk_cols = {"Explicit (easy)", "Explicit (hard)", "Implicit", "Average Risk (Explicit)"} bleu_col_name = "Avg BLEU (Explicit)" # --- Build HTML table --- html_rows = [] for _, row in display.iterrows(): is_baseline = str(row.get(tool_col, "")).strip().lower() == baseline_name.lower() tr_cls = ' class="baseline-row"' if is_baseline else "" cells = [] for col in display.columns: val = row[col] if col == "Type": cells.append(f"We measure how much identifying information survives anonymization. An LLM-based attacker reads the anonymized text and attempts to recover identifying attributes.
Figure: Re-identification risk based on direct and indirect identifiers.
""", unsafe_allow_html=True) # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ # β 5. Interactive Risk vs BLEU Scatter (Altair) β # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.markdown('" "A good anonymizer sits in the lower-right corner: low risk, high BLEU (text utility preserved). " "Hover over points for details.
", unsafe_allow_html=True, ) scatter_df = df.dropna(subset=["BLEU score (English, Explicit avg)", "English Avg"]).copy() scatter_df = scatter_df[scatter_df[tool_col].str.strip().str.lower() != baseline_name.lower()] scatter_df = scatter_df.rename(columns={ "English Avg": "Average Risk", "BLEU score (English, Explicit avg)": "BLEU Score", tool_col: "Tool", }) type_colors = alt.Scale( domain=["NER-based", "LLM-based", "Perturbation"], range=["#3b82f6", "#8b5cf6", "#f59e0b"], ) points = ( alt.Chart(scatter_df) .mark_circle(size=120, opacity=0.85, stroke="#fff", strokeWidth=1) .encode( x=alt.X("BLEU Score:Q", scale=alt.Scale(domain=[0.5, 1.0]), title="BLEU Score (higher = more utility)"), y=alt.Y("Average Risk:Q", scale=alt.Scale(domain=[20, 100]), title="Average Risk % (lower = safer)"), color=alt.Color("Type:N", scale=type_colors, legend=alt.Legend(title="Type", orient="bottom")), tooltip=["Tool:N", "Type:N", alt.Tooltip("Average Risk:Q", format=".1f"), alt.Tooltip("BLEU Score:Q", format=".2f")], ) ) labels = ( alt.Chart(scatter_df) .mark_text(align="left", dx=8, dy=-6, fontSize=11, fontWeight=500) .encode( x="BLEU Score:Q", y="Average Risk:Q", text="Tool:N", color=alt.Color("Type:N", scale=type_colors, legend=None), ) ) chart = ( (points + labels) .properties(width=500, height=380) .configure_axis( grid=True, gridColor="rgba(128,128,128,0.12)", labelFontSize=12, titleFontSize=13, titleFontWeight=600, ) .configure_view(strokeWidth=0) .interactive() ) st.altair_chart(chart, use_container_width=True) # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ # β 6. BibTeX Citation β # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.markdown("