# Hugging Face Space: imperial-cpg/rat-bench (status: Running)
# File size: 21,647 Bytes

import html
import math
import os
from base64 import b64encode
from pathlib import Path

import altair as alt
import pandas as pd
import streamlit as st
from datasets import load_dataset

# --- Page setup ---
# Sets the browser-tab title and icon and selects the "centered" layout.
# This is the first Streamlit call made after the imports; the container width
# is further adjusted by the `.block-container` rule in the global CSS below.
st.set_page_config(
    page_title="RAT-Bench Leaderboard",
    page_icon="📊",
    layout="centered",
)

# --- Global CSS ---
# Injects one <style> element that defines every class used by the raw-HTML
# fragments rendered further down: the hero banner (.hero, .link-pills), the
# metric cards (.metric-row, .metric-card), section cards and titles (.card,
# .section-title), tool-type badges (.badge-*), the leaderboard table
# (.lb-table) and the risk explanation boxes (.risk-boxes, .risk-box).
# `unsafe_allow_html=True` is needed so the <style> tag is emitted as markup
# rather than shown as text.
st.markdown("""
<style>
/* ── Reset & Typography ── */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
html, body, [class*="css"] { font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; }
.block-container { max-width: 1100px; padding-top: 0; padding-bottom: 2rem; }

/* ── Hero Banner ── */
.hero {
    background: linear-gradient(135deg, #1a1a2e 0%, #2d2d44 40%, #3d3d5c 100%);
    border-radius: 0 0 1.5rem 1.5rem;
    padding: 2.4rem 2rem 2rem 2rem;
    margin: -1rem -1rem 1.8rem -1rem;
    text-align: center;
    color: #f0f4f8;
}
.hero-logo { max-width: 60px; border-radius: 8px; margin: 0.6rem 0 0.3rem 0; }
.hero h1 {
    font-size: 2.1rem;
    font-weight: 700;
    margin: 0 0 0.45rem 0;
    letter-spacing: -0.02em;
    line-height: 1.25;
    color: #ffffff;
}
.hero p {
    font-size: 0.97rem;
    line-height: 1.65;
    color: #c0c7d0;
    max-width: 800px;
    margin: 0 auto;
}
.hero p b { color: #ffffff; }
.hero p a { color: #ffd470; }

/* ── Pill Link Buttons ── */
.link-pills {
    display: flex; justify-content: center; gap: 0.75rem;
    margin-top: 1.2rem;
}
.link-pills a {
    display: inline-flex; align-items: center; gap: 0.4rem;
    padding: 0.45rem 1.1rem;
    font-size: 0.88rem; font-weight: 600;
    color: #e0e7ee;
    text-decoration: none;
    border: 1px solid rgba(255,255,255,0.2);
    border-radius: 9999px;
    backdrop-filter: blur(4px);
    transition: all 0.2s ease;
}
.link-pills a:hover {
    background: rgba(255,215,100,0.15);
    color: #fff;
    border-color: rgba(255,215,100,0.4);
    transform: translateY(-1px);
}

/* ── Metric Cards Row ── */
.metric-row {
    display: flex; gap: 1rem; justify-content: center;
    margin-bottom: 1.6rem; flex-wrap: wrap;
}
.metric-card {
    flex: 1; min-width: 160px; max-width: 260px;
    background: var(--secondary-background-color, #f8f9fa);
    border-radius: 0.85rem;
    padding: 1.1rem 1.3rem;
    text-align: center;
    box-shadow: 0 1px 4px rgba(0,0,0,0.06);
    border: 1px solid rgba(128,128,128,0.1);
}
.metric-card .metric-label {
    font-size: 0.78rem; font-weight: 500;
    color: var(--text-color-secondary, #666);
    text-transform: uppercase; letter-spacing: 0.04em;
    margin-bottom: 0.2rem;
}
.metric-card .metric-value {
    font-size: 1.2rem; font-weight: 700;
    color: var(--text-color, #222);
}

/* ── Section Card ── */
.card {
    background: var(--secondary-background-color, #f8f9fa);
    border-radius: 1rem;
    padding: 1.6rem 1.8rem;
    margin: 0 auto 1.5rem auto;
    max-width: 850px;
    box-shadow: 0 1px 6px rgba(0,0,0,0.05);
    border: 1px solid rgba(128,128,128,0.08);
}
.card h2 {
    font-size: 1.35rem; font-weight: 700;
    margin: 0 0 0.6rem 0;
    text-align: center;
    color: var(--text-color, #222);
}
.card p, .card div {
    font-size: 0.95rem; line-height: 1.65;
    color: var(--text-color, #444);
}

/* ── Section Titles ── */
.section-title {
    text-align: center;
    font-size: 1.55rem;
    font-weight: 700;
    margin: 0.5rem 0 0.35rem 0;
    color: var(--text-color, #222);
}

/* ── Type Badges ── */
.badge {
    display: inline-block;
    padding: 0.18rem 0.65rem;
    border-radius: 9999px;
    font-size: 0.78rem;
    font-weight: 600;
    letter-spacing: 0.02em;
    white-space: nowrap;
}
.badge-ner       { background: #dbeafe; color: #1e40af; }
.badge-llm       { background: #ede9fe; color: #5b21b6; }
.badge-perturb   { background: #fef3c7; color: #92400e; }
.badge-baseline  { background: #e5e7eb; color: #374151; }

/* ── Leaderboard Table ── */
.lb-table {
    width: 100%;
    border-collapse: separate;
    border-spacing: 0;
    font-size: 0.9rem;
}
.lb-table thead th {
    background: var(--secondary-background-color, #f1f3f5);
    padding: 0.7rem 0.65rem;
    font-weight: 600;
    font-size: 0.78rem;
    text-transform: uppercase;
    letter-spacing: 0.04em;
    color: var(--text-color-secondary, #555);
    border-bottom: 2px solid rgba(128,128,128,0.15);
    text-align: center;
}
.lb-table thead th:first-child { text-align: center; border-radius: 0.5rem 0 0 0; }
.lb-table thead th:last-child { border-radius: 0 0.5rem 0 0; }
.lb-table tbody td {
    padding: 0.6rem 0.65rem;
    border-bottom: 1px solid rgba(128,128,128,0.08);
    text-align: center;
    vertical-align: middle;
}
.lb-table tbody td:nth-child(1) { font-weight: 700; width: 3rem; }
.lb-table tbody td:nth-child(2) { text-align: left; font-weight: 500; }
.lb-table tbody td:nth-child(3) { text-align: center; }
.lb-table tbody tr:hover { background: rgba(128,128,128,0.04); }
.lb-table .baseline-row {
    background: rgba(200,200,200,0.15);
}
.lb-table .baseline-row td { font-weight: 600; color: var(--text-color-secondary, #666); }

/* ── Risk explanation boxes ── */
.risk-boxes {
    display: flex; gap: 1.2rem; margin-top: 1rem;
    flex-wrap: wrap; justify-content: center;
}
.risk-box {
    flex: 1; min-width: 260px; max-width: 420px;
    border-radius: 0.75rem;
    padding: 1.2rem 1.4rem;
    border: 1px solid rgba(128,128,128,0.1);
}
.risk-box .box-title { font-weight: 700; font-size: 0.95rem; margin-bottom: 0.35rem; }
.risk-box .box-title .kw-direct { color: darkorange; }
.risk-box .box-title .kw-indirect { color: darkblue; }
.risk-box .box-desc { font-size: 0.85rem; color: var(--text-color-secondary, #666); line-height: 1.5; }
</style>
""", unsafe_allow_html=True)

# ╔══════════════════════════════════════════════════════════════╗
# ║  1. Hero Banner                                             ║
# ╚══════════════════════════════════════════════════════════════╝
# Embed the Rat_Bench logo as a base64 data URI so the <img> tag in the hero
# markup carries the image bytes inline instead of pointing at a file URL.
logo_path = Path(__file__).parent / "images" / "Rat_Bench.png"
logo_b64 = b64encode(logo_path.read_bytes()).decode("utf-8")

# Hero banner: title, logo, short description and the Paper / Code / Data pills.
# The Paper pill uses the arXiv identifier cited in this file's BibTeX entry
# (arXiv:2602.12806); it previously pointed at the placeholder "XXXX.XXXXX".
st.markdown(f"""
<div class="hero">
    <h1>RAT-Bench: A Comprehensive Benchmark for Text Anonymization</h1>
    <img src="data:image/png;base64,{logo_b64}" class="hero-logo" style="width:120px; height:auto;" alt="RAT-Bench">
    <p>
        <b>RAT-Bench</b> is a synthetic benchmark for evaluating how well anonymization tools
        prevent <b>re-identification</b> of individuals in text.<br>
        Using U.S. demographic statistics, we generate text with direct and indirect identifiers,
        anonymize it, and measure how easily an LLM-based attacker can still re-identify people.
    </p>
    <p style="margin-top:0.7rem; font-size:0.92rem; color:#a0a8b4;">
        <i>Curious how your tool compares?</i> Follow the instructions in
        <a href="https://github.com/imperial-aisp/rat-bench" target="_blank">our repo</a> and send us your results!
    </p>
    <div class="link-pills">
        <a href="https://arxiv.org/abs/2602.12806" target="_blank">📄 Paper</a>
        <a href="https://github.com/imperial-aisp/rat-bench" target="_blank">💻 Code</a>
        <a href="https://huggingface.co/datasets/imperial-cpg/rat-bench" target="_blank">🗂️ Data</a>
    </div>
</div>
""", unsafe_allow_html=True)

# ╔══════════════════════════════════════════════════════════════╗
# ║  Load Data                                                   ║
# ╚══════════════════════════════════════════════════════════════╝
@st.cache_data
def load_results():
    """Fetch the ``train`` split of the results dataset as a pandas DataFrame.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable (``None`` when unset).  The result is memoised by
    ``st.cache_data``.
    """
    hf_token = os.environ.get("HF_TOKEN")
    results = load_dataset("imperial-cpg/rat-bench-results", split="train", token=hf_token)
    return results.to_pandas()


# Module-level results table shared by every section below.
df = load_results()

# ╔══════════════════════════════════════════════════════════════╗
# ║  2. Metric Summary Cards                                    ║
# ╚══════════════════════════════════════════════════════════════╝
tool_col = "Anonymization tool"
baseline_name = "No anonymization"


def _lowest_risk_tool(frame, risk_col="English Avg"):
    """Return the HTML-escaped name of the tool with the lowest ``risk_col``.

    Rows whose risk value is missing are ignored.  When no row has a value
    (empty frame or all-NaN column) the placeholder "—" is returned instead
    of letting ``idxmin`` raise, so the summary cards still render.
    """
    scores = frame[risk_col].dropna()
    if scores.empty:
        return "—"
    # The name is interpolated into raw HTML below, so escape it.
    return html.escape(str(frame.loc[scores.idxmin(), tool_col]))


# Every row except the "No anonymization" baseline (case/whitespace-insensitive).
non_baseline = df[df[tool_col].str.strip().str.lower() != baseline_name.lower()]
num_tools = len(non_baseline)
best_tool = _lowest_risk_tool(non_baseline)
languages = ["English", "Spanish", "Simplified Chinese"]
num_langs = len(languages)

# Best risk-BLEU tradeoff: lowest risk among tools with above-median BLEU
bleu_col_src = "BLEU score (English, Explicit avg)"
with_bleu = non_baseline.dropna(subset=[bleu_col_src, "English Avg"])
median_bleu = with_bleu[bleu_col_src].median()
good_bleu = with_bleu[with_bleu[bleu_col_src] >= median_bleu]
best_tradeoff = _lowest_risk_tool(good_bleu)

st.markdown(f"""
<div class="metric-row">
    <div class="metric-card">
        <div class="metric-label">Tools Evaluated</div>
        <div class="metric-value">{num_tools}</div>
    </div>
    <div class="metric-card">
        <div class="metric-label">Lowest Avg Risk (EN)</div>
        <div class="metric-value">{best_tool}</div>
    </div>
    <div class="metric-card">
        <div class="metric-label">Best Risk-BLEU Tradeoff</div>
        <div class="metric-value">{best_tradeoff}</div>
    </div>
    <div class="metric-card">
        <div class="metric-label">Languages</div>
        <div class="metric-value">{num_langs}</div>
    </div>
</div>
""", unsafe_allow_html=True)

# ╔══════════════════════════════════════════════════════════════╗
# ║  3. Leaderboard Table                                       ║
# ╚══════════════════════════════════════════════════════════════╝
# Section heading plus a centred caption describing the controls and ranking.
st.markdown('<div class="section-title">Leaderboard</div>', unsafe_allow_html=True)
st.caption(
    "<p style='text-align:center;'>Toggle which results to display. "
    "The <i>No anonymization</i> baseline is pinned on top (not ranked). "
    "Tools are ranked by <b>Average Risk</b> (lower is better).</p>",
    unsafe_allow_html=True,
)

# Four columns weighted 1:2:2:1; the two outer ones are discarded so the
# widgets occupy only the two middle columns.
_, c1, c2, _ = st.columns([1, 2, 2, 1], gap="medium")
with c1:
    # Selected language drives which "<language> ..." dataset columns are shown.
    language = st.selectbox("Language", languages)
with c2:
    st.write("")  # vertical spacer
    # When checked (the default), per-difficulty columns are added to the table.
    show_levels = st.checkbox("Show difficulty levels", value=True, key="levels_cb")

# --- Build display table ---
# Column names in the results frame that depend on the selected language.
lang_avg_col = f"{language} Avg"
lang_easy_col = f"{language} Explicit (easy)"
lang_hard_col = f"{language} Explicit (hard)"
lang_implicit_col = f"{language} Implicit"
lang_bleu_col = f"BLEU score ({language}, Explicit avg)"
showing_english = language == "English"

# Work on a copy, expose the language average under a fixed display name and
# drop rows that have no average for this language.
work = df.copy()
work["Average Risk (Explicit)"] = work[lang_avg_col]
work = work.dropna(subset=[lang_avg_col])

# Baseline rows are pinned first and unranked ("—"); all other rows are ranked
# 1..n by ascending average risk.
baseline_mask = work[tool_col].str.strip().str.lower() == baseline_name.lower()
baselines = work[baseline_mask].copy()
baselines["Rank"] = "—"
others = work[~baseline_mask].sort_values(lang_avg_col).reset_index(drop=True)
others["Rank"] = (others.index + 1).astype(str)
final = pd.concat([baselines, others], ignore_index=True)

# Risk columns: only the average when levels are hidden; the full breakdown for
# English; easy + average for the other languages.
if not show_levels:
    level_cols = ["Average Risk (Explicit)"]
elif showing_english:
    level_cols = [lang_easy_col, lang_hard_col, "Average Risk (Explicit)", lang_implicit_col]
else:
    level_cols = [lang_easy_col, "Average Risk (Explicit)"]

# The BLEU column is appended for English only.
trailing_cols = [lang_bleu_col] if showing_english else []
cols = ["Rank", tool_col, "Type", *level_cols, *trailing_cols]

# Strip the language prefix from the headers shown to the user.
rename_map = {
    lang_easy_col: "Explicit (easy)",
    lang_hard_col: "Explicit (hard)",
    lang_implicit_col: "Implicit",
    lang_bleu_col: "Avg BLEU (Explicit)",
}
display = final[cols].rename(columns=rename_map)

# --- Badge helper ---
BADGE_CLS = {
    "NER-based": "badge-ner",
    "LLM-based": "badge-llm",
    "Perturbation": "badge-perturb",
    "Baseline": "badge-baseline",
}

def _badge(typ: str) -> str:
    cls = BADGE_CLS.get(typ, "badge-baseline")
    return f'<span class="badge {cls}">{typ}</span>'

# --- Risk heatmap color (green→yellow→red) ---
def _risk_color(val, lo=0, hi=100):
    """Return a CSS background for risk values: green(0) -> yellow(50) -> red(100)."""
    try:
        v = float(val)
    except (ValueError, TypeError):
        return ""
    t = max(0.0, min(1.0, (v - lo) / (hi - lo)))
    if t <= 0.5:
        r = int(76 + (t / 0.5) * (234 - 76))
        g = int(175 + (t / 0.5) * (179 - 175))
        b = int(80 + (t / 0.5) * (8 - 80))
    else:
        r = int(234 + ((t - 0.5) / 0.5) * (220 - 234))
        g = int(179 - ((t - 0.5) / 0.5) * (179 - 53))
        b = int(8 + ((t - 0.5) / 0.5) * (69 - 8))
    return f"background:rgba({r},{g},{b},0.22); font-weight:600;"

# --- BLEU heatmap color (red→yellow→green, higher=better) ---
def _bleu_color(val, lo=0.5, hi=1.0):
    """Return a CSS background for BLEU values: red(low) -> yellow(mid) -> green(high)."""
    try:
        v = float(val)
    except (ValueError, TypeError):
        return ""
    t = max(0.0, min(1.0, (v - lo) / (hi - lo)))
    if t <= 0.5:
        # red to yellow
        r = int(220 + (t / 0.5) * (234 - 220))
        g = int(53 + (t / 0.5) * (179 - 53))
        b = int(69 + (t / 0.5) * (8 - 69))
    else:
        # yellow to green
        r = int(234 - ((t - 0.5) / 0.5) * (234 - 76))
        g = int(179 - ((t - 0.5) / 0.5) * (179 - 175))
        b = int(8 + ((t - 0.5) / 0.5) * (80 - 8))
    return f"background:rgba({r},{g},{b},0.22); font-weight:600;"

# Risk value columns in the display table
risk_cols = {"Explicit (easy)", "Explicit (hard)", "Implicit", "Average Risk (Explicit)"}
bleu_col_name = "Avg BLEU (Explicit)"

# --- Build HTML table ---
# The table is emitted as raw HTML (unsafe_allow_html=True), so every value that
# originates from the dataset is HTML-escaped before interpolation.
baseline_key = baseline_name.lower()
html_rows = []
for _, row in display.iterrows():
    is_baseline = str(row.get(tool_col, "")).strip().lower() == baseline_key
    tr_cls = ' class="baseline-row"' if is_baseline else ""
    cells = []
    for col in display.columns:
        val = row[col]
        if col == "Type":
            cells.append(f"<td>{_badge(str(val))}</td>")
        elif col in risk_cols or col == bleu_col_name:
            is_bleu = col == bleu_col_name
            missing = pd.isna(val)
            # BLEU is shown with two decimals, risk with one; "—" marks a gap.
            formatted = "—" if missing else format(val, ".2f" if is_bleu else ".1f")
            if is_baseline:
                # Baseline rows are never heat-mapped.
                cells.append(f"<td>{formatted}</td>")
            else:
                # A missing value gets no colour rather than an extreme one.
                if missing:
                    style = ""
                elif is_bleu:
                    style = _bleu_color(val)
                else:
                    style = _risk_color(val)
                cells.append(f'<td style="{style}">{formatted}</td>')
        else:
            # Rank and tool name: plain text cells.
            cells.append(f"<td>{html.escape(str(val))}</td>")
    html_rows.append(f"<tr{tr_cls}>{''.join(cells)}</tr>")

header_cells = "".join(f"<th>{html.escape(str(c))}</th>" for c in display.columns)
table_html = f"""
<table class="lb-table">
    <thead><tr>{header_cells}</tr></thead>
    <tbody>{''.join(html_rows)}</tbody>
</table>
"""
st.markdown(table_html, unsafe_allow_html=True)

# ╔══════════════════════════════════════════════════════════════╗
# ║  4. Re-identification Risk Explanation + Overview Figure     ║
# ╚══════════════════════════════════════════════════════════════╝
# Spacer, then a static card explaining how direct and indirect identifiers
# contribute to the re-identification risk.
st.markdown("<br>", unsafe_allow_html=True)
st.markdown("""
<div class="card">
    <h2>How Re-identification Risk Is Computed</h2>
    <p>
        We measure how much identifying information survives anonymization.
        An LLM-based attacker reads the anonymized text and attempts to recover identifying attributes.
    </p>
    <div class="risk-boxes">
        <div class="risk-box">
            <div class="box-title"><span class="kw-direct">Direct</span> Identifiers</div>
            <div class="box-desc">If any <span class="kw-direct">direct</span> identifier (e.g., full address, SSN) is recovered by the attacker, the re-identification risk is automatically set to <b>1</b>.</div>
        </div>
        <div class="risk-box">
            <div class="box-title"><span class="kw-indirect">Indirect</span> Identifiers</div>
            <div class="box-desc">Otherwise, risk is computed from the set of <span class="kw-indirect">indirect</span> identifiers recovered (state of residence, date of birth, marital status, …). The risk equals the probability that their combination uniquely identifies the individual in the population.</div>
        </div>
    </div>
</div>
""", unsafe_allow_html=True)

# Original overview figure, inlined as a base64 data URI and centred on the page.
overview_path = Path(__file__).parent / "images" / "overview.png"
overview_b64 = b64encode(overview_path.read_bytes()).decode("utf-8")

st.markdown(f"""<div style='display: flex; justify-content: center; margin-bottom: 0.5rem;'>
    <img src="data:image/png;base64,{overview_b64}" style="max-width:80%; border-radius:8px;">
</div>""", unsafe_allow_html=True)
# Figure caption.
st.markdown("""<p style='text-align: center; font-size: 0.9rem; color: var(--text-color-secondary, #555);'>
    Figure: Re-identification risk based on direct and indirect identifiers.
</p>""", unsafe_allow_html=True)

# ╔══════════════════════════════════════════════════════════════╗
# ║  5. Interactive Risk vs BLEU Scatter (Altair)               ║
# ╚══════════════════════════════════════════════════════════════╝
# Section heading and a one-line reading guide for the chart.
st.markdown('<div class="section-title">Re-identification Risk vs. BLEU Score</div>', unsafe_allow_html=True)
st.markdown(
    "<p style='text-align:center; max-width:750px; margin:0 auto 1rem auto; font-size:0.93rem; "
    "color:var(--text-color-secondary,#666); line-height:1.55;'>"
    "A good anonymizer sits in the <b>lower-right corner</b>: low risk, high BLEU (text utility preserved). "
    "Hover over points for details.</p>",
    unsafe_allow_html=True,
)

# Plot only non-baseline tools that have both an English BLEU score and an
# English average risk, under shorter column names for the chart encodings.
is_ranked_tool = df[tool_col].str.strip().str.lower() != baseline_name.lower()
scatter_df = (
    df[is_ranked_tool]
    .dropna(subset=["BLEU score (English, Explicit avg)", "English Avg"])
    .rename(columns={
        "English Avg": "Average Risk",
        "BLEU score (English, Explicit avg)": "BLEU Score",
        tool_col: "Tool",
    })
)

# One fixed colour per tool type, shared by the points and their labels.
type_colors = alt.Scale(
    domain=["NER-based", "LLM-based", "Perturbation"],
    range=["#3b82f6", "#8b5cf6", "#f59e0b"],
)

# Axis and tooltip encodings for the point layer.
bleu_axis = alt.X(
    "BLEU Score:Q",
    scale=alt.Scale(domain=[0.5, 1.0]),
    title="BLEU Score (higher = more utility)",
)
risk_axis = alt.Y(
    "Average Risk:Q",
    scale=alt.Scale(domain=[20, 100]),
    title="Average Risk % (lower = safer)",
)
hover_fields = [
    "Tool:N",
    "Type:N",
    alt.Tooltip("Average Risk:Q", format=".1f"),
    alt.Tooltip("BLEU Score:Q", format=".2f"),
]

points = (
    alt.Chart(scatter_df)
    .mark_circle(size=120, opacity=0.85, stroke="#fff", strokeWidth=1)
    .encode(
        x=bleu_axis,
        y=risk_axis,
        color=alt.Color("Type:N", scale=type_colors, legend=alt.Legend(title="Type", orient="bottom")),
        tooltip=hover_fields,
    )
)

# Tool-name labels offset up and to the right of each point; the legend is
# suppressed here because the point layer already provides it.
labels = (
    alt.Chart(scatter_df)
    .mark_text(align="left", dx=8, dy=-6, fontSize=11, fontWeight=500)
    .encode(
        x="BLEU Score:Q",
        y="Average Risk:Q",
        text="Tool:N",
        color=alt.Color("Type:N", scale=type_colors, legend=None),
    )
)

# Layer points and labels, then apply sizing, axis styling and pan/zoom.
layered = alt.layer(points, labels).properties(width=500, height=380)
styled = layered.configure_axis(
    grid=True,
    gridColor="rgba(128,128,128,0.12)",
    labelFontSize=12,
    titleFontSize=13,
    titleFontWeight=600,
).configure_view(strokeWidth=0)
chart = styled.interactive()

st.altair_chart(chart, use_container_width=True)

# ╔══════════════════════════════════════════════════════════════╗
# ║  6. BibTeX Citation                                          ║
# ╚══════════════════════════════════════════════════════════════╝
st.markdown("<br>", unsafe_allow_html=True)
st.markdown('<div class="section-title">BibTeX</div>', unsafe_allow_html=True)
st.markdown("If you found this useful for your work, please cite:")
# Raw string on purpose: the author names use the LaTeX accent macro "\v", which
# a regular string literal would turn into a vertical-tab character and thereby
# corrupt the citation shown to (and copied by) the reader.
st.code(r"""@article{krvco2026rat,
  title={RAT-Bench: A Comprehensive Benchmark for Text Anonymization},
  author={Kr{\v{c}}o, Nata{\v{s}}a and Yao, Zexi and Meeus, Matthieu and de Montjoye, Yves-Alexandre},
  journal={arXiv preprint arXiv:2602.12806},
  year={2026}
}""", language="bibtex")