# rat-bench / src/streamlit_app.py
# (repository-viewer header captured with the file: "matthieumeeus97's picture",
#  latest commit "adding bibtex", 261b840)
import html
import math
import os
from base64 import b64encode
from pathlib import Path

import altair as alt
import pandas as pd
import streamlit as st
from datasets import load_dataset
# --- Page setup ---
# Configure the browser-tab title/icon and the page layout.
st.set_page_config(
    page_title="RAT-Bench Leaderboard",
    page_icon="📊",  # bar-chart emoji; the previous literal was a mis-decoded UTF-8 sequence
    layout="centered",
)
# --- Global CSS ---
# Injects one <style> element holding every rule used by the raw-HTML fragments
# rendered further down in this script: the hero banner (.hero, .link-pills),
# the metric cards (.metric-row, .metric-card), section cards/titles (.card,
# .section-title), the type badges (.badge-*), the leaderboard table (.lb-table)
# and the risk explanation boxes (.risk-boxes, .risk-box).
# The payload is a plain string literal passed with unsafe_allow_html=True.
st.markdown("""
<style>
/* ── Reset & Typography ── */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
html, body, [class*="css"] { font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; }
.block-container { max-width: 1100px; padding-top: 0; padding-bottom: 2rem; }
/* ── Hero Banner ── */
.hero {
background: linear-gradient(135deg, #1a1a2e 0%, #2d2d44 40%, #3d3d5c 100%);
border-radius: 0 0 1.5rem 1.5rem;
padding: 2.4rem 2rem 2rem 2rem;
margin: -1rem -1rem 1.8rem -1rem;
text-align: center;
color: #f0f4f8;
}
.hero-logo { max-width: 60px; border-radius: 8px; margin: 0.6rem 0 0.3rem 0; }
.hero h1 {
font-size: 2.1rem;
font-weight: 700;
margin: 0 0 0.45rem 0;
letter-spacing: -0.02em;
line-height: 1.25;
color: #ffffff;
}
.hero p {
font-size: 0.97rem;
line-height: 1.65;
color: #c0c7d0;
max-width: 800px;
margin: 0 auto;
}
.hero p b { color: #ffffff; }
.hero p a { color: #ffd470; }
/* ── Pill Link Buttons ── */
.link-pills {
display: flex; justify-content: center; gap: 0.75rem;
margin-top: 1.2rem;
}
.link-pills a {
display: inline-flex; align-items: center; gap: 0.4rem;
padding: 0.45rem 1.1rem;
font-size: 0.88rem; font-weight: 600;
color: #e0e7ee;
text-decoration: none;
border: 1px solid rgba(255,255,255,0.2);
border-radius: 9999px;
backdrop-filter: blur(4px);
transition: all 0.2s ease;
}
.link-pills a:hover {
background: rgba(255,215,100,0.15);
color: #fff;
border-color: rgba(255,215,100,0.4);
transform: translateY(-1px);
}
/* ── Metric Cards Row ── */
.metric-row {
display: flex; gap: 1rem; justify-content: center;
margin-bottom: 1.6rem; flex-wrap: wrap;
}
.metric-card {
flex: 1; min-width: 160px; max-width: 260px;
background: var(--secondary-background-color, #f8f9fa);
border-radius: 0.85rem;
padding: 1.1rem 1.3rem;
text-align: center;
box-shadow: 0 1px 4px rgba(0,0,0,0.06);
border: 1px solid rgba(128,128,128,0.1);
}
.metric-card .metric-label {
font-size: 0.78rem; font-weight: 500;
color: var(--text-color-secondary, #666);
text-transform: uppercase; letter-spacing: 0.04em;
margin-bottom: 0.2rem;
}
.metric-card .metric-value {
font-size: 1.2rem; font-weight: 700;
color: var(--text-color, #222);
}
/* ── Section Card ── */
.card {
background: var(--secondary-background-color, #f8f9fa);
border-radius: 1rem;
padding: 1.6rem 1.8rem;
margin: 0 auto 1.5rem auto;
max-width: 850px;
box-shadow: 0 1px 6px rgba(0,0,0,0.05);
border: 1px solid rgba(128,128,128,0.08);
}
.card h2 {
font-size: 1.35rem; font-weight: 700;
margin: 0 0 0.6rem 0;
text-align: center;
color: var(--text-color, #222);
}
.card p, .card div {
font-size: 0.95rem; line-height: 1.65;
color: var(--text-color, #444);
}
/* ── Section Titles ── */
.section-title {
text-align: center;
font-size: 1.55rem;
font-weight: 700;
margin: 0.5rem 0 0.35rem 0;
color: var(--text-color, #222);
}
/* ── Type Badges ── */
.badge {
display: inline-block;
padding: 0.18rem 0.65rem;
border-radius: 9999px;
font-size: 0.78rem;
font-weight: 600;
letter-spacing: 0.02em;
white-space: nowrap;
}
.badge-ner { background: #dbeafe; color: #1e40af; }
.badge-llm { background: #ede9fe; color: #5b21b6; }
.badge-perturb { background: #fef3c7; color: #92400e; }
.badge-baseline { background: #e5e7eb; color: #374151; }
/* ── Leaderboard Table ── */
.lb-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
font-size: 0.9rem;
}
.lb-table thead th {
background: var(--secondary-background-color, #f1f3f5);
padding: 0.7rem 0.65rem;
font-weight: 600;
font-size: 0.78rem;
text-transform: uppercase;
letter-spacing: 0.04em;
color: var(--text-color-secondary, #555);
border-bottom: 2px solid rgba(128,128,128,0.15);
text-align: center;
}
.lb-table thead th:first-child { text-align: center; border-radius: 0.5rem 0 0 0; }
.lb-table thead th:last-child { border-radius: 0 0.5rem 0 0; }
.lb-table tbody td {
padding: 0.6rem 0.65rem;
border-bottom: 1px solid rgba(128,128,128,0.08);
text-align: center;
vertical-align: middle;
}
.lb-table tbody td:nth-child(1) { font-weight: 700; width: 3rem; }
.lb-table tbody td:nth-child(2) { text-align: left; font-weight: 500; }
.lb-table tbody td:nth-child(3) { text-align: center; }
.lb-table tbody tr:hover { background: rgba(128,128,128,0.04); }
.lb-table .baseline-row {
background: rgba(200,200,200,0.15);
}
.lb-table .baseline-row td { font-weight: 600; color: var(--text-color-secondary, #666); }
/* ── Risk explanation boxes ── */
.risk-boxes {
display: flex; gap: 1.2rem; margin-top: 1rem;
flex-wrap: wrap; justify-content: center;
}
.risk-box {
flex: 1; min-width: 260px; max-width: 420px;
border-radius: 0.75rem;
padding: 1.2rem 1.4rem;
border: 1px solid rgba(128,128,128,0.1);
}
.risk-box .box-title { font-weight: 700; font-size: 0.95rem; margin-bottom: 0.35rem; }
.risk-box .box-title .kw-direct { color: darkorange; }
.risk-box .box-title .kw-indirect { color: darkblue; }
.risk-box .box-desc { font-size: 0.85rem; color: var(--text-color-secondary, #666); line-height: 1.5; }
</style>
""", unsafe_allow_html=True)
# ── 1. Hero Banner ──
# The logo is read from images/Rat_Bench.png next to this script and inlined
# as a base64 data URI inside the banner markup.
logo_path = Path(__file__).parent / "images" / "Rat_Bench.png"
logo_b64 = b64encode(logo_path.read_bytes()).decode("utf-8")

# The "Paper" pill points at the arXiv identifier cited in the BibTeX entry at
# the bottom of this page (2602.12806) instead of the XXXX.XXXXX placeholder.
st.markdown(f"""
<div class="hero">
<h1>RAT-Bench: A Comprehensive Benchmark for Text Anonymization</h1>
<img src="data:image/png;base64,{logo_b64}" class="hero-logo" style="width:120px; height:auto;" alt="RAT-Bench">
<p>
<b>RAT-Bench</b> is a synthetic benchmark for evaluating how well anonymization tools
prevent <b>re-identification</b> of individuals in text.<br>
Using U.S. demographic statistics, we generate text with direct and indirect identifiers,
anonymize it, and measure how easily an LLM-based attacker can still re-identify people.
</p>
<p style="margin-top:0.7rem; font-size:0.92rem; color:#a0a8b4;">
<i>Curious how your tool compares?</i> Follow the instructions in
<a href="https://github.com/imperial-aisp/rat-bench" target="_blank">our repo</a> and send us your results!
</p>
<div class="link-pills">
<a href="https://arxiv.org/abs/2602.12806" target="_blank">📄 Paper</a>
<a href="https://github.com/imperial-aisp/rat-bench" target="_blank">💻 Code</a>
<a href="https://huggingface.co/datasets/imperial-cpg/rat-bench" target="_blank">🗂️ Data</a>
</div>
</div>
""", unsafe_allow_html=True)
# ── Load Data ──
@st.cache_data
def load_results():
    """Fetch the published benchmark results as a pandas DataFrame.

    Loads the ``train`` split of the ``imperial-cpg/rat-bench-results``
    dataset, passing the value of the ``HF_TOKEN`` environment variable
    (``None`` when it is unset) as the access token, and converts the
    result with ``to_pandas()``. The function is wrapped in
    ``st.cache_data``.
    """
    hf_token = os.environ.get("HF_TOKEN")
    results = load_dataset(
        "imperial-cpg/rat-bench-results",
        split="train",
        token=hf_token,
    )
    return results.to_pandas()


df = load_results()
# ── 2. Metric Summary Cards ──
tool_col = "Anonymization tool"
baseline_name = "No anonymization"
languages = ["English", "Spanish", "Simplified Chinese"]
num_langs = len(languages)
bleu_col_src = "BLEU score (English, Explicit avg)"


def _lowest_risk_tool(frame, risk_col, name_col, default="N/A"):
    """Return the ``name_col`` value of the row with the smallest ``risk_col``.

    Rows whose ``risk_col`` is missing are ignored. ``default`` is returned
    when no row remains, so an empty or all-NaN selection yields a placeholder
    instead of an exception from ``idxmin()``.
    """
    ranked = frame.dropna(subset=[risk_col])
    if ranked.empty:
        return default
    return ranked.loc[ranked[risk_col].idxmin(), name_col]


# Every row except the "No anonymization" baseline (compared case-insensitively,
# ignoring surrounding whitespace).
non_baseline = df[df[tool_col].str.strip().str.lower() != baseline_name.lower()]
num_tools = len(non_baseline)
best_tool = _lowest_risk_tool(non_baseline, "English Avg", tool_col)

# Best risk-BLEU tradeoff: lowest risk among tools with above-median BLEU
with_bleu = non_baseline.dropna(subset=[bleu_col_src, "English Avg"])
median_bleu = with_bleu[bleu_col_src].median()
# When with_bleu is empty the median is NaN, the comparison is all-False and
# good_bleu is empty; _lowest_risk_tool then falls back to its placeholder.
good_bleu = with_bleu[with_bleu[bleu_col_src] >= median_bleu]
best_tradeoff = _lowest_risk_tool(good_bleu, "English Avg", tool_col)

# Tool names come from the downloaded dataset and are placed into raw HTML,
# so they are escaped before interpolation.
st.markdown(f"""
<div class="metric-row">
<div class="metric-card">
<div class="metric-label">Tools Evaluated</div>
<div class="metric-value">{num_tools}</div>
</div>
<div class="metric-card">
<div class="metric-label">Lowest Avg Risk (EN)</div>
<div class="metric-value">{html.escape(str(best_tool))}</div>
</div>
<div class="metric-card">
<div class="metric-label">Best Risk-BLEU Tradeoff</div>
<div class="metric-value">{html.escape(str(best_tradeoff))}</div>
</div>
<div class="metric-card">
<div class="metric-label">Languages</div>
<div class="metric-value">{num_langs}</div>
</div>
</div>
""", unsafe_allow_html=True)
# ── 3. Leaderboard Table ──
st.markdown('<div class="section-title">Leaderboard</div>', unsafe_allow_html=True)
st.caption(
    "<p style='text-align:center;'>Toggle which results to display. "
    "The <i>No anonymization</i> baseline is pinned on top (not ranked). "
    "Tools are ranked by <b>Average Risk</b> (lower is better).</p>",
    unsafe_allow_html=True,
)

# Two controls in the middle of a four-column row; the outer columns are padding.
_, c1, c2, _ = st.columns([1, 2, 2, 1], gap="medium")
with c1:
    language = st.selectbox("Language", languages)
with c2:
    st.write("")  # vertical spacer
    show_levels = st.checkbox("Show difficulty levels", value=True, key="levels_cb")

# --- Build display table ---
avg_col = f"{language} Avg"
work = df.copy()
work["Average Risk (Explicit)"] = work[avg_col]
work = work.dropna(subset=[avg_col])

# Ranked tools are sorted by ascending average risk and numbered from 1;
# baseline rows keep their order, get an em dash as rank and go first.
baseline_mask = work[tool_col].str.strip().str.lower() == baseline_name.lower()
others = work[~baseline_mask].sort_values(avg_col).reset_index(drop=True)
others["Rank"] = (others.index + 1).astype(str)
baselines = work[baseline_mask].copy()
baselines["Rank"] = "—"  # was a mis-decoded em dash
final = pd.concat([baselines, others], ignore_index=True)

# Column selection: the hard/implicit levels and the BLEU score are only
# shown for English.
cols = ["Rank", tool_col, "Type"]
if not show_levels:
    cols += ["Average Risk (Explicit)"]
elif language == "English":
    cols += [
        f"{language} Explicit (easy)",
        f"{language} Explicit (hard)",
        "Average Risk (Explicit)",
        f"{language} Implicit",
    ]
else:
    cols += [f"{language} Explicit (easy)", "Average Risk (Explicit)"]
if language == "English":
    cols += [f"BLEU score ({language}, Explicit avg)"]

# Strip the language prefix from the headers shown to the user.
rename_map = {
    f"{language} Explicit (easy)": "Explicit (easy)",
    f"{language} Explicit (hard)": "Explicit (hard)",
    f"{language} Implicit": "Implicit",
    f"BLEU score ({language}, Explicit avg)": "Avg BLEU (Explicit)",
}
display = final[cols].rename(columns=rename_map)
# --- Badge helper ---
BADGE_CLS = {
"NER-based": "badge-ner",
"LLM-based": "badge-llm",
"Perturbation": "badge-perturb",
"Baseline": "badge-baseline",
}
def _badge(typ: str) -> str:
cls = BADGE_CLS.get(typ, "badge-baseline")
return f'<span class="badge {cls}">{typ}</span>'
# --- Risk heatmap color (green→yellow→red) ---
def _risk_color(val, lo=0, hi=100):
"""Return a CSS background for risk values: green(0) -> yellow(50) -> red(100)."""
try:
v = float(val)
except (ValueError, TypeError):
return ""
t = max(0.0, min(1.0, (v - lo) / (hi - lo)))
if t <= 0.5:
r = int(76 + (t / 0.5) * (234 - 76))
g = int(175 + (t / 0.5) * (179 - 175))
b = int(80 + (t / 0.5) * (8 - 80))
else:
r = int(234 + ((t - 0.5) / 0.5) * (220 - 234))
g = int(179 - ((t - 0.5) / 0.5) * (179 - 53))
b = int(8 + ((t - 0.5) / 0.5) * (69 - 8))
return f"background:rgba({r},{g},{b},0.22); font-weight:600;"
# --- BLEU heatmap color (red→yellow→green, higher=better) ---
def _bleu_color(val, lo=0.5, hi=1.0):
"""Return a CSS background for BLEU values: red(low) -> yellow(mid) -> green(high)."""
try:
v = float(val)
except (ValueError, TypeError):
return ""
t = max(0.0, min(1.0, (v - lo) / (hi - lo)))
if t <= 0.5:
# red to yellow
r = int(220 + (t / 0.5) * (234 - 220))
g = int(53 + (t / 0.5) * (179 - 53))
b = int(69 + (t / 0.5) * (8 - 69))
else:
# yellow to green
r = int(234 - ((t - 0.5) / 0.5) * (234 - 76))
g = int(179 - ((t - 0.5) / 0.5) * (179 - 175))
b = int(8 + ((t - 0.5) / 0.5) * (80 - 8))
return f"background:rgba({r},{g},{b},0.22); font-weight:600;"
# Risk value columns in the display table
risk_cols = {"Explicit (easy)", "Explicit (hard)", "Implicit", "Average Risk (Explicit)"}
bleu_col_name = "Avg BLEU (Explicit)"
missing_mark = "—"  # shown for absent values (was a mis-decoded em dash)

# --- Build HTML table ---
# One <tr> per row of `display`. Numeric cells are formatted (risk: 1 decimal,
# BLEU: 2 decimals); heat-map colours are applied to non-baseline rows only,
# and never to missing values. Free-text cells are HTML-escaped because they
# come from the dataset and the table is rendered as raw HTML.
html_rows = []
for _, row in display.iterrows():
    is_baseline = str(row.get(tool_col, "")).strip().lower() == baseline_name.lower()
    tr_cls = ' class="baseline-row"' if is_baseline else ""
    cells = []
    for col in display.columns:
        val = row[col]
        if col == "Type":
            cells.append(f"<td>{_badge(str(val))}</td>")
        elif col in risk_cols or col == bleu_col_name:
            is_bleu = col == bleu_col_name
            if pd.isna(val):
                cells.append(f"<td>{missing_mark}</td>")
                continue
            formatted = f"{val:.2f}" if is_bleu else f"{val:.1f}"
            if is_baseline:
                cells.append(f"<td>{formatted}</td>")
            else:
                style = _bleu_color(val) if is_bleu else _risk_color(val)
                cells.append(f'<td style="{style}">{formatted}</td>')
        else:
            cells.append(f"<td>{html.escape(str(val))}</td>")
    html_rows.append(f"<tr{tr_cls}>{''.join(cells)}</tr>")

header_cells = "".join(f"<th>{html.escape(str(c))}</th>" for c in display.columns)
table_html = f"""
<table class="lb-table">
<thead><tr>{header_cells}</tr></thead>
<tbody>{''.join(html_rows)}</tbody>
</table>
"""
st.markdown(table_html, unsafe_allow_html=True)
# ── 4. Re-identification Risk Explanation + Overview Figure ──
# Static explanatory card: a short intro followed by two side-by-side boxes
# (direct vs. indirect identifiers).
risk_card_html = """
<div class="card">
<h2>How Re-identification Risk Is Computed</h2>
<p>
We measure how much identifying information survives anonymization.
An LLM-based attacker reads the anonymized text and attempts to recover identifying attributes.
</p>
<div class="risk-boxes">
<div class="risk-box">
<div class="box-title"><span class="kw-direct">Direct</span> Identifiers</div>
<div class="box-desc">If any <span class="kw-direct">direct</span> identifier (e.g., full address, SSN) is recovered by the attacker, the re-identification risk is automatically set to <b>1</b>.</div>
</div>
<div class="risk-box">
<div class="box-title"><span class="kw-indirect">Indirect</span> Identifiers</div>
<div class="box-desc">Otherwise, risk is computed from the set of <span class="kw-indirect">indirect</span> identifiers recovered (state of residence, date of birth, marital status, …). The risk equals the probability that their combination uniquely identifies the individual in the population.</div>
</div>
</div>
</div>
"""
figure_caption_html = """<p style='text-align: center; font-size: 0.9rem; color: var(--text-color-secondary, #555);'>
Figure: Re-identification risk based on direct and indirect identifiers.
</p>"""

st.markdown("<br>", unsafe_allow_html=True)
st.markdown(risk_card_html, unsafe_allow_html=True)

# Original overview figure, read from images/overview.png next to this script
# and inlined as a base64 data URI.
overview_path = Path(__file__).parent / "images" / "overview.png"
with open(overview_path, "rb") as f:
    overview_bytes = f.read()
overview_b64 = b64encode(overview_bytes).decode("utf-8")

st.markdown(f"""<div style='display: flex; justify-content: center; margin-bottom: 0.5rem;'>
<img src="data:image/png;base64,{overview_b64}" style="max-width:80%; border-radius:8px;">
</div>""", unsafe_allow_html=True)
st.markdown(figure_caption_html, unsafe_allow_html=True)
# ── 5. Interactive Risk vs BLEU Scatter (Altair) ──
st.markdown('<div class="section-title">Re-identification Risk vs. BLEU Score</div>', unsafe_allow_html=True)
scatter_intro_html = (
    "<p style='text-align:center; max-width:750px; margin:0 auto 1rem auto; font-size:0.93rem; "
    "color:var(--text-color-secondary,#666); line-height:1.55;'>"
    "A good anonymizer sits in the <b>lower-right corner</b>: low risk, high BLEU (text utility preserved). "
    "Hover over points for details.</p>"
)
st.markdown(scatter_intro_html, unsafe_allow_html=True)

# Plot data: rows that have both an English BLEU score and an English average
# risk, with the baseline row removed and columns renamed for display.
scatter_bleu_src = "BLEU score (English, Explicit avg)"
scatter_risk_src = "English Avg"
scatter_df = df.dropna(subset=[scatter_bleu_src, scatter_risk_src]).copy()
not_baseline = scatter_df[tool_col].str.strip().str.lower() != baseline_name.lower()
scatter_df = scatter_df[not_baseline]
scatter_df = scatter_df.rename(columns={
    scatter_risk_src: "Average Risk",
    scatter_bleu_src: "BLEU Score",
    tool_col: "Tool",
})

# One fixed colour per tool type, shared by the points and their labels.
type_colors = alt.Scale(
    domain=["NER-based", "LLM-based", "Perturbation"],
    range=["#3b82f6", "#8b5cf6", "#f59e0b"],
)

# Layer 1: the circles, with explicit axis domains, a legend and tooltips.
x_axis = alt.X("BLEU Score:Q", scale=alt.Scale(domain=[0.5, 1.0]), title="BLEU Score (higher = more utility)")
y_axis = alt.Y("Average Risk:Q", scale=alt.Scale(domain=[20, 100]), title="Average Risk % (lower = safer)")
point_color = alt.Color("Type:N", scale=type_colors, legend=alt.Legend(title="Type", orient="bottom"))
point_tooltip = [
    "Tool:N",
    "Type:N",
    alt.Tooltip("Average Risk:Q", format=".1f"),
    alt.Tooltip("BLEU Score:Q", format=".2f"),
]
points = (
    alt.Chart(scatter_df)
    .mark_circle(size=120, opacity=0.85, stroke="#fff", strokeWidth=1)
    .encode(x=x_axis, y=y_axis, color=point_color, tooltip=point_tooltip)
)

# Layer 2: the tool name next to each point, coloured like the point but
# without adding a second legend.
label_color = alt.Color("Type:N", scale=type_colors, legend=None)
labels = (
    alt.Chart(scatter_df)
    .mark_text(align="left", dx=8, dy=-6, fontSize=11, fontWeight=500)
    .encode(
        x="BLEU Score:Q",
        y="Average Risk:Q",
        text="Tool:N",
        color=label_color,
    )
)

layered = points + labels
chart = (
    layered
    .properties(width=500, height=380)
    .configure_axis(
        grid=True,
        gridColor="rgba(128,128,128,0.12)",
        labelFontSize=12,
        titleFontSize=13,
        titleFontWeight=600,
    )
    .configure_view(strokeWidth=0)
    .interactive()
)
st.altair_chart(chart, use_container_width=True)
# ── 6. BibTeX Citation ──
st.markdown("<br>", unsafe_allow_html=True)
st.markdown('<div class="section-title">BibTeX</div>', unsafe_allow_html=True)
st.markdown("If you found this useful for your work, please cite:")
# Raw string literal: in a normal literal "\v" is the vertical-tab escape, which
# would replace the LaTeX háček accents \v{c} / \v{s} with a control character
# and make the copied entry invalid.
st.code(r"""@article{krvco2026rat,
title={RAT-Bench: A Comprehensive Benchmark for Text Anonymization},
author={Kr{\v{c}}o, Nata{\v{s}}a and Yao, Zexi and Meeus, Matthieu and de Montjoye, Yves-Alexandre},
journal={arXiv preprint arXiv:2602.12806},
year={2026}
}""", language="bibtex")