Anonymiser / app.py
Willxo's picture
Sync to current bundle: Redact/Anonymise modes, lightweight banner, pinned deps
44361bf
"""
HuggingFace Spaces entry point for the Legal Text Anonymisation demo.

This is a Spaces-tailored copy of the main repo's demo/app.py:

- NER backbone defaults to `en_core_web_sm` for fast cold-start (the
  fine-tuned RoBERTa checkpoint isn't bundled — too large for the free tier).
  Set the `SPACY_MODEL` environment variable to load a different spaCy model.
- Working directory layout is flat (no `src/` prefix), so the imports
  resolve via the local `anonymisation/` package.

To run locally for testing:

    pip install -r requirements.txt
    python -m spacy download en_core_web_sm
    python app.py
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Callable, List, Tuple
import gradio as gr
from anonymisation.mapping import SPACY_TO_TAB
from anonymisation.pipeline import LitePipeline, ProPipeline, MosaicScorer
# Module-level cache shared by the helpers below. Every slot starts as None;
# "predictor" / "backend_label" hold the NER callable and its display label,
# and "scorer" holds the mosaic scorer, so each is built at most once.
_state = {"predictor": None, "backend_label": None, "scorer": None}
def _make_spacy_predictor(model_name: str) -> Tuple[Callable, str]:
    """Build an NER callable backed by the spaCy model ``model_name``.

    Returns:
        A ``(predict, label)`` pair. ``predict(text)`` runs the loaded model
        over ``text`` and returns one
        ``(start_char, end_char, mapped_label, entity_text)`` tuple for every
        entity whose spaCy label is a key of ``SPACY_TO_TAB``; entities with
        any other label are dropped. ``label`` is ``"spaCy <model_name>"``.
    """
    import spacy  # imported here rather than at module import time

    nlp = spacy.load(model_name)

    def predict(text: str):
        spans = []
        for ent in nlp(text).ents:
            if ent.label_ not in SPACY_TO_TAB:
                continue
            spans.append(
                (ent.start_char, ent.end_char, SPACY_TO_TAB[ent.label_], ent.text)
            )
        return spans

    return predict, f"spaCy {model_name}"
def _get_predictor() -> Tuple[Callable, str]:
    """Return the ``(predict, label)`` pair, building and caching it once.

    On the first call the spaCy model name is read from the ``SPACY_MODEL``
    environment variable (default ``en_core_web_sm``), the predictor is
    built, stored in ``_state`` and a load message is printed. Later calls
    return the cached pair.
    """
    cached = _state["predictor"]
    if cached is None:
        chosen_model = os.environ.get("SPACY_MODEL", "en_core_web_sm")
        cached, backend = _make_spacy_predictor(chosen_model)
        _state.update(predictor=cached, backend_label=backend)
        print(f"[spaces] Loaded NER backend: {backend}")
        return cached, backend
    return cached, _state["backend_label"]
def _get_scorer() -> MosaicScorer:
    """Return the cached mosaic scorer, creating it on first use.

    The scorer is built from the ``"test"`` split returned by
    ``anonymisation.data.load_tab``. If anything in that path raises, the
    failure is printed and an empty scorer is cached instead, so the demo
    keeps working without the corpus.
    """
    if _state["scorer"] is not None:
        return _state["scorer"]

    try:
        from anonymisation.data import load_tab

        test_split = list(load_tab()["test"])
        scorer = MosaicScorer.from_tab(test_split)
    except Exception as exc:
        # Deliberate best-effort: fall back rather than fail the request.
        print(f"[spaces] Could not load TAB ({exc}); empty haystack.")
        scorer = MosaicScorer.empty()

    _state["scorer"] = scorer
    return scorer
def _backend_badge() -> str:
    """Return the Markdown line that names the active NER backend.

    Uses the label cached in ``_state`` when there is one; otherwise the
    predictor is loaded to obtain it.
    """
    backend = _state["backend_label"]
    if not backend:
        _, backend = _get_predictor()
    return f"**NER backend:** `{backend}` · **Coref:** Phase 5 post-processor enabled"
def _md_table_cell(value) -> str:
    """Return ``value`` as text that cannot break out of a Markdown table cell."""
    # A raw line break ends the table row and a raw pipe starts a new column,
    # so flatten the former and backslash-escape the latter.
    single_line = " ".join(str(value).splitlines())
    return single_line.replace("|", "\\|")


def _status_markdown(result, *, mosaic: bool, pseudonymise: bool) -> str:
    """Build the one-line status shown under the redacted text."""
    parts: List[str] = []
    if mosaic:
        ck = "✓ converged" if result.converged else "✗ fallback to suppression"
        parts.append(
            f"**Mosaic risk:** k_initial = `{result.mosaic_risk_initial}` → "
            f"k_final = `{result.mosaic_risk_final}` · "
            f"iterations: `{result.iterations_used}` · {ck}"
        )
    if pseudonymise:
        parts.append(f"**Vault:** {len(result.pseudonym_vault)} entries (see Vault tab)")
    if not parts:
        return "_(no status — Redact mode: direct identifiers only)_"
    return " · ".join(parts)


def _vault_markdown(vault, pseudonymise: bool) -> str:
    """Render the pseudonym vault as a Markdown table, or a placeholder note."""
    if not vault:
        if pseudonymise:
            # Pseudonymisation ran but produced no entries; saying it is
            # "off" here would contradict the status line.
            return "_(Pseudonymisation on — the vault is empty for this text)_"
        return "_(Pseudonymisation off — enable to see referential tokens here)_"
    rows = "\n".join(
        f"| `{tok}` | {_md_table_cell(orig)} |" for tok, orig in vault.items()
    )
    return "| Token | Original surface form |\n|---|---|\n" + rows


def redact(text: str, variant: str, k_target: int, max_iters: int,
           pseudonymise: bool, coref_extend: bool):
    """Run the selected pipeline over ``text`` and format the five UI outputs.

    Args:
        text: Document to process. Empty or whitespace-only input returns
            placeholder outputs without running a pipeline.
        variant: Mode label from the UI. A label starting with ``"Redact"``
            selects ``LitePipeline``; any other label selects ``ProPipeline``
            and adds the mosaic-risk summary to the status line.
        k_target: Passed to ``ProPipeline`` as ``k_target`` (coerced to int).
        max_iters: Passed to ``ProPipeline`` as ``max_iterations`` (coerced
            to int).
        pseudonymise: Forwarded to the pipeline; also adds the vault entry
            count to the status line.
        coref_extend: Forwarded to the pipeline.

    Returns:
        A 5-tuple ``(redacted_text, backend_badge_md, status_md, vault_md,
        audit_json)`` matching the outputs wired up in ``build_ui``.
    """
    if not text or not text.strip():
        return (
            "", _backend_badge(),
            "_paste some text on the left and click Redact_",
            "_(no vault)_", "",
        )

    predictor, _ = _get_predictor()

    # One flag drives both the pipeline choice and the status line, so the
    # mosaic summary appears exactly when ProPipeline produced the result.
    lite_mode = variant.startswith("Redact")
    if lite_mode:
        pipeline = LitePipeline(
            ner_provider=predictor,
            coref_extend=coref_extend,
            pseudonymise=pseudonymise,
        )
    else:
        pipeline = ProPipeline(
            ner_provider=predictor, scorer=_get_scorer(),
            # Slider values may arrive as floats; these are whole-number knobs.
            k_target=int(k_target), max_iterations=int(max_iters),
            coref_extend=coref_extend, pseudonymise=pseudonymise,
        )
    result = pipeline(text)

    return (
        result.redacted_text,
        _backend_badge(),
        _status_markdown(result, mosaic=not lite_mode, pseudonymise=pseudonymise),
        _vault_markdown(result.pseudonym_vault, pseudonymise),
        json.dumps(result.to_dict(), indent=2, ensure_ascii=False),
    )
def _load_examples() -> List[List]:
    """Collect the pre-canned demo inputs from the ``examples`` directory.

    Each ``*.txt`` file in the ``examples`` folder next to this module
    becomes one row, in sorted path order. A row is the stripped file text
    followed by fixed values for the remaining UI inputs: mode
    ``"Anonymise (mosaic-aware)"``, k_target 5, max iterations 3,
    pseudonymise off, coref extension on.

    Returns:
        The list of rows, or ``[]`` when the directory is absent.
    """
    examples_dir = Path(__file__).parent / "examples"
    if not examples_dir.is_dir():
        return []
    return [
        [
            # Decode explicitly instead of relying on the platform default
            # encoding (assumes the bundled example files are UTF-8).
            path.read_text(encoding="utf-8").strip(),
            "Anonymise (mosaic-aware)", 5, 3, False, True,
        ]
        for path in sorted(examples_dir.glob("*.txt"))
    ]
# Markdown shown at the top of the page (rendered by build_ui via gr.Markdown).
# This is user-facing runtime text: edit the wording here, not in build_ui.
DESCRIPTION = """\
# Legal Text Anonymisation
> ⚠️ **This is a lightweight version of the full project.** To boot quickly on a
> free CPU, this demo runs spaCy's small `en_core_web_sm` model for name
> detection, so it *will* miss names the full project catches. The real pipeline
> uses a RoBERTa model fine-tuned on legal text (F1 0.85 vs ~0.4 here) — see the
> [portfolio write-up](https://github.com/WillCatt/Data-Anonymisation) for the
> full results. What's faithful here is the *logic*: the redaction modes, the
> mosaic re-identification scoring, and the audit trail.
Paste a document on the left and pick a mode. The full per-decision audit log is
on the **Audit log** tab.
- **Redact** — removes the direct identifiers (names, organisations, case and
reference numbers, IBANs). The quick option.
- **Anonymise** — Redact, plus mosaic-aware generalisation: it broadens the
remaining everyday details (dates, places, demographics) one step at a time
until the document's residual fingerprint is no longer unique (reaches the
target k-anonymity), so it can't be traced back to one person.
- **Pseudonymise** — assigns stable referential tokens (`[PERSON_A]`, `[PERSON_B]` …)
instead of plain `[PERSON]`. The Vault tab shows the mapping; restore() reverses
it locally so an external LLM's answer comes back with the real names.
> The mosaic scorer compares against the TAB corpus (1,268 ECHR cases) as a
> methodological stand-in for a firm's own document corpus.
"""
def _safe_textbox(**kwargs):
    """Create a ``gr.Textbox``, dropping keyword arguments it rejects.

    When construction raises ``TypeError``, the first single-quoted token in
    the error message is taken as the offending keyword; if it is one of
    ``kwargs`` it is removed and construction is retried. When no such
    keyword can be identified, a textbox is built from ``label`` and
    ``lines`` only (defaults ``""`` and ``10``).
    """
    while True:
        try:
            return gr.Textbox(**kwargs)
        except TypeError as exc:
            message = str(exc)
            rejected = message.split("'")[1] if "'" in message else None
            if not rejected or rejected not in kwargs:
                break
            # Drop the rejected keyword and try again with what is left.
            kwargs.pop(rejected)
    return gr.Textbox(label=kwargs.get("label", ""), lines=kwargs.get("lines", 10))
def build_ui():
    """Assemble and return the Gradio ``Blocks`` app for the demo.

    Layout, top to bottom: the description, the NER backend badge, then a
    two-column row (input text and controls on the left; "Redacted",
    "Vault" and "Audit log" tabs on the right), followed by the pre-canned
    examples when any exist. The button click is wired to ``redact``.
    """
    mode_labels = ["Redact (direct identifiers)", "Anonymise (mosaic-aware)"]

    with gr.Blocks(title="Legal Text Anonymisation") as demo:
        gr.Markdown(DESCRIPTION)
        # Warm the predictor cache before the badge is rendered.
        _get_predictor()
        badge = gr.Markdown(_backend_badge())

        with gr.Row():
            # Left column: document input and pipeline controls.
            with gr.Column(scale=1):
                source_box = gr.Textbox(
                    lines=14,
                    placeholder="Paste legal-style text here…",
                    label="Input",
                )
                mode = gr.Radio(
                    label="Mode",
                    choices=mode_labels,
                    value=mode_labels[1],
                )
                with gr.Row():
                    k_slider = gr.Slider(
                        2, 10, value=5, step=1, label="Anonymise: k_target"
                    )
                    iter_slider = gr.Slider(
                        1, 5, value=3, step=1, label="Anonymise: max iterations"
                    )
                with gr.Row():
                    pseudo_toggle = gr.Checkbox(value=False, label="Pseudonymise")
                    coref_toggle = gr.Checkbox(value=True, label="Coref extension")
                go_button = gr.Button("Redact", variant="primary")

            # Right column: one tab per output.
            with gr.Column(scale=1):
                with gr.Tabs():
                    with gr.Tab("Redacted"):
                        redacted_box = _safe_textbox(
                            label="Redacted text", lines=12, show_copy_button=True,
                        )
                        status_line = gr.Markdown()
                    with gr.Tab("Vault"):
                        vault_view = gr.Markdown(
                            value="_(Pseudonymisation off — enable to see referential tokens here)_"
                        )
                    with gr.Tab("Audit log"):
                        audit_view = gr.Code(
                            label="Full per-decision audit (JSON)", language="json"
                        )

        # Same component order as the parameters of redact().
        controls = (
            source_box, mode, k_slider, iter_slider, pseudo_toggle, coref_toggle,
        )

        canned = _load_examples()
        if canned:
            gr.Examples(
                examples=canned,
                inputs=list(controls),
                label="Pre-canned examples",
                # HF Spaces defaults to caching examples, which requires fn+outputs
                # and would run the pipeline on every example at boot. Disable it.
                cache_examples=False,
            )

        go_button.click(
            fn=redact,
            inputs=list(controls),
            outputs=[redacted_box, badge, status_line, vault_view, audit_view],
        )
    return demo
if __name__ == "__main__":
    app = build_ui()
    # HF Spaces sets PORT=7860 by default; honour it if present.
    listen_port = int(os.environ.get("PORT", "7860"))
    # Bind on all interfaces so the server is reachable from outside the host.
    app.launch(server_name="0.0.0.0", server_port=listen_port)