""" HuggingFace Spaces entry point for the Legal Text Anonymisation demo. This is a Spaces-tailored copy of the main repo's demo/app.py: - NER backbone forced to `en_core_web_sm` for fast cold-start (the fine-tuned RoBERTa checkpoint isn't bundled — too large for the free tier). - Working directory layout is flat (no `src/` prefix), so the imports resolve via the local `anonymisation/` package. To run locally for testing: pip install -r requirements.txt python -m spacy download en_core_web_sm python app.py """ from __future__ import annotations import json import os from pathlib import Path from typing import Callable, List, Tuple import gradio as gr from anonymisation.mapping import SPACY_TO_TAB from anonymisation.pipeline import LitePipeline, ProPipeline, MosaicScorer _state = {"predictor": None, "backend_label": None, "scorer": None} def _make_spacy_predictor(model_name: str) -> Tuple[Callable, str]: import spacy nlp = spacy.load(model_name) def predict(text: str): doc = nlp(text) return [ (e.start_char, e.end_char, SPACY_TO_TAB[e.label_], e.text) for e in doc.ents if e.label_ in SPACY_TO_TAB ] return predict, f"spaCy {model_name}" def _get_predictor() -> Tuple[Callable, str]: if _state["predictor"] is not None: return _state["predictor"], _state["backend_label"] model_name = os.environ.get("SPACY_MODEL", "en_core_web_sm") predict, label = _make_spacy_predictor(model_name) _state["predictor"] = predict _state["backend_label"] = label print(f"[spaces] Loaded NER backend: {label}") return predict, label def _get_scorer() -> MosaicScorer: if _state["scorer"] is None: try: from anonymisation.data import load_tab ds = load_tab() _state["scorer"] = MosaicScorer.from_tab(list(ds["test"])) except Exception as exc: print(f"[spaces] Could not load TAB ({exc}); empty haystack.") _state["scorer"] = MosaicScorer.empty() return _state["scorer"] def _backend_badge() -> str: label = _state["backend_label"] or _get_predictor()[1] return f"**NER backend:** `{label}` · **Coref:** Phase 5 post-processor enabled" def redact(text: str, variant: str, k_target: int, max_iters: int, pseudonymise: bool, coref_extend: bool): if not text or not text.strip(): return ( "", _backend_badge(), "_paste some text on the left and click Redact_", "_(no vault)_", "", ) predictor, _ = _get_predictor() if variant.startswith("Redact"): pipeline = LitePipeline( ner_provider=predictor, coref_extend=coref_extend, pseudonymise=pseudonymise, ) else: pipeline = ProPipeline( ner_provider=predictor, scorer=_get_scorer(), k_target=k_target, max_iterations=max_iters, coref_extend=coref_extend, pseudonymise=pseudonymise, ) result = pipeline(text) parts: List[str] = [] if variant.startswith("Anonymise"): ck = "✓ converged" if result.converged else "✗ fallback to suppression" parts.append( f"**Mosaic risk:** k_initial = `{result.mosaic_risk_initial}` → " f"k_final = `{result.mosaic_risk_final}` · " f"iterations: `{result.iterations_used}` · {ck}" ) if pseudonymise: parts.append(f"**Vault:** {len(result.pseudonym_vault)} entries (see Vault tab)") status = " · ".join(parts) if parts else "_(no status — Redact mode: direct identifiers only)_" if result.pseudonym_vault: vault_md = "| Token | Original surface form |\n|---|---|\n" + "\n".join( f"| `{tok}` | {orig} |" for tok, orig in result.pseudonym_vault.items() ) else: vault_md = "_(Pseudonymisation off — enable to see referential tokens here)_" return ( result.redacted_text, _backend_badge(), status, vault_md, json.dumps(result.to_dict(), indent=2, ensure_ascii=False), ) def _load_examples() -> List[List]: examples_dir = Path(__file__).parent / "examples" if not examples_dir.exists(): return [] return [ [ path.read_text().strip(), "Anonymise (mosaic-aware)", 5, 3, False, True, ] for path in sorted(examples_dir.glob("*.txt")) ] DESCRIPTION = """\ # Legal Text Anonymisation > ⚠️ **This is a lightweight version of the full project.** To boot quickly on a > free CPU, this demo runs spaCy's small `en_core_web_sm` model for name > detection, so it *will* miss names the full project catches. The real pipeline > uses a RoBERTa model fine-tuned on legal text (F1 0.85 vs ~0.4 here) — see the > [portfolio write-up](https://github.com/WillCatt/Data-Anonymisation) for the > full results. What's faithful here is the *logic*: the redaction modes, the > mosaic re-identification scoring, and the audit trail. Paste a document on the left and pick a mode. The full per-decision audit log is on the **Audit log** tab. - **Redact** — removes the direct identifiers (names, organisations, case and reference numbers, IBANs). The quick option. - **Anonymise** — Redact, plus mosaic-aware generalisation: it broadens the remaining everyday details (dates, places, demographics) one step at a time until the document's residual fingerprint is no longer unique (reaches the target k-anonymity), so it can't be traced back to one person. - **Pseudonymise** — assigns stable referential tokens (`[PERSON_A]`, `[PERSON_B]` …) instead of plain `[PERSON]`. The Vault tab shows the mapping; restore() reverses it locally so an external LLM's answer comes back with the real names. > The mosaic scorer compares against the TAB corpus (1,268 ECHR cases) as a > methodological stand-in for a firm's own document corpus. """ def _safe_textbox(**kwargs): try: return gr.Textbox(**kwargs) except TypeError as exc: bad = str(exc).split("'")[1] if "'" in str(exc) else None if bad and bad in kwargs: kwargs.pop(bad) return _safe_textbox(**kwargs) return gr.Textbox(label=kwargs.get("label", ""), lines=kwargs.get("lines", 10)) def build_ui(): with gr.Blocks(title="Legal Text Anonymisation") as demo: gr.Markdown(DESCRIPTION) _get_predictor() backend_md = gr.Markdown(_backend_badge()) with gr.Row(): with gr.Column(scale=1): input_text = gr.Textbox( label="Input", placeholder="Paste legal-style text here…", lines=14, ) variant = gr.Radio( choices=["Redact (direct identifiers)", "Anonymise (mosaic-aware)"], value="Anonymise (mosaic-aware)", label="Mode", ) with gr.Row(): k_target = gr.Slider(2, 10, value=5, step=1, label="Anonymise: k_target") max_iters = gr.Slider(1, 5, value=3, step=1, label="Anonymise: max iterations") with gr.Row(): pseudonymise = gr.Checkbox(value=False, label="Pseudonymise") coref_extend = gr.Checkbox(value=True, label="Coref extension") run_btn = gr.Button("Redact", variant="primary") with gr.Column(scale=1): with gr.Tabs(): with gr.Tab("Redacted"): redacted_out = _safe_textbox( label="Redacted text", lines=12, show_copy_button=True, ) status_md = gr.Markdown() with gr.Tab("Vault"): vault_md = gr.Markdown( value="_(Pseudonymisation off — enable to see referential tokens here)_" ) with gr.Tab("Audit log"): audit_out = gr.Code( label="Full per-decision audit (JSON)", language="json" ) examples = _load_examples() if examples: gr.Examples( examples=examples, inputs=[input_text, variant, k_target, max_iters, pseudonymise, coref_extend], label="Pre-canned examples", # HF Spaces defaults to caching examples, which requires fn+outputs # and would run the pipeline on every example at boot. Disable it. cache_examples=False, ) run_btn.click( fn=redact, inputs=[input_text, variant, k_target, max_iters, pseudonymise, coref_extend], outputs=[redacted_out, backend_md, status_md, vault_md, audit_out], ) return demo if __name__ == "__main__": app = build_ui() # HF Spaces sets PORT=7860 by default; honour it if present. app.launch( server_name="0.0.0.0", server_port=int(os.environ.get("PORT", "7860")), )