Spaces:

Willxo
/

Anonymiser

Sleeping

App Files Files Community

Anonymiser / app.py

Willxo

Sync to current bundle: Redact/Anonymise modes, lightweight banner, pinned deps

44361bf 1 day ago

raw

history blame contribute delete

9.21 kB

	"""
	HuggingFace Spaces entry point for the Legal Text Anonymisation demo.

	This is a Spaces-tailored copy of the main repo's demo/app.py:
	- NER backbone forced to `en_core_web_sm` for fast cold-start (the
	fine-tuned RoBERTa checkpoint isn't bundled — too large for the free tier).
	- Working directory layout is flat (no `src/` prefix), so the imports
	resolve via the local `anonymisation/` package.

	To run locally for testing:
	pip install -r requirements.txt
	python -m spacy download en_core_web_sm
	python app.py
	"""
	from __future__ import annotations

	import json
	import os
	from pathlib import Path
	from typing import Callable, List, Tuple

	import gradio as gr

	from anonymisation.mapping import SPACY_TO_TAB
	from anonymisation.pipeline import LitePipeline, ProPipeline, MosaicScorer


	# Process-wide lazy cache; every slot starts empty and is filled on first use.
	_state = {
	    "predictor": None,      # NER callable, stored by _get_predictor()
	    "backend_label": None,  # display name of that backend, stored alongside it
	    "scorer": None,         # MosaicScorer instance, stored by _get_scorer()
	}


	def _make_spacy_predictor(model_name: str) -> Tuple[Callable, str]:
	    """Load a spaCy model and wrap it as an NER callable.

	    Args:
	        model_name: Name passed straight to ``spacy.load``.

	    Returns:
	        A ``(predict, label)`` pair. ``predict(text)`` runs the model and
	        returns a list of ``(start_char, end_char, tag, surface_text)``
	        tuples, one per detected entity whose spaCy label is a key of
	        ``SPACY_TO_TAB``; ``tag`` is the mapped value and entities with
	        unmapped labels are dropped. ``label`` is the display string
	        ``"spaCy <model_name>"``.
	    """
	    # Function-scope import: spaCy is only loaded when a predictor is built.
	    import spacy

	    nlp = spacy.load(model_name)

	    def predict(text: str):
	        spans = []
	        for ent in nlp(text).ents:
	            if ent.label_ not in SPACY_TO_TAB:
	                continue
	            spans.append(
	                (ent.start_char, ent.end_char, SPACY_TO_TAB[ent.label_], ent.text)
	            )
	        return spans

	    return predict, f"spaCy {model_name}"


	def _get_predictor() -> Tuple[Callable, str]:
	    """Return the NER predictor and its display label, building them once.

	    The first call loads the spaCy model named by the ``SPACY_MODEL``
	    environment variable (default ``en_core_web_sm``), stores the result in
	    ``_state`` and prints a one-line notice. Later calls return the stored
	    pair without reloading.

	    Returns:
	        ``(predict, label)`` as produced by ``_make_spacy_predictor``.
	    """
	    cached = _state["predictor"]
	    if cached is not None:
	        return cached, _state["backend_label"]

	    predict, label = _make_spacy_predictor(
	        os.environ.get("SPACY_MODEL", "en_core_web_sm")
	    )
	    _state.update(predictor=predict, backend_label=label)
	    print(f"[spaces] Loaded NER backend: {label}")
	    return predict, label


	def _get_scorer() -> MosaicScorer:
	    """Return the shared ``MosaicScorer``, creating it on first use.

	    The scorer is built from the ``"test"`` split returned by
	    ``anonymisation.data.load_tab``. If anything in that step raises, the
	    failure is printed and ``MosaicScorer.empty()`` is used instead, so the
	    demo keeps working without the corpus. The result is stored in
	    ``_state["scorer"]`` and reused by later calls.
	    """
	    if _state["scorer"] is not None:
	        return _state["scorer"]

	    try:
	        # Imported here so a missing/broken data module also hits the fallback.
	        from anonymisation.data import load_tab

	        test_split = load_tab()["test"]
	        scorer = MosaicScorer.from_tab(list(test_split))
	    except Exception as exc:
	        print(f"[spaces] Could not load TAB ({exc}); empty haystack.")
	        scorer = MosaicScorer.empty()

	    _state["scorer"] = scorer
	    return scorer


	def _backend_badge() -> str:
	    """Return the one-line Markdown badge naming the active NER backend.

	    Uses the label already stored in ``_state``; if none is stored yet, the
	    predictor is loaded via ``_get_predictor`` to obtain it.
	    """
	    label = _state["backend_label"]
	    if not label:
	        _, label = _get_predictor()
	    return f"NER backend: `{label}` · Coref: Phase 5 post-processor enabled"


	def _vault_markdown(vault, pseudonymise: bool) -> str:
	    """Render the pseudonym vault as Markdown for the Vault tab.

	    Args:
	        vault: Mapping of token -> original surface form; may be empty or None.
	        pseudonymise: Whether pseudonymisation was requested for this run.

	    Returns:
	        A two-column Markdown table, or an italic placeholder when there is
	        nothing to show.
	    """
	    if not vault:
	        if pseudonymise:
	            return "_(Pseudonymisation on — no referential tokens were assigned for this text)_"
	        return "_(Pseudonymisation off — enable to see referential tokens here)_"

	    def cell(value) -> str:
	        # A raw "|" or a line break inside a cell would split or end the row,
	        # so escape pipes and collapse all whitespace to single spaces.
	        return " ".join(str(value).replace("|", "\\|").split())

	    header = "| Token | Original surface form |\n|---|---|\n"
	    rows = "\n".join(f"| `{tok}` | {cell(orig)} |" for tok, orig in vault.items())
	    return header + rows


	def redact(text: str, variant: str, k_target: int, max_iters: int,
	           pseudonymise: bool, coref_extend: bool):
	    """Run the selected pipeline on *text* and format the results for the UI.

	    Args:
	        text: Document to process. Empty or whitespace-only input returns
	            placeholder values without running a pipeline.
	        variant: Mode label from the UI. A value starting with ``"Redact"``
	            selects ``LitePipeline``; anything else selects ``ProPipeline``.
	        k_target: Forwarded to ``ProPipeline`` as ``k_target``; unused in
	            Redact mode.
	        max_iters: Forwarded to ``ProPipeline`` as ``max_iterations``; unused
	            in Redact mode.
	        pseudonymise: Forwarded to the pipeline; also adds a vault summary to
	            the status line.
	        coref_extend: Forwarded to the pipeline.

	    Returns:
	        A 5-tuple ``(redacted_text, backend_badge, status_markdown,
	        vault_markdown, audit_json)``.
	    """
	    if not text or not text.strip():
	        return (
	            "", _backend_badge(),
	            "_paste some text on the left and click Redact_",
	            "_(no vault)_", "",
	        )

	    predictor, _ = _get_predictor()

	    # Decide the mode once so pipeline choice and status line cannot disagree.
	    redact_only = variant.startswith("Redact")
	    if redact_only:
	        pipeline = LitePipeline(
	            ner_provider=predictor,
	            coref_extend=coref_extend,
	            pseudonymise=pseudonymise,
	        )
	    else:
	        pipeline = ProPipeline(
	            ner_provider=predictor, scorer=_get_scorer(),
	            # Defensive: UI sliders may hand numeric values over as floats.
	            k_target=int(k_target), max_iterations=int(max_iters),
	            coref_extend=coref_extend, pseudonymise=pseudonymise,
	        )

	    result = pipeline(text)
	    # Treat a missing vault like an empty one so len()/iteration are safe.
	    vault = result.pseudonym_vault or {}

	    parts: List[str] = []
	    if not redact_only:
	        # The mosaic fields are only read for the ProPipeline result.
	        ck = "✓ converged" if result.converged else "✗ fallback to suppression"
	        parts.append(
	            f"Mosaic risk: k_initial = `{result.mosaic_risk_initial}` → "
	            f"k_final = `{result.mosaic_risk_final}` · "
	            f"iterations: `{result.iterations_used}` · {ck}"
	        )
	    if pseudonymise:
	        parts.append(f"Vault: {len(vault)} entries (see Vault tab)")
	    status = " · ".join(parts) if parts else "_(no status — Redact mode: direct identifiers only)_"

	    return (
	        result.redacted_text,
	        _backend_badge(),
	        status,
	        _vault_markdown(vault, pseudonymise),
	        json.dumps(result.to_dict(), indent=2, ensure_ascii=False),
	    )


	def _load_examples() -> List[List]:
	    """Build the example rows from ``examples/*.txt`` next to this file.

	    Each row is ``[text, mode, k_target, max_iters, pseudonymise,
	    coref_extend]``, in the order the UI's example inputs expect. The text is
	    the stripped file content; the remaining values are fixed defaults. Files
	    are returned in sorted path order.

	    Returns:
	        The list of rows, or ``[]`` when there is no ``examples`` directory.
	    """
	    examples_dir = Path(__file__).parent / "examples"
	    if not examples_dir.is_dir():
	        return []
	    return [
	        [
	            # Explicit UTF-8: the platform default encoding is locale-dependent.
	            path.read_text(encoding="utf-8").strip(),
	            "Anonymise (mosaic-aware)", 5, 3, False, True,
	        ]
	        for path in sorted(examples_dir.glob("*.txt"))
	        if path.is_file()
	    ]


	# Markdown intro shown at the top of the page (passed to gr.Markdown in
	# build_ui). The backslash after the opening quotes keeps the string from
	# starting with a blank line.
	DESCRIPTION = """\
	# Legal Text Anonymisation

	> ⚠️ This is a lightweight version of the full project. To boot quickly on a
	> free CPU, this demo runs spaCy's small `en_core_web_sm` model for name
	> detection, so it will miss names the full project catches. The real pipeline
	> uses a RoBERTa model fine-tuned on legal text (F1 0.85 vs ~0.4 here) — see the
	> [portfolio write-up](https://github.com/WillCatt/Data-Anonymisation) for the
	> full results. What's faithful here is the logic: the redaction modes, the
	> mosaic re-identification scoring, and the audit trail.

	Paste a document on the left and pick a mode. The full per-decision audit log is
	on the Audit log tab.

	- Redact — removes the direct identifiers (names, organisations, case and
	reference numbers, IBANs). The quick option.
	- Anonymise — Redact, plus mosaic-aware generalisation: it broadens the
	remaining everyday details (dates, places, demographics) one step at a time
	until the document's residual fingerprint is no longer unique (reaches the
	target k-anonymity), so it can't be traced back to one person.
	- Pseudonymise — assigns stable referential tokens (`[PERSON_A]`, `[PERSON_B]` …)
	instead of plain `[PERSON]`. The Vault tab shows the mapping; restore() reverses
	it locally so an external LLM's answer comes back with the real names.

	> The mosaic scorer compares against the TAB corpus (1,268 ECHR cases) as a
	> methodological stand-in for a firm's own document corpus.
	"""


	def _safe_textbox(**kwargs):
	    """Create a ``gr.Textbox``, dropping keyword arguments it rejects.

	    When construction raises ``TypeError``, the text between the first pair
	    of single quotes in the error message is taken as the offending argument
	    name. If that name is one of ``kwargs`` it is removed and construction is
	    retried; otherwise a minimal textbox is built from ``label`` and
	    ``lines`` only (defaults ``""`` and ``10``).
	    """
	    while True:
	        try:
	            return gr.Textbox(**kwargs)
	        except TypeError as exc:
	            message = str(exc)
	            rejected = message.split("'")[1] if "'" in message else None
	            if not rejected or rejected not in kwargs:
	                # Nothing identifiable to drop: fall back to the bare minimum.
	                return gr.Textbox(
	                    label=kwargs.get("label", ""), lines=kwargs.get("lines", 10)
	                )
	            # Each retry removes one key, so the loop always terminates.
	            del kwargs[rejected]


	def build_ui():
	"""Assemble the Gradio Blocks app and return it without launching.

	Declares the input controls, the three output tabs (Redacted, Vault,
	Audit log) and the optional examples, then wires the button to
	``redact``. The caller is responsible for calling ``.launch()``.

	NOTE(review): this copy of the file has lost its nested indentation, so
	the exact nesting of the ``with`` blocks below should be confirmed
	against the original before editing the layout.
	"""
	with gr.Blocks(title="Legal Text Anonymisation") as demo:
	gr.Markdown(DESCRIPTION)
	# Load the NER backend now so the badge below can show its label.
	_get_predictor()
	backend_md = gr.Markdown(_backend_badge())

	with gr.Row():
	# First column: input text and pipeline options.
	with gr.Column(scale=1):
	input_text = gr.Textbox(
	label="Input",
	placeholder="Paste legal-style text here…",
	lines=14,
	)
	# redact() picks the pipeline from the prefix of the selected label.
	variant = gr.Radio(
	choices=["Redact (direct identifiers)", "Anonymise (mosaic-aware)"],
	value="Anonymise (mosaic-aware)",
	label="Mode",
	)
	with gr.Row():
	k_target = gr.Slider(2, 10, value=5, step=1, label="Anonymise: k_target")
	max_iters = gr.Slider(1, 5, value=3, step=1, label="Anonymise: max iterations")
	with gr.Row():
	pseudonymise = gr.Checkbox(value=False, label="Pseudonymise")
	coref_extend = gr.Checkbox(value=True, label="Coref extension")
	run_btn = gr.Button("Redact", variant="primary")

	# Second column: tabbed outputs.
	with gr.Column(scale=1):
	with gr.Tabs():
	with gr.Tab("Redacted"):
	# _safe_textbox drops keyword arguments that gr.Textbox rejects.
	redacted_out = _safe_textbox(
	label="Redacted text", lines=12, show_copy_button=True,
	)
	status_md = gr.Markdown()
	with gr.Tab("Vault"):
	vault_md = gr.Markdown(
	value="_(Pseudonymisation off — enable to see referential tokens here)_"
	)
	with gr.Tab("Audit log"):
	audit_out = gr.Code(
	label="Full per-decision audit (JSON)", language="json"
	)

	# Example rows follow the same order as `inputs` below.
	examples = _load_examples()
	if examples:
	gr.Examples(
	examples=examples,
	inputs=[input_text, variant, k_target, max_iters, pseudonymise, coref_extend],
	label="Pre-canned examples",
	# HF Spaces defaults to caching examples, which requires fn+outputs
	# and would run the pipeline on every example at boot. Disable it.
	cache_examples=False,
	)

	# redact() returns a 5-tuple in the same order as `outputs`.
	run_btn.click(
	fn=redact,
	inputs=[input_text, variant, k_target, max_iters, pseudonymise, coref_extend],
	outputs=[redacted_out, backend_md, status_md, vault_md, audit_out],
	)

	return demo


	if __name__ == "__main__":
	    # Script entry point: build the UI, then serve it on all interfaces.
	    app = build_ui()
	    # HF Spaces sets PORT=7860 by default; honour it if present.
	    port = int(os.environ.get("PORT", "7860"))
	    app.launch(server_name="0.0.0.0", server_port=port)