noteguard / streamlit_app.py
yumi.h
Add de-identified download, remove FLock.io tech refs, add hackathon footer
6f7e511
Raw
History Blame Contribute Delete
15.5 kB
"""NoteGuard — demo UI.
Run from the repo root: streamlit run streamlit_app.py
Try-it (detect & sanitise) · Metrics & leakage · Governance (Five Safes) · Two-Trust sharing.
Built on the NoteGuard package (src/) — pluggable detectors + patient-consistent transforms.
"""
from __future__ import annotations
import html
import json
import sys
from collections import Counter
from pathlib import Path
import pandas as pd
import streamlit as st
REPO = Path(__file__).resolve().parent
sys.path.insert(0, str(REPO))
from src.data import load_notes # noqa: E402
from src.detect import build_detector # noqa: E402
from src.evaluate import evaluate # noqa: E402
from src.pipeline import Pipeline # noqa: E402
from src.transform import PSEUDONYM, REDACTION, PseudonymVault # noqa: E402
# Generated artefacts (evaluation results, two-Trust summary) are read from and
# written to the repo-local outputs/ directory.
OUT_DIR = REPO / "outputs"
RESULTS = OUT_DIR / "results.json"

# Background colour of the <mark> element for each detected entity type;
# highlight() falls back to a neutral grey for types not listed here.
ENTITY_COLORS = {
    "PERSON": "#ffd6e0",
    "UK_NHS": "#ffe9b3",
    "DATE_TIME": "#d4f4dd",
    "UK_POSTCODE": "#cfe8ff",
    "LOCATION": "#cfe8ff",
    "ORGANIZATION": "#cfe8ff",
    "RECORD_ID": "#ffd9c2",
    "PHONE_NUMBER": "#d4f4dd",
    "EMAIL_ADDRESS": "#d4f4dd",
    "UK_NINO": "#ffe9b3",
    "GMC": "#f0e0a0",
    "NMC": "#f0e0a0",
    "NHS_ODS": "#f0e0a0",
}

# Page-level Streamlit settings (title, icon, wide layout).
st.set_page_config(page_title="NoteGuard", page_icon="🛡️", layout="wide")
@st.cache_resource(show_spinner="Loading the de-identification engine + sample notes…")
def load_engine():
    """Build the PII detector and load the sample notes used by the UI.

    The function is wrapped in ``st.cache_resource``, so Streamlit is expected
    to reuse the returned objects instead of rebuilding them on every rerun.

    Returns:
        A ``(detector, notes)`` tuple. ``notes`` holds the records returned by
        ``load_notes(limit=50)``, or an empty list when loading them fails.
    """
    import logging  # local import keeps this block self-contained

    detector = build_detector(use_presidio=True)
    try:
        notes = load_notes(limit=50)
    except Exception:
        # Deliberately best-effort: the app still works in "Paste your own"
        # mode without sample notes. Record the failure (with traceback)
        # instead of swallowing it silently, so a missing or broken dataset
        # can be diagnosed from the logs.
        logging.getLogger(__name__).exception(
            "Could not load sample notes; continuing without them"
        )
        notes = []
    return detector, notes
def highlight(text: str, spans) -> str:
    """Render *text* as an HTML fragment with detected spans wrapped in ``<mark>``.

    Overlapping spans are resolved greedily: spans are visited by ascending
    start offset (longest first on ties) and a span is kept only when it starts
    at or after the end of the previously kept span.

    Args:
        text: The note text that the span offsets index into.
        spans: Iterable of objects exposing ``start``, ``end``, ``entity_type``,
            ``score`` and ``needs_review``.

    Returns:
        HTML in which all note text is escaped, each kept span is a coloured
        ``<mark>`` (dashed orange border when it needs review) with a tooltip
        giving entity type and score, and newlines are turned into ``<br>``.
    """
    ordered = sorted(spans, key=lambda sp: (sp.start, -(sp.end - sp.start)))
    kept = []
    covered_until = -1
    for sp in ordered:
        if sp.start >= covered_until:
            kept.append(sp)
            covered_until = sp.end

    pieces = []
    pos = 0
    for sp in kept:
        pieces.append(html.escape(text[pos:sp.start]))
        colour = ENTITY_COLORS.get(sp.entity_type, "#e0e0e0")
        flagged = sp.needs_review
        border = "2px dashed #e67e00" if flagged else "none"
        # The entity label ends up inside a double-quoted HTML attribute that is
        # rendered with unsafe_allow_html=True, so escape it (quotes included)
        # exactly like the note text itself.
        label = html.escape(f"{sp.entity_type}", quote=True)
        tooltip = f"{label} ({sp.score:.2f})" + (" ⚠ review" if flagged else "")
        pieces.append(
            f'<mark style="background:{colour};padding:0 2px;border-radius:3px;border:{border}" '
            f'title="{tooltip}">{html.escape(text[sp.start:sp.end])}</mark>'
        )
        pos = sp.end
    pieces.append(html.escape(text[pos:]))
    return "".join(pieces).replace("\n", "<br>")
def scroll_box(inner_html: str, height: int = 340):
    """Show *inner_html* inside a bordered, scrollable, monospace box.

    Args:
        inner_html: HTML placed verbatim inside the box. It is rendered with
            ``unsafe_allow_html=True``, so callers must escape any raw text.
        height: Box height in pixels.
    """
    box_style = (
        f"height:{height}px;overflow:auto;border:1px solid #ddd;border-radius:8px;"
        "padding:12px;font-family:ui-monospace,monospace;font-size:13px;line-height:1.5"
    )
    st.markdown(f'<div style="{box_style}">{inner_html}</div>', unsafe_allow_html=True)
def load_json(path: Path):
    """Return the parsed JSON content of *path*, or ``None`` if it is unavailable.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist or
        does not contain valid JSON. Callers treat ``None`` as "not generated
        yet" and offer a button to (re)create the file.
    """
    # Read first and handle the miss (EAFP): avoids the window between an
    # exists() check and the read in which the file could disappear.
    try:
        raw = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # A truncated or corrupt file should not crash the page on every
        # rerun; log it and behave as if it had not been generated yet.
        import logging

        logging.getLogger(__name__).warning("Ignoring invalid JSON in %s", path)
        return None
# Page header shown above every tab.
st.title("🛡️ NoteGuard — NHS De-Identification Gate")
st.caption("AI detects patient and clinician PII, humans review, audit logs account.")

# Detector and sample notes shared by all tabs below.
detector, NOTES = load_engine()

# The four top-level tabs, in display order.
_tab_labels = [
    "🔎 Try it",
    "📊 Metrics & Leakage",
    "🏛️ Governance (Five Safes)",
    "🤝 Two-Trust sharing",
]
tab_try, tab_metrics, tab_gov, tab_trust = st.tabs(_tab_labels)
# ---------------------------------------------------------------- Try it
# Interactive tab: pick or paste one note, show what was detected, the
# sanitised output, an entity-count audit table and the human-review queue,
# then offer single-note and batch downloads of the sanitised text.
with tab_try:
    st.markdown(
        "Paste a clinical note and see what the gate detects, removes, and flags for human review "
        "before the text is allowed into the SDE pool."
    )
    c1, c2 = st.columns([3, 2])
    with c2:
        method = st.radio(
            "Transform",
            [PSEUDONYM, REDACTION],
            format_func=lambda m: "Pseudonymise (realistic, patient-consistent)"
            if m == PSEUDONYM else "Redact ([TYPE] tags)",
        )
        source = st.radio("Input", ["Sample note", "Paste your own"])
    with c1:
        if source == "Sample note" and NOTES:
            idx = st.number_input("Note index", 1, len(NOTES), 1, step=1)
            rec = NOTES[int(idx) - 1]  # the widget is 1-based, the list 0-based
            text, person_id, note_id = rec.text, rec.person_id, rec.note_id
        else:
            # Also the fallback when no sample notes could be loaded.
            text = st.text_area(
                "Clinical note (messy free-text)",
                height=200,
                value="Pt John Smith, NHS no 943 476 5919, DOB 02/03/1981, lives SW1A 1AA. "
                      "Admitted Manchester Royal Infirmary Ward 9. "
                      "Reviewed by Dr Lee, GMC 1234567.",
            )
            person_id, note_id = "demo", "pasted"

    # A sample record may carry no text (the batch export below filters such
    # records out), so guard against None before calling .strip().
    if text and text.strip():
        result = Pipeline(detector, PseudonymVault()).sanitise(text, method, person_id)

        st.markdown("##### 1) Detected PII")
        scroll_box(highlight(text, result.spans))

        st.markdown(f"##### 2) Sanitised output — `{method}`")
        scroll_box(html.escape(result.sanitised).replace("\n", "<br>"))

        st.markdown("##### 3) Audit log (counts only — no raw values leave the gate)")
        # Only spans that do not need review are counted as auto-removed.
        confirmed = [s for s in result.spans if not s.needs_review]
        counts = Counter(s.entity_type for s in confirmed)
        st.dataframe({"entity": list(counts), "auto-removed": list(counts.values())},
                     hide_index=True, use_container_width=True)

        if result.review_items:
            st.warning(
                f"**Human review required — {len(result.review_items)} low-confidence detection(s)**\n\n"
                "These spans were redacted for safety but the model's confidence was below the "
                "auto-confirm threshold. An IG analyst should confirm before the note enters the SDE pool.",
                icon="⚠️",
            )
            for s in result.review_items:
                # Show up to 40 characters of context on each side of the span.
                # Newlines become single spaces, so offsets into ctx still line
                # up with offsets into text.
                context_start = max(0, s.start - 40)
                context_end = min(len(text), s.end + 40)
                ctx = text[context_start:context_end].replace("\n", " ")
                st.markdown(
                    f"- **`{s.entity_type}`** · score `{s.score:.2f}` · "
                    f'…{html.escape(ctx[:s.start - context_start])}'
                    f'**_{html.escape(s.text)}_**'
                    f'{html.escape(ctx[s.end - context_start:])}…'
                )
        else:
            st.success("All detections auto-confirmed (score ≥ threshold). No human review needed.", icon="✅")

        st.markdown("##### 4) Download this de-identified note")
        one = [{"note_id": note_id, "method": method, "sanitised_text": result.sanitised}]
        d1, d2 = st.columns(2)
        d1.download_button("⬇ Download JSON", json.dumps(one, ensure_ascii=False, indent=2),
                           file_name="noteguard_note.json", mime="application/json",
                           use_container_width=True)
        d2.download_button("⬇ Download CSV", pd.DataFrame(one).to_csv(index=False),
                           file_name="noteguard_note.csv", mime="text/csv",
                           use_container_width=True)

    st.divider()
    with st.expander("⬇ Download the full de-identified dataset"):
        st.caption("De-identify a batch of notes and export **only the sanitised text** — "
                   "the original PHI never leaves the gate.")
        ca, cb = st.columns([3, 1])
        n_all = ca.slider("Notes to de-identify", 50, 1600, 200, step=50, key="dataset_n")
        if cb.button("Prepare", use_container_width=True):
            with st.spinner(f"De-identifying {n_all} notes…"):
                pipe = Pipeline(detector, PseudonymVault())  # one vault → patient-consistent
                rows = [{"note_id": r.note_id, "method": method,
                         "sanitised_text": pipe.sanitise(r.text, method, r.person_id).sanitised}
                        for r in load_notes(limit=n_all) if r.text]
            # Keep the prepared rows in session state so the download buttons
            # are still there on later reruns.
            st.session_state["dataset_rows"] = rows
        rows = st.session_state.get("dataset_rows")
        if rows:
            st.success(f"{len(rows)} notes de-identified — ready to download.")
            e1, e2 = st.columns(2)
            e1.download_button("⬇ Download JSON", json.dumps(rows, ensure_ascii=False, indent=2),
                               file_name="noteguard_dataset.json", mime="application/json",
                               use_container_width=True)
            e2.download_button("⬇ Download CSV", pd.DataFrame(rows).to_csv(index=False),
                               file_name="noteguard_dataset.csv", mime="text/csv",
                               use_container_width=True)
# ---------------------------------------------------------------- Metrics
# Metrics tab: shows the most recently saved evaluation (RESULTS) and can run
# a fresh evaluation on demand, which then overwrites that file.
with tab_metrics:
    st.markdown(
        "**Leakage rate** is the headline SDE gate metric: after sanitisation, what fraction of known "
        "patient identifiers still appear in the output text? Ground truth is **joined from the "
        "dataset's structured tables** — every note's identifiers are known in advance, so this is "
        "a real, measurable re-identification risk, not an estimate."
    )
    st.markdown(
        "> **Target for SDE admission:** leakage = 0. Any note with a non-zero leakage score "
        "must be held back from the shared pool until reviewed."
    )
    data = load_json(RESULTS)
    n = st.slider("Notes to evaluate (live run)", 50, 1000, 200, step=50)
    if st.button("▶ Run evaluation"):
        with st.spinner("Evaluating…"):
            recs = load_notes(limit=n)
            res = evaluate(recs, detector, PSEUDONYM).to_dict()
        data = {"presidio+rules": res}
        # The outputs directory may not exist yet (for example on a fresh
        # checkout); create it so that saving cannot fail with
        # FileNotFoundError after the evaluation has already run.
        RESULTS.parent.mkdir(parents=True, exist_ok=True)
        RESULTS.write_text(json.dumps(data, indent=2), encoding="utf-8")
    if data:
        # Prefer the "presidio+rules" entry; otherwise use the first one stored.
        name = "presidio+rules" if "presidio+rules" in data else next(iter(data))
        r = data[name]
        leak = r["leakage"]["leakage_rate_pct"]
        m1, m2, m3 = st.columns(3)
        m1.metric("Identifiers removed", f"{100 - leak:.1f}%", help="Known PII not present in output")
        m2.metric("Residual leakage", f"{leak:.2f}%",
                  delta=f"{leak:.2f}%" if leak > 0 else None,
                  delta_color="inverse",
                  help="Fraction of known PII surviving sanitisation — target: 0%")
        m3.metric("Notes evaluated", r["notes_evaluated"])
        st.markdown("##### Detection recall by entity type")
        pe = r["detection"]["per_entity"]
        st.dataframe(
            {"entity": list(pe),
             "recall": [f"{m['recall']:.0%}" for m in pe.values()],
             "precision": [f"{m['precision']:.0%}" for m in pe.values()],
             "support": [m["support"] for m in pe.values()]},
            hide_index=True, use_container_width=True,
        )
        st.caption(
            "Precision is a conservative lower bound. Clinician names and unlisted locations "
            "detected correctly are counted as false positives."
        )
    else:
        st.info("No metrics yet — click **Run evaluation** above.")
# ---------------------------------------------------------------- Governance
# Static content: how the tool maps onto the NHS Five Safes, followed by a
# text diagram of the intended adoption path.
with tab_gov:
    st.markdown("### NHS Five Safes — How NoteGuard maps")
    st.markdown(
        "The Five Safes framework is the standard NHS governance model for data access. "
        "NoteGuard is designed as the **Safe Data** layer that makes the other four safes cheaper to achieve."
    )

    # One (heading, standard, detail) triple per safe, in display order.
    five_safes = [
        (
            "✅ Safe Data",
            "DAPB1523 / ICO standard",
            "Names · NHS number · DOB · postcode → outward code · "
            "GMC/NMC clinician IDs · ODS org codes · record UUIDs · site names. "
            "NRP (nationality/religion) always redacted, never pseudonymised (UK GDPR Art. 9).",
        ),
        (
            "✅ Safe Settings",
            "Processing inside the Trust",
            "Detection and sanitisation run locally. Raw notes, vault (re-id key), and CSVs "
            "are gitignored and never leave the Trust boundary. Only de-identified text is exported.",
        ),
        (
            "✅ Safe Outputs",
            "Leakage-gated release",
            "Residual leakage is measured against ground-truth identifiers before any note enters "
            "the SDE pool. Target: 0 known identifiers surviving sanitisation. "
            "Low-confidence spans are held in a human review queue rather than auto-released.",
        ),
        (
            "⚠️ Safe People",
            "Human-in-the-loop required",
            "The re-identification vault stays Trust-local. Pseudonymised data is still personal "
            "data under UK GDPR (stated honestly — no over-claim of anonymisation). "
            "An IG analyst reviews low-confidence detections before pool admission.",
        ),
        (
            "⚠️ Safe Projects",
            "Project-level approval not covered here",
            "NoteGuard provides the technical de-identification layer; "
            "project-level data access approval (Data Access Request / DARS) remains a Trust process.",
        ),
    ]
    # Each safe gets its own collapsible section.
    for heading, standard_name, body in five_safes:
        section = st.expander(f"**{heading}** — {standard_name}")
        with section:
            st.markdown(body)

    st.divider()
    st.markdown("### Adoption path — NHS SDE on-ramp")
    # The diagram lines sit at column 0 because they are part of the string.
    st.markdown("""
```
NHS Trust (raw notes)
▼ NoteGuard gate (runs inside Trust)
│ clean → detect PII → sanitise → leakage check
│ low-confidence spans → IG analyst review queue
▼ de-identified notes + audit log (no PHI crosses boundary)
▼ NHS Secure Data Environment / Federated Data Platform pool
│ (same model as OpenSAFELY: code comes to data, data never leaves)
▼ Federated AI training
each Trust trains locally; only model gradients are shared
```
""")
# ---------------------------------------------------------------- Two-Trust
# Two-Trust tab: shows the saved demo summary (one column per Trust plus one
# for the shared pool) and can re-run the demo on demand.
with tab_trust:
    st.markdown(
        "### Sanitise-at-source: two Trusts sharing without sharing\n\n"
        "Each Trust runs the NoteGuard gate locally — raw notes and the re-identification vault "
        "**never leave**. Only de-identified text and a content-free audit manifest go into the "
        "shared SDE pool. This is the same privacy model behind OpenSAFELY and the NHS Federated "
        "Data Platform: *code comes to the data, data never leaves*."
    )
    summary_file = OUT_DIR / "trust_demo_summary.json"
    summary = load_json(summary_file)
    if st.button("▶ Run two-Trust demo"):
        # Imported only when the demo is actually requested.
        from src.trust_demo import main as run_trust

        with st.spinner("Sanitising at each Trust…"):
            run_trust()
        # Re-read the summary after the run (presumably rewritten by
        # run_trust — confirm in src/trust_demo).
        summary = load_json(summary_file)

    if not summary:
        st.info("Click **Run two-Trust demo** above.")
    else:
        trusts = summary["trusts"]
        # One column per Trust, plus a final one for the shared pool.
        cols = st.columns(len(trusts) + 1)
        for col, trust in zip(cols, trusts, strict=False):
            with col:
                # Keep only the part of the name before any "(" suffix.
                short_name = trust["trust"].split("(")[0].strip()
                st.markdown(f"#### 🏥 {short_name}")
                st.metric("Notes de-identified", trust["notes_deidentified"])
                st.metric("Raw records shared", trust["raw_records_shared"])
                st.metric("Residual leaks", trust["residual_leaks"])
                st.caption("🔒 raw notes + vault stay local")
        pool_col = cols[-1]
        with pool_col:
            st.markdown("#### 🟢 Shared SDE pool")
            st.metric("De-identified notes", summary["shared_pool_size"])
            st.metric("Raw records shared", summary["raw_records_shared"])
            st.metric("Total residual leaks", summary["total_residual_leaks"])
            st.caption("→ ready for federated AI training")
# ---------------------------------------------------------------- Footer (all tabs)
# Rendered at module level, outside every tab context.
st.divider()
_footer_text = (
    "Live demo for the **FLock Sovereign AI Challenge** at the Encode Vibe Coding Hackathon, "
    "hosted by Encode Hub."
)
st.caption(_footer_text)