"""NoteGuard — demo UI.
Run from the repo root: streamlit run streamlit_app.py
Try-it (detect & sanitise) · Metrics & leakage · Governance (Five Safes) · Two-Trust sharing.
Built on the NoteGuard package (src/) — pluggable detectors + patient-consistent transforms.
"""
from __future__ import annotations
import html
import json
import sys
from collections import Counter
from pathlib import Path
import pandas as pd
import streamlit as st
# Absolute path of the directory that contains this file.
REPO = Path(__file__).resolve().parent
# Put that directory first on sys.path so the `src.*` imports that follow resolve
# against this checkout.
sys.path.insert(0, str(REPO))
from src.data import load_notes # noqa: E402
from src.detect import build_detector # noqa: E402
from src.evaluate import evaluate # noqa: E402
from src.pipeline import Pipeline # noqa: E402
from src.transform import PSEUDONYM, REDACTION, PseudonymVault # noqa: E402
# Files written and read back by this app (cached metrics, two-Trust summary).
OUT_DIR = REPO / "outputs"
RESULTS = OUT_DIR / "results.json"

# Background colour of the <mark> drawn around each detected entity type.
# Types missing from this table fall back to a neutral grey in highlight().
ENTITY_COLORS = {
    "PERSON": "#ffd6e0",
    "UK_NHS": "#ffe9b3",
    "DATE_TIME": "#d4f4dd",
    "UK_POSTCODE": "#cfe8ff",
    "LOCATION": "#cfe8ff",
    "ORGANIZATION": "#cfe8ff",
    "RECORD_ID": "#ffd9c2",
    "PHONE_NUMBER": "#d4f4dd",
    "EMAIL_ADDRESS": "#d4f4dd",
    "UK_NINO": "#ffe9b3",
    "GMC": "#f0e0a0",
    "NMC": "#f0e0a0",
    "NHS_ODS": "#f0e0a0",
}

st.set_page_config(
    page_title="NoteGuard",
    page_icon="🛡️",
    layout="wide",
)
@st.cache_resource(show_spinner="Loading the de-identification engine + sample notes…")
def load_engine():
    """Build the PII detector and load the sample notes.

    Wrapped in ``st.cache_resource`` so the result is reused across reruns.

    Returns:
        A ``(detector, notes)`` tuple. ``notes`` holds up to 50 sample notes,
        or is an empty list when they could not be loaded.
    """
    import logging

    detector = build_detector(use_presidio=True)
    try:
        notes = load_notes(limit=50)
    except Exception:
        # Deliberate best-effort: the app stays usable without sample notes
        # (the Try-it tab falls back to the paste box). Record the cause so
        # the missing samples are not a silent mystery.
        logging.getLogger(__name__).warning(
            "Could not load sample notes; continuing without them.", exc_info=True
        )
        notes = []
    return detector, notes
def highlight(text: str, spans) -> str:
    """Render *text* as an HTML fragment with detected spans wrapped in ``<mark>``.

    Overlaps are resolved greedily: spans are visited by ascending start
    offset (longest first when starts tie) and a span is kept only if it
    begins at or after the end of the previously kept span.

    Args:
        text: The note text that the span offsets index into.
        spans: Iterable of span objects exposing ``start``, ``end``,
            ``entity_type``, ``score`` and ``needs_review``.

    Returns:
        HTML in which all note text and the tooltip are escaped and every
        newline is replaced by ``<br>``.
    """
    chosen, last_end = [], -1
    for span in sorted(spans, key=lambda sp: (sp.start, -(sp.end - sp.start))):
        if span.start >= last_end:
            chosen.append(span)
            last_end = span.end

    out, idx = [], 0
    for span in chosen:
        out.append(html.escape(text[idx:span.start]))
        color = ENTITY_COLORS.get(span.entity_type, "#e0e0e0")
        # Spans awaiting human review get a dashed outline.
        border = "2px dashed #e67e00" if span.needs_review else "none"
        review_note = " ⚠ review" if span.needs_review else ""
        # The fragment is rendered as raw HTML, so the tooltip must be escaped:
        # a quote in entity_type would otherwise close the title attribute.
        title = html.escape(f"{span.entity_type} ({span.score:.2f}){review_note}")
        out.append(
            f'<mark style="background:{color};padding:0 2px;border-radius:3px;border:{border}" '
            f'title="{title}">{html.escape(text[span.start:span.end])}</mark>'
        )
        idx = span.end
    out.append(html.escape(text[idx:]))
    return "".join(out).replace("\n", "<br>")
def scroll_box(inner_html: str, height: int = 340):
    """Show *inner_html* inside a bordered, scrollable, monospace box.

    Args:
        inner_html: Markup placed verbatim inside the box; it is emitted with
            ``unsafe_allow_html=True``, so it must already be escaped.
        height: Fixed height of the box in pixels.
    """
    box_style = (
        f"height:{height}px;overflow:auto;border:1px solid #ddd;border-radius:8px;"
        "padding:12px;font-family:ui-monospace,monospace;font-size:13px;line-height:1.5"
    )
    box_html = f'<div style="{box_style}">{inner_html}</div>'
    st.markdown(box_html, unsafe_allow_html=True)
def load_json(path: Path):
    """Return the parsed JSON document stored at *path*, or ``None``.

    ``None`` is returned when the file does not exist, and also when it exists
    but cannot be decoded as UTF-8 JSON (for example a truncated write), so a
    damaged cache file is treated like a missing one instead of aborting the
    whole page.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value, or ``None`` if there is nothing usable to load.
    """
    try:
        raw = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Read first, handle absence after: avoids the exists()/read race.
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None
# Page header, emitted before the tab strip is created.
st.title("🛡️ NoteGuard — NHS De-Identification Gate")
st.caption("AI detects patient and clinician PII, humans review, audit logs account.")

# Detector and sample notes come from the cached loader.
detector, NOTES = load_engine()

# One tab per section of the demo; the order here is the on-screen order.
tab_labels = [
    "🔎 Try it",
    "📊 Metrics & Leakage",
    "🏛️ Governance (Five Safes)",
    "🤝 Two-Trust sharing",
]
tab_try, tab_metrics, tab_gov, tab_trust = st.tabs(tab_labels)
# ---------------------------------------------------------------- Try it
with tab_try:
    st.markdown(
        "Paste a clinical note and see what the gate detects, removes, and flags for human review "
        "before the text is allowed into the SDE pool."
    )

    # Options are created first (right column) because the input column reads them.
    c1, c2 = st.columns([3, 2])
    with c2:
        method = st.radio(
            "Transform",
            [PSEUDONYM, REDACTION],
            format_func=lambda m: (
                "Pseudonymise (realistic, patient-consistent)"
                if m == PSEUDONYM
                else "Redact ([TYPE] tags)"
            ),
        )
        source = st.radio("Input", ["Sample note", "Paste your own"])
    with c1:
        if NOTES and source == "Sample note":
            idx = st.number_input("Note index", 1, len(NOTES), 1, step=1)
            rec = NOTES[int(idx) - 1]  # the widget is 1-based
            text = rec.text
            person_id = rec.person_id
            note_id = rec.note_id
        else:
            # Also reached when "Sample note" is selected but no notes were loaded.
            text = st.text_area(
                "Clinical note (messy free-text)",
                height=200,
                value="Pt John Smith, NHS no 943 476 5919, DOB 02/03/1981, lives SW1A 1AA. "
                      "Admitted Manchester Royal Infirmary Ward 9. "
                      "Reviewed by Dr Lee, GMC 1234567.",
            )
            person_id, note_id = "demo", "pasted"

    if text.strip():
        pipeline = Pipeline(detector, PseudonymVault())
        result = pipeline.sanitise(text, method, person_id)

        st.markdown("##### 1) Detected PII")
        scroll_box(highlight(text, result.spans))

        st.markdown(f"##### 2) Sanitised output — `{method}`")
        scroll_box(html.escape(result.sanitised).replace("\n", "<br>"))

        st.markdown("##### 3) Audit log (counts only — no raw values leave the gate)")
        # Only spans that do not need review count as auto-removed.
        confirmed = [span for span in result.spans if not span.needs_review]
        counts = Counter(span.entity_type for span in confirmed)
        st.dataframe(
            {"entity": list(counts), "auto-removed": list(counts.values())},
            hide_index=True,
            use_container_width=True,
        )

        if result.review_items:
            st.warning(
                f"**Human review required — {len(result.review_items)} low-confidence detection(s)**\n\n"
                "These spans were redacted for safety but the model's confidence was below the "
                "auto-confirm threshold. An IG analyst should confirm before the note enters the SDE pool.",
                icon="⚠️",
            )
            for span in result.review_items:
                # Up to 40 characters of context on either side of the span.
                context_start = max(0, span.start - 40)
                context_end = min(len(text), span.end + 40)
                ctx = text[context_start:context_end].replace("\n", " ")
                before = html.escape(ctx[:span.start - context_start])
                flagged = html.escape(span.text)
                after = html.escape(ctx[span.end - context_start:])
                st.markdown(
                    f"- **`{span.entity_type}`** · score `{span.score:.2f}` · "
                    f"…{before}**_{flagged}_**{after}…"
                )
        else:
            st.success("All detections auto-confirmed (score ≥ threshold). No human review needed.", icon="✅")

        st.markdown("##### 4) Download this de-identified note")
        one = [{"note_id": note_id, "method": method, "sanitised_text": result.sanitised}]
        d1, d2 = st.columns(2)
        d1.download_button(
            "⬇ Download JSON",
            json.dumps(one, ensure_ascii=False, indent=2),
            file_name="noteguard_note.json",
            mime="application/json",
            use_container_width=True,
        )
        d2.download_button(
            "⬇ Download CSV",
            pd.DataFrame(one).to_csv(index=False),
            file_name="noteguard_note.csv",
            mime="text/csv",
            use_container_width=True,
        )

    st.divider()
    with st.expander("⬇ Download the full de-identified dataset"):
        st.caption("De-identify a batch of notes and export **only the sanitised text** — "
                   "the original PHI never leaves the gate.")
        ca, cb = st.columns([3, 1])
        n_all = ca.slider("Notes to de-identify", 50, 1600, 200, step=50, key="dataset_n")
        if cb.button("Prepare", use_container_width=True):
            with st.spinner(f"De-identifying {n_all} notes…"):
                # One pipeline, hence one vault, is shared by the whole batch.
                pipe = Pipeline(detector, PseudonymVault())
                rows = [
                    {
                        "note_id": note.note_id,
                        "method": method,
                        "sanitised_text": pipe.sanitise(note.text, method, note.person_id).sanitised,
                    }
                    for note in load_notes(limit=n_all)
                    if note.text
                ]
            # Kept in session state so the download buttons survive reruns.
            st.session_state["dataset_rows"] = rows

        rows = st.session_state.get("dataset_rows")
        if rows:
            st.success(f"{len(rows)} notes de-identified — ready to download.")
            e1, e2 = st.columns(2)
            e1.download_button(
                "⬇ Download JSON",
                json.dumps(rows, ensure_ascii=False, indent=2),
                file_name="noteguard_dataset.json",
                mime="application/json",
                use_container_width=True,
            )
            e2.download_button(
                "⬇ Download CSV",
                pd.DataFrame(rows).to_csv(index=False),
                file_name="noteguard_dataset.csv",
                mime="text/csv",
                use_container_width=True,
            )
# ---------------------------------------------------------------- Metrics
with tab_metrics:
    st.markdown(
        "**Leakage rate** is the headline SDE gate metric: after sanitisation, what fraction of known "
        "patient identifiers still appear in the output text? Ground truth is **joined from the "
        "dataset's structured tables** — every note's identifiers are known in advance, so this is "
        "a real, measurable re-identification risk, not an estimate."
    )
    st.markdown(
        "> **Target for SDE admission:** leakage = 0. Any note with a non-zero leakage score "
        "must be held back from the shared pool until reviewed."
    )

    # Start from the cached results file (None if absent); a live run below replaces it.
    data = load_json(RESULTS)
    n = st.slider("Notes to evaluate (live run)", 50, 1000, 200, step=50)
    if st.button("▶ Run evaluation"):
        with st.spinner("Evaluating…"):
            recs = load_notes(limit=n)
            res = evaluate(recs, detector, PSEUDONYM).to_dict()
            data = {"presidio+rules": res}
            # Path.write_text does not create missing parents; without this the
            # first run on a checkout with no outputs/ directory raises
            # FileNotFoundError after the evaluation has already been computed.
            RESULTS.parent.mkdir(parents=True, exist_ok=True)
            RESULTS.write_text(json.dumps(data, indent=2), encoding="utf-8")

    if data:
        # Prefer the "presidio+rules" entry; otherwise show the first one in the file.
        name = "presidio+rules" if "presidio+rules" in data else next(iter(data))
        r = data[name]
        leak = r["leakage"]["leakage_rate_pct"]

        m1, m2, m3 = st.columns(3)
        m1.metric("Identifiers removed", f"{100 - leak:.1f}%", help="Known PII not present in output")
        m2.metric("Residual leakage", f"{leak:.2f}%",
                  delta=f"{leak:.2f}%" if leak > 0 else None,
                  delta_color="inverse",
                  help="Fraction of known PII surviving sanitisation — target: 0%")
        m3.metric("Notes evaluated", r["notes_evaluated"])

        st.markdown("##### Detection recall by entity type")
        pe = r["detection"]["per_entity"]
        st.dataframe(
            {"entity": list(pe),
             "recall": [f"{m['recall']:.0%}" for m in pe.values()],
             "precision": [f"{m['precision']:.0%}" for m in pe.values()],
             "support": [m["support"] for m in pe.values()]},
            hide_index=True, use_container_width=True,
        )
        st.caption(
            "Precision is a conservative lower bound. Clinician names and unlisted locations "
            "detected correctly are counted as false positives."
        )
    else:
        st.info("No metrics yet — click **Run evaluation** above.")
# ---------------------------------------------------------------- Governance
with tab_gov:
    st.markdown("### NHS Five Safes — How NoteGuard maps")
    st.markdown(
        "The Five Safes framework is the standard NHS governance model for data access. "
        "NoteGuard is designed as the **Safe Data** layer that makes the other four safes cheaper to achieve."
    )

    # Each entry: (heading, sub-heading, body shown when the expander is opened).
    five_safes = [
        (
            "✅ Safe Data",
            "DAPB1523 / ICO standard",
            "Names · NHS number · DOB · postcode → outward code · "
            "GMC/NMC clinician IDs · ODS org codes · record UUIDs · site names. "
            "NRP (nationality/religion) always redacted, never pseudonymised (UK GDPR Art. 9).",
        ),
        (
            "✅ Safe Settings",
            "Processing inside the Trust",
            "Detection and sanitisation run locally. Raw notes, vault (re-id key), and CSVs "
            "are gitignored and never leave the Trust boundary. Only de-identified text is exported.",
        ),
        (
            "✅ Safe Outputs",
            "Leakage-gated release",
            "Residual leakage is measured against ground-truth identifiers before any note enters "
            "the SDE pool. Target: 0 known identifiers surviving sanitisation. "
            "Low-confidence spans are held in a human review queue rather than auto-released.",
        ),
        (
            "⚠️ Safe People",
            "Human-in-the-loop required",
            "The re-identification vault stays Trust-local. Pseudonymised data is still personal "
            "data under UK GDPR (stated honestly — no over-claim of anonymisation). "
            "An IG analyst reviews low-confidence detections before pool admission.",
        ),
        (
            "⚠️ Safe Projects",
            "Project-level approval not covered here",
            "NoteGuard provides the technical de-identification layer; "
            "project-level data access approval (Data Access Request / DARS) remains a Trust process.",
        ),
    ]
    for heading, subheading, body in five_safes:
        expander_label = f"**{heading}** — {subheading}"
        with st.expander(expander_label):
            st.markdown(body)

    st.divider()
    st.markdown("### Adoption path — NHS SDE on-ramp")
    # The diagram lines stay at column 0 because they are literal string content.
    st.markdown("""
```
NHS Trust (raw notes)
│
▼ NoteGuard gate (runs inside Trust)
│ clean → detect PII → sanitise → leakage check
│ low-confidence spans → IG analyst review queue
│
▼ de-identified notes + audit log (no PHI crosses boundary)
│
▼ NHS Secure Data Environment / Federated Data Platform pool
│ (same model as OpenSAFELY: code comes to data, data never leaves)
│
▼ Federated AI training
each Trust trains locally; only model gradients are shared
```
""")
# ---------------------------------------------------------------- Two-Trust
with tab_trust:
    st.markdown(
        "### Sanitise-at-source: two Trusts sharing without sharing\n\n"
        "Each Trust runs the NoteGuard gate locally — raw notes and the re-identification vault "
        "**never leave**. Only de-identified text and a content-free audit manifest go into the "
        "shared SDE pool. This is the same privacy model behind OpenSAFELY and the NHS Federated "
        "Data Platform: *code comes to the data, data never leaves*."
    )

    # Show the summary already on disk, if any; a fresh run below re-reads it.
    summary = load_json(OUT_DIR / "trust_demo_summary.json")
    if st.button("▶ Run two-Trust demo"):
        # Imported only when the button is pressed.
        from src.trust_demo import main as run_trust

        with st.spinner("Sanitising at each Trust…"):
            run_trust()
        summary = load_json(OUT_DIR / "trust_demo_summary.json")

    if not summary:
        st.info("Click **Run two-Trust demo** above.")
    else:
        trusts = summary["trusts"]
        # One column per Trust, plus a final column for the shared pool.
        *trust_cols, pool_col = st.columns(len(trusts) + 1)
        for col, t in zip(trust_cols, trusts):
            # Heading uses the Trust name up to the first "(".
            trust_name = t["trust"].split("(")[0].strip()
            with col:
                st.markdown(f"#### 🏥 {trust_name}")
                st.metric("Notes de-identified", t["notes_deidentified"])
                st.metric("Raw records shared", t["raw_records_shared"])
                st.metric("Residual leaks", t["residual_leaks"])
                st.caption("🔒 raw notes + vault stay local")
        with pool_col:
            st.markdown("#### 🟢 Shared SDE pool")
            st.metric("De-identified notes", summary["shared_pool_size"])
            st.metric("Raw records shared", summary["raw_records_shared"])
            st.metric("Total residual leaks", summary["total_residual_leaks"])
            st.caption("→ ready for federated AI training")
# ---------------------------------------------------------------- Footer (all tabs)
# Emitted outside every `with tab_*` block.
footer_note = (
    "Live demo for the **FLock Sovereign AI Challenge** at the Encode Vibe Coding Hackathon, "
    "hosted by Encode Hub."
)
st.divider()
st.caption(footer_note)
|