File size: 15,521 Bytes
98cd18a
cc56abb
ba1ace4
98cd18a
 
84981a4
cc56abb
 
 
 
 
 
 
 
 
6f7e511
cc56abb
 
0b26370
cc56abb
 
84981a4
 
 
 
 
cc56abb
9ee0f17
 
cc56abb
 
 
39f19d3
 
98cd18a
cc56abb
 
98cd18a
cc56abb
 
f542ae7
98cd18a
 
 
747b6ef
98cd18a
 
 
cc56abb
 
98cd18a
cc56abb
98cd18a
 
 
 
cc56abb
98cd18a
 
 
39f19d3
cc56abb
39f19d3
 
 
cc56abb
98cd18a
cc56abb
 
 
 
98cd18a
cc56abb
 
 
 
 
 
 
98cd18a
cc56abb
 
 
39f19d3
d9b0ba1
cc56abb
98cd18a
cc56abb
 
4b0d494
cc56abb
 
 
 
39f19d3
 
 
 
cc56abb
 
98cd18a
 
 
 
cc56abb
98cd18a
747b6ef
 
6f7e511
cc56abb
98cd18a
 
39f19d3
 
6f7e511
cc56abb
 
4d404c0
39f19d3
 
98cd18a
39f19d3
 
98cd18a
39f19d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc56abb
6f7e511
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc56abb
 
39f19d3
 
 
 
 
 
 
 
 
 
98cd18a
 
4b0d494
98cd18a
 
 
 
 
 
 
 
 
 
 
39f19d3
 
 
 
 
98cd18a
39f19d3
98cd18a
cc56abb
39f19d3
 
98cd18a
 
39f19d3
 
 
f542ae7
39f19d3
cc56abb
 
1ca31db
cc56abb
 
 
39f19d3
 
 
 
 
 
 
 
 
f542ae7
39f19d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f542ae7
39f19d3
 
 
 
 
 
 
6f7e511
39f19d3
 
 
cc56abb
 
 
39f19d3
 
 
 
 
 
 
98cd18a
cc56abb
84981a4
98cd18a
cc56abb
98cd18a
cc56abb
 
 
84981a4
cc56abb
 
 
 
98cd18a
cc56abb
 
 
 
 
98cd18a
6f7e511
cc56abb
1ca31db
6f7e511
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
"""NoteGuard — demo UI.

Run from the repo root:  streamlit run streamlit_app.py

Try-it (detect & sanitise) · Metrics & leakage · Governance (Five Safes) · Two-Trust sharing.
Built on the NoteGuard package (src/) — pluggable detectors + patient-consistent transforms.
"""
from __future__ import annotations

import html
import json
import sys
from collections import Counter
from pathlib import Path

import pandas as pd
import streamlit as st

REPO = Path(__file__).resolve().parent
sys.path.insert(0, str(REPO))

from src.data import load_notes  # noqa: E402
from src.detect import build_detector  # noqa: E402
from src.evaluate import evaluate  # noqa: E402
from src.pipeline import Pipeline  # noqa: E402
from src.transform import PSEUDONYM, REDACTION, PseudonymVault  # noqa: E402

# Directory holding the demo's JSON artefacts (results.json, trust_demo_summary.json).
OUT_DIR = REPO / "outputs"
RESULTS = OUT_DIR / "results.json"

# Background colour used by highlight() for each detected entity type;
# types missing from this table fall back to a neutral grey there.
ENTITY_COLORS = {
    "PERSON": "#ffd6e0",
    "UK_NHS": "#ffe9b3",
    "DATE_TIME": "#d4f4dd",
    "UK_POSTCODE": "#cfe8ff",
    "LOCATION": "#cfe8ff",
    "ORGANIZATION": "#cfe8ff",
    "RECORD_ID": "#ffd9c2",
    "PHONE_NUMBER": "#d4f4dd",
    "EMAIL_ADDRESS": "#d4f4dd",
    "UK_NINO": "#ffe9b3",
    "GMC": "#f0e0a0",
    "NMC": "#f0e0a0",
    "NHS_ODS": "#f0e0a0",
}

st.set_page_config(page_title="NoteGuard", page_icon="🛡️", layout="wide")


@st.cache_resource(show_spinner="Loading the de-identification engine + sample notes…")
def load_engine():
    """Build the PII detector and load the sample notes used by the demo.

    The result is wrapped in ``st.cache_resource`` so the work is not repeated
    on every Streamlit rerun.

    Returns:
        A ``(detector, notes)`` tuple. ``notes`` is an empty list when the
        sample notes could not be loaded, so the UI can still be used with
        pasted text.
    """
    # Local import: logging is only needed on the failure path of this loader.
    import logging

    detector = build_detector(use_presidio=True)
    try:
        notes = load_notes(limit=50)
    except Exception:
        # Deliberate best-effort: the app must still start without sample data,
        # but record why the notes are missing instead of hiding the failure.
        logging.getLogger(__name__).warning(
            "Could not load sample notes; continuing without them.", exc_info=True
        )
        notes = []
    return detector, notes


def highlight(text: str, spans) -> str:
    """Return *text* as HTML with each selected span wrapped in a ``<mark>``.

    Spans are ordered by start offset (longest first on ties) and a span is
    kept only if it starts at or after the end of the previously kept one, so
    overlapping detections are never nested. All note text is HTML-escaped and
    newlines become ``<br>``.

    Args:
        text: The original note text.
        spans: Iterable of objects exposing ``start``, ``end``, ``entity_type``,
            ``score`` and ``needs_review``.

    Returns:
        An HTML fragment ready to be embedded in the page.
    """
    ordered = sorted(spans, key=lambda sp: (sp.start, sp.start - sp.end))

    # Greedy left-to-right selection of non-overlapping spans.
    kept = []
    boundary = -1
    for span in ordered:
        if span.start < boundary:
            continue
        kept.append(span)
        boundary = span.end

    pieces = []
    cursor = 0
    for span in kept:
        # Plain text between the previous mark and this one.
        pieces.append(html.escape(text[cursor:span.start]))
        background = ENTITY_COLORS.get(span.entity_type, "#e0e0e0")
        if span.needs_review:
            # Spans awaiting human review get a dashed outline and a tooltip note.
            outline = "2px dashed #e67e00"
            review_note = " ⚠ review"
        else:
            outline = "none"
            review_note = ""
        tooltip = f"{span.entity_type} ({span.score:.2f}){review_note}"
        covered = html.escape(text[span.start:span.end])
        pieces.append(
            f'<mark style="background:{background};padding:0 2px;border-radius:3px;border:{outline}" '
            f'title="{tooltip}">{covered}</mark>'
        )
        cursor = span.end
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces).replace("\n", "<br>")


def scroll_box(inner_html: str, height: int = 340):
    """Render an HTML fragment inside a bordered, scrollable, monospace box.

    Args:
        inner_html: Markup inserted as-is (the box is rendered with
            ``unsafe_allow_html=True``), so callers must escape any raw text.
        height: Box height in pixels.
    """
    box_style = (
        f"height:{height}px;overflow:auto;border:1px solid #ddd;border-radius:8px;"
        "padding:12px;font-family:ui-monospace,monospace;font-size:13px;line-height:1.5"
    )
    st.markdown(f'<div style="{box_style}">{inner_html}</div>', unsafe_allow_html=True)


def load_json(path: Path):
    """Return the parsed JSON content of *path*, or ``None`` if it is unavailable.

    ``None`` is returned both when the file does not exist and when its content
    is not valid JSON (for example a truncated write), so callers can treat
    "no usable data yet" uniformly instead of crashing on every page rerun.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or ``None``.
    """
    # EAFP: read directly rather than exists()-then-read, which can race with
    # the file being removed in between.
    try:
        raw = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None


# Page header shown above every tab.
st.title("🛡️ NoteGuard — NHS De-Identification Gate")
st.caption("AI detects patient and clinician PII, humans review, audit logs account.")

# Detector and sample notes come from the cached loader.
detector, NOTES = load_engine()

# One tab per demo section; the label order fixes the order of the tab handles.
tab_labels = [
    "🔎 Try it",
    "📊 Metrics & Leakage",
    "🏛️ Governance (Five Safes)",
    "🤝 Two-Trust sharing",
]
tab_try, tab_metrics, tab_gov, tab_trust = st.tabs(tab_labels)

# ---------------------------------------------------------------- Try it
# "Try it" tab: sanitise a single note interactively, then optionally export a batch.
with tab_try:
    st.markdown(
        "Paste a clinical note and see what the gate detects, removes, and flags for human review "
        "before the text is allowed into the SDE pool."
    )
    c1, c2 = st.columns([3, 2])
    # The right-hand column is filled first because the left-hand column
    # branches on `source`, which is chosen here.
    with c2:
        method = st.radio("Transform", [PSEUDONYM, REDACTION],
                          format_func=lambda m: "Pseudonymise (realistic, patient-consistent)"
                          if m == PSEUDONYM else "Redact ([TYPE] tags)")
        source = st.radio("Input", ["Sample note", "Paste your own"])
    with c1:
        # Sample notes are only offered when some were loaded; otherwise the
        # free-text box is shown even if "Sample note" is selected.
        if source == "Sample note" and NOTES:
            # The widget is 1-based; convert to a 0-based list index.
            idx = st.number_input("Note index", 1, len(NOTES), 1, step=1)
            rec = NOTES[int(idx) - 1]
            text, person_id, note_id = rec.text, rec.person_id, rec.note_id
        else:
            text = st.text_area("Clinical note (messy free-text)", height=200,
                                value="Pt John Smith, NHS no 943 476 5919, DOB 02/03/1981, lives SW1A 1AA. "
                                      "Admitted Manchester Royal Infirmary Ward 9. "
                                      "Reviewed by Dr Lee, GMC 1234567.")
            # Placeholder identifiers for text that is not a loaded record.
            person_id, note_id = "demo", "pasted"

    # Nothing below is rendered for empty or whitespace-only input.
    if text.strip():
        # A fresh vault is created for this single-note run.
        result = Pipeline(detector, PseudonymVault()).sanitise(text, method, person_id)

        st.markdown("##### 1) Detected PII")
        scroll_box(highlight(text, result.spans))

        st.markdown(f"##### 2) Sanitised output — `{method}`")
        # scroll_box inserts raw HTML, so the sanitised text is escaped here first.
        scroll_box(html.escape(result.sanitised).replace("\n", "<br>"))

        st.markdown("##### 3) Audit log (counts only — no raw values leave the gate)")
        # Only spans that are not flagged for review are counted per entity type.
        confirmed = [s for s in result.spans if not s.needs_review]
        counts = Counter(s.entity_type for s in confirmed)
        st.dataframe({"entity": list(counts), "auto-removed": list(counts.values())},
                     hide_index=True, use_container_width=True)

        if result.review_items:
            st.warning(
                f"**Human review required — {len(result.review_items)} low-confidence detection(s)**\n\n"
                "These spans were redacted for safety but the model's confidence was below the "
                "auto-confirm threshold. An IG analyst should confirm before the note enters the SDE pool.",
                icon="⚠️",
            )
            for s in result.review_items:
                # Show up to 40 characters of context on each side of the span.
                context_start = max(0, s.start - 40)
                context_end = min(len(text), s.end + 40)
                # Newlines become single spaces (same length), so the offsets
                # relative to context_start used below remain valid.
                ctx = text[context_start:context_end].replace("\n", " ")
                st.markdown(
                    f"- **`{s.entity_type}`** · score `{s.score:.2f}` · "
                    f'…{html.escape(ctx[:s.start - context_start])}'
                    f'**_{html.escape(s.text)}_**'
                    f'{html.escape(ctx[s.end - context_start:])}…'
                )
        else:
            st.success("All detections auto-confirmed (score ≥ threshold). No human review needed.", icon="✅")

        st.markdown("##### 4) Download this de-identified note")
        # The export row carries only the note id, the method and the sanitised text.
        one = [{"note_id": note_id, "method": method, "sanitised_text": result.sanitised}]
        d1, d2 = st.columns(2)
        d1.download_button("⬇ Download JSON", json.dumps(one, ensure_ascii=False, indent=2),
                           file_name="noteguard_note.json", mime="application/json",
                           use_container_width=True)
        d2.download_button("⬇ Download CSV", pd.DataFrame(one).to_csv(index=False),
                           file_name="noteguard_note.csv", mime="text/csv",
                           use_container_width=True)

    st.divider()
    # Batch export: uses the transform method currently selected above.
    with st.expander("⬇ Download the full de-identified dataset"):
        st.caption("De-identify a batch of notes and export **only the sanitised text** — "
                   "the original PHI never leaves the gate.")
        ca, cb = st.columns([3, 1])
        n_all = ca.slider("Notes to de-identify", 50, 1600, 200, step=50, key="dataset_n")
        if cb.button("Prepare", use_container_width=True):
            with st.spinner(f"De-identifying {n_all} notes…"):
                pipe = Pipeline(detector, PseudonymVault())  # one vault → patient-consistent
                # Records with empty text are skipped.
                rows = [{"note_id": r.note_id, "method": method,
                         "sanitised_text": pipe.sanitise(r.text, method, r.person_id).sanitised}
                        for r in load_notes(limit=n_all) if r.text]
                # Kept in session state so the download buttons below can read
                # the rows on reruns where "Prepare" was not clicked.
                st.session_state["dataset_rows"] = rows
        rows = st.session_state.get("dataset_rows")
        if rows:
            st.success(f"{len(rows)} notes de-identified — ready to download.")
            e1, e2 = st.columns(2)
            e1.download_button("⬇ Download JSON", json.dumps(rows, ensure_ascii=False, indent=2),
                               file_name="noteguard_dataset.json", mime="application/json",
                               use_container_width=True)
            e2.download_button("⬇ Download CSV", pd.DataFrame(rows).to_csv(index=False),
                               file_name="noteguard_dataset.csv", mime="text/csv",
                               use_container_width=True)

# ---------------------------------------------------------------- Metrics
# Metrics tab: show stored evaluation results, or run a live evaluation and store it.
with tab_metrics:
    st.markdown(
        "**Leakage rate** is the headline SDE gate metric: after sanitisation, what fraction of known "
        "patient identifiers still appear in the output text? Ground truth is **joined from the "
        "dataset's structured tables** — every note's identifiers are known in advance, so this is "
        "a real, measurable re-identification risk, not an estimate."
    )
    st.markdown(
        "> **Target for SDE admission:** leakage = 0. Any note with a non-zero leakage score "
        "must be held back from the shared pool until reviewed."
    )
    # Start from whatever a previous run stored; a live run below replaces it.
    data = load_json(RESULTS)
    n = st.slider("Notes to evaluate (live run)", 50, 1000, 200, step=50)
    if st.button("▶ Run evaluation"):
        with st.spinner("Evaluating…"):
            recs = load_notes(limit=n)
            res = evaluate(recs, detector, PSEUDONYM).to_dict()
            data = {"presidio+rules": res}
            # The outputs directory may not exist yet (e.g. on a fresh checkout);
            # create it so persisting the results cannot fail with FileNotFoundError.
            RESULTS.parent.mkdir(parents=True, exist_ok=True)
            RESULTS.write_text(json.dumps(data, indent=2), encoding="utf-8")

    if data:
        # Prefer the "presidio+rules" entry; otherwise show the first stored result.
        name = "presidio+rules" if "presidio+rules" in data else next(iter(data))
        r = data[name]
        leak = r["leakage"]["leakage_rate_pct"]
        m1, m2, m3 = st.columns(3)
        m1.metric("Identifiers removed", f"{100 - leak:.1f}%", help="Known PII not present in output")
        # The delta is only shown when something leaked; "inverse" colours an increase as bad.
        m2.metric("Residual leakage", f"{leak:.2f}%",
                  delta=f"{leak:.2f}%" if leak > 0 else None,
                  delta_color="inverse",
                  help="Fraction of known PII surviving sanitisation — target: 0%")
        m3.metric("Notes evaluated", r["notes_evaluated"])
        st.markdown("##### Detection recall by entity type")
        pe = r["detection"]["per_entity"]
        st.dataframe(
            {"entity": list(pe),
             "recall": [f"{m['recall']:.0%}" for m in pe.values()],
             "precision": [f"{m['precision']:.0%}" for m in pe.values()],
             "support": [m["support"] for m in pe.values()]},
            hide_index=True, use_container_width=True,
        )
        st.caption(
            "Precision is a conservative lower bound. Clinician names and unlisted locations "
            "detected correctly are counted as false positives."
        )
    else:
        st.info("No metrics yet — click **Run evaluation** above.")

# ---------------------------------------------------------------- Governance
# Governance tab: static explanation of how the tool maps onto the Five Safes.
with tab_gov:
    st.markdown("### NHS Five Safes — How NoteGuard maps")
    st.markdown(
        "The Five Safes framework is the standard NHS governance model for data access. "
        "NoteGuard is designed as the **Safe Data** layer that makes the other four safes cheaper to achieve."
    )

    # Each row is (safe heading, standard / short label, explanatory detail).
    safes_table = (
        (
            "✅ Safe Data",
            "DAPB1523 / ICO standard",
            "Names · NHS number · DOB · postcode → outward code · "
            "GMC/NMC clinician IDs · ODS org codes · record UUIDs · site names. "
            "NRP (nationality/religion) always redacted, never pseudonymised (UK GDPR Art. 9).",
        ),
        (
            "✅ Safe Settings",
            "Processing inside the Trust",
            "Detection and sanitisation run locally. Raw notes, vault (re-id key), and CSVs "
            "are gitignored and never leave the Trust boundary. Only de-identified text is exported.",
        ),
        (
            "✅ Safe Outputs",
            "Leakage-gated release",
            "Residual leakage is measured against ground-truth identifiers before any note enters "
            "the SDE pool. Target: 0 known identifiers surviving sanitisation. "
            "Low-confidence spans are held in a human review queue rather than auto-released.",
        ),
        (
            "⚠️ Safe People",
            "Human-in-the-loop required",
            "The re-identification vault stays Trust-local. Pseudonymised data is still personal "
            "data under UK GDPR (stated honestly — no over-claim of anonymisation). "
            "An IG analyst reviews low-confidence detections before pool admission.",
        ),
        (
            "⚠️ Safe Projects",
            "Project-level approval not covered here",
            "NoteGuard provides the technical de-identification layer; "
            "project-level data access approval (Data Access Request / DARS) remains a Trust process.",
        ),
    )
    # One collapsible section per safe, titled "<safe> — <standard>".
    for heading, standard_label, explanation in safes_table:
        section = st.expander(f"**{heading}** — {standard_label}")
        with section:
            st.markdown(explanation)

    st.divider()
    st.markdown("### Adoption path — NHS SDE on-ramp")
    # Fenced block so the ASCII flow diagram keeps its alignment.
    adoption_diagram = """
```
NHS Trust (raw notes)

    ▼  NoteGuard gate (runs inside Trust)
    │   clean → detect PII → sanitise → leakage check
    │   low-confidence spans → IG analyst review queue

    ▼  de-identified notes + audit log  (no PHI crosses boundary)

    ▼  NHS Secure Data Environment / Federated Data Platform pool
    │   (same model as OpenSAFELY: code comes to data, data never leaves)

    ▼  Federated AI training
        each Trust trains locally; only model gradients are shared
```
    """
    st.markdown(adoption_diagram)

# ---------------------------------------------------------------- Two-Trust
# Two-Trust tab: run the sharing demo on request and display its summary.
with tab_trust:
    st.markdown(
        "### Sanitise-at-source: two Trusts sharing without sharing\n\n"
        "Each Trust runs the NoteGuard gate locally — raw notes and the re-identification vault "
        "**never leave**. Only de-identified text and a content-free audit manifest go into the "
        "shared SDE pool. This is the same privacy model behind OpenSAFELY and the NHS Federated "
        "Data Platform: *code comes to the data, data never leaves*."
    )
    summary_path = OUT_DIR / "trust_demo_summary.json"
    summary = load_json(summary_path)
    if st.button("▶ Run two-Trust demo"):
        # Imported on demand, only when the demo is actually run.
        from src.trust_demo import main as run_trust
        with st.spinner("Sanitising at each Trust…"):
            run_trust()
        # Re-read the summary after the demo run.
        summary = load_json(summary_path)

    if not summary:
        st.info("Click **Run two-Trust demo** above.")
    else:
        trusts = summary["trusts"]
        # One column per Trust, plus a final column for the shared pool.
        *trust_cols, pool_col = st.columns(len(trusts) + 1)
        for trust_col, trust in zip(trust_cols, trusts):
            with trust_col:
                # Display only the part of the Trust name before any "(".
                trust_name = trust["trust"].split("(")[0].strip()
                st.markdown(f"#### 🏥 {trust_name}")
                st.metric("Notes de-identified", trust["notes_deidentified"])
                st.metric("Raw records shared", trust["raw_records_shared"])
                st.metric("Residual leaks", trust["residual_leaks"])
                st.caption("🔒 raw notes + vault stay local")
        with pool_col:
            st.markdown("#### 🟢 Shared SDE pool")
            st.metric("De-identified notes", summary["shared_pool_size"])
            st.metric("Raw records shared", summary["raw_records_shared"])
            st.metric("Total residual leaks", summary["total_residual_leaks"])
            st.caption("→ ready for federated AI training")

# ---------------------------------------------------------------- Footer (all tabs)
# Rendered outside the tab containers, below the tabs.
st.divider()
footer_text = (
    "Live demo for the **FLock Sovereign AI Challenge** at the Encode Vibe Coding Hackathon, "
    "hosted by Encode Hub."
)
st.caption(footer_text)