noteguard-agent / tests /test_deid.py
github-actions[bot]
Deploy 9aa839066bbf99a8ada733b41479a39770b3bb83 from main
eb83689
Raw
History Blame Contribute Delete
13.4 kB
"""Unit tests for noteguard.deid.
All tests use the standard-library-only de-id core — no external services,
no API keys, no network calls required. Safe to run in CI.
"""
from __future__ import annotations
import pytest
from src.deid import NoteGuard, load_known_from_csv
# Shared vault data handed to NoteGuard by _ng(): one known person name and
# one known NHS number that the tests below expect to be redacted.
KNOWN = {"PERSON": ["Margaret Okafor"], "NHS": ["485 777 3456"]}
def _ng() -> NoteGuard:
    """Build a fresh NoteGuard seeded with the module-level KNOWN vault."""
    guard = NoteGuard(known=KNOWN)
    return guard
@pytest.fixture(autouse=True)
def _no_ner(monkeypatch):
    """Default every test to the no-op detector (rule/vault layer only).

    Presidio NER is an optional recall boost (the ``[nlp]`` extra). Results
    must be deterministic whether or not it is installed, so the module-level
    detector is swapped for a plain ``_Detector`` before each test; the
    NER-path tests patch their own fake on top of this.
    """
    import src.deid as deid_module

    noop_detector = deid_module._Detector()
    monkeypatch.setattr(deid_module, "_DETECTOR", noop_detector)
# ── deidentify ────────────────────────────────────────────────────────────────
def test_nhs_number_replaced():
    """A vault NHS number is removed and an ``[NHS_`` surrogate appears."""
    cleaned = _ng().deidentify("Patient NHS 485 777 3456 admitted.").clean_text
    assert "485 777 3456" not in cleaned
    assert "[NHS_" in cleaned
def test_person_name_replaced():
    """A vault person name is removed and a ``[PERSON_`` surrogate appears."""
    cleaned = _ng().deidentify("Pt Margaret Okafor discharged home.").clean_text
    assert "Margaret Okafor" not in cleaned
    assert "[PERSON_" in cleaned
def test_email_replaced():
    """No ``@`` may survive in the clean text of a note holding an e-mail."""
    guard = _ng()
    outcome = guard.deidentify("Contact a.okafor@nhs.net for follow-up.")
    cleaned = outcome.clean_text
    assert "@" not in cleaned
def test_dob_replaced():
    """The literal date of birth must not appear in the clean text."""
    guard = _ng()
    outcome = guard.deidentify("DOB 14/03/1934, admitted post-fall.")
    cleaned = outcome.clean_text
    assert "14/03/1934" not in cleaned
def test_gmc_replaced():
    """A bare ``GMC <digits>`` number is removed from the clean text."""
    guard = _ng()
    outcome = guard.deidentify("Referring clinician GMC 1234567.")
    cleaned = outcome.clean_text
    assert "1234567" not in cleaned
def test_clean_text_passes_through_unchanged():
    """A note with no identifiers comes back byte-for-byte identical."""
    original_note = "Patient admitted post-fall. Hx AF, on warfarin. BP 128/74."
    cleaned = _ng().deidentify(original_note).clean_text
    assert cleaned == original_note
# ── GMC / NMC connector-word variants ────────────────────────────────────────
def test_gmc_with_connector_no():
    """``GMC No. <digits>`` is redacted and yields a ``[GMC_`` surrogate."""
    cleaned = _ng().deidentify("Referring clinician GMC No. 7654321.").clean_text
    assert "7654321" not in cleaned
    assert "[GMC_" in cleaned
def test_gmc_with_connector_number():
    """``GMC number <digits>`` is redacted from the clean text."""
    guard = _ng()
    outcome = guard.deidentify("GMC number 7654321 on record.")
    cleaned = outcome.clean_text
    assert "7654321" not in cleaned
def test_nmc_with_connector_number_colon():
    """``NMC number: <pin>`` is redacted and yields an ``[NMC_`` surrogate."""
    note = "Nurse Chukwuebuka Okafor, NMC number: 18D6896L"
    cleaned = _ng().deidentify(note).clean_text
    assert "18D6896L" not in cleaned
    assert "[NMC_" in cleaned
def test_nmc_with_pin():
    """A nurse ``PIN <pin>`` is redacted from the clean text."""
    guard = _ng()
    outcome = guard.deidentify("Registered nurse PIN 18D6896L.")
    cleaned = outcome.clean_text
    assert "18D6896L" not in cleaned
def test_nmc_bare():
    """A bare ``NMC <pin>`` with no connector word is redacted."""
    guard = _ng()
    outcome = guard.deidentify("NMC 18D6896L confirmed.")
    cleaned = outcome.clean_text
    assert "18D6896L" not in cleaned
# ── Clinician name detection (via expanded vault) ─────────────────────────────
def test_clinician_name_via_vault():
    """A clinician name listed in the vault is replaced by a surrogate."""
    vault = {"PERSON": ["Chukwuebuka Okafor", "Margaret Okafor"], "NHS": []}
    guard = NoteGuard(known=vault)
    cleaned = guard.deidentify("Nurse Chukwuebuka Okafor assessed the patient.").clean_text
    assert "Chukwuebuka Okafor" not in cleaned
    assert "[PERSON_" in cleaned
def test_full_clinician_nmc_note():
    """Vault nurse name plus an ``NMC number:`` pin in the same note.

    Both must be gone from the clean text, and ``assert_clean`` must accept
    the result without raising.
    """
    guard = NoteGuard(known={"PERSON": ["Chukwuebuka Okafor"], "NHS": []})
    outcome = guard.deidentify(
        "Patient assessed at triage by Nurse Chukwuebuka Okafor, NMC number: 18D6896L"
    )
    cleaned = outcome.clean_text
    assert "Chukwuebuka Okafor" not in cleaned
    assert "18D6896L" not in cleaned
    guard.assert_clean(cleaned)
# ── assert_clean ──────────────────────────────────────────────────────────────
def test_assert_clean_passes_on_safe_text():
    """``assert_clean`` accepts identifier-free text without raising."""
    safe_text = "Admitted post-fall. Hx AF. INR 2.4."
    guard = _ng()
    # The call itself is the assertion: any exception fails the test.
    guard.assert_clean(safe_text)
def test_assert_clean_raises_on_nhs_number():
    """A leftover vault NHS number raises ValueError naming the number."""
    guard = _ng()
    leaky_text = "NHS 485 777 3456 still present."
    with pytest.raises(ValueError, match="485 777 3456"):
        guard.assert_clean(leaky_text)
def test_assert_clean_raises_on_known_name():
    """A leftover vault name raises ValueError naming that person."""
    guard = _ng()
    leaky_text = "Patient Margaret Okafor discharged."
    with pytest.raises(ValueError, match="Margaret Okafor"):
        guard.assert_clean(leaky_text)
def test_assert_clean_raises_on_nmc():
    """An unredacted ``NMC number:`` pin makes ``assert_clean`` raise."""
    guard = _ng()
    leaky_text = "NMC number: 18D6896L not redacted."
    with pytest.raises(ValueError):
        guard.assert_clean(leaky_text)
# ── residual_identifiers (trust metric) ───────────────────────────────────────
def test_residual_identifiers_catches_orphaned_token():
    """A ``[LABEL_n]`` token with no reverse mapping is reported as a leak."""
    reverse_map = {"[PERSON_1]": "Real Name"}
    guard = NoteGuard(known={}, reverse=reverse_map)
    # [PERSON_2] deliberately has no entry in the reverse map.
    summary = "Summary for [PERSON_1] and [PERSON_2]."
    findings = guard.residual_identifiers(summary)
    assert any("unmapped_token" in finding for finding in findings)
    # [PERSON_1] is mapped, so no finding may mention it.
    assert all("PERSON_1" not in finding for finding in findings)
def test_residual_identifiers_catches_nmc():
    """A leftover nurse PIN yields at least one truthy finding."""
    guard = _ng()
    findings = guard.residual_identifiers("Nurse PIN 18D6896L still present.")
    # At least one non-empty finding must be reported.
    assert any(findings)
# ── reidentify ────────────────────────────────────────────────────────────────
def test_reidentify_restores_surrogate():
    """De-identifying then re-identifying brings back the name and NHS number."""
    guard = _ng()
    cleaned = guard.deidentify("Pt Margaret Okafor (NHS 485 777 3456) admitted.").clean_text
    round_tripped = guard.reidentify(cleaned)
    assert "Margaret Okafor" in round_tripped
    assert "485 777 3456" in round_tripped
def test_reidentify_consistent_surrogates():
    """Same original -> same surrogate across multiple notes.

    The same vault name is de-identified in two separate notes by one
    NoteGuard instance; the set of ``[PERSON_...]`` surrogates must match.
    """
    import re

    ng = _ng()
    r1 = ng.deidentify("Note 1: Margaret Okafor, INR normal.")
    r2 = ng.deidentify("Note 2: Margaret Okafor, discharged.")
    # Surrogates use an underscore ("[PERSON_1]"), not a hyphen. A regex is
    # used rather than str.split() so trailing punctuation ("[PERSON_1],")
    # cannot affect the comparison.
    surrogate = re.compile(r"\[PERSON_\w+\]")
    tokens_1 = set(surrogate.findall(r1.clean_text))
    tokens_2 = set(surrogate.findall(r2.clean_text))
    # Guard against a vacuous pass: two empty sets would compare equal.
    assert tokens_1, "expected at least one [PERSON_...] surrogate in note 1"
    assert tokens_1 == tokens_2
# ── load_known_from_csv ───────────────────────────────────────────────────────
def test_load_known_from_csv(tmp_path):
    """Names and NHS numbers from a patients CSV land in the PERSON/NHS lists."""
    patients_csv = tmp_path / "patients.csv"
    patients_csv.write_text("full_name,nhs_number\nJane Smith,123 456 7890\n")
    vault = load_known_from_csv(str(patients_csv))
    assert "Jane Smith" in vault["PERSON"]
    assert "123 456 7890" in vault["NHS"]
def test_load_known_from_csv_admissions(tmp_path):
    """Clinician-column names in admissions.csv are added to the PERSON list."""
    patients_csv = tmp_path / "patients.csv"
    admissions_csv = tmp_path / "admissions.csv"
    patients_csv.write_text("full_name,nhs_number\nJane Smith,123 456 7890\n")
    admissions_csv.write_text("clinician_name,attending\nDr Wei Wang,Nurse Chukwuebuka Okafor\n")
    vault = load_known_from_csv(str(patients_csv), str(admissions_csv))
    people = vault["PERSON"]
    assert "Dr Wei Wang" in people
    assert "Nurse Chukwuebuka Okafor" in people
def test_load_known_from_csv_missing_admissions(tmp_path):
    """A non-existent admissions path does not stop the patients CSV loading."""
    patients_csv = tmp_path / "patients.csv"
    patients_csv.write_text("full_name,nhs_number\nJane Smith,123 456 7890\n")
    absent_admissions = tmp_path / "missing.csv"
    vault = load_known_from_csv(str(patients_csv), str(absent_admissions))
    assert "Jane Smith" in vault["PERSON"]
# ── scan_pii: vault-independent residual-PII audit for the trust panel ─────────
def test_scan_pii_flags_titled_names_missed_by_vault():
    """Titled free-text names with no vault entry are caught by ``scan_pii``.

    This is the reported failure: clinician names absent from the vault slip
    past de-id, so the audit must flag them while ignoring tokenised IDs.
    """
    # Empty vault: simulates an arbitrary pasted note.
    guard = NoteGuard(known={"PERSON": [], "NHS": []})
    pasted_note = (
        "Contacted patient's GP, Dr. Ethel Joanne Duffy, to provide an update.\n"
        "Nurse Jasmine Freda Murray\nNMC number: 20F4626L"
    )
    cleaned = guard.deidentify(pasted_note).clean_text
    findings = guard.scan_pii(cleaned)
    flagged = " | ".join(item["text"] for item in findings)
    assert all(item["type"] == "name" for item in findings)
    assert "Ethel Joanne Duffy" in flagged
    assert "Jasmine Freda Murray" in flagged
    # The tokenised NMC id is not PII and must not be reported.
    assert "NMC" not in flagged and "[NMC_1]" not in flagged
def test_scan_pii_clean_when_names_tokenised():
    """Vault names become surrogates, so the audit finds no residual PII."""
    vault = {"PERSON": ["Ethel Joanne Duffy", "Jasmine Freda Murray"], "NHS": []}
    guard = NoteGuard(known=vault)
    cleaned = guard.deidentify("GP Dr. Ethel Joanne Duffy. Nurse Jasmine Freda Murray.").clean_text
    findings = guard.scan_pii(cleaned)
    assert findings == []
def test_scan_pii_ignores_surrogate_tokens_and_role_words():
    """Neither surrogate tokens nor bare role words count as names."""
    guard = NoteGuard()
    already_clean = "Consultant: [PERSON_1], seen by Dr [PERSON_2]. Nurse Practitioner reviewed."
    findings = guard.scan_pii(already_clean)
    assert findings == []
def test_scan_pii_flags_residual_structured_identifier():
    """A surviving e-mail address is reported with type ``email``."""
    guard = NoteGuard()
    findings = guard.scan_pii("Contact the team at a.b.smith@nhs.net for queries.")
    email_hits = [
        item
        for item in findings
        if item["type"] == "email" and "a.b.smith@nhs.net" in item["text"]
    ]
    assert email_hits
# ── NER path: Presidio/spaCy redacts free-text names with no vault entry ───────
def test_deidentify_redacts_ner_detected_names(monkeypatch):
    """A name reported by the NLP detector is tokenised without a vault entry.

    This is the recall boost the ``[nlp]`` extra adds in the deployed image;
    a fake detector stands in for it here.
    """
    import src.deid as deid

    target_name = "Ethel Joanne Duffy"

    class _StubDetector(deid._Detector):
        """Reports the target name only when it occurs in the text."""

        def detect_persons(self, text: str) -> list[str]:
            if target_name in text:
                return ["Ethel Joanne Duffy"]
            return []

    monkeypatch.setattr(deid, "_DETECTOR", _StubDetector())
    guard = deid.NoteGuard(known={"PERSON": [], "NHS": []})
    cleaned = guard.deidentify("Reviewed by Ethel Joanne Duffy on the ward round.").clean_text
    assert "Ethel Joanne Duffy" not in cleaned
    assert "[PERSON_" in cleaned
    # Nothing should be left for the audit to flag.
    assert guard.scan_pii(cleaned) == []
def test_shifted_date_round_trips_through_reidentify():
    """A shifted visit date is restored by ``reidentify``.

    The date is shifted so the model never sees the real one, but the shift
    is reversible: text reproducing the shifted date maps back to the true
    date for the clinician. The discharge-summary date relies on this.
    """
    real_date = "13/02/26"
    outcome = NoteGuard().deidentify("Admission date 13/02/26.")
    shifted = outcome.forward[real_date]
    # The model is shown a different (shifted) date, never the real one.
    assert shifted != "13/02/26"
    assert "13/02/26" not in outcome.clean_text
    # Model reproduces the shifted date verbatim -> reidentify restores it.
    restoring_guard = NoteGuard(reverse=outcome.reverse)
    restored = restoring_guard.reidentify(f"Admitted on {shifted}.")
    assert "13/02/26" in restored
def test_redact_unresolved_strips_stray_date_placeholder():
    """A stray placeholder such as ``[DATE_X]`` is redacted and reported.

    The label-plus-non-digit form must never reach the clinician verbatim.
    """
    guard = NoteGuard()
    redacted, leaked_tokens = guard.redact_unresolved("Admitted [DATE_X] after chest pain.")
    assert "[DATE_X]" not in redacted
    assert redacted == "Admitted [redacted] after chest pain."
    assert leaked_tokens == ["[DATE_X]"]
def test_redact_unresolved_strips_unrestored_surrogate():
    """An unrestored ``[LABEL_n]`` surrogate is redacted and reported too."""
    guard = NoteGuard()
    redacted, leaked_tokens = guard.redact_unresolved("Seen by [PERSON_9].")
    assert redacted == "Seen by [redacted]."
    assert leaked_tokens == ["[PERSON_9]"]
def test_redact_unresolved_leaves_clean_text_untouched():
    """Text without surrogate-shaped tokens is returned as-is, nothing leaked."""
    plain_text = "Admitted after chest pain. Stable."
    guard = NoteGuard()
    redacted, leaked_tokens = guard.redact_unresolved(plain_text)
    assert redacted == "Admitted after chest pain. Stable."
    assert leaked_tokens == []
def test_ner_clinical_stopwords_not_redacted(monkeypatch):
    """A clinical abbreviation mislabelled by NER survives; a real name does not.

    The fake detector reports both 'Subcut' and a genuine name in one pass.
    """
    import src.deid as deid

    class _StubDetector(deid._Detector):
        """Always reports one clinical term and one real name as persons."""

        def detect_persons(self, text: str) -> list[str]:
            detected = ["Subcut", "Afua Asare"]
            return detected

    monkeypatch.setattr(deid, "_DETECTOR", _StubDetector())
    guard = deid.NoteGuard(known={"PERSON": [], "NHS": []})
    cleaned = guard.deidentify("Subcut emph noted on palpation. Reviewed by Afua Asare.").clean_text
    # Clinical term, not a name: it must be kept.
    assert "Subcut" in cleaned
    assert "Afua Asare" not in cleaned
    assert "[PERSON_" in cleaned