Spaces:
Running
Running
| """Unit tests for noteguard.deid. | |
All tests use the standard-library-only de-id core — no external services,
no API keys, no network calls required. Safe to run in CI.
| """ | |
| from __future__ import annotations | |
| import pytest | |
| from src.deid import NoteGuard, load_known_from_csv | |
# Identifier vault handed to the default NoteGuard built by ``_ng()``.
KNOWN = {
    "PERSON": ["Margaret Okafor"],
    "NHS": ["485 777 3456"],
}
def _ng() -> NoteGuard:
    """Build a fresh NoteGuard seeded with the module-level ``KNOWN`` vault."""
    guard = NoteGuard(known=KNOWN)
    return guard
@pytest.fixture(autouse=True)
def _no_ner(monkeypatch):
    """Pin the rule/vault layer for unit tests.

    Presidio NER is an optional recall boost (the ``[nlp]`` extra). Tests must be
    deterministic whether or not it is installed, so default every test to the
    no-op detector; the NER-path test injects its own fake on top.

    Registered as an autouse fixture so the patch is applied to every test in
    this module without each test having to request it; ``monkeypatch`` undoes
    the patch after each test.
    """
    # Imported locally so the patch targets the module object itself rather
    # than names already bound at the top of this file.
    import src.deid as deid

    monkeypatch.setattr(deid, "_DETECTOR", deid._Detector())
# ── deidentify ────────────────────────────────────────────────────────────────
def test_nhs_number_replaced():
    """A vaulted NHS number is swapped for an ``[NHS_…]`` surrogate."""
    cleaned = _ng().deidentify("Patient NHS 485 777 3456 admitted.").clean_text
    assert "485 777 3456" not in cleaned
    assert "[NHS_" in cleaned
def test_person_name_replaced():
    """A vaulted patient name is swapped for a ``[PERSON_…]`` surrogate."""
    cleaned = _ng().deidentify("Pt Margaret Okafor discharged home.").clean_text
    assert "Margaret Okafor" not in cleaned
    assert "[PERSON_" in cleaned
def test_email_replaced():
    """No ``@`` survives de-identification of a note containing an e-mail."""
    guard = _ng()
    outcome = guard.deidentify("Contact a.okafor@nhs.net for follow-up.")
    cleaned = outcome.clean_text
    assert "@" not in cleaned
def test_dob_replaced():
    """The literal date of birth must not appear in the cleaned note."""
    guard = _ng()
    outcome = guard.deidentify("DOB 14/03/1934, admitted post-fall.")
    cleaned = outcome.clean_text
    assert "14/03/1934" not in cleaned
def test_gmc_replaced():
    """A bare ``GMC <digits>`` reference has its number removed."""
    guard = _ng()
    outcome = guard.deidentify("Referring clinician GMC 1234567.")
    cleaned = outcome.clean_text
    assert "1234567" not in cleaned
def test_clean_text_passes_through_unchanged():
    """A note with no identifiers comes back byte-for-byte identical."""
    original = "Patient admitted post-fall. Hx AF, on warfarin. BP 128/74."
    cleaned = _ng().deidentify(original).clean_text
    assert cleaned == original
# ── GMC / NMC connector-word variants ─────────────────────────────────────────
def test_gmc_with_connector_no():
    """``GMC No. <digits>`` is tokenised as a ``[GMC_…]`` surrogate."""
    cleaned = _ng().deidentify("Referring clinician GMC No. 7654321.").clean_text
    assert "7654321" not in cleaned
    assert "[GMC_" in cleaned
def test_gmc_with_connector_number():
    """``GMC number <digits>`` has its number removed."""
    guard = _ng()
    outcome = guard.deidentify("GMC number 7654321 on record.")
    cleaned = outcome.clean_text
    assert "7654321" not in cleaned
def test_nmc_with_connector_number_colon():
    """``NMC number: <pin>`` is tokenised as an ``[NMC_…]`` surrogate."""
    note = "Nurse Chukwuebuka Okafor, NMC number: 18D6896L"
    cleaned = _ng().deidentify(note).clean_text
    assert "18D6896L" not in cleaned
    assert "[NMC_" in cleaned
def test_nmc_with_pin():
    """An NMC-shaped PIN introduced by the word ``PIN`` is removed."""
    guard = _ng()
    outcome = guard.deidentify("Registered nurse PIN 18D6896L.")
    cleaned = outcome.clean_text
    assert "18D6896L" not in cleaned
def test_nmc_bare():
    """``NMC <pin>`` with no connector word has its PIN removed."""
    guard = _ng()
    outcome = guard.deidentify("NMC 18D6896L confirmed.")
    cleaned = outcome.clean_text
    assert "18D6896L" not in cleaned
# ── Clinician name detection (via expanded vault) ─────────────────────────────
def test_clinician_name_via_vault():
    """Clinician names added to the vault are redacted deterministically."""
    vault = {"PERSON": ["Chukwuebuka Okafor", "Margaret Okafor"], "NHS": []}
    guard = NoteGuard(known=vault)
    cleaned = guard.deidentify("Nurse Chukwuebuka Okafor assessed the patient.").clean_text
    assert "Chukwuebuka Okafor" not in cleaned
    assert "[PERSON_" in cleaned
def test_full_clinician_nmc_note():
    """Combined: nurse name in vault + NMC number with connector word."""
    guard = NoteGuard(known={"PERSON": ["Chukwuebuka Okafor"], "NHS": []})
    cleaned = guard.deidentify(
        "Patient assessed at triage by Nurse Chukwuebuka Okafor, NMC number: 18D6896L"
    ).clean_text
    assert "Chukwuebuka Okafor" not in cleaned
    assert "18D6896L" not in cleaned
    # The guard's own leak check must also accept the cleaned note.
    guard.assert_clean(cleaned)
# ── assert_clean ──────────────────────────────────────────────────────────────
def test_assert_clean_passes_on_safe_text():
    """``assert_clean`` accepts identifier-free clinical text without raising."""
    safe_text = "Admitted post-fall. Hx AF. INR 2.4."
    _ng().assert_clean(safe_text)  # must not raise
def test_assert_clean_raises_on_nhs_number():
    """A surviving vaulted NHS number raises ``ValueError`` naming the number."""
    guard = _ng()
    leaky_text = "NHS 485 777 3456 still present."
    with pytest.raises(ValueError, match="485 777 3456"):
        guard.assert_clean(leaky_text)
def test_assert_clean_raises_on_known_name():
    """A surviving vaulted name raises ``ValueError`` naming the person."""
    guard = _ng()
    leaky_text = "Patient Margaret Okafor discharged."
    with pytest.raises(ValueError, match="Margaret Okafor"):
        guard.assert_clean(leaky_text)
def test_assert_clean_raises_on_nmc():
    """An un-redacted NMC PIN makes ``assert_clean`` raise ``ValueError``."""
    guard = _ng()
    leaky_text = "NMC number: 18D6896L not redacted."
    with pytest.raises(ValueError):
        guard.assert_clean(leaky_text)
# ── residual_identifiers (trust metric) ───────────────────────────────────────
def test_residual_identifiers_catches_orphaned_token():
    """A [LABEL_n] token with no reverse mapping is an unmapped-token leak."""
    guard = NoteGuard(known={}, reverse={"[PERSON_1]": "Real Name"})
    # Only PERSON_1 has a reverse mapping; PERSON_2 is orphaned.
    findings = guard.residual_identifiers("Summary for [PERSON_1] and [PERSON_2].")
    assert any("unmapped_token" in finding for finding in findings)
    # PERSON_1 is mapped — should NOT appear as orphaned
    assert not any("PERSON_1" in finding for finding in findings)
def test_residual_identifiers_catches_nmc():
    """A leftover NMC PIN yields at least one truthy residual finding."""
    findings = _ng().residual_identifiers("Nurse PIN 18D6896L still present.")
    assert any(findings)  # something was found
# ── reidentify ────────────────────────────────────────────────────────────────
def test_reidentify_restores_surrogate():
    """De-identify then re-identify on one guard brings the originals back."""
    guard = _ng()
    cleaned = guard.deidentify("Pt Margaret Okafor (NHS 485 777 3456) admitted.").clean_text
    round_tripped = guard.reidentify(cleaned)
    for original in ("Margaret Okafor", "485 777 3456"):
        assert original in round_tripped
def test_reidentify_consistent_surrogates():
    """Same original -> same surrogate across multiple notes.

    Surrogates have the form ``[PERSON_n]`` (underscore). Tokens are extracted
    with a regex rather than ``str.split`` so trailing punctuation (the comma
    after the name) is not part of the compared token.
    """
    import re  # local import keeps this test self-contained

    ng = _ng()
    r1 = ng.deidentify("Note 1: Margaret Okafor, INR normal.")
    r2 = ng.deidentify("Note 2: Margaret Okafor, discharged.")
    person_token = re.compile(r"\[PERSON_\w+\]")
    tokens_1 = set(person_token.findall(r1.clean_text))
    tokens_2 = set(person_token.findall(r2.clean_text))
    # Guard against a vacuous pass: two empty sets would compare equal.
    assert tokens_1, "expected a [PERSON_n] surrogate in the first note"
    assert tokens_1 == tokens_2
# ── load_known_from_csv ───────────────────────────────────────────────────────
def test_load_known_from_csv(tmp_path):
    """Names and NHS numbers from a patients CSV land in the vault."""
    patients_csv = tmp_path / "patients.csv"
    patients_csv.write_text("full_name,nhs_number\nJane Smith,123 456 7890\n")
    vault = load_known_from_csv(str(patients_csv))
    assert "Jane Smith" in vault["PERSON"]
    assert "123 456 7890" in vault["NHS"]
def test_load_known_from_csv_admissions(tmp_path):
    """Names in admissions.csv clinician columns are added to the vault."""
    patients_csv = tmp_path / "patients.csv"
    admissions_csv = tmp_path / "admissions.csv"
    patients_csv.write_text("full_name,nhs_number\nJane Smith,123 456 7890\n")
    admissions_csv.write_text("clinician_name,attending\nDr Wei Wang,Nurse Chukwuebuka Okafor\n")
    vault = load_known_from_csv(str(patients_csv), str(admissions_csv))
    for clinician in ("Dr Wei Wang", "Nurse Chukwuebuka Okafor"):
        assert clinician in vault["PERSON"]
def test_load_known_from_csv_missing_admissions(tmp_path):
    """Missing admissions.csv is silently ignored."""
    patients_csv = tmp_path / "patients.csv"
    patients_csv.write_text("full_name,nhs_number\nJane Smith,123 456 7890\n")
    absent_csv = tmp_path / "missing.csv"  # never created on disk
    vault = load_known_from_csv(str(patients_csv), str(absent_csv))
    assert "Jane Smith" in vault["PERSON"]
# ── scan_pii: vault-independent residual-PII audit for the trust panel ─────────
def test_scan_pii_flags_titled_names_missed_by_vault():
    """The reported failure: free-text clinician names with no vault entry slip
    past de-id, and scan_pii must catch them while ignoring tokenised IDs."""
    guard = NoteGuard(known={"PERSON": [], "NHS": []})  # arbitrary pasted note, no vault
    pasted_note = (
        "Contacted patient's GP, Dr. Ethel Joanne Duffy, to provide an update.\n"
        "Nurse Jasmine Freda Murray\nNMC number: 20F4626L"
    )
    cleaned = guard.deidentify(pasted_note).clean_text
    findings = guard.scan_pii(cleaned)
    joined = " | ".join(finding["text"] for finding in findings)
    assert all(finding["type"] == "name" for finding in findings)
    assert "Ethel Joanne Duffy" in joined
    assert "Jasmine Freda Murray" in joined
    # tokenised id is not PII
    assert "NMC" not in joined and "[NMC_1]" not in joined
def test_scan_pii_clean_when_names_tokenised():
    """When names are in the vault they become surrogates — no residual PII."""
    vault = {"PERSON": ["Ethel Joanne Duffy", "Jasmine Freda Murray"], "NHS": []}
    guard = NoteGuard(known=vault)
    cleaned = guard.deidentify("GP Dr. Ethel Joanne Duffy. Nurse Jasmine Freda Murray.").clean_text
    assert guard.scan_pii(cleaned) == []
def test_scan_pii_ignores_surrogate_tokens_and_role_words():
    """Surrogate tokens and bare role words must not be flagged as names."""
    already_clean = "Consultant: [PERSON_1], seen by Dr [PERSON_2]. Nurse Practitioner reviewed."
    findings = NoteGuard().scan_pii(already_clean)
    assert findings == []
def test_scan_pii_flags_residual_structured_identifier():
    """A structured identifier that slipped through is reported with its type."""
    guard = NoteGuard()
    findings = guard.scan_pii("Contact the team at a.b.smith@nhs.net for queries.")
    email_hits = [
        finding
        for finding in findings
        if finding["type"] == "email" and "a.b.smith@nhs.net" in finding["text"]
    ]
    assert email_hits
# ── NER path: Presidio/spaCy redacts free-text names with no vault entry ───────
def test_deidentify_redacts_ner_detected_names(monkeypatch):
    """When an NLP detector is present, a free-text name with no vault entry is
    still tokenised — the recall boost the [nlp] extra adds in the deployed image."""
    import src.deid as deid

    class _StubDetector(deid._Detector):
        """Stand-in detector that recognises exactly one hard-coded name."""

        def detect_persons(self, text: str) -> list[str]:
            if "Ethel Joanne Duffy" in text:
                return ["Ethel Joanne Duffy"]
            return []

    monkeypatch.setattr(deid, "_DETECTOR", _StubDetector())
    guard = deid.NoteGuard(known={"PERSON": [], "NHS": []})
    cleaned = guard.deidentify("Reviewed by Ethel Joanne Duffy on the ward round.").clean_text
    assert "Ethel Joanne Duffy" not in cleaned
    assert "[PERSON_" in cleaned
    assert guard.scan_pii(cleaned) == []  # nothing left for the audit to flag
def test_shifted_date_round_trips_through_reidentify():
    """A visit date is shifted (so the model never sees the real one), but the
    shift is reversible: reproducing the shifted date restores the true date for
    the clinician. This is the mechanism the discharge-summary date relies on."""
    true_date = "13/02/26"
    outcome = NoteGuard().deidentify("Admission date 13/02/26.")
    shifted = outcome.forward[true_date]
    assert shifted != true_date  # the model sees a different (shifted) date
    assert true_date not in outcome.clean_text
    # Model reproduces the shifted date verbatim — reidentify restores the real one.
    fresh_guard = NoteGuard(reverse=outcome.reverse)
    restored = fresh_guard.reidentify(f"Admitted on {shifted}.")
    assert true_date in restored
def test_redact_unresolved_strips_stray_date_placeholder():
    """A stray template placeholder like [DATE_X] (label + non-digit) is redacted
    and flagged, so it never reaches the clinician verbatim."""
    guard = NoteGuard()
    redacted, flagged = guard.redact_unresolved("Admitted [DATE_X] after chest pain.")
    assert "[DATE_X]" not in redacted
    assert redacted == "Admitted [redacted] after chest pain."
    assert flagged == ["[DATE_X]"]
def test_redact_unresolved_strips_unrestored_surrogate():
    """An unrestored [LABEL_n] surrogate is also caught."""
    redacted, flagged = NoteGuard().redact_unresolved("Seen by [PERSON_9].")
    assert redacted == "Seen by [redacted]."
    assert flagged == ["[PERSON_9]"]
def test_redact_unresolved_leaves_clean_text_untouched():
    """Text with no surrogate-shaped tokens is returned unchanged."""
    redacted, flagged = NoteGuard().redact_unresolved("Admitted after chest pain. Stable.")
    assert redacted == "Admitted after chest pain. Stable."
    assert flagged == []
def test_ner_clinical_stopwords_not_redacted(monkeypatch):
    """Clinical abbreviations the NER layer mislabels (e.g. 'Subcut') are kept,
    while a real name flagged in the same pass is still redacted."""
    import src.deid as deid

    class _StubDetector(deid._Detector):
        """Stand-in detector that always reports one false and one true hit."""

        def detect_persons(self, text: str) -> list[str]:
            detected = ["Subcut", "Afua Asare"]
            return detected

    monkeypatch.setattr(deid, "_DETECTOR", _StubDetector())
    guard = deid.NoteGuard(known={"PERSON": [], "NHS": []})
    cleaned = guard.deidentify("Subcut emph noted on palpation. Reviewed by Afua Asare.").clean_text
    assert "Subcut" in cleaned  # clinical term — not a name
    assert "Afua Asare" not in cleaned
    assert "[PERSON_" in cleaned