"""Pure-Python rule recognisers — no spaCy / Presidio dependency.
These give NoteGuard a transparent, auditable baseline that runs anywhere, and
let the evaluation harness work even before the (heavier) NER engine is wired up.
The NHS-number recogniser validates the mod-11 check digit so random 10-digit
strings (dose volumes, IDs) aren't flagged as patient identifiers.
The NHS staff / organisation rules below (GMC & NMC clinician IDs, ODS org codes,
record UUIDs) were folded in from the Presidio branch so the rule layer also
covers people who aren't patients — Caldicott/DPA apply to anyone identifiable.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from .data import DATE, LOCATION, PERSON, UK_NHS # noqa: F401 (re-exported types)
# Entity-type labels attached to the spans produced by the rule layer.
EMAIL = "EMAIL_ADDRESS"
PHONE = "PHONE_NUMBER"
POSTCODE = "UK_POSTCODE"
# General Medical Council number (UK doctors).
GMC = "GMC"
# Nursing & Midwifery Council PIN.
NMC = "NMC"
# NHS Organisation Data Service codes (GP practices, trusts).
NHS_ODS = "NHS_ODS"
# Record/document UUIDs that act as quasi-identifiers.
RECORD_ID = "RECORD_ID"
# National Insurance Number.
UK_NINO = "UK_NINO"
# Current-format UK vehicle registration plate.
UK_VEHICLE_REGISTRATION = "UK_VEHICLE_REGISTRATION"
@dataclass(frozen=True)
class Span:
    """One detected entity: a character range of the scanned text and its label.

    The dataclass is frozen, so instances are immutable, hashable, and
    compared by value.
    """

    # Offset of the first matched character in the scanned text.
    start: int
    # Offset one past the last matched character (regex ``Match.end`` style).
    end: int
    # Entity label, e.g. one of the module-level label constants.
    entity_type: str
    # The matched substring itself.
    text: str
    # Detection confidence; rule matches default to full confidence.
    score: float = 1.0
    # True for detections below the auto-confirm threshold.
    needs_review: bool = False
def nhs_number_is_valid(digits: str) -> bool:
    """Check a candidate NHS number against its Modulus 11 check digit.

    Every non-digit character (spaces, commas, hyphens, ...) is discarded
    first, so grouped forms such as ``"943 476 5919"`` are accepted.

    Args:
        digits: Candidate number, with or without separators.

    Returns:
        True when exactly ten digits remain and the tenth equals the check
        digit computed from the first nine; False otherwise.
    """
    cleaned = re.sub(r"\D", "", digits)
    if len(cleaned) != 10:
        return False
    *body, supplied = map(int, cleaned)
    # Weights run 10, 9, ..., 2 across the first nine digits.
    weighted = sum(value * weight for value, weight in zip(body, range(10, 1, -1)))
    # 11 - remainder, with a result of 11 folded to 0. A result of 10 cannot
    # be written as one digit, so it never equals `supplied` and such numbers
    # are always rejected.
    expected = (11 - weighted % 11) % 11
    return expected == supplied
# Bare NHS-number candidate: ten digits written 3-3-4, each group optionally
# followed by one space, comma, or hyphen (the dataset writes numbers such as
# 272,733,208 with separators). The pattern only finds candidates; the mod-11
# check digit is verified separately before a match is reported.
_NHS_RE = re.compile(r"\b\d{3}[ ,\-]?\d{3}[ ,\-]?\d{4}\b")

# Label-anchored fallback: "NHS", an optional "Number" / "No" / "No." / "#",
# an optional ":" or "-", then the number itself as group 1. It exists because
# the synthetic dataset uses 9-digit NHS numbers with no valid checksum, which
# neither the checksum rule nor Presidio's UK_NHS recogniser would catch.
# The last digit group is \d{2,4}, so group 1 holds 8 to 10 digits in total.
_NHS_CTX_RE = re.compile(
    r"NHS\s*(?:Number|No\.?|#)?\s*[:\-]?\s*(\d{3}[ ,\-]?\d{3}[ ,\-]?\d{2,4})",
    flags=re.IGNORECASE,
)
# E-mail address: local part of word characters, ".", "+", "-"; then "@",
# a domain label, a dot, and the remainder of the domain.
_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")

# UK phone number: a "+44" / "44" / "0" prefix followed by 9-10 digits, with at
# most one whitespace character between neighbouring digits.
#  * "+44" is matched explicitly rather than as `\b\+?44`: there is no word
#    boundary between a space (or start of text) and "+", so that form left
#    the "+" outside the reported span.
#  * The digit run is written `\d(?:\s?\d){8,9}` so the match always ends on a
#    digit and never absorbs the whitespace that follows the number.
_PHONE_RE = re.compile(r"(?:(?:\+|\b)44\s?|\b0)\d(?:\s?\d){8,9}\b")

# UK postcode (simplified but standard), e.g. SW1A 1AA, M1 1AE. Matched
# case-insensitively, with any amount of whitespace between the two halves.
_POSTCODE_RE = re.compile(r"\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b", re.IGNORECASE)
# Only dates of birth are treated as PII. Visit / encounter / admission dates
# are clinically useful and not identifiers on their own, so a date is matched
# only when a birth label (DOB, D.O.B., "date of birth", "born", "born on")
# comes directly before it. Group 1 is the date without its label, in one of
# three shapes: 12/03/1981 or 12-03-81, ISO 1981-03-12, or "12 Mar 1981"
# (month names are matched by their first three letters, any case).
_DOB_RE = re.compile(
    r"(?i)\b(?:DOB|D\.O\.B\.?|date\s+of\s+birth|born(?:\s+on)?)[\s:]*"
    r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}"
    r"|\d{4}-\d{2}-\d{2}"
    r"|\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{2,4})"
)
# --- NHS staff / organisation identifiers (context-anchored to avoid noise) ---
# GMC number: "GMC", an optional "no" / "no." / "number" / "#", optional
# ":" / "#" / whitespace, then exactly seven digits (group 1). The trailing
# dot on "no." is accepted so "GMC No. 1234567" is recognised, in line with
# the "No." form accepted for NHS-number labels.
_GMC_RE = re.compile(r"(?i)\bGMC(?:\s*(?:no\.?|number|#))?[:\s#]*(\d{7})\b")
# NMC PIN with its label: two digits, a letter, four digits, a letter (group 1).
_NMC_RE = re.compile(r"(?i)\bNMC(?:\s*pin)?[:\s#]*(\d{2}[A-Z]\d{4}[A-Z])\b")
# The same PIN shape without a label; specific enough to stand alone.
# Upper-case letters only, since this pattern is case-sensitive.
_NMC_BARE_RE = re.compile(r"\b(\d{2}[A-Z]\d{4}[A-Z])\b")
# ODS organisation code after an "ODS" or "practice code" label: one letter
# followed by five digits (group 1).
_ODS_RE = re.compile(r"(?i)\b(?:ODS|practice\s*code)[:\s]*([A-Z]\d{5})\b")
# Hyphenated 8-4-4-4-12 hexadecimal UUID, any letter case (group 1).
_UUID_RE = re.compile(
    r"\b([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\b", re.IGNORECASE
)
# National Insurance Number: two letters, three pairs of digits, and a final
# suffix letter A-D, with an optional single whitespace between the parts.
# Matched case-insensitively; specific enough to fire without a context label.
_NINO_RE = re.compile(r"\b[A-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-Da-d]\b", re.IGNORECASE)

# Current-format UK number plate: two letters, two digits, an optional space,
# then three letters (e.g. AB12 CDE). Case-sensitive, so upper-case only.
_VEHICLE_RE = re.compile(r"\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b")
# NHS site names such as "X Hospital", "X Infirmary", "X NHS Trust",
# "X Medical Centre" or "X Clinic": one to four capitalised words (letters and
# apostrophes) followed by a known facility suffix. Requiring the capitalised
# lead-in keeps generic lowercase mentions ("went to hospital") from matching.
_SITE_RE = re.compile(
    r"\b(?:[A-Z][A-Za-z']+\s+){1,4}"
    r"(?:Hospital|Infirmary|Trust|Medical\s+Centre|Health\s+Centre|Clinic|Surgery)\b"
)
# Table-driven recognisers, applied in list order. Each row is
# (compiled pattern, entity label, group index): index 0 reports the whole
# match, index 1 reports only the first capture (the value after its label).
_PLAIN = [
    # Contact details and address.
    (_EMAIL_RE, EMAIL, 0),
    (_PHONE_RE, PHONE, 0),
    (_POSTCODE_RE, POSTCODE, 0),
    # Date of birth: only the date itself, not the label in front of it.
    (_DOB_RE, DATE, 1),
    # Staff and organisation identifiers.
    (_GMC_RE, GMC, 1),
    (_NMC_RE, NMC, 1),
    (_NMC_BARE_RE, NMC, 1),
    (_ODS_RE, NHS_ODS, 1),
    # Record identifiers.
    (_UUID_RE, RECORD_ID, 1),
    # Other personal identifiers matched without a context label.
    (_NINO_RE, UK_NINO, 0),
    (_VEHICLE_RE, UK_VEHICLE_REGISTRATION, 0),
]
def find_rule_spans(text: str) -> list[Span]:
    """Run every rule recogniser over ``text`` and return the de-duplicated hits.

    Recognisers run in a fixed order: checksum-validated NHS numbers,
    label-anchored NHS numbers, the ``_PLAIN`` table, then NHS site names.
    The combined hits are passed through ``_dedupe``, which drops any span
    lying fully inside another.

    Args:
        text: The free text to scan.

    Returns:
        The surviving spans, as returned by ``_dedupe``.
    """
    # Bare 10-digit candidates count only when the mod-11 check digit holds.
    found: list[Span] = [
        Span(hit.start(), hit.end(), UK_NHS, hit.group())
        for hit in _NHS_RE.finditer(text)
        if nhs_number_is_valid(hit.group())
    ]
    # Label-anchored NHS numbers (catches the 9-digit synthetic ones); only
    # the number in group 1 is reported, not the "NHS ..." label.
    found.extend(
        Span(hit.start(1), hit.end(1), UK_NHS, hit.group(1))
        for hit in _NHS_CTX_RE.finditer(text)
    )
    # Table-driven recognisers: each row names the group that holds the value.
    for pattern, label, group in _PLAIN:
        found.extend(
            Span(hit.start(group), hit.end(group), label, hit.group(group))
            for hit in pattern.finditer(text)
        )
    # NHS site names are reported as locations.
    found.extend(
        Span(hit.start(), hit.end(), LOCATION, hit.group())
        for hit in _SITE_RE.finditer(text)
    )
    return _dedupe(found)
def _dedupe(spans: list[Span]) -> list[Span]:
    """Drop spans fully contained within another (keep the longer match).

    Spans are ordered by start offset, longest first on ties, so any span that
    could contain the current one has already been visited. Exact duplicates
    count as contained, so only the first of them survives. Spans that merely
    overlap are both kept.

    Args:
        spans: Detected spans in any order; the list is not modified.

    Returns:
        A new list of the surviving spans, ordered by start offset.
    """
    ordered = sorted(spans, key=lambda s: (s.start, -(s.end - s.start)))
    kept: list[Span] = []
    furthest_end = 0  # largest end offset among kept spans; valid once kept is non-empty
    for span in ordered:
        # Every kept span starts at or before this one, so this span is
        # contained in one of them exactly when it ends no later than the
        # furthest end seen so far. This avoids rescanning `kept` per span.
        if kept and span.end <= furthest_end:
            continue
        kept.append(span)
        furthest_end = span.end  # a kept span always extends past the previous maximum
    return kept
if __name__ == "__main__":
    # Smoke test when run as a script. 9434765919 is a documented valid NHS
    # test number; altering its last digit must fail the checksum.
    assert nhs_number_is_valid("943 476 5919"), "valid NHS number rejected"
    assert not nhs_number_is_valid("943 476 5918"), "bad check digit accepted"

    # One sample note exercising several recognisers; print each detected span.
    sample_note = (
        "NHS no 943 476 5919, ring 07700 900123, dob 12/03/1981, SW1A 1AA, "
        "seen by Dr Lee GMC 1234567, nurse NMC 12A3456B."
    )
    detected = find_rule_spans(sample_note)
    for hit in detected:
        print(hit)
|