noteguard / src /recognisers.py
Chaeyoon
Refactor to Gold-RAP structure: src/ package, tests/, docs/, data/, output/
84981a4
Raw
History Blame Contribute Delete
6.43 kB
"""Pure-Python rule recognisers — no spaCy / Presidio dependency.
These give NoteGuard a transparent, auditable baseline that runs anywhere, and
let the evaluation harness work even before the (heavier) NER engine is wired up.
The NHS-number recogniser validates the mod-11 check digit so random 10-digit
strings (dose volumes, IDs) aren't flagged as patient identifiers.
The NHS staff / organisation rules below (GMC & NMC clinician IDs, ODS org codes,
record UUIDs) were folded in from the Presidio branch so the rule layer also
covers people who aren't patients — Caldicott/DPA apply to anyone identifiable.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from .data import DATE, LOCATION, PERSON, UK_NHS # noqa: F401 (re-exported types)
# Entity-type labels defined by this module; they are stored in ``Span.entity_type``
# alongside the labels imported from ``.data`` (DATE, LOCATION, PERSON, UK_NHS).
EMAIL = "EMAIL_ADDRESS"
PHONE = "PHONE_NUMBER"
POSTCODE = "UK_POSTCODE"
GMC = "GMC"  # General Medical Council number (UK doctors)
NMC = "NMC"  # Nursing & Midwifery Council PIN
NHS_ODS = "NHS_ODS"  # NHS Organisation Data Service codes (GP practices, trusts)
RECORD_ID = "RECORD_ID"  # record/document UUIDs that act as quasi-identifiers
UK_NINO = "UK_NINO"  # National Insurance Number
UK_VEHICLE_REGISTRATION = "UK_VEHICLE_REGISTRATION"  # current-format UK plate
@dataclass(frozen=True)
class Span:
    """One detected identifier: where it sits in the scanned text and what it is.

    Instances are immutable (``frozen=True``). The rule recognisers in this
    module fill ``start``/``end``/``text`` from a regex match (or one of its
    capture groups) and leave ``score`` and ``needs_review`` at their defaults.
    """

    start: int  # offset of the first character, as returned by ``match.start()``
    end: int  # offset just past the last character, as returned by ``match.end()``
    entity_type: str  # label constant such as UK_NHS, EMAIL, GMC, LOCATION
    text: str  # the matched substring itself
    score: float = 1.0  # presumably a detection confidence; rule hits keep the default
    needs_review: bool = False  # True for detections below the auto-confirm threshold
def nhs_number_is_valid(digits: str) -> bool:
    """Check a candidate NHS number against its Modulus 11 check digit.

    Every non-digit character (spaces, commas, hyphens, ...) is discarded
    first, so grouped forms such as ``"943 476 5919"`` are accepted.

    Args:
        digits: Candidate number, with or without separators.

    Returns:
        True when exactly ten digits remain and the tenth equals the check
        digit computed from the first nine; False otherwise.
    """
    cleaned = re.sub(r"\D", "", digits)
    if len(cleaned) != 10:
        return False

    *leading, supplied = (int(ch) for ch in cleaned)
    # The first nine digits are weighted 10, 9, ..., 2.
    weighted = sum(
        value * weight for value, weight in zip(leading, range(10, 1, -1))
    )
    # 11 - remainder, with a result of 11 folded to 0.
    expected = (11 - weighted % 11) % 11
    if expected == 10:
        # A computed check digit of 10 marks the number as never valid.
        return False
    return expected == supplied
# Real NHS numbers are 10 digits with a mod-11 check digit, optionally grouped.
# Dataset writes them with space, comma, or hyphen separators (e.g. 272,733,208).
_NHS_RE = re.compile(r"\b\d{3}[ ,\-]?\d{3}[ ,\-]?\d{4}\b")

# Context-anchored: an "NHS ..." label followed by a grouped number of 8-10
# digits (3 + 3 + 2..4). Needed because this synthetic dataset uses 9-digit NHS
# numbers (no valid checksum), which neither the checksum rule nor Presidio's
# UK_NHS recogniser would catch. Group 1 is the number without its label.
_NHS_CTX_RE = re.compile(
    r"NHS\s*(?:Number|No\.?|#)?\s*[:\-]?\s*(\d{3}[ ,\-]?\d{3}[ ,\-]?\d{2,4})",
    re.IGNORECASE,
)

_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")

# UK phone numbers: a "+44" / "44" / "0" prefix followed by 9-10 digits, each
# optionally separated by one whitespace character.
#  * A leading "+" is matched without a word boundary in front of it: "\b" can
#    never sit between a space and "+", so "\b\+?44" silently dropped the "+"
#    from the span and left it behind in redacted text.
#  * The digit run ends on a digit rather than on an optional whitespace, so the
#    span no longer swallows the space before a following word.
_PHONE_RE = re.compile(r"(?:(?:\+|\b)44\s?|\b0)\d(?:\s?\d){8,9}\b")

# UK postcode (simplified but standard) e.g. SW1A 1AA, M1 1AE
_POSTCODE_RE = re.compile(r"\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b", re.IGNORECASE)

# Dates are PII only in a DATE-OF-BIRTH context. Visit / encounter / admission dates
# are clinically useful and NOT identifiers on their own, so they're left intact.
# Captures the date itself as group 1.
_DOB_RE = re.compile(
    r"(?i)\b(?:DOB|D\.O\.B\.?|date\s+of\s+birth|born(?:\s+on)?)[\s:]*"
    r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}"
    r"|\d{4}-\d{2}-\d{2}"
    r"|\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{2,4})"
)

# --- NHS staff / organisation identifiers (context-anchored to avoid noise) ---
# "No." (with the full stop) is accepted after GMC, matching the NHS label rule above.
_GMC_RE = re.compile(r"(?i)\bGMC(?:\s*(?:no\.?|number|#))?[:\s#]*(\d{7})\b")
_NMC_RE = re.compile(r"(?i)\bNMC(?:\s*pin)?[:\s#]*(\d{2}[A-Z]\d{4}[A-Z])\b")
_NMC_BARE_RE = re.compile(r"\b(\d{2}[A-Z]\d{4}[A-Z])\b")  # specific enough to stand alone
_ODS_RE = re.compile(r"(?i)\b(?:ODS|practice\s*code)[:\s]*([A-Z]\d{5})\b")
_UUID_RE = re.compile(
    r"\b([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\b", re.IGNORECASE
)

# 2 letters, 6 digits in pairs, A-D suffix — specific enough to fire without context anchoring
_NINO_RE = re.compile(r"\b[A-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-Da-d]\b", re.IGNORECASE)

# Current-format UK plate: 2 letters, 2 digits, optional space, 3 letters (e.g. AB12 CDE)
_VEHICLE_RE = re.compile(r"\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b")

# UK NHS site names: "X Hospital / Infirmary / NHS Trust / Medical Centre / Clinic"
# One-to-four leading title-cased words followed by a known NHS facility suffix.
# Context-anchored to title-case to avoid flagging generic lowercase mentions.
_SITE_RE = re.compile(
    r"\b(?:[A-Z][A-Za-z']+\s+){1,4}"
    r"(?:Hospital|Infirmary|Trust|Medical\s+Centre|Health\s+Centre|Clinic|Surgery)\b"
)
# Table of the patterns that find_rule_spans applies in one generic loop.
# (regex, entity_type, capture_group): group 0 = whole match, 1 = inner capture
# Rows are applied top to bottom. A labelled NMC PIN is hit by both _NMC_RE and
# _NMC_BARE_RE with the same offsets; _dedupe keeps only one of the two spans.
_PLAIN = [
    (_EMAIL_RE, EMAIL, 0),
    (_PHONE_RE, PHONE, 0),
    (_POSTCODE_RE, POSTCODE, 0),
    (_DOB_RE, DATE, 1),  # group 1 = the date only, not the "DOB" label
    (_GMC_RE, GMC, 1),
    (_NMC_RE, NMC, 1),
    (_NMC_BARE_RE, NMC, 1),
    (_ODS_RE, NHS_ODS, 1),
    (_UUID_RE, RECORD_ID, 1),
    (_NINO_RE, UK_NINO, 0),
    (_VEHICLE_RE, UK_VEHICLE_REGISTRATION, 0),
]
def find_rule_spans(text: str) -> list[Span]:
    """Run every rule recogniser over *text* and return the de-duplicated hits.

    Candidates are collected in a fixed order: checksum-validated NHS numbers,
    label-anchored NHS numbers, each row of ``_PLAIN``, then NHS site names.
    The combined list is passed through ``_dedupe`` before it is returned.

    Args:
        text: The free text to scan.

    Returns:
        The spans that survive ``_dedupe``.
    """
    # Bare 10-digit candidates only count when the mod-11 check digit agrees.
    found: list[Span] = [
        Span(hit.start(), hit.end(), UK_NHS, hit.group())
        for hit in _NHS_RE.finditer(text)
        if nhs_number_is_valid(hit.group())
    ]

    # context-anchored NHS numbers (catches the 9-digit synthetic ones);
    # only the number (group 1) is recorded, not the "NHS ..." label.
    found.extend(
        Span(hit.start(1), hit.end(1), UK_NHS, hit.group(1))
        for hit in _NHS_CTX_RE.finditer(text)
    )

    # Table-driven patterns: each row names the capture group to record.
    for pattern, label, group in _PLAIN:
        found.extend(
            Span(hit.start(group), hit.end(group), label, hit.group(group))
            for hit in pattern.finditer(text)
        )

    # Title-cased facility names are reported as locations.
    found.extend(
        Span(hit.start(), hit.end(), LOCATION, hit.group())
        for hit in _SITE_RE.finditer(text)
    )

    return _dedupe(found)
def _dedupe(spans: list[Span]) -> list[Span]:
    """Drop spans fully contained within another (keep the longer match).

    Spans are ordered by start offset, longest first when starts tie. Every
    span already kept therefore starts at or before the current one, so the
    current span is contained in a kept span exactly when it ends no later
    than the furthest end kept so far. Tracking that single value replaces a
    rescan of the kept list for every span.

    Args:
        spans: Candidate spans in any order; only ``start`` and ``end`` are read.

    Returns:
        A new list, sorted by start offset, without contained spans. Spans that
        only partially overlap are all kept. Of several spans with identical
        offsets, the one that appeared first in *spans* is kept (the sort is
        stable).
    """
    ordered = sorted(spans, key=lambda s: (s.start, -(s.end - s.start)))
    kept: list[Span] = []
    furthest_end = float("-inf")  # largest ``end`` among the spans kept so far
    for s in ordered:
        if s.end <= furthest_end:
            continue  # lies inside an earlier, at-least-as-long span
        kept.append(s)
        furthest_end = s.end
    return kept
if __name__ == "__main__":
    # NOTE(review): the module uses a relative import (``from .data import ...``),
    # so this demo presumably has to be started with ``python -m`` from the
    # package root rather than as a bare script — confirm against the repo layout.
    # quick check: 9434765919 is a documented valid NHS test number
    checks = (
        ("943 476 5919", True, "valid NHS number rejected"),
        ("943 476 5918", False, "bad check digit accepted"),
    )
    for number, expected, complaint in checks:
        assert nhs_number_is_valid(number) == expected, complaint

    # One sentence exercising several recognisers at once; print each span found.
    sample = "".join(
        (
            "NHS no 943 476 5919, ring 07700 900123, dob 12/03/1981, SW1A 1AA, ",
            "seen by Dr Lee GMC 1234567, nurse NMC 12A3456B.",
        )
    )
    for found in find_rule_spans(sample):
        print(found)