| """Pure-Python rule recognisers — no spaCy / Presidio dependency. |
| |
| These give NoteGuard a transparent, auditable baseline that runs anywhere, and |
| let the evaluation harness work even before the (heavier) NER engine is wired up. |
| The NHS-number recogniser validates the mod-11 check digit so random 10-digit |
| strings (dose volumes, IDs) aren't flagged as patient identifiers. |
| |
| The NHS staff / organisation rules below (GMC & NMC clinician IDs, ODS org codes, |
| record UUIDs) were folded in from the Presidio branch so the rule layer also |
| covers people who aren't patients — Caldicott/DPA apply to anyone identifiable. |
| """ |
| from __future__ import annotations |
|
|
| import re |
| from dataclasses import dataclass |
|
|
| from .data import DATE, LOCATION, PERSON, UK_NHS |
|
|
# Entity-type labels produced by the rule layer, in addition to the ones
# imported from .data (DATE, LOCATION, PERSON, UK_NHS).

# Contact details.
EMAIL = "EMAIL_ADDRESS"
PHONE = "PHONE_NUMBER"
POSTCODE = "UK_POSTCODE"

# Staff, organisation and record identifiers.
GMC = "GMC"
NMC = "NMC"
NHS_ODS = "NHS_ODS"
RECORD_ID = "RECORD_ID"

# Other UK identifiers.
UK_NINO = "UK_NINO"
UK_VEHICLE_REGISTRATION = "UK_VEHICLE_REGISTRATION"
|
|
|
|
@dataclass(frozen=True)
class Span:
    """One entity occurrence found in a piece of text.

    Instances are immutable (and therefore hashable); two spans compare equal
    when all of their fields are equal.
    """

    # Bounds of the match in the scanned text, as reported by the regex match
    # object (so ``end`` is exclusive).
    start: int
    end: int
    # Entity label, e.g. UK_NHS or "EMAIL_ADDRESS".
    entity_type: str
    # The matched substring itself.
    text: str
    # Defaults to 1.0; nothing in this module sets another value.
    # NOTE(review): presumably a confidence score for other engines -- confirm.
    score: float = 1.0
    # Defaults to False; nothing in this module sets it.
    # NOTE(review): presumably marks spans for manual checking -- confirm.
    needs_review: bool = False
|
|
|
|
def nhs_number_is_valid(digits: str) -> bool:
    """Return True when *digits* is a 10-digit NHS number with a valid check digit.

    Every non-digit character (spaces, hyphens, ...) is discarded before the
    Modulus 11 check, so "943 476 5919" and "9434765919" are treated alike.
    Anything that does not leave exactly ten digits is rejected.
    """
    cleaned = re.sub(r"\D", "", digits)
    if len(cleaned) != 10:
        return False

    *leading, supplied = (int(ch) for ch in cleaned)
    # The first nine digits are weighted 10, 9, ..., 2.
    weighted_sum = sum(
        value * weight for value, weight in zip(leading, range(10, 1, -1))
    )
    expected = 11 - weighted_sum % 11
    if expected == 10:
        # A check value of 10 cannot be written as one digit: never valid.
        return False
    if expected == 11:
        expected = 0
    return expected == supplied
|
|
|
|
| |
| |
# --- NHS numbers -----------------------------------------------------------
# Bare 10-digit number in 3-3-4 grouping with an optional space, comma or
# hyphen between the groups.
_NHS_RE = re.compile(r"\b\d{3}[ ,\-]?\d{3}[ ,\-]?\d{4}\b")

# Number introduced by an "NHS" label ("NHS Number:", "NHS No.", "NHS #", ...).
# The last group accepts 2-4 digits, so numbers that are a digit or two short
# still match after the label. Group 1 is the number itself.
_NHS_CTX_RE = re.compile(
    r"NHS\s*(?:Number|No\.?|#)?\s*[:\-]?\s*(\d{3}[ ,\-]?\d{3}[ ,\-]?\d{2,4})",
    re.IGNORECASE,
)

# --- Contact details -------------------------------------------------------
_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")

# Prefix "+44", "44" or "0", then 10-11 further digits with optional single
# whitespace between them.
#  * "+44" is its own alternative because "\b" cannot sit before a "+" that
#    follows a space or the start of the text, which dropped the "+" from the
#    reported span.
#  * Whitespace is only allowed *between* digits, so a match always ends on a
#    digit instead of swallowing the space before the next word.
_PHONE_RE = re.compile(r"(?:\+44\s?|\b44\s?|\b0)\d(?:\s?\d){8,9}\b")

# Outward part (1-2 letters, digit, optional letter/digit), optional
# whitespace, inward part (digit + 2 letters); case-insensitive.
_POSTCODE_RE = re.compile(r"\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b", re.IGNORECASE)

# --- Dates of birth --------------------------------------------------------
# A label (DOB, D.O.B., "date of birth", "born [on]") followed by a date written
# as d/m/y (with "/" or "-"), yyyy-mm-dd, or "12 Mar 1981". Group 1 is the date.
_DOB_RE = re.compile(
    r"(?i)\b(?:DOB|D\.O\.B\.?|date\s+of\s+birth|born(?:\s+on)?)[\s:]*"
    r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}"
    r"|\d{4}-\d{2}-\d{2}"
    r"|\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{2,4})"
)

# --- Staff / organisation / record identifiers (group 1 is the identifier) --
# "GMC" label (optionally "no", "number" or "#") followed by 7 digits.
_GMC_RE = re.compile(r"(?i)\bGMC(?:\s*(?:no|number|#))?[:\s#]*(\d{7})\b")
# "NMC" label (optionally "pin") followed by 2 digits, letter, 4 digits, letter.
_NMC_RE = re.compile(r"(?i)\bNMC(?:\s*pin)?[:\s#]*(\d{2}[A-Z]\d{4}[A-Z])\b")
# The same shape without a label; upper-case letters only.
_NMC_BARE_RE = re.compile(r"\b(\d{2}[A-Z]\d{4}[A-Z])\b")
# "ODS" or "practice code" label followed by a letter and 5 digits.
_ODS_RE = re.compile(r"(?i)\b(?:ODS|practice\s*code)[:\s]*([A-Z]\d{5})\b")
# 8-4-4-4-12 hexadecimal UUID, any case.
_UUID_RE = re.compile(
    r"\b([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\b", re.IGNORECASE
)

# --- Other UK identifiers --------------------------------------------------
# Two letters, three digit pairs (optional spaces between), suffix letter A-D.
_NINO_RE = re.compile(r"\b[A-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-Da-d]\b", re.IGNORECASE)

# "AB12 CDE" shape: two letters, two digits, optional space, three letters;
# upper-case only.
_VEHICLE_RE = re.compile(r"\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b")

# --- Care sites ------------------------------------------------------------
# One to four capitalised words followed by a site keyword
# (Hospital, Infirmary, Trust, Medical/Health Centre, Clinic, Surgery).
_SITE_RE = re.compile(
    r"\b(?:[A-Z][A-Za-z']+\s+){1,4}"
    r"(?:Hospital|Infirmary|Trust|Medical\s+Centre|Health\s+Centre|Clinic|Surgery)\b"
)
|
|
| |
# Rules that need no extra validation, applied in this order by
# find_rule_spans. Each entry is (compiled pattern, entity label, capture
# group); the group's bounds and text become the span, with 0 meaning the
# whole match.
_PLAIN: list[tuple[re.Pattern[str], str, int]] = [
    # Contact details: the whole match is the entity.
    (_EMAIL_RE, EMAIL, 0),
    (_PHONE_RE, PHONE, 0),
    (_POSTCODE_RE, POSTCODE, 0),
    # Labelled values: group 1 excludes the label.
    (_DOB_RE, DATE, 1),
    (_GMC_RE, GMC, 1),
    (_NMC_RE, NMC, 1),
    (_NMC_BARE_RE, NMC, 1),
    (_ODS_RE, NHS_ODS, 1),
    (_UUID_RE, RECORD_ID, 1),
    # Other identifiers: the whole match is the entity.
    (_NINO_RE, UK_NINO, 0),
    (_VEHICLE_RE, UK_VEHICLE_REGISTRATION, 0),
]
|
|
|
|
def find_rule_spans(text: str) -> list[Span]:
    """Run every rule recogniser over *text* and return the surviving spans.

    Candidates are gathered in a fixed order (bare NHS numbers, labelled NHS
    numbers, the ``_PLAIN`` rules, then site names) and passed through
    ``_dedupe``, which drops any span fully contained in another and returns
    the rest ordered by start offset.
    """
    found: list[Span] = []

    # Bare 10-digit candidates only count when the check digit validates.
    found.extend(
        Span(hit.start(), hit.end(), UK_NHS, hit.group())
        for hit in _NHS_RE.finditer(text)
        if nhs_number_is_valid(hit.group())
    )

    # Numbers that follow an "NHS" label are taken as-is (no check-digit
    # test); group 1 is the number without the label.
    found.extend(
        Span(hit.start(1), hit.end(1), UK_NHS, hit.group(1))
        for hit in _NHS_CTX_RE.finditer(text)
    )

    # Table-driven rules: the listed capture group supplies bounds and text.
    for pattern, label, group in _PLAIN:
        found.extend(
            Span(hit.start(group), hit.end(group), label, hit.group(group))
            for hit in pattern.finditer(text)
        )

    # Hospital / clinic / surgery names.
    found.extend(
        Span(hit.start(), hit.end(), LOCATION, hit.group())
        for hit in _SITE_RE.finditer(text)
    )

    return _dedupe(found)
|
|
|
|
| def _dedupe(spans: list[Span]) -> list[Span]: |
| """Drop spans fully contained within another (keep the longer match).""" |
| spans = sorted(spans, key=lambda s: (s.start, -(s.end - s.start))) |
| kept: list[Span] = [] |
| for s in spans: |
| if any(k.start <= s.start and s.end <= k.end for k in kept): |
| continue |
| kept.append(s) |
| return kept |
|
|
|
|
if __name__ == "__main__":
    # Smoke test when run as a script: check the validator against one good
    # and one bad check digit, then print every span found in a sample note.
    assert nhs_number_is_valid("943 476 5919"), "valid NHS number rejected"
    assert not nhs_number_is_valid("943 476 5918"), "bad check digit accepted"

    sample_note = (
        "NHS no 943 476 5919, ring 07700 900123, dob 12/03/1981, SW1A 1AA, "
        "seen by Dr Lee GMC 1234567, nurse NMC 12A3456B."
    )
    detected = find_rule_spans(sample_note)
    for span in detected:
        print(span)
|
|