Spaces:

chaeyoona
/

noteguard

Running

noteguard / src /recognisers.py

Chaeyoon

Refactor to Gold-RAP structure: src/ package, tests/, docs/, data/, output/

84981a4 5 days ago

6.43 kB

	"""Pure-Python rule recognisers — no spaCy / Presidio dependency.

	These give NoteGuard a transparent, auditable baseline that runs anywhere, and
	let the evaluation harness work even before the (heavier) NER engine is wired up.
	The NHS-number recogniser validates the mod-11 check digit so random 10-digit
	strings (dose volumes, IDs) aren't flagged as patient identifiers.

	The NHS staff / organisation rules below (GMC & NMC clinician IDs, ODS org codes,
	record UUIDs) were folded in from the Presidio branch so the rule layer also
	covers people who aren't patients — Caldicott/DPA apply to anyone identifiable.
	"""
	from __future__ import annotations

	import re
	from dataclasses import dataclass

	from .data import DATE, LOCATION, PERSON, UK_NHS # noqa: F401 (re-exported types)

	# --- Entity-type labels attached to the spans produced by this module ---

	# Contact details
	EMAIL = "EMAIL_ADDRESS"
	PHONE = "PHONE_NUMBER"
	POSTCODE = "UK_POSTCODE"

	# Staff and organisation identifiers
	GMC = "GMC"  # General Medical Council number (UK doctors)
	NMC = "NMC"  # Nursing & Midwifery Council PIN
	NHS_ODS = "NHS_ODS"  # NHS Organisation Data Service codes (GP practices, trusts)

	# Other identifiers and quasi-identifiers
	RECORD_ID = "RECORD_ID"  # record/document UUIDs that act as quasi-identifiers
	UK_NINO = "UK_NINO"  # National Insurance Number
	UK_VEHICLE_REGISTRATION = "UK_VEHICLE_REGISTRATION"  # current-format UK plate


	@dataclass(frozen=True)
	class Span:
	    """Immutable record of one detected entity within a piece of text.

	    Instances are built from regex matches, so ``start``/``end`` follow the
	    ``re.Match.start()`` / ``re.Match.end()`` convention (``end`` is exclusive).
	    """

	    start: int  # offset of the first matched character
	    end: int  # offset just past the last matched character
	    entity_type: str  # label such as "EMAIL_ADDRESS" or "GMC"
	    text: str  # the matched substring itself
	    score: float = 1.0  # confidence; defaults to 1.0
	    needs_review: bool = False  # True for detections below the auto-confirm threshold


	def nhs_number_is_valid(digits: str) -> bool:
	    """Return True when *digits* passes the NHS-number Modulus 11 check.

	    Every non-digit character (spaces, commas, hyphens, ...) is stripped first;
	    anything that does not leave exactly ten digits is rejected. The first nine
	    digits are weighted 10 down to 2, and the check value derived from their
	    sum must equal the tenth digit.
	    """
	    cleaned = re.sub(r"\D", "", digits)
	    if len(cleaned) != 10:
	        return False

	    *body, check_char = cleaned
	    weighted = sum(int(ch) * weight for ch, weight in zip(body, range(10, 1, -1)))
	    expected = 11 - weighted % 11
	    if expected == 10:
	        # A computed check value of 10 can never be a valid check digit.
	        return False
	    if expected == 11:
	        # A remainder of zero maps to a check digit of 0.
	        expected = 0
	    return expected == int(check_char)


	# Real NHS numbers are 10 digits with a mod-11 check digit, optionally grouped.
	# Dataset writes them with space, comma, or hyphen separators (e.g. 272,733,208).
	_NHS_RE = re.compile(r"\b\d{3}[ ,\-]?\d{3}[ ,\-]?\d{4}\b")
	# Context-anchored: an "NHS ..." label followed by a grouped number whose final
	# group is 2-4 digits (8-10 digits overall). Needed because this synthetic
	# dataset uses 9-digit NHS numbers (no valid checksum), which neither the
	# checksum rule nor Presidio's UK_NHS recogniser would catch.
	# Whitespace around the optional label word and separator is flexible so that
	# "NHS no 943...", "NHS Number: 943..." and "NHS#943..." are all recognised.
	# Captures the number itself as group 1.
	_NHS_CTX_RE = re.compile(
	    r"NHS\s*(?:Number|No\.?|#)?\s*[:\-]?\s*(\d{3}[ ,\-]?\d{3}[ ,\-]?\d{2,4})",
	    re.IGNORECASE,
	)
	_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")
	# UK phone number: "+44"/"44"/"0" prefix followed by 9-10 further digits, with
	# optional single whitespace between digits. The lookbehind (rather than \b)
	# lets a leading "+" be part of the match, and whitespace is only consumed
	# *between* digits so the match never ends in a trailing space.
	_PHONE_RE = re.compile(r"(?<!\w)(?:\+?44\s?|0)\d(?:\s?\d){8,9}\b")
	# UK postcode (simplified but standard) e.g. SW1A 1AA, M1 1AE
	_POSTCODE_RE = re.compile(r"\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b", re.IGNORECASE)
	# Dates are PII only in a DATE-OF-BIRTH context. Visit / encounter / admission dates
	# are clinically useful and NOT identifiers on their own, so they're left intact.
	# Captures the date itself as group 1.
	_DOB_RE = re.compile(
	    r"(?i)\b(?:DOB|D\.O\.B\.?|date\s+of\s+birth|born(?:\s+on)?)[\s:]*"
	    r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}"
	    r"|\d{4}-\d{2}-\d{2}"
	    r"|\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{2,4})"
	)
	# --- NHS staff / organisation identifiers (context-anchored to avoid noise) ---
	# Each captures the identifier as group 1; any run of separator characters
	# (":", "#", whitespace) may sit between the label and the identifier.
	_GMC_RE = re.compile(r"(?i)\bGMC(?:\s*(?:no\.?|number|#))?[:\s#]*(\d{7})\b")
	_NMC_RE = re.compile(r"(?i)\bNMC(?:\s*pin)?[:\s#]*(\d{2}[A-Z]\d{4}[A-Z])\b")
	_NMC_BARE_RE = re.compile(r"\b(\d{2}[A-Z]\d{4}[A-Z])\b")  # specific enough to stand alone
	_ODS_RE = re.compile(r"(?i)\b(?:ODS|practice\s*code)[:\s]*([A-Z]\d{5})\b")
	_UUID_RE = re.compile(
	    r"\b([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\b", re.IGNORECASE
	)
	# 2 letters, 6 digits in pairs, A-D suffix — specific enough to fire without context anchoring
	_NINO_RE = re.compile(r"\b[A-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-Da-d]\b", re.IGNORECASE)
	# Current-format UK plate: 2 letters, 2 digits, optional space, 3 letters (e.g. AB12 CDE)
	_VEHICLE_RE = re.compile(r"\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b")
	# UK NHS site names: "X Hospital / Infirmary / NHS Trust / Medical Centre / Clinic"
	# One-to-four leading title-cased words followed by a known NHS facility suffix.
	# Context-anchored to title-case to avoid flagging generic lowercase mentions.
	_SITE_RE = re.compile(
	    r"\b(?:[A-Z][A-Za-z']+\s+){1,4}"
	    r"(?:Hospital|Infirmary|Trust|Medical\s+Centre|Health\s+Centre|Clinic|Surgery)\b"
	)

	# Table of the "plain" recognisers: each row is
	#   (compiled regex, entity-type label, capture group to report)
	# where group 0 reports the whole match and group 1 reports the inner capture
	# (used by the context-anchored patterns so the label text itself is excluded).
	_PLAIN: list[tuple[re.Pattern[str], str, int]] = [
	    # whole-match patterns
	    (_EMAIL_RE, EMAIL, 0),
	    (_PHONE_RE, PHONE, 0),
	    (_POSTCODE_RE, POSTCODE, 0),
	    # inner-capture patterns
	    (_DOB_RE, DATE, 1),
	    (_GMC_RE, GMC, 1),
	    (_NMC_RE, NMC, 1),
	    (_NMC_BARE_RE, NMC, 1),
	    (_ODS_RE, NHS_ODS, 1),
	    (_UUID_RE, RECORD_ID, 1),
	    # whole-match patterns that need no context anchor
	    (_NINO_RE, UK_NINO, 0),
	    (_VEHICLE_RE, UK_VEHICLE_REGISTRATION, 0),
	]


	def find_rule_spans(text: str) -> list[Span]:
	    """Run every rule recogniser over *text* and return the de-duplicated spans.

	    Candidates are gathered in a fixed order: checksum-valid NHS numbers,
	    label-anchored NHS numbers, each row of ``_PLAIN``, then NHS site names
	    (tagged ``LOCATION``). The combined list is passed through ``_dedupe``.
	    """
	    # Bare 10-digit candidates only count when the mod-11 check digit holds.
	    found: list[Span] = [
	        Span(hit.start(), hit.end(), UK_NHS, hit.group())
	        for hit in _NHS_RE.finditer(text)
	        if nhs_number_is_valid(hit.group())
	    ]

	    # context-anchored NHS numbers (catches the 9-digit synthetic ones)
	    found.extend(
	        Span(hit.start(1), hit.end(1), UK_NHS, hit.group(1))
	        for hit in _NHS_CTX_RE.finditer(text)
	    )

	    # Table-driven recognisers: report whichever capture group the row names.
	    for pattern, label, group_idx in _PLAIN:
	        found.extend(
	            Span(hit.start(group_idx), hit.end(group_idx), label, hit.group(group_idx))
	            for hit in pattern.finditer(text)
	        )

	    found.extend(
	        Span(hit.start(), hit.end(), LOCATION, hit.group())
	        for hit in _SITE_RE.finditer(text)
	    )

	    return _dedupe(found)


	def _dedupe(spans: list[Span]) -> list[Span]:
	    """Drop spans fully contained within another (keep the longer match).

	    Spans are ordered by start offset, longest first on ties; the sort is
	    stable, so among spans with identical bounds the earliest in *spans* wins.
	    Partially overlapping spans are both kept.

	    Returns a new list in that sorted order; *spans* itself is not modified.
	    """
	    ordered = sorted(spans, key=lambda s: (s.start, -(s.end - s.start)))
	    kept: list[Span] = []
	    furthest_end = None
	    for span in ordered:
	        # Every span kept so far starts at or before this one, so this span is
	        # contained in some kept span exactly when it ends no later than the
	        # furthest end seen among them — no need to rescan the kept list.
	        if furthest_end is not None and span.end <= furthest_end:
	            continue
	        kept.append(span)
	        # A kept span always ends beyond every earlier kept span.
	        furthest_end = span.end
	    return kept


	if __name__ == "__main__":
	    # Smoke check when run as a script.
	    # 9434765919 is a documented valid NHS test number; changing its last digit
	    # must make validation fail.
	    assert nhs_number_is_valid("943 476 5919"), "valid NHS number rejected"
	    assert not nhs_number_is_valid("943 476 5918"), "bad check digit accepted"

	    # One sentence exercising several recognisers at once; print what is found.
	    sample = (
	        "NHS no 943 476 5919, ring 07700 900123, dob 12/03/1981, SW1A 1AA, "
	        "seen by Dr Lee GMC 1234567, nurse NMC 12A3456B."
	    )
	    for detected in find_rule_spans(sample):
	        print(detected)