"""Pure-Python rule recognisers — no spaCy / Presidio dependency.

These give NoteGuard a transparent, auditable baseline that runs anywhere, and
let the evaluation harness work even before the (heavier) NER engine is wired up.
The NHS-number recogniser validates the mod-11 check digit so random 10-digit
strings (dose volumes, IDs) aren't flagged as patient identifiers.

The NHS staff / organisation rules below (GMC & NMC clinician IDs, ODS org codes,
record UUIDs) were folded in from the Presidio branch so the rule layer also
covers people who aren't patients — Caldicott/DPA apply to anyone identifiable.
"""
from __future__ import annotations

import re
from dataclasses import dataclass

from .data import DATE, LOCATION, PERSON, UK_NHS  # noqa: F401  (re-exported types)

# Entity-type labels produced by the rule recognisers in this module (stored in
# Span.entity_type). DATE, LOCATION, PERSON and UK_NHS are imported from .data.
EMAIL = "EMAIL_ADDRESS"  # e-mail addresses
PHONE = "PHONE_NUMBER"   # UK phone numbers (+44 / 44 / 0 prefixed)
POSTCODE = "UK_POSTCODE"  # UK postcodes, e.g. SW1A 1AA
GMC = "GMC"              # General Medical Council number (UK doctors)
NMC = "NMC"              # Nursing & Midwifery Council PIN
NHS_ODS = "NHS_ODS"      # NHS Organisation Data Service codes (GP practices, trusts)
RECORD_ID = "RECORD_ID"  # record/document UUIDs that act as quasi-identifiers
UK_NINO = "UK_NINO"                    # National Insurance Number
UK_VEHICLE_REGISTRATION = "UK_VEHICLE_REGISTRATION"  # current-format UK plate


@dataclass(frozen=True)
class Span:
    """One detected identifier: a typed, immutable slice of the scanned text.

    The rule recognisers in this module build spans from regex matches, so
    ``start``/``end`` are the match's character offsets into the scanned text
    and ``text`` is the matched substring.
    """

    start: int  # offset of the first character of the detection
    end: int  # offset just past the last character (same convention as re.Match.end())
    entity_type: str  # label such as UK_NHS, EMAIL, GMC, LOCATION ...
    text: str  # the matched substring itself
    # Rules in this module never override the default; presumably a confidence
    # value supplied by other detectors — confirm against callers.
    score: float = 1.0
    needs_review: bool = False  # True for detections below the auto-confirm threshold


def nhs_number_is_valid(digits: str) -> bool:
    """Return True if *digits* is a 10-digit NHS number with a correct check digit.

    Every non-digit character (spaces, commas, hyphens, ...) is stripped before
    validation, so grouped forms such as ``"943 476 5919"`` are accepted.
    Anything that does not leave exactly ten digits is rejected.

    The check is Modulus 11: the first nine digits are weighted 10 down to 2,
    summed, and the expected check digit is ``11 - (sum % 11)``, where 11 is
    read as 0 and 10 means the number can never be valid.
    """
    cleaned = re.sub(r"\D", "", digits)
    if len(cleaned) != 10:
        return False

    *payload, check_digit = (int(ch) for ch in cleaned)
    weighted_sum = sum(
        value * weight for value, weight in zip(payload, range(10, 1, -1))
    )
    # The outer "% 11" folds an expected value of 11 down to 0.
    expected = (11 - weighted_sum % 11) % 11
    # An expected value of 10 has no single-digit representation: never valid.
    return expected != 10 and expected == check_digit


# Real NHS numbers are 10 digits with a mod-11 check digit, optionally grouped.
# The dataset groups them with space, comma, or hyphen separators (e.g.
# 272,733,208 — itself a 9-digit synthetic number, so it is only picked up by
# the context-anchored pattern below, not by this 3+3+4 pattern).
_NHS_RE = re.compile(r"\b\d{3}[ ,\-]?\d{3}[ ,\-]?\d{4}\b")
# Context-anchored: an "NHS ..." label followed by a grouped number of 3 + 3 +
# 2-4 digits (i.e. 8-10 digits), captured as group 1. Needed because this
# synthetic dataset uses 9-digit NHS numbers (no valid checksum), which neither
# the checksum rule nor Presidio's UK_NHS recogniser would catch.
# NOTE(review): the 8-digit form looks unintended (earlier notes said 9-10) —
# confirm before tightening, since narrowing it would reduce recall.
_NHS_CTX_RE = re.compile(
    r"NHS\s*(?:Number|No\.?|#)?\s*[:\-]?\s*(\d{3}[ ,\-]?\d{3}[ ,\-]?\d{2,4})",
    re.IGNORECASE,
)
_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")
# UK phone number: a "+44", "44" or "0" prefix followed by 9-10 digits, with at
# most one whitespace character between consecutive digits.
#  * "+44" is its own alternative without a leading \b: a word boundary never
#    exists between whitespace and "+", so a pattern starting r"\b\+?44" can only
#    match from the "44" and leaves the "+" outside the span.
#  * The digit run is \d(?:\s?\d){8,9} rather than (?:\d\s?){9,10} so the match
#    always ends on a digit and never swallows the whitespace that follows it.
_PHONE_RE = re.compile(r"(?:\+44\s?|\b44\s?|\b0)\d(?:\s?\d){8,9}\b")
# UK postcode (simplified but standard) e.g. SW1A 1AA, M1 1AE
_POSTCODE_RE = re.compile(r"\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b", re.IGNORECASE)
# Dates are PII only in a DATE-OF-BIRTH context. Visit / encounter / admission dates
# are clinically useful and NOT identifiers on their own, so they're left intact.
# Captures the date itself as group 1 (numeric d/m/y, ISO y-m-d, or "12 Mar 1981").
_DOB_RE = re.compile(
    r"(?i)\b(?:DOB|D\.O\.B\.?|date\s+of\s+birth|born(?:\s+on)?)[\s:]*"
    r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}"
    r"|\d{4}-\d{2}-\d{2}"
    r"|\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{2,4})"
)
# --- NHS staff / organisation identifiers (context-anchored to avoid noise) ---
# Each captures the identifier itself as group 1 so the label is not redacted.
# GMC: 7 digits after a "GMC" label; "no" may carry an abbreviation dot
# ("GMC No. 1234567"), matching what the NHS-number label pattern accepts.
_GMC_RE = re.compile(r"(?i)\bGMC(?:\s*(?:no\.?|number|#))?[:\s#]*(\d{7})\b")
_NMC_RE = re.compile(r"(?i)\bNMC(?:\s*pin)?[:\s#]*(\d{2}[A-Z]\d{4}[A-Z])\b")
_NMC_BARE_RE = re.compile(r"\b(\d{2}[A-Z]\d{4}[A-Z])\b")  # specific enough to stand alone
_ODS_RE = re.compile(r"(?i)\b(?:ODS|practice\s*code)[:\s]*([A-Z]\d{5})\b")
_UUID_RE = re.compile(
    r"\b([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\b", re.IGNORECASE
)
# 2 letters, 6 digits in pairs, A-D suffix — specific enough to fire without context anchoring
_NINO_RE = re.compile(r"\b[A-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-Da-d]\b", re.IGNORECASE)
# Current-format UK plate: 2 letters, 2 digits, optional space, 3 letters (e.g. AB12 CDE)
# Deliberately case-sensitive (upper-case only).
_VEHICLE_RE = re.compile(r"\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b")
# UK NHS site names: "X Hospital / Infirmary / NHS Trust / Medical Centre / Clinic"
# One-to-four leading title-cased words followed by a known NHS facility suffix.
# Context-anchored to title-case to avoid flagging generic lowercase mentions.
_SITE_RE = re.compile(
    r"\b(?:[A-Z][A-Za-z']+\s+){1,4}"
    r"(?:Hospital|Infirmary|Trust|Medical\s+Centre|Health\s+Centre|Clinic|Surgery)\b"
)

# Table driving the generic recognisers in find_rule_spans(); rows are applied
# in the order listed. Each row is (regex, entity_type, capture_group): the
# reported span covers only that group of the match — 0 = whole match,
# 1 = inner capture (used by the label-anchored patterns so the label text
# itself stays outside the span).
_PLAIN = [
    (_EMAIL_RE, EMAIL, 0),
    (_PHONE_RE, PHONE, 0),
    (_POSTCODE_RE, POSTCODE, 0),
    (_DOB_RE, DATE, 1),
    (_GMC_RE, GMC, 1),
    (_NMC_RE, NMC, 1),
    (_NMC_BARE_RE, NMC, 1),
    (_ODS_RE, NHS_ODS, 1),
    (_UUID_RE, RECORD_ID, 1),
    (_NINO_RE, UK_NINO, 0),
    (_VEHICLE_RE, UK_VEHICLE_REGISTRATION, 0),
]


def find_rule_spans(text: str) -> list[Span]:
    """Run every rule recogniser over *text* and return the de-duplicated spans.

    Candidates are gathered in a fixed order — checksum-validated NHS numbers,
    label-anchored NHS numbers, each row of ``_PLAIN``, then site names — and
    handed to ``_dedupe``, which removes spans nested inside another span.
    All spans use the default ``score`` and ``needs_review`` values.
    """
    # Bare 10-digit candidates only count when the mod-11 check digit holds.
    found: list[Span] = [
        Span(hit.start(), hit.end(), UK_NHS, hit.group())
        for hit in _NHS_RE.finditer(text)
        if nhs_number_is_valid(hit.group())
    ]

    # Label-anchored NHS numbers (catches the 9-digit synthetic ones); only the
    # number itself (group 1) is reported, not the "NHS ..." label.
    found.extend(
        Span(hit.start(1), hit.end(1), UK_NHS, hit.group(1))
        for hit in _NHS_CTX_RE.finditer(text)
    )

    # Table-driven recognisers: report whichever group each row designates.
    for pattern, label, group in _PLAIN:
        found.extend(
            Span(hit.start(group), hit.end(group), label, hit.group(group))
            for hit in pattern.finditer(text)
        )

    # Title-cased facility names are reported as locations.
    found.extend(
        Span(hit.start(), hit.end(), LOCATION, hit.group())
        for hit in _SITE_RE.finditer(text)
    )

    return _dedupe(found)


def _dedupe(spans: list[Span]) -> list[Span]:
    """Drop spans fully contained within another (keep the longer match).

    Returns a new list ordered by start offset, with the longer span first
    when starts tie; the input list is not modified. Of several spans with
    identical bounds only the first (in input order) survives. Spans that
    merely overlap, without one containing the other, are all kept.
    """
    ordered = sorted(spans, key=lambda s: (s.start, -(s.end - s.start)))
    kept: list[Span] = []
    # After the sort every kept span starts at or before the current one, so
    # the current span is contained in some kept span exactly when its end does
    # not pass the furthest end kept so far. Tracking that single value replaces
    # a rescan of `kept` for every span.
    furthest_end = None
    for span in ordered:
        if furthest_end is not None and span.end <= furthest_end:
            continue
        kept.append(span)
        furthest_end = span.end  # strictly larger than the previous maximum
    return kept


if __name__ == "__main__":
    # Smoke test, run only when the module is executed as a script.
    # NOTE(review): the relative import at the top of the file presumably means
    # this has to be launched as a module (python -m <package>.<module>) — confirm.
    # 9434765919 is a documented valid NHS test number; flipping its last digit
    # must break the checksum.
    assert nhs_number_is_valid("943 476 5919"), "valid NHS number rejected"
    assert not nhs_number_is_valid("943 476 5918"), "bad check digit accepted"

    # One sample sentence exercising several recognisers; print what they find.
    sample_note = (
        "NHS no 943 476 5919, ring 07700 900123, dob 12/03/1981, SW1A 1AA, "
        "seen by Dr Lee GMC 1234567, nurse NMC 12A3456B."
    )
    detected = find_rule_spans(sample_note)
    for span in detected:
        print(span)