"""Layer 1 — deterministic detectors: regex + validators + entropy.

Each detector returns spans as dicts of this shape, for example:
    {"start": int, "end": int, "type": str, "value": str,
     "source": "regex", "confidence": 0.99}
"source" is "regex" or "entropy"; "confidence" varies by detector.

High precision is the goal: numeric PII (Aadhaar, card) must pass real
checksums (Verhoeff / Luhn) so random 12/16-digit numbers are NOT flagged.
"""
import math
import re
import string
from collections import Counter

# ---------------------------------------------------------------------------
# Verhoeff checksum (Aadhaar) — identical tables to Appendix A
# ---------------------------------------------------------------------------
_D = [
    [0,1,2,3,4,5,6,7,8,9],[1,2,3,4,0,6,7,8,9,5],[2,3,4,0,1,7,8,9,5,6],
    [3,4,0,1,2,8,9,5,6,7],[4,0,1,2,3,9,5,6,7,8],[5,9,8,7,6,0,4,3,2,1],
    [6,5,9,8,7,1,0,4,3,2],[7,6,5,9,8,2,1,0,4,3],[8,7,6,5,9,3,2,1,0,4],
    [9,8,7,6,5,4,3,2,1,0]]
_P = [
    [0,1,2,3,4,5,6,7,8,9],[1,5,7,6,2,8,3,0,9,4],[5,8,0,3,7,9,6,1,4,2],
    [8,9,1,6,0,4,3,5,2,7],[9,4,5,3,1,2,6,8,7,0],[4,2,8,6,5,7,3,9,0,1],
    [2,7,9,3,8,0,6,4,1,5],[7,0,4,6,9,1,3,2,5,8]]
_INV = [0,4,3,2,1,5,6,7,8,9]


def verhoeff_validate(number: str) -> bool:
    """Validate a Verhoeff checksum (last digit is the check digit)."""
    c = 0
    for i, item in enumerate(reversed(number)):
        c = _D[c][_P[i % 8][int(item)]]
    return c == 0


# ---------------------------------------------------------------------------
# Luhn checksum (cards)
# ---------------------------------------------------------------------------
def luhn_validate(card: str) -> bool:
    s = 0
    for i, ch in enumerate(reversed(card)):
        d = int(ch)
        if i % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        s += d
    return s % 10 == 0


# ---------------------------------------------------------------------------
# Shannon entropy (secrets)
# ---------------------------------------------------------------------------
def shannon_entropy(s: str) -> float:
    if not s:
        return 0.0
    probs = [s.count(c) / len(s) for c in set(s)]
    return -sum(p * math.log2(p) for p in probs)


# A small dictionary-word check so plain English words of length >= 20
# (rare, but possible in compound text) don't get flagged as secrets.
_COMMON_WORDS = {
    "responsibility", "internationalization", "characterization",
    "telecommunications", "incomprehensibility", "disproportionately",
}


def _is_dictionary_word(token: str) -> bool:
    return token.lower() in _COMMON_WORDS


# ---------------------------------------------------------------------------
# Regex patterns
# ---------------------------------------------------------------------------
# Email: local part of word chars / '.' / '+' / '-', then '@', one domain
# label, a dot, and the remaining domain characters.
_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")
# Phone: optional "91" / "+91" prefix (optionally followed by one hyphen or
# whitespace char), then 10 digits starting with 6-9. The lookarounds reject
# matches that sit inside a longer run of digits.
_PHONE_RE = re.compile(r"(?<!\d)(?:\+?91[-\s]?)?[6-9]\d{9}(?!\d)")
# Aadhaar candidate: 12 digits as three groups of 4, each pair of groups
# optionally separated by a single whitespace char. Only a candidate:
# _find_aadhaar applies the Verhoeff check before reporting it.
_AADHAAR_RE = re.compile(r"(?<!\d)\d{4}\s?\d{4}\s?\d{4}(?!\d)")
# PAN: 5 uppercase letters, 4 digits, 1 uppercase letter.
_PAN_RE = re.compile(r"\b[A-Z]{5}[0-9]{4}[A-Z]\b")
# IFSC: 4 uppercase letters, a literal "0", then 6 uppercase letters/digits.
_IFSC_RE = re.compile(r"\b[A-Z]{4}0[A-Z0-9]{6}\b")
# Card candidate: 13-19 digits with at most one space or hyphen between
# consecutive digits. Only a candidate: _find_card applies the Luhn check.
_CARD_RE = re.compile(r"(?<!\d)\d(?:[ -]?\d){12,18}(?!\d)")
# IPv4-shaped: four dot-separated groups of 1-3 digits. The numeric range of
# each group is not checked by the pattern.
_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
# Date of birth: DD, MM, YYYY separated by '/' or '-' (each separator is
# matched independently); day 01-31, month 01-12, year 1900-2099.
_DOB_RE = re.compile(r"\b(?:0[1-9]|[12]\d|3[01])[/-](?:0[1-9]|1[0-2])[/-](?:19|20)\d{2}\b")
# UPI candidate: 2-256 word chars / '.' / '-', then '@', then 3-64 ASCII
# letters. _find_upi keeps only matches whose suffix is in _UPI_SUFFIXES.
_UPI_RE = re.compile(r"\b[\w.\-]{2,256}@(?:[a-zA-Z]{3,64})\b")

# Secrets
# AWS access key id: "AKIA" or "ASIA" followed by 16 uppercase letters/digits.
_AWS_KEY_RE = re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b")
# GitHub token: "gh" + one of p/o/u/s/r + "_" + 36-255 ASCII letters/digits.
_GITHUB_TOKEN_RE = re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36,255}\b")
# Slack token: "xox" + one of b/a/p/r/s + "-" + 10-72 letters/digits/hyphens.
_SLACK_TOKEN_RE = re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,72}\b")
# JWT: three dot-separated segments of letters, digits, '_' and '-'; the
# first two segments must start with "eyJ".
_JWT_RE = re.compile(r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b")
# PEM private key block: from a BEGIN ... PRIVATE KEY marker to the first
# following END ... PRIVATE KEY marker (non-greedy). DOTALL lets the body
# span multiple lines.
_PRIVATE_KEY_RE = re.compile(
    r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----",
    re.DOTALL,
)
# Generic API-token-like assignments: api_key="...", token: '...', secret=...
# Case-insensitive keyword, then ':' or '=', then an optionally quoted value.
# Group 1 captures the value, which must be at least 16 characters long.
_GENERIC_TOKEN_RE = re.compile(
    r"(?i)\b(?:api[_-]?key|api[_-]?secret|access[_-]?token|secret[_-]?key|"
    r"auth[_-]?token|password|token)\s*[:=]\s*[\"']?([A-Za-z0-9/_+\-]{16,})[\"']?"
)

# Indian UPI handles (specific bank suffixes, narrower than generic email-ish).
# _find_upi lower-cases the part after '@' before comparing against these.
_UPI_SUFFIXES = (
    "okhdfcbank", "oksbi", "okicici", "okaxis", "ybl", "paytm", "ibl", "axl", "apl",
)


def _find_aadhaar(text):
    spans = []
    for m in _AADHAAR_RE.finditer(text):
        digits = m.group(0).replace(" ", "")
        if len(digits) == 12 and verhoeff_validate(digits):
            spans.append({
                "start": m.start(), "end": m.end(), "type": "AADHAAR",
                "value": m.group(0), "source": "regex", "confidence": 0.99,
            })
    return spans


def _find_card(text):
    spans = []
    for m in _CARD_RE.finditer(text):
        digits = re.sub(r"[ -]", "", m.group(0))
        if len(digits) in (13, 14, 15, 16, 17, 18, 19) and luhn_validate(digits):
            spans.append({
                "start": m.start(), "end": m.end(), "type": "CARD",
                "value": m.group(0), "source": "regex", "confidence": 0.99,
            })
    return spans


def _find_upi(text):
    spans = []
    for m in _UPI_RE.finditer(text):
        handle = m.group(0)
        suffix = handle.rsplit("@", 1)[-1].lower()
        if suffix in _UPI_SUFFIXES:
            spans.append({
                "start": m.start(), "end": m.end(), "type": "UPI",
                "value": handle, "source": "regex", "confidence": 0.97,
            })
    return spans


def _find_entropy_secrets(text, threshold=4.0, min_len=20):
    spans = []
    for m in re.finditer(r"[A-Za-z0-9+/=_\-]{%d,}" % min_len, text):
        token = m.group(0)
        ent = shannon_entropy(token)
        if ent > threshold and not _is_dictionary_word(token) and not token.isdigit():
            spans.append({
                "start": m.start(), "end": m.end(), "type": "SECRET",
                "value": token, "source": "entropy",
                "confidence": round(min(0.5 + (ent - threshold) * 0.15, 0.95), 2),
            })
    return spans


def _find_generic_tokens(text):
    spans = []
    for m in _GENERIC_TOKEN_RE.finditer(text):
        value = m.group(1)
        if len(value) < 8:
            continue
        start, end = m.span(1)
        spans.append({
            "start": start, "end": end, "type": "SECRET",
            "value": value, "source": "regex", "confidence": 0.9,
        })
    return spans


def _simple_finditer(pattern, type_name, text, confidence=0.99, source="regex"):
    return [
        {
            "start": m.start(), "end": m.end(), "type": type_name,
            "value": m.group(0), "source": source, "confidence": confidence,
        }
        for m in pattern.finditer(text)
    ]


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def detect_pii(text: str) -> list[dict]:
    """Detect structured + context-free PII (Layer 1, PII categories)."""
    spans = []
    spans += _simple_finditer(_EMAIL_RE, "EMAIL", text)
    spans += _find_aadhaar(text)
    spans += _simple_finditer(_PAN_RE, "PAN", text)
    spans += _simple_finditer(_IFSC_RE, "IFSC", text)
    spans += _find_card(text)
    spans += _simple_finditer(_IP_RE, "IP", text, confidence=0.95)
    spans += _simple_finditer(_DOB_RE, "DOB", text, confidence=0.9)
    spans += _find_upi(text)
    # phone last: avoid double-matching digits already consumed by aadhaar/card
    spans += _simple_finditer(_PHONE_RE, "PHONE", text, confidence=0.95)
    return spans


def detect_secrets(text: str) -> list[dict]:
    """Detect credentials / secrets (Layer 1, secret categories)."""
    spans = []
    spans += _simple_finditer(_AWS_KEY_RE, "AWS_KEY", text)
    spans += _simple_finditer(_GITHUB_TOKEN_RE, "GITHUB_TOKEN", text)
    spans += _simple_finditer(_SLACK_TOKEN_RE, "SLACK_TOKEN", text)
    spans += _simple_finditer(_JWT_RE, "JWT", text)
    spans += _simple_finditer(_PRIVATE_KEY_RE, "PRIVATE_KEY", text, confidence=0.99)
    spans += _find_generic_tokens(text)
    spans += _find_entropy_secrets(text)
    return spans


def detect_all(text: str, pii: bool = True, secrets: bool = True) -> list[dict]:
    """Run all enabled detector groups and return raw (possibly overlapping) spans."""
    spans = []
    if pii:
        spans += detect_pii(text)
    if secrets:
        spans += detect_secrets(text)
    return spans


# Alias matching the integration doc's naming (`detectors.detect(text)`).
detect = detect_all