#!/usr/bin/env python3
"""Irish-PII span detection and repair.

Combines model token-classification scores with format scanners (PPSN,
Eircode, IBAN, sort codes, cards, phones, passports, BICs) to produce and
reconcile labelled character spans over free text.
"""
import re

import torch

from eircode import iter_eircode_candidates, is_valid_eircode
from irish_core_generated_scanner_spec import SCANNER_SPEC
from ppsn import is_plausible_ppsn, iter_ppsn_candidates
from raw_word_aligned import word_aligned_ppsn_spans

# Word-level tokenizer: alphanumeric runs, or single punctuation characters.
TOKEN_RE = re.compile(r"[A-Za-z0-9]+|[^\w\s]", re.UNICODE)

# Characters trimmed from the tail of scanner candidates (whitespace, NBSP, '-').
TRAILING_TRIM_CHARS = set(" \t\r\n\u00A0-")

# Bank codes accepted for Irish IBANs even when the mod-97 check fails.
KNOWN_IE_IBAN_BANK_CODES = {
    "AIBK",
    "BOFI",
    "IPBS",
    "IRCE",
    "ULSB",
    "PTSB",
    "EBSI",
    "DABA",
    "CITI",
    "TRWI",
    "REVO",
}

# Per-label minimum model score used by the repair passes.
DEFAULT_LABEL_THRESHOLDS = {
    "PHONE_NUMBER": 0.35,
    "PASSPORT_NUMBER": 0.11,
    "BANK_ROUTING_NUMBER": 0.35,
    "ACCOUNT_NUMBER": 0.40,
    "CREDIT_DEBIT_CARD": 0.08,
    "SWIFT_BIC": 0.50,
}

# Labels whose surface text must pass a format validator.
FORMAT_LABELS = set(DEFAULT_LABEL_THRESHOLDS)

# Tie-break priority when overlapping spans are deduplicated (lower wins).
OUTPUT_PRIORITY = {
    "PPSN": 0,
    "PASSPORT_NUMBER": 1,
    "ACCOUNT_NUMBER": 2,
    "BANK_ROUTING_NUMBER": 3,
    "CREDIT_DEBIT_CARD": 4,
    "PHONE_NUMBER": 5,
    "SWIFT_BIC": 6,
    "POSTCODE": 7,
    "EMAIL": 8,
    "FIRST_NAME": 9,
    "LAST_NAME": 10,
}


def tokenize_with_spans(text: str):
    """Return (word, start, end) triples for every TOKEN_RE match in *text*."""
    return [(m.group(0), m.start(), m.end()) for m in TOKEN_RE.finditer(text)]


def is_ascii_digit(ch: str) -> bool:
    """True for ASCII '0'..'9' only (unlike str.isdigit)."""
    return "0" <= ch <= "9"


def is_ascii_letter(ch: str) -> bool:
    """True for ASCII 'A'..'Z' / 'a'..'z' only."""
    upper = ch.upper()
    return "A" <= upper <= "Z"


def is_ascii_alnum(ch: str) -> bool:
    """True for an ASCII letter or digit."""
    return is_ascii_digit(ch) or is_ascii_letter(ch)


def is_word_boundary(text: str, index: int) -> bool:
    """True when *index* is out of range or holds a non-alphanumeric char."""
    if index < 0 or index >= len(text):
        return True
    return not text[index].isalnum()


def normalize_compact(value: str, uppercase: bool = True) -> str:
    """Strip non-alphanumerics from *value*; optionally uppercase the rest."""
    chars = []
    for ch in value.strip():
        if ch.isalnum():
            chars.append(ch.upper() if uppercase else ch)
    return "".join(chars)


def normalize_label(label: str) -> str:
    """Uppercase *label* after removing a BIO 'B-'/'I-' prefix, if any."""
    label = (label or "").strip()
    if label.startswith("B-") or label.startswith("I-"):
        label = label[2:]
    return label.upper()


def luhn_ok(value: str) -> bool:
    """Luhn checksum over the digits of *value*; requires 13-19 digits."""
    digits = "".join(ch for ch in value if ch.isdigit())
    if not (13 <= len(digits) <= 19):
        return False
    total = 0
    double = False
    for ch in reversed(digits):
        number = int(ch)
        if double:
            number *= 2
            if number > 9:
                number -= 9
        total += number
        double = not double
    return total % 10 == 0


def iban_mod97_ok(value: str) -> bool:
    """Strict IE IBAN check: IE + 2 check digits + 4 letters + 14 digits, mod-97 == 1."""
    compact = normalize_compact(value)
    if len(compact) != 22 or not compact.startswith("IE"):
        return False
    if not compact[2:4].isdigit():
        return False
    if not all(is_ascii_letter(ch) for ch in compact[4:8]):
        return False
    if not compact[8:].isdigit():
        return False
    # ISO 13616: move the first four chars to the end, map letters to 10..35,
    # and take the whole number modulo 97 (computed digit-by-digit).
    rearranged = compact[4:] + compact[:4]
    remainder = 0
    for ch in rearranged:
        if ch.isdigit():
            digits = ch
        else:
            digits = str(ord(ch) - ord("A") + 10)
        for digit in digits:
            remainder = (remainder * 10 + int(digit)) % 97
    return remainder == 1


def is_plausible_ie_iban(value: str) -> bool:
    """Lenient IE IBAN check: mod-97 pass, or a known Irish bank code."""
    compact = normalize_compact(value)
    if len(compact) != 22 or not compact.startswith("IE"):
        return False
    if not compact[2:4].isdigit():
        return False
    if not all(is_ascii_letter(ch) for ch in compact[4:8]):
        return False
    if not compact[8:].isdigit():
        return False
    if iban_mod97_ok(compact):
        return True
    return compact[4:8] in KNOWN_IE_IBAN_BANK_CODES


def normalize_irish_phone(value: str) -> str:
    """Drop separators and '(0)', and rewrite a 00353 prefix as +353."""
    compact = value.strip()
    compact = compact.replace("(0)", "0")
    chars = []
    for ch in compact:
        if ch in " -()":
            continue
        chars.append(ch)
    compact = "".join(chars)
    if compact.startswith("00353"):
        compact = "+" + compact[2:]
    return compact


def is_valid_irish_phone(value: str) -> bool:
    """Validate an Irish phone number in +353 / 00353 / 0-prefixed form."""
    compact = normalize_irish_phone(value)
    if compact.startswith("+353"):
        rest = compact[4:]
        # Tolerate a stray trunk '0' after the country code.
        if rest.startswith("0"):
            rest = rest[1:]
        if not rest.isdigit():
            return False
        if rest.startswith("8"):
            return len(rest) == 9  # mobiles (08x) have a fixed length
        return len(rest) in {8, 9}
    if not compact.startswith("0") or not compact.isdigit():
        return False
    if compact.startswith("08"):
        return len(compact) == 10
    return len(compact) in {9, 10}


def is_plausible_card(value: str) -> bool:
    """Card plausibility: Luhn pass, or digit groups in a known card layout."""
    digits = "".join(ch for ch in value if ch.isdigit())
    if not (13 <= len(digits) <= 19):
        return False
    if luhn_ok(value):
        return True
    stripped = value.strip()
    if not stripped:
        return False
    # Fall back to separator-grouped shapes (e.g. 4-4-4-4, Amex 4-6-5).
    groups = []
    current = []
    saw_sep = False
    for ch in stripped:
        if ch.isdigit():
            current.append(ch)
            continue
        if ch not in {" ", "-"}:
            return False
        saw_sep = True
        if not current:
            return False  # leading or doubled separator
        groups.append("".join(current))
        current = []
    if current:
        groups.append("".join(current))
    if not saw_sep:
        return False
    lengths = [len(group) for group in groups]
    return lengths in ([4, 4, 4, 4], [4, 4, 4, 4, 3], [4, 6, 5])


def normalize_passport(value: str) -> str:
    """Uppercase *value* with all whitespace removed."""
    chars = []
    for ch in value.strip():
        if ch.isspace():
            continue
        chars.append(ch.upper())
    return "".join(chars)


def is_valid_passport(value: str) -> bool:
    """Passport format: exactly 2 ASCII letters followed by 7 digits."""
    compact = normalize_passport(value)
    return len(compact) == 9 and all(is_ascii_letter(ch) for ch in compact[:2]) and compact[2:].isdigit()


def is_valid_sort_code(value: str) -> bool:
    """Sort code: 6 digits, or three 2-digit groups split by space/hyphen."""
    stripped = value.strip()
    if not stripped:
        return False
    if stripped.isdigit():
        return len(stripped) == 6
    groups = []
    current = []
    for ch in stripped:
        if ch.isdigit():
            current.append(ch)
            continue
        if ch not in {" ", "-"}:
            return False
        if not current:
            return False
        groups.append("".join(current))
        current = []
    if current:
        groups.append("".join(current))
    return len(groups) == 3 and all(len(group) == 2 and group.isdigit() for group in groups)


def is_valid_bic(value: str) -> bool:
    """BIC format: 8 or 11 chars, first 6 letters, remainder alphanumeric."""
    compact = normalize_compact(value)
    if len(compact) not in {8, 11}:
        return False
    if not all(is_ascii_letter(ch) for ch in compact[:6]):
        return False
    return all(is_ascii_alnum(ch) for ch in compact[6:])


def scan_candidates(
    text: str,
    *,
    start_ok,
    allowed_chars: set[str],
    min_len: int,
    max_len: int,
    validator,
):
    """Yield validator-approved spans found by a greedy character scan.

    From each boundary-aligned start char accepted by *start_ok*, extend a
    run of *allowed_chars* up to *max_len*, then shrink from the right
    (trimming TRAILING_TRIM_CHARS) until *validator* accepts a candidate of
    at least *min_len*. Yields dicts with start/end/text/normalized.
    """
    i = 0
    n = len(text)
    while i < n:
        ch = text[i]
        if not start_ok(ch) or not is_word_boundary(text, i - 1):
            i += 1
            continue
        run_end = i
        while run_end < n and run_end - i < max_len and text[run_end] in allowed_chars:
            run_end += 1
        best_end = None
        end = run_end
        while end > i:
            while end > i and text[end - 1] in TRAILING_TRIM_CHARS:
                end -= 1
            if end - i < min_len:
                break
            if is_word_boundary(text, end):
                candidate = text[i:end]
                if validator(candidate):
                    best_end = end
                    break
            end -= 1
        if best_end is not None:
            value = text[i:best_end]
            yield {
                "start": i,
                "end": best_end,
                "text": value,
                "normalized": normalize_compact(value, uppercase=False),
            }
            i = best_end  # resume after the accepted span
        else:
            i += 1


def spec_candidates_for_label(text: str, label: str):
    """Yield scanner candidates for *label* as described by SCANNER_SPEC.

    Delegate specs dispatch to the PPSN/Eircode iterators; otherwise the
    spec's start predicate, char class, lengths, and validator are wired
    into scan_candidates().
    """
    label = label.upper()
    spec = SCANNER_SPEC["scanners"].get(label)
    if spec is None:
        return
    if spec["kind"] == "delegate":
        delegate_name = spec["function"]
        if delegate_name == "iter_ppsn_candidates":
            yield from iter_ppsn_candidates(text)
        elif delegate_name == "iter_eircode_candidates":
            yield from iter_eircode_candidates(text)
        return
    start_spec = SCANNER_SPEC["start_predicates"][spec["start_predicate"]]
    validators = {
        "is_valid_irish_phone": is_valid_irish_phone,
        "is_valid_passport": is_valid_passport,
        "is_valid_sort_code": is_valid_sort_code,
        "is_plausible_ie_iban": is_plausible_ie_iban,
        "is_plausible_card": is_plausible_card,
        "is_valid_bic": is_valid_bic,
    }
    if "builtin" in start_spec:
        builtin = start_spec["builtin"]
        if builtin == "ascii_letter":
            start_ok = is_ascii_letter
        elif builtin == "ascii_digit":
            start_ok = is_ascii_digit
        else:
            raise ValueError(f"Unknown builtin start predicate: {builtin}")
    else:
        allowed = set(start_spec["any_of"])
        start_ok = lambda ch, allowed=allowed: ch in allowed
    yield from scan_candidates(
        text,
        start_ok=start_ok,
        allowed_chars=set(SCANNER_SPEC["char_classes"][spec["allowed_chars"]]),
        min_len=int(spec["min_len"]),
        max_len=int(spec["max_len"]),
        validator=validators[spec["validator"]],
    )


def plausible_label_text(label: str, value: str) -> bool:
    """Format gate for *value* under *label*; unknown labels pass through."""
    value = value.strip()
    if label == "PPSN":
        return is_plausible_ppsn(value)
    if label == "PHONE_NUMBER":
        return is_valid_irish_phone(value)
    if label == "PASSPORT_NUMBER":
        return is_valid_passport(value)
    if label == "BANK_ROUTING_NUMBER":
        return is_valid_sort_code(value)
    if label == "ACCOUNT_NUMBER":
        # Either an IE IBAN or a bare 8-digit account number.
        compact = normalize_compact(value)
        return is_plausible_ie_iban(value) or (compact.isdigit() and len(compact) == 8)
    if label == "CREDIT_DEBIT_CARD":
        return is_plausible_card(value)
    if label == "SWIFT_BIC":
        return is_valid_bic(value)
    if label == "POSTCODE":
        return is_valid_eircode(value)
    return True


def label_ids_from_mapping(id2label, label: str):
    """Return all class ids in *id2label* whose normalized name equals *label*."""
    target = label.upper()
    ids = []
    for raw_id, raw_label in id2label.items():
        if normalize_label(str(raw_label)) == target:
            ids.append(int(raw_id))
    return ids


def label_ids(model, label: str):
    """Class ids for *label* taken from the model's config.id2label."""
    return label_ids_from_mapping(model.config.id2label, label)


def word_scores_for_label(text: str, model, tokenizer, label: str):
    """Per-word max softmax probability for *label* (PyTorch path).

    Returns (pieces, scores) where pieces are tokenize_with_spans() triples
    and scores align one-to-one with them. Words lost to truncation score 0.
    """
    pieces = tokenize_with_spans(text)
    if not pieces:
        return pieces, []
    words = [word for word, _, _ in pieces]
    encoded = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True)
    word_ids = encoded.word_ids(batch_index=0)
    device = next(model.parameters()).device
    encoded = {key: value.to(device) for key, value in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits[0]
    probs = torch.softmax(logits, dim=-1)
    ids = label_ids(model, label)
    scores = []
    for word_index in range(len(pieces)):
        score = 0.0
        for token_index, wid in enumerate(word_ids):
            if wid != word_index:
                continue
            for label_id in ids:
                score = max(score, float(probs[token_index, label_id]))
        scores.append(score)
    return pieces, scores


def word_scores_for_label_onnx(text: str, session, tokenizer, config, label: str):
    """Per-word max probability for *label* (ONNX path); mirrors the torch path."""
    from onnx_token_classifier import _run_onnx, _softmax

    pieces = tokenize_with_spans(text)
    if not pieces:
        return pieces, []
    words = [word for word, _, _ in pieces]
    encoded = tokenizer(words, is_split_into_words=True, return_tensors="np", truncation=True)
    word_ids = encoded.word_ids(batch_index=0)
    logits = _run_onnx(session, encoded)[0]
    probs = _softmax(logits, axis=-1)
    ids = label_ids_from_mapping(config.id2label, label)
    scores = []
    for word_index in range(len(pieces)):
        score = 0.0
        for token_index, wid in enumerate(word_ids):
            if wid != word_index:
                continue
            for label_id in ids:
                score = max(score, float(probs[token_index, label_id]))
        scores.append(score)
    return pieces, scores


def _word_aligned_label_spans_from_scores(text: str, label: str, threshold: float, pieces, scores):
    """Merge above-threshold words into spans, then format-filter them.

    Adjacent kept words (gap <= 1 char) merge into one span. For phone /
    sort-code / card labels, '-' and '/' separators extend an active span at
    half threshold. Spans failing plausible_label_text() are dropped.
    """
    spans = []
    active = None
    for (word, start, end), score in zip(pieces, scores):
        keep = score >= threshold
        if label in {"PHONE_NUMBER", "BANK_ROUTING_NUMBER", "CREDIT_DEBIT_CARD"} and word in {"-", "/"}:
            keep = active is not None and score >= threshold / 2.0
        if keep:
            if active is None:
                active = {"start": start, "end": end, "label": label}
            else:
                if start - active["end"] <= 1:
                    active["end"] = end
                else:
                    spans.append(active)
                    active = {"start": start, "end": end, "label": label}
        elif active is not None:
            spans.append(active)
            active = None
    if active is not None:
        spans.append(active)
    out = []
    for span in spans:
        value = text[span["start"] : span["end"]]
        if plausible_label_text(label, value):
            out.append(
                {
                    "label": label,
                    "start": span["start"],
                    "end": span["end"],
                    "text": value,
                    "source": "word_aligned",
                }
            )
    return out


def word_aligned_label_spans(
    text: str,
    model,
    tokenizer,
    label: str,
    threshold: float,
):
    """Word-aligned spans for *label* using the PyTorch scorer."""
    pieces, scores = word_scores_for_label(text, model, tokenizer, label)
    return _word_aligned_label_spans_from_scores(text, label, threshold, pieces, scores)


def word_aligned_label_spans_onnx(
    text: str,
    session,
    tokenizer,
    config,
    label: str,
    threshold: float,
):
    """Word-aligned spans for *label* using the ONNX scorer."""
    pieces, scores = word_scores_for_label_onnx(text, session, tokenizer, config, label)
    return _word_aligned_label_spans_from_scores(text, label, threshold, pieces, scores)


def scanner_guided_label_spans(text: str, label: str, threshold: float, pieces, scores):
    """Scanner candidates for *label* kept only with model support >= threshold.

    Support is the max word score overlapping the (whitespace-trimmed)
    candidate span; kept spans must also pass the format gate.
    """
    if not pieces:
        return []
    out = []
    for candidate in spec_candidates_for_label(text, label):
        start = int(candidate["start"])
        end = int(candidate["end"])
        while start < end and text[start].isspace():
            start += 1
        while end > start and text[end - 1].isspace():
            end -= 1
        support = 0.0
        for (_, piece_start, piece_end), score in zip(pieces, scores):
            if piece_end <= start or piece_start >= end:
                continue
            support = max(support, float(score))
        value = text[start:end]
        if support >= threshold and plausible_label_text(label, value):
            out.append(
                {
                    "label": label,
                    "start": start,
                    "end": end,
                    "text": value,
                    "score": support,
                    "source": "scanner_guided",
                }
            )
    return out


def pipeline_to_spans(text: str, outputs: list[dict], min_score: float):
    """Convert HF-pipeline style outputs into span dicts, dropping low scores."""
    spans = []
    for output in outputs:
        label = normalize_label(output.get("entity_group") or output.get("entity") or "")
        if not label:
            continue
        score = float(output.get("score", 0.0))
        if score < min_score:
            continue
        spans.append(
            {
                "label": label,
                "start": int(output["start"]),
                "end": int(output["end"]),
                "score": score,
                "text": text[int(output["start"]) : int(output["end"])],
            }
        )
    return spans


def overlaps(a: dict, b: dict) -> bool:
    """True when span dicts *a* and *b* share at least one character."""
    return not (a["end"] <= b["start"] or b["end"] <= a["start"])


def span_length(span: dict) -> int:
    """Character length of a span dict."""
    return int(span["end"]) - int(span["start"])


def normalize_simple_span(span: dict):
    """Normalize a model span; relabel phone->card on card format; gate formats.

    Returns None when the text fails the format check for its (format) label.
    """
    label = normalize_label(span["label"])
    value = span["text"]
    if label == "PHONE_NUMBER" and plausible_label_text("CREDIT_DEBIT_CARD", value):
        label = "CREDIT_DEBIT_CARD"
    if label in FORMAT_LABELS or label == "POSTCODE":
        if not plausible_label_text(label, value):
            return None
    return {
        "label": label,
        "start": int(span["start"]),
        "end": int(span["end"]),
        "score": float(span.get("score", 0.0)),
        "text": value,
        "source": span.get("source", "model"),
    }


def dedupe_and_sort(spans: list[dict]):
    """Sort by start, longer-first, then OUTPUT_PRIORITY; drop overlaps greedily."""
    ordered = sorted(
        spans,
        key=lambda span: (
            int(span["start"]),
            -span_length(span),
            OUTPUT_PRIORITY.get(str(span["label"]).upper(), 99),
        ),
    )
    kept = []
    for span in ordered:
        if any(overlaps(span, other) for other in kept):
            continue
        kept.append(span)
    return kept


def repair_irish_core_spans(
    text: str,
    model,
    tokenizer,
    general_outputs: list[dict],
    other_min_score: float,
    ppsn_min_score: float,
    label_thresholds: dict[str, float] | None = None,
):
    """Reconcile pipeline spans with PPSN + format-label repairs (PyTorch path).

    Seeds spans from *general_outputs* (PPSN excluded) plus word-aligned PPSN
    spans, then lets repair candidates (scanner-guided PPSN, per-label
    word-aligned and scanner-guided spans) replace overlapping spans that are
    shorter, non-scanner, or format-label shorter. Returns deduped, sorted spans.
    """
    thresholds = dict(DEFAULT_LABEL_THRESHOLDS)
    if label_thresholds:
        thresholds.update({key.upper(): value for key, value in label_thresholds.items()})
    spans = []
    for span in pipeline_to_spans(text, general_outputs, min_score=other_min_score):
        normalized = normalize_simple_span(span)
        if normalized is not None and normalized["label"] != "PPSN":
            spans.append(normalized)
    ppsn_spans = word_aligned_ppsn_spans(text, model, tokenizer, threshold=ppsn_min_score)
    for span in ppsn_spans:
        value = text[int(span["start"]) : int(span["end"])]
        if plausible_label_text("PPSN", value):
            spans.append(
                {
                    "label": "PPSN",
                    "start": int(span["start"]),
                    "end": int(span["end"]),
                    "score": float(span.get("score", 0.0)),
                    "text": value,
                    "source": span.get("source", "model"),
                }
            )
    repairs = []
    ppsn_pieces, ppsn_scores = word_scores_for_label(text, model, tokenizer, "PPSN")
    repairs.extend(scanner_guided_label_spans(text, "PPSN", ppsn_min_score, ppsn_pieces, ppsn_scores))
    for label, threshold in thresholds.items():
        pieces, scores = word_scores_for_label(text, model, tokenizer, label)
        repairs.extend(_word_aligned_label_spans_from_scores(text, label, threshold, pieces, scores))
        repairs.extend(scanner_guided_label_spans(text, label, threshold, pieces, scores))
    for candidate in repairs:
        updated = []
        replaced = False
        for span in spans:
            if not overlaps(candidate, span):
                updated.append(span)
                continue
            # Same label, candidate strictly longer: replace.
            if candidate["label"] == span["label"] and span_length(candidate) > span_length(span):
                replaced = True
                continue
            # Same label, scanner-guided candidate beats a non-scanner span.
            if (
                candidate["label"] == span["label"]
                and candidate.get("source") == "scanner_guided"
                and span.get("source") != "scanner_guided"
            ):
                replaced = True
                continue
            # Cross-label replacement allowed between format labels when longer.
            if candidate["label"] in FORMAT_LABELS and span["label"] in FORMAT_LABELS and span_length(candidate) > span_length(span):
                replaced = True
                continue
            updated.append(span)
        spans = updated
        if replaced or not any(overlaps(candidate, span) for span in spans):
            spans.append(candidate)
    return dedupe_and_sort(spans)


def repair_irish_core_spans_onnx(
    text: str,
    session,
    tokenizer,
    config,
    other_min_score: float,
    ppsn_min_score: float,
    label_thresholds: dict[str, float] | None = None,
    general_outputs: list[dict] | None = None,
):
    """ONNX counterpart of repair_irish_core_spans().

    When *general_outputs* is None they are computed with
    simple_aggregate_spans_onnx(). Replacement/merge logic matches the
    PyTorch variant.
    """
    from onnx_token_classifier import simple_aggregate_spans_onnx, word_aligned_ppsn_spans_onnx

    thresholds = dict(DEFAULT_LABEL_THRESHOLDS)
    if label_thresholds:
        thresholds.update({key.upper(): value for key, value in label_thresholds.items()})
    spans = []
    if general_outputs is None:
        general_outputs = simple_aggregate_spans_onnx(
            text,
            session,
            tokenizer,
            config,
            min_score=other_min_score,
        )
    for span in pipeline_to_spans(text, general_outputs, min_score=other_min_score):
        normalized = normalize_simple_span(span)
        if normalized is not None and normalized["label"] != "PPSN":
            spans.append(normalized)
    ppsn_spans = word_aligned_ppsn_spans_onnx(text, session, tokenizer, config, threshold=ppsn_min_score)
    for span in ppsn_spans:
        value = text[int(span["start"]) : int(span["end"])]
        if plausible_label_text("PPSN", value):
            spans.append(
                {
                    "label": "PPSN",
                    "start": int(span["start"]),
                    "end": int(span["end"]),
                    "score": float(span.get("score", 0.0)),
                    "text": value,
                    "source": span.get("source", "model"),
                }
            )
    repairs = []
    ppsn_pieces, ppsn_scores = word_scores_for_label_onnx(text, session, tokenizer, config, "PPSN")
    repairs.extend(scanner_guided_label_spans(text, "PPSN", ppsn_min_score, ppsn_pieces, ppsn_scores))
    for label, threshold in thresholds.items():
        pieces, scores = word_scores_for_label_onnx(text, session, tokenizer, config, label)
        repairs.extend(_word_aligned_label_spans_from_scores(text, label, threshold, pieces, scores))
        repairs.extend(scanner_guided_label_spans(text, label, threshold, pieces, scores))
    for candidate in repairs:
        updated = []
        replaced = False
        for span in spans:
            if not overlaps(candidate, span):
                updated.append(span)
                continue
            if candidate["label"] == span["label"] and span_length(candidate) > span_length(span):
                replaced = True
                continue
            if (
                candidate["label"] == span["label"]
                and candidate.get("source") == "scanner_guided"
                and span.get("source") != "scanner_guided"
            ):
                replaced = True
                continue
            if candidate["label"] in FORMAT_LABELS and span["label"] in FORMAT_LABELS and span_length(candidate) > span_length(span):
                replaced = True
                continue
            updated.append(span)
        spans = updated
        if replaced or not any(overlaps(candidate, span) for span in spans):
            spans.append(candidate)
    return dedupe_and_sort(spans)