# Packaging-ready functions for grc_core_web_sm
#
# This file is included in the wheel via `spacy package --code`.
# It loads the lookup table from vocab.lookups (embedded in the model)
# instead of from a filesystem path.

from __future__ import annotations

import string
import unicodedata
from typing import Dict

from spacy.language import Language
from spacy.tokens import Doc, Token


# ----- Greek accent normalization (inline, no external dependency) ----- #

# Grave (VARIA) → Acute (OXIA) mapping for polytonic Greek
_GRAVE_TO_ACUTE: Dict[str, str] = {
    # Alpha with breathing
    "\u1F02": "\u1F04", "\u1F03": "\u1F05",
    "\u1F0A": "\u1F0C", "\u1F0B": "\u1F0D",
    # Epsilon
    "\u1F12": "\u1F14", "\u1F13": "\u1F15",
    "\u1F1A": "\u1F1C", "\u1F1B": "\u1F1D",
    # Eta
    "\u1F22": "\u1F24", "\u1F23": "\u1F25",
    "\u1F2A": "\u1F2C", "\u1F2B": "\u1F2D",
    # Iota
    "\u1F32": "\u1F34", "\u1F33": "\u1F35",
    "\u1F3A": "\u1F3C", "\u1F3B": "\u1F3D",
    # Omicron
    "\u1F42": "\u1F44", "\u1F43": "\u1F45",
    "\u1F4A": "\u1F4C", "\u1F4B": "\u1F4D",
    # Upsilon
    "\u1F52": "\u1F54", "\u1F53": "\u1F55", "\u1F5B": "\u1F5D",
    # Omega
    "\u1F62": "\u1F64", "\u1F63": "\u1F65",
    "\u1F6A": "\u1F6C", "\u1F6B": "\u1F6D",
    # Simple vowel + varia
    "\u1F70": "\u1F71", "\u1F72": "\u1F73", "\u1F74": "\u1F75",
    "\u1F76": "\u1F77", "\u1F78": "\u1F79", "\u1F7A": "\u1F7B",
    "\u1F7C": "\u1F7D",
    # With iota subscript
    "\u1F82": "\u1F84", "\u1F83": "\u1F85",
    "\u1F8A": "\u1F8C", "\u1F8B": "\u1F8D",
    "\u1F92": "\u1F94", "\u1F93": "\u1F95",
    "\u1F9A": "\u1F9C", "\u1F9B": "\u1F9D",
    "\u1FA2": "\u1FA4", "\u1FA3": "\u1FA5",
    "\u1FAA": "\u1FAC", "\u1FAB": "\u1FAD",
    "\u1FB2": "\u1FB4", "\u1FC2": "\u1FC4", "\u1FF2": "\u1FF4",
    # Capitals
    "\u1FBA": "\u1FBB", "\u1FC8": "\u1FC9", "\u1FCA": "\u1FCB",
    "\u1FDA": "\u1FDB", "\u1FEA": "\u1FEB", "\u1FF8": "\u1FF9",
    "\u1FFA": "\u1FFB",
    # Dialytika + varia
    "\u1FD2": "\u1FD3", "\u1FE2": "\u1FE3",
}

_GRAVE_TO_ACUTE_TABLE = str.maketrans(_GRAVE_TO_ACUTE)


def _normalize_greek(text: str) -> str:
    """Normalize Greek text for lookup: grave→acute, strip macron/breve, NFC."""
    if not text:
        return text
    # Strip macrons/breves (NFD → remove U+0304/U+0306 → NFC)
    decomposed = unicodedata.normalize("NFD", text)
    stripped = decomposed.replace("\u0304", "").replace("\u0306", "")
    result = unicodedata.normalize("NFC", stripped)
    # Grave → acute (precomposed)
    result = result.translate(_GRAVE_TO_ACUTE_TABLE)
    # Combining grave → acute
    if "\u0300" in result:
        result = result.replace("\u0300", "\u0301")
    # Final NFC
    return unicodedata.normalize("NFC", result)


# ----- lookup_lemmatizer (loads from vocab.lookups) ----- #

# Register the custom ``predicted_lemma`` token attribute (default: None) when
# this module is imported. The has_extension guard skips the registration if
# an attribute with that name already exists, e.g. on a repeated import.
if not Token.has_extension("predicted_lemma"):
    Token.set_extension("predicted_lemma", default=None)


@Language.component(name="lookup_lemmatizer")
def lookup_lemmatizer(doc: Doc) -> Doc:
    """Overwrite token lemmas with entries from the embedded lookup table.

    The ``lemma_lookup`` table is taken from ``doc.vocab.lookups``, i.e. from
    the data shipped inside the packaged model (vocab/lookups.bin) rather
    than from a filesystem path. Per the project notes it is a 1.2M-entry
    Ancient Greek dictionary built from CLTK Morpheus, UD treebanks and
    Wiktionary.

    The component is meant to run after ``trainable_lemmatizer``. For every
    token the lemma already present is first copied to
    ``token._.predicted_lemma``. The lemma is then replaced only when the
    normalized token text (grave→acute, macron/breve stripped) is a key of
    the table, or, for a form starting with an uppercase character, when its
    lowercased variant is. Forms without a match keep their current lemma.

    Tokens tagged ``PUNCT`` and tokens whose text occurs in
    ``string.punctuation`` are not looked up. If the vocab has no
    ``lemma_lookup`` table, the doc is returned untouched.
    """
    vocab_lookups = doc.vocab.lookups
    if not vocab_lookups.has_table("lemma_lookup"):
        return doc
    lemma_table = vocab_lookups.get_table("lemma_lookup")

    for tok in doc:
        # Keep the upstream (trainable) prediction available on the token.
        tok._.predicted_lemma = tok.lemma_

        surface = tok.text
        if tok.pos_ == "PUNCT" or surface in string.punctuation:
            continue

        # Exact normalized form first; lowercased form only as a fallback
        # for words that start with an uppercase character.
        key = _normalize_greek(surface)
        candidates = [key]
        if key and key[0].isupper():
            candidates.append(key.lower())

        for candidate in candidates:
            if candidate in lemma_table:
                tok.lemma_ = lemma_table[candidate]
                break

    return doc