"""Hebrew morphology via DictaBERT — a real model from the Dicta lab, replacing
hand-rolled particle/suffix stem heuristics.

`lemmas()` uses `dicta-il/dictabert-lex` (lemmatization): it strips attached
particles and reduces inflections to the lexeme ('האש'->'אש', 'מילים'->'מילה',
'התוכנית'->'תוכנית'), so legality can be defined as *shared lemma* rather than a
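letter-list approximation. Same-root-but-different-lemma cases (תוכנה / תוכנית)
fall under a stricter shoresh rule: `root_sig()` gives a coarse consonantal
signature for them, and residual pairs are handled separately by the LLM
root-judge in probe.py. `pos()` uses `dicta-il/dictabert-morph` for coarse
part-of-speech tags.

The first load downloads each model (this can take minutes); afterwards it is
HF-cached and loads offline.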
"""

from __future__ import annotations

import threading

# Hugging Face model identifiers: lemmatizer and morphological tagger.
_LEX_ID = "dicta-il/dictabert-lex"
_MORPH_ID = "dicta-il/dictabert-morph"

# Single lock used by both lazy loaders below.
_lock = threading.Lock()

# Lazily populated tokenizer/model singletons; they stay None until first use.
_tok = None
_model = None
_mtok = None
_mmodel = None

# POS tags kept as clue words; ADP/PRON/DET/CONJ/ADV/NUM are dropped.
CONTENT_POS = {"NOUN", "PROPN", "ADJ", "VERB"}

# DictaBERT.predict holds the whole list in memory at once — chunk to bound it.
_BATCH = 512


def _predict(model, tok, words):
    """Run ``model.predict`` over `words` in slices of at most ``_BATCH`` items.

    The per-slice results are concatenated, in order, into one flat list. Slicing
    keeps memory bounded when the vocabulary is large.
    """
    starts = range(0, len(words), _BATCH)
    return [
        prediction
        for start in starts
        for prediction in model.predict(words[start:start + _BATCH], tok)
    ]


def _load():
    """Return the cached ``(tokenizer, model)`` pair for ``_LEX_ID``, loading it on first use.

    The load runs under ``_lock`` and the ``_model is None`` check is repeated inside
    the lock, so concurrent first callers do not load the model twice. The model is
    switched to eval mode before it is published in ``_model``.
    """
    global _tok, _model
    if _model is not None:              # already initialised: skip the lock entirely
        return _tok, _model
    with _lock:
        if _model is None:              # re-check: another thread may have finished loading
            # Imported lazily so importing this module does not pull in transformers.
            from transformers import AutoModel, AutoTokenizer

            _tok = AutoTokenizer.from_pretrained(_LEX_ID)
            # trust_remote_code=True: the model repository ships its own model class.
            lex_model = AutoModel.from_pretrained(_LEX_ID, trust_remote_code=True)
            _model = lex_model.eval()   # assigned last, so a non-None _model implies _tok is set
    return _tok, _model


def _load_morph():
    """Return the cached ``(tokenizer, model)`` pair for ``_MORPH_ID``, loading it on first use.

    The load runs under ``_lock`` and the ``_mmodel is None`` check is repeated inside
    the lock, so concurrent first callers do not load the model twice. The model is
    switched to eval mode before it is published in ``_mmodel``.
    """
    global _mtok, _mmodel
    if _mmodel is not None:             # already initialised: skip the lock entirely
        return _mtok, _mmodel
    with _lock:
        if _mmodel is None:             # re-check: another thread may have finished loading
            # Imported lazily so importing this module does not pull in transformers.
            from transformers import AutoModel, AutoTokenizer

            _mtok = AutoTokenizer.from_pretrained(_MORPH_ID)
            # trust_remote_code=True: the model repository ships its own model class.
            morph_model = AutoModel.from_pretrained(_MORPH_ID, trust_remote_code=True)
            _mmodel = morph_model.eval()  # assigned last, so a non-None _mmodel implies _mtok is set
    return _mtok, _mmodel


def pos(words) -> list[str]:
    """Return one coarse UD part-of-speech tag per (isolated) word via DictaBERT-morph.

    For a word with attached particles, the content head's POS wins (so 'בבית'
    reads as NOUN, not ADP).

    Args:
        words: Iterable of words; it is materialised into a list.

    Returns:
        A list exactly as long as `words`. For each word the first reported tag
        that is in ``CONTENT_POS`` is used, otherwise the last reported tag,
        otherwise ``"X"`` (also used when the model returned no analysis).
    """
    words = list(words)
    if not words:
        return []
    tok, model = _load_morph()
    analyses = list(_predict(model, tok, words))
    # Keep the output aligned with `words` even if the model returns too few or too
    # many items: missing analyses become None (tagged "X" below), extras are dropped.
    shortfall = len(words) - len(analyses)
    if shortfall > 0:
        analyses.extend([None] * shortfall)
    res = []
    for item in analyses[:len(words)]:
        toks = (item or {}).get("tokens") or []
        ps = [t.get("pos") for t in toks if t.get("pos")]
        # Prefer a content-word tag; fall back to the last tag seen, then to "X".
        head = next((p for p in ps if p in CONTENT_POS), ps[-1] if ps else "X")
        res.append(head)
    return res


def lemmas(words) -> list[str]:
    """Return the lemma of each (isolated) Hebrew word, aligned with `words`.

    Args:
        words: Iterable of words; it is materialised into a list.

    Returns:
        A list exactly as long as `words`. An entry falls back to the surface
        form when the model returns nothing usable for that word (no prediction,
        an unexpected shape, an empty lemma, or the ``"[BLANK]"`` placeholder).
    """
    words = list(words)
    if not words:
        return []
    tok, model = _load()
    preds = list(_predict(model, tok, words))   # each word treated as its own sentence
    # zip() stops at the shorter input, so a short prediction list would silently drop
    # trailing words (and make lemma() raise IndexError). Pad so they fall back instead.
    preds.extend([None] * (len(words) - len(preds)))
    out = []
    for w, pred in zip(words, preds):
        lem = None
        if pred:
            # pred is a list of (token, lemma) for the word's piece(s); use the first piece
            first = pred[0]
            if isinstance(first, (list, tuple)) and len(first) > 1:
                lem = first[1]
        out.append(lem if lem and lem != "[BLANK]" else w)
    return out


def lemma(word: str) -> str:
    """Return the lemma of a single word; a one-item convenience wrapper around `lemmas`."""
    single = lemmas([word])
    return single[0]


# Translation table mapping each Hebrew final-form letter to its regular form.
_FINALS = str.maketrans("ךםןףץ", "כמנפצ")


def root_sig(word: str) -> str:
    """Return a coarse consonantal *shoresh* signature used for shared-root legality.

    Steps, in order:
      1. normalise final-form letters to their regular forms;
      2. drop the matres lectionis ו and י;
      3. if at least four letters remain and the first is מ / ה / נ, drop it
         (servile prefix: present-participle מ-, hif'il ה-, nif'al נ-);
      4. if more than three letters remain and the last is נ / ת / ה, drop it
         (agentive / feminine / nominal ending).

    Two words whose signatures are equal almost always share a root — קוסם/קסם,
    רכבת/רכב, שומר/שמירה, תוכנה/תוכנית — which plain lemma equality cannot see.

    Apply it to a *lemma* (so attached particles and inflection are already gone). It
    is a morphologically motivated approximation, tuned to over-reject rather than
    ever let a derivative through; residual same-root pairs with a different skeleton
    are caught by the DictaLM root-judge (`probe.llm_root_conflicts`).
    """
    normalised = word.translate(_FINALS)
    skeleton = "".join(ch for ch in normalised if ch not in "וי")
    # Servile prefix: מפחד→פחד, הפחיד→הפחד→פחד, נפחד→פחד
    if len(skeleton) >= 4 and skeleton.startswith(("מ", "ה", "נ")):
        skeleton = skeleton[1:]
    # Agentive/feminine ending (ו is already gone by now): פחדן→פחד, שומרת→שמר
    if len(skeleton) > 3 and skeleton.endswith(("נ", "ת", "ה")):
        skeleton = skeleton[:-1]
    return skeleton