shmulc's picture
Upload morph.py with huggingface_hub
4f9f23e verified
Raw
History Blame Contribute Delete
4.65 kB
"""Hebrew morphology via DictaBERT — a real model from the Dicta lab, replacing
hand-rolled particle/suffix stem heuristics.
`lemmas()` uses `dicta-il/dictabert-lex` (lemmatization): it strips attached
particles and reduces inflections to the lexeme ('האש'->'אש', 'מילים'->'מילה',
'התוכנית'->'תוכנית'), so legality can be defined as *shared lemma* rather than a
letter-list approximation. Same-root-but-different-lemma cases (תוכנה / תוכנית)
are a stricter shoresh rule handled separately by the LLM root-judge in probe.py.
The first load downloads the model (this can take a few minutes); afterwards it is HF-cached and loads offline.
"""
from __future__ import annotations
import threading
# Hugging Face Hub ids of the two DictaBERT heads used by this module.
_LEX_ID = "dicta-il/dictabert-lex"      # lemmatizer, used by _load()
_MORPH_ID = "dicta-il/dictabert-morph"  # morphological tagger, used by _load_morph()

# A single lock guards the lazy initialisation of both tokenizer/model pairs.
_lock = threading.Lock()

# Lazily populated caches; each stays None until its loader runs for the first time.
_tok = None
_model = None
_mtok = None
_mmodel = None

# Coarse UD tags that are kept as clue words; ADP/PRON/DET/CONJ/ADV/NUM are dropped.
CONTENT_POS = {"NOUN", "PROPN", "ADJ", "VERB"}
# Upper bound on how many words are handed to a single model.predict() call. The
# original author notes that predict holds its whole input in memory at once, so the
# work is chunked to keep that bounded.
_BATCH = 512


def _predict(model, tok, words):
    """Run ``model.predict`` over ``words`` in slices of at most ``_BATCH`` items.

    Args:
        model: object exposing ``predict(batch, tokenizer)`` that returns an iterable
            of per-item results.
        tok: tokenizer forwarded unchanged to every ``predict`` call.
        words: sliceable sequence of input strings.

    Returns:
        A single list with the results of every slice, concatenated in input order.
    """
    starts = range(0, len(words), _BATCH)
    return [
        result
        for start in starts
        for result in model.predict(words[start:start + _BATCH], tok)
    ]
def _load():
    """Return the cached ``(tokenizer, model)`` pair for ``_LEX_ID``, loading it on first use.

    Uses double-checked locking: the unlocked test keeps the common already-loaded
    path cheap, and the second test under ``_lock`` stops two threads from both
    loading the model. ``_model`` is assigned last, so a non-None ``_model`` implies
    ``_tok`` has already been set.
    """
    global _tok, _model
    if _model is not None:
        return _tok, _model
    with _lock:
        if _model is None:
            # Imported lazily so that importing this module does not pull in transformers.
            from transformers import AutoModel, AutoTokenizer

            _tok = AutoTokenizer.from_pretrained(_LEX_ID)
            # NOTE(review): trust_remote_code=True lets the model repository's own
            # Python code run locally -- presumably required for its custom predict().
            lex_model = AutoModel.from_pretrained(_LEX_ID, trust_remote_code=True)
            _model = lex_model.eval()
    return _tok, _model
def _load_morph():
    """Return the cached ``(tokenizer, model)`` pair for ``_MORPH_ID``, loading it on first use.

    Uses double-checked locking on the shared ``_lock``: the unlocked test keeps the
    already-loaded path cheap, and the test repeated under the lock ensures only one
    thread performs the load. ``_mmodel`` is assigned last, so a non-None ``_mmodel``
    implies ``_mtok`` has already been set.
    """
    global _mtok, _mmodel
    if _mmodel is not None:
        return _mtok, _mmodel
    with _lock:
        if _mmodel is None:
            # Imported lazily so that importing this module does not pull in transformers.
            from transformers import AutoModel, AutoTokenizer

            _mtok = AutoTokenizer.from_pretrained(_MORPH_ID)
            # NOTE(review): trust_remote_code=True lets the model repository's own
            # Python code run locally -- presumably required for its custom predict().
            morph_model = AutoModel.from_pretrained(_MORPH_ID, trust_remote_code=True)
            _mmodel = morph_model.eval()
    return _mtok, _mmodel
def pos(words) -> list[str]:
    """Return one coarse UD part-of-speech tag per (isolated) word, via DictaBERT-morph.

    For each prediction, the non-empty ``"pos"`` values of its ``"tokens"`` are
    collected. The first one that belongs to ``CONTENT_POS`` wins, so a word with
    attached particles such as 'בבית' is meant to read as NOUN rather than ADP. If
    none is a content tag, the last collected tag is used. If the prediction carries
    no tags at all, the result is ``"X"``.

    Args:
        words: iterable of Hebrew words; it is materialised into a list first.

    Returns:
        A list of tags, one per prediction; empty when ``words`` is empty (the model
        is not loaded in that case).
    """
    words = list(words)
    if not words:
        return []
    tok, model = _load_morph()
    tags = []
    for prediction in _predict(model, tok, words):
        # A falsy prediction or a missing/empty "tokens" entry yields no tags at all.
        tokens = (prediction or {}).get("tokens") or []
        found = [t.get("pos") for t in tokens if t.get("pos")]
        chosen = "X"
        if found:
            chosen = found[-1]  # fallback when no content tag is present
            for tag in found:
                if tag in CONTENT_POS:
                    chosen = tag
                    break
        tags.append(chosen)
    return tags
def lemmas(words) -> list[str]:
    """Return the lemma of each (isolated) Hebrew word, paired positionally with ``words``.

    Every word is sent to the lemmatizer as its own sentence. A prediction is expected
    to be a sequence of (token, lemma) pairs for the word's piece(s); only the lemma of
    the first pair is used. The surface form is returned instead when the prediction is
    empty, when its first entry is not a list/tuple of at least two items, or when the
    lemma is empty or equal to the ``"[BLANK]"`` marker.

    Args:
        words: iterable of Hebrew words; it is materialised into a list first.

    Returns:
        A list of lemmas; empty when ``words`` is empty (the model is not loaded then).
    """
    words = list(words)
    if not words:
        return []
    tok, model = _load()
    predictions = _predict(model, tok, words)
    resolved = []
    for surface, prediction in zip(words, predictions):
        candidate = None
        if prediction:
            head = prediction[0]
            if isinstance(head, (list, tuple)) and len(head) > 1:
                candidate = head[1]
        if candidate and candidate != "[BLANK]":
            resolved.append(candidate)
        else:
            resolved.append(surface)
    return resolved
def lemma(word: str) -> str:
    """Return the lemma of a single word; a convenience wrapper around ``lemmas``."""
    single = lemmas([word])
    return single[0]
# Final-form (sofit) letters mapped to their regular forms, so that a word-final
# consonant compares equal to the same consonant in any other position. The dict form
# keeps each pair side by side, so the two halves cannot drift out of step.
_FINALS = str.maketrans({"ך": "כ", "ם": "מ", "ן": "נ", "ף": "פ", "ץ": "צ"})

# Servile prefixes: present-participle מ-, hif'il ה-, nif'al נ-.
_SERVILE_PREFIXES = "מהנ"
# Agentive / feminine endings. A final ן has already been normalised to נ by _FINALS.
_STRIPPED_ENDINGS = "נתה"


def root_sig(word: str) -> str:
    """Return a coarse consonantal *shoresh* signature for shared-root legality.

    Steps, in order:
      1. normalise final letter forms (ך ם ן ף ץ) to their regular forms;
      2. drop the matres lectionis ו and י;
      3. if at least four letters remain and the first is מ / ה / נ, drop it
         (מפחד→פחד, הפחיד→הפחד→פחד, נפחד→פחד);
      4. if more than three letters remain and the last is נ / ת / ה, drop it
         (פחדן→פחד, שומרת→שומר).

    Two words whose signatures are equal almost always share a root -- קוסם/קסם,
    רכבת/רכב, שומר/שמירה, תוכנה/תוכנית -- which plain lemma equality cannot see.

    Apply it to a *lemma*, so attached particles and inflection are already gone. It is
    a morphologically motivated approximation, tuned to over-reject rather than let a
    derivative through; residual same-root pairs with a different skeleton are left to
    the DictaLM root-judge (`probe.llm_root_conflicts`).

    Args:
        word: a Hebrew word, ideally already lemmatised.

    Returns:
        The signature string; ``""`` for an empty input.
    """
    s = word.translate(_FINALS).replace("ו", "").replace("י", "")
    # The length guards keep short signatures intact, so a letter that merely looks
    # like an affix is never stripped from a three-letter skeleton.
    if len(s) >= 4 and s[0] in _SERVILE_PREFIXES:
        s = s[1:]
    if len(s) > 3 and s[-1] in _STRIPPED_ENDINGS:
        s = s[:-1]
    return s