Spaces:
Running
Running
| """Hebrew morphology via DictaBERT — a real model from the Dicta lab, replacing | |
| hand-rolled particle/suffix stem heuristics. | |
| `lemmas()` uses `dicta-il/dictabert-lex` (lemmatization): it strips attached | |
| particles and reduces inflections to the lexeme ('ืืืฉ'->'ืืฉ', 'ืืืืื'->'ืืืื', | |
| 'ืืชืืื ืืช'->'ืชืืื ืืช'), so legality can be defined as *shared lemma* rather than a | |
| letter-list approximation. Same-root-but-different-lemma cases (תוכנה / תוכנית) | |
| are a stricter shoresh rule handled separately by the LLM root-judge in probe.py. | |
| First load downloads the model (~mins); afterwards it is HF-cached and loads offline. | |
| """ | |
| from __future__ import annotations | |
| import threading | |
# Hugging Face model ids: the lemmatizer and the morphological tagger.
_LEX_ID = "dicta-il/dictabert-lex"
_MORPH_ID = "dicta-il/dictabert-morph"

# A single lock guards the lazy initialisation of both tokenizer/model pairs.
_lock = threading.Lock()

# Cached handles, filled on first use by _load() (lemmatizer) ...
_tok = None
_model = None
# ... and by _load_morph() (morphological tagger).
_mtok = None
_mmodel = None

# POS tags kept as clue words; ADP/PRON/DET/CONJ/ADV/NUM are meant to be dropped.
CONTENT_POS = {"NOUN", "PROPN", "ADJ", "VERB"}
# DictaBERT's predict() holds the whole input list in memory at once, so it is fed in chunks.
_BATCH = 512


def _predict(model, tok, words, batch_size=_BATCH):
    """Run ``model.predict`` over `words` in bounded batches and concatenate the results.

    Chunking keeps the amount of text handed to the model at once bounded, which
    avoids running out of memory on large vocabularies.

    Args:
        model: object exposing ``predict(list_of_texts, tokenizer)``.
        tok: tokenizer passed through unchanged to ``model.predict``.
        words: sliceable sequence of input texts (each one is predicted on its own).
        batch_size: maximum number of texts per ``model.predict`` call.

    Returns:
        A flat list with the per-batch results appended in input order.

    Raises:
        ValueError: if `batch_size` is smaller than 1 (a negative step would
            otherwise make the loop silently return an empty list).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
    out = []
    for start in range(0, len(words), batch_size):
        out.extend(model.predict(words[start:start + batch_size], tok))
    return out
def _load():
    """Return the cached ``(tokenizer, model)`` pair for the lemmatizer (`_LEX_ID`).

    The pair is created on the first call and stored in the module globals
    `_tok` / `_model`; later calls return the stored objects. Creation happens
    under `_lock`, and `transformers` is imported only at that point.
    """
    global _tok, _model
    if _model is not None:
        return _tok, _model
    with _lock:
        # Re-check under the lock: another thread may have finished loading meanwhile.
        if _model is None:
            from transformers import AutoModel, AutoTokenizer

            _tok = AutoTokenizer.from_pretrained(_LEX_ID)
            # NOTE: trust_remote_code=True runs model code shipped in the hub repository.
            _model = AutoModel.from_pretrained(_LEX_ID, trust_remote_code=True).eval()
    return _tok, _model
def _load_morph():
    """Return the cached ``(tokenizer, model)`` pair for the morphological tagger (`_MORPH_ID`).

    Mirrors the lemmatizer loader: the pair is built once under `_lock`, kept in
    the module globals `_mtok` / `_mmodel`, and reused by every later call.
    """
    global _mtok, _mmodel
    if _mmodel is not None:
        return _mtok, _mmodel
    with _lock:
        # Re-check under the lock: another thread may have finished loading meanwhile.
        if _mmodel is None:
            from transformers import AutoModel, AutoTokenizer

            _mtok = AutoTokenizer.from_pretrained(_MORPH_ID)
            # NOTE: trust_remote_code=True runs model code shipped in the hub repository.
            _mmodel = AutoModel.from_pretrained(_MORPH_ID, trust_remote_code=True).eval()
    return _mtok, _mmodel
def pos(words) -> list[str]:
    """Return one coarse part-of-speech tag per (isolated) word, via DictaBERT-morph.

    For each word the tags of its analysed tokens are collected. The first tag
    that belongs to `CONTENT_POS` is chosen, so a content word with attached
    particles is reported by its content head; if none qualifies the last tag is
    used, and ``"X"`` when the analysis carries no tag at all.

    Returns an empty list for empty input.
    """
    batch = list(words)
    if not batch:
        return []
    tok, model = _load_morph()
    tags = []
    for analysis in _predict(model, tok, batch):
        pieces = (analysis or {}).get("tokens") or []
        labels = [label for label in (piece.get("pos") for piece in pieces) if label]
        if not labels:
            tags.append("X")
            continue
        for label in labels:
            if label in CONTENT_POS:
                tags.append(label)
                break
        else:
            # No content tag among the pieces: fall back to the final one.
            tags.append(labels[-1])
    return tags
def lemmas(words) -> list[str]:
    """Return the lemma of each (isolated) Hebrew word, in the order of `words`.

    Every word is sent to the lemmatizer as its own one-word sentence. When the
    prediction for a word is empty, malformed, or the ``"[BLANK]"`` placeholder,
    the surface form itself is returned for that word.

    Returns an empty list for empty input.
    """
    surface = list(words)
    if not surface:
        return []
    tok, model = _load()
    analyses = _predict(model, tok, surface)
    resolved = []
    for word, analysis in zip(surface, analyses):
        candidate = None
        if analysis:
            # An analysis is a list of (token, lemma) pairs; only the first pair is used.
            head = analysis[0]
            if isinstance(head, (list, tuple)) and len(head) > 1:
                candidate = head[1]
        if not candidate or candidate == "[BLANK]":
            resolved.append(word)
        else:
            resolved.append(candidate)
    return resolved
def lemma(word: str) -> str:
    """Return the lemma of a single word (a one-element call to `lemmas`)."""
    single = lemmas([word])
    return single[0]
# Final letter forms -> regular forms: kaf, mem, nun, pe, tsadi.
_FINALS = str.maketrans("ךםןףץ", "כמנפצ")
# Same table plus deletion of the matres lectionis vav / yod, applied in one pass.
_ROOT_SIG_TABLE = {**_FINALS, ord("ו"): None, ord("י"): None}


def root_sig(word: str) -> str:
    """Return a coarse consonantal *shoresh* signature for shared-root legality checks.

    Steps, in order:

    1. Normalise final letter forms and drop the matres lectionis (ו / י).
    2. If at least four letters remain, strip one leading servile letter:
       present-participle מ-, hif'il ה- or nif'al נ-
       (מפעל -> פעל, הפעיל -> הפעל -> פעל, נפעל -> פעל).
    3. If more than three letters remain, strip one trailing נ / ת / ה
       (agentive or feminine ending: פעלן -> פעל, שומרת -> שמר).

    Two words with equal signatures almost always share a root (קוסם / קסם,
    שומר / שומרה, תוכנה / תוכנית), which plain lemma equality cannot see.

    Apply it to a *lemma*, so attached particles and inflection are already
    gone. It is a morphologically motivated approximation, tuned to over-reject
    rather than let a derivative through; residual same-root pairs with a
    different skeleton are left to the DictaLM root-judge
    (`probe.llm_root_conflicts`).

    Args:
        word: a Hebrew word, ideally already lemmatised.

    Returns:
        The signature string; an empty string for empty input.
    """
    s = word.translate(_ROOT_SIG_TABLE)
    # The length guards keep short skeletons intact: a three-letter root is never trimmed.
    if len(s) >= 4 and s[0] in "מהנ":
        s = s[1:]
    if len(s) > 3 and s[-1] in "נתה":
        s = s[:-1]
    return s