Spaces:

shmulc
/

hebrew-codenames-copilot

Running

App Files Files Community

hebrew-codenames-copilot / morph.py

shmulc

Upload morph.py with huggingface_hub

4f9f23e verified 5 days ago

Raw

History Blame Contribute Delete

4.65 kB

	"""Hebrew morphology via DictaBERT — a real model from the Dicta lab, replacing
	hand-rolled particle/suffix stem heuristics.

	`lemmas()` uses `dicta-il/dictabert-lex` (lemmatization): it strips attached
	particles and reduces inflections to the lexeme ('האש'->'אש', 'מילים'->'מילה',
	'התוכנית'->'תוכנית'), so legality can be defined as shared lemma rather than a
	letter-list approximation. Same-root-but-different-lemma cases (תוכנה / תוכנית)
	fall under a stricter shoresh rule: `root_sig()` approximates it in this module, and
	residual cases are handled separately by the LLM root-judge in probe.py.

	The first load downloads the model (this can take a few minutes); afterwards it is
	served from the Hugging Face cache and loads offline.
	"""

	from __future__ import annotations

	import threading

	# Hugging Face model ids: the lemmatizer and the morphological tagger.
	_LEX_ID = "dicta-il/dictabert-lex"
	_MORPH_ID = "dicta-il/dictabert-morph"

	# A single lock serialises the lazy initialisation done by _load() and _load_morph().
	_lock = threading.Lock()

	# Lazily created tokenizer/model singletons; they stay None until first use.
	_tok = None
	_model = None
	_mtok = None
	_mmodel = None

	# POS tags kept as clue words; ADP/PRON/DET/CONJ/ADV/NUM are dropped.
	CONTENT_POS = {"NOUN", "PROPN", "ADJ", "VERB"}

	# DictaBERT.predict holds the whole list in memory at once, so inputs are chunked
	# to this many words per call.
	_BATCH = 512


	def _predict(model, tok, words):
	    """Call ``model.predict`` on `words` in slices of at most ``_BATCH`` items.

	    The per-slice results are concatenated in input order, so a single ``predict``
	    call never receives more than ``_BATCH`` words (avoids OOM on large vocabularies).
	    """
	    return [
	        prediction
	        for start in range(0, len(words), _BATCH)
	        for prediction in model.predict(words[start:start + _BATCH], tok)
	    ]


	def _load():
	    """Return the ``(tokenizer, model)`` pair for ``_LEX_ID``, creating it on first use.

	    The pair is stored in the module globals ``_tok`` / ``_model``; later calls return
	    the stored objects without taking the lock.
	    """
	    global _tok, _model
	    if _model is not None:
	        return _tok, _model
	    with _lock:
	        # Re-check while holding the lock: another caller may have finished loading
	        # between the test above and the lock being acquired.
	        if _model is None:
	            from transformers import AutoModel, AutoTokenizer

	            _tok = AutoTokenizer.from_pretrained(_LEX_ID)
	            # _model is assigned last, so a non-None _model implies _tok is set too.
	            _model = AutoModel.from_pretrained(_LEX_ID, trust_remote_code=True).eval()
	    return _tok, _model


	def _load_morph():
	    """Return the ``(tokenizer, model)`` pair for ``_MORPH_ID``, creating it on first use.

	    The pair is stored in the module globals ``_mtok`` / ``_mmodel``; later calls return
	    the stored objects without taking the lock.
	    """
	    global _mtok, _mmodel
	    if _mmodel is not None:
	        return _mtok, _mmodel
	    with _lock:
	        # Re-check while holding the lock: another caller may have finished loading
	        # between the test above and the lock being acquired.
	        if _mmodel is None:
	            from transformers import AutoModel, AutoTokenizer

	            _mtok = AutoTokenizer.from_pretrained(_MORPH_ID)
	            # _mmodel is assigned last, so a non-None _mmodel implies _mtok is set too.
	            _mmodel = AutoModel.from_pretrained(_MORPH_ID, trust_remote_code=True).eval()
	    return _mtok, _mmodel


	def pos(words) -> list[str]:
	    """Coarse UD part-of-speech tag for each (isolated) word, via DictaBERT-morph.

	    One tag is returned per prediction. Among the ``"pos"`` values of a prediction's
	    ``"tokens"``, the first one found in ``CONTENT_POS`` is chosen, so a word with
	    attached particles reads as its content head ('בבית' -> NOUN, not ADP). If no
	    content tag is present the last tag is used, and ``"X"`` when there are no tags.
	    An empty input returns ``[]`` without loading the model.
	    """
	    words = list(words)
	    if not words:
	        return []
	    tok, model = _load_morph()

	    def head_tag(item) -> str:
	        # A falsy prediction or a missing/empty "tokens" entry is treated as no tokens.
	        pieces = (item or {}).get("tokens") or []
	        tags = [piece.get("pos") for piece in pieces if piece.get("pos")]
	        for tag in tags:
	            if tag in CONTENT_POS:
	                return tag
	        return tags[-1] if tags else "X"

	    return [head_tag(item) for item in _predict(model, tok, words)]


	def lemmas(words) -> list[str]:
	    """Lemma of each (isolated) Hebrew word, paired positionally with `words`.

	    The word's own surface form is used whenever the prediction is empty, its first
	    entry is not a (token, lemma) pair, or the lemma is empty or the literal
	    ``"[BLANK]"``. An empty input returns ``[]`` without loading the model.
	    """
	    words = list(words)
	    if not words:
	        return []
	    tok, model = _load()
	    result = []
	    # every word is submitted to the model as its own sentence
	    for surface, pairs in zip(words, _predict(model, tok, words)):
	        candidate = None
	        if pairs:
	            # pairs is a list of (token, lemma) for the word's piece(s); use the first
	            head = pairs[0]
	            if isinstance(head, (list, tuple)) and len(head) > 1:
	                candidate = head[1]
	        if not candidate or candidate == "[BLANK]":
	            result.append(surface)
	        else:
	            result.append(candidate)
	    return result


	def lemma(word: str) -> str:
	    """Return the lemma of one word: the first element of ``lemmas([word])``."""
	    single = lemmas([word])
	    return single[0]


	_FINALS = str.maketrans("ךםןףץ", "כמנפצ")


	def root_sig(word: str) -> str:
	    """Return a coarse consonantal shoresh signature of `word` for shared-root checks.

	    Steps, applied in this order:
	      1. final letter forms are mapped to their regular forms (ך→כ, ם→מ, ן→נ, ף→פ, ץ→צ);
	      2. every ו and י (matres lectionis) is removed;
	      3. if at least 4 letters remain and the first is מ / ה / נ, it is dropped
	         (servile prefix: present-participle מ-, hif'il ה-, nif'al נ-);
	      4. if more than 3 letters remain and the last is נ / ת / ה, it is dropped
	         (agentive / feminine / nominal ending).

	    Words with equal signatures almost always share a root even when their lemmas
	    differ — קוסם/קסם, רכבת/רכב, שומר/שמירה, תוכנה/תוכנית — which plain lemma equality
	    cannot see.

	    Intended to be applied to a lemma, so attached particles and inflection are already
	    gone. It is a morphologically motivated approximation meant to err towards
	    rejecting; same-root pairs with a different skeleton are left to the DictaLM
	    root-judge (`probe.llm_root_conflicts`).
	    """
	    normalised = word.translate(_FINALS)
	    skeleton = "".join(ch for ch in normalised if ch not in "וי")
	    # מפחד→פחד, הפחיד→הפחד→פחד, נפחד→פחד
	    if len(skeleton) >= 4 and skeleton.startswith(("מ", "ה", "נ")):
	        skeleton = skeleton[1:]
	    # פחדן→פחד, שומרת→שומר
	    if len(skeleton) > 3 and skeleton.endswith(("נ", "ת", "ה")):
	        skeleton = skeleton[:-1]
	    return skeleton