"""Fix 12: Context-aware Zemberek disambiguation."""
from __future__ import annotations
from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr
# Turkish surface forms whose part of speech is ambiguous without sentence
# context (e.g. "yüz" = "face"/"hundred"/"swim!", "yazar" = "author"/"writes").
# Tokens whose surface form appears here get flagged as disambiguated once
# Zemberek's sentence-level analysis has resolved them.
AMBIGUOUS_WORDS = {
    # verb/noun (aorist) ambiguities
    "yüz", "gelir", "yazar", "geçer", "çıkar", "gider", "biter",
    "düşer", "tutar", "kalır", "gerekir", "uyar", "uçar",
    # adjective/noun ambiguities
    "güzel", "büyük", "küçük", "yeni", "eski",
}
def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
    """Enrich ROOT tokens with POS and lemma using Zemberek sentence-level disambiguation."""
    # Without the Zemberek bridge there is nothing to add — pass tokens through.
    if not ZEMBEREK_AVAILABLE:
        return tokens
    try:
        best = _morphology.analyzeAndDisambiguate(_jstr(original_text.strip())).bestAnalysis()
        # Build a lookup from lowercased surface form to its analysis.
        # The first analysis for a given surface form wins.
        by_surface: dict[str, dict] = {}
        for i in range(best.size()):
            try:
                analysis = best.get(i)
                entry = analysis.getDictionaryItem()
                surface = str(analysis.surfaceForm()).lower().strip()
                if surface not in by_surface:
                    by_surface[surface] = {
                        "lemma": str(entry.lemma),
                        "pos": str(analysis.getPos().shortForm),
                        "morphemes": [str(m) for m in analysis.getMorphemes()],
                    }
            except Exception:  # noqa: BLE001
                # Skip any analysis the Java bridge cannot surface cleanly.
                continue
        annotated: list[dict] = []
        for tok in tokens:
            # Only plain ROOT tokens are enriched; markup-like tokens ("<...")
            # and everything else pass through untouched.
            if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
                annotated.append(tok)
                continue
            surface = tok["token"].strip().lower()
            info = by_surface.get(surface)
            if not info:
                annotated.append(tok)
                continue
            annotated.append({
                **tok,
                "_pos": info["pos"],
                "_lemma": info["lemma"],
                "_morphemes": info["morphemes"],
                "_disambiguated": surface in AMBIGUOUS_WORDS,
            })
        return annotated
    except Exception:  # noqa: BLE001
        # Best-effort enrichment: on any Zemberek failure return tokens as-is.
        return tokens