File size: 2,063 Bytes
"""Fix 12: Context-aware Zemberek disambiguation.
Uses zemberek-python (pure Python) — no JVM required.
"""
from __future__ import annotations
from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
# Lower-cased surface forms that get `_disambiguated=True` when a ROOT token
# with that surface is annotated by annotate_with_context().
AMBIGUOUS_WORDS = {
    "yüz",
    "gelir",
    "yazar",
    "geçer",
    "çıkar",
    "gider",
    "biter",
    "düşer",
    "tutar",
    "kalır",
    "gerekir",
    "uyar",
    "uçar",
    "güzel",
    "büyük",
    "küçük",
    "yeni",
    "eski",
}
def _turkish_lower(text: str) -> str:
    """Lowercase *text* using Turkish rules for the dotted/dotless I pair."""
    # str.lower() is locale-independent: it maps "I" to "i" and "İ" to
    # "i" + U+0307, neither of which is the Turkish lowercase form. Map the
    # two capitals explicitly before lowercasing the rest.
    return text.replace("İ", "i").replace("I", "ı").lower()


def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
    """Enrich ROOT tokens with POS and lemma using Zemberek sentence-level disambiguation.

    Args:
        tokens: Token dicts; each one is read through its "type" and "token"
            keys.
        original_text: Text passed (stripped) to Zemberek for sentence-level
            analysis.

    Returns:
        A new list with one entry per input token. A ROOT token whose
        lower-cased surface matches an analysed word is replaced by a copy
        carrying "_pos", "_lemma", "_morphemes" and "_disambiguated"; every
        other token is passed through as the same object. When Zemberek is
        unavailable, or any step raises, ``tokens`` itself is returned
        unchanged (best-effort enrichment).
    """
    if not ZEMBEREK_AVAILABLE:
        return tokens
    try:
        sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
        best_list = sa_result.best_analysis()

        # Index the chosen analyses by lower-cased surface form
        # (stem + ending). The first analysis seen for a surface wins.
        analyses: dict[str, dict] = {}
        for sa in best_list:
            try:
                sf = _turkish_lower(
                    str(sa.get_stem()) + str(sa.get_ending())
                ).strip()
                if sf not in analyses:
                    analyses[sf] = {
                        "lemma": str(sa.item.lemma),
                        "pos": str(sa.item.primary_pos.short_form),
                        "morphemes": [str(m) for m in sa.get_morphemes()],
                    }
            except Exception:  # noqa: BLE001
                # One unreadable analysis must not discard the others.
                continue

        result: list[dict] = []
        for tok in tokens:
            # Only ROOT tokens are enriched; tokens whose text starts with
            # "<" are passed through untouched.
            if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
                result.append(tok)
                continue
            # Normalise exactly like the analysis keys so that upper-case
            # Turkish input (e.g. "ÇIKAR") still matches "çıkar".
            surface = _turkish_lower(tok["token"].strip())
            z = analyses.get(surface)
            if z:
                result.append({
                    **tok,
                    "_pos": z["pos"],
                    "_lemma": z["lemma"],
                    "_morphemes": z["morphemes"],
                    "_disambiguated": surface in AMBIGUOUS_WORDS,
                })
            else:
                result.append(tok)
        return result
    except Exception:  # noqa: BLE001
        # Deliberate best-effort: any failure falls back to the raw tokens.
        return tokens
|