"""Fix 12: Context-aware Zemberek disambiguation.""" from __future__ import annotations from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr AMBIGUOUS_WORDS = { "yüz", "gelir", "yazar", "geçer", "çıkar", "gider", "biter", "düşer", "tutar", "kalır", "gerekir", "uyar", "uçar", "güzel", "büyük", "küçük", "yeni", "eski", } def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]: """Enrich ROOT tokens with POS and lemma using Zemberek sentence-level disambiguation.""" if not ZEMBEREK_AVAILABLE: return tokens try: sa_result = _morphology.analyzeAndDisambiguate(_jstr(original_text.strip())) best_list = sa_result.bestAnalysis() analyses: dict[str, dict] = {} for idx in range(best_list.size()): try: sa = best_list.get(idx) item = sa.getDictionaryItem() sf = str(sa.surfaceForm()).lower().strip() if sf not in analyses: analyses[sf] = { "lemma": str(item.lemma), "pos": str(sa.getPos().shortForm), "morphemes": [str(m) for m in sa.getMorphemes()], } except Exception: # noqa: BLE001 continue result: list[dict] = [] for tok in tokens: if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"): result.append(tok) continue surface = tok["token"].strip().lower() z = analyses.get(surface) if z: result.append({ **tok, "_pos": z["pos"], "_lemma": z["lemma"], "_morphemes": z["morphemes"], "_disambiguated": surface in AMBIGUOUS_WORDS, }) else: result.append(tok) return result except Exception: # noqa: BLE001 return tokens