| """Fix 12: Context-aware Zemberek disambiguation.""" |
|
|
| from __future__ import annotations |
|
|
| from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr |
|
|
# Lowercase surface forms that annotate_with_context marks with
# ``_disambiguated=True`` when Zemberek returns an analysis for them.
# NOTE(review): presumably words whose reading depends on sentence context;
# confirm the selection criteria with the list's author.
AMBIGUOUS_WORDS = {
    "yüz",
    "gelir",
    "yazar",
    "geçer",
    "çıkar",
    "gider",
    "biter",
    "düşer",
    "tutar",
    "kalır",
    "gerekir",
    "uyar",
    "uçar",
    "güzel",
    "büyük",
    "küçük",
    "yeni",
    "eski",
}
|
|
|
|
def _turkish_lower(text: str) -> str:
    """Lowercase *text* with Turkish rules for the dotted and dotless I."""
    # Python's str.lower() is locale-independent: it maps "I" to "i" and
    # "İ" to "i" + U+0307 (combining dot above). Turkish needs "ı" and a
    # plain "i" respectively, otherwise "İstanbul" or "ÇIKAR" never compare
    # equal to their lowercase spellings.
    return text.replace("I", "ı").lower().replace("i\u0307", "i")


def _collect_analyses(best_list) -> dict[str, list[dict]]:
    """Group Zemberek's best analyses by normalized surface form, in sentence order."""
    grouped: dict[str, list[dict]] = {}
    for idx in range(best_list.size()):
        try:
            analysis = best_list.get(idx)
            item = analysis.getDictionaryItem()
            surface = _turkish_lower(str(analysis.surfaceForm()).strip())
            entry = {
                "lemma": str(item.lemma),
                "pos": str(analysis.getPos().shortForm),
                "morphemes": [str(m) for m in analysis.getMorphemes()],
            }
        except Exception:
            # Best effort: one unreadable analysis must not discard the rest.
            continue
        # Keep every occurrence (not just the first) so a word repeated in
        # the sentence can receive a different reading each time.
        grouped.setdefault(surface, []).append(entry)
    return grouped


def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
    """Enrich ROOT tokens with POS and lemma using Zemberek sentence-level disambiguation.

    The whole of ``original_text`` is passed to
    ``_morphology.analyzeAndDisambiguate`` and the resulting best analyses
    are matched to tokens by their lowercased, stripped surface form.

    Args:
        tokens: Token dicts; each must provide the ``"type"`` and ``"token"``
            keys. Only tokens whose type is ``"ROOT"`` and whose text does
            not start with ``"<"`` are considered for annotation.
        original_text: The text the tokens were produced from.

    Returns:
        A new list in which every matched ROOT token is a copy extended with
        ``_pos``, ``_lemma``, ``_morphemes`` and ``_disambiguated`` (``True``
        when the surface form is in ``AMBIGUOUS_WORDS``). Unmatched tokens
        are passed through as-is. ``tokens`` itself is returned unchanged
        when Zemberek is unavailable, the text is blank, or anything fails.
    """
    if not ZEMBEREK_AVAILABLE:
        return tokens

    try:
        text = original_text.strip()
        if not text:
            # Nothing to analyse; skip the Zemberek call.
            return tokens

        sentence = _morphology.analyzeAndDisambiguate(_jstr(text))
        analyses = _collect_analyses(sentence.bestAnalysis())

        # How many tokens with a given surface form have been annotated so
        # far; used to pair the n-th occurrence of a word with its n-th
        # analysis instead of reusing the first one for all of them.
        consumed: dict[str, int] = {}
        result: list[dict] = []
        for tok in tokens:
            if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
                result.append(tok)
                continue

            surface = _turkish_lower(tok["token"].strip())
            candidates = analyses.get(surface)
            if not candidates:
                result.append(tok)
                continue

            occurrence = consumed.get(surface, 0)
            consumed[surface] = occurrence + 1
            # If tokens outnumber analyses for this surface form, positional
            # alignment is lost; fall back to the first analysis.
            chosen = candidates[occurrence] if occurrence < len(candidates) else candidates[0]
            result.append({
                **tok,
                "_pos": chosen["pos"],
                "_lemma": chosen["lemma"],
                "_morphemes": chosen["morphemes"],
                "_disambiguated": surface in AMBIGUOUS_WORDS,
            })

        return result

    except Exception:
        # Annotation is optional enrichment: never let it break the caller,
        # but leave a trace so failures can be diagnosed.
        import logging

        logging.getLogger(__name__).debug(
            "Zemberek disambiguation failed; returning tokens unchanged",
            exc_info=True,
        )
        return tokens
|
|