File size: 6,659 Bytes
ca41c16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | """Zemberek-based root validation and correction (Fix 4)."""
from __future__ import annotations
import os
from pathlib import Path
# ββ Zemberek JAR: bundled with package βββββββββββββββββββββββββββββββββββββββ
# The Zemberek JAR is expected to ship inside the package's data/ directory.
_DATA_DIR = Path(__file__).parent / "data"
JAR_PATH = _DATA_DIR / "zemberek-full.jar"
# Flipped to True by _init_zemberek() only after the JVM starts and the
# morphology object is successfully constructed.
ZEMBEREK_AVAILABLE = False
# Module-level TurkishMorphology instance; stays None when init fails.
_morphology = None
def _init_zemberek() -> None:
    """Start the JVM and load Zemberek, setting the module-level flags.

    On success ``ZEMBEREK_AVAILABLE`` becomes True and ``_morphology`` holds
    a ``TurkishMorphology`` instance.  On any failure (missing JAR, missing
    jpype, JVM error) a diagnostic is printed and the module keeps running
    with Zemberek disabled.
    """
    global ZEMBEREK_AVAILABLE, _morphology

    if not JAR_PATH.exists():
        print(
            f"[TurkTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
            " Root validation disabled β morphological fixes will be limited."
        )
        return

    try:
        import jpype  # noqa: PLC0415

        # Start the JVM at most once; starting it twice would raise.
        if not jpype.isJVMStarted():
            jvm_args = (
                jpype.getDefaultJVMPath(),
                "-ea",
                f"-Djava.class.path={JAR_PATH}",
            )
            jpype.startJVM(*jvm_args, convertStrings=False)
        morphology_cls = jpype.JClass("zemberek.morphology.TurkishMorphology")
        _morphology = morphology_cls.createWithDefaults()
        ZEMBEREK_AVAILABLE = True
    except ImportError:
        print("[TurkTokenizer] jpype1 not installed β pip install jpype1")
    except Exception as exc:  # noqa: BLE001
        print(f"[TurkTokenizer] Zemberek init failed: {exc}")
_init_zemberek()
# ββ Zemberek API helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def _jstr(s: str):
    """Wrap *s* as a Java string for passing into the Zemberek API."""
    from jpype import JString  # noqa: PLC0415
    return JString(s)
def analyze_word(word: str) -> list[dict]:
    """Return every Zemberek analysis of *word* as a list of plain dicts.

    Each dict carries the lemma, the short POS tag, the morpheme strings and
    the analysed surface form.  Returns an empty list when Zemberek is not
    available or when the analysis raises.
    """
    if not ZEMBEREK_AVAILABLE:
        return []
    out: list[dict] = []
    try:
        for analysis in _morphology.analyze(_jstr(word)).getAnalysisResults():
            item = analysis.getDictionaryItem()
            out.append({
                "lemma": str(item.lemma),
                "pos": str(analysis.getPos().shortForm),
                "morphemes": [str(m) for m in analysis.getMorphemes()],
                "surface": str(analysis.surfaceForm()),
            })
    except Exception:  # noqa: BLE001
        return []
    return out
def get_root_and_suffixes(word: str) -> dict | None:
    """Split *word* into root, suffix morphemes and POS, or None if unknown."""
    analyses = analyze_word(word)
    if not analyses:
        return None
    best = analyses[0]
    # The first morpheme is the root itself; everything after it is a suffix.
    return {
        "root": best["lemma"],
        "suffixes": best["morphemes"][1:],
        "pos": best["pos"],
    }
# ββ Heuristic fallback (no Zemberek) βββββββββββββββββββββββββββββββββββββββββ
_SPURIOUS_SHORT_ROOTS = {"oΔ", "gΓΆk", "zo", "me", "im", "pro", "go", "da", "al"}
def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
if root.strip().lower() not in _SPURIOUS_SHORT_ROOTS:
return False
return sum(1 for t in next_tokens[:3] if t["type"] == "BPE") >= 2
# ββ Main validation βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def build_correction_map(
    original_words: list[str], base_tokenizer
) -> dict[str, str]:
    """Map tokenizer roots to Zemberek roots that extend them.

    For each word, compare the root the base tokenizer produces against the
    root Zemberek reports; when Zemberek's root is the tokenizer's root plus
    at most four extra characters, record the correction.
    """
    corrections: dict[str, str] = {}
    for raw_word in original_words:
        cleaned = raw_word.lower().strip("'\".,!?;:()")
        if len(cleaned) < 3:
            continue
        analysis = get_root_and_suffixes(cleaned)
        if not analysis or analysis["root"] == "UNK":
            continue
        zemberek_root = analysis["root"].lower()
        try:
            tokenizer_root = None
            for tok in base_tokenizer.tokenize_text(cleaned):
                if tok["type"] == "ROOT":
                    tokenizer_root = tok["token"].strip().lower()
                    break
        except Exception:  # noqa: BLE001
            continue
        if not tokenizer_root or tokenizer_root == zemberek_root:
            continue
        # Accept only a modest extension: Zemberek's root must start with the
        # tokenizer's root and add between 0 and 4 characters.
        extra = len(zemberek_root) - len(tokenizer_root)
        if 0 <= extra <= 4 and zemberek_root.startswith(tokenizer_root):
            corrections[tokenizer_root] = zemberek_root
    return corrections
def validate_roots(
    tokens: list[dict],
    original_words: list[str],
    base_tokenizer=None,
) -> list[dict]:
    """Apply Zemberek root corrections to the token stream.

    With Zemberek available, ROOT tokens whose surface appears in the
    correction map are rewritten (annotated with the original token and a
    note).  Without Zemberek, ROOT tokens are merely flagged ``_suspicious``
    via the heuristic fallback.  Special tokens starting with '<' are never
    touched.
    """
    if not ZEMBEREK_AVAILABLE:
        # Heuristic-only path: flag suspicious short roots, change nothing else.
        flagged = []
        for idx, tok in enumerate(tokens):
            is_plain_root = (
                tok["type"] == "ROOT"
                and not tok["token"].strip().startswith("<")
            )
            if is_plain_root and _is_spurious_root(
                tok["token"], tokens[idx + 1 : idx + 5]
            ):
                tok = {**tok, "_suspicious": True}
            flagged.append(tok)
        return flagged

    corrections: dict[str, str] = {}
    if base_tokenizer is not None:
        corrections = build_correction_map(original_words, base_tokenizer)

    out = []
    for tok in tokens:
        if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
            out.append(tok)
            continue
        surface = tok["token"].strip().lower()
        replacement = corrections.get(surface)
        if replacement and replacement != surface:
            # Preserve a single leading space if the original token had one.
            prefix = " " if tok["token"].startswith(" ") else ""
            tok = {
                **tok,
                "token": prefix + replacement,
                "_original_token": tok["token"],
                "_root_corrected": True,
                "_note": f"root corrected: '{surface}' β '{replacement}'",
            }
        out.append(tok)
    return out
def disambiguate_sentence(words: list[str]) -> list[dict | None]:
    """Sentence-level Zemberek disambiguation.

    Analyses *words* as a single sentence so Zemberek can pick the
    contextually best reading for each word.  Returns exactly
    ``len(words)`` entries: a dict with ``lemma`` / ``pos`` / ``morphemes``
    per word, or None where no analysis is available.
    """
    if not ZEMBEREK_AVAILABLE:
        return [None] * len(words)
    try:
        sa_result = _morphology.analyzeAndDisambiguate(_jstr(" ".join(words)))
        best = sa_result.bestAnalysis()
        out: list[dict | None] = []
        for i in range(best.size()):
            try:
                sa = best.get(i)
                item = sa.getDictionaryItem()
                out.append({
                    "lemma": str(item.lemma),
                    "pos": str(sa.getPos().shortForm),
                    "morphemes": [str(m) for m in sa.getMorphemes()],
                })
            except Exception:  # noqa: BLE001
                # Keep positional alignment with the input words.
                out.append(None)
        # Pad or truncate so the result length always matches the input.
        while len(out) < len(words):
            out.append(None)
        return out[: len(words)]
    except Exception:  # noqa: BLE001
        # Fallback: per-word analysis.  Analyze each word ONCE (previously
        # analyze_word was called twice per word, doubling the JVM round
        # trips) and take the first reading, if any.
        fallback: list[dict | None] = []
        for w in words:
            analyses = analyze_word(w)
            fallback.append(analyses[0] if analyses else None)
        return fallback
|