""" text_utils.py Single-source Hebrew normalization & tokenization. Controls behavior across all scripts. """ import re import unicodedata from typing import List HEB_PREFIXES = ("ו","ה","ב","ל","כ","מ","ש") STOPWORDS = set(""" אבל אם או אז אתה את אתם אתן אצל על עד עם אנחנו אני הוא היא הם הן אשר של ולא לא כן כבר כאשר לכן לפני לאחר כדי עוד רק אל זה זו אך כי גם כל כך בלי לפי וכן וכו וכ' """.split()) # --- Core Function --- def identity(s: str) -> str: """Does nothing""" return s def norm_he(s: str) -> str: """Current normalization implementation (bad)""" if not s: return "" s = unicodedata.normalize("NFKC", s) s = re.sub(r"[\u0591-\u05BD\u05BF-\u05C7]", "", s) # strip nikkud s = (s.replace("״", '"').replace("׳", "'") .replace("”", '"').replace("“", '"') .replace("–", "-").replace("—", "-")) return re.sub(r"\s+", " ", s).strip() def tok_he(text: str) -> List[str]: """The main tokenizer. It uses the BM25 normalizer internally.""" s = norm_bm25(text) # Use the specific normalizer for BM25 toks = re.findall(r"[A-Za-z0-9\u0590-\u05FF]+", s) out: List[str] = [] for t in toks: if len(t) > 3 and t[0] in HEB_PREFIXES: out.append(t[1:]) # stripped prefix out.append(t) return [t for t in out if t not in STOPWORDS] # --- Component-Specific Assignments --- # For now, only BM25 gets real normalization. norm_bm25 = norm_he # For now, E5, Gemma and BGE inputs are passed through unchanged. norm_e5_query = identity norm_e5_passage = identity norm_gemma_query = identity norm_gemma_passage = identity norm_bge_query = identity norm_bge_passage = identity # --- General Aliases --- tokenize = tok_he normalize = norm_he # General normalize points to the strong one