| """
|
| text_utils.py
|
| Single-source Hebrew normalization & tokenization.
|
The normalizer/tokenizer aliases bound at the bottom of this module are the
single switch point: every script imports them, so changing a binding here
changes behavior everywhere.
|
| """
|
| import re
|
| import unicodedata
|
| from typing import List
|
|
|
# Single-character Hebrew prefix letters (proclitics such as conjunction /
# preposition / article — TODO confirm the exact letters against a rendered
# view of this file).  tok_he() speculatively strips one of these from the
# front of long-enough tokens.
HEB_PREFIXES = ("ื","ื","ื","ื","ื","ื","ืฉ")

# Hebrew stopword set, applied by tok_he() AFTER tokenization.  Built by
# whitespace-splitting the literal below, so each whitespace-separated word
# becomes one entry; blank lines inside the literal are harmless.
STOPWORDS = set("""

ืืื ืื ืื ืื ืืชื ืืช ืืชื ืืชื ืืฆื ืขื ืขื ืขื ืื ืื ื ืื ื ืืื ืืื ืื ืื ืืฉืจ ืฉื

ืืื ืื ืื ืืืจ ืืืฉืจ ืืื ืืคื ื ืืืืจ ืืื ืขืื ืจืง

ืื ืื ืื ืื ืื ืื ืื ืื ืืื ืืคื ืืื ืืื ืื'

""".split())
|
|
|
|
|
|
|
|
|
def identity(s: str) -> str:
    """Return *s* unchanged.

    No-op normalizer used as the default binding for embedding models
    that should receive raw, unprocessed text.
    """
    return s
|
|
|
def norm_he(s: str) -> str:
    """Normalize Hebrew text for indexing and matching.

    Steps: NFKC compatibility folding, removal of Hebrew diacritics,
    folding of Hebrew/typographic punctuation to ASCII, and whitespace
    collapsing.  Falsy input yields "".
    """
    if not s:
        return ""
    # NFKC folds compatibility forms (e.g. presentation forms, ligatures)
    # into canonical characters before any further processing.
    s = unicodedata.normalize("NFKC", s)
    # Strip Hebrew cantillation marks and vowel points (U+0591-U+05BD,
    # U+05BF-U+05C7).  U+05BE (maqaf, the Hebrew hyphen) is deliberately
    # excluded from the range so it survives.
    s = re.sub(r"[\u0591-\u05BD\u05BF-\u05C7]", "", s)
    # Fold gershayim/geresh and curly double quotes to ASCII quote marks,
    # and en/em dashes to "-", so downstream code sees a uniform set.
    s = (s.replace("ืด", '"').replace("ืณ", "'")
         .replace("โ", '"').replace("โ", '"')
         .replace("โ", "-").replace("โ", "-"))
    # Collapse all whitespace runs to single spaces and trim the ends.
    return re.sub(r"\s+", " ", s).strip()
|
|
|
# Token pattern: runs of Latin/digit/Hebrew characters, additionally keeping
# " and ' when they appear INSIDE a word and a trailing ', so that Hebrew
# acronyms (letters joined by a gershayim-folded ") and abbreviations
# (ending in a geresh-folded ') survive as single tokens.  Compiled once at
# module import.
_TOKEN_RE = re.compile(
    r"[A-Za-z0-9\u0590-\u05FF]+(?:[\"'][A-Za-z0-9\u0590-\u05FF]+)*'?"
)


def tok_he(text: str) -> List[str]:
    """Tokenize text for BM25 indexing (main tokenizer).

    Pipeline:
      1. Normalize via ``norm_bm25`` (which folds gershayim/geresh to
         ASCII "/').
      2. Extract tokens with ``_TOKEN_RE``.  The previous pattern split on
         every quote character, which tore acronyms apart and made the
         quote-bearing STOPWORDS entry unreachable; quotes are now kept
         word-internally and a trailing apostrophe is retained.
      3. For tokens longer than 3 chars starting with a letter from
         ``HEB_PREFIXES``, ALSO emit the prefix-stripped variant —
         deliberate recall expansion; both forms are kept.
      4. Drop stopwords.

    Returns a list of tokens (possibly with duplicates, by design).
    """
    s = norm_bm25(text)
    toks = _TOKEN_RE.findall(s)

    out: List[str] = []
    for t in toks:
        # Speculative clitic stripping: we cannot know whether the first
        # letter is a real prefix, so keep both variants and let BM25
        # scoring sort it out.
        if len(t) > 3 and t[0] in HEB_PREFIXES:
            out.append(t[1:])
        out.append(t)
    return [t for t in out if t not in STOPWORDS]
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Public bindings.  Scripts import these names rather than the concrete
# implementations above, so rebinding here changes behavior everywhere.
# ---------------------------------------------------------------------------

# BM25 lexical indexing uses the full Hebrew normalization.
norm_bm25 = norm_he

# Dense-embedding models currently receive raw text (identity) — presumably
# the models' own preprocessing is preferred; confirm before changing.
norm_e5_query = identity
norm_e5_passage = identity
norm_gemma_query = identity
norm_gemma_passage = identity
norm_bge_query = identity
norm_bge_passage = identity

# Default module-level entry points.
tokenize = tok_he
normalize = norm_he