# yarden077's picture
# uploading 2nd place model
# 0f5ecaf verified
"""
text_utils.py
Single-source Hebrew normalization & tokenization.
Controls behavior across all scripts.
"""
import re
import unicodedata
from typing import List
# Single-letter Hebrew proclitic prefixes (vav, he, bet, lamed, kaf, mem, shin).
# tok_he uses these to emit an extra prefix-stripped variant of long tokens.
HEB_PREFIXES = ("ื•","ื”","ื‘","ืœ","ื›","ืž","ืฉ")
# Hebrew stopwords (function words, pronouns, conjunctions) removed by the
# tokenizer. str.split() with no args ignores the leading/trailing newlines.
STOPWORDS = set("""
ืื‘ืœ ืื ืื• ืื– ืืชื” ืืช ืืชื ืืชืŸ ืืฆืœ ืขืœ ืขื“ ืขื ืื ื—ื ื• ืื ื™ ื”ื•ื ื”ื™ื ื”ื ื”ืŸ ืืฉืจ ืฉืœ
ื•ืœื ืœื ื›ืŸ ื›ื‘ืจ ื›ืืฉืจ ืœื›ืŸ ืœืคื ื™ ืœืื—ืจ ื›ื“ื™ ืขื•ื“ ืจืง
ืืœ ื–ื” ื–ื• ืืš ื›ื™ ื’ื ื›ืœ ื›ืš ื‘ืœื™ ืœืคื™ ื•ื›ืŸ ื•ื›ื• ื•ื›'
""".split())
# --- Core Function ---
def identity(s: str) -> str:
    """Pass-through normalizer: return the input string untouched."""
    return s
def norm_he(s: str) -> str:
"""Current normalization implementation (bad)"""
if not s:
return ""
s = unicodedata.normalize("NFKC", s)
s = re.sub(r"[\u0591-\u05BD\u05BF-\u05C7]", "", s) # strip nikkud
s = (s.replace("ืด", '"').replace("ืณ", "'")
.replace("โ€", '"').replace("โ€œ", '"')
.replace("โ€“", "-").replace("โ€”", "-"))
return re.sub(r"\s+", " ", s).strip()
def tok_he(text: str) -> List[str]:
    """Tokenize text for BM25: normalize via ``norm_bm25``, split into
    Hebrew/Latin/digit runs, add a prefix-stripped variant for long tokens
    that start with a common Hebrew proclitic, then drop stopwords."""
    pieces = re.findall(r"[A-Za-z0-9\u0590-\u05FF]+", norm_bm25(text))
    expanded: List[str] = []
    for piece in pieces:
        # Tokens longer than 3 chars beginning with a proclitic letter yield
        # the stripped form first, then the original (both are kept).
        has_prefix = len(piece) > 3 and piece[0] in HEB_PREFIXES
        expanded.extend([piece[1:], piece] if has_prefix else [piece])
    return [p for p in expanded if p not in STOPWORDS]
# --- Component-Specific Assignments ---
# Only BM25 input gets real normalization; its tokenizer depends on the
# nikkud stripping and punctuation folding in norm_he.
norm_bm25 = norm_he
# The dense retrievers (E5, Gemma, BGE) are fed raw text unchanged so their
# own pretrained tokenizers see the original surface forms.
norm_e5_query = identity
norm_e5_passage = identity
norm_gemma_query = identity
norm_gemma_passage = identity
norm_bge_query = identity
norm_bge_passage = identity
# --- General Aliases ---
tokenize = tok_he
normalize = norm_he  # General normalize points to the strong one