File size: 2,006 Bytes
0f5ecaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
text_utils.py

Single source of truth for Hebrew normalization & tokenization.

All scripts import their normalizers and the tokenizer from this module,
so the component assignments at the bottom of this file control behavior
everywhere at once.
"""
import re
import unicodedata
from typing import List

# Single-letter Hebrew proclitic prefixes (vav, he, bet, lamed, kaf, mem, shin).
# tok_he strips one of these from the front of sufficiently long tokens.
HEB_PREFIXES = ("ื•","ื”","ื‘","ืœ","ื›","ืž","ืฉ")
# Hebrew stopwords, built from a whitespace-separated literal so the list is
# easy to extend. Applied AFTER tokenization/prefix-stripping in tok_he.
STOPWORDS = set("""

    ืื‘ืœ ืื ืื• ืื– ืืชื” ืืช ืืชื ืืชืŸ ืืฆืœ ืขืœ ืขื“ ืขื ืื ื—ื ื• ืื ื™ ื”ื•ื ื”ื™ื ื”ื ื”ืŸ ืืฉืจ ืฉืœ

    ื•ืœื ืœื ื›ืŸ ื›ื‘ืจ ื›ืืฉืจ ืœื›ืŸ ืœืคื ื™ ืœืื—ืจ ื›ื“ื™ ืขื•ื“ ืจืง

    ืืœ ื–ื” ื–ื• ืืš ื›ื™ ื’ื ื›ืœ ื›ืš ื‘ืœื™ ืœืคื™ ื•ื›ืŸ ื•ื›ื• ื•ื›'

""".split())


# --- Core Function ---

def identity(s: str) -> str:
    """Pass-through normalizer: return the input string unchanged."""
    return s

def norm_he(s: str) -> str:
    """Current normalization implementation (bad)"""
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"[\u0591-\u05BD\u05BF-\u05C7]", "", s)  # strip nikkud
    s = (s.replace("ืด", '"').replace("ืณ", "'")
           .replace("โ€", '"').replace("โ€œ", '"')
           .replace("โ€“", "-").replace("โ€”", "-"))
    return re.sub(r"\s+", " ", s).strip()

def tok_he(text: str) -> List[str]:
    """Tokenize Hebrew/Latin text for BM25 (normalizes via norm_bm25).

    For tokens longer than 3 chars that begin with a proclitic prefix,
    BOTH the prefix-stripped variant and the original token are emitted
    (index-time expansion). Stopwords are filtered at the end.
    """
    normalized = norm_bm25(text)  # BM25-specific normalizer by design
    raw = re.findall(r"[A-Za-z0-9\u0590-\u05FF]+", normalized)

    expanded: List[str] = []
    for tok in raw:
        variants = [tok]
        if len(tok) > 3 and tok[0] in HEB_PREFIXES:
            # Stripped variant goes first, mirroring the original order.
            variants.insert(0, tok[1:])
        expanded.extend(variants)
    return [tok for tok in expanded if tok not in STOPWORDS]


# --- Component-Specific Assignments ---
# Each retrieval component gets its own normalizer binding so policies can
# diverge later without touching call sites.

# For now, only BM25 gets real normalization.
norm_bm25 = norm_he

# For now, E5, Gemma and BGE inputs are passed through unchanged.
norm_e5_query = identity
norm_e5_passage = identity
norm_gemma_query = identity
norm_gemma_passage = identity
norm_bge_query = identity
norm_bge_passage = identity

# --- General Aliases ---
tokenize = tok_he
normalize = norm_he # General normalize points to the strong one