File size: 2,006 Bytes
0f5ecaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
text_utils.py

Single source of truth for Hebrew normalization & tokenization.

All scripts import their normalizers and the tokenizer from this module,
so the component assignments at the bottom of this file control behavior
everywhere at once.
"""
import re
import unicodedata
from typing import List

# Single-letter Hebrew proclitic prefixes (vav, he, bet, lamed, kaf, mem, shin).
# tok_he strips one of these from the front of sufficiently long tokens.
HEB_PREFIXES = ("ื•","ื”","ื‘","ืœ","ื›","ืž","ืฉ")
# Hebrew stopwords, built from a whitespace-separated literal so the list is
# easy to extend. Applied AFTER tokenization/prefix-stripping in tok_he.
STOPWORDS = set("""

    ืื‘ืœ ืื ืื• ืื– ืืชื” ืืช ืืชื ืืชืŸ ืืฆืœ ืขืœ ืขื“ ืขื ืื ื—ื ื• ืื ื™ ื”ื•ื ื”ื™ื ื”ื ื”ืŸ ืืฉืจ ืฉืœ

    ื•ืœื ืœื ื›ืŸ ื›ื‘ืจ ื›ืืฉืจ ืœื›ืŸ ืœืคื ื™ ืœืื—ืจ ื›ื“ื™ ืขื•ื“ ืจืง

    ืืœ ื–ื” ื–ื• ืืš ื›ื™ ื’ื ื›ืœ ื›ืš ื‘ืœื™ ืœืคื™ ื•ื›ืŸ ื•ื›ื• ื•ื›'

""".split())


# --- Core Function ---

def identity(s: str) -> str:
    """Pass-through normalizer: return the input string unchanged."""
    return s

def norm_he(s: str) -> str:
    """Current normalization implementation (bad)"""
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"[\u0591-\u05BD\u05BF-\u05C7]", "", s)  # strip nikkud
    s = (s.replace("ืด", '"').replace("ืณ", "'")
           .replace("โ€", '"').replace("โ€œ", '"')
           .replace("โ€“", "-").replace("โ€”", "-"))
    return re.sub(r"\s+", " ", s).strip()

def tok_he(text: str) -> List[str]:
    """Tokenize Hebrew/Latin text for BM25 (normalizes via norm_bm25).

    For tokens longer than 3 chars that begin with a proclitic prefix,
    BOTH the prefix-stripped variant and the original token are emitted
    (index-time expansion). Stopwords are filtered at the end.
    """
    normalized = norm_bm25(text)  # BM25-specific normalizer by design
    raw = re.findall(r"[A-Za-z0-9\u0590-\u05FF]+", normalized)

    expanded: List[str] = []
    for tok in raw:
        variants = [tok]
        if len(tok) > 3 and tok[0] in HEB_PREFIXES:
            # Stripped variant goes first, mirroring the original order.
            variants.insert(0, tok[1:])
        expanded.extend(variants)
    return [tok for tok in expanded if tok not in STOPWORDS]


# --- Component-Specific Assignments ---
# Each retrieval component gets its own normalizer binding so policies can
# diverge later without touching call sites.

# For now, only BM25 gets real normalization.
norm_bm25 = norm_he

# For now, E5, Gemma and BGE inputs are passed through unchanged.
norm_e5_query = identity
norm_e5_passage = identity
norm_gemma_query = identity
norm_gemma_passage = identity
norm_bge_query = identity
norm_bge_passage = identity

# --- General Aliases ---
tokenize = tok_he
normalize = norm_he # General normalize points to the strong one