# File size: 1,165 Bytes
# Revision: 4e7e4c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import re

# --------------------------------------
# Arabic-safe normalization (RAG oriented)
# --------------------------------------

# Tashkeel marks U+064B (fathatan) .. U+0652 (sukun) plus tatweel U+0640.
# Written as \u escapes so the character class cannot be corrupted when the
# file is re-encoded (a mis-decoded literal silently matches unrelated
# characters such as curly quotes or the euro sign).
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0640]")

# Arabic comma (U+060C), semicolon (U+061B), question mark (U+061F) and "!".
# Only horizontal whitespace is consumed so line breaks are left intact.
_CLAUSE_MARKS = re.compile(r"[ \t]*([\u060C\u061B\u061F!])[ \t]*")

# "." and ":" -- skipped when the mark sits directly between two digits
# (3.14, 10:30, 1.2.3), where adding a space would split a single token.
_STOP_MARKS = re.compile(r"[ \t]*([.:])(?!(?<=\d[.:])\d)[ \t]*")


def normalize_arabic_text(text: str) -> str:
    """Return *text* with Arabic diacritics removed and spacing tidied.

    Steps, applied in order:

    1. Delete tashkeel (U+064B-U+0652) and tatweel (U+0640).
    2. Put exactly one space after the Arabic comma, semicolon and question
       mark, after ``!``, and after ``.`` / ``:``; blanks before the mark are
       removed. A ``.`` or ``:`` directly between two digits is left alone.
    3. Insert a space between a run of ``#`` and the text glued to it.
    4. Put exactly one space after every ``*``.
    5. Drop blanks at the end of a line, collapse runs of spaces/tabs to one
       space and runs of three or more newlines to two.

    Line breaks are never added or removed by steps 1-4. Leading and trailing
    whitespace of the whole result is stripped.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text.
    """
    # 1. Remove diacritics
    text = ARABIC_DIACRITICS.sub("", text)

    # 2. Normalize punctuation spacing (never across a line break)
    text = _CLAUSE_MARKS.sub(r"\1 ", text)
    text = _STOP_MARKS.sub(r"\1 ", text)

    # 3. Fix obvious header glue, e.g. "##Title" -> "## Title"
    text = re.sub(r"(#+)([^\s#])", r"\1 \2", text)

    # 4. Separate bullets; horizontal whitespace only, so a "*" at the end
    #    of a line does not pull the next line up.
    text = re.sub(r"\*[ \t]*", "* ", text)

    # 5. Collapse excessive whitespace. Trailing blanks go first: steps 2
    #    and 4 may leave one before a line break, and blank-only lines must
    #    become empty before the newline runs are collapsed.
    text = re.sub(r"[ \t]+(?=\r?\n)", "", text)
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


def normalize_text(text: str) -> str:
    """Run ``normalize_arabic_text`` on *text*, then trim every line.

    The normalized text is split with ``str.splitlines``, each line has its
    leading and trailing whitespace stripped, and the lines are re-joined
    with ``"\\n"``.
    """
    cleaned = normalize_arabic_text(text)
    return "\n".join(map(str.strip, cleaned.splitlines()))