Spaces:
Sleeping
Sleeping
File size: 1,165 Bytes
import re
# --------------------------------------
# Arabic-safe normalization (RAG oriented)
# --------------------------------------
# Arabic tashkeel marks, fathatan (U+064B) through sukun (U+0652).
# Written as escapes so the pattern survives any re-encoding of this file.
# NOTE(review): the previous literal was a run of identical mis-decoded
# characters, so the exact original set is unrecoverable; confirm whether an
# extra mark (e.g. tatweel U+0640 or superscript alef U+0670) was intended.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652]")

# Arabic comma (U+060C), semicolon (U+061B), question mark (U+061F) and "!",
# together with any horizontal whitespace around them. Newlines are
# deliberately excluded so line and paragraph breaks are never swallowed.
_ARABIC_PUNCT = re.compile(r"[ \t]*([\u060C\u061B\u061F!])[ \t]*")

# First alternative: a "." or ":" sitting between two digits (decimal number
# or clock time) -- matched only so it can be left untouched.
# Second alternative: any other "." / ":" plus surrounding horizontal space.
_DOT_COLON = re.compile(r"(?<=\d)[.:](?=\d)|[ \t]*([.:])[ \t]*")

# One or more "#" glued directly to the following header text.
_GLUED_HEADER = re.compile(r"(#+)([^\s#])")

# A single "*" at the start of a line (optionally indented). "**" is skipped
# so bold markers and "***" rules are not torn apart.
_LINE_BULLET = re.compile(r"^([ \t]*)\*(?!\*)[ \t]*", re.MULTILINE)


def _space_after_dot_colon(match):
    """Return the replacement text for one ``_DOT_COLON`` match."""
    mark = match.group(1)
    if mark is None:
        # Digit-bounded separator: keep "3.14" / "10:30" exactly as written.
        return match.group(0)
    return mark + " "


def normalize_arabic_text(text: str) -> str:
    """Normalize text extracted from Arabic PDFs without inventing content.

    Steps, in order:

    1. Remove Arabic diacritics (U+064B..U+0652).
    2. Drop horizontal whitespace before punctuation and leave exactly one
       space after it. Applies to the Arabic comma, semicolon and question
       mark, "!", "." and ":". A "." or ":" between two digits is left
       alone so decimals and times stay intact.
    3. Insert the missing space in glued headers ("##Title" -> "## Title").
    4. Put exactly one space after a line-leading "*" bullet.
    5. Trim whitespace left before a line break, collapse runs of spaces or
       tabs to one space, and collapse 3+ newlines to a single blank line.

    Line breaks are never removed by steps 1-4.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text with leading and trailing whitespace stripped.
    """
    # 1. Remove diacritics
    text = ARABIC_DIACRITICS.sub("", text)

    # 2. Normalize punctuation spacing (horizontal whitespace only)
    text = _ARABIC_PUNCT.sub(r"\1 ", text)
    text = _DOT_COLON.sub(_space_after_dot_colon, text)

    # 3. Fix obvious header glue ("##Title")
    text = _GLUED_HEADER.sub(r"\1 \2", text)

    # 4. Separate bullets safely: only a lone "*" that opens a line
    text = _LINE_BULLET.sub(r"\1* ", text)

    # 5. Collapse excessive whitespace. Trailing blanks go first so that
    #    whitespace-only lines count as blank lines for the newline collapse.
    text = re.sub(r"[ \t]+(?=\n)", "", text)
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def normalize_text(text: str) -> str:
    """Normalize *text* and strip every line of surrounding whitespace.

    The text is first passed through ``normalize_arabic_text``. Each
    resulting line is then stripped and the lines are re-joined with
    "\\n".

    Stripping can turn whitespace-only lines into empty ones, and
    ``str.splitlines`` also splits on "\\r\\n". Either can produce a fresh
    run of three or more newlines, so the blank-line collapse is applied
    again after the join.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text with no line carrying leading or trailing
        whitespace and at most one blank line between paragraphs.
    """
    cleaned = normalize_arabic_text(text)
    joined = "\n".join(line.strip() for line in cleaned.splitlines())
    # Re-collapse: blank lines exposed by the per-line strip must not stack.
    return re.sub(r"\n{3,}", "\n\n", joined)
|