import re

# --------------------------------------
# Arabic-safe normalization (RAG oriented)
# --------------------------------------

# Tanween, short vowels, shadda and sukun (U+064B-U+0652) plus tatweel
# (U+0640). Written as escapes so the pattern survives editors and tools
# that mangle combining characters.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0640]")

# A run of Arabic comma (U+060C), Arabic semicolon (U+061B), Arabic question
# mark (U+061F) or "!", with any surrounding spaces/tabs. Newlines are
# deliberately NOT part of the match so line breaks survive.
_ARABIC_PUNCT = re.compile(r"[ \t]*([\u060C\u061B\u061F!]+)[ \t]*")

# Group 1 captures spans that must be left untouched; group 2 captures a run
# of "." / ":" whose surrounding spacing should be normalized.
_DOT_COLON = re.compile(
    r"([A-Za-z][A-Za-z0-9+.\-]*://\S+"  # URL such as http://host/a.pdf
    r"|(?<=\d)[.:](?=\d))"              # 3.14, 10:30, 1.2.3
    r"|[ \t]*([.:]+)[ \t]*"
)

# "#" run at the start of a line glued to the title text (##Title).
_HEADER_GLUE = re.compile(r"^([ \t]*#+)(?=[^\s#])", re.MULTILINE)

# Single "*" at the start of a line followed by item text; "**bold" is skipped.
_BULLET_GLUE = re.compile(r"^([ \t]*)\*[ \t]*(?=[^\s*])", re.MULTILINE)

_TRAILING_BLANKS = re.compile(r"[ \t]+\n")
_MULTI_BLANKS = re.compile(r"[ \t]{2,}")
_MULTI_NEWLINES = re.compile(r"\n{3,}")


def _space_after_dot_colon(match: "re.Match[str]") -> str:
    """Return protected spans unchanged, otherwise the punctuation plus one space."""
    protected = match.group(1)
    if protected is not None:
        return protected
    return match.group(2) + " "


def normalize_arabic_text(text: str) -> str:
    """Normalize text extracted from Arabic PDFs without inventing spaces.

    Steps, in order:

    1. Convert CRLF / CR line endings to LF.
    2. Remove Arabic diacritics and tatweel.
    3. Normalize spacing around punctuation: no blank before it, exactly one
       space after it. Only spaces and tabs are touched, so line and
       paragraph breaks are preserved. Runs such as "..." or "!!" stay
       together, and decimals, clock times and scheme:// URLs are left
       untouched.
    4. Insert the missing space after a line-leading run of "#".
    5. Insert the missing space after a line-leading single "*" bullet.
    6. Drop trailing blanks on each line, collapse repeated spaces/tabs to
       one space and three or more newlines to two.

    Known limitation: a "." or ":" between non-digit characters outside a
    scheme:// URL (for example "file.pdf") still receives a following space.

    Args:
        text: Raw extracted text.

    Returns:
        The normalized text with leading and trailing whitespace removed.
    """
    # 1. Uniform line endings so the newline-aware steps below behave the same
    #    for every input.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # 2. Remove diacritics
    text = ARABIC_DIACRITICS.sub("", text)

    # 3. Normalize punctuation spacing (horizontal whitespace only)
    text = _ARABIC_PUNCT.sub(r"\1 ", text)
    text = _DOT_COLON.sub(_space_after_dot_colon, text)

    # 4. Fix obvious header glue (##عنوان) - only at the start of a line, so
    #    "C#" or "#12" in running text is not split.
    text = _HEADER_GLUE.sub(r"\1 ", text)

    # 5. Separate bullets safely - only a single line-leading "*"; emphasis
    #    markers and "*" inside a line are left alone.
    text = _BULLET_GLUE.sub(r"\1* ", text)

    # 6. Collapse excessive whitespace. Trailing blanks go first so that the
    #    space appended after end-of-line punctuation does not linger and
    #    whitespace-only lines count as blank lines.
    text = _TRAILING_BLANKS.sub("\n", text)
    text = _MULTI_BLANKS.sub(" ", text)
    text = _MULTI_NEWLINES.sub("\n\n", text)

    return text.strip()


def normalize_text(text: str) -> str:
    """Apply normalize_arabic_text, then strip every line individually.

    Args:
        text: Raw extracted text.

    Returns:
        The normalized text with each line stripped of leading and trailing
        whitespace, joined with a single newline character.
    """
    normalized = normalize_arabic_text(text)
    return "\n".join(line.strip() for line in normalized.splitlines())