# ContiAI-v4 / rag / preprocess.py
# ziadsameh32: "Initial FastAPI CrewAI setup" (commit 41027b6)
import re
from collections import Counter
from typing import List

from rapidfuzz import fuzz
# =========================
# Arabic Utilities
# =========================
# Arabic marks stripped during normalization: the small high signs
# U+0617-U+061A, the tashkeel range U+064B-U+0652 (fathatan through sukun),
# superscript alef U+0670, and the Quranic annotation signs U+06D6-U+06ED.
_AR_DIACRITICS = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")
# Tatweel (kashida), U+0640: the Arabic elongation character.
_AR_TATWEEL = "\u0640"


def normalize_arabic(text: str) -> str:
    """Return *text* with tatweel and diacritics removed and spacing tidied.

    Every tatweel character and every character matched by
    ``_AR_DIACRITICS`` is deleted, each run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is trimmed. Falsy input
    (``None`` or ``""``) yields an empty string.
    """
    if text:
        # Tatweel goes first, then the combining marks, then whitespace.
        unmarked = _AR_DIACRITICS.sub("", text.replace(_AR_TATWEEL, ""))
        return re.sub(r"\s+", " ", unmarked).strip()
    return ""
def drop_common_headers_footers(
pages: List[str], min_similarity: int = 92
) -> List[str]:
"""
Detect and remove repeated headers / footers across pages.
"""
if not pages:
return pages
first_lines, last_lines = [], []
for p in pages:
lines = [l.strip() for l in p.splitlines() if l.strip()]
first_lines.append("\n".join(lines[:2]) if len(lines) >= 2 else "")
last_lines.append("\n".join(lines[-2:]) if len(lines) >= 2 else "")
def detect(candidates: List[str]) -> str | None:
candidates = sorted([c for c in candidates if c], key=len, reverse=True)
if not candidates:
return None
base = candidates[0]
hits = sum(1 for c in candidates if fuzz.ratio(base, c) >= min_similarity)
return base if hits >= max(3, int(0.4 * len(candidates))) else None
header = detect(first_lines)
footer = detect(last_lines)
cleaned_pages = []
for p in pages:
if header:
p = p.replace(header, "")
if footer:
p = p.replace(footer, "")
cleaned_pages.append(p.strip())
return cleaned_pages