import re
from difflib import SequenceMatcher
from typing import List, Optional

try:
    from rapidfuzz import fuzz
except ImportError:
    # rapidfuzz is only needed for header/footer similarity scoring. Keep the
    # module importable without it (normalize_arabic has no such dependency);
    # _similarity() falls back to difflib in that case.
    fuzz = None

# =========================
# Arabic Utilities
# =========================

_AR_DIACRITICS = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")
_AR_TATWEEL = "\u0640"
_WHITESPACE_RUN = re.compile(r"\s+")

# Number of leading / trailing non-blank lines inspected as a potential
# header / footer on each page.
_EDGE_LINES = 2


def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text:
    - remove tatweel (U+0640)
    - remove every character matched by ``_AR_DIACRITICS``
    - collapse each whitespace run to a single space and trim both ends

    Any falsy input (``""``, ``None``) yields ``""``.
    """
    if not text:
        return ""

    text = text.replace(_AR_TATWEEL, "")
    text = _AR_DIACRITICS.sub("", text)
    return _WHITESPACE_RUN.sub(" ", text).strip()


def _similarity(a: str, b: str) -> float:
    """Return a 0-100 similarity score for two strings.

    Uses ``rapidfuzz.fuzz.ratio`` when rapidfuzz is installed. Otherwise it
    falls back to ``difflib.SequenceMatcher``, whose score may differ slightly
    from rapidfuzz's for strings that are not identical.
    """
    if fuzz is not None:
        return fuzz.ratio(a, b)
    return 100.0 * SequenceMatcher(None, a, b, autojunk=False).ratio()


def _detect_repeated(candidates: List[str], min_similarity: int) -> Optional[str]:
    """Return the reference header/footer text, or None if nothing repeats.

    The longest non-empty candidate is used as the reference. It is accepted
    when at least ``max(3, 40% of the non-empty candidates)`` candidates
    (the reference itself included) score >= ``min_similarity`` against it.
    """
    ranked = sorted((c for c in candidates if c), key=len, reverse=True)
    if not ranked:
        return None

    base = ranked[0]
    hits = sum(1 for c in ranked if _similarity(base, c) >= min_similarity)
    return base if hits >= max(3, int(0.4 * len(ranked))) else None


def drop_common_headers_footers(
    pages: List[str],
    min_similarity: int = 92
) -> List[str]:
    """
    Detect and remove repeated headers / footers across pages.

    The first two and last two non-blank lines of every page (pages with
    fewer than two non-blank lines are ignored) are compared across pages.
    When a repeated header or footer is detected, it is removed from each
    page whose own first / last two non-blank lines score at least
    ``min_similarity`` against it.

    Removal is positional: only those edge lines are dropped, so body text
    that merely quotes the header is left alone, and pages whose header
    differs in indentation, blank lines or e.g. a page number are still
    cleaned.

    Args:
        pages: Page texts, one string per page.
        min_similarity: Minimum 0-100 similarity score for two line groups
            to be considered the same header / footer.

    Returns:
        A new list with every page stripped of surrounding whitespace and of
        its matching header / footer lines. A falsy ``pages`` is returned
        unchanged.
    """
    if not pages:
        return pages

    layouts = []  # per page: (raw lines with line endings, non-blank indices)
    first_lines, last_lines = [], []
    for page in pages:
        raw = page.splitlines(keepends=True)
        content_idx = [i for i, line in enumerate(raw) if line.strip()]
        layouts.append((raw, content_idx))

        if len(content_idx) >= _EDGE_LINES:
            top_idx = content_idx[:_EDGE_LINES]
            bottom_idx = content_idx[-_EDGE_LINES:]
            first_lines.append("\n".join(raw[i].strip() for i in top_idx))
            last_lines.append("\n".join(raw[i].strip() for i in bottom_idx))
        else:
            first_lines.append("")
            last_lines.append("")

    header = _detect_repeated(first_lines, min_similarity)
    footer = _detect_repeated(last_lines, min_similarity)

    cleaned_pages = []
    for page, (raw, content_idx), top, bottom in zip(
        pages, layouts, first_lines, last_lines
    ):
        drop = set()
        if header and top and _similarity(header, top) >= min_similarity:
            drop.update(content_idx[:_EDGE_LINES])
        if footer and bottom and _similarity(footer, bottom) >= min_similarity:
            drop.update(content_idx[-_EDGE_LINES:])

        if drop:
            # Rebuild from the raw lines so the remaining text keeps its
            # original line endings and inner blank lines.
            page = "".join(
                line for i, line in enumerate(raw) if i not in drop
            )
        cleaned_pages.append(page.strip())

    return cleaned_pages