Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List | |
| from rapidfuzz import fuzz | |
| # ========================= | |
| # Arabic Utilities | |
| # ========================= | |
# Combining marks stripped during normalization:
#   U+0617-U+061A  small high signs
#   U+064B-U+0652  tashkeel (fathatan .. sukun)
#   U+0670         superscript alef
#   U+06D6-U+06ED  Quranic annotation signs
_AR_DIACRITICS = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")

# Tatweel (kashida): a purely typographic elongation character.
_AR_TATWEEL = "\u0640"


def normalize_arabic(text: str) -> str:
    """Return *text* in a canonical form suitable for comparison.

    Three steps are applied, in this order:

    1. every tatweel character is deleted;
    2. every character matched by ``_AR_DIACRITICS`` is deleted;
    3. each run of whitespace collapses to a single space and the result
       is trimmed at both ends.

    Any falsy input (empty string, ``None``) yields ``""``.
    """
    if text:
        without_marks = _AR_DIACRITICS.sub("", text.replace(_AR_TATWEEL, ""))
        return re.sub(r"\s+", " ", without_marks).strip()
    return ""
def drop_common_headers_footers(
    pages: List[str], min_similarity: int = 92
) -> List[str]:
    """Detect and remove headers / footers that repeat across pages.

    For every page the first two and the last two non-blank lines (stripped
    and joined with a newline) form its header and footer candidates. The
    longest candidate is taken as the reference; it counts as a real
    header/footer when at least ``max(3, 40% of the candidates)`` candidates
    reach ``min_similarity`` under ``fuzz.ratio``.

    Removal is done per page and line-wise: a page loses its own two edge
    lines only when *that page's* candidate is similar enough to the
    reference. Compared with replacing the reference text verbatim, this

    * also removes variants that differ slightly per page (page numbers),
    * still works when the raw lines carry indentation or are separated by
      blank lines, and
    * never touches identical text that occurs in the middle of a page.

    Args:
        pages: Text of each page, in document order.
        min_similarity: Minimum ``fuzz.ratio`` score (0-100) for two
            candidates to be treated as the same header/footer.

    Returns:
        A new list with one stripped string per input page. An empty
        ``pages`` is returned as-is.
    """
    if not pages:
        return pages

    def is_similar(left: str, right: str) -> bool:
        """Tell whether two candidates pass the similarity threshold."""
        return fuzz.ratio(left, right) >= min_similarity

    def detect(candidates: List[str]) -> str:
        """Return the repeated reference candidate, or "" when none repeats."""
        # "" doubles as the "nothing detected" value, which avoids a
        # ``str | None`` annotation that fails at runtime before Python 3.10.
        present = sorted((c for c in candidates if c), key=len, reverse=True)
        if not present:
            return ""
        base = present[0]
        hits = sum(1 for c in present if is_similar(base, c))
        return base if hits >= max(3, int(0.4 * len(present))) else ""

    raw_pages = []     # per page: raw lines, line endings preserved
    content_rows = []  # per page: indices of the non-blank raw lines
    heads = []         # per page: header candidate ("" if < 2 lines)
    tails = []         # per page: footer candidate ("" if < 2 lines)
    for page in pages:
        raw = page.splitlines(keepends=True)
        rows = [i for i, line in enumerate(raw) if line.strip()]
        raw_pages.append(raw)
        content_rows.append(rows)
        if len(rows) >= 2:
            heads.append("\n".join(raw[i].strip() for i in rows[:2]))
            tails.append("\n".join(raw[i].strip() for i in rows[-2:]))
        else:
            heads.append("")
            tails.append("")

    header = detect(heads)
    footer = detect(tails)

    cleaned_pages = []
    for raw, rows, head, tail in zip(raw_pages, content_rows, heads, tails):
        drop = set()
        if header and head and is_similar(header, head):
            drop.update(rows[:2])
        # On very short pages the header and footer lines overlap; once the
        # header has been dropped, the remainder is not a footer any more.
        if (
            footer
            and tail
            and drop.isdisjoint(rows[-2:])
            and is_similar(footer, tail)
        ):
            drop.update(rows[-2:])
        kept = "".join(line for i, line in enumerate(raw) if i not in drop)
        cleaned_pages.append(kept.strip())
    return cleaned_pages