File size: 1,727 Bytes
41027b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re
from typing import List
from rapidfuzz import fuzz

# =========================
# Arabic Utilities
# =========================

# Matches Arabic diacritical marks so they can be stripped during
# normalization: U+0617–U+061A (small high marks), U+064B–U+0652 (tanween,
# fatha, damma, kasra, shadda, sukun), U+0670 (superscript alef), and
# U+06D6–U+06ED (Quranic annotation signs).
_AR_DIACRITICS = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")
# Tatweel / kashida (U+0640): a purely typographic elongation character that
# carries no meaning and should be removed before matching or comparison.
_AR_TATWEEL = "\u0640"


def normalize_arabic(text: str) -> str:
    """Return *text* normalized for Arabic processing.

    Normalization removes the tatweel (kashida) elongation character,
    strips Arabic diacritical marks (harakat and Quranic annotation
    signs), and collapses runs of whitespace into single spaces with
    the ends trimmed. Falsy input (empty string or None) yields "".
    """
    if not text:
        return ""
    # Drop the kashida elongation character (U+0640).
    stripped = text.replace("\u0640", "")
    # Remove diacritical marks: small high marks, tanween/harakat,
    # superscript alef, and Quranic annotation signs.
    stripped = re.sub(
        r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]", "", stripped
    )
    # Collapse any whitespace run to one space and trim the ends.
    return re.sub(r"\s+", " ", stripped).strip()


def drop_common_headers_footers(
    pages: List[str], min_similarity: int = 92
) -> List[str]:
    """Detect and remove repeated headers / footers across pages.

    A candidate header is the first two non-blank (stripped) lines of each
    page; a candidate footer is the last two. A candidate is accepted when
    it fuzzily recurs (rapidfuzz ratio >= ``min_similarity``) on at least
    max(3, 40% of pages) pages, so nothing is removed from very short
    documents.

    Args:
        pages: One string per page.
        min_similarity: rapidfuzz ``fuzz.ratio`` threshold (0-100) for two
            candidates to count as the same header/footer.

    Returns:
        A new list of pages with the detected header/footer text removed
        and each page stripped of surrounding whitespace. The input list
        is not mutated (it is returned unchanged only when empty).
    """
    if not pages:
        return pages

    first_lines: List[str] = []
    last_lines: List[str] = []
    for page in pages:
        lines = [line.strip() for line in page.splitlines() if line.strip()]
        first_lines.append("\n".join(lines[:2]) if len(lines) >= 2 else "")
        last_lines.append("\n".join(lines[-2:]) if len(lines) >= 2 else "")

    def detect(candidates: List[str]) -> str | None:
        """Return the longest candidate if it fuzzily recurs on enough pages."""
        candidates = sorted((c for c in candidates if c), key=len, reverse=True)
        if not candidates:
            return None
        base = candidates[0]
        hits = sum(1 for c in candidates if fuzz.ratio(base, c) >= min_similarity)
        return base if hits >= max(3, int(0.4 * len(candidates))) else None

    header = detect(first_lines)
    footer = detect(last_lines)

    # NOTE(review): removal is by exact substring; if a page's raw text has
    # different whitespace than the stripped/"\n"-joined candidate, the
    # replace is a no-op for that page — confirm against real extractor output.
    cleaned_pages = []
    for page in pages:
        if header:
            # Remove only the FIRST occurrence: the header text may
            # legitimately reappear in the body of the page.
            page = page.replace(header, "", 1)
        if footer:
            # Symmetrically, remove only the LAST occurrence for the footer.
            idx = page.rfind(footer)
            if idx != -1:
                page = page[:idx] + page[idx + len(footer):]
        cleaned_pages.append(page.strip())

    return cleaned_pages