File size: 4,411 Bytes
f05e8f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# src/utils/text_cleaning.py
from __future__ import annotations
import re
import unicodedata
from typing import List, Dict, Tuple, Optional

# Control chars except \t (0x09), \n (0x0a) and \r (0x0d)
_CONTROL_CHARS = ''.join(map(chr, list(range(0,9)) + [11,12] + list(range(14,32)) + [127]))
_CTRL_RE = re.compile(f'[{re.escape(_CONTROL_CHARS)}]')

# ENCODING FIX: several of the original map keys were UTF-8 byte sequences
# mis-decoded through a legacy codepage (e.g. 'β€’' where '\u2022' was meant),
# so they could never match real bullets/quotes/dashes in correctly decoded
# text. Keys are written as \u escapes to keep the file encoding-proof.
BULLET_MAP = {
    '\u2022': '-',  # bullet
    '\u25e6': '-',  # white bullet
    '\u00b7': '-',  # middle dot
    '\u25cf': '-',  # black circle
    '\uf0a7': '-',  # private-use bullet (Wingdings square)
    '*': '-',
    '\u2013': '-',  # en dash -- normalize dashes
    '\u2014': '-',  # em dash
}
QUOTE_MAP = {
    # double quotes
    '\u201c': '"', '\u201d': '"', '\u201f': '"', '\u201e': '"',
    # single quotes / apostrophes
    '\u2019': "'", '\u2018': "'", '\u201a': "'",
}
LIG_MAP = {
    # typographic ligature -> plain-letter expansion
    '\ufb01': 'fi', '\ufb02': 'fl', '\ufb03': 'ffi', '\ufb04': 'ffl',
    '\ufb00': 'ff', '\ufb05': 'st', '\ufb06': 'st',
}

def unicode_normalize(text: str) -> str:
    """Return *text* in Unicode NFC form; falsy input yields ''."""
    if text:
        return unicodedata.normalize('NFC', text)
    return ''

def remove_control_chars(text: str) -> str:
    """Replace each control character with a space.

    \t, \n and \r are preserved; everything else in the C0 range plus DEL
    (the same set the module-level _CONTROL_CHARS spells out) becomes ' '.
    """
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)

def normalize_quotes_dashes_ligatures(text: str) -> str:
    """Apply the module's quote, bullet/dash and ligature maps, in that order."""
    for mapping in (QUOTE_MAP, BULLET_MAP, LIG_MAP):
        for src, dst in mapping.items():
            text = text.replace(src, dst)
    return text

def normalize_spaces(text: str) -> str:
    """Normalize whitespace: unicode separators become ASCII spaces, runs are
    squeezed, blank lines are capped at one, and the result is trimmed."""
    # Any char whose Unicode category starts with 'Z' (Zs/Zl/Zp separators)
    # is flattened to a plain space.
    text = ''.join(c if not unicodedata.category(c).startswith('Z') else ' '
                   for c in text)
    text = re.sub(r'[ \t]+', ' ', text)      # squeeze horizontal runs
    text = re.sub(r'\n{3,}', '\n\n', text)   # at most one blank line
    text = re.sub(r'[ \t]+\n', '\n', text)   # drop line-trailing whitespace
    return text.strip()

def normalize_bullets_and_lists(text: str) -> str:
    """Standardize list markers: '- ' for bullets, 'N. ' for numbered items."""
    rules = (
        (r'(?m)^\s*-\s*', '- '),        # uniform bullet prefix
        (r'(?m)^(\d+)\.\s*', r'\1. '),  # uniform "1. " prefix
    )
    for pattern, repl in rules:
        text = re.sub(pattern, repl, text)
    return text

def basic_clean(text: str) -> str:
    """Run the full cleanup pipeline: NFC normalize, strip control chars,
    map quotes/dashes/ligatures, fix list markers, then normalize spaces."""
    result = unicode_normalize(text)
    for step in (remove_control_chars,
                 normalize_quotes_dashes_ligatures,
                 normalize_bullets_and_lists,
                 normalize_spaces):
        result = step(result)
    return result

def learn_header_footer_patterns(pages: List[str], sample: int = 8) -> Tuple[Optional[re.Pattern], Optional[re.Pattern]]:
    """Learn regexes matching a document's repeated header/footer lines.

    Heuristic: tally the first and last non-blank line of up to *sample*
    pages; if the most frequent one repeats at least max(3, sample // 2)
    times, compile a multiline pattern that matches it on its own line,
    optionally followed by a page number.

    Args:
        pages: Page texts in document order.
        sample: How many leading pages to inspect.

    Returns:
        (header_pattern, footer_pattern); either may be None when no line
        repeats often enough.
    """
    first_lines: Dict[str, int] = {}
    last_lines: Dict[str, int] = {}
    for page in pages[:sample]:
        lines = [ln.strip() for ln in page.splitlines() if ln.strip()]
        if not lines:
            continue
        # BUG FIX: the original did first_lines.get(lines, 0) — passing the
        # (unhashable) list itself, which raised TypeError on any non-empty
        # page. Key on the first line's text.
        first_lines[lines[0]] = first_lines.get(lines[0], 0) + 1
        last_lines[lines[-1]] = last_lines.get(lines[-1], 0) + 1

    def build(counts: Dict[str, int]) -> Optional[re.Pattern]:
        """Compile a removal pattern for the dominant line, if frequent enough."""
        if not counts:
            return None
        top, cnt = max(counts.items(), key=lambda kv: kv[1])
        if cnt >= max(3, sample // 2):
            esc = re.escape(top)
            # Optional trailing page number ("12", "12-13"); the dash class
            # accepts ASCII '-' or an en dash (the original literal was a
            # mojibake-corrupted en dash).
            return re.compile(rf'(?m)^\s*{esc}(\s+\d+[-\u2013]?\d*)?\s*$')
        return None

    return build(first_lines), build(last_lines)

def strip_headers_footers(text: str, head_re: Optional[re.Pattern], foot_re: Optional[re.Pattern]) -> str:
    """Remove learned header/footer lines, then re-squeeze blank lines."""
    for pattern in (head_re, foot_re):
        if pattern is not None:
            text = pattern.sub('', text)
    return re.sub(r'\n{3,}', '\n\n', text).strip()

def looks_like_toc_or_schedule(text: str, section_title: str = '') -> bool:
    """Heuristically flag table-of-contents / maintenance-schedule sections.

    True when the title contains a known keyword, or when enough lines look
    like dotted leaders ending in a page reference.
    """
    keywords = ('table of contents', 'contents', 'maintenance schedule',
                'scheduled maintenance', 'index')
    lowered = (section_title or '').lower()
    for kw in keywords:
        if kw in lowered:
            return True
    leader = re.compile(r'\.{3,}\s*\d{1,4}([.-]\d{1,4})?$')
    stripped = [ln.strip() for ln in text.splitlines() if ln.strip()]
    dotted = sum(1 for ln in stripped if leader.search(ln))
    return dotted >= max(4, len(stripped) // 3)

def bullet_density(text: str) -> float:
    """Return the fraction of lines starting with '- ' or a 'N. ' marker."""
    stripped = [ln.strip() for ln in text.splitlines()]
    if not stripped:
        return 0.0
    marker = re.compile(r'^(-|\d+\.)\s+')
    hits = sum(1 for ln in stripped if marker.match(ln))
    return hits / max(1, len(stripped))

def truncate_nicely(text: str, max_len: int = 600) -> str:
    """Truncate *text* to at most ~max_len chars at a clean break, plus '...'.

    Preference order: the last sentence-ending punctuation past 60% of the
    limit; otherwise the last newline/space (but no earlier than 90% of the
    limit). Text already within the limit is returned unchanged.
    """
    if len(text) <= max_len:
        return text
    cut = text[:max_len]
    m = re.search(r'([.!?])[^.!?]*$', cut)
    # BUG FIX: the original tested m.end() — the end of the WHOLE match,
    # which is always len(cut) because the [^.!?]* tail runs to the end —
    # so the branch always fired with end == max_len and the text was never
    # cut at the punctuation. Use the end of group 1 (the punctuation char).
    if m and m.end(1) > int(max_len * 0.6):
        end = m.end(1)
    else:
        nl = cut.rfind('\n')
        sp = cut.rfind(' ')
        end = max(nl, sp, int(max_len * 0.9))
    return cut[:end].rstrip() + '...'