# src/utils/text_cleaning.py from __future__ import annotations import re import unicodedata from typing import List, Dict, Tuple, Optional # Control chars except \n and \t _CONTROL_CHARS = ''.join(map(chr, list(range(0,9)) + [11,12] + list(range(14,32)) + [127])) _CTRL_RE = re.compile(f'[{re.escape(_CONTROL_CHARS)}]') BULLET_MAP = { '•': '-', '◦': '-', '·': '-', '●': '-', '': '-', '*': '-', '–': '-', '—': '-', # normalize dashes } QUOTE_MAP = { '“':'"', '”':'"', '‟':'"', '„':'"', '’':"'", '‘':"'", '‚':"'", } LIG_MAP = { 'fi':'fi','fl':'fl','ffi':'ffi','ffl':'ffl','ff':'ff','ſt':'st','st':'st' } def unicode_normalize(text: str) -> str: if not text: return '' return unicodedata.normalize('NFC', text) def remove_control_chars(text: str) -> str: return _CTRL_RE.sub(' ', text) def normalize_quotes_dashes_ligatures(text: str) -> str: for k,v in QUOTE_MAP.items(): text = text.replace(k,v) for k,v in BULLET_MAP.items(): text = text.replace(k,v) for k,v in LIG_MAP.items(): text = text.replace(k,v) return text def normalize_spaces(text: str) -> str: # Map all unicode spaces to normal space text = ''.join(' ' if unicodedata.category(ch).startswith('Z') else ch for ch in text) # Collapse multiple spaces text = re.sub(r'[ \t]+', ' ', text) # Collapse 3+ newlines to 2 text = re.sub(r'\n{3,}', '\n\n', text) # Strip trailing spaces text = re.sub(r'[ \t]+\n', '\n', text) return text.strip() def normalize_bullets_and_lists(text: str) -> str: # Ensure "- " at start of bullet lines text = re.sub(r'(?m)^\s*-\s*', '- ', text) # Ensure "1." has a space text = re.sub(r'(?m)^(\d+)\.\s*', r'\1. ', text) return text def basic_clean(text: str) -> str: t = unicode_normalize(text) t = remove_control_chars(t) t = normalize_quotes_dashes_ligatures(t) t = normalize_bullets_and_lists(t) t = normalize_spaces(t) return t def learn_header_footer_patterns(pages: List[str], sample: int = 8) -> Tuple[Optional[re.Pattern], Optional[re.Pattern]]: """ Heuristic: find most frequent first and last lines among first N pages. If repeated enough, build a regex to remove them. """ first_lines, last_lines = {}, {} for p in pages[:sample]: lines = [ln.strip() for ln in p.splitlines() if ln.strip()] if not lines: continue first_lines[lines[0]] = first_lines.get(lines, 0) + 1 last_lines[lines[-1]] = last_lines.get(lines[-1], 0) + 1 def build(d: Dict[str,int]) -> Optional[re.Pattern]: if not d: return None top, cnt = max(d.items(), key=lambda kv: kv[1]) if cnt >= max(3, sample//2): esc = re.escape(top) return re.compile(rf'(?m)^\s*{esc}(\s+\d+[-–]?\d*)?\s*$') return None return build(first_lines), build(last_lines) def strip_headers_footers(text: str, head_re: Optional[re.Pattern], foot_re: Optional[re.Pattern]) -> str: if head_re: text = head_re.sub('', text) if foot_re: text = foot_re.sub('', text) text = re.sub(r'\n{3,}', '\n\n', text) return text.strip() def looks_like_toc_or_schedule(text: str, section_title: str = '') -> bool: title = (section_title or '').lower() if any(k in title for k in ['table of contents','contents','maintenance schedule','scheduled maintenance','index']): return True # Many lines with dotted leaders ending in page numbers lines = [ln.strip() for ln in text.splitlines() if ln.strip()] dotted = sum(1 for ln in lines if re.search(r'\.{3,}\s*\d{1,4}([.-]\d{1,4})?$', ln)) return dotted >= max(4, len(lines)//3) def bullet_density(text: str) -> float: lines = [ln.strip() for ln in text.splitlines()] if not lines: return 0.0 bullets = sum(1 for ln in lines if re.match(r'^(-|\d+\.)\s+', ln)) return bullets / max(1, len(lines)) def truncate_nicely(text: str, max_len: int = 600) -> str: if len(text) <= max_len: return text cut = text[:max_len] m = re.search(r'([.!?])[^.!?]*$', cut) if m and m.end() > int(max_len*0.6): end = m.end() else: nl = cut.rfind('\n'); sp = cut.rfind(' ') end = max(nl, sp, int(max_len*0.9)) return cut[:end].rstrip() + '...'