Spaces:
Sleeping
Sleeping
| # src/utils/text_cleaning.py | |
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
| from typing import List, Dict, Tuple, Optional | |
# ASCII control characters (C0 range plus DEL) that get scrubbed from text.
# Tab (9), line feed (10) and carriage return (13) are deliberately left out.
_CONTROL_CHARS = ''.join(
    chr(code) for code in (*range(0, 9), 11, 12, *range(14, 32), 127)
)
# Character class matching any single one of the characters listed above.
_CTRL_RE = re.compile('[' + re.escape(_CONTROL_CHARS) + ']')
# Replacement tables used to fold typographic characters down to ASCII.
# Keys are spelled as \u escapes so they survive any re-encoding of this file;
# literal non-ASCII keys are easily corrupted into mojibake that never matches.

# Bullet glyphs and typographic dashes, all mapped to an ASCII hyphen.
BULLET_MAP = {
    '\u2022': '-',  # bullet
    '\u25e6': '-',  # white bullet
    '\u00b7': '-',  # middle dot
    # NOTE(review): one bullet key could not be recovered from the damaged
    # literal; U+2219 and U+25CF are best guesses -- confirm against the
    # documents being processed.
    '\u2219': '-',  # bullet operator
    '\u25cf': '-',  # black circle
    '\uf0a7': '-',  # private-use code point used as a bullet by symbol fonts
    '*': '-',
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
}

# Curly and low quotation marks mapped to their straight ASCII counterparts.
QUOTE_MAP = {
    '\u201c': '"',  # left double quotation mark
    '\u201d': '"',  # right double quotation mark
    '\u201e': '"',  # double low-9 quotation mark
    '\u201f': '"',  # double high-reversed-9 quotation mark
    '\u2018': "'",  # left single quotation mark
    '\u2019': "'",  # right single quotation mark
    '\u201a': "'",  # single low-9 quotation mark
    # NOTE(review): only three single-quote keys were present originally and
    # which three is unrecoverable; all four of U+2018..U+201B are listed.
    '\u201b': "'",  # single high-reversed-9 quotation mark
}

# Latin typographic ligatures expanded to their component letters.
LIG_MAP = {
    '\ufb01': 'fi',
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    '\ufb00': 'ff',
    '\ufb05': 'st',  # long s + t
    '\ufb06': 'st',
}
def unicode_normalize(text: str) -> str:
    """Return *text* in Unicode NFC form; any falsy input yields ''."""
    return unicodedata.normalize('NFC', text) if text else ''
def remove_control_chars(text: str) -> str:
    """Replace each character matched by ``_CTRL_RE`` with a single space."""
    cleaned = re.sub(_CTRL_RE, ' ', text)
    return cleaned
def normalize_quotes_dashes_ligatures(text: str) -> str:
    """Apply the quote, bullet/dash and ligature replacement tables to *text*.

    The tables are applied in that order, one plain substring replacement
    per entry.
    """
    for table in (QUOTE_MAP, BULLET_MAP, LIG_MAP):
        for source, target in table.items():
            text = text.replace(source, target)
    return text
def normalize_spaces(text: str) -> str:
    """Normalize horizontal and vertical whitespace in *text*.

    Steps, in order:
      1. every character in a Unicode separator category (Z*) becomes ' ';
      2. runs of spaces/tabs collapse to one space;
      3. spaces/tabs directly before a newline are removed;
      4. runs of three or more newlines collapse to two;
      5. leading and trailing whitespace is stripped.

    Returns the normalized string.
    """
    # Map all unicode spaces to normal space
    text = ''.join(' ' if unicodedata.category(ch).startswith('Z') else ch for ch in text)
    # Collapse multiple spaces
    text = re.sub(r'[ \t]+', ' ', text)
    # Strip trailing spaces first: a whitespace-only line would otherwise
    # hide a run of blank lines from the newline collapse below.
    text = re.sub(r'[ \t]+\n', '\n', text)
    # Collapse 3+ newlines to 2
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def normalize_bullets_and_lists(text: str) -> str:
    """Give list markers at the start of a line a uniform shape.

    * A line starting with optional indentation and a hyphen is rewritten to
      start with exactly ``"- "``.
    * A line starting with ``<digits>.`` gets exactly one space after the dot,
      unless a digit follows the dot (``3.14``, ``1.2 Overview``), in which
      case the line is left alone.

    Only horizontal whitespace is consumed, so blank lines before a list and
    the line after an empty marker are preserved.
    """
    # [^\S\r\n] = any whitespace except line breaks. Plain \s would let the
    # match run across newlines under (?m) and swallow neighbouring lines.
    text = re.sub(r'(?m)^[^\S\r\n]*-[^\S\r\n]*', '- ', text)
    # (?!\d) keeps decimals and dotted section numbers from being split.
    text = re.sub(r'(?m)^(\d+)\.(?!\d)[^\S\r\n]*', r'\1. ', text)
    return text
def basic_clean(text: str) -> str:
    """Run the standard cleaning pipeline over *text* and return the result.

    Stages run in a fixed order: Unicode normalization, control-character
    removal, quote/dash/ligature folding, list-marker normalization, and
    finally whitespace normalization.
    """
    pipeline = (
        unicode_normalize,
        remove_control_chars,
        normalize_quotes_dashes_ligatures,
        normalize_bullets_and_lists,
        normalize_spaces,
    )
    cleaned = text
    for stage in pipeline:
        cleaned = stage(cleaned)
    return cleaned
def learn_header_footer_patterns(pages: List[str], sample: int = 8) -> Tuple[Optional[re.Pattern], Optional[re.Pattern]]:
    """Guess running header/footer lines from the first *sample* pages.

    Heuristic: count how often each string appears as the first (resp. last)
    non-blank line of a page. If the most frequent one occurs on at least
    ``max(3, sample // 2)`` pages, build a multiline regex that matches that
    line, optionally followed by a page number or page range.

    Args:
        pages: Page texts, one string per page.
        sample: How many leading pages to inspect.

    Returns:
        ``(header_pattern, footer_pattern)``; each is ``None`` when no line
        repeats often enough.
    """
    first_lines: Dict[str, int] = {}
    last_lines: Dict[str, int] = {}
    for page in pages[:sample]:
        lines = [ln.strip() for ln in page.splitlines() if ln.strip()]
        if not lines:
            continue
        # The lookup key must be the line itself, not the whole list.
        first_lines[lines[0]] = first_lines.get(lines[0], 0) + 1
        last_lines[lines[-1]] = last_lines.get(lines[-1], 0) + 1

    def build(counts: Dict[str, int]) -> Optional[re.Pattern]:
        """Compile a removal pattern for the most frequent line, if frequent enough."""
        if not counts:
            return None
        top, cnt = max(counts.items(), key=lambda kv: kv[1])
        if cnt >= max(3, sample // 2):
            esc = re.escape(top)
            # Optional trailing page number or range ("12", "3-4", "3\u20134");
            # \u2013 is the en dash, resolved by the regex engine.
            return re.compile(rf'(?m)^\s*{esc}(\s+\d+[-\u2013]?\d*)?\s*$')
        return None

    return build(first_lines), build(last_lines)
def strip_headers_footers(text: str, head_re: Optional[re.Pattern], foot_re: Optional[re.Pattern]) -> str:
    """Delete header/footer matches from *text* and tidy the resulting gaps.

    Each supplied pattern (header first, then footer) has its matches replaced
    with the empty string. Afterwards, runs of three or more newlines are
    reduced to two and the result is stripped.
    """
    for pattern in (head_re, foot_re):
        if pattern:
            text = pattern.sub('', text)
    collapsed = re.sub(r'\n{3,}', '\n\n', text)
    return collapsed.strip()
def looks_like_toc_or_schedule(text: str, section_title: str = '') -> bool:
    """Heuristically flag a section as a table of contents, index or schedule.

    Returns True when the lower-cased *section_title* contains one of a fixed
    set of keywords. Otherwise counts the non-blank lines of *text* that end
    in a dotted leader followed by a page number, and returns True when that
    count is at least ``max(4, number_of_non_blank_lines // 3)``.
    """
    lowered_title = (section_title or '').lower()
    keywords = ['table of contents','contents','maintenance schedule','scheduled maintenance','index']
    for keyword in keywords:
        if keyword in lowered_title:
            return True

    # Many lines with dotted leaders ending in page numbers
    leader_re = re.compile(r'\.{3,}\s*\d{1,4}([.-]\d{1,4})?$')
    nonblank = [raw.strip() for raw in text.splitlines() if raw.strip()]
    hits = 0
    for row in nonblank:
        if leader_re.search(row):
            hits += 1
    threshold = max(4, len(nonblank)//3)
    return hits >= threshold
def bullet_density(text: str) -> float:
    """Return the fraction of lines in *text* that start with a list marker.

    A line counts when, after stripping, it begins with ``-`` or
    ``<digits>.`` followed by whitespace. Blank lines are included in the
    denominator. Text with no lines yields 0.0.
    """
    rows = text.splitlines()
    if not rows:
        return 0.0
    marker = re.compile(r'^(-|\d+\.)\s+')
    hits = len([row for row in rows if marker.match(row.strip())])
    return hits / max(1, len(rows))
def truncate_nicely(text: str, max_len: int = 600) -> str:
    """Shorten *text* to roughly *max_len* characters at a natural boundary.

    Text no longer than *max_len* is returned unchanged. Otherwise the first
    *max_len* characters are considered and cut at:

    * the last sentence terminator (``.``, ``!`` or ``?``), when it lies
      beyond 60% of *max_len*; else
    * the last newline or space, when that lies beyond 90% of *max_len*; else
    * the 90% mark itself.

    The kept part is right-stripped and ``'...'`` is appended, so the result
    may be up to three characters longer than *max_len*.
    """
    if len(text) <= max_len:
        return text
    cut = text[:max_len]
    m = re.search(r'([.!?])[^.!?]*$', cut)
    # end(1) is the position just after the terminator. The whole-match end
    # is always len(cut) because the pattern is anchored at the end, so it
    # cannot be used to locate the sentence boundary.
    if m and m.end(1) > int(max_len * 0.6):
        end = m.end(1)
    else:
        nl = cut.rfind('\n')
        sp = cut.rfind(' ')
        end = max(nl, sp, int(max_len * 0.9))
    return cut[:end].rstrip() + '...'