Spaces:
Sleeping
Sleeping
File size: 4,411 Bytes
f05e8f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
# src/utils/text_cleaning.py
from __future__ import annotations
import re
import unicodedata
from typing import List, Dict, Tuple, Optional
# Control chars except \n and \t
# ASCII control characters to scrub: 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F.
# Note: \r (0x0D) is also deliberately excluded (kept), not just \n and \t.
_CONTROL_CHARS = ''.join(map(chr, list(range(0,9)) + [11,12] + list(range(14,32)) + [127]))
# Compiled character class matching any single control character above.
_CTRL_RE = re.compile(f'[{re.escape(_CONTROL_CHARS)}]')
# BUG FIX: these mapping literals were mojibake-corrupted (the original
# LIG_MAP even contained a raw newline inside a string literal, which is a
# SyntaxError). Keys are rewritten as \uXXXX escapes so they survive any
# future encoding mishap; each entry is annotated with the intended glyph.
BULLET_MAP = {
    '\u2022': '-',  # • BULLET
    '\u25e6': '-',  # ◦ WHITE BULLET
    '\u00b7': '-',  # · MIDDLE DOT
    '\u25aa': '-',  # ▪ BLACK SMALL SQUARE
    '\uf0a7': '-',  # Wingdings bullet (private-use area, common in Word docs)
    '*': '-',
    '\u2013': '-', '\u2014': '-',  # – en dash, — em dash: normalize dashes
}
QUOTE_MAP = {
    '\u201c': '"', '\u201d': '"', '\u201e': '"', '\u201f': '"',  # “ ” „ ‟
    '\u2018': "'", '\u2019': "'", '\u201a': "'",  # ‘ ’ ‚
}
LIG_MAP = {
    '\ufb01': 'fi',   # ﬁ
    '\ufb02': 'fl',   # ﬂ
    '\ufb03': 'ffi',  # ﬃ
    '\ufb04': 'ffl',  # ﬄ
    '\ufb00': 'ff',   # ﬀ
    '\ufb05': 'st',   # ﬅ (long s-t)
    '\ufb06': 'st',   # ﬆ
}
def unicode_normalize(text: str) -> str:
    """Return *text* in Unicode NFC form; any falsy input yields ''."""
    return unicodedata.normalize('NFC', text) if text else ''
def remove_control_chars(text: str) -> str:
    """Replace each scrubbed control character (see _CONTROL_CHARS) with a space."""
    return text.translate({ord(ch): ' ' for ch in _CONTROL_CHARS})
def normalize_quotes_dashes_ligatures(text: str) -> str:
    """Replace curly quotes, bullet/dash variants, and ligatures with ASCII equivalents."""
    # Order matters only in that it mirrors the established pipeline:
    # quotes first, then bullets/dashes, then ligatures.
    for mapping in (QUOTE_MAP, BULLET_MAP, LIG_MAP):
        for src, dst in mapping.items():
            text = text.replace(src, dst)
    return text
def normalize_spaces(text: str) -> str:
    """Normalize whitespace: unify unicode spaces, collapse runs, trim.

    Maps every Unicode space separator (category Z*) to a plain space,
    collapses runs of spaces/tabs, strips trailing whitespace from lines,
    collapses 3+ consecutive newlines to 2, and strips the result.
    """
    # Map all unicode spaces (category Zs/Zl/Zp) to a normal space
    text = ''.join(' ' if unicodedata.category(ch).startswith('Z') else ch for ch in text)
    # Collapse multiple spaces/tabs
    text = re.sub(r'[ \t]+', ' ', text)
    # BUG FIX: strip trailing spaces BEFORE collapsing newlines; otherwise
    # blank lines that contain spaces (e.g. "a \n \n \nb") keep the
    # newline runs apart and the 3+-newline collapse never fires.
    text = re.sub(r'[ \t]+\n', '\n', text)
    # Collapse 3+ newlines to 2
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def normalize_bullets_and_lists(text: str) -> str:
    """Canonicalize list markers: dash bullets become '- ', numbered items 'N. '."""
    dashed = re.sub(r'(?m)^\s*-\s*', '- ', text)
    numbered = re.sub(r'(?m)^(\d+)\.\s*', r'\1. ', dashed)
    return numbered
def basic_clean(text: str) -> str:
    """Run the full cleaning pipeline: NFC, control chars, punctuation, lists, spaces."""
    pipeline = (
        unicode_normalize,
        remove_control_chars,
        normalize_quotes_dashes_ligatures,
        normalize_bullets_and_lists,
        normalize_spaces,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
def learn_header_footer_patterns(pages: List[str], sample: int = 8) -> Tuple[Optional[re.Pattern], Optional[re.Pattern]]:
    """
    Heuristic: find most frequent first and last lines among first N pages.
    If repeated enough (>= max(3, sample//2) occurrences), build a regex to
    remove them; returns (header_pattern, footer_pattern), either may be None.
    """
    first_lines: Dict[str, int] = {}
    last_lines: Dict[str, int] = {}
    for p in pages[:sample]:
        lines = [ln.strip() for ln in p.splitlines() if ln.strip()]
        if not lines:
            continue
        # BUG FIX: original did first_lines.get(lines, 0) — indexing the
        # counter with the whole (unhashable) list, a guaranteed TypeError.
        # Count the first line's text, mirroring the last-line handling.
        first_lines[lines[0]] = first_lines.get(lines[0], 0) + 1
        last_lines[lines[-1]] = last_lines.get(lines[-1], 0) + 1

    def build(d: Dict[str, int]) -> Optional[re.Pattern]:
        # Pick the most frequent line; require enough repeats to be confident.
        if not d:
            return None
        top, cnt = max(d.items(), key=lambda kv: kv[1])
        if cnt >= max(3, sample // 2):
            esc = re.escape(top)
            # Allow an optional trailing page number / page range (e.g. "3-4").
            return re.compile(rf'(?m)^\s*{esc}(\s+\d+[-\u2013]?\d*)?\s*$')
        return None

    return build(first_lines), build(last_lines)
def strip_headers_footers(text: str, head_re: Optional[re.Pattern], foot_re: Optional[re.Pattern]) -> str:
    """Remove learned header/footer lines, then collapse leftover blank runs."""
    for pattern in (head_re, foot_re):
        if pattern:
            text = pattern.sub('', text)
    collapsed = re.sub(r'\n{3,}', '\n\n', text)
    return collapsed.strip()
def looks_like_toc_or_schedule(text: str, section_title: str = '') -> bool:
    """Detect table-of-contents / maintenance-schedule sections by title or layout."""
    keywords = ('table of contents', 'contents', 'maintenance schedule',
                'scheduled maintenance', 'index')
    lowered = (section_title or '').lower()
    if any(kw in lowered for kw in keywords):
        return True
    # Layout heuristic: many lines with dotted leaders ending in page numbers.
    leader_re = re.compile(r'\.{3,}\s*\d{1,4}([.-]\d{1,4})?$')
    stripped = [ln.strip() for ln in text.splitlines() if ln.strip()]
    dotted_count = sum(1 for ln in stripped if leader_re.search(ln))
    return dotted_count >= max(4, len(stripped) // 3)
def bullet_density(text: str) -> float:
    """Return the fraction of lines that start with a '- ' or 'N. ' list marker."""
    stripped = [ln.strip() for ln in text.splitlines()]
    if not stripped:
        return 0.0
    marker = re.compile(r'^(-|\d+\.)\s+')
    hits = sum(1 for ln in stripped if marker.match(ln))
    return hits / max(1, len(stripped))
def truncate_nicely(text: str, max_len: int = 600) -> str:
    """Truncate *text* to at most ~max_len chars and append '...'.

    Prefers cutting just after the last sentence-ending punctuation within
    the window (if it falls past 60% of max_len), otherwise falls back to
    the last newline/space (or 90% of max_len as a floor).
    """
    if len(text) <= max_len:
        return text
    cut = text[:max_len]
    # Locate the last sentence-ending punctuation mark in the window.
    m = re.search(r'([.!?])[^.!?]*$', cut)
    # BUG FIX: the original tested/used m.end(), which is ALWAYS len(cut)
    # because the trailing [^.!?]* runs to the end of the string — the
    # sentence-boundary branch never shortened anything. Use the end of
    # group 1 (just after the punctuation) instead.
    if m and m.end(1) > int(max_len * 0.6):
        end = m.end(1)
    else:
        nl = cut.rfind('\n')
        sp = cut.rfind(' ')
        end = max(nl, sp, int(max_len * 0.9))
    return cut[:end].rstrip() + '...'
|