Spaces:
Running
Running
File size: 972 Bytes
0c6fb97 90e6570 0c6fb97 90e6570 0c6fb97 90e6570 0c6fb97 90e6570 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | import re
import unicodedata
# Bullet and dash code points that are folded to a plain ASCII hyphen.
_HYPHEN_TABLE = str.maketrans({
    "\uf0b7": "-",  # Common LaTeX bullet (private-use glyph)
    "\u2022": "-",  # Standard bullet
    "\u2013": "-",  # En dash
    "\u2014": "-",  # Em dash
})

# Compiled once; used to squeeze every whitespace run inside a line.
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Normalize extracted text while keeping its line structure.

    Steps, in order:

    1. NFKD-normalize, which expands compatibility characters such as the
       'fi' ligature into plain letters and turns no-break spaces into
       regular spaces.
    2. Replace common bullet and dash characters with ``-``.
    3. Split into lines on *every* line boundary that ``str.splitlines``
       recognizes (``\\n``, ``\\r\\n``, lone ``\\r``, form feed, vertical tab,
       NEL, U+2028/U+2029, ...).
    4. Per line: drop non-printable characters (tabs are kept until the
       next step), collapse whitespace runs to a single space, and strip.
    5. Discard lines that end up empty and join the rest with ``\\n``.

    Args:
        text: Raw text to clean. Any falsy value (``""`` or ``None``) is
            accepted.

    Returns:
        The cleaned text with one non-empty line per ``\\n``-separated
        entry, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # NOTE: NFKD also leaves accented letters decomposed (base letter plus
    # combining mark); the combining marks are printable and therefore kept.
    normalized = unicodedata.normalize("NFKD", text).translate(_HYPHEN_TABLE)

    cleaned_lines = []
    # Split BEFORE filtering non-printables: '\r', '\x0c' and friends are
    # themselves non-printable, so filtering first would delete them and
    # glue the neighbouring lines/words together.
    for raw_line in normalized.splitlines():
        printable = "".join(
            ch for ch in raw_line if ch.isprintable() or ch == "\t"
        )
        squeezed = _WHITESPACE_RUN.sub(" ", printable).strip()
        if squeezed:
            cleaned_lines.append(squeezed)

    return "\n".join(cleaned_lines)