Spaces:
Running
Running
| import re | |
| import unicodedata | |
def clean_text(text: str) -> str:
    """Normalize extracted text into clean, newline-separated lines.

    The text is Unicode-normalized (NFKD), bullet and dash variants are
    mapped to ``-``, non-printable characters are dropped, runs of
    whitespace inside a line are collapsed to one space, and blank lines
    are removed. Line structure is preserved so callers can still tell
    headers from bullets.

    Args:
        text: Raw text to clean. Any falsy value (``""``, ``None``) yields
            an empty string.

    Returns:
        The cleaned lines joined with ``"\\n"``; no line is empty and no
        line has leading or trailing whitespace.
    """
    if not text:
        return ""

    # 1. Normalize Unicode (expands ligatures such as U+FB01 into 'f' + 'i').
    #    NOTE: NFKD also leaves accented letters decomposed
    #    (base letter + combining mark).
    text = unicodedata.normalize('NFKD', text)

    # 2. Map bullet and dash variants to a plain hyphen in a single pass.
    text = text.translate(str.maketrans({
        '\uf0b7': '-',  # Private-use bullet glyph
        '\u2022': '-',  # Standard bullet
        '\u2013': '-',  # En dash
        '\u2014': '-',  # Em dash
    }))

    # 3. Split on every line boundary, not just '\n'. A lone '\r', a form
    #    feed (page break), '\u2028', etc. are non-printable; deleting them
    #    would glue the neighbouring lines together.
    lines = []
    for raw_line in text.splitlines():
        # 4. Drop non-printable characters but keep whitespace ones, so that
        #    they separate words instead of disappearing.
        kept = "".join(
            ch for ch in raw_line if ch.isprintable() or ch.isspace()
        )
        # 5. Collapse whitespace runs to one space and trim both ends.
        clean_line = " ".join(kept.split())
        if clean_line:
            lines.append(clean_line)
    return "\n".join(lines)