File size: 972 Bytes
0c6fb97
90e6570
0c6fb97
 
 
 
 
90e6570
 
0c6fb97
90e6570
 
 
 
 
0c6fb97
90e6570
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import re
import unicodedata

# Bullet and dash glyphs that are flattened to a plain ASCII hyphen.
_HYPHEN_LIKE = str.maketrans({
    '\uf0b7': '-',  # Common LaTeX bullet (private-use glyph)
    '\u2022': '-',  # Standard bullet
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
})


def clean_text(text: str) -> str:
    """Normalize extracted text while keeping its line structure.

    The text is Unicode-normalized, bullet/dash glyphs become ``-``,
    non-printable characters are dropped, runs of whitespace inside a line
    collapse to a single space, and blank lines are removed.

    Args:
        text: Raw text to clean. Any falsy value (``""``, ``None``) is
            treated as empty.

    Returns:
        The cleaned lines joined with ``"\\n"``; ``""`` for empty input.
    """
    if not text:
        return ""

    # 1. Normalize Unicode. NFKC expands compatibility characters (the
    #    'fi' ligature -> 'f' + 'i', non-breaking space -> space) and then
    #    recomposes, so accented letters stay single code points instead of
    #    being left as base letter + combining mark (which NFKD would do).
    text = unicodedata.normalize('NFKC', text)

    # 2. Fix common bullet points and dashes in a single pass.
    text = text.translate(_HYPHEN_LIKE)

    # 3. Split into lines BEFORE filtering non-printables. splitlines()
    #    recognises every line boundary (\n, \r\n, lone \r, form feed,
    #    vertical tab, U+2028, ...). Filtering first would delete those
    #    control characters and glue the neighbouring lines together.
    #    (Line structure matters for detecting headers vs bullets.)
    lines = []
    for raw_line in text.splitlines():
        # 4. Remove non-printable chars; tabs survive so they can act as a
        #    word separator in the whitespace collapse below.
        visible = "".join(
            ch for ch in raw_line if ch.isprintable() or ch == '\t'
        )
        # 5. Collapse whitespace runs to one space and trim the ends.
        collapsed = " ".join(visible.split())
        if collapsed:
            lines.append(collapsed)

    return "\n".join(lines)