JermaineAI's picture
Update backend logic
90e6570
raw
history blame contribute delete
972 Bytes
import re
import unicodedata
# Bullet and dash glyphs commonly produced by PDF/LaTeX text extraction,
# all folded to a plain ASCII hyphen in a single translate() pass.
_BULLETS_AND_DASHES = str.maketrans({
    '\uf0b7': '-',  # Private-use bullet glyph (common LaTeX/Symbol-font bullet)
    '\u2022': '-',  # Standard bullet
    '\u2013': '-',  # En dash
    '\u2014': '-',  # Em dash
})

# Line-boundary characters other than '\n': CRLF / lone CR, vertical tab,
# form feed (page break), NEL, and the Unicode line/paragraph separators.
# None of them are "printable", so they must be turned into '\n' BEFORE the
# printable filter runs or the text on either side gets glued together.
_LINE_BREAKS = re.compile(r'\r\n?|[\x0b\x0c\x85\u2028\u2029]')

_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text: str) -> str:
    """Normalize extracted text while keeping its line structure.

    Steps, in order:
      1. Unicode-normalize with NFKC: compatibility characters such as the
         'fi' ligature are expanded to plain letters, while accented letters
         stay composed (NFKD would leave them as base letter + combining mark).
      2. Replace common bullet and dash glyphs with '-'.
      3. Convert every kind of line break (CR, CRLF, form feed, vertical tab,
         NEL, U+2028, U+2029) to '\\n'.
      4. Drop remaining non-printable characters, keeping newlines and tabs.
      5. Collapse whitespace runs inside each line to a single space, strip
         each line, and discard lines that end up empty.

    Args:
        text: Raw text to clean. Any falsy value (e.g. "" or None) yields "".

    Returns:
        The cleaned text, with non-empty lines joined by '\\n'.
    """
    if not text:
        return ""

    # 1. Normalize Unicode (fixes LaTeX ligatures like 'fi' -> 'f' + 'i').
    text = unicodedata.normalize('NFKC', text)

    # 2. Fix common bullet points and dashes.
    text = text.translate(_BULLETS_AND_DASHES)

    # 3. Unify line breaks so the filter below cannot swallow them.
    text = _LINE_BREAKS.sub('\n', text)

    # 4. Remove non-printable chars (except newlines/tabs).
    text = "".join(ch for ch in text if ch.isprintable() or ch in '\n\t')

    # 5. Collapse multiple spaces but PRESERVE NEWLINES
    #    (important for detecting headers vs bullets).
    squeezed = (
        _WHITESPACE_RUN.sub(' ', line).strip() for line in text.split('\n')
    )
    return "\n".join(line for line in squeezed if line)