subashpoudel's picture
Included CI CD
583f6dd
raw
history blame
1.12 kB
import re
def clean_text(text: str) -> str:
    """Clean raw text for LLMs or downstream NLP tasks.

    Only ASCII letters, digits, the punctuation marks ``. , ! ?``, spaces
    and newlines survive. Everything else (quotes, brackets, backslashes,
    symbols, non-ASCII characters) is removed.

    Processing steps, in order:

    1. Literal two-character escapes ``\\n`` and ``\\t`` are decoded to a
       newline and a space respectively.
    2. Every whitespace character other than a newline (tab, carriage
       return, ...) becomes a space, so adjacent words are not glued
       together when the whitelist filter runs.
    3. Characters outside the whitelist are dropped.
    4. Runs of punctuation (``!!!``, ``?!``, ``...``) collapse to the last
       character of the run.
    5. Runs of spaces collapse to one space, spaces around newlines are
       removed, and three or more consecutive newlines collapse to two.
    6. Leading and trailing whitespace is stripped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, or an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Decode escaped newlines/tabs that arrive as literal backslash sequences.
    text = text.replace("\\n", "\n").replace("\\t", " ")

    # Map real tabs, carriage returns and other non-newline whitespace to a
    # space BEFORE filtering; otherwise the whitelist below would delete them
    # and merge the neighbouring words ("a\tb" -> "ab").
    text = re.sub(r"[^\S\n]", " ", text)

    # Whitelist filter. This single pass also removes stray backslashes,
    # brackets and quotes, so no dedicated passes are needed for those.
    text = re.sub(r"[^a-zA-Z0-9.,!? \n]", "", text)

    # Collapse repeated punctuation like "!!!" or "???" (keeps the last mark).
    text = re.sub(r"([!?.,]){2,}", r"\1", text)

    # Normalize spacing. Spaces around newlines must be trimmed before newline
    # runs are collapsed: blank lines that contain only spaces would otherwise
    # hide a run of 3+ newlines from the collapse step.
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r" *\n *", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()