Spaces:
Sleeping
Sleeping
| import re | |
def clean_text(text: str) -> str:
    """Clean text for LLMs or downstream NLP tasks.

    The cleaning pipeline, in order:

    1. Escaped sequences (a literal backslash followed by ``n`` or ``t``)
       are decoded to a newline and a space respectively; real tab
       characters are also turned into spaces.
    2. Every character outside ``a-z``, ``A-Z``, ``0-9``, ``. , ! ?``,
       space and newline is deleted. This covers stray backslashes,
       brackets, quotes, and any non-ASCII character.
    3. Runs of two or more punctuation marks (``. , ! ?``) are reduced to
       a single mark (the last one in the run).
    4. Runs of spaces are collapsed, spaces around newlines are dropped,
       and three or more consecutive newlines are reduced to two.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped,
        or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Decode escaped newlines/tabs that arrive as two-character sequences.
    text = text.replace("\\n", "\n").replace("\\t", " ")
    # Real tabs must become spaces *before* the whitelist filter below,
    # otherwise they are deleted outright and neighbouring words merge.
    text = text.replace("\t", " ")

    # Whitelist filter: keeps alphanumerics, basic punctuation, spaces and
    # newlines. Backslashes, brackets and quotes are removed by this pass.
    text = re.sub(r"[^a-zA-Z0-9.,!? \n]", "", text)

    # Reduce repeated punctuation such as "!!!" or "???" to a single mark.
    text = re.sub(r"([!?.,]){2,}", r"\1", text)

    # Whitespace normalisation. Spaces around newlines are stripped before
    # collapsing newline runs so that whitespace-only lines count as blank
    # lines and cannot leave more than two consecutive newlines behind.
    text = re.sub(r" +", " ", text)
    text = re.sub(r" *\n *", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()