|
|
import re |
|
|
def clean_text(text: str) -> str:
    r"""Clean raw text for LLMs or downstream NLP tasks.

    The cleaning pipeline runs in this order:

    1. Literal two-character escapes ``\n`` and ``\t`` (a backslash followed
       by a letter) become a real newline and a space respectively.
    2. Every whitespace character other than a newline (tab, carriage
       return, non-breaking space, ...) becomes a plain space, so adjacent
       words are not merged when the whitelist is applied.
    3. Every character outside ``a-z A-Z 0-9 . , ! ?``, space and newline is
       deleted. This covers brackets, quotes, stray backslashes and all
       non-ASCII characters (accented letters included).
    4. Runs of two or more of ``! ? . ,`` collapse to a single mark; the
       last mark of the run is the one kept (``"?!"`` becomes ``"!"``).
    5. Runs of spaces collapse to one, spaces around newlines are trimmed,
       and three or more consecutive newlines collapse to two.
    6. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Literal "\n" / "\t" escape sequences (backslash + letter) -> real chars.
    text = text.replace("\\n", "\n").replace("\\t", " ")

    # Map non-newline whitespace to a space BEFORE the whitelist; otherwise
    # a real tab would simply be deleted and "a\tb" would become "ab".
    text = re.sub(r"[^\S\n]", " ", text)

    # Whitelist: this single pass also removes brackets, quotes and any
    # remaining backslashes, so no dedicated passes are needed for them.
    text = re.sub(r"[^a-zA-Z0-9.,!? \n]", "", text)

    # Collapse repeated punctuation; \1 holds the last mark of the run.
    text = re.sub(r"([!?.,]){2,}", r"\1", text)

    # Whitespace normalization. Spaces around newlines must be trimmed
    # before collapsing blank lines, otherwise "\n \n \n" (blank lines that
    # contain spaces) would slip past the \n{3,} collapse.
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r" *\n *", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()