File size: 705 Bytes
6c655a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f72bcf
 
6c655a3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import re
def clean_text(text: str) -> str:
    """Clean raw text for LLM prompts or downstream NLP tasks.

    The cleaning pipeline, in order:

    1. Turn the literal two-character escapes ``\\n`` and ``\\t`` into a
       real newline and a space respectively.
    2. Turn every real whitespace character other than a newline (tabs,
       carriage returns, ...) into a space so adjacent words stay separated.
    3. Drop every character that is not an ASCII letter, a digit, one of
       ``. , ! ?``, a space, or a newline. This also removes backslashes,
       brackets and quotes.
    4. Collapse a run of two or more punctuation marks into the last mark
       of the run (``"?!"`` becomes ``"!"``).
    5. Collapse runs of spaces, strip spaces around newlines, and limit
       consecutive newlines to two (one blank line).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace removed, or an
        empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Decode escape sequences that arrived as literal backslash + letter.
    text = text.replace("\\n", "\n").replace("\\t", " ")

    # Real tabs / carriage returns / other non-newline whitespace become
    # spaces; otherwise the whitelist below would delete them and glue the
    # neighbouring words together.
    text = re.sub(r"[^\S\n]", " ", text)

    # Whitelist filter. Backslashes, brackets and quotes are outside the
    # allowed set, so no separate removal passes are needed for them.
    text = re.sub(r"[^a-zA-Z0-9.,!? \n]", "", text)

    # A repeated capture group keeps its last iteration, so the final mark
    # of each punctuation run is the one that survives.
    text = re.sub(r"([!?.,]){2,}", r"\1", text)

    text = re.sub(r" +", " ", text)

    # Strip spaces around newlines BEFORE collapsing blank lines, so that
    # whitespace-only lines count as blank and are collapsed too.
    text = re.sub(r" *\n *", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()