File size: 1,124 Bytes
6c655a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import re
def clean_text(text: str) -> str:
    """Return an ASCII-only, whitespace-normalized version of *text*.

    General-purpose text cleaner for LLMs or downstream NLP tasks. The
    steps, in order:

    1. Literal two-character escapes ``\\n`` and ``\\t`` are decoded to a
       newline and a space respectively.
    2. Every whitespace character other than a newline (tab, carriage
       return, non-breaking space, ...) becomes a plain space, so words
       separated by it are not fused together by the next step.
    3. Everything except ASCII letters, digits, ``. , ! ?``, spaces and
       newlines is removed. This drops backslashes, brackets, quotes and
       all non-ASCII characters (including accented letters).
    4. A run of two or more punctuation marks collapses to the last mark
       of the run (``"!!!"`` -> ``"!"``, ``"?!"`` -> ``"!"``).
    5. Runs of spaces collapse to one, spaces around newlines are removed,
       and three or more consecutive newlines collapse to two.
    6. Leading and trailing whitespace is stripped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, or an empty string when *text* is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Decode escape sequences that arrive as literal backslash + letter.
    text = text.replace("\\n", "\n").replace("\\t", " ")

    # Map non-newline whitespace to a space *before* the whitelist filter;
    # otherwise the filter deletes tabs/CRs outright and glues words together.
    text = re.sub(r"[^\S\n]", " ", text)

    # Whitelist filter. This single pass also removes stray backslashes,
    # brackets and quotes, so no dedicated passes are needed for those.
    text = re.sub(r"[^a-zA-Z0-9.,!? \n]", "", text)

    # Collapse repeated punctuation like "!!!" or "???" (keeps the last mark).
    text = re.sub(r"([!?.,]){2,}", r"\1", text)

    # Whitespace normalization. Spaces around newlines must be stripped
    # before collapsing blank lines: "\n \n \n" only becomes a run of
    # adjacent newlines once the interleaved spaces are gone.
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r" *\n *", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()