File size: 516 Bytes
01a3d35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def clean_text(text):
    """Repair encoding artifacts in *text* and normalize its whitespace.

    The text is first passed through ``ftfy.fix_text``. Afterwards every run
    of whitespace (newlines, tabs, carriage returns, repeated spaces, ...)
    is collapsed into a single space and leading/trailing whitespace is
    removed.

    Encoding repair deliberately runs *before* whitespace normalization:

    * Mojibake sequences can contain characters that count as whitespace
      (for example U+00A0 or U+0085). Replacing them with a plain space
      first would damage the very byte patterns ftfy relies on.
    * The repaired text can itself contain whitespace (for example a
      decoded ``&nbsp;`` entity), which still has to be normalized.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.

    Raises:
        TypeError: If *text* is not a ``str``.
    """
    # Reject non-strings explicitly; checked before importing ftfy so that
    # bad input fails fast with a clear message.
    if not isinstance(text, str):
        raise TypeError(
            f"clean_text() expects a str, got {type(text).__name__}"
        )

    import ftfy

    # Fix encoding artifacts while the original characters are still intact.
    text = ftfy.fix_text(text)

    # str.split() with no arguments splits on runs of whitespace and drops
    # leading/trailing whitespace, so joining with a single space both
    # strips and collapses in one pass.
    return " ".join(text.split())

def preprocess_data(df):
    """Clean the ``'comment_text'`` column of *df* and return *df*.

    Every value of the column is passed through ``clean_text`` and the
    result is written back to the same column, so the object passed in is
    modified in place; the same object is returned for convenience.
    """
    column = 'comment_text'
    cleaned = df[column].apply(clean_text)
    df[column] = cleaned
    return df