Spaces:
Sleeping
Sleeping
File size: 516 Bytes
01a3d35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
def clean_text(text):
    """Return *text* with encoding artifacts repaired and whitespace normalized.

    The text is first passed through ``ftfy.fix_text``. Every run of
    whitespace (newlines, tabs, carriage returns and spaces included) is then
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The string to clean. Expected to be a ``str``.

    Returns:
        The cleaned string.
    """
    import re
    import ftfy

    # Repair encoding first: ftfy can itself surface whitespace (for example
    # by decoding ``&nbsp;`` or normalising line breaks), so the whitespace
    # pass must run on its output or those characters would survive.
    text = ftfy.fix_text(text)
    # ``\s+`` already matches newlines, tabs and carriage returns, so a single
    # substitution covers both line-break removal and space collapsing.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def preprocess_data(df):
    """Clean the ``comment_text`` column of *df* and return the frame.

    Every value in the column is passed through ``clean_text``. The cleaned
    column is assigned back onto *df* itself, so the object passed in is
    modified and that same object is returned.
    """
    raw_comments = df['comment_text']
    cleaned_comments = raw_comments.apply(clean_text)
    df['comment_text'] = cleaned_comments
    return df