# toxic-comment-detector / src /preprocess.py
# CoEdd
# first commit
# 01a3d35
def clean_text(text):
    """Normalize one raw comment into a single-line, whitespace-tidy string.

    Encoding artifacts are repaired first, so that any whitespace the repair
    exposes or introduces is normalized as well. Every run of whitespace
    (newlines, tabs, carriage returns, repeated spaces) is then collapsed to
    a single space, and leading/trailing whitespace is removed.

    Args:
        text: The comment to clean. ``None`` and float NaN are treated as a
            missing comment.

    Returns:
        The cleaned string, or ``""`` when ``text`` is a missing value.

    Raises:
        TypeError: If ``text`` is any other non-string value.
    """
    # Missing values become an empty comment. NaN is the only float that is
    # not equal to itself, so no extra import is needed to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        raise TypeError(
            f"clean_text expects a str, got {type(text).__name__}"
        )

    import re
    import ftfy

    # Fix encoding artifacts before touching whitespace: the repaired text
    # can contain whitespace that was not present (or not at the edges) in
    # the raw input.
    text = ftfy.fix_text(text)
    # \s covers newlines, tabs and carriage returns, so a single pass both
    # flattens the text to one line and removes excessive spaces.
    text = re.sub(r'\s+', ' ', text)
    # Strip last so nothing re-introduces leading or trailing whitespace.
    return text.strip()
def preprocess_data(df):
    """Clean every entry of the ``'comment_text'`` column of ``df``.

    The column is overwritten on the object passed in, with ``clean_text``
    applied to each of its values; that same object is then returned.

    Args:
        df: An object indexable by ``'comment_text'`` whose column supports
            ``.apply`` (presumably a pandas DataFrame; confirm with callers).

    Returns:
        ``df`` itself, with the ``'comment_text'`` column replaced.
    """
    column = 'comment_text'
    cleaned = df[column].apply(clean_text)
    df[column] = cleaned
    return df