Spaces:
Sleeping
Sleeping
| def clean_text(text): | |
| import re | |
| import ftfy | |
| # Replace newlines, tabs, carriage returns with space | |
| text = re.sub(r'[\n\r\t]', ' ', text) | |
| # Strip leading and trailing whitespace | |
| text = text.strip() | |
| # Remove excessive spaces | |
| text = re.sub(r'\s+', ' ', text) | |
| # Fix encoding artifacts | |
| text = ftfy.fix_text(text) | |
| return text | |
| def preprocess_data(df): | |
| # Apply cleaning to the 'comment_text' column | |
| df['comment_text'] = df['comment_text'].apply(clean_text) | |
| return df |