Spaces:
Sleeping
Sleeping
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.stem import WordNetLemmatizer | |
| import pandas as pd | |
| import re | |
| # You only need to run these once — can comment out after first successful run | |
| #nltk.download('punkt') | |
| #nltk.download('stopwords') | |
| #nltk.download('wordnet') | |
| # Init | |
| stop_words = set(stopwords.words("english")) | |
| lemmatizer = WordNetLemmatizer() | |
| def full_preprocess_reviews(df, column="reviews.text"): | |
| # Drop rows with empty reviews | |
| df = df.dropna(subset=[column]) | |
| # Remove duplicates | |
| df = df.drop_duplicates(subset=[column]) | |
| # 3. Clean text | |
| df[column] = df[column].str.lower() | |
| df[column] = df[column].str.replace(r"[^\w\s]", "", regex=True) | |
| df[column] = df[column].str.replace(r"\d+", "", regex=True) | |
| df[column] = df[column].str.strip() | |
| # Tokenize, remove stopwords, lemmatize | |
| def clean_text(text): | |
| tokens = word_tokenize(text) | |
| tokens = [word for word in tokens if word not in stop_words] | |
| lemmatized = [lemmatizer.lemmatize(word) for word in tokens] | |
| return " ".join(lemmatized) | |
| df[column] = df[column].apply(clean_text) | |
| return df.reset_index(drop=True) | |