nlp-project / utils /preprocessing_pipeline.py
Julseb42's picture
Test deploy
8e83170
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
# One-time setup: the NLTK data below must be downloaded once per environment (uncomment, run, then re-comment)
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
# Module-level NLTK resources, created once at import time and read by
# full_preprocess_reviews.
# English stopwords held in a set; membership is tested once per token.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# installed already (see the commented-out downloads) — confirm.
stop_words = set(stopwords.words("english"))
# Single WordNet lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()
def full_preprocess_reviews(df, column="reviews.text"):
    """Clean, tokenize and lemmatize the review text held in ``df[column]``.

    Steps, in order:

    1. Drop rows whose ``column`` value is missing.
    2. Drop rows whose ``column`` value duplicates an earlier row.
    3. Coerce the text to ``str``, lower-case it, delete punctuation and
       digits, and strip surrounding whitespace.
    4. Tokenize, remove English stopwords, lemmatize each token, and join
       the tokens back together with single spaces.

    Args:
        df: DataFrame containing the review text. It is not modified.
        column: Label of the text column to clean.

    Returns:
        A new DataFrame with ``column`` replaced by the cleaned text and a
        fresh ``0..n-1`` index. Rows whose text ends up empty after
        cleaning are kept; only missing values and duplicates are removed.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    # Filter first, then take an explicit copy: assigning a column onto the
    # result of a row filter can emit SettingWithCopyWarning, and the copy
    # makes it unambiguous that the caller's frame is never written to.
    df = df.dropna(subset=[column])
    df = df.drop_duplicates(subset=[column]).copy()

    # Coerce to str before using the .str accessor. Non-string cells would
    # otherwise become NaN and crash the tokenizer, and a column left with
    # no strings at all (e.g. empty after dropna) would reject .str outright.
    #
    # Punctuation ([^\w\s]) and digits (\d) are disjoint character classes,
    # so deleting both in one pass gives the same text as two passes.
    # NOTE(review): punctuation is removed *before* stopword filtering, so
    # contractions lose their apostrophe ("don't" -> "dont") and may no
    # longer match the stopword list — confirm this is intended.
    text = (
        df[column]
        .astype(str)
        .str.lower()
        .str.replace(r"[^\w\s]|\d+", "", regex=True)
        .str.strip()
    )

    lemmatize = lemmatizer.lemmatize

    def clean_text(raw):
        """Tokenize *raw*, drop stopwords and return the lemmas space-joined."""
        return " ".join(
            lemmatize(token)
            for token in word_tokenize(raw)
            if token not in stop_words
        )

    df[column] = text.apply(clean_text)
    return df.reset_index(drop=True)