File size: 1,200 Bytes
3ccf31a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re

# One-time setup: the tokenizer, stopword list and lemmatizer used in this
# module rely on NLTK data packages. Uncomment and run these lines once, then
# comment them out again after the first successful run.
# NOTE(review): newer NLTK releases may also require 'punkt_tab' — confirm
# against the installed version.
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Shared resources, built once at import time and reused for every review.
# The stopword list is stored as a set so the per-token membership test in
# the cleaning step is a hash lookup rather than a list scan.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def full_preprocess_reviews(df, column="reviews.text"):
    """Return a cleaned copy of *df* whose ``column`` holds normalized text.

    Steps, in order:

    1. Drop rows where ``column`` is missing.
    2. Drop rows whose raw ``column`` value duplicates an earlier row.
    3. Lowercase the text, delete punctuation and digits, strip outer
       whitespace.
    4. Tokenize, remove English stopwords, lemmatize each remaining token
       and re-join the tokens with single spaces.

    Args:
        df: DataFrame containing the review text column. It is not modified.
        column: Name of the text column to clean.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index. De-duplication is done
        on the raw text only, so distinct raw reviews that normalize to the
        same string are both kept, and a review consisting solely of
        stopwords, digits or punctuation becomes an empty string.

    Raises:
        KeyError: If ``column`` is not a column of ``df`` (raised by pandas).
    """
    # dropna/drop_duplicates can return an object that pandas treats as a
    # slice of the input; the explicit copy makes the column assignment below
    # unambiguous (no SettingWithCopyWarning) and guarantees the caller's
    # frame is left untouched.
    cleaned = (
        df.dropna(subset=[column])
        .drop_duplicates(subset=[column])
        .copy()
    )

    # Coerce to str so stray non-string cells (e.g. numbers in an object
    # column) are cleaned like text instead of turning into NaN under the
    # .str accessor and crashing the tokenizer.
    text = cleaned[column].astype(str).str.lower()

    # A single pass deletes every character that is either punctuation/symbol
    # (not a word character and not whitespace) or a digit.
    text = text.str.replace(r"[^\w\s]|\d", "", regex=True).str.strip()

    def clean_text(raw):
        """Tokenize *raw*, drop stopwords, lemmatize, and re-join."""
        kept = (token for token in word_tokenize(raw) if token not in stop_words)
        return " ".join(lemmatizer.lemmatize(token) for token in kept)

    cleaned[column] = text.apply(clean_text)

    return cleaned.reset_index(drop=True)