import re

import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords


class Preprocessor:
    """Tokenize, filter, and lemmatize raw text documents.

    Pipeline per document: whitespace tokenization -> drop tokens containing
    digits or non-word characters -> remove English stopwords -> lemmatize
    each remaining token as a verb -> re-join into one string.
    """

    def __init__(self) -> None:
        # Fetch the NLTK corpora the methods below rely on.
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('omw-1.4')
        # Hoisted once per instance:
        #  - the stopword SET gives O(1) membership tests (the original
        #    rebuilt a list and scanned it for every single word);
        #  - one shared lemmatizer instead of a new one per call.
        self._stopwords = set(stopwords.words('english'))
        self._lemmatizer = WordNetLemmatizer()

    def tokenize_and_remove_stopwords(self, text):
        """Split *text* on whitespace and filter the tokens.

        Tokens containing digits or non-word characters (punctuation,
        apostrophes, ...) are dropped, then English stopwords are removed
        (case-insensitively; surviving tokens keep their original casing).

        Returns a list of token strings.
        """
        # str.split() never yields empty tokens, so the `*` in the pattern
        # cannot accidentally admit an empty string here.
        clean_tokens = [t for t in text.split() if re.match(r'[^\W\d]*$', t)]
        # Set membership is O(1) per token vs. the original O(n) list scan.
        return [w for w in clean_tokens if w.lower() not in self._stopwords]

    def normalization(self, text):
        """Lemmatize each word in *text* (an iterable of tokens) as a verb.

        Returns the lemmatized words joined into a single space-separated
        string.
        """
        lem = self._lemmatizer
        # ' '.join is linear; the original `+=` accumulation is quadratic
        # on interpreters without CPython's in-place-concat optimization.
        return " ".join(lem.lemmatize(word, 'v') for word in text)

    def preprocess(self, textlist):
        """Run the full pipeline over every document in *textlist*.

        Returns a list with one preprocessed string per input document.
        """
        return [
            self.normalization(self.tokenize_and_remove_stopwords(text))
            for text in textlist
        ]