# sentiment-analysis / process_text.py
# (Hugging Face upload by "hazardous", commit "add files", 94e649c)
import re
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
class Preprocessor:
    """Text preprocessing pipeline: whitespace tokenization, alphabetic-token
    filtering, English stop-word removal, and verb lemmatization via WordNet.
    """

    # Accepts tokens consisting only of non-digit word characters
    # (i.e. purely alphabetic tokens). Compiled once instead of on
    # every call to tokenize_and_remove_stopwords.
    _ALPHA_TOKEN_RE = re.compile(r'[^\W\d]*$')

    def __init__(self) -> None:
        # Fetch the NLTK data the methods below rely on (no-op if cached).
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('omw-1.4')
        # Build the stop-word set once: set membership is O(1), whereas the
        # original scanned the list from stopwords.words() for every token.
        self._stopwords = set(stopwords.words('english'))
        # Reuse a single lemmatizer instead of constructing one per call.
        self._lemmatizer = WordNetLemmatizer()

    def tokenize_and_remove_stopwords(self, text):
        """Split *text* on whitespace, keep purely alphabetic tokens, and
        drop English stop words (matched case-insensitively).

        Returns the surviving tokens as a list of strings.
        """
        # str.split() never yields empty strings, so no further cleanup
        # (the original's join-then-resplit round trip) is needed.
        tokens = text.split()
        alpha_tokens = [t for t in tokens if self._ALPHA_TOKEN_RE.match(t)]
        return [w for w in alpha_tokens if w.lower() not in self._stopwords]

    def normalization(self, text):
        """Lemmatize each word in *text* (an iterable of tokens) as a verb
        ('v' POS tag) and return the results joined by single spaces.
        """
        # ' '.join over a generator replaces the quadratic `s += word + " "`
        # accumulation and needs no trailing strip().
        return ' '.join(self._lemmatizer.lemmatize(word, 'v') for word in text)

    def preprocess(self, textlist):
        """Run the full pipeline over every string in *textlist*.

        Returns a list of preprocessed strings, one per input string.
        """
        return [
            self.normalization(self.tokenize_and_remove_stopwords(text))
            for text in textlist
        ]