Spaces:
No application file
No application file
File size: 1,145 Bytes
94e649c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import re
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
class Preprocessor:
    """Turn raw text strings into cleaned, lemmatized, space-joined strings.

    The pipeline (see ``preprocess``) is:

    1. split each text on whitespace,
    2. keep only tokens made entirely of non-digit word characters,
    3. drop English stop words (case-insensitive comparison),
    4. lemmatize each remaining token with WordNet using the ``'v'`` tag,
    5. join the result back into a single space-separated string.
    """

    # A token is kept only if this matches from its first character to its
    # end, i.e. every character is a word character that is not a digit
    # (letters and underscore). Tokens such as "42", "b2" or "old!" fail.
    _TOKEN_PATTERN = re.compile(r'[^\W\d]*$')

    # Per-instance caches, filled on first use. They default to None at class
    # level so the methods still work on instances whose __init__ was skipped.
    _stop_words = None
    _lemmatizer = None

    def __init__(self) -> None:
        """Download the NLTK resources 'stopwords', 'wordnet' and 'omw-1.4'."""
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('omw-1.4')

    def _get_stop_words(self):
        """Return the English stop words as a frozenset, loading them once."""
        if self._stop_words is None:
            # Loading once avoids calling stopwords.words('english') for every
            # token, and a set gives constant-time membership tests.
            self._stop_words = frozenset(stopwords.words('english'))
        return self._stop_words

    def _get_lemmatizer(self):
        """Return the WordNetLemmatizer for this instance, creating it once."""
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()
        return self._lemmatizer

    def tokenize_and_remove_stopwords(self, text: str) -> list:
        """Split ``text`` on whitespace and filter out unwanted tokens.

        Args:
            text: The raw string to tokenize.

        Returns:
            A list of the tokens, in original order and original casing, that
            consist only of non-digit word characters and whose lowercase
            form is not an English stop word.
        """
        stop_words = self._get_stop_words()
        is_clean = self._TOKEN_PATTERN.match
        return [
            token
            for token in text.split()
            if is_clean(token) and token.lower() not in stop_words
        ]

    def normalization(self, text) -> str:
        """Lemmatize every word and join the results with single spaces.

        Args:
            text: An iterable of word strings (despite the name, this is the
                token list produced by ``tokenize_and_remove_stopwords``).

        Returns:
            The lemmatized words joined by one space, with leading and
            trailing whitespace stripped. Empty input yields ``""``.
        """
        lemmatizer = self._get_lemmatizer()
        # 'v' is passed as the part-of-speech argument, so words are
        # lemmatized as verbs.
        lemmas = [lemmatizer.lemmatize(word, 'v') for word in text]
        return " ".join(lemmas).strip()

    def preprocess(self, textlist) -> list:
        """Run the full cleaning pipeline over each string in ``textlist``.

        Args:
            textlist: An iterable of raw text strings.

        Returns:
            A list with one cleaned, lemmatized string per input text, in the
            same order. A text with no surviving tokens maps to ``""``.
        """
        return [
            self.normalization(self.tokenize_and_remove_stopwords(text))
            for text in textlist
        ]