Spaces:
No application file
No application file
| import re | |
| import nltk | |
| from nltk import WordNetLemmatizer | |
| from nltk.corpus import stopwords | |
class Preprocessor:
    """Text clean-up pipeline built on NLTK.

    Each input string is split on whitespace, stripped of tokens that
    contain digits or punctuation, filtered against NLTK's English stopword
    list, and finally lemmatized word by word with the verb part of speech.
    """

    # A token is kept only if it consists entirely of "word" characters that
    # are not digits (letters and underscore); anything containing a digit
    # or punctuation is rejected.
    _ALPHA_TOKEN = re.compile(r'[^\W\d]*$')

    def __init__(self) -> None:
        """Download the NLTK resources used by this class.

        The stopword list is read by ``tokenize_and_remove_stopwords`` and
        the WordNet data backs the lemmatizer used in ``normalization``.
        """
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('omw-1.4')

    def _stopword_set(self):
        """Return the English stopwords as a frozenset, loaded once per instance."""
        # getattr with a default keeps this working even when the instance
        # was created without running __init__.
        cached = getattr(self, '_stopwords_cache', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            self._stopwords_cache = cached
        return cached

    def tokenize_and_remove_stopwords(self, text):
        """Split *text* on whitespace and drop unwanted tokens.

        Args:
            text: The raw string to tokenize.

        Returns:
            A list of the tokens, in their original order and casing, that
            contain no digits or punctuation and whose lowercase form is
            not an English stopword.
        """
        is_alpha = self._ALPHA_TOKEN.match
        candidates = [token for token in text.split() if is_alpha(token)]
        if not candidates:
            # Nothing survived the character filter; skip loading stopwords.
            return []

        # Load the stopword list once instead of once per token, and use a
        # set so each membership test is O(1).
        stop_words = self._stopword_set()
        return [word for word in candidates if word.lower() not in stop_words]

    def normalization(self, text):
        """Lemmatize every word in *text* as a verb and join the results.

        Args:
            text: An iterable of word strings (as produced by
                ``tokenize_and_remove_stopwords``).

        Returns:
            The lemmatized words joined by single spaces, with leading and
            trailing whitespace removed.
        """
        lem = WordNetLemmatizer()
        return " ".join(lem.lemmatize(word, 'v') for word in text).strip()

    def preprocess(self, textlist):
        """Run the full pipeline over every string in *textlist*.

        Args:
            textlist: An iterable of raw text strings.

        Returns:
            A list with one cleaned, lemmatized string per input, in the
            same order as the input.
        """
        return [
            self.normalization(self.tokenize_and_remove_stopwords(text))
            for text in textlist
        ]