Spaces:

hazardous
/

sentiment-analysis

No application file

sentiment-analysis / process_text.py

add files

94e649c over 3 years ago

1.15 kB

	import re
	import nltk
	from nltk import WordNetLemmatizer
	from nltk.corpus import stopwords

	class Preprocessor:
		"""Text-cleaning pipeline built on NLTK.

		Each text is split on whitespace, reduced to tokens made only of
		non-digit word characters, stripped of English stopwords, and then
		lemmatized word by word using the verb part of speech.
		"""

		def __init__(self) -> None:
			"""Fetch the NLTK resources used by the other methods."""
			nltk.download('stopwords')
			nltk.download('wordnet')
			nltk.download('omw-1.4')

		def tokenize_and_remove_stopwords(self, text):
			"""Split ``text`` on whitespace and drop unwanted tokens.

			Args:
				text: The string to tokenize.

			Returns:
				A list of the surviving tokens, in their original order and
				casing. A token survives when it consists solely of word
				characters that are not digits and its lowercase form is not
				in NLTK's English stopword list.
			"""
			# Compile once so the pattern lookup is not repeated per token.
			is_word = re.compile(r'[^\W\d]*$').match
			clean_tokens = [t for t in text.split() if is_word(t)]
			if not clean_tokens:
				# Nothing passed the filter; no need to load the stopword list.
				return []
			# Fetch the stopword list once per call instead of once per token,
			# and use a set so each membership test is O(1).
			stop_words = set(stopwords.words('english'))
			return [word for word in clean_tokens if word.lower() not in stop_words]

		def normalization(self, text):
			"""Lemmatize every word as a verb and join the results.

			Args:
				text: An iterable of word strings, such as the list returned by
					``tokenize_and_remove_stopwords``.

			Returns:
				The lemmatized words joined by single spaces, with leading and
				trailing whitespace stripped.
			"""
			lem = WordNetLemmatizer()
			# 'v' asks the lemmatizer to treat each word as a verb.
			return " ".join(lem.lemmatize(word, 'v') for word in text).strip()

		def preprocess(self, textlist):
			"""Run the full cleaning pipeline over several texts.

			Args:
				textlist: An iterable of strings.

			Returns:
				A list with one cleaned, space-joined string per input text,
				in the same order as ``textlist``.
			"""
			return [
				self.normalization(self.tokenize_and_remove_stopwords(text))
				for text in textlist
			]