Spaces:

tlong-ds
/

sentiment-analysis-api

Sleeping

App Files Files Community

sentiment-analysis-api / model.py

tlong-ds

Update model.py

249b47a verified 4 months ago

raw

history blame contribute delete

5.07 kB

	import joblib
	import numpy as np
	import re
	import string

	import nltk
	from nltk.stem import PorterStemmer
	from nltk.tokenize import TweetTokenizer
	from nltk.corpus import stopwords, twitter_samples
	from sklearn.linear_model import LogisticRegression

	#nltk.data.path.append("/app/nltk_data")
	#nltk.download('twitter_samples')
	#nltk.download('stopwords')
	all_positive_tweets = twitter_samples.strings('positive_tweets.json')
	all_negative_tweets = twitter_samples.strings('negative_tweets.json')

	class LogisticRegressionModel:
	def __init__(self):
	# split data into train and test set
	train_pos = all_positive_tweets[:4000]
	train_neg = all_negative_tweets[:4000]
	self.train_x = train_pos + train_neg
	self.train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
	self.freqs = LogisticRegressionModel.build_freqs(self.train_x, self.train_y)
	try:
	self.model = joblib.load("sk_logreg.pkl")
	except:
	self.model = LogisticRegressionModel.train(self.train_x, self.train_y, self.freqs)
	joblib.dump(self.model, "sk_logreg.pkl")

	def predict(self, query):
	features = LogisticRegressionModel.extract_features(query, self.freqs)
	result = self.model.predict_proba(features[:, 1:])
	return result

	@staticmethod
	def train(train_x, train_y, freqs):
	train_x_vec = np.vstack([LogisticRegressionModel.extract_features(t,freqs) for t in train_x])

	model = LogisticRegression()
	model.fit(train_x_vec[:, 1:], train_y.ravel())

	return model

	@staticmethod
	def process_tweet(tweet):
	"""
	Input:
	:tweet: a string
	Output:
	:tweets_clean: a list of words containing the processed tweet
	"""
	stemmer = PorterStemmer()
	stopwords_english = stopwords.words('english')

	# remove stock market tickers like $GE
	tweet = re.sub(r'\$\w*', '', tweet)
	# remove old style retweet text "RT"
	tweet = re.sub(r'^RT[\s]+', '', tweet)
	# remove hyperlinks
	tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
	# remove hashtags
	# only removing the hash # sign from the word
	tweet = re.sub(r'#', '', tweet)

	# tokenize tweets
	tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) #the tokenizer will downcase everything except for emoticons
	tweet_tokens = tokenizer.tokenize(tweet)

	tweets_clean = []
	for word in tweet_tokens:
	if (word not in stopwords_english and # remove stopwords
	word not in string.punctuation): # remove punctuation
	stem_word = stemmer.stem(word)
	tweets_clean.append(stem_word)

	return tweets_clean

	@staticmethod
	def build_freqs(tweets, ys):
	""" Build frequencies
	Input:
	tweets: a list of tweets
	ys: an mx1 array with the sentiment label of each tweet (either 0 or 1)
	Output:
	freqs: a dictionary mapping each (word, sentiment) pair to its frequency
	"""
	yslist = np.squeeze(ys).tolist()
	# start with an empty dict and populate it by looping over all tweets
	freqs = {}
	for y, tweet in zip(yslist, tweets):
	for word in LogisticRegressionModel.process_tweet(tweet):
	pair = (word, y)
	if pair in freqs:
	freqs[pair] += 1
	else:
	freqs[pair] = 1

	return freqs

	@staticmethod
	def extract_features(tweet, freqs, process_tweet=process_tweet):
	'''
	Input:
	tweet: a list of words for one tweet
	freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
	Output:
	x: a feature vector of dimension (1,3)
	'''
	# process_tweet tokenizes, stems, and removes stopwords
	word_l = process_tweet(tweet)

	# 3 elements in the form of a 1 x 3 vector
	x = np.zeros((1, 3))

	#bias term is set to 1
	x[0,0] = 1
	# loop through each word in the list of words
	for word in word_l:

	# increment the word count for the positive label 1
	if (word, 1) in freqs.keys():
	x[0,1] += freqs[(word, 1)]

	# increment the word count for the negative label 0
	if (word, 0) in freqs.keys():
	x[0,2] += freqs[(word, 0)]

	assert(x.shape == (1, 3))
	return x


	if __name__ == "__main__":
	# Example usage
	lr_instance = LogisticRegressionModel()
	test_tweet = "I am happy happy happy!"
	prediction = lr_instance.predict(test_tweet)
	print(f"Tweet: {test_tweet}")
	print(f"Prediction (probabilities for [neg, pos]): {prediction}")
	print(f"Predicted sentiment: {'Positive' if prediction[0][1] >= 0.5 else 'Negative'}")