Spaces:
Sleeping
Sleeping
File size: 5,071 Bytes
155aa38 249b47a ff1e8ec 155aa38 97784bd 155aa38 97784bd 155aa38 97784bd 155aa38 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import joblib
import numpy as np
import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from sklearn.linear_model import LogisticRegression
#nltk.data.path.append("/app/nltk_data")
#nltk.download('twitter_samples')
#nltk.download('stopwords')
# Load the NLTK twitter_samples corpus: 5,000 positive and 5,000 negative
# tweets (requires the commented-out nltk.download calls above on first run).
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
class LogisticRegressionModel:
    """Tweet sentiment classifier: word-frequency features + sklearn logistic regression.

    Each tweet is mapped to a (1, 3) vector ``[bias, sum of positive-word
    counts, sum of negative-word counts]``.  The bias column is dropped
    before fitting/predicting because sklearn's LogisticRegression adds
    its own intercept term.
    """

    def __init__(self):
        # Use the first 4000 tweets of each polarity as the training set.
        train_pos = all_positive_tweets[:4000]
        train_neg = all_negative_tweets[:4000]
        self.train_x = train_pos + train_neg
        # Labels shaped (m, 1): 1.0 for positive tweets, 0.0 for negative.
        self.train_y = np.append(
            np.ones((len(train_pos), 1)),
            np.zeros((len(train_neg), 1)),
            axis=0,
        )
        self.freqs = LogisticRegressionModel.build_freqs(self.train_x, self.train_y)
        try:
            # Reuse a previously trained model if one was cached on disk.
            self.model = joblib.load("sk_logreg.pkl")
        except (OSError, EOFError, ValueError):
            # Narrowed from a bare `except:` so programming errors and
            # KeyboardInterrupt are no longer silently swallowed; these
            # cover a missing or corrupted/truncated pickle file.
            self.model = LogisticRegressionModel.train(self.train_x, self.train_y, self.freqs)
            joblib.dump(self.model, "sk_logreg.pkl")

    def predict(self, query):
        """Return class probabilities for one tweet string.

        Input:
            :query: a tweet as a string
        Output:
            array of shape (1, 2): columns are [P(negative), P(positive)]
        """
        features = LogisticRegressionModel.extract_features(query, self.freqs)
        # Drop the leading bias column; the fitted model has its own intercept.
        return self.model.predict_proba(features[:, 1:])

    @staticmethod
    def train(train_x, train_y, freqs):
        """Fit a LogisticRegression on frequency features of each tweet.

        Input:
            :train_x: list of tweet strings
            :train_y: (m, 1) array of 0/1 labels
            :freqs: (word, sentiment) -> count dictionary
        Output:
            a fitted sklearn LogisticRegression model
        """
        train_x_vec = np.vstack(
            [LogisticRegressionModel.extract_features(t, freqs) for t in train_x]
        )
        model = LogisticRegression()
        # Skip column 0 (bias); ravel() gives the 1-D label vector sklearn expects.
        model.fit(train_x_vec[:, 1:], train_y.ravel())
        return model

    @staticmethod
    def process_tweet(tweet):
        """Clean, tokenize and stem one tweet.

        Input:
            :tweet: a string
        Output:
            :tweets_clean: a list of lowercased, stemmed tokens with
                stopwords, punctuation, tickers, RT markers, links and
                '#' signs removed
        """
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')
        # remove stock market tickers like $GE
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
        # remove only the hash # sign from hashtags (keep the word)
        tweet = re.sub(r'#', '', tweet)
        # the tokenizer downcases everything except emoticons
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
        tweet_tokens = tokenizer.tokenize(tweet)
        # keep non-stopword, non-punctuation tokens, stemmed
        return [
            stemmer.stem(word)
            for word in tweet_tokens
            if word not in stopwords_english and word not in string.punctuation
        ]

    @staticmethod
    def build_freqs(tweets, ys):
        """Build frequencies.

        Input:
            :tweets: a list of tweets
            :ys: an mx1 array with the sentiment label of each tweet (either 0 or 1)
        Output:
            :freqs: a dictionary mapping each (word, sentiment) pair to its frequency
        """
        # squeeze to 1-D so zip pairs each tweet with its scalar label
        yslist = np.squeeze(ys).tolist()
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in LogisticRegressionModel.process_tweet(tweet):
                pair = (word, y)
                freqs[pair] = freqs.get(pair, 0) + 1
        return freqs

    @staticmethod
    def extract_features(tweet, freqs, process_tweet=None):
        '''
        Input:
            :tweet: a tweet string (or anything the tokenizer accepts)
            :freqs: a dictionary mapping (word, label) pairs to frequencies
            :process_tweet: optional tokenizer override; defaults to
                LogisticRegressionModel.process_tweet.  (The old default
                bound the staticmethod object at class-body time, which
                is not callable on Python < 3.10.)
        Output:
            :x: a feature vector of shape (1, 3): [bias, pos count, neg count]
        '''
        if process_tweet is None:
            process_tweet = LogisticRegressionModel.process_tweet
        # process_tweet tokenizes, stems, and removes stopwords
        word_l = process_tweet(tweet)
        x = np.zeros((1, 3))
        # bias term is set to 1 (dropped again before the sklearn model)
        x[0, 0] = 1
        for word in word_l:
            # accumulate corpus counts of this word under each label;
            # int keys 1/0 match the float labels 1.0/0.0 by numeric equality
            x[0, 1] += freqs.get((word, 1), 0)
            x[0, 2] += freqs.get((word, 0), 0)
        assert x.shape == (1, 3)
        return x
if __name__ == "__main__":
    # Example usage: build/load the model, classify one sample tweet,
    # and report probabilities plus the thresholded sentiment label.
    # (Removed a stray trailing "|" artifact that broke the final line.)
    lr_instance = LogisticRegressionModel()
    test_tweet = "I am happy happy happy!"
    prediction = lr_instance.predict(test_tweet)
    print(f"Tweet: {test_tweet}")
    print(f"Prediction (probabilities for [neg, pos]): {prediction}")
    # Column 1 is P(positive); classify positive at the 0.5 threshold.
    print(f"Predicted sentiment: {'Positive' if prediction[0][1] >= 0.5 else 'Negative'}")