tlong-ds commited on
Commit
155aa38
·
1 Parent(s): afffb6d

the model file

Browse files
Files changed (1) hide show
  1. model.py +142 -0
model.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import numpy as np
3
+ import re
4
+ import string
5
+
6
+ import nltk
7
+ from nltk.stem import PorterStemmer
8
+ from nltk.tokenize import TweetTokenizer
9
+ from nltk.corpus import stopwords, twitter_samples
10
+ from sklearn.linear_model import LogisticRegression
11
+
12
+ nltk.download('twitter_samples')
13
+ nltk.download('stopwords')
14
+ all_positive_tweets = twitter_samples.strings('positive_tweets.json')
15
+ all_negative_tweets = twitter_samples.strings('negative_tweets.json')
16
+
17
class LogisticRegressionModel:
    """Binary tweet-sentiment classifier.

    Builds a (word, label) frequency table from the NLTK ``twitter_samples``
    corpus and fits a scikit-learn ``LogisticRegression`` on 3-dimensional
    feature vectors: [bias, sum of positive-label frequencies, sum of
    negative-label frequencies] per tweet.
    """

    def __init__(self):
        # Use the first 4000 tweets of each polarity as the training split;
        # the remainder of the corpus is left for evaluation elsewhere.
        train_pos = all_positive_tweets[:4000]
        train_neg = all_negative_tweets[:4000]
        self.train_x = train_pos + train_neg
        # Labels as an (m, 1) column: 1.0 for positive, 0.0 for negative.
        self.train_y = np.append(
            np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0
        )
        self.freqs = LogisticRegressionModel.build_freqs(self.train_x, self.train_y)
        try:
            # Reuse a previously trained model cached on disk, if any.
            self.model = joblib.load("sk_logreg.pkl")
        except Exception:
            # Cache missing or unreadable — retrain and re-cache.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; `Exception` keeps the best-effort behavior
            # without trapping interpreter-exit signals.)
            self.model = LogisticRegressionModel.train(
                self.train_x, self.train_y, self.freqs
            )
            joblib.dump(self.model, "sk_logreg.pkl")

    def predict(self, query):
        """Return class probabilities for one tweet string.

        Input:
            :query: a tweet as a string
        Output:
            the model's ``predict_proba`` result, shape (1, 2) —
            probabilities for [negative, positive]
        """
        features = LogisticRegressionModel.extract_features(query, self.freqs)
        return self.model.predict_proba(features)

    @staticmethod
    def train(train_x, train_y, freqs):
        """Fit a LogisticRegression on extracted (1, 3) features.

        Input:
            :train_x: list of tweet strings
            :train_y: (m, 1) array of 0/1 labels
            :freqs: (word, label) -> count dictionary from build_freqs
        Output:
            a fitted sklearn LogisticRegression model
        """
        train_x_vec = np.zeros((len(train_x), 3))
        for i in range(len(train_x)):
            train_x_vec[i, :] = LogisticRegressionModel.extract_features(
                train_x[i], freqs
            )

        model = LogisticRegression()
        # ravel() flattens (m, 1) -> (m,) as sklearn expects for y.
        model.fit(train_x_vec, train_y.ravel())

        return model

    @staticmethod
    def process_tweet(tweet):
        """Clean, tokenize, and stem one tweet.

        Input:
            :tweet: a string
        Output:
            :tweets_clean: a list of stemmed tokens with tickers, RT
            markers, URLs, the '#' sign, stopwords, and punctuation removed
        """
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')

        # remove stock market tickers like $GE
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
        # remove hashtags: only the '#' sign, the word itself is kept
        tweet = re.sub(r'#', '', tweet)

        # the tokenizer downcases everything except emoticons, strips
        # @handles, and collapses repeated characters ("looool" -> "lool")
        tokenizer = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True
        )
        tweet_tokens = tokenizer.tokenize(tweet)

        tweets_clean = []
        for word in tweet_tokens:
            if (word not in stopwords_english and      # remove stopwords
                    word not in string.punctuation):   # remove punctuation
                tweets_clean.append(stemmer.stem(word))

        return tweets_clean

    @staticmethod
    def build_freqs(tweets, ys):
        """Build frequencies.

        Input:
            :tweets: a list of tweets
            :ys: an mx1 array with the sentiment label of each tweet
                 (either 0 or 1)
        Output:
            :freqs: a dictionary mapping each (word, sentiment) pair to
                    its frequency
        """
        # squeeze (m, 1) -> (m,) so labels zip one-to-one with tweets
        yslist = np.squeeze(ys).tolist()
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in LogisticRegressionModel.process_tweet(tweet):
                pair = (word, y)
                freqs[pair] = freqs.get(pair, 0) + 1

        return freqs

    @staticmethod
    def extract_features(tweet, freqs, process_tweet=None):
        """Convert one tweet into a (1, 3) feature vector.

        Input:
            :tweet: a tweet string
            :freqs: (word, label) -> count dictionary
            :process_tweet: optional tokenizer override; defaults to
                LogisticRegressionModel.process_tweet
        Output:
            :x: a feature vector of dimension (1, 3):
                [bias=1, positive-frequency sum, negative-frequency sum]
        """
        # BUGFIX: the default used to be the in-class `staticmethod` object
        # itself, which is not callable before Python 3.10. Resolve it
        # lazily instead; callers passing their own callable are unaffected.
        if process_tweet is None:
            process_tweet = LogisticRegressionModel.process_tweet

        # process_tweet tokenizes, stems, and removes stopwords
        word_l = process_tweet(tweet)

        x = np.zeros((1, 3))
        # bias term is set to 1
        x[0, 0] = 1

        for word in word_l:
            # increment the word count for the positive label 1
            if (word, 1) in freqs:
                x[0, 1] += freqs[(word, 1)]
            # increment the word count for the negative label 0
            if (word, 0) in freqs:
                x[0, 2] += freqs[(word, 0)]

        assert x.shape == (1, 3)
        return x
133
+
134
+
135
if __name__ == "__main__":
    # Demo: load (or train) the classifier and score a single tweet.
    classifier = LogisticRegressionModel()
    sample_tweet = "I am happy happy happy!"
    probabilities = classifier.predict(sample_tweet)
    print(f"Tweet: {sample_tweet}")
    print(f"Prediction (probabilities for [neg, pos]): {probabilities}")
    label = 'Positive' if probabilities[0][1] >= 0.5 else 'Negative'
    print(f"Predicted sentiment: {label}")