File size: 5,071 Bytes
155aa38
 
 
 
 
 
 
 
 
 
 
249b47a
ff1e8ec
 
155aa38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97784bd
155aa38
 
 
 
97784bd
155aa38
 
97784bd
155aa38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import joblib
import numpy as np
import re
import string

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from sklearn.linear_model import LogisticRegression

# One-time setup: point NLTK at a local data dir and fetch the corpora.
# Uncomment these on first run (or in a fresh environment).
#nltk.data.path.append("/app/nltk_data")
#nltk.download('twitter_samples')
#nltk.download('stopwords')
# Load the labeled tweet strings from NLTK's twitter_samples corpus.
# Requires the corpus to already be downloaded (see the lines above);
# raises a LookupError from NLTK otherwise.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

class LogisticRegressionModel:
    """Tweet sentiment classifier backed by scikit-learn's LogisticRegression.

    Features per tweet are a 1x3 vector: [bias, sum of positive-class word
    frequencies, sum of negative-class word frequencies], where frequencies
    come from a (word, label) -> count table built over the training tweets.
    A fitted model is cached to ``sk_logreg.pkl`` and reloaded on later runs.
    """

    def __init__(self):
        # Training split: first 4000 tweets of each polarity from the
        # module-level twitter_samples data (positives first, then negatives).
        train_pos = all_positive_tweets[:4000]
        train_neg = all_negative_tweets[:4000]
        self.train_x = train_pos + train_neg
        # Labels as an (m, 1) column: 1.0 = positive, 0.0 = negative.
        self.train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
        self.freqs = LogisticRegressionModel.build_freqs(self.train_x, self.train_y)
        try:
            self.model = joblib.load("sk_logreg.pkl")
        except Exception:
            # Any load failure (missing file, unreadable/corrupt pickle)
            # falls back to training from scratch; cache the result so the
            # next run skips training. Deliberately broad — this is a
            # best-effort cache, never a hard dependency.
            self.model = LogisticRegressionModel.train(self.train_x, self.train_y, self.freqs)
            joblib.dump(self.model, "sk_logreg.pkl")

    def predict(self, query):
        """Return class probabilities for a single tweet.

        Input:
            :query: a tweet as a string
        Output:
            an array of shape (1, 2): probabilities for [negative, positive]
        """
        features = LogisticRegressionModel.extract_features(query, self.freqs)
        # Drop column 0 (the constant bias): sklearn's LogisticRegression
        # fits its own intercept, so the model was trained without it.
        return self.model.predict_proba(features[:, 1:])

    @staticmethod
    def train(train_x, train_y, freqs):
        """Fit a LogisticRegression on the frequency features of train_x.

        Input:
            :train_x: list of tweet strings
            :train_y: (m, 1) array of 0/1 labels
            :freqs: (word, label) -> count dictionary
        Output:
            a fitted sklearn LogisticRegression model
        """
        train_x_vec = np.vstack(
            [LogisticRegressionModel.extract_features(t, freqs) for t in train_x]
        )
        model = LogisticRegression()
        # Train on the two frequency columns only — see predict() for why
        # the bias column is dropped.
        model.fit(train_x_vec[:, 1:], train_y.ravel())
        return model

    @staticmethod
    def process_tweet(tweet):
        """Clean, tokenize, and stem one tweet.

        Input:
            :tweet: a string
        Output:
            :tweets_clean: a list of stemmed word tokens with URLs, tickers,
                "RT" prefixes, handles, stopwords, and punctuation removed
        """
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')

        # remove stock market tickers like $GE
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
        # remove only the hash # sign, keeping the hashtag word
        tweet = re.sub(r'#', '', tweet)

        # the tokenizer lowercases everything except emoticons, strips
        # @handles, and collapses repeated characters (e.g. "soooo" -> "soo")
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
        tweet_tokens = tokenizer.tokenize(tweet)

        tweets_clean = []
        for word in tweet_tokens:
            if (word not in stopwords_english and   # remove stopwords
                    word not in string.punctuation): # remove punctuation
                tweets_clean.append(stemmer.stem(word))

        return tweets_clean

    @staticmethod
    def build_freqs(tweets, ys):
        """Build the (word, sentiment) -> count frequency table.

        Input:
            :tweets: a list of tweet strings
            :ys: an mx1 array with the sentiment label of each tweet (0 or 1)
        Output:
            :freqs: a dictionary mapping each (word, sentiment) pair to its
                frequency across all tweets
        """
        yslist = np.squeeze(ys).tolist()
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in LogisticRegressionModel.process_tweet(tweet):
                pair = (word, y)
                freqs[pair] = freqs.get(pair, 0) + 1

        return freqs

    @staticmethod
    def extract_features(tweet, freqs, process_tweet=None):
        """Turn one tweet into its 1x3 feature vector.

        Input:
            :tweet: a tweet as a string
            :freqs: (word, label) -> count dictionary
            :process_tweet: optional tokenizer callable; defaults to
                LogisticRegressionModel.process_tweet. (The old default bound
                the raw staticmethod object, which is only callable directly
                on Python >= 3.10.)
        Output:
            :x: a feature vector of shape (1, 3):
                [bias=1, positive freq sum, negative freq sum]
        """
        if process_tweet is None:
            process_tweet = LogisticRegressionModel.process_tweet
        # process_tweet tokenizes, stems, and removes stopwords
        word_l = process_tweet(tweet)

        x = np.zeros((1, 3))
        # bias term is always 1
        x[0, 0] = 1
        for word in word_l:
            # accumulate counts toward the positive (label 1) and
            # negative (label 0) feature columns; unseen pairs add 0
            x[0, 1] += freqs.get((word, 1), 0)
            x[0, 2] += freqs.get((word, 0), 0)

        return x

if __name__ == "__main__":
    # Example usage
    lr_instance = LogisticRegressionModel()
    test_tweet = "I am happy happy happy!"
    prediction = lr_instance.predict(test_tweet)
    print(f"Tweet: {test_tweet}")
    print(f"Prediction (probabilities for [neg, pos]): {prediction}")
    print(f"Predicted sentiment: {'Positive' if prediction[0][1] >= 0.5 else 'Negative'}")