File size: 5,071 Bytes
155aa38
 
 
 
 
 
 
 
 
 
 
249b47a
ff1e8ec
 
155aa38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97784bd
155aa38
 
 
 
97784bd
155aa38
 
97784bd
155aa38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import joblib
import numpy as np
import re
import string

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from sklearn.linear_model import LogisticRegression

# One-time setup: point NLTK at a local data dir and fetch the corpora.
# Uncomment these on first run (or in a fresh environment).
#nltk.data.path.append("/app/nltk_data")
#nltk.download('twitter_samples')
#nltk.download('stopwords')
# Load the labeled tweet strings from NLTK's twitter_samples corpus.
# Requires the corpus to already be downloaded (see the lines above);
# raises a LookupError from NLTK otherwise.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

class LogisticRegressionModel:
    """Tweet sentiment classifier backed by scikit-learn's LogisticRegression.

    Features per tweet are a 1x3 vector: [bias, sum of positive-class word
    frequencies, sum of negative-class word frequencies], where frequencies
    come from a (word, label) -> count table built over the training tweets.
    A fitted model is cached to ``sk_logreg.pkl`` and reloaded on later runs.
    """

    def __init__(self):
        # Training split: first 4000 tweets of each polarity from the
        # module-level twitter_samples data (positives first, then negatives).
        train_pos = all_positive_tweets[:4000]
        train_neg = all_negative_tweets[:4000]
        self.train_x = train_pos + train_neg
        # Labels as an (m, 1) column: 1.0 = positive, 0.0 = negative.
        self.train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
        self.freqs = LogisticRegressionModel.build_freqs(self.train_x, self.train_y)
        try:
            self.model = joblib.load("sk_logreg.pkl")
        except Exception:
            # Any load failure (missing file, unreadable/corrupt pickle)
            # falls back to training from scratch; cache the result so the
            # next run skips training. Deliberately broad — this is a
            # best-effort cache, never a hard dependency.
            self.model = LogisticRegressionModel.train(self.train_x, self.train_y, self.freqs)
            joblib.dump(self.model, "sk_logreg.pkl")

    def predict(self, query):
        """Return class probabilities for a single tweet.

        Input:
            :query: a tweet as a string
        Output:
            an array of shape (1, 2): probabilities for [negative, positive]
        """
        features = LogisticRegressionModel.extract_features(query, self.freqs)
        # Drop column 0 (the constant bias): sklearn's LogisticRegression
        # fits its own intercept, so the model was trained without it.
        return self.model.predict_proba(features[:, 1:])

    @staticmethod
    def train(train_x, train_y, freqs):
        """Fit a LogisticRegression on the frequency features of train_x.

        Input:
            :train_x: list of tweet strings
            :train_y: (m, 1) array of 0/1 labels
            :freqs: (word, label) -> count dictionary
        Output:
            a fitted sklearn LogisticRegression model
        """
        train_x_vec = np.vstack(
            [LogisticRegressionModel.extract_features(t, freqs) for t in train_x]
        )
        model = LogisticRegression()
        # Train on the two frequency columns only — see predict() for why
        # the bias column is dropped.
        model.fit(train_x_vec[:, 1:], train_y.ravel())
        return model

    @staticmethod
    def process_tweet(tweet):
        """Clean, tokenize, and stem one tweet.

        Input:
            :tweet: a string
        Output:
            :tweets_clean: a list of stemmed word tokens with URLs, tickers,
                "RT" prefixes, handles, stopwords, and punctuation removed
        """
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')

        # remove stock market tickers like $GE
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
        # remove only the hash # sign, keeping the hashtag word
        tweet = re.sub(r'#', '', tweet)

        # the tokenizer lowercases everything except emoticons, strips
        # @handles, and collapses repeated characters (e.g. "soooo" -> "soo")
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
        tweet_tokens = tokenizer.tokenize(tweet)

        tweets_clean = []
        for word in tweet_tokens:
            if (word not in stopwords_english and   # remove stopwords
                    word not in string.punctuation): # remove punctuation
                tweets_clean.append(stemmer.stem(word))

        return tweets_clean

    @staticmethod
    def build_freqs(tweets, ys):
        """Build the (word, sentiment) -> count frequency table.

        Input:
            :tweets: a list of tweet strings
            :ys: an mx1 array with the sentiment label of each tweet (0 or 1)
        Output:
            :freqs: a dictionary mapping each (word, sentiment) pair to its
                frequency across all tweets
        """
        yslist = np.squeeze(ys).tolist()
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in LogisticRegressionModel.process_tweet(tweet):
                pair = (word, y)
                freqs[pair] = freqs.get(pair, 0) + 1

        return freqs

    @staticmethod
    def extract_features(tweet, freqs, process_tweet=None):
        """Turn one tweet into its 1x3 feature vector.

        Input:
            :tweet: a tweet as a string
            :freqs: (word, label) -> count dictionary
            :process_tweet: optional tokenizer callable; defaults to
                LogisticRegressionModel.process_tweet. (The old default bound
                the raw staticmethod object, which is only callable directly
                on Python >= 3.10.)
        Output:
            :x: a feature vector of shape (1, 3):
                [bias=1, positive freq sum, negative freq sum]
        """
        if process_tweet is None:
            process_tweet = LogisticRegressionModel.process_tweet
        # process_tweet tokenizes, stems, and removes stopwords
        word_l = process_tweet(tweet)

        x = np.zeros((1, 3))
        # bias term is always 1
        x[0, 0] = 1
        for word in word_l:
            # accumulate counts toward the positive (label 1) and
            # negative (label 0) feature columns; unseen pairs add 0
            x[0, 1] += freqs.get((word, 1), 0)
            x[0, 2] += freqs.get((word, 0), 0)

        return x

if __name__ == "__main__":
    # Example usage
    lr_instance = LogisticRegressionModel()
    test_tweet = "I am happy happy happy!"
    prediction = lr_instance.predict(test_tweet)
    print(f"Tweet: {test_tweet}")
    print(f"Prediction (probabilities for [neg, pos]): {prediction}")
    print(f"Predicted sentiment: {'Positive' if prediction[0][1] >= 0.5 else 'Negative'}")