tlong-ds commited on
Commit
155aa38
·
1 Parent(s): afffb6d

the model file

Browse files
Files changed (1) hide show
  1. model.py +142 -0
model.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import numpy as np
3
+ import re
4
+ import string
5
+
6
+ import nltk
7
+ from nltk.stem import PorterStemmer
8
+ from nltk.tokenize import TweetTokenizer
9
+ from nltk.corpus import stopwords, twitter_samples
10
+ from sklearn.linear_model import LogisticRegression
11
+
12
+ nltk.download('twitter_samples')
13
+ nltk.download('stopwords')
14
+ all_positive_tweets = twitter_samples.strings('positive_tweets.json')
15
+ all_negative_tweets = twitter_samples.strings('negative_tweets.json')
16
+
17
class LogisticRegressionModel:
    """Binary tweet-sentiment classifier.

    Builds a (word, label) frequency table from the NLTK ``twitter_samples``
    corpus and fits a scikit-learn ``LogisticRegression`` on 3-dimensional
    feature vectors: [bias, sum of positive-label frequencies, sum of
    negative-label frequencies] per tweet.
    """

    def __init__(self):
        # Use the first 4000 tweets of each polarity as the training split;
        # the remainder of the corpus is left for evaluation elsewhere.
        train_pos = all_positive_tweets[:4000]
        train_neg = all_negative_tweets[:4000]
        self.train_x = train_pos + train_neg
        # Labels as an (m, 1) column: 1.0 for positive, 0.0 for negative.
        self.train_y = np.append(
            np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0
        )
        self.freqs = LogisticRegressionModel.build_freqs(self.train_x, self.train_y)
        try:
            # Reuse a previously trained model cached on disk, if any.
            self.model = joblib.load("sk_logreg.pkl")
        except Exception:
            # Cache missing or unreadable — retrain and re-cache.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; `Exception` keeps the best-effort behavior
            # without trapping interpreter-exit signals.)
            self.model = LogisticRegressionModel.train(
                self.train_x, self.train_y, self.freqs
            )
            joblib.dump(self.model, "sk_logreg.pkl")

    def predict(self, query):
        """Return class probabilities for one tweet string.

        Input:
            :query: a tweet as a string
        Output:
            the model's ``predict_proba`` result, shape (1, 2) —
            probabilities for [negative, positive]
        """
        features = LogisticRegressionModel.extract_features(query, self.freqs)
        return self.model.predict_proba(features)

    @staticmethod
    def train(train_x, train_y, freqs):
        """Fit a LogisticRegression on extracted (1, 3) features.

        Input:
            :train_x: list of tweet strings
            :train_y: (m, 1) array of 0/1 labels
            :freqs: (word, label) -> count dictionary from build_freqs
        Output:
            a fitted sklearn LogisticRegression model
        """
        train_x_vec = np.zeros((len(train_x), 3))
        for i in range(len(train_x)):
            train_x_vec[i, :] = LogisticRegressionModel.extract_features(
                train_x[i], freqs
            )

        model = LogisticRegression()
        # ravel() flattens (m, 1) -> (m,) as sklearn expects for y.
        model.fit(train_x_vec, train_y.ravel())

        return model

    @staticmethod
    def process_tweet(tweet):
        """Clean, tokenize, and stem one tweet.

        Input:
            :tweet: a string
        Output:
            :tweets_clean: a list of stemmed tokens with tickers, RT
            markers, URLs, the '#' sign, stopwords, and punctuation removed
        """
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')

        # remove stock market tickers like $GE
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
        # remove hashtags: only the '#' sign, the word itself is kept
        tweet = re.sub(r'#', '', tweet)

        # the tokenizer downcases everything except emoticons, strips
        # @handles, and collapses repeated characters ("looool" -> "lool")
        tokenizer = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True
        )
        tweet_tokens = tokenizer.tokenize(tweet)

        tweets_clean = []
        for word in tweet_tokens:
            if (word not in stopwords_english and      # remove stopwords
                    word not in string.punctuation):   # remove punctuation
                tweets_clean.append(stemmer.stem(word))

        return tweets_clean

    @staticmethod
    def build_freqs(tweets, ys):
        """Build frequencies.

        Input:
            :tweets: a list of tweets
            :ys: an mx1 array with the sentiment label of each tweet
                 (either 0 or 1)
        Output:
            :freqs: a dictionary mapping each (word, sentiment) pair to
                    its frequency
        """
        # squeeze (m, 1) -> (m,) so labels zip one-to-one with tweets
        yslist = np.squeeze(ys).tolist()
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in LogisticRegressionModel.process_tweet(tweet):
                pair = (word, y)
                freqs[pair] = freqs.get(pair, 0) + 1

        return freqs

    @staticmethod
    def extract_features(tweet, freqs, process_tweet=None):
        """Convert one tweet into a (1, 3) feature vector.

        Input:
            :tweet: a tweet string
            :freqs: (word, label) -> count dictionary
            :process_tweet: optional tokenizer override; defaults to
                LogisticRegressionModel.process_tweet
        Output:
            :x: a feature vector of dimension (1, 3):
                [bias=1, positive-frequency sum, negative-frequency sum]
        """
        # BUGFIX: the default used to be the in-class `staticmethod` object
        # itself, which is not callable before Python 3.10. Resolve it
        # lazily instead; callers passing their own callable are unaffected.
        if process_tweet is None:
            process_tweet = LogisticRegressionModel.process_tweet

        # process_tweet tokenizes, stems, and removes stopwords
        word_l = process_tweet(tweet)

        x = np.zeros((1, 3))
        # bias term is set to 1
        x[0, 0] = 1

        for word in word_l:
            # increment the word count for the positive label 1
            if (word, 1) in freqs:
                x[0, 1] += freqs[(word, 1)]
            # increment the word count for the negative label 0
            if (word, 0) in freqs:
                x[0, 2] += freqs[(word, 0)]

        assert x.shape == (1, 3)
        return x
133
+
134
+
135
if __name__ == "__main__":
    # Demo: load (or train) the classifier and score a single tweet.
    classifier = LogisticRegressionModel()
    sample_tweet = "I am happy happy happy!"
    probabilities = classifier.predict(sample_tweet)
    print(f"Tweet: {sample_tweet}")
    print(f"Prediction (probabilities for [neg, pos]): {probabilities}")
    label = 'Positive' if probabilities[0][1] >= 0.5 else 'Negative'
    print(f"Predicted sentiment: {label}")