# NOTE: "Spaces: Sleeping" lines were HuggingFace Spaces page-status residue
# from the scrape that produced this file — not code; preserved here as a comment.
| import joblib | |
| import numpy as np | |
| import re | |
| import string | |
| import nltk | |
| from nltk.stem import PorterStemmer | |
| from nltk.tokenize import TweetTokenizer | |
| from nltk.corpus import stopwords, twitter_samples | |
| from sklearn.linear_model import LogisticRegression | |
| #nltk.data.path.append("/app/nltk_data") | |
| #nltk.download('twitter_samples') | |
| #nltk.download('stopwords') | |
# Module-level side effect: load the raw tweet strings from the NLTK
# twitter_samples corpus (requires the corpus to have been downloaded,
# see the commented-out nltk.download calls above).
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
class LogisticRegressionModel:
    """Tweet sentiment classifier: logistic regression over frequency features.

    Each tweet is reduced to a 1x3 vector [bias, sum_pos_freqs, sum_neg_freqs]
    computed from a (word, label) -> count table built on the training split,
    then fed to a scikit-learn LogisticRegression. A fitted model is cached to
    "sk_logreg.pkl" and reused on subsequent runs.
    """

    def __init__(self):
        # First 4000 tweets of each polarity form the training split
        # (the module-level lists hold the full corpus).
        train_pos = all_positive_tweets[:4000]
        train_neg = all_negative_tweets[:4000]
        self.train_x = train_pos + train_neg
        # Labels: 1.0 for positive, 0.0 for negative; shape (n_samples, 1).
        self.train_y = np.append(
            np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0
        )
        self.freqs = LogisticRegressionModel.build_freqs(self.train_x, self.train_y)
        try:
            # Reuse a previously fitted model if one was cached on disk.
            self.model = joblib.load("sk_logreg.pkl")
        except Exception:
            # Cache miss or unreadable/incompatible pickle: retrain and re-cache.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            self.model = LogisticRegressionModel.train(
                self.train_x, self.train_y, self.freqs
            )
            joblib.dump(self.model, "sk_logreg.pkl")

    def predict(self, query):
        """Return class probabilities for one raw tweet string.

        Input:
            :query: a string
        Output:
            array of shape (1, 2) with [P(negative), P(positive)]
        """
        features = LogisticRegressionModel.extract_features(query, self.freqs)
        # Drop the explicit bias column; sklearn fits its own intercept.
        return self.model.predict_proba(features[:, 1:])

    @staticmethod
    def train(train_x, train_y, freqs):
        """Fit a LogisticRegression on frequency features of the training tweets.

        Input:
            :train_x: list of raw tweet strings
            :train_y: (m, 1) array of 0/1 labels
            :freqs: (word, label) -> count dictionary
        Output:
            a fitted sklearn LogisticRegression
        """
        train_x_vec = np.vstack(
            [LogisticRegressionModel.extract_features(t, freqs) for t in train_x]
        )
        model = LogisticRegression()
        # Column 0 is the constant bias term; sklearn adds its own intercept.
        model.fit(train_x_vec[:, 1:], train_y.ravel())
        return model

    @staticmethod
    def process_tweet(tweet):
        """
        Input:
            :tweet: a string
        Output:
            :tweets_clean: a list of words containing the processed tweet
        """
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')
        # remove stock market tickers like $GE
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
        # remove only the hash # sign from hashtags, keeping the word
        tweet = re.sub(r'#', '', tweet)
        # the tokenizer downcases everything except emoticons, strips @handles,
        # and collapses repeated characters ("soooo" -> "sooo")
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                                   reduce_len=True)
        tweet_tokens = tokenizer.tokenize(tweet)
        tweets_clean = []
        for word in tweet_tokens:
            if (word not in stopwords_english and   # remove stopwords
                    word not in string.punctuation):  # remove punctuation
                tweets_clean.append(stemmer.stem(word))
        return tweets_clean

    @staticmethod
    def build_freqs(tweets, ys):
        """Build frequencies.

        Input:
            :tweets: a list of tweets
            :ys: an (m, 1) array with the sentiment label of each tweet (0 or 1)
        Output:
            :freqs: a dictionary mapping each (word, sentiment) pair to its count
        """
        yslist = np.squeeze(ys).tolist()
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in LogisticRegressionModel.process_tweet(tweet):
                pair = (word, y)
                freqs[pair] = freqs.get(pair, 0) + 1
        return freqs

    @staticmethod
    def extract_features(tweet, freqs, process_tweet=None):
        """Map one tweet to its (1, 3) feature vector.

        Input:
            :tweet: a raw tweet string
            :freqs: (word, label) -> count dictionary
            :process_tweet: optional tokenizer override; defaults to
                LogisticRegressionModel.process_tweet (resolved at call time —
                a class-body default would capture the staticmethod descriptor)
        Output:
            :x: a feature vector of dimension (1, 3): [bias, pos_count, neg_count]
        """
        if process_tweet is None:
            process_tweet = LogisticRegressionModel.process_tweet
        # process_tweet tokenizes, stems, and removes stopwords
        word_l = process_tweet(tweet)
        x = np.zeros((1, 3))
        # bias term is set to 1
        x[0, 0] = 1
        for word in word_l:
            # accumulate training-set counts of this word under each label
            x[0, 1] += freqs.get((word, 1), 0)
            x[0, 2] += freqs.get((word, 0), 0)
        assert x.shape == (1, 3)
        return x
if __name__ == "__main__":
    # Smoke test: build (or load) the model and classify one sample tweet.
    classifier = LogisticRegressionModel()
    test_tweet = "I am happy happy happy!"
    prediction = classifier.predict(test_tweet)
    for line in (
        f"Tweet: {test_tweet}",
        f"Prediction (probabilities for [neg, pos]): {prediction}",
        f"Predicted sentiment: {'Positive' if prediction[0][1] >= 0.5 else 'Negative'}",
    ):
        print(line)