Soundaryasos committed on
Commit
4d9a934
·
verified ·
1 Parent(s): 675c3c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -54
app.py CHANGED
@@ -5,9 +5,9 @@ import matplotlib.pyplot as plt
5
  from datetime import datetime, timedelta
6
  from sklearn.preprocessing import MinMaxScaler
7
  from sklearn.linear_model import LogisticRegression
8
- import tensorflow as tf
9
- from tensorflow.keras.models import Sequential
10
- from tensorflow.keras.layers import LSTM, Dense, Dropout
11
  from transformers import pipeline
12
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
13
  import shap
@@ -16,25 +16,19 @@ from googleapiclient.discovery import build
16
  import warnings
17
  warnings.filterwarnings('ignore')
18
 
19
- # Set random seeds for reproducibility
20
  np.random.seed(42)
21
- tf.random.set_seed(42)
22
 
23
- # Streamlit page configuration
24
  st.set_page_config(page_title="Sentiment Pulse", layout="wide")
25
  st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)
26
 
27
- # API credentials (replace with your own if needed)
28
  REDDIT_CLIENT_ID = "S7pTXhj5JDFGDb3-_zrJEA"
29
  REDDIT_CLIENT_SECRET = "QP3NYN4lrAKVLrBamzLGrpFywiVg8w"
30
  REDDIT_USER_AGENT = "SoundaryaR_Bot/1.0"
31
  YOUTUBE_API_KEY = "AIzaSyAChqXPaiNE9hKhApkgjgonzdgiCCOo"
32
 
33
- # Initialize APIs
34
  reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
35
  youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
36
-
37
- # Load sentiment analysis models
38
  bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
39
  vader_analyzer = SentimentIntensityAnalyzer()
40
 
@@ -54,7 +48,7 @@ def fetch_reddit_data(keyword):
54
  try:
55
  subreddit = reddit.subreddit("all")
56
  posts = subreddit.search(keyword, limit=100)
57
- return pd.DataFrame([{'date': datetime.fromtimestamp(post.created_utc), 'text': post.title + " " + post.selftext} for post in posts])
58
  except Exception as e:
59
  st.error(f"Error fetching Reddit data: {e}")
60
  return pd.DataFrame()
@@ -88,11 +82,9 @@ def combined_sentiment(text):
88
  avg_score = (bert_score + abs(vader_score)) / 2
89
  return 1 if avg_score > 0.5 else 0, avg_score
90
 
91
- # Sidebar for keyword input
92
  st.sidebar.title("Keyword Search")
93
  keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")
94
 
95
- # Load and filter data
96
  twitter_df = load_twitter_data()
97
  twitter_filtered = twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)]
98
  reddit_df = fetch_reddit_data(keyword)
@@ -119,7 +111,7 @@ else:
119
  daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
120
  daily_sentiment['tweet_count'] = df.groupby(df['date'].dt.date).size().values
121
 
122
- if len(daily_sentiment) < 8:
123
  st.warning(f"Not enough {platform} data for prediction.")
124
  fig, ax = plt.subplots()
125
  ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
@@ -130,54 +122,64 @@ else:
130
  scaler = MinMaxScaler()
131
  daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])
132
 
133
- def create_sequences(data, seq_length):
134
- X, y = [], []
135
- for i in range(len(data) - seq_length):
136
- X.append(data[i:i + seq_length])
137
- y.append(data[i + seq_length])
138
- return np.array(X), np.array(y)
139
-
140
- seq_length = 7
141
- X, y = create_sequences(daily_sentiment['scaled_score'].values, seq_length)
142
- X = X.reshape((X.shape[0], X.shape[1], 1))
143
-
144
- model = Sequential([
145
- LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
146
- Dropout(0.2),
147
- LSTM(25),
148
- Dropout(0.2),
149
- Dense(1, activation='sigmoid')
150
- ])
151
- model.compile(optimizer='adam', loss='mse')
152
- model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, verbose=0)
153
-
154
- last_seq = daily_sentiment['scaled_score'][-seq_length:].values.reshape((1, seq_length, 1))
155
- predictions = []
156
- for _ in range(30):
157
- pred = model.predict(last_seq, verbose=0)
158
- predictions.append(pred[0][0])
159
- last_seq = np.roll(last_seq, -1)
160
- last_seq[0, -1, 0] = pred[0][0]
161
 
162
- predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
 
 
 
 
 
 
163
 
164
- X_lr = np.column_stack((daily_sentiment['scaled_score'], daily_sentiment['tweet_count']))
165
- y_lr = (daily_sentiment['combined_score'] > 0.5).astype(int)
166
- lr_model = LogisticRegression().fit(X_lr, y_lr)
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)]
169
- X_future = np.column_stack((predictions, [daily_sentiment['tweet_count'].mean()] * 30))
170
- lr_predictions = lr_model.predict_proba(X_future)[:, 1]
171
 
172
- st.subheader(f"{platform} 30-Day Prediction")
 
 
 
 
 
 
 
 
 
173
  fig, ax = plt.subplots()
174
  ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical')
175
- ax.plot(future_dates, predictions, 'b--', label='Predicted')
176
  ax.legend()
177
  st.pyplot(fig)
178
 
179
- st.subheader(f"{platform} Logistic Regression SHAP")
180
- explainer = shap.Explainer(lr_model, X_lr)
181
- shap_values = explainer(X_lr)
182
- shap.plots.beeswarm(shap_values, show=False)
183
  st.pyplot(plt.gcf())
 
5
  from datetime import datetime, timedelta
6
  from sklearn.preprocessing import MinMaxScaler
7
  from sklearn.linear_model import LogisticRegression
8
+ from sklearn.ensemble import RandomForestRegressor
9
+ from sklearn.model_selection import train_test_split
10
+ from sklearn.metrics import mean_squared_error
11
  from transformers import pipeline
12
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
13
  import shap
 
16
  import warnings
17
  warnings.filterwarnings('ignore')
18
 
 
19
  np.random.seed(42)
 
20
 
 
21
  st.set_page_config(page_title="Sentiment Pulse", layout="wide")
22
  st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)
23
 
24
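+ # API credentials (NOTE(review): secrets are hard-coded in source; load them from environment variables or st.secrets and rotate these keys)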
25
  REDDIT_CLIENT_ID = "S7pTXhj5JDFGDb3-_zrJEA"
26
  REDDIT_CLIENT_SECRET = "QP3NYN4lrAKVLrBamzLGrpFywiVg8w"
27
  REDDIT_USER_AGENT = "SoundaryaR_Bot/1.0"
28
  YOUTUBE_API_KEY = "AIzaSyAChqXPaiNE9hKhApkgjgonzdgiCCOo"
29
 
 
30
  reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
31
  youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
 
 
32
  bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
33
  vader_analyzer = SentimentIntensityAnalyzer()
34
 
 
48
  try:
49
  subreddit = reddit.subreddit("all")
50
  posts = subreddit.search(keyword, limit=100)
51
+ return pd.DataFrame([{'date': datetime.fromtimestamp(post.created_utc), 'text': post.title + " " + post.selftext} for post in posts])
52
  except Exception as e:
53
  st.error(f"Error fetching Reddit data: {e}")
54
  return pd.DataFrame()
 
82
  avg_score = (bert_score + abs(vader_score)) / 2
83
  return 1 if avg_score > 0.5 else 0, avg_score
84
 
 
85
  st.sidebar.title("Keyword Search")
86
  keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")
87
 
 
88
  twitter_df = load_twitter_data()
89
  twitter_filtered = twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)]
90
  reddit_df = fetch_reddit_data(keyword)
 
111
  daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
112
  daily_sentiment['tweet_count'] = df.groupby(df['date'].dt.date).size().values
113
 
114
+ if len(daily_sentiment) < 2:
115
  st.warning(f"Not enough {platform} data for prediction.")
116
  fig, ax = plt.subplots()
117
  ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
 
122
  scaler = MinMaxScaler()
123
  daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])
124
 
125
+ # Prepare features: use lagged sentiment scores and tweet counts
126
+ X = pd.DataFrame({
127
+ 'lag1_score': daily_sentiment['scaled_score'].shift(1),
128
+ 'tweet_count': daily_sentiment['tweet_count']
129
+ }).dropna()
130
+ y = daily_sentiment['scaled_score'][1:] # Align with lagged features
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
+ if len(X) < 5: # Minimum data for meaningful split
133
+ st.warning(f"Not enough {platform} data points for prediction after lagging.")
134
+ fig, ax = plt.subplots()
135
+ ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
136
+ ax.legend()
137
+ st.pyplot(fig)
138
+ continue
139
 
140
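+ # Split data into train/test portions (NOTE(review): X_test/y_test are not evaluated anywhere in this hunk; the MSEs below are computed on the training portion)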
141
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
142
 
143
+ # Train Logistic Regression as a classifier on binarized targets (score > 0.5); predict_proba supplies the continuous output
144
+ lr_model = LogisticRegression(max_iter=1000)
145
+ lr_model.fit(X_train, (y_train > 0.5).astype(int)) # Binary classification for validation
146
+ lr_pred_train = lr_model.predict_proba(X_train)[:, 1]
147
+ lr_mse = mean_squared_error(y_train, lr_pred_train)
148
+
149
+ # Train Random Forest
150
+ rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
151
+ rf_model.fit(X_train, y_train)
152
+ rf_pred_train = rf_model.predict(X_train)
153
+ rf_mse = mean_squared_error(y_train, rf_pred_train)
154
+
155
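+ # Weighted ensemble: each model's weight is 1 minus its share of the total training MSE (lower error -> higher weight; the two weights sum to 1)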
156
+ total_mse = lr_mse + rf_mse
157
+ lr_weight = (1 - lr_mse / total_mse) if total_mse > 0 else 0.5
158
+ rf_weight = (1 - rf_mse / total_mse) if total_mse > 0 else 0.5
159
+
160
+ # Predict 30 days into the future
161
+ last_data = X.iloc[-1:].copy()
162
+ predictions = []
163
  future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)]
 
 
164
 
165
+ for _ in range(30):
166
+ lr_pred = lr_model.predict_proba(last_data)[:, 1][0]
167
+ rf_pred = rf_model.predict(last_data)[0]
168
+ ensemble_pred = lr_weight * lr_pred + rf_weight * rf_pred
169
+ predictions.append(ensemble_pred)
170
+ last_data['lag1_score'] = ensemble_pred # Update lag for next prediction
171
+
172
+ predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
173
+
174
+ st.subheader(f"{platform} 30-Day Prediction (Ensemble: LR + RF)")
175
  fig, ax = plt.subplots()
176
  ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical')
177
+ ax.plot(future_dates, predictions, 'b--', label=f'Predicted (LR: {lr_weight:.2f}, RF: {rf_weight:.2f})')
178
  ax.legend()
179
  st.pyplot(fig)
180
 
181
+ st.subheader(f"{platform} Random Forest SHAP")
182
+ explainer = shap.TreeExplainer(rf_model)
183
+ shap_values = explainer.shap_values(X)
184
+ shap.summary_plot(shap_values, X, show=False)
185
  st.pyplot(plt.gcf())