Update app.py
app.py
CHANGED
@@ -5,9 +5,9 @@ import matplotlib.pyplot as plt
 from datetime import datetime, timedelta
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.linear_model import LogisticRegression
-import tensorflow as tf
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import LSTM, Dense, Dropout
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
 from transformers import pipeline
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 import shap
@@ -16,25 +16,19 @@ from googleapiclient.discovery import build
 import warnings
 warnings.filterwarnings('ignore')

-# Set random seeds for reproducibility
 np.random.seed(42)
-tf.random.set_seed(42)

-# Streamlit page configuration
 st.set_page_config(page_title="Sentiment Pulse", layout="wide")
 st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)

-# API credentials
+# API credentials
 REDDIT_CLIENT_ID = "S7pTXhj5JDFGDb3-_zrJEA"
 REDDIT_CLIENT_SECRET = "QP3NYN4lrAKVLrBamzLGrpFywiVg8w"
 REDDIT_USER_AGENT = "SoundaryaR_Bot/1.0"
 YOUTUBE_API_KEY = "AIzaSyAChqXPaiNE9hKhApkgjgonzdgiCCOo"

-# Initialize APIs
 reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
 youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
-
-# Load sentiment analysis models
 bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
 vader_analyzer = SentimentIntensityAnalyzer()

@@ -54,7 +48,7 @@ def fetch_reddit_data(keyword):
     try:
         subreddit = reddit.subreddit("all")
         posts = subreddit.search(keyword, limit=100)
-        return pd.DataFrame([{'date': datetime.fromtimestamp(post.created_utc), 'text': post.title + " " + post.selftext} for post in posts])
+        return pd.DataFrame([{'date': datetime.fromtimestamp(post.created_utc), 'text': post.title + " " + post.selftext} for post in posts])
     except Exception as e:
         st.error(f"Error fetching Reddit data: {e}")
         return pd.DataFrame()
@@ -88,11 +82,9 @@ def combined_sentiment(text):
     avg_score = (bert_score + abs(vader_score)) / 2
     return 1 if avg_score > 0.5 else 0, avg_score

-# Sidebar for keyword input
 st.sidebar.title("Keyword Search")
 keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")

-# Load and filter data
 twitter_df = load_twitter_data()
 twitter_filtered = twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)]
 reddit_df = fetch_reddit_data(keyword)
@@ -119,7 +111,7 @@ else:
        daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
        daily_sentiment['tweet_count'] = df.groupby(df['date'].dt.date).size().values

-        if len(daily_sentiment) <
+        if len(daily_sentiment) < 2:
            st.warning(f"Not enough {platform} data for prediction.")
            fig, ax = plt.subplots()
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
@@ -130,54 +122,64 @@ else:
        scaler = MinMaxScaler()
        daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])

-
-
-
-
-
-
-
-        seq_length = 7
-        X, y = create_sequences(daily_sentiment['scaled_score'].values, seq_length)
-        X = X.reshape((X.shape[0], X.shape[1], 1))
-
-        model = Sequential([
-            LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
-            Dropout(0.2),
-            LSTM(25),
-            Dropout(0.2),
-            Dense(1, activation='sigmoid')
-        ])
-        model.compile(optimizer='adam', loss='mse')
-        model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, verbose=0)
-
-        last_seq = daily_sentiment['scaled_score'][-seq_length:].values.reshape((1, seq_length, 1))
-        predictions = []
-        for _ in range(30):
-            pred = model.predict(last_seq, verbose=0)
-            predictions.append(pred[0][0])
-            last_seq = np.roll(last_seq, -1)
-            last_seq[0, -1, 0] = pred[0][0]
+        # Prepare features: use lagged sentiment scores and tweet counts
+        X = pd.DataFrame({
+            'lag1_score': daily_sentiment['scaled_score'].shift(1),
+            'tweet_count': daily_sentiment['tweet_count']
+        }).dropna()
+        y = daily_sentiment['scaled_score'][1:]  # Align with lagged features

-
+        if len(X) < 5:  # Minimum data for meaningful split
+            st.warning(f"Not enough {platform} data points for prediction after lagging.")
+            fig, ax = plt.subplots()
+            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
+            ax.legend()
+            st.pyplot(fig)
+            continue

-
-
-        lr_model = LogisticRegression().fit(X_lr, y_lr)
+        # Split data for validation
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

+        # Train logistic regression as a classifier on binarized targets (predict_proba gives a continuous score)
+        lr_model = LogisticRegression(max_iter=1000)
+        lr_model.fit(X_train, (y_train > 0.5).astype(int))  # Binary classification for validation
+        lr_pred_train = lr_model.predict_proba(X_train)[:, 1]
+        lr_mse = mean_squared_error(y_train, lr_pred_train)
+
+        # Train Random Forest
+        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
+        rf_model.fit(X_train, y_train)
+        rf_pred_train = rf_model.predict(X_train)
+        rf_mse = mean_squared_error(y_train, rf_pred_train)
+
+        # Weighted ensemble based on inverse MSE
+        total_mse = lr_mse + rf_mse
+        lr_weight = (1 - lr_mse / total_mse) if total_mse > 0 else 0.5
+        rf_weight = (1 - rf_mse / total_mse) if total_mse > 0 else 0.5
+
+        # Predict 30 days into the future
+        last_data = X.iloc[-1:].copy()
+        predictions = []
        future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)]
-        X_future = np.column_stack((predictions, [daily_sentiment['tweet_count'].mean()] * 30))
-        lr_predictions = lr_model.predict_proba(X_future)[:, 1]

-
+        for _ in range(30):
+            lr_pred = lr_model.predict_proba(last_data)[:, 1][0]
+            rf_pred = rf_model.predict(last_data)[0]
+            ensemble_pred = lr_weight * lr_pred + rf_weight * rf_pred
+            predictions.append(ensemble_pred)
+            last_data['lag1_score'] = ensemble_pred  # Update lag for next prediction
+
+        predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
+
+        st.subheader(f"{platform} 30-Day Prediction (Ensemble: LR + RF)")
        fig, ax = plt.subplots()
        ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical')
-        ax.plot(future_dates, predictions, 'b--', label='Predicted')
+        ax.plot(future_dates, predictions, 'b--', label=f'Predicted (LR: {lr_weight:.2f}, RF: {rf_weight:.2f})')
        ax.legend()
        st.pyplot(fig)

-        st.subheader(f"{platform}
-        explainer = shap.
-        shap_values = explainer(
-        shap.
+        st.subheader(f"{platform} Random Forest SHAP")
+        explainer = shap.TreeExplainer(rf_model)
+        shap_values = explainer.shap_values(X)
+        shap.summary_plot(shap_values, X, show=False)
        st.pyplot(plt.gcf())
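
A note on the inverse-MSE weighting the new code introduces: with two models the two weights always sum to one, since (1 - a/(a+b)) + (1 - b/(a+b)) = 1, so the ensemble is a convex combination that leans toward whichever model had the lower training MSE. A minimal sketch with made-up MSE values (0.04 and 0.01 are illustrative, not numbers from the app):

# Inverse-MSE weighting as in the diff; the MSE values are hypothetical.
lr_mse, rf_mse = 0.04, 0.01
total_mse = lr_mse + rf_mse
lr_weight = (1 - lr_mse / total_mse) if total_mse > 0 else 0.5  # -> 0.2
rf_weight = (1 - rf_mse / total_mse) if total_mse > 0 else 0.5  # -> 0.8
assert abs(lr_weight + rf_weight - 1.0) < 1e-9  # convex combination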
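
The 30-day forecast is recursive: each step's ensemble prediction is written back into lag1_score for the next step, while tweet_count stays frozen at its last observed value, so errors compound over the horizon. A self-contained sketch of that roll-out on synthetic data (the series and the forest here are stand-ins, not the app's fitted model):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Synthetic stand-in for the scaled daily sentiment series
rng = np.random.default_rng(42)
scores = pd.Series(rng.uniform(0.0, 1.0, 60))
X = pd.DataFrame({
    'lag1_score': scores.shift(1),             # yesterday's score
    'tweet_count': rng.integers(50, 150, 60),  # daily post volume
}).dropna()
y = scores[1:]                                 # targets aligned with the lag

rf = RandomForestRegressor(n_estimators=50, random_state=42).fit(X, y)

# Roll forward 30 days, feeding each prediction back in as the lag
last = X.iloc[-1:].copy()
preds = []
for _ in range(30):
    p = rf.predict(last)[0]
    preds.append(p)
    last['lag1_score'] = p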
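
On the switch to shap.TreeExplainer: it computes exact SHAP values efficiently for tree ensembles such as RandomForestRegressor, and for a single-output regressor shap_values(X) returns one (n_samples, n_features) array that summary_plot consumes directly; show=False leaves the figure open so st.pyplot(plt.gcf()) can render it. A sketch reusing rf and X from the roll-out example above:

import matplotlib.pyplot as plt
import shap

explainer = shap.TreeExplainer(rf)             # exact and fast for tree models
shap_values = explainer.shap_values(X)         # one array for a regressor
shap.summary_plot(shap_values, X, show=False)  # keep the figure open for st.pyplot
plt.close('all')                               # tidy up when running outside Streamlit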