import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt from datetime import datetime, timedelta from sklearn.preprocessing import MinMaxScaler from sklearn.linear_model import LogisticRegression import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import LSTM, Dense, Dropout from transformers import pipeline from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer import shap import praw from googleapiclient.discovery import build import warnings warnings.filterwarnings('ignore') # Set random seeds np.random.seed(42) tf.random.set_seed(42) # Page Configuration st.set_page_config(page_title="Sentiment Pulse", layout="wide") st.markdown("

Sentiment Pulse: Multi-Platform Analysis

", unsafe_allow_html=True) # API Credentials (replace with your own) REDDIT_CLIENT_ID = "S7pTXhj5JDFGDb3-_zrJEA" REDDIT_CLIENT_SECRET = "QP3NYN4lrAKVLrBamzLGrpFywiVg8w" REDDIT_USER_AGENT = "SoundaryaR_Bot/1.0" YOUTUBE_API_KEY = "AIzaSyAChqXPaiNE9hKhApkgjgonzdgiCCOo" # Initialize APIs reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT) youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY) bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english") vader_analyzer = SentimentIntensityAnalyzer() # Load Twitter Dataset @st.cache_data def load_twitter_data(): df = pd.read_csv("twitter_dataset.csv", encoding='latin-1', names=['sentiment', 'id', 'date', 'query', 'user', 'text']) df['date'] = pd.to_datetime(df['date']) df['sentiment'] = df['sentiment'].map({0: 'negative', 4: 'positive'}) return df.sample(10000) # Fetch Live Reddit Data def fetch_reddit_data(keyword): subreddit = reddit.subreddit("all") posts = subreddit.search(keyword, limit=100) data = [] for post in posts: data.append({'date': datetime.fromtimestamp(post.created_utc), 'text': post.title + " " + post.selftext}) return pd.DataFrame(data) # Fetch Live YouTube Data def fetch_youtube_data(keyword): request = youtube.search().list(q=keyword, part="snippet", maxResults=50, type="video") response = request.execute() data = [] for item in response['items']: title = item['snippet']['title'] description = item['snippet']['description'] published_at = datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ") data.append({'date': published_at, 'text': title + " " + description}) return pd.DataFrame(data) # Sentiment Analysis Functions def get_bert_sentiment(text): try: result = bert_classifier(text[:512])[0] return 1 if result['label'] == 'POSITIVE' else 0, result['score'] except: return 0, 0.5 def get_vader_sentiment(text): score = vader_analyzer.polarity_scores(text)['compound'] return 1 if score > 0 else 0, score def combined_sentiment(text): bert_label, bert_score = get_bert_sentiment(text) vader_label, vader_score = get_vader_sentiment(text) avg_score = (bert_score + abs(vader_score)) / 2 return 1 if avg_score > 0.5 else 0, avg_score # Sidebar for Keyword Input st.sidebar.title("Keyword Search") keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy") # Process Data twitter_df = load_twitter_data() twitter_filtered = twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)] reddit_df = fetch_reddit_data(keyword) youtube_df = fetch_youtube_data(keyword) # Check Validity platforms = {'Twitter': twitter_filtered, 'Reddit': reddit_df, 'YouTube': youtube_df} valid_platforms = {k: v for k, v in platforms.items() if not v.empty} if not valid_platforms: st.error(f"Error: '{keyword}' is not a valid keyword. No matching data found across Twitter, Reddit, or YouTube.") else: for platform, df in valid_platforms.items(): st.subheader(f"{platform} Analysis for '{keyword}'") if platform == 'Twitter': st.write(f"{platform} Dataset Preview:", df[['text', 'date']].head()) else: st.write(f"{platform} Live Data Preview:", df.head()) # Sentiment Analysis with st.spinner(f"Analyzing {platform} sentiments..."): df['bert_sentiment'], df['bert_score'] = zip(*df['text'].apply(get_bert_sentiment)) df['vader_sentiment'], df['vader_score'] = zip(*df['text'].apply(get_vader_sentiment)) df['combined_sentiment'], df['combined_score'] = zip(*df['text'].apply(combined_sentiment)) st.write(f"{platform} Sentiment Results:", df[['text', 'combined_sentiment', 'combined_score']].head()) # Time-Series Preparation daily_sentiment = df.groupby(df['date'].dt.date)['combined_score'].mean().reset_index() daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date']) daily_sentiment['tweet_count'] = df.groupby(df['date'].dt.date).size().values if len(daily_sentiment) < 8: st.warning(f"Not enough {platform} data for '{keyword}' to predict 30 days.") fig, ax = plt.subplots(figsize=(10, 6)) ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment') ax.set_xlabel('Date') ax.set_ylabel('Sentiment Score') ax.set_title(f"{platform} Historical Sentiment for '{keyword}'") ax.legend() st.pyplot(fig) else: scaler = MinMaxScaler() daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']]) # LSTM Sequences def create_sequences(data, seq_length): X, y = [], [] for i in range(len(data) - seq_length): X.append(data[i:i + seq_length]) y.append(data[i + seq_length]) return np.array(X), np.array(y) seq_length = 7 X, y = create_sequences(daily_sentiment['scaled_score'].values, seq_length) X = X.reshape((X.shape[0], X.shape[1], 1)) # Train LSTM model = Sequential([ LSTM(50, return_sequences=True, input_shape=(seq_length, 1)), Dropout(0.2), LSTM(25), Dropout(0.2), Dense(1, activation='sigmoid') ]) model.compile(optimizer='adam', loss='mse') model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, verbose=0) # Predict 30 Days last_sequence = daily_sentiment['scaled_score'][-seq_length:].values.reshape((1, seq_length, 1)) predictions = [] for _ in range(30): pred = model.predict(last_sequence, verbose=0) predictions.append(pred[0][0]) last_sequence = np.roll(last_sequence, -1) last_sequence[0, -1, 0] = pred[0][0] predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten() # Logistic Regression X_lr = np.column_stack((daily_sentiment['scaled_score'], daily_sentiment['tweet_count'])) y_lr = (daily_sentiment['combined_score'] > 0.5).astype(int) lr_model = LogisticRegression() lr_model.fit(X_lr, y_lr) future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)] X_future = np.column_stack((predictions, [daily_sentiment['tweet_count'].mean()] * 30)) lr_predictions = lr_model.predict_proba(X_future)[:, 1] # SHAP Explainability st.subheader(f"{platform} SHAP Explainability") explainer_lr = shap.LinearExplainer(lr_model, X_lr) shap_values_lr = explainer_lr.shap_values(X_lr) fig_lr, ax = plt.subplots() shap.summary_plot(shap_values_lr, X_lr, feature_names=['Sentiment Score', 'Count'], show=False) st.pyplot(fig_lr) def lstm_predict(inputs): inputs = inputs.reshape((inputs.shape[0], seq_length, 1)) return model.predict(inputs, verbose=0) explainer_lstm = shap.KernelExplainer(lstm_predict, X[:50]) shap_values_lstm = explainer_lstm.shap_values(X[:50], nsamples=100) fig_lstm, ax = plt.subplots() shap.summary_plot(shap_values_lstm, X[:50], plot_type="bar", show=False) st.pyplot(fig_lstm) # Visualization st.subheader(f"{platform} 30-Day Sentiment Prediction") results_df = pd.DataFrame({ 'Date': future_dates, 'Predicted Sentiment': predictions, 'Positive Probability': lr_predictions }) fig, ax1 = plt.subplots(figsize=(10, 6)) ax1.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment') ax1.plot(results_df['Date'], results_df['Predicted Sentiment'], 'b-', label='Predicted Sentiment') ax1.set_xlabel('Date') ax1.set_ylabel('Sentiment Score', color='b') ax2 = ax1.twinx() ax2.plot(results_df['Date'], results_df['Positive Probability'], 'r-', label='Positive Probability') ax2.set_ylabel('Positive Probability', color='r') fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9)) plt.title(f"{platform} Sentiment Forecast for '{keyword}'") st.pyplot(fig) # Sidebar Instructions st.sidebar.write("1. Ensure 'sentiment140.csv' is in the folder.") st.sidebar.write("2. Enter a keyword to analyze live Reddit/YouTube and Twitter dataset.") st.sidebar.write("3. Run: `streamlit run sentiment_app.py`")