# Hugging Face file-view residue captured with the source (not part of the program):
#   Soundaryasos's picture
#   Update app.py
#   2c64d59 verified
#   raw
#   history blame
#   9.89 kB
import os
import warnings
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import praw
import shap
import streamlit as st
import tensorflow as tf
from googleapiclient.discovery import build
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Suppress every Python warning for the lifetime of the process.
warnings.filterwarnings('ignore')
# Set random seeds: fix the NumPy and TensorFlow global seeds (42) so repeated
# runs draw the same random numbers.
np.random.seed(42)
tf.random.set_seed(42)
# Page Configuration: wide layout and a centred, coloured HTML title.
# NOTE(review): Streamlit expects set_page_config to be the first Streamlit
# command executed, so keep it ahead of any other st.* call.
st.set_page_config(page_title="Sentiment Pulse", layout="wide")
# unsafe_allow_html=True lets the inline <h1> markup render instead of being escaped.
st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)
# API credentials are read from the environment so that no secret is stored in
# the source tree. Required: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and
# YOUTUBE_API_KEY. REDDIT_USER_AGENT is optional and falls back to the app's
# default agent string.
# NOTE(review): any key that was ever committed to this file should be treated
# as leaked and rotated with the provider.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "SoundaryaR_Bot/1.0")
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")

# Fail early with a readable message instead of an opaque API error later on.
_missing_credentials = [
    name
    for name, value in (
        ("REDDIT_CLIENT_ID", REDDIT_CLIENT_ID),
        ("REDDIT_CLIENT_SECRET", REDDIT_CLIENT_SECRET),
        ("YOUTUBE_API_KEY", YOUTUBE_API_KEY),
    )
    if not value
]
if _missing_credentials:
    st.error(
        "Missing API credentials: "
        + ", ".join(_missing_credentials)
        + ". Set them as environment variables and restart the app."
    )
    st.stop()

# Initialize APIs
reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)


@st.cache_resource
def _load_bert_classifier():
    """Build the DistilBERT SST-2 sentiment pipeline.

    Cached as a Streamlit resource so the model is not reloaded every time the
    script reruns after a widget interaction.
    """
    return pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


bert_classifier = _load_bert_classifier()
vader_analyzer = SentimentIntensityAnalyzer()
# Load Twitter Dataset
@st.cache_data
def load_twitter_data():
    """Load the labelled tweet CSV and return a random sample of its rows.

    Reads ``twitter_dataset.csv`` (latin-1 encoded, no header row) from the
    working directory, parses the ``date`` column into datetimes and maps the
    numeric ``sentiment`` codes 0 and 4 to ``'negative'`` and ``'positive'``
    (any other code becomes NaN).

    Returns:
        pandas.DataFrame: at most 10,000 randomly sampled rows with columns
        ``sentiment``, ``id``, ``date``, ``query``, ``user`` and ``text``.
    """
    df = pd.read_csv(
        "twitter_dataset.csv",
        encoding='latin-1',
        names=['sentiment', 'id', 'date', 'query', 'user', 'text'],
    )
    df['date'] = pd.to_datetime(df['date'])
    df['sentiment'] = df['sentiment'].map({0: 'negative', 4: 'positive'})
    # DataFrame.sample raises ValueError when n exceeds the number of rows, so
    # cap the sample size for datasets smaller than 10,000 rows.
    return df.sample(n=min(len(df), 10000))
# Fetch Live Reddit Data
def fetch_reddit_data(keyword):
    """Search all of Reddit for *keyword* and return the matching posts.

    Args:
        keyword: Query string passed to ``reddit.subreddit("all").search``.

    Returns:
        pandas.DataFrame: one row per post (the search is limited to 100) with
        columns ``date`` (naive datetime expressed in UTC) and ``text`` (post
        title, a space, then the self-text). When nothing matches the frame is
        empty but still carries both columns.
    """
    posts = reddit.subreddit("all").search(keyword, limit=100)
    rows = [
        {
            # created_utc is a UTC epoch. Convert it explicitly in UTC (a bare
            # fromtimestamp would shift it to the server's local zone), then
            # drop tzinfo so the column stays naive like the other platforms.
            'date': datetime.fromtimestamp(post.created_utc, tz=timezone.utc).replace(tzinfo=None),
            'text': post.title + " " + post.selftext,
        }
        for post in posts
    ]
    return pd.DataFrame(rows, columns=['date', 'text'])
# Fetch Live YouTube Data
def fetch_youtube_data(keyword):
    """Search YouTube videos for *keyword* and return their titles/descriptions.

    Args:
        keyword: Query string passed to the YouTube ``search().list`` call.

    Returns:
        pandas.DataFrame: one row per returned video (``maxResults=50``) with
        columns ``date`` (naive timestamp expressed in UTC) and ``text`` (title,
        a space, then the description). When the response holds no items the
        frame is empty but still carries both columns.
    """
    request = youtube.search().list(q=keyword, part="snippet", maxResults=50, type="video")
    response = request.execute()
    rows = []
    # A response without an 'items' key is treated as "no results".
    for item in response.get('items', []):
        snippet = item['snippet']
        # publishedAt is an ISO 8601 string. Parse it with a general ISO parser
        # so values with fractional seconds do not raise, then drop the UTC
        # tzinfo to keep the column naive.
        published_at = pd.to_datetime(snippet['publishedAt'], utc=True).tz_convert(None)
        rows.append({'date': published_at, 'text': snippet['title'] + " " + snippet['description']})
    return pd.DataFrame(rows, columns=['date', 'text'])
# Sentiment Analysis Functions
def get_bert_sentiment(text):
    """Classify *text* with the module-level ``bert_classifier`` pipeline.

    Only the first 512 characters of *text* are sent to the classifier.

    Args:
        text: The text to classify.

    Returns:
        tuple[int, float]: ``(label, score)``. ``label`` is 1 when the
        classifier's label is ``'POSITIVE'`` and 0 otherwise; ``score`` is the
        score the classifier reports for that label. If classification fails
        for any reason the neutral fallback ``(0, 0.5)`` is returned.
    """
    try:
        result = bert_classifier(text[:512])[0]
        return 1 if result['label'] == 'POSITIVE' else 0, result['score']
    except Exception:
        # Deliberate best-effort: one bad input (non-string value, model error)
        # must not abort a whole batch. Catching Exception rather than using a
        # bare except lets KeyboardInterrupt and SystemExit propagate.
        return 0, 0.5
def get_vader_sentiment(text):
    """Score *text* with the module-level VADER analyzer.

    Args:
        text: The text to score.

    Returns:
        tuple[int, float]: ``(label, compound)`` where ``compound`` is the
        ``'compound'`` entry of VADER's polarity scores and ``label`` is 1 for
        a strictly positive compound, 0 otherwise.
    """
    compound = vader_analyzer.polarity_scores(text)['compound']
    if compound > 0:
        return 1, compound
    return 0, compound
def combined_sentiment(text):
    """Blend the BERT and VADER verdicts for *text* into one positivity score.

    Both model outputs are first converted to a "how positive is this" value
    in [0, 1] and then averaged, so strongly negative text yields a low score
    rather than a high one.

    Args:
        text: The text to score.

    Returns:
        tuple[int, float]: ``(label, score)`` where ``score`` is the mean of
        the two positivity values and ``label`` is 1 when ``score`` exceeds
        0.5, 0 otherwise.
    """
    bert_label, bert_score = get_bert_sentiment(text)
    _, vader_score = get_vader_sentiment(text)
    # get_bert_sentiment returns the score of the *predicted* label, so a
    # confident negative prediction has to be flipped to express positivity.
    bert_positive = bert_score if bert_label == 1 else 1 - bert_score
    # The VADER compound score is signed (documented range -1 to 1). Rescale it
    # onto [0, 1] instead of taking abs(), which would discard the polarity.
    vader_positive = (vader_score + 1) / 2
    avg_score = (bert_positive + vader_positive) / 2
    return 1 if avg_score > 0.5 else 0, avg_score
# Sidebar for Keyword Input
st.sidebar.title("Keyword Search")
keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")
# Process Data
twitter_df = load_twitter_data()
# regex=False matches the keyword literally: user input such as "c++" or "("
# would otherwise be compiled as a regular expression and raise.
# .copy() gives the analysis step its own frame to add columns to.
twitter_filtered = twitter_df[
    twitter_df['text'].str.contains(keyword, case=False, na=False, regex=False)
].copy()
reddit_df = fetch_reddit_data(keyword)
youtube_df = fetch_youtube_data(keyword)
# Check Validity: keep only the platforms that returned at least one row.
platforms = {'Twitter': twitter_filtered, 'Reddit': reddit_df, 'YouTube': youtube_df}
valid_platforms = {k: v for k, v in platforms.items() if not v.empty}
if not valid_platforms:
    st.error(f"Error: '{keyword}' is not a valid keyword. No matching data found across Twitter, Reddit, or YouTube.")
else:
    for platform, df in valid_platforms.items():
        st.subheader(f"{platform} Analysis for '{keyword}'")
        if platform == 'Twitter':
            st.write(f"{platform} Dataset Preview:", df[['text', 'date']].head())
        else:
            st.write(f"{platform} Live Data Preview:", df.head())

        # Sentiment Analysis: each scorer returns (label, score) per text;
        # zip(*...) splits the pairs into two parallel columns.
        with st.spinner(f"Analyzing {platform} sentiments..."):
            df['bert_sentiment'], df['bert_score'] = zip(*df['text'].apply(get_bert_sentiment))
            df['vader_sentiment'], df['vader_score'] = zip(*df['text'].apply(get_vader_sentiment))
            df['combined_sentiment'], df['combined_score'] = zip(*df['text'].apply(combined_sentiment))
        st.write(f"{platform} Sentiment Results:", df[['text', 'combined_sentiment', 'combined_score']].head())

        # Time-Series Preparation: mean combined score and row count per calendar day.
        daily_sentiment = df.groupby(df['date'].dt.date)['combined_score'].mean().reset_index()
        daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
        daily_sentiment['tweet_count'] = df.groupby(df['date'].dt.date).size().values

        # The forecaster needs a 7-day window plus at least one target day.
        if len(daily_sentiment) < 8:
            st.warning(f"Not enough {platform} data for '{keyword}' to predict 30 days.")
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment')
            ax.set_xlabel('Date')
            ax.set_ylabel('Sentiment Score')
            ax.set_title(f"{platform} Historical Sentiment for '{keyword}'")
            ax.legend()
            st.pyplot(fig)
            # Release the figure; otherwise every rerun leaves one more open.
            plt.close(fig)
            continue

        scaler = MinMaxScaler()
        daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])

        # LSTM Sequences: sliding windows of seq_length days, each paired with
        # the value of the following day.
        def create_sequences(data, seq_length):
            X, y = [], []
            for i in range(len(data) - seq_length):
                X.append(data[i:i + seq_length])
                y.append(data[i + seq_length])
            return np.array(X), np.array(y)

        seq_length = 7
        horizon = 30
        X, y = create_sequences(daily_sentiment['scaled_score'].values, seq_length)
        X = X.reshape((X.shape[0], X.shape[1], 1))

        # Train LSTM
        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
            Dropout(0.2),
            LSTM(25),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='mse')
        # With a single training sequence (exactly 8 days of data) a 20%
        # validation split would leave no training samples, so skip it then.
        model.fit(
            X,
            y,
            epochs=10,
            batch_size=32,
            validation_split=0.2 if len(X) > 1 else 0.0,
            verbose=0,
        )

        # Predict 30 Days: feed each prediction back in as the newest step.
        last_sequence = daily_sentiment['scaled_score'][-seq_length:].values.reshape((1, seq_length, 1))
        scaled_predictions = []
        for _ in range(horizon):
            pred = model.predict(last_sequence, verbose=0)
            scaled_predictions.append(pred[0][0])
            # Shift along the time axis only (axis 1), then overwrite the last step.
            last_sequence = np.roll(last_sequence, -1, axis=1)
            last_sequence[0, -1, 0] = pred[0][0]
        scaled_predictions = np.array(scaled_predictions, dtype=float)
        predictions = scaler.inverse_transform(scaled_predictions.reshape(-1, 1)).flatten()

        # Logistic Regression: probability that a day is "positive" (mean score > 0.5).
        X_lr = np.column_stack((daily_sentiment['scaled_score'], daily_sentiment['tweet_count']))
        y_lr = (daily_sentiment['combined_score'] > 0.5).astype(int)
        future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, horizon + 1)]
        # The classifier is trained on *scaled* scores, so the future features
        # must be the scaled forecasts, not the inverse-transformed ones.
        X_future = np.column_stack(
            (scaled_predictions, np.full(horizon, daily_sentiment['tweet_count'].mean()))
        )
        if y_lr.nunique() < 2:
            # LogisticRegression cannot be fitted on a single class. Report the
            # only observed class as a constant probability instead of crashing.
            lr_model = None
            st.info(
                f"{platform}: every historical day falls in the same sentiment class, "
                "so the logistic-regression step is skipped."
            )
            lr_predictions = np.full(horizon, float(y_lr.iloc[0]))
        else:
            lr_model = LogisticRegression()
            lr_model.fit(X_lr, y_lr)
            lr_predictions = lr_model.predict_proba(X_future)[:, 1]

        # SHAP Explainability
        st.subheader(f"{platform} SHAP Explainability")
        if lr_model is not None:
            explainer_lr = shap.LinearExplainer(lr_model, X_lr)
            shap_values_lr = explainer_lr.shap_values(X_lr)
            fig_lr, _ = plt.subplots()
            shap.summary_plot(shap_values_lr, X_lr, feature_names=['Sentiment Score', 'Count'], show=False)
            st.pyplot(fig_lr)
            plt.close(fig_lr)

        def lstm_predict(inputs):
            # KernelExplainer passes 2-D (samples, features) arrays; restore the
            # (samples, timesteps, 1) shape the LSTM expects and return a 1-D
            # output so SHAP treats the model as single-output.
            inputs = inputs.reshape((inputs.shape[0], seq_length, 1))
            return model.predict(inputs, verbose=0).flatten()

        # KernelExplainer works on 2-D data, so drop the trailing channel axis.
        X_shap = X[:50].reshape((-1, seq_length))
        explainer_lstm = shap.KernelExplainer(lstm_predict, X_shap)
        shap_values_lstm = explainer_lstm.shap_values(X_shap, nsamples=100)
        fig_lstm, _ = plt.subplots()
        shap.summary_plot(shap_values_lstm, X_shap, plot_type="bar", show=False)
        st.pyplot(fig_lstm)
        plt.close(fig_lstm)

        # Visualization: history and forecast on the left axis, positive
        # probability on a secondary right axis.
        st.subheader(f"{platform} 30-Day Sentiment Prediction")
        results_df = pd.DataFrame({
            'Date': future_dates,
            'Predicted Sentiment': predictions,
            'Positive Probability': lr_predictions
        })
        fig, ax1 = plt.subplots(figsize=(10, 6))
        ax1.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment')
        ax1.plot(results_df['Date'], results_df['Predicted Sentiment'], 'b-', label='Predicted Sentiment')
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Sentiment Score', color='b')
        ax2 = ax1.twinx()
        ax2.plot(results_df['Date'], results_df['Positive Probability'], 'r-', label='Positive Probability')
        ax2.set_ylabel('Positive Probability', color='r')
        fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
        plt.title(f"{platform} Sentiment Forecast for '{keyword}'")
        st.pyplot(fig)
        plt.close(fig)
# Sidebar Instructions
# The file names shown here must match what the app actually uses: the loader
# reads 'twitter_dataset.csv', and the script itself is app.py.
st.sidebar.write("1. Ensure 'twitter_dataset.csv' is in the folder.")
st.sidebar.write("2. Enter a keyword to analyze live Reddit/YouTube and Twitter dataset.")
st.sidebar.write("3. Run: `streamlit run app.py`")