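# NOTE: the six lines below are web-page residue captured together with the
# file; they are kept verbatim as comments so the module can be parsed.
# Soundaryasos's picture
# Update app.py
# 9b1852d verified
# raw
# history blame
# 7.09 kB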
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import praw
import shap
import streamlit as st
import tensorflow as tf
from googleapiclient.discovery import build
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Process-wide setup: silence warnings, fix random seeds, configure the page.

# Suppress every Python warning for the rest of the process.
warnings.filterwarnings('ignore')
# Seed the NumPy and TensorFlow random generators with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)
# Set the browser tab title and use the wide page layout.
# NOTE(review): Streamlit has historically required set_page_config to run
# before any other st.* call — confirm before moving this line.
st.set_page_config(page_title="Sentiment Pulse", layout="wide")
# Render the centered page heading as raw HTML.
st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)
# --- API credentials and shared clients --------------------------------------
# SECURITY: credentials must never live in source control. They are read from
# environment variables (e.g. the hosting platform's secret store). Any key
# that was ever committed to this file should be revoked and regenerated.


def _require_env(name):
    """Return the non-empty value of environment variable *name*.

    When the variable is missing or blank, an error is shown in the UI and
    the script run is halted, so no client is built with empty credentials.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        st.error(f"Missing required environment variable: {name}")
        st.stop()
    return value


@st.cache_resource
def _load_bert_classifier():
    """Build the DistilBERT sentiment pipeline once and reuse it across reruns."""
    return pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


REDDIT_CLIENT_ID = _require_env("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = _require_env("REDDIT_CLIENT_SECRET")
# The user agent is not a secret, so the previous value stays as the default.
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "SoundaryaR_Bot/1.0")
YOUTUBE_API_KEY = _require_env("YOUTUBE_API_KEY")

reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
# Loading the model is expensive; the cached loader avoids rebuilding it each
# time the script is re-executed.
bert_classifier = _load_bert_classifier()
vader_analyzer = SentimentIntensityAnalyzer()
@st.cache_data
def load_twitter_data():
    """Load the tweet CSV and return a random sample of labelled tweets.

    Reads ``twitter_dataset.csv`` (latin-1, no header row) into the columns
    ``sentiment, id, date, query, user, text``.

    Returns:
        A DataFrame of at most 10,000 randomly sampled rows in which ``date``
        is parsed with ``pd.to_datetime`` and ``sentiment`` is mapped from
        0 -> 'negative' and 4 -> 'positive' (any other code becomes NaN).
    """
    df = pd.read_csv(
        "twitter_dataset.csv",
        encoding='latin-1',
        names=['sentiment', 'id', 'date', 'query', 'user', 'text'],
    )
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request at what the file actually contains.
    sampled = df.sample(min(10000, len(df)))
    # Parse dates and map labels only for the sampled rows: same result as
    # converting the whole file first, without the full-file parsing cost.
    return sampled.assign(
        date=pd.to_datetime(sampled['date']),
        sentiment=sampled['sentiment'].map({0: 'negative', 4: 'positive'}),
    )
def fetch_reddit_data(keyword):
    """Search r/all for *keyword* and return the matching posts.

    Args:
        keyword: Search query passed to the subreddit search (up to 100 hits).

    Returns:
        A DataFrame with columns ``date`` and ``text`` (title, a space, then
        the self-text). The columns are present even when nothing matched.
    """
    posts = reddit.subreddit("all").search(keyword, limit=100)
    rows = [
        {
            # created_utc is an epoch in seconds (UTC per its name / PRAW docs).
            # Converting with unit='s' gives a naive UTC timestamp that does
            # not depend on the server's local timezone, unlike
            # datetime.fromtimestamp().
            'date': pd.to_datetime(post.created_utc, unit='s'),
            'text': post.title + " " + post.selftext,
        }
        for post in posts
    ]
    # Explicit columns keep the frame's schema stable for empty results.
    return pd.DataFrame(rows, columns=['date', 'text'])
def fetch_youtube_data(keyword):
    """Search YouTube videos for *keyword* and return their snippets.

    Args:
        keyword: Search query (up to 50 video results are requested).

    Returns:
        A DataFrame with columns ``date`` (naive UTC publish time) and
        ``text`` (title, a space, then the description). The columns are
        present even when the response contains no items.
    """
    request = youtube.search().list(q=keyword, part="snippet", maxResults=50, type="video")
    response = request.execute()
    rows = []
    # .get() tolerates a response without an 'items' key.
    for item in response.get('items', []):
        snippet = item['snippet']
        # Parse the timestamp leniently: a fixed strptime pattern fails on
        # variants such as fractional seconds. utc=True + tz_convert(None)
        # yields a naive UTC timestamp.
        published = pd.to_datetime(snippet['publishedAt'], utc=True).tz_convert(None)
        rows.append({
            'date': published,
            'text': snippet['title'] + " " + snippet['description'],
        })
    return pd.DataFrame(rows, columns=['date', 'text'])
def get_bert_sentiment(text):
    """Classify *text* with the module-level BERT sentiment pipeline.

    Args:
        text: The text to classify; only its first 512 characters are used.

    Returns:
        ``(label, score)`` where ``label`` is 1 when the pipeline predicts
        'POSITIVE' and 0 otherwise, and ``score`` is the pipeline's score for
        the label it predicted. When the input is not a string or the
        classifier fails, the neutral fallback ``(0, 0.5)`` is returned.
    """
    neutral = (0, 0.5)
    # Missing values (None, NaN) cannot be sliced or classified.
    if not isinstance(text, str):
        return neutral
    try:
        result = bert_classifier(text[:512])[0]
        label = 1 if result['label'] == 'POSITIVE' else 0
        score = result['score']
    except Exception:
        # Best-effort: one bad text must not abort a whole-column apply().
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return neutral
    return label, score
def get_vader_sentiment(text):
    """Score *text* with the module-level VADER analyzer.

    Returns:
        ``(label, compound)`` where ``compound`` is the analyzer's 'compound'
        polarity score and ``label`` is 1 when that score is above zero,
        otherwise 0.
    """
    compound = vader_analyzer.polarity_scores(text)['compound']
    if compound > 0:
        return 1, compound
    return 0, compound
def combined_sentiment(text):
    """Blend the BERT and VADER results for *text* into one positivity score.

    Both signals are first expressed as "how positive is this text" on a
    0..1 scale, then averaged. Averaging the raw BERT confidence with
    ``abs(vader_score)`` would discard polarity, so a confidently negative
    text would score high and be labelled positive.

    Returns:
        ``(label, score)`` where ``score`` is the averaged positivity and
        ``label`` is 1 when ``score`` is above 0.5, otherwise 0.
    """
    bert_label, bert_score = get_bert_sentiment(text)
    _, vader_score = get_vader_sentiment(text)
    # get_bert_sentiment returns the score of the *predicted* label; flip it
    # for negative predictions to obtain a positive-class probability.
    bert_positive = bert_score if bert_label == 1 else 1.0 - bert_score
    # VADER's compound score is documented as lying in [-1, 1]; rescale it to
    # [0, 1] so 0.5 means neutral on both scales.
    vader_positive = (vader_score + 1.0) / 2.0
    avg_score = (bert_positive + vader_positive) / 2
    return (1 if avg_score > 0.5 else 0), avg_score
# --- Keyword input and per-platform data collection --------------------------


def _fetch_or_empty(platform_name, fetcher, query):
    """Call ``fetcher(query)``; on failure warn in the UI and return an empty frame.

    This is the UI boundary for remote calls: a failure on one platform
    should not prevent the others from being analyzed.
    """
    try:
        return fetcher(query)
    except Exception as exc:
        # Only the exception type is shown: API error messages can embed the
        # request URL, which may contain the API key.
        st.warning(f"Could not fetch {platform_name} data ({type(exc).__name__}).")
        return pd.DataFrame(columns=['date', 'text'])


st.sidebar.title("Keyword Search")
keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy").strip()
if not keyword:
    # An empty pattern would match every tweet and is not a meaningful query.
    st.warning("Please enter a keyword to analyze.")
    st.stop()

twitter_df = load_twitter_data()
# regex=False: the keyword is user input and must be matched literally;
# characters such as "(" or "*" would otherwise raise or change the match.
# .copy() so later column assignments do not act on a slice of the loaded frame.
twitter_filtered = twitter_df[
    twitter_df['text'].str.contains(keyword, case=False, na=False, regex=False)
].copy()
reddit_df = _fetch_or_empty('Reddit', fetch_reddit_data, keyword)
youtube_df = _fetch_or_empty('YouTube', fetch_youtube_data, keyword)

platforms = {'Twitter': twitter_filtered, 'Reddit': reddit_df, 'YouTube': youtube_df}
# Keep only the platforms that returned at least one row.
valid_platforms = {k: v for k, v in platforms.items() if not v.empty}
# --- Per-platform sentiment analysis, 30-day forecast and SHAP explanation ---

# Number of past days fed to the LSTM for each prediction.
seq_length = 7
# Number of future days to forecast.
forecast_days = 30
# model.fit(..., validation_split=0.2) needs at least two training windows to
# split off a validation set, i.e. at least seq_length + 2 daily points.
min_days_for_forecast = seq_length + 2


def create_sequences(data, seq_length):
    """Build sliding windows over *data* for next-step prediction.

    Returns:
        ``(X, y)`` as NumPy arrays: ``X[i]`` is ``data[i:i + seq_length]``
        and ``y[i]`` is the value that immediately follows that window.
    """
    n_windows = len(data) - seq_length
    X = [data[i:i + seq_length] for i in range(n_windows)]
    y = [data[i + seq_length] for i in range(n_windows)]
    return np.array(X), np.array(y)


if not valid_platforms:
    st.error(f"Error: '{keyword}' is not a valid keyword. No matching data found across Twitter, Reddit, or YouTube.")
else:
    for platform, df in valid_platforms.items():
        # Work on a private copy so the new columns never land on a slice or
        # on a frame shared with another part of the app.
        df = df.copy()

        st.subheader(f"{platform} Analysis for '{keyword}'")
        st.write(f"{platform} Data Preview:", df.head())

        with st.spinner(f"Analyzing {platform} sentiments..."):
            # Each helper returns a (label, score) pair per text; zip(*...)
            # splits the pairs into two columns.
            df['bert_sentiment'], df['bert_score'] = zip(*df['text'].apply(get_bert_sentiment))
            df['vader_sentiment'], df['vader_score'] = zip(*df['text'].apply(get_vader_sentiment))
            df['combined_sentiment'], df['combined_score'] = zip(*df['text'].apply(combined_sentiment))
        st.write(df[['text', 'combined_sentiment', 'combined_score']].head())

        # Aggregate to one row per calendar day: mean score and item count.
        by_day = df.groupby(df['date'].dt.date)
        daily_sentiment = by_day['combined_score'].mean().reset_index()
        daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
        daily_sentiment['tweet_count'] = by_day.size().values

        if len(daily_sentiment) < min_days_for_forecast:
            st.warning(f"Not enough {platform} data for prediction.")
            fig, ax = plt.subplots()
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
            st.pyplot(fig)
            # Release the figure; otherwise figures accumulate across reruns.
            plt.close(fig)
            continue

        # Scale daily scores to [0, 1] to match the sigmoid output layer.
        scaler = MinMaxScaler()
        daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']]).ravel()

        X, y = create_sequences(daily_sentiment['scaled_score'].values, seq_length)
        X = X.reshape((X.shape[0], X.shape[1], 1))

        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
            Dropout(0.2),
            LSTM(25),
            Dropout(0.2),
            Dense(1, activation='sigmoid'),
        ])
        model.compile(optimizer='adam', loss='mse')
        model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

        # Roll the forecast forward one day at a time, feeding each
        # prediction back in as the newest element of the input window.
        last_seq = daily_sentiment['scaled_score'].values[-seq_length:].reshape((1, seq_length, 1))
        scaled_predictions = []
        for _ in range(forecast_days):
            pred = model.predict(last_seq, verbose=0)
            next_value = float(pred[0][0])
            scaled_predictions.append(next_value)
            # np.roll returns a new array, so the DataFrame data is untouched.
            last_seq = np.roll(last_seq, -1, axis=1)
            last_seq[0, -1, 0] = next_value
        scaled_predictions = np.array(scaled_predictions)
        # Back to the original score scale for display.
        predictions = scaler.inverse_transform(scaled_predictions.reshape(-1, 1)).flatten()

        future_dates = [
            daily_sentiment['date'].iloc[-1] + timedelta(days=i)
            for i in range(1, forecast_days + 1)
        ]

        # Logistic regression: is a day "positive" (mean score above 0.5)
        # given its scaled score and its item count?
        X_lr = np.column_stack((daily_sentiment['scaled_score'], daily_sentiment['tweet_count']))
        y_lr = (daily_sentiment['combined_score'] > 0.5).astype(int)
        lr_model = None
        lr_predictions = None
        # LogisticRegression cannot be fitted when every day has the same label.
        if y_lr.nunique() >= 2:
            lr_model = LogisticRegression().fit(X_lr, y_lr)
            # The model was trained on *scaled* scores, so the future features
            # must use the scaled forecast, not the inverse-transformed one.
            X_future = np.column_stack((
                scaled_predictions,
                np.full(forecast_days, daily_sentiment['tweet_count'].mean()),
            ))
            lr_predictions = lr_model.predict_proba(X_future)[:, 1]

        st.subheader(f"{platform} 30-Day Prediction")
        fig, ax = plt.subplots()
        ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical')
        ax.plot(future_dates, predictions, 'b--', label='Predicted')
        if lr_predictions is not None:
            ax.plot(future_dates, lr_predictions, 'r:', label='P(positive day), logistic regression')
        ax.legend()
        st.pyplot(fig)
        plt.close(fig)

        st.subheader(f"{platform} Logistic Regression SHAP")
        if lr_model is None:
            st.info(f"All {platform} days fall on the same side of the 0.5 threshold; skipping logistic regression and SHAP.")
            continue
        explainer = shap.Explainer(lr_model, X_lr)
        shap_values = explainer(X_lr)
        # Give SHAP its own figure so it does not draw over the forecast plot.
        shap_fig = plt.figure()
        shap.plots.beeswarm(shap_values, show=False)
        st.pyplot(shap_fig)
        plt.close(shap_fig)