import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import shap
import praw
from googleapiclient.discovery import build
import os
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
st.set_page_config(page_title="Sentiment Pulse", layout="wide")
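# Pipeline: load/fetch keyword-matched posts from Twitter (CSV), Reddit, and
# YouTube; score each text with DistilBERT + VADER; aggregate to daily mean
# sentiment; forecast 30 days ahead with a weighted LR/RF ensemble; and
# explain the random forest with SHAP.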
st.markdown("
Sentiment Pulse: Multi-Platform Analysis
", unsafe_allow_html=True)
# API credentials: read from environment variables so real keys are never
# hardcoded or committed to source control.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "SoundaryaR_Bot/1.0")
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
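# Two sentiment models are loaded up front: a DistilBERT pipeline fine-tuned
# on SST-2 (transformer-based) and VADER (lexicon/rule-based).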
bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
vader_analyzer = SentimentIntensityAnalyzer()
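# The Twitter source is a static CSV with Sentiment140-style labels
# (0 = negative, 4 = positive); @st.cache_data keeps it from being re-read
# on every Streamlit rerun.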
@st.cache_data
def load_twitter_data():
    """Load the local Twitter CSV and sample up to 10k rows for speed."""
    try:
        df = pd.read_csv("twitter_dataset.csv", encoding='latin-1',
                         names=['sentiment', 'id', 'date', 'query', 'user', 'text'])
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df['sentiment'] = df['sentiment'].map({0: 'negative', 4: 'positive'})
        return df.sample(min(10000, len(df)), random_state=42)
    except FileNotFoundError:
        st.error("twitter_dataset.csv not found. Please ensure the file is in the working directory.")
        return pd.DataFrame()
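# Reddit and YouTube are queried live. Both fetchers fail soft: on any API
# error they surface the message in the UI and return an empty DataFrame so
# the rest of the app keeps running.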
def fetch_reddit_data(keyword):
    """Search all of Reddit for the keyword and return dated post text."""
    try:
        subreddit = reddit.subreddit("all")
        posts = subreddit.search(keyword, limit=100)
        return pd.DataFrame([{'date': datetime.fromtimestamp(post.created_utc),
                              'text': post.title + " " + post.selftext}
                             for post in posts])
    except Exception as e:
        st.error(f"Error fetching Reddit data: {e}")
        return pd.DataFrame()
def fetch_youtube_data(keyword):
    """Search YouTube videos for the keyword and return dated title + description text."""
    try:
        request = youtube.search().list(q=keyword, part="snippet", maxResults=50, type="video")
        response = request.execute()
        return pd.DataFrame([{
            'date': datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ"),
            'text': item['snippet']['title'] + " " + item['snippet']['description']
        } for item in response['items']])
    except Exception as e:
        st.error(f"Error fetching YouTube data: {e}")
        return pd.DataFrame()
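# Scoring helpers: each returns a (label, score) pair where label is 1 for
# positive and 0 for negative. The combined scorer blends BERT's confidence
# with the magnitude of VADER's compound score.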
def get_bert_sentiment(text):
    try:
        # Slice to 512 characters as a cheap guard against the model's
        # 512-token input limit
        result = bert_classifier(text[:512])[0]
        return 1 if result['label'] == 'POSITIVE' else 0, result['score']
    except Exception:
        # Fall back to negative with a neutral confidence if inference fails
        return 0, 0.5
def get_vader_sentiment(text):
    # VADER's compound score ranges from -1 (most negative) to +1 (most positive)
    score = vader_analyzer.polarity_scores(text)['compound']
    return 1 if score > 0 else 0, score
def combined_sentiment(text):
    bert_label, bert_score = get_bert_sentiment(text)
    vader_label, vader_score = get_vader_sentiment(text)
    # Average BERT's confidence with the magnitude of VADER's compound score
    avg_score = (bert_score + abs(vader_score)) / 2
    return 1 if avg_score > 0.5 else 0, avg_score
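# App body: filter/fetch data for the chosen keyword, then give each platform
# with matching data its own analysis, forecast, and SHAP section.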
st.sidebar.title("Keyword Search")
keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")
twitter_df = load_twitter_data()
twitter_filtered = (twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)]
                    if not twitter_df.empty else pd.DataFrame())
reddit_df = fetch_reddit_data(keyword)
youtube_df = fetch_youtube_data(keyword)
platforms = {'Twitter': twitter_filtered, 'Reddit': reddit_df, 'YouTube': youtube_df}
valid_platforms = {k: v for k, v in platforms.items() if not v.empty}
if not valid_platforms:
    st.error(f"No data matching '{keyword}' was found on Twitter, Reddit, or YouTube. Try another keyword.")
else:
    for platform, df in valid_platforms.items():
        st.subheader(f"{platform} Analysis for '{keyword}'")
        st.write(f"{platform} Data Preview:", df.head())
        with st.spinner(f"Analyzing {platform} sentiments..."):
            df['bert_sentiment'], df['bert_score'] = zip(*df['text'].apply(get_bert_sentiment))
            df['vader_sentiment'], df['vader_score'] = zip(*df['text'].apply(get_vader_sentiment))
            df['combined_sentiment'], df['combined_score'] = zip(*df['text'].apply(combined_sentiment))
        st.write(df[['text', 'combined_sentiment', 'combined_score']].head())
        # Aggregate to one mean sentiment score (and post count) per calendar day
        daily_sentiment = df.groupby(df['date'].dt.date)['combined_score'].mean().reset_index()
        daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
        daily_sentiment['post_count'] = df.groupby(df['date'].dt.date).size().values
        if len(daily_sentiment) < 2:
            st.warning(f"Not enough {platform} data for prediction.")
            fig, ax = plt.subplots()
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
            ax.legend()
            st.pyplot(fig)
            continue
        # Scale daily scores to [0, 1] so both models see the same target range
        scaler = MinMaxScaler()
        daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])
        # Prepare features: previous day's scaled score plus that day's post volume
        X = pd.DataFrame({
            'lag1_score': daily_sentiment['scaled_score'].shift(1),
            'post_count': daily_sentiment['post_count']
        }).dropna()
        y = daily_sentiment['scaled_score'][1:]  # align targets with the lagged features
        if len(X) < 5:  # minimum data for a meaningful train/test split
            st.warning(f"Not enough {platform} data points for prediction after lagging.")
            fig, ax = plt.subplots()
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
            ax.legend()
            st.pyplot(fig)
            continue
        # Split data for validation (the held-out portion is reserved; the
        # ensemble weights below are computed on training error)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Logistic regression: trained on binarized targets; its positive-class
        # probability serves as a continuous sentiment estimate
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, (y_train > 0.5).astype(int))
        lr_pred_train = lr_model.predict_proba(X_train)[:, 1]
        lr_mse = mean_squared_error(y_train, lr_pred_train)
        # Random forest: direct regression on the scaled scores
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        rf_pred_train = rf_model.predict(X_train)
        rf_mse = mean_squared_error(y_train, rf_pred_train)
        # Ensemble weights: each model gets weight proportional to the *other*
        # model's share of the total MSE, so the two weights sum to 1
        total_mse = lr_mse + rf_mse
        lr_weight = (1 - lr_mse / total_mse) if total_mse > 0 else 0.5
        rf_weight = (1 - rf_mse / total_mse) if total_mse > 0 else 0.5
        # Recursive 30-day forecast: each ensemble prediction becomes the next
        # day's lag feature; post_count is held at its last observed value
        last_data = X.iloc[-1:].copy()
        predictions = []
        future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)]
        for _ in range(30):
            lr_pred = lr_model.predict_proba(last_data)[:, 1][0]
            rf_pred = rf_model.predict(last_data)[0]
            ensemble_pred = lr_weight * lr_pred + rf_weight * rf_pred
            predictions.append(ensemble_pred)
            last_data['lag1_score'] = ensemble_pred  # feed the prediction back in as the next lag
        # Map the scaled predictions back to the original score range
        predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
st.subheader(f"{platform} 30-Day Prediction (Ensemble: LR + RF)")
fig, ax = plt.subplots()
ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical')
ax.plot(future_dates, predictions, 'b--', label=f'Predicted (LR: {lr_weight:.2f}, RF: {rf_weight:.2f})')
ax.legend()
st.pyplot(fig)
st.subheader(f"{platform} Random Forest SHAP")
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, show=False)
st.pyplot(plt.gcf())