import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import shap
import praw
from googleapiclient.discovery import build
import os
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
st.set_page_config(page_title="Sentiment Pulse", layout="wide")
st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)
# API credentials — read from the environment (e.g. Space secrets) so keys are never committed to source
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
REDDIT_USER_AGENT = "SoundaryaR_Bot/1.0"
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
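# Two complementary sentiment analyzers: DistilBERT (SST-2 fine-tune) returns a
# label plus a confidence score, while VADER returns a lexicon-based compound
# polarity in [-1, 1]; combined_sentiment() below averages the two signals.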
bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
vader_analyzer = SentimentIntensityAnalyzer()
@st.cache_data
def load_twitter_data():
    try:
        df = pd.read_csv("twitter_dataset.csv", encoding='latin-1',
                         names=['sentiment', 'id', 'date', 'query', 'user', 'text'])
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        # Sentiment140-style labels: 0 = negative, 4 = positive
        df['sentiment'] = df['sentiment'].map({0: 'negative', 4: 'positive'})
        return df.sample(10000, random_state=42)
    except FileNotFoundError:
        st.error("twitter_dataset.csv not found. Please ensure the file is in the working directory.")
        return pd.DataFrame()
def fetch_reddit_data(keyword):
    try:
        subreddit = reddit.subreddit("all")
        posts = subreddit.search(keyword, limit=100)
        return pd.DataFrame([{'date': datetime.fromtimestamp(post.created_utc),
                              'text': post.title + " " + post.selftext} for post in posts])
    except Exception as e:
        st.error(f"Error fetching Reddit data: {e}")
        return pd.DataFrame()
def fetch_youtube_data(keyword):
    try:
        request = youtube.search().list(q=keyword, part="snippet", maxResults=50, type="video")
        response = request.execute()
        return pd.DataFrame([{
            'date': datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ"),
            'text': item['snippet']['title'] + " " + item['snippet']['description']
        } for item in response['items']])
    except Exception as e:
        st.error(f"Error fetching YouTube data: {e}")
        return pd.DataFrame()
def get_bert_sentiment(text):
    try:
        # Truncate to 512 characters as a rough guard against the model's input limit
        result = bert_classifier(text[:512])[0]
        return 1 if result['label'] == 'POSITIVE' else 0, result['score']
    except Exception:
        # Fall back to a neutral guess if the pipeline fails on this text
        return 0, 0.5

def get_vader_sentiment(text):
    score = vader_analyzer.polarity_scores(text)['compound']
    return 1 if score > 0 else 0, score

def combined_sentiment(text):
    # Average BERT's confidence with the magnitude of VADER's compound polarity,
    # then threshold at 0.5 for the binary label
    bert_label, bert_score = get_bert_sentiment(text)
    vader_label, vader_score = get_vader_sentiment(text)
    avg_score = (bert_score + abs(vader_score)) / 2
    return 1 if avg_score > 0.5 else 0, avg_score
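
# Illustrative example (hypothetical values): for "I love this!", DistilBERT might
# return ('POSITIVE', 0.99) and VADER a compound of about 0.67, giving
# avg_score = (0.99 + 0.67) / 2 ≈ 0.83 -> label 1 (positive).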
st.sidebar.title("Keyword Search")
keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")
twitter_df = load_twitter_data()
# .copy() avoids SettingWithCopyWarning when sentiment columns are added later;
# the empty check guards against a missing dataset file
twitter_filtered = (twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)].copy()
                    if not twitter_df.empty else pd.DataFrame())
reddit_df = fetch_reddit_data(keyword)
youtube_df = fetch_youtube_data(keyword)
platforms = {'Twitter': twitter_filtered, 'Reddit': reddit_df, 'YouTube': youtube_df}
valid_platforms = {k: v for k, v in platforms.items() if not v.empty}
if not valid_platforms:
    st.error(f"No matching data found for '{keyword}' on Twitter, Reddit, or YouTube. Try a different keyword.")
else:
    for platform, df in valid_platforms.items():
        st.subheader(f"{platform} Analysis for '{keyword}'")
        st.write(f"{platform} Data Preview:", df.head())
        with st.spinner(f"Analyzing {platform} sentiments..."):
            df['bert_sentiment'], df['bert_score'] = zip(*df['text'].apply(get_bert_sentiment))
            df['vader_sentiment'], df['vader_score'] = zip(*df['text'].apply(get_vader_sentiment))
            df['combined_sentiment'], df['combined_score'] = zip(*df['text'].apply(combined_sentiment))
        st.write(df[['text', 'combined_sentiment', 'combined_score']].head())
        # Aggregate to a daily mean sentiment series with per-day post volumes
        daily_sentiment = df.groupby(df['date'].dt.date)['combined_score'].mean().reset_index()
        daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
        daily_sentiment['post_count'] = df.groupby(df['date'].dt.date).size().values
        if len(daily_sentiment) < 2:
            st.warning(f"Not enough {platform} data for prediction.")
            fig, ax = plt.subplots()
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
            ax.legend()
            st.pyplot(fig)
            continue
        scaler = MinMaxScaler()
        daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])
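        # MinMaxScaler maps each score s to (s - min) / (max - min); e.g. if daily
        # means span [0.40, 0.90], a score of 0.65 scales to 0.25 / 0.50 = 0.5.
        # (Numbers here are illustrative, not from real data.)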
        # Prepare features: the previous day's scaled score plus same-day post volume
        X = pd.DataFrame({
            'lag1_score': daily_sentiment['scaled_score'].shift(1),
            'post_count': daily_sentiment['post_count']
        }).dropna()
        y = daily_sentiment['scaled_score'][1:]  # align targets with the lagged features
        if len(X) < 5:  # minimum number of points for a meaningful train/test split
            st.warning(f"Not enough {platform} data points for prediction after lagging.")
            fig, ax = plt.subplots()
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], label='Historical')
            ax.legend()
            st.pyplot(fig)
            continue
        # Hold out 20% of the data; model weights below are computed from training error
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Logistic Regression: trained as a binary classifier (score above/below 0.5),
        # with predict_proba reused as a continuous sentiment estimate
        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train, (y_train > 0.5).astype(int))
        lr_pred_train = lr_model.predict_proba(X_train)[:, 1]
        lr_mse = mean_squared_error(y_train, lr_pred_train)
        # Random Forest regression on the continuous scaled score
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        rf_pred_train = rf_model.predict(X_train)
        rf_mse = mean_squared_error(y_train, rf_pred_train)
        # Weighted ensemble: each model gets 1 minus its share of the total MSE,
        # so the lower-error model receives the larger weight (weights sum to 1)
        total_mse = lr_mse + rf_mse
        lr_weight = (1 - lr_mse / total_mse) if total_mse > 0 else 0.5
        rf_weight = (1 - rf_mse / total_mse) if total_mse > 0 else 0.5
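        # Worked example with hypothetical errors: lr_mse = 0.04, rf_mse = 0.01 gives
        # total_mse = 0.05, lr_weight = 1 - 0.04/0.05 = 0.2 and
        # rf_weight = 1 - 0.01/0.05 = 0.8, so the Random Forest dominates the blend.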
        # Predict 30 days ahead recursively: each day's ensemble output becomes the
        # next day's lag feature; post_count is held at its last observed value
        last_data = X.iloc[-1:].copy()
        predictions = []
        future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)]
        for _ in range(30):
            lr_pred = lr_model.predict_proba(last_data)[:, 1][0]
            rf_pred = rf_model.predict(last_data)[0]
            ensemble_pred = lr_weight * lr_pred + rf_weight * rf_pred
            predictions.append(ensemble_pred)
            last_data['lag1_score'] = ensemble_pred  # feed the prediction back as the next lag
        # Map predictions back to the original combined-score scale
        predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
st.subheader(f"{platform} 30-Day Prediction (Ensemble: LR + RF)")
fig, ax = plt.subplots()
ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical')
ax.plot(future_dates, predictions, 'b--', label=f'Predicted (LR: {lr_weight:.2f}, RF: {rf_weight:.2f})')
ax.legend()
st.pyplot(fig)
st.subheader(f"{platform} Random Forest SHAP")
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, show=False)
st.pyplot(plt.gcf())