# Sentiment Pulse — multi-platform sentiment analysis Streamlit app.
# (Removed scraped page residue that preceded the source.)
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import praw
import shap
import streamlit as st
import tensorflow as tf
from googleapiclient.discovery import build
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

warnings.filterwarnings('ignore')
# ---------------------------------------------------------------------------
# Reproducibility, page setup, and API client construction.
# ---------------------------------------------------------------------------

# Set random seeds so the LSTM training / sampling is repeatable across runs.
np.random.seed(42)
tf.random.set_seed(42)

# Page Configuration
st.set_page_config(page_title="Sentiment Pulse", layout="wide")
st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)

# SECURITY FIX: credentials were hard-coded in source (a leaked secret).
# Read them from the environment instead; rotate any previously committed keys.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "SoundaryaR_Bot/1.0")
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")

# Initialize API clients and models (module-level singletons used below).
reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
vader_analyzer = SentimentIntensityAnalyzer()
# Load Twitter Dataset
def load_twitter_data(path="twitter_dataset.csv", sample_size=10000):
    """Load a Sentiment140-style Twitter CSV and return a random sample.

    Args:
        path: CSV file with no header; columns are
            sentiment, id, date, query, user, text.
        sample_size: upper bound on the number of rows returned.

    Returns:
        DataFrame with 'date' parsed to datetime (unparseable dates become
        NaT) and 'sentiment' mapped 0 -> 'negative', 4 -> 'positive'
        (any other code becomes NaN).
    """
    df = pd.read_csv(path, encoding='latin-1',
                     names=['sentiment', 'id', 'date', 'query', 'user', 'text'])
    # errors='coerce': one malformed timestamp must not abort the whole load.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['sentiment'] = df['sentiment'].map({0: 'negative', 4: 'positive'})
    # df.sample raises ValueError if asked for more rows than exist — clamp.
    return df.sample(n=min(sample_size, len(df)), random_state=42)
# Fetch Live Reddit Data
def fetch_reddit_data(keyword):
    """Search r/all for `keyword` and return up to 100 posts.

    Returns a DataFrame with 'date' (post creation time) and 'text'
    (title concatenated with the self-text body).
    """
    hits = reddit.subreddit("all").search(keyword, limit=100)
    records = [
        {
            'date': datetime.fromtimestamp(submission.created_utc),
            'text': submission.title + " " + submission.selftext,
        }
        for submission in hits
    ]
    return pd.DataFrame(records)
# Fetch Live YouTube Data
def fetch_youtube_data(keyword):
    """Search YouTube for `keyword` and return up to 50 videos.

    Returns a DataFrame with 'date' (publish time) and 'text'
    (title concatenated with the description).
    """
    response = youtube.search().list(
        q=keyword, part="snippet", maxResults=50, type="video"
    ).execute()
    records = []
    for item in response['items']:
        snippet = item['snippet']
        # publishedAt is RFC-3339 UTC, e.g. "2020-01-01T00:00:00Z".
        published = datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
        records.append({'date': published,
                        'text': snippet['title'] + " " + snippet['description']})
    return pd.DataFrame(records)
# Sentiment Analysis Functions
def get_bert_sentiment(text):
    """Classify `text` with the DistilBERT pipeline.

    Returns:
        (label, score): label is 1 for POSITIVE, 0 otherwise; score is the
        model's confidence for *its predicted label*, not P(positive).
        Falls back to (0, 0.5) if the pipeline raises, so that batch
        scoring of a whole column never aborts.
    """
    try:
        # Truncate to the model's 512-token input limit rather than error.
        result = bert_classifier(text[:512])[0]
        return (1 if result['label'] == 'POSITIVE' else 0), result['score']
    except Exception:  # was a bare `except:` — don't swallow KeyboardInterrupt/SystemExit
        return 0, 0.5
def get_vader_sentiment(text):
    """Score `text` with VADER.

    Returns:
        (label, compound): label is 1 when the compound score is strictly
        positive, else 0; compound is VADER's raw score in [-1, 1].
    """
    compound = vader_analyzer.polarity_scores(text)['compound']
    label = 1 if compound > 0 else 0
    return label, compound
def combined_sentiment(text):
    """Fuse BERT and VADER into a single positive-probability estimate.

    BERT's score is the confidence of *whichever* label it predicted, so it
    is first converted to P(positive); VADER's compound in [-1, 1] is mapped
    linearly to [0, 1]. (The previous code averaged raw confidence with
    |compound|, which could label a confidently-negative text "positive".)

    Returns:
        (label, score): label is 1 when the mean P(positive) exceeds 0.5,
        else 0; score is that mean, in [0, 1].
    """
    bert_label, bert_score = get_bert_sentiment(text)
    vader_label, vader_score = get_vader_sentiment(text)
    # Confidence-for-predicted-label -> probability of the POSITIVE class.
    bert_p_pos = bert_score if bert_label == 1 else 1.0 - bert_score
    # Compound [-1, 1] -> [0, 1].
    vader_p_pos = (vader_score + 1.0) / 2.0
    avg_score = (bert_p_pos + vader_p_pos) / 2
    return (1 if avg_score > 0.5 else 0), avg_score
# ---------------------------------------------------------------------------
# Main app flow: gather data per platform, score sentiment, forecast 30 days.
# ---------------------------------------------------------------------------

# Sidebar for Keyword Input
st.sidebar.title("Keyword Search")
keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")

# Collect one DataFrame per platform; each has 'date' and 'text' columns.
twitter_df = load_twitter_data()
# BUGFIX: .copy() — the sentiment columns assigned below must go into a real
# frame, not a view of twitter_df (avoids pandas SettingWithCopy behavior).
twitter_filtered = twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)].copy()
reddit_df = fetch_reddit_data(keyword)
youtube_df = fetch_youtube_data(keyword)

# Keep only platforms that actually returned data for this keyword.
platforms = {'Twitter': twitter_filtered, 'Reddit': reddit_df, 'YouTube': youtube_df}
valid_platforms = {k: v for k, v in platforms.items() if not v.empty}

if not valid_platforms:
    st.error(f"Error: '{keyword}' is not a valid keyword. No matching data found across Twitter, Reddit, or YouTube.")
else:
    for platform, df in valid_platforms.items():
        st.subheader(f"{platform} Analysis for '{keyword}'")
        if platform == 'Twitter':
            st.write(f"{platform} Dataset Preview:", df[['text', 'date']].head())
        else:
            st.write(f"{platform} Live Data Preview:", df.head())

        # Per-text sentiment from each model plus the combined estimate.
        with st.spinner(f"Analyzing {platform} sentiments..."):
            df['bert_sentiment'], df['bert_score'] = zip(*df['text'].apply(get_bert_sentiment))
            df['vader_sentiment'], df['vader_score'] = zip(*df['text'].apply(get_vader_sentiment))
            df['combined_sentiment'], df['combined_score'] = zip(*df['text'].apply(combined_sentiment))
        st.write(f"{platform} Sentiment Results:", df[['text', 'combined_sentiment', 'combined_score']].head())

        # Aggregate to one mean score and one post count per calendar day.
        daily_sentiment = df.groupby(df['date'].dt.date)['combined_score'].mean().reset_index()
        daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
        daily_sentiment['tweet_count'] = df.groupby(df['date'].dt.date).size().values

        if len(daily_sentiment) < 8:
            # Fewer than seq_length + 1 days: not even one LSTM training
            # window can be built, so only plot the history.
            st.warning(f"Not enough {platform} data for '{keyword}' to predict 30 days.")
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment')
            ax.set_xlabel('Date')
            ax.set_ylabel('Sentiment Score')
            ax.set_title(f"{platform} Historical Sentiment for '{keyword}'")
            ax.legend()
            st.pyplot(fig)
        else:
            scaler = MinMaxScaler()
            daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])

            def create_sequences(data, seq_length):
                # Sliding windows: each seq_length-day window predicts day +1.
                X, y = [], []
                for i in range(len(data) - seq_length):
                    X.append(data[i:i + seq_length])
                    y.append(data[i + seq_length])
                return np.array(X), np.array(y)

            seq_length = 7
            X, y = create_sequences(daily_sentiment['scaled_score'].values, seq_length)
            X = X.reshape((X.shape[0], X.shape[1], 1))

            # Small stacked LSTM; sigmoid output matches the [0, 1] scaled target.
            model = Sequential([
                LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
                Dropout(0.2),
                LSTM(25),
                Dropout(0.2),
                Dense(1, activation='sigmoid')
            ])
            model.compile(optimizer='adam', loss='mse')
            model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

            # Autoregressive 30-day rollout: feed each prediction back in.
            last_sequence = daily_sentiment['scaled_score'].values[-seq_length:].reshape((1, seq_length, 1))
            scaled_predictions = []
            for _ in range(30):
                pred = model.predict(last_sequence, verbose=0)
                scaled_predictions.append(pred[0][0])
                last_sequence = np.roll(last_sequence, -1)
                last_sequence[0, -1, 0] = pred[0][0]
            predictions = scaler.inverse_transform(np.array(scaled_predictions).reshape(-1, 1)).flatten()

            # Logistic regression: (scaled sentiment, daily volume) -> positive day.
            X_lr = np.column_stack((daily_sentiment['scaled_score'], daily_sentiment['tweet_count']))
            y_lr = (daily_sentiment['combined_score'] > 0.5).astype(int)
            future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)]
            if y_lr.nunique() < 2:
                # BUGFIX: LogisticRegression.fit raises on a single-class
                # target; when every day is on one side of 0.5, report that
                # constant class probability directly.
                lr_model = None
                lr_predictions = np.full(30, float(y_lr.iloc[0]))
            else:
                lr_model = LogisticRegression()
                lr_model.fit(X_lr, y_lr)
                # BUGFIX: inference features must be in the same (scaled)
                # space the model was trained on — the old code fed the
                # inverse-transformed predictions.
                X_future = np.column_stack((scaled_predictions, [daily_sentiment['tweet_count'].mean()] * 30))
                lr_predictions = lr_model.predict_proba(X_future)[:, 1]

            # SHAP Explainability
            st.subheader(f"{platform} SHAP Explainability")
            if lr_model is not None:
                explainer_lr = shap.LinearExplainer(lr_model, X_lr)
                shap_values_lr = explainer_lr.shap_values(X_lr)
                fig_lr, ax = plt.subplots()
                shap.summary_plot(shap_values_lr, X_lr, feature_names=['Sentiment Score', 'Count'], show=False)
                st.pyplot(fig_lr)

            def lstm_predict(inputs):
                # KernelExplainer hands us flat rows; restore the LSTM's 3-D shape.
                inputs = inputs.reshape((inputs.shape[0], seq_length, 1))
                return model.predict(inputs, verbose=0)

            # BUGFIX: KernelExplainer expects 2-D samples; flatten each
            # (seq_length, 1) window into a row (lstm_predict restores it).
            X_flat = X[:50].reshape((-1, seq_length))
            explainer_lstm = shap.KernelExplainer(lstm_predict, X_flat)
            shap_values_lstm = explainer_lstm.shap_values(X_flat, nsamples=100)
            fig_lstm, ax = plt.subplots()
            shap.summary_plot(shap_values_lstm, X_flat, plot_type="bar", show=False)
            st.pyplot(fig_lstm)

            # Visualization: history + forecast on one axis, probability on a twin.
            st.subheader(f"{platform} 30-Day Sentiment Prediction")
            results_df = pd.DataFrame({
                'Date': future_dates,
                'Predicted Sentiment': predictions,
                'Positive Probability': lr_predictions
            })
            fig, ax1 = plt.subplots(figsize=(10, 6))
            ax1.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment')
            ax1.plot(results_df['Date'], results_df['Predicted Sentiment'], 'b-', label='Predicted Sentiment')
            ax1.set_xlabel('Date')
            ax1.set_ylabel('Sentiment Score', color='b')
            ax2 = ax1.twinx()
            ax2.plot(results_df['Date'], results_df['Positive Probability'], 'r-', label='Positive Probability')
            ax2.set_ylabel('Positive Probability', color='r')
            fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
            plt.title(f"{platform} Sentiment Forecast for '{keyword}'")
            st.pyplot(fig)
# Sidebar Instructions (filename kept in sync with load_twitter_data's default).
st.sidebar.write("1. Ensure 'twitter_dataset.csv' is in the folder.")
st.sidebar.write("2. Enter a keyword to analyze live Reddit/YouTube and Twitter dataset.")
st.sidebar.write("3. Run: `streamlit run sentiment_app.py`")