Update app.py

app.py CHANGED
@@ -2,454 +2,218 @@ import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
- import seaborn as sns
- import nltk
- from textblob import TextBlob
- from wordcloud import WordCloud, STOPWORDS
- import plotly.express as px
- import plotly.graph_objects as go
- from plotly.subplots import make_subplots
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
-
-
- from
- from sklearn.preprocessing import LabelEncoder, MinMaxScaler
- from sklearn.metrics import mean_squared_error, r2_score
- from io import BytesIO
- import base64
- import re
- import json
- import altair as alt
- import time
- import requests
- from PIL import Image
- from collections import Counter
- import spacy
- import emoji
import warnings
warnings.filterwarnings('ignore')

- # Load spaCy model
- try:
-     nlp = spacy.load("en_core_web_sm")
- except:
-     st.warning("Installing spaCy model. This might take a minute...")
-     import subprocess
-     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], capture_output=True)
-     nlp = spacy.load("en_core_web_sm")
-
- # Ensure necessary NLTK data is available
- nltk.download('punkt', quiet=True)
- nltk.download('stopwords', quiet=True)
- nltk.download('wordnet', quiet=True)
- nltk.download('vader_lexicon', quiet=True)

# Page Configuration
- st.set_page_config(
- [old lines 50-132 are not rendered in the diff view: the remaining st.set_page_config() arguments and the opening of a custom CSS <style> block injected via st.markdown]
- /* Custom tab styling */
- .stTabs [data-baseweb="tab-list"] {
-     gap: 8px;
- }
-
- .stTabs [data-baseweb="tab"] {
-     background-color: transparent;
-     border-radius: 4px 4px 0px 0px;
-     border: none;
-     color: var(--text);
-     padding: 10px 16px;
- }
-
- .stTabs [aria-selected="true"] {
-     background-color: white !important;
-     color: var(--primary) !important;
-     font-weight: bold;
-     border-top: 2px solid var(--primary);
- }
-
- /* Tooltip */
- .tooltip {
-     position: relative;
-     display: inline-block;
-     border-bottom: 1px dotted black;
- }
-
- .tooltip .tooltiptext {
-     visibility: hidden;
-     width: 200px;
-     background-color: #555;
-     color: #fff;
-     text-align: center;
-     border-radius: 6px;
-     padding: 5px;
-     position: absolute;
-     z-index: 1;
-     bottom: 125%;
-     left: 50%;
-     margin-left: -100px;
-     opacity: 0;
-     transition: opacity 0.3s;
- }
-
- .tooltip:hover .tooltiptext {
-     visibility: visible;
-     opacity: 1;
- }
- </style>
- """, unsafe_allow_html=True)
-
- # ===== UTILITY FUNCTIONS =====
- def clean_text(text):
-     """Clean and preprocess text for analysis"""
-     if not isinstance(text, str):
-         return ""
-
-     # Convert to lowercase
-     text = text.lower()
-
-     # Remove URLs
-     text = re.sub(r'https?://\S+|www\.\S+', '', text)
-
-     # Remove mentions and hashtags for analysis
-     text = re.sub(r'@\w+|#\w+', '', text)
-
-     # Remove punctuation and special characters
-     text = re.sub(r'[^\w\s]', '', text)
-
-     # Remove extra whitespace
-     text = re.sub(r'\s+', ' ', text).strip()
-
-     return text
-
- def extract_hashtags(text):
-     """Extract hashtags from text"""
-     if not isinstance(text, str):
-         return []
-     return re.findall(r'#(\w+)', text)
-
- def extract_mentions(text):
-     """Extract mentions from text"""
-     if not isinstance(text, str):
-         return []
-     return re.findall(r'@(\w+)', text)
-
- def count_emojis(text):
-     """Count emojis in text"""
-     if not isinstance(text, str):
-         return 0
-     return len([c for c in text if c in emoji.EMOJI_DATA])
-
- def get_emoji_sentiment(text):
-     """Get sentiment of emojis in text"""
-     if not isinstance(text, str):
-         return 0
-
-     # Simple dictionary of emoji sentiment (expand as needed)
-     emoji_sentiment = {
-         '😊': 1, '😃': 1, '😄': 1, '😁': 1, '😍': 1,
-         '😢': -1, '😭': -1, '😡': -1, '😠': -1, '😞': -1
-     }
-
-     sentiment = 0
-     for char in text:
-         if char in emoji_sentiment:
-             sentiment += emoji_sentiment[char]
-
-     return sentiment
-
- def generate_wordcloud(text, mask=None, background_color='white'):
-     """Generate word cloud from text"""
-     if not text or not isinstance(text, str):
-         return None
-
-     stopwords = set(STOPWORDS)
-     # Add custom stopwords
-     custom_stopwords = {'the', 'and', 'to', 'of', 'a', 'in', 'is', 'that', 'it', 'was'}
-     stopwords.update(custom_stopwords)
-
-     wordcloud = WordCloud(
-         width=800,
-         height=400,
-         background_color=background_color,
-         stopwords=stopwords,
-         max_words=150,
-         colormap='viridis',
-         contour_width=3,
-         contour_color='steelblue',
-         collocations=False
-     ).generate(text)
-
-     return wordcloud
-
- def get_entity_analysis(text):
-     """Extract named entities from text using spaCy"""
-     if not text or not isinstance(text, str):
-         return {}
-
-     doc = nlp(text)
-     entities = {}
-
-     for ent in doc.ents:
-         if ent.label_ not in entities:
-             entities[ent.label_] = []
-         entities[ent.label_].append(ent.text)
-
-     return entities
-
- def extract_keywords(text, top_n=10):
-     """Extract keywords from text using spaCy"""
-     if not text or not isinstance(text, str):
-         return []
-
-     doc = nlp(text)
-     keywords = []
-
-     for token in doc:
-         if (not token.is_stop and
-             not token.is_punct and
-             token.pos_ in ('NOUN', 'PROPN', 'ADJ') and
-             len(token.text) > 1):
-             keywords.append(token.text.lower())
-
-     # Count and get top keywords
-     keyword_counts = Counter(keywords)
-     return keyword_counts.most_common(top_n)
-
- def analyze_tone(text):
-     """Analyze the tone of text"""
-     if not text or not isinstance(text, str):
-         return "Neutral"
-
-     # Use TextBlob for sentiment
-     blob = TextBlob(text)
-     polarity = blob.sentiment.polarity
-     subjectivity = blob.sentiment.subjectivity
-
-     # Tone categories
-     if polarity > 0.5:
-         if subjectivity > 0.7:
-             return "Enthusiastic"
-         else:
-             return "Positive"
-     elif polarity > 0.1:
-         if subjectivity > 0.7:
-             return "Interested"
-         else:
-             return "Somewhat Positive"
-     elif polarity < -0.5:
-         if subjectivity > 0.7:
-             return "Angry"
        else:
- [old lines 326-329 are not rendered in the diff view]
    else:
- [old lines 331-417 are not rendered in the diff view]
-         return "😐"
-     elif score > -0.25:
-         return "😕"
-     elif score > -0.5:
-         return "😟"
-     elif score > -0.75:
-         return "😞"
-     else:
-         return "😡"
-
- def download_as_file(object_to_download, download_filename, button_text, pickle_it=False):
-     """
-     Generates a link to download the given object_to_download.
-
-     Args:
-         object_to_download: The object to be downloaded.
-         download_filename: Filename that the object will be saved as.
-         button_text: Text to display on the download button.
-         pickle_it: If True, pickle file.
-     """
-     if pickle_it:
-         try:
-             object_to_download = pickle.dumps(object_to_download)
-         except pickle.PicklingError:
-             return None
-
-     # Convert to bytes
-     if isinstance(object_to_download, bytes):
-         pass
-     elif isinstance(object_to_download, pd.DataFrame):
-         object_to_download = object_to_download.to_csv(index=False).encode()
-     # Add other data types as needed
-     else:
-         object_to_download = str(object_to_download).encode()
-
-     # Generate download button
-     b64 = base64.b64encode(object_to_download).decode()
-     button_uuid = str(hash(button_text))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
+ from sklearn.preprocessing import MinMaxScaler
+ from sklearn.linear_model import LogisticRegression
+ import tensorflow as tf
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import LSTM, Dense, Dropout
+ from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ import shap
+ import praw
+ from googleapiclient.discovery import build
import warnings
warnings.filterwarnings('ignore')

+ # Set random seeds
+ np.random.seed(42)
+ tf.random.set_seed(42)

# Page Configuration
+ st.set_page_config(page_title="Sentiment Pulse", layout="wide")
+ st.markdown("<h1 style='text-align: center; color: #7B68EE;'>Sentiment Pulse: Multi-Platform Analysis</h1>", unsafe_allow_html=True)
+
+ # API Credentials (replace with your own)
+ REDDIT_CLIENT_ID = "S7pTXhj5JDFGDb3-_zrJEA"
+ REDDIT_CLIENT_SECRET = "QP3NYN4lrAKVLrBamzLGrpFywiVg8w"
+ REDDIT_USER_AGENT = "SoundaryaR_Bot/1.0"
+ YOUTUBE_API_KEY = "AIzaSyAChqXPaiNE9hKhApkgjgonzdgiCCOo"
+
+ # Initialize APIs
+ reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)
+ youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
+ bert_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+ vader_analyzer = SentimentIntensityAnalyzer()
+
+ # Load Twitter Dataset
+ @st.cache_data
+ def load_twitter_data():
+     df = pd.read_csv("twitter_dataset.csv", encoding='latin-1',
+                      names=['sentiment', 'id', 'date', 'query', 'user', 'text'])
+     df['date'] = pd.to_datetime(df['date'])
+     df['sentiment'] = df['sentiment'].map({0: 'negative', 4: 'positive'})
+     return df.sample(10000)
+
+ # Fetch Live Reddit Data
+ def fetch_reddit_data(keyword):
+     subreddit = reddit.subreddit("all")
+     posts = subreddit.search(keyword, limit=100)
+     data = []
+     for post in posts:
+         data.append({'date': datetime.fromtimestamp(post.created_utc), 'text': post.title + " " + post.selftext})
+     return pd.DataFrame(data)
+
+ # Fetch Live YouTube Data
+ def fetch_youtube_data(keyword):
+     request = youtube.search().list(q=keyword, part="snippet", maxResults=50, type="video")
+     response = request.execute()
+     data = []
+     for item in response['items']:
+         title = item['snippet']['title']
+         description = item['snippet']['description']
+         published_at = datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
+         data.append({'date': published_at, 'text': title + " " + description})
+     return pd.DataFrame(data)
+
+ # Sentiment Analysis Functions
+ def get_bert_sentiment(text):
+     try:
+         result = bert_classifier(text[:512])[0]
+         return 1 if result['label'] == 'POSITIVE' else 0, result['score']
+     except:
+         return 0, 0.5
+
+ def get_vader_sentiment(text):
+     score = vader_analyzer.polarity_scores(text)['compound']
+     return 1 if score > 0 else 0, score
+
+ def combined_sentiment(text):
+     bert_label, bert_score = get_bert_sentiment(text)
+     vader_label, vader_score = get_vader_sentiment(text)
+     avg_score = (bert_score + abs(vader_score)) / 2
+     return 1 if avg_score > 0.5 else 0, avg_score
+
+ # Sidebar for Keyword Input
+ st.sidebar.title("Keyword Search")
+ keyword = st.sidebar.text_input("Enter a keyword (e.g., 'happy')", value="happy")
+
+ # Process Data
+ twitter_df = load_twitter_data()
+ twitter_filtered = twitter_df[twitter_df['text'].str.contains(keyword, case=False, na=False)]
+ reddit_df = fetch_reddit_data(keyword)
+ youtube_df = fetch_youtube_data(keyword)
+
+ # Check Validity
+ platforms = {'Twitter': twitter_filtered, 'Reddit': reddit_df, 'YouTube': youtube_df}
+ valid_platforms = {k: v for k, v in platforms.items() if not v.empty}
+
+ if not valid_platforms:
+     st.error(f"Error: '{keyword}' is not a valid keyword. No matching data found across Twitter, Reddit, or YouTube.")
+ else:
+     for platform, df in valid_platforms.items():
+         st.subheader(f"{platform} Analysis for '{keyword}'")
+         if platform == 'Twitter':
+             st.write(f"{platform} Dataset Preview:", df[['text', 'date']].head())
        else:
+             st.write(f"{platform} Live Data Preview:", df.head())
+
+         # Sentiment Analysis
+         with st.spinner(f"Analyzing {platform} sentiments..."):
+             df['bert_sentiment'], df['bert_score'] = zip(*df['text'].apply(get_bert_sentiment))
+             df['vader_sentiment'], df['vader_score'] = zip(*df['text'].apply(get_vader_sentiment))
+             df['combined_sentiment'], df['combined_score'] = zip(*df['text'].apply(combined_sentiment))
+         st.write(f"{platform} Sentiment Results:", df[['text', 'combined_sentiment', 'combined_score']].head())
+
+         # Time-Series Preparation
+         daily_sentiment = df.groupby(df['date'].dt.date)['combined_score'].mean().reset_index()
+         daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
+         daily_sentiment['tweet_count'] = df.groupby(df['date'].dt.date).size().values
+
+         if len(daily_sentiment) < 8:
+             st.warning(f"Not enough {platform} data for '{keyword}' to predict 30 days.")
+             fig, ax = plt.subplots(figsize=(10, 6))
+             ax.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment')
+             ax.set_xlabel('Date')
+             ax.set_ylabel('Sentiment Score')
+             ax.set_title(f"{platform} Historical Sentiment for '{keyword}'")
+             ax.legend()
+             st.pyplot(fig)
        else:
+             scaler = MinMaxScaler()
+             daily_sentiment['scaled_score'] = scaler.fit_transform(daily_sentiment[['combined_score']])
+
+             # LSTM Sequences
+             def create_sequences(data, seq_length):
+                 X, y = [], []
+                 for i in range(len(data) - seq_length):
+                     X.append(data[i:i + seq_length])
+                     y.append(data[i + seq_length])
+                 return np.array(X), np.array(y)
+
+             seq_length = 7
+             X, y = create_sequences(daily_sentiment['scaled_score'].values, seq_length)
+             X = X.reshape((X.shape[0], X.shape[1], 1))
+
+             # Train LSTM
+             model = Sequential([
+                 LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
+                 Dropout(0.2),
+                 LSTM(25),
+                 Dropout(0.2),
+                 Dense(1, activation='sigmoid')
+             ])
+             model.compile(optimizer='adam', loss='mse')
+             model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, verbose=0)
+
+             # Predict 30 Days
+             last_sequence = daily_sentiment['scaled_score'][-seq_length:].values.reshape((1, seq_length, 1))
+             predictions = []
+             for _ in range(30):
+                 pred = model.predict(last_sequence, verbose=0)
+                 predictions.append(pred[0][0])
+                 last_sequence = np.roll(last_sequence, -1)
+                 last_sequence[0, -1, 0] = pred[0][0]
+             predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
+
+             # Logistic Regression
+             X_lr = np.column_stack((daily_sentiment['scaled_score'], daily_sentiment['tweet_count']))
+             y_lr = (daily_sentiment['combined_score'] > 0.5).astype(int)
+             lr_model = LogisticRegression()
+             lr_model.fit(X_lr, y_lr)
+
+             future_dates = [daily_sentiment['date'].iloc[-1] + timedelta(days=i) for i in range(1, 31)]
+             X_future = np.column_stack((predictions, [daily_sentiment['tweet_count'].mean()] * 30))
+             lr_predictions = lr_model.predict_proba(X_future)[:, 1]
+
+             # SHAP Explainability
+             st.subheader(f"{platform} SHAP Explainability")
+             explainer_lr = shap.LinearExplainer(lr_model, X_lr)
+             shap_values_lr = explainer_lr.shap_values(X_lr)
+             fig_lr, ax = plt.subplots()
+             shap.summary_plot(shap_values_lr, X_lr, feature_names=['Sentiment Score', 'Count'], show=False)
+             st.pyplot(fig_lr)
+
+             def lstm_predict(inputs):
+                 inputs = inputs.reshape((inputs.shape[0], seq_length, 1))
+                 return model.predict(inputs, verbose=0)
+
+             # KernelExplainer expects 2-D background data, so pass the sequences flattened to (n, seq_length)
+             explainer_lstm = shap.KernelExplainer(lstm_predict, X[:50].reshape(-1, seq_length))
+             shap_values_lstm = explainer_lstm.shap_values(X[:50].reshape(-1, seq_length), nsamples=100)
+             fig_lstm, ax = plt.subplots()
+             shap.summary_plot(shap_values_lstm, X[:50].reshape(-1, seq_length), plot_type="bar", show=False)
+             st.pyplot(fig_lstm)
+
+             # Visualization
+             st.subheader(f"{platform} 30-Day Sentiment Prediction")
+             results_df = pd.DataFrame({
+                 'Date': future_dates,
+                 'Predicted Sentiment': predictions,
+                 'Positive Probability': lr_predictions
+             })
+             fig, ax1 = plt.subplots(figsize=(10, 6))
+             ax1.plot(daily_sentiment['date'], daily_sentiment['combined_score'], 'g-', label='Historical Sentiment')
+             ax1.plot(results_df['Date'], results_df['Predicted Sentiment'], 'b-', label='Predicted Sentiment')
+             ax1.set_xlabel('Date')
+             ax1.set_ylabel('Sentiment Score', color='b')
+             ax2 = ax1.twinx()
+             ax2.plot(results_df['Date'], results_df['Positive Probability'], 'r-', label='Positive Probability')
+             ax2.set_ylabel('Positive Probability', color='r')
+             fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
+             plt.title(f"{platform} Sentiment Forecast for '{keyword}'")
+             st.pyplot(fig)
+
+ # Sidebar Instructions
+ st.sidebar.write("1. Ensure 'twitter_dataset.csv' is in the folder.")
+ st.sidebar.write("2. Enter a keyword to analyze the live Reddit/YouTube feeds and the Twitter dataset.")
+ st.sidebar.write("3. Run: `streamlit run app.py`")