SentimentAnalyzerFinbert

Sleeping

App Files Community

Soundaryasos committed on Apr 14, 2025

Commit

73213d8

verified ·

1 Parent(s): 20d33b1

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -62

app.py CHANGED Viewed

@@ -19,7 +19,6 @@ import numpy as np
 from sklearn.linear_model import Ridge
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import make_pipeline
-from sklearn.model_selection import train_test_split
 # --------------------------
 # Initial Setup
@@ -41,7 +40,6 @@ def load_models():
     progress = st.progress(0, text="Loading sentiment models...")
     try:
-        # Initialize sentiment models
         with st.spinner("Loading BERT model..."):
             bert_sentiment = pipeline(
                 "sentiment-analysis",
@@ -78,7 +76,7 @@ def setup_api_clients():
         return None, None
 # --------------------------
-# Core Functions (Optimized)
 # --------------------------
 def analyze_text(text, models):
@@ -86,17 +84,23 @@ def analyze_text(text, models):
     bert_sentiment, vader_analyzer = models
     # Truncate very long texts to improve performance
-    truncated_text = text[:2000]  # Process first 2000 chars only
     try:
-        # Parallel processing would be better here, but keeping it simple
         vader_score = vader_analyzer.polarity_scores(truncated_text)['compound']
         textblob_score = TextBlob(truncated_text).sentiment.polarity
-        # Batch BERT processing for better performance
-        bert_result = bert_sentiment(truncated_text[:512])[0]  # BERT has 512 token limit
-        # Convert BERT label to numerical score
         label_map = {
             '1 star': -1,
             '2 stars': -0.5,
@@ -172,18 +176,33 @@ def fetch_youtube_data(keyword, limit=30):
         return pd.DataFrame()
 # --------------------------
-# Prediction Functions
 # --------------------------
 def prepare_data_for_prediction(data):
-    """Prepare time series data for prediction"""
     try:
         # Ensure data is sorted by date
         data = data.sort_values('date')
         # Create daily aggregates
         daily_data = data.groupby(pd.Grouper(key='date', freq='D'))['average'].mean().reset_index()
         # Create numerical features (days since first date)
         daily_data['days'] = (daily_data['date'] - daily_data['date'].min()).dt.days
@@ -193,17 +212,27 @@ def prepare_data_for_prediction(data):
         return None
 def train_sentiment_model(data):
-    """Train Ridge regression model for sentiment prediction"""
     try:
         if len(data) < 5:
-            st.warning("Not enough data points for reliable prediction (minimum 5 days required)")
             return None, None
-        # Split data into features (days) and target (sentiment)
         X = data['days'].values.reshape(-1, 1)
         y = data['average'].values
-        # Create polynomial features (degree=2 for simple curves)
         model = make_pipeline(
             PolynomialFeatures(degree=2),
             Ridge(alpha=1.0)
@@ -219,9 +248,10 @@ def train_sentiment_model(data):
 def predict_future_sentiment(model, training_data, days_to_predict=15):
     """Predict future sentiment using trained model"""
     try:
-        if model is None:
             return None
         # Create future dates
         last_date = training_data['date'].max()
         future_dates = [last_date + timedelta(days=i) for i in range(1, days_to_predict+1)]
@@ -250,31 +280,13 @@ def predict_future_sentiment(model, training_data, days_to_predict=15):
         st.error(f"Prediction error: {str(e)}")
         return None
-# --------------------------
-# Visualization Functions
-# --------------------------
-def generate_wordcloud(text):
-    """Fast word cloud generation"""
-    try:
-        wordcloud = WordCloud(
-            width=800,
-            height=400,
-            background_color='white',
-            collocations=False,  # Faster processing
-            stopwords=nltk.corpus.stopwords.words('english')
-        ).generate(text)
-        img = BytesIO()
-        wordcloud.to_image().save(img, format='PNG')
-        return base64.b64encode(img.getvalue()).decode()
-    except Exception as e:
-        st.error(f"Word cloud error: {str(e)}")
-        return ""
 def plot_sentiment(data, keyword):
-    """Optimized plotting function"""
     try:
         # Separate actual and predicted data
         actual_data = data[data['type'] == 'actual']
         pred_data = data[data['type'] == 'prediction']
@@ -282,13 +294,14 @@ def plot_sentiment(data, keyword):
         fig = go.Figure()
         # Add actual data
-        fig.add_trace(go.Scatter(
-            x=actual_data['date'],
-            y=actual_data['average'],
-            name='Actual Sentiment',
-            mode='lines+markers',
-            line=dict(color='#636EFA')
-        ))
         # Add predicted data if available
         if not pred_data.empty:
@@ -300,7 +313,7 @@ def plot_sentiment(data, keyword):
                 line=dict(color='#EF553B', dash='dot')
             ))
-            # Add confidence interval (simple version)
             fig.add_trace(go.Scatter(
                 x=pred_data['date'],
                 y=pred_data['average'] + 0.1,
@@ -388,7 +401,6 @@ def main():
                 st.success(f"Analysis completed in {processing_time:.2f} seconds")
-                # Display results
                 cols = st.columns(3)
                 cols[0].metric("VADER Score", f"{result['vader']:.2f}",
                              "Positive" if result['vader'] > 0 else "Negative" if result['vader'] < 0 else "Neutral")
@@ -396,7 +408,6 @@ def main():
                 cols[2].metric("TextBlob Score", f"{result['textblob']:.2f}",
                              "Positive" if result['textblob'] > 0 else "Negative" if result['textblob'] < 0 else "Neutral")
-                # Word cloud
                 st.subheader("📊 Text Visualization")
                 wordcloud_img = f'data:image/png;base64,{generate_wordcloud(user_input)}'
                 st.image(wordcloud_img, use_column_width=True)
@@ -409,7 +420,6 @@ def main():
             with st.spinner(f"Gathering data for '{keyword}'..."):
                 start_time = time.time()
-                # Parallel fetching would be better here
                 reddit_data = fetch_reddit_data(keyword)
                 youtube_data = fetch_youtube_data(keyword)
@@ -419,6 +429,9 @@ def main():
                 combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
                 # Analyze in batches
                 analysis_results = []
                 for _, row in combined_data.iterrows():
@@ -428,12 +441,14 @@ def main():
                 combined_data['vader'] = [r['vader'] for r in analysis_results]
                 combined_data['bert'] = [r['bert'] for r in analysis_results]
                 combined_data['textblob'] = [r['textblob'] for r in analysis_results]
                 combined_data['average'] = combined_data[['vader', 'bert', 'textblob']].mean(axis=1)
                 processing_time = time.time() - start_time
                 st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
-                # Display summary
                 st.subheader(f"📈 Overall Sentiment for '{keyword}'")
                 cols = st.columns(3)
@@ -446,7 +461,6 @@ def main():
                 cols[1].metric("Positive Content", f"{pos_pct:.1f}%")
                 cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
-                # Word cloud
                 st.subheader("📊 Content Visualization")
                 all_text = " ".join(combined_data['text'])
                 wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
@@ -454,31 +468,29 @@ def main():
                 # Filter recent data
                 combined_data['date'] = pd.to_datetime(combined_data['date'])
-                recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))]  # Increased to 60 days for better prediction
                 if not recent_data.empty:
-                    # Sentiment trends
                     st.subheader("📅 Sentiment Over Time")
-                    # Prepare data for prediction if enabled
-                    if enable_prediction and len(recent_data) >= 5:
                         with st.spinner("Training prediction model..."):
                             daily_data = prepare_data_for_prediction(recent_data)
                             model, training_data = train_sentiment_model(daily_data)
-                            if model is not None:
                                 full_data = predict_future_sentiment(model, training_data)
                                 fig = plot_sentiment(full_data, keyword)
                             else:
-                                fig = plot_sentiment(training_data, keyword)
                     else:
                         daily_data = prepare_data_for_prediction(recent_data)
-                        fig = plot_sentiment(daily_data.assign(type='actual'), keyword)
                     if fig:
                         st.plotly_chart(fig, use_container_width=True)
-                    # Show prediction insights
                     if enable_prediction and 'full_data' in locals() and full_data is not None:
                         last_actual = full_data[full_data['type'] == 'actual']['average'].iloc[-1]
                         last_pred = full_data[full_data['type'] == 'prediction']['average'].iloc[-1]
@@ -490,7 +502,6 @@ def main():
                         else:
                             st.info("📊 Prediction: Sentiment is expected to remain stable in the next 15 days")
-                    # Show details if enabled
                     if show_details:
                         st.subheader("🔍 Detailed Results")
                         st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
@@ -498,7 +509,6 @@ def main():
                     st.info("No recent data found (within last 60 days).")
 if __name__ == "__main__":
-    # Initialize NLTK data
     try:
         nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))
         nltk.download('punkt', quiet=True)

 from sklearn.linear_model import Ridge
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import make_pipeline
 # --------------------------
 # Initial Setup
     progress = st.progress(0, text="Loading sentiment models...")
     try:
         with st.spinner("Loading BERT model..."):
             bert_sentiment = pipeline(
                 "sentiment-analysis",
         return None, None
 # --------------------------
+# Core Functions
 # --------------------------
 def analyze_text(text, models):
     bert_sentiment, vader_analyzer = models
     # Truncate very long texts to improve performance
+    truncated_text = text[:2000] if text else ""
     try:
+        if not truncated_text.strip():
+            return {
+                'vader': 0,
+                'bert': 0,
+                'textblob': 0,
+                'bert_label': 'Neutral',
+                'bert_confidence': 0
+            }
         vader_score = vader_analyzer.polarity_scores(truncated_text)['compound']
         textblob_score = TextBlob(truncated_text).sentiment.polarity
+        bert_result = bert_sentiment(truncated_text[:512])[0]  # BERT 512 token limit
         label_map = {
             '1 star': -1,
             '2 stars': -0.5,
         return pd.DataFrame()
 # --------------------------
+# Prediction Functions (Rewritten to Fix Error)
 # --------------------------
 def prepare_data_for_prediction(data):
+    """Prepare time series data for prediction, handling NaN values"""
     try:
+        if data.empty:
+            st.warning("No data available for prediction")
+            return None
         # Ensure data is sorted by date
         data = data.sort_values('date')
+        # Filter out rows with invalid sentiment scores
+        data = data.dropna(subset=['average'])
         # Create daily aggregates
         daily_data = data.groupby(pd.Grouper(key='date', freq='D'))['average'].mean().reset_index()
+        # Remove any remaining NaN values from aggregation
+        daily_data = daily_data.dropna(subset=['average'])
+        # Check if enough data points remain
+        if len(daily_data) < 5:
+            st.warning("Insufficient valid data points for prediction (minimum 5 required)")
+            return None
         # Create numerical features (days since first date)
         daily_data['days'] = (daily_data['date'] - daily_data['date'].min()).dt.days
         return None
 def train_sentiment_model(data):
+    """Train Ridge regression model, ensuring valid input"""
     try:
+        if data is None:
+            st.warning("No valid data for model training")
+            return None, None
+        # Verify sufficient data points
         if len(data) < 5:
+            st.warning("Not enough data points for reliable prediction (minimum 5 required)")
             return None, None
+        # Extract features and target
         X = data['days'].values.reshape(-1, 1)
         y = data['average'].values
+        # Check for NaN values
+        if np.any(np.isnan(X)) or np.any(np.isnan(y)):
+            st.warning("Invalid values detected in data. Skipping prediction.")
+            return None, None
+        # Train polynomial Ridge regression
         model = make_pipeline(
             PolynomialFeatures(degree=2),
             Ridge(alpha=1.0)
 def predict_future_sentiment(model, training_data, days_to_predict=15):
     """Predict future sentiment using trained model"""
     try:
+        if model is None or training_data is None:
+            st.warning("No valid model or data for prediction")
             return None
         # Create future dates
         last_date = training_data['date'].max()
         future_dates = [last_date + timedelta(days=i) for i in range(1, days_to_predict+1)]
         st.error(f"Prediction error: {str(e)}")
         return None
 def plot_sentiment(data, keyword):
+    """Plot sentiment trends, handling missing data"""
     try:
+        if data is None or data.empty:
+            st.warning("No data available for plotting sentiment trends")
+            return None
         # Separate actual and predicted data
         actual_data = data[data['type'] == 'actual']
         pred_data = data[data['type'] == 'prediction']
         fig = go.Figure()
         # Add actual data
+        if not actual_data.empty:
+            fig.add_trace(go.Scatter(
+                x=actual_data['date'],
+                y=actual_data['average'],
+                name='Actual Sentiment',
+                mode='lines+markers',
+                line=dict(color='#636EFA')
+            ))
         # Add predicted data if available
         if not pred_data.empty:
                 line=dict(color='#EF553B', dash='dot')
             ))
+            # Add confidence interval
             fig.add_trace(go.Scatter(
                 x=pred_data['date'],
                 y=pred_data['average'] + 0.1,
                 st.success(f"Analysis completed in {processing_time:.2f} seconds")
                 cols = st.columns(3)
                 cols[0].metric("VADER Score", f"{result['vader']:.2f}",
                              "Positive" if result['vader'] > 0 else "Negative" if result['vader'] < 0 else "Neutral")
                 cols[2].metric("TextBlob Score", f"{result['textblob']:.2f}",
                              "Positive" if result['textblob'] > 0 else "Negative" if result['textblob'] < 0 else "Neutral")
                 st.subheader("📊 Text Visualization")
                 wordcloud_img = f'data:image/png;base64,{generate_wordcloud(user_input)}'
                 st.image(wordcloud_img, use_column_width=True)
             with st.spinner(f"Gathering data for '{keyword}'..."):
                 start_time = time.time()
                 reddit_data = fetch_reddit_data(keyword)
                 youtube_data = fetch_youtube_data(keyword)
                 combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
+                # Filter out empty or invalid texts
+                combined_data = combined_data[combined_data['text'].str.strip() != '']
                 # Analyze in batches
                 analysis_results = []
                 for _, row in combined_data.iterrows():
                 combined_data['vader'] = [r['vader'] for r in analysis_results]
                 combined_data['bert'] = [r['bert'] for r in analysis_results]
                 combined_data['textblob'] = [r['textblob'] for r in analysis_results]
+                # Ensure no NaN values in sentiment scores
+                combined_data = combined_data.dropna(subset=['vader', 'bert', 'textblob'])
                 combined_data['average'] = combined_data[['vader', 'bert', 'textblob']].mean(axis=1)
                 processing_time = time.time() - start_time
                 st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
                 st.subheader(f"📈 Overall Sentiment for '{keyword}'")
                 cols = st.columns(3)
                 cols[1].metric("Positive Content", f"{pos_pct:.1f}%")
                 cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
                 st.subheader("📊 Content Visualization")
                 all_text = " ".join(combined_data['text'])
                 wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
                 # Filter recent data
                 combined_data['date'] = pd.to_datetime(combined_data['date'])
+                recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))]
                 if not recent_data.empty:
                     st.subheader("📅 Sentiment Over Time")
+                    if enable_prediction:
                         with st.spinner("Training prediction model..."):
                             daily_data = prepare_data_for_prediction(recent_data)
                             model, training_data = train_sentiment_model(daily_data)
+                            if model is not None and training_data is not None:
                                 full_data = predict_future_sentiment(model, training_data)
                                 fig = plot_sentiment(full_data, keyword)
                             else:
+                                daily_data = daily_data if daily_data is not None else recent_data[['date', 'average']].assign(type='actual')
+                                fig = plot_sentiment(daily_data, keyword)
                     else:
                         daily_data = prepare_data_for_prediction(recent_data)
+                        fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
                     if fig:
                         st.plotly_chart(fig, use_container_width=True)
                     if enable_prediction and 'full_data' in locals() and full_data is not None:
                         last_actual = full_data[full_data['type'] == 'actual']['average'].iloc[-1]
                         last_pred = full_data[full_data['type'] == 'prediction']['average'].iloc[-1]
                         else:
                             st.info("📊 Prediction: Sentiment is expected to remain stable in the next 15 days")
                     if show_details:
                         st.subheader("🔍 Detailed Results")
                         st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
                     st.info("No recent data found (within last 60 days).")
 if __name__ == "__main__":
     try:
         nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))
         nltk.download('punkt', quiet=True)