Soundaryasos committed on
Commit
73213d8
·
verified ·
1 Parent(s): 20d33b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -62
app.py CHANGED
@@ -19,7 +19,6 @@ import numpy as np
19
  from sklearn.linear_model import Ridge
20
  from sklearn.preprocessing import PolynomialFeatures
21
  from sklearn.pipeline import make_pipeline
22
- from sklearn.model_selection import train_test_split
23
 
24
  # --------------------------
25
  # Initial Setup
@@ -41,7 +40,6 @@ def load_models():
41
  progress = st.progress(0, text="Loading sentiment models...")
42
 
43
  try:
44
- # Initialize sentiment models
45
  with st.spinner("Loading BERT model..."):
46
  bert_sentiment = pipeline(
47
  "sentiment-analysis",
@@ -78,7 +76,7 @@ def setup_api_clients():
78
  return None, None
79
 
80
  # --------------------------
81
- # Core Functions (Optimized)
82
  # --------------------------
83
 
84
  def analyze_text(text, models):
@@ -86,17 +84,23 @@ def analyze_text(text, models):
86
  bert_sentiment, vader_analyzer = models
87
 
88
  # Truncate very long texts to improve performance
89
- truncated_text = text[:2000] # Process first 2000 chars only
90
 
91
  try:
92
- # Parallel processing would be better here, but keeping it simple
 
 
 
 
 
 
 
 
93
  vader_score = vader_analyzer.polarity_scores(truncated_text)['compound']
94
  textblob_score = TextBlob(truncated_text).sentiment.polarity
95
 
96
- # Batch BERT processing for better performance
97
- bert_result = bert_sentiment(truncated_text[:512])[0] # BERT has 512 token limit
98
 
99
- # Convert BERT label to numerical score
100
  label_map = {
101
  '1 star': -1,
102
  '2 stars': -0.5,
@@ -172,18 +176,33 @@ def fetch_youtube_data(keyword, limit=30):
172
  return pd.DataFrame()
173
 
174
  # --------------------------
175
- # Prediction Functions
176
  # --------------------------
177
 
178
  def prepare_data_for_prediction(data):
179
- """Prepare time series data for prediction"""
180
  try:
 
 
 
 
181
  # Ensure data is sorted by date
182
  data = data.sort_values('date')
183
 
 
 
 
184
  # Create daily aggregates
185
  daily_data = data.groupby(pd.Grouper(key='date', freq='D'))['average'].mean().reset_index()
186
 
 
 
 
 
 
 
 
 
187
  # Create numerical features (days since first date)
188
  daily_data['days'] = (daily_data['date'] - daily_data['date'].min()).dt.days
189
 
@@ -193,17 +212,27 @@ def prepare_data_for_prediction(data):
193
  return None
194
 
195
  def train_sentiment_model(data):
196
- """Train Ridge regression model for sentiment prediction"""
197
  try:
 
 
 
 
 
198
  if len(data) < 5:
199
- st.warning("Not enough data points for reliable prediction (minimum 5 days required)")
200
  return None, None
201
 
202
- # Split data into features (days) and target (sentiment)
203
  X = data['days'].values.reshape(-1, 1)
204
  y = data['average'].values
205
 
206
- # Create polynomial features (degree=2 for simple curves)
 
 
 
 
 
207
  model = make_pipeline(
208
  PolynomialFeatures(degree=2),
209
  Ridge(alpha=1.0)
@@ -219,9 +248,10 @@ def train_sentiment_model(data):
219
  def predict_future_sentiment(model, training_data, days_to_predict=15):
220
  """Predict future sentiment using trained model"""
221
  try:
222
- if model is None:
 
223
  return None
224
-
225
  # Create future dates
226
  last_date = training_data['date'].max()
227
  future_dates = [last_date + timedelta(days=i) for i in range(1, days_to_predict+1)]
@@ -250,31 +280,13 @@ def predict_future_sentiment(model, training_data, days_to_predict=15):
250
  st.error(f"Prediction error: {str(e)}")
251
  return None
252
 
253
- # --------------------------
254
- # Visualization Functions
255
- # --------------------------
256
-
257
- def generate_wordcloud(text):
258
- """Fast word cloud generation"""
259
- try:
260
- wordcloud = WordCloud(
261
- width=800,
262
- height=400,
263
- background_color='white',
264
- collocations=False, # Faster processing
265
- stopwords=nltk.corpus.stopwords.words('english')
266
- ).generate(text)
267
-
268
- img = BytesIO()
269
- wordcloud.to_image().save(img, format='PNG')
270
- return base64.b64encode(img.getvalue()).decode()
271
- except Exception as e:
272
- st.error(f"Word cloud error: {str(e)}")
273
- return ""
274
-
275
  def plot_sentiment(data, keyword):
276
- """Optimized plotting function"""
277
  try:
 
 
 
 
278
  # Separate actual and predicted data
279
  actual_data = data[data['type'] == 'actual']
280
  pred_data = data[data['type'] == 'prediction']
@@ -282,13 +294,14 @@ def plot_sentiment(data, keyword):
282
  fig = go.Figure()
283
 
284
  # Add actual data
285
- fig.add_trace(go.Scatter(
286
- x=actual_data['date'],
287
- y=actual_data['average'],
288
- name='Actual Sentiment',
289
- mode='lines+markers',
290
- line=dict(color='#636EFA')
291
- ))
 
292
 
293
  # Add predicted data if available
294
  if not pred_data.empty:
@@ -300,7 +313,7 @@ def plot_sentiment(data, keyword):
300
  line=dict(color='#EF553B', dash='dot')
301
  ))
302
 
303
- # Add confidence interval (simple version)
304
  fig.add_trace(go.Scatter(
305
  x=pred_data['date'],
306
  y=pred_data['average'] + 0.1,
@@ -388,7 +401,6 @@ def main():
388
 
389
  st.success(f"Analysis completed in {processing_time:.2f} seconds")
390
 
391
- # Display results
392
  cols = st.columns(3)
393
  cols[0].metric("VADER Score", f"{result['vader']:.2f}",
394
  "Positive" if result['vader'] > 0 else "Negative" if result['vader'] < 0 else "Neutral")
@@ -396,7 +408,6 @@ def main():
396
  cols[2].metric("TextBlob Score", f"{result['textblob']:.2f}",
397
  "Positive" if result['textblob'] > 0 else "Negative" if result['textblob'] < 0 else "Neutral")
398
 
399
- # Word cloud
400
  st.subheader("πŸ“Š Text Visualization")
401
  wordcloud_img = f'data:image/png;base64,{generate_wordcloud(user_input)}'
402
  st.image(wordcloud_img, use_column_width=True)
@@ -409,7 +420,6 @@ def main():
409
  with st.spinner(f"Gathering data for '{keyword}'..."):
410
  start_time = time.time()
411
 
412
- # Parallel fetching would be better here
413
  reddit_data = fetch_reddit_data(keyword)
414
  youtube_data = fetch_youtube_data(keyword)
415
 
@@ -419,6 +429,9 @@ def main():
419
 
420
  combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
421
 
 
 
 
422
  # Analyze in batches
423
  analysis_results = []
424
  for _, row in combined_data.iterrows():
@@ -428,12 +441,14 @@ def main():
428
  combined_data['vader'] = [r['vader'] for r in analysis_results]
429
  combined_data['bert'] = [r['bert'] for r in analysis_results]
430
  combined_data['textblob'] = [r['textblob'] for r in analysis_results]
 
 
 
431
  combined_data['average'] = combined_data[['vader', 'bert', 'textblob']].mean(axis=1)
432
 
433
  processing_time = time.time() - start_time
434
  st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
435
 
436
- # Display summary
437
  st.subheader(f"πŸ“ˆ Overall Sentiment for '{keyword}'")
438
 
439
  cols = st.columns(3)
@@ -446,7 +461,6 @@ def main():
446
  cols[1].metric("Positive Content", f"{pos_pct:.1f}%")
447
  cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
448
 
449
- # Word cloud
450
  st.subheader("πŸ“Š Content Visualization")
451
  all_text = " ".join(combined_data['text'])
452
  wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
@@ -454,31 +468,29 @@ def main():
454
 
455
  # Filter recent data
456
  combined_data['date'] = pd.to_datetime(combined_data['date'])
457
- recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))] # Increased to 60 days for better prediction
458
 
459
  if not recent_data.empty:
460
- # Sentiment trends
461
  st.subheader("πŸ“… Sentiment Over Time")
462
 
463
- # Prepare data for prediction if enabled
464
- if enable_prediction and len(recent_data) >= 5:
465
  with st.spinner("Training prediction model..."):
466
  daily_data = prepare_data_for_prediction(recent_data)
467
  model, training_data = train_sentiment_model(daily_data)
468
 
469
- if model is not None:
470
  full_data = predict_future_sentiment(model, training_data)
471
  fig = plot_sentiment(full_data, keyword)
472
  else:
473
- fig = plot_sentiment(training_data, keyword)
 
474
  else:
475
  daily_data = prepare_data_for_prediction(recent_data)
476
- fig = plot_sentiment(daily_data.assign(type='actual'), keyword)
477
 
478
  if fig:
479
  st.plotly_chart(fig, use_container_width=True)
480
 
481
- # Show prediction insights
482
  if enable_prediction and 'full_data' in locals() and full_data is not None:
483
  last_actual = full_data[full_data['type'] == 'actual']['average'].iloc[-1]
484
  last_pred = full_data[full_data['type'] == 'prediction']['average'].iloc[-1]
@@ -490,7 +502,6 @@ def main():
490
  else:
491
  st.info("πŸ“Š Prediction: Sentiment is expected to remain stable in the next 15 days")
492
 
493
- # Show details if enabled
494
  if show_details:
495
  st.subheader("πŸ” Detailed Results")
496
  st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
@@ -498,7 +509,6 @@ def main():
498
  st.info("No recent data found (within last 60 days).")
499
 
500
  if __name__ == "__main__":
501
- # Initialize NLTK data
502
  try:
503
  nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))
504
  nltk.download('punkt', quiet=True)
 
19
  from sklearn.linear_model import Ridge
20
  from sklearn.preprocessing import PolynomialFeatures
21
  from sklearn.pipeline import make_pipeline
 
22
 
23
  # --------------------------
24
  # Initial Setup
 
40
  progress = st.progress(0, text="Loading sentiment models...")
41
 
42
  try:
 
43
  with st.spinner("Loading BERT model..."):
44
  bert_sentiment = pipeline(
45
  "sentiment-analysis",
 
76
  return None, None
77
 
78
  # --------------------------
79
+ # Core Functions
80
  # --------------------------
81
 
82
  def analyze_text(text, models):
 
84
  bert_sentiment, vader_analyzer = models
85
 
86
  # Truncate very long texts to improve performance
87
+ truncated_text = text[:2000] if text else ""
88
 
89
  try:
90
+ if not truncated_text.strip():
91
+ return {
92
+ 'vader': 0,
93
+ 'bert': 0,
94
+ 'textblob': 0,
95
+ 'bert_label': 'Neutral',
96
+ 'bert_confidence': 0
97
+ }
98
+
99
  vader_score = vader_analyzer.polarity_scores(truncated_text)['compound']
100
  textblob_score = TextBlob(truncated_text).sentiment.polarity
101
 
102
+ bert_result = bert_sentiment(truncated_text[:512])[0] # BERT 512 token limit
 
103
 
 
104
  label_map = {
105
  '1 star': -1,
106
  '2 stars': -0.5,
 
176
  return pd.DataFrame()
177
 
178
  # --------------------------
179
+ # Prediction Functions (Rewritten to Fix Error)
180
  # --------------------------
181
 
182
  def prepare_data_for_prediction(data):
183
+ """Prepare time series data for prediction, handling NaN values"""
184
  try:
185
+ if data.empty:
186
+ st.warning("No data available for prediction")
187
+ return None
188
+
189
  # Ensure data is sorted by date
190
  data = data.sort_values('date')
191
 
192
+ # Filter out rows with invalid sentiment scores
193
+ data = data.dropna(subset=['average'])
194
+
195
  # Create daily aggregates
196
  daily_data = data.groupby(pd.Grouper(key='date', freq='D'))['average'].mean().reset_index()
197
 
198
+ # Remove any remaining NaN values from aggregation
199
+ daily_data = daily_data.dropna(subset=['average'])
200
+
201
+ # Check if enough data points remain
202
+ if len(daily_data) < 5:
203
+ st.warning("Insufficient valid data points for prediction (minimum 5 required)")
204
+ return None
205
+
206
  # Create numerical features (days since first date)
207
  daily_data['days'] = (daily_data['date'] - daily_data['date'].min()).dt.days
208
 
 
212
  return None
213
 
214
  def train_sentiment_model(data):
215
+ """Train Ridge regression model, ensuring valid input"""
216
  try:
217
+ if data is None:
218
+ st.warning("No valid data for model training")
219
+ return None, None
220
+
221
+ # Verify sufficient data points
222
  if len(data) < 5:
223
+ st.warning("Not enough data points for reliable prediction (minimum 5 required)")
224
  return None, None
225
 
226
+ # Extract features and target
227
  X = data['days'].values.reshape(-1, 1)
228
  y = data['average'].values
229
 
230
+ # Check for NaN values
231
+ if np.any(np.isnan(X)) or np.any(np.isnan(y)):
232
+ st.warning("Invalid values detected in data. Skipping prediction.")
233
+ return None, None
234
+
235
+ # Train polynomial Ridge regression
236
  model = make_pipeline(
237
  PolynomialFeatures(degree=2),
238
  Ridge(alpha=1.0)
 
248
  def predict_future_sentiment(model, training_data, days_to_predict=15):
249
  """Predict future sentiment using trained model"""
250
  try:
251
+ if model is None or training_data is None:
252
+ st.warning("No valid model or data for prediction")
253
  return None
254
+
255
  # Create future dates
256
  last_date = training_data['date'].max()
257
  future_dates = [last_date + timedelta(days=i) for i in range(1, days_to_predict+1)]
 
280
  st.error(f"Prediction error: {str(e)}")
281
  return None
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  def plot_sentiment(data, keyword):
284
+ """Plot sentiment trends, handling missing data"""
285
  try:
286
+ if data is None or data.empty:
287
+ st.warning("No data available for plotting sentiment trends")
288
+ return None
289
+
290
  # Separate actual and predicted data
291
  actual_data = data[data['type'] == 'actual']
292
  pred_data = data[data['type'] == 'prediction']
 
294
  fig = go.Figure()
295
 
296
  # Add actual data
297
+ if not actual_data.empty:
298
+ fig.add_trace(go.Scatter(
299
+ x=actual_data['date'],
300
+ y=actual_data['average'],
301
+ name='Actual Sentiment',
302
+ mode='lines+markers',
303
+ line=dict(color='#636EFA')
304
+ ))
305
 
306
  # Add predicted data if available
307
  if not pred_data.empty:
 
313
  line=dict(color='#EF553B', dash='dot')
314
  ))
315
 
316
+ # Add confidence interval
317
  fig.add_trace(go.Scatter(
318
  x=pred_data['date'],
319
  y=pred_data['average'] + 0.1,
 
401
 
402
  st.success(f"Analysis completed in {processing_time:.2f} seconds")
403
 
 
404
  cols = st.columns(3)
405
  cols[0].metric("VADER Score", f"{result['vader']:.2f}",
406
  "Positive" if result['vader'] > 0 else "Negative" if result['vader'] < 0 else "Neutral")
 
408
  cols[2].metric("TextBlob Score", f"{result['textblob']:.2f}",
409
  "Positive" if result['textblob'] > 0 else "Negative" if result['textblob'] < 0 else "Neutral")
410
 
 
411
  st.subheader("πŸ“Š Text Visualization")
412
  wordcloud_img = f'data:image/png;base64,{generate_wordcloud(user_input)}'
413
  st.image(wordcloud_img, use_column_width=True)
 
420
  with st.spinner(f"Gathering data for '{keyword}'..."):
421
  start_time = time.time()
422
 
 
423
  reddit_data = fetch_reddit_data(keyword)
424
  youtube_data = fetch_youtube_data(keyword)
425
 
 
429
 
430
  combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
431
 
432
+ # Filter out empty or invalid texts
433
+ combined_data = combined_data[combined_data['text'].str.strip() != '']
434
+
435
  # Analyze in batches
436
  analysis_results = []
437
  for _, row in combined_data.iterrows():
 
441
  combined_data['vader'] = [r['vader'] for r in analysis_results]
442
  combined_data['bert'] = [r['bert'] for r in analysis_results]
443
  combined_data['textblob'] = [r['textblob'] for r in analysis_results]
444
+
445
+ # Ensure no NaN values in sentiment scores
446
+ combined_data = combined_data.dropna(subset=['vader', 'bert', 'textblob'])
447
  combined_data['average'] = combined_data[['vader', 'bert', 'textblob']].mean(axis=1)
448
 
449
  processing_time = time.time() - start_time
450
  st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
451
 
 
452
  st.subheader(f"πŸ“ˆ Overall Sentiment for '{keyword}'")
453
 
454
  cols = st.columns(3)
 
461
  cols[1].metric("Positive Content", f"{pos_pct:.1f}%")
462
  cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
463
 
 
464
  st.subheader("πŸ“Š Content Visualization")
465
  all_text = " ".join(combined_data['text'])
466
  wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
 
468
 
469
  # Filter recent data
470
  combined_data['date'] = pd.to_datetime(combined_data['date'])
471
+ recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))]
472
 
473
  if not recent_data.empty:
 
474
  st.subheader("πŸ“… Sentiment Over Time")
475
 
476
+ if enable_prediction:
 
477
  with st.spinner("Training prediction model..."):
478
  daily_data = prepare_data_for_prediction(recent_data)
479
  model, training_data = train_sentiment_model(daily_data)
480
 
481
+ if model is not None and training_data is not None:
482
  full_data = predict_future_sentiment(model, training_data)
483
  fig = plot_sentiment(full_data, keyword)
484
  else:
485
+ daily_data = daily_data if daily_data is not None else recent_data[['date', 'average']].assign(type='actual')
486
+ fig = plot_sentiment(daily_data, keyword)
487
  else:
488
  daily_data = prepare_data_for_prediction(recent_data)
489
+ fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
490
 
491
  if fig:
492
  st.plotly_chart(fig, use_container_width=True)
493
 
 
494
  if enable_prediction and 'full_data' in locals() and full_data is not None:
495
  last_actual = full_data[full_data['type'] == 'actual']['average'].iloc[-1]
496
  last_pred = full_data[full_data['type'] == 'prediction']['average'].iloc[-1]
 
502
  else:
503
  st.info("πŸ“Š Prediction: Sentiment is expected to remain stable in the next 15 days")
504
 
 
505
  if show_details:
506
  st.subheader("πŸ” Detailed Results")
507
  st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
 
509
  st.info("No recent data found (within last 60 days).")
510
 
511
  if __name__ == "__main__":
 
512
  try:
513
  nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))
514
  nltk.download('punkt', quiet=True)