Soundaryasos committed
Commit a2129f9 · verified · 1 Parent(s): ddc40e6

Update app.py

Files changed (1)
app.py +231 -453
app.py CHANGED
@@ -1,14 +1,10 @@
  import streamlit as st
  from transformers import pipeline
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
- import numpy as np
  import pandas as pd
  from datetime import datetime, timedelta
  import plotly.express as px
- from sklearn.linear_model import Ridge
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_absolute_error
  from wordcloud import WordCloud
  import base64
  from io import BytesIO
@@ -17,569 +13,351 @@ from textblob import TextBlob
  import praw
  from googleapiclient.discovery import build
  import os
- from statsmodels.tsa.arima.model import ARIMA
- from prophet import Prophet

  # --------------------------
- # Initial Setup & Configuration
  # --------------------------

- # Set page config
  st.set_page_config(
-     page_title="🌟 SentimentSync: Live Sentiment Analysis & Prediction Dashboard",
-     page_icon="📊",
      layout="wide"
  )

  # --------------------------
- # NLTK Data Download
- # --------------------------
-
- def download_nltk_data():
-     try:
-         nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
-         if not os.path.exists(nltk_data_dir):
-             os.makedirs(nltk_data_dir)
-
-         nltk.download('punkt', download_dir=nltk_data_dir)
-         nltk.download('stopwords', download_dir=nltk_data_dir)
-         nltk.download('punkt_tab', download_dir=nltk_data_dir)
-         nltk.data.path.append(nltk_data_dir)
-     except Exception as e:
-         st.error(f"Error downloading NLTK data: {str(e)}")
-         return False
-     return True
-
- if not download_nltk_data():
-     st.warning("Some NLTK features may not work properly without the required data files.")
-
- # --------------------------
- # Model Initialization
  # --------------------------

  @st.cache_resource
  def load_models():
      try:
          # Initialize sentiment models
-         bert_sentiment = pipeline(
-             "sentiment-analysis",
-             model="nlptown/bert-base-multilingual-uncased-sentiment"
-         )
-         vader_analyzer = SentimentIntensityAnalyzer()
          return bert_sentiment, vader_analyzer
      except Exception as e:
-         st.error(f"Error loading models: {str(e)}")
          return None, None

- bert_sentiment, vader_analyzer = load_models()
-
- if bert_sentiment is None or vader_analyzer is None:
-     st.stop()
-
- # --------------------------
- # API Clients Setup
- # --------------------------
-
  @st.cache_resource
  def setup_api_clients():
      try:
-         # Reddit API setup
-         reddit = praw.Reddit(
-             client_id="S7pTXhj5JDFGDb3-_zrJEA",
-             client_secret="QP3NYN4lrAKVLrBamzLGrpFywiVg8w",
-             user_agent="SoundaryaR_Bot/1.0"
-         )

-         youtube = build('youtube', 'v3', developerKey="AIzaSyDcUAkcoPvkTwN_tksmiW0dVPI5Bse7qos")

          return reddit, youtube
      except Exception as e:
-         st.error(f"Error setting up API clients: {str(e)}")
          return None, None

- reddit, youtube = setup_api_clients()
-
- if reddit is None or youtube is None:
-     st.stop()
-
  # --------------------------
- # Helper Functions
  # --------------------------

- def bert_score(result):
-     """Convert BERT label to numerical score"""
-     label_map = {
-         '1 star': -1,
-         '2 stars': -0.5,
-         '3 stars': 0,
-         '4 stars': 0.5,
-         '5 stars': 1
-     }
-     return label_map.get(result['label'], 0)
-
- def analyze_text(text):
-     """Analyze sentiment using multiple models"""
-     try:
-         vader_score = vader_analyzer.polarity_scores(text)['compound']
-         bert_result = bert_sentiment(text[:512])[0]  # Truncate to avoid token limits
-         bert_num = bert_score(bert_result)
-         textblob_score = TextBlob(text).sentiment.polarity
-         return vader_score, bert_num, textblob_score, bert_result
-     except Exception as e:
-         st.error(f"Error analyzing text: {str(e)}")
-         return 0, 0, 0, {'label': 'Error', 'score': 0}
-
- def generate_wordcloud(text):
-     """Generate word cloud image"""
-     try:
-         wordcloud = WordCloud(
-             width=800,
-             height=400,
-             background_color='white',
-             stopwords=nltk.corpus.stopwords.words('english')
-         ).generate(text)
-
-         img = BytesIO()
-         wordcloud.to_image().save(img, format='PNG')
-         return base64.b64encode(img.getvalue()).decode()
-     except Exception as e:
-         st.error(f"Error generating word cloud: {str(e)}")
-         return ""
-
- def prepare_time_series_data(df):
-     """Prepare time series data for forecasting"""
-     try:
-         # Resample to daily data
-         ts_df = df.set_index('date').resample('D').agg({
-             'Average': 'mean',
-             'VADER': 'mean',
-             'BERT': 'mean',
-             'TextBlob': 'mean'
-         }).ffill().reset_index()
-
-         # Create features
-         ts_df['day_of_week'] = ts_df['date'].dt.dayofweek
-         ts_df['day_of_month'] = ts_df['date'].dt.day
-         ts_df['days_since_start'] = (ts_df['date'] - ts_df['date'].min()).dt.days
-
-         return ts_df
-     except Exception as e:
-         st.error(f"Error preparing time series data: {str(e)}")
-         return None
-
- def predict_sentiment_prophet(df, periods=15):
-     """Predict future sentiment using Facebook Prophet"""
-     try:
-         # Prepare data for Prophet
-         prophet_df = df[['date', 'Average']].rename(columns={'date': 'ds', 'Average': 'y'})
-
-         # Initialize and fit model
-         model = Prophet(
-             daily_seasonality=True,
-             weekly_seasonality=True,
-             yearly_seasonality=False
-         )
-         model.fit(prophet_df)
-
-         # Make future dataframe
-         future = model.make_future_dataframe(periods=periods)
-
-         # Predict
-         forecast = model.predict(future)
-
-         return forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].rename(columns={
-             'ds': 'date',
-             'yhat': 'predicted_sentiment',
-             'yhat_lower': 'lower_bound',
-             'yhat_upper': 'upper_bound'
-         })
-     except Exception as e:
-         st.error(f"Error with Prophet prediction: {str(e)}")
-         return None
-
- def predict_sentiment_arima(df, periods=15):
-     """Predict future sentiment using ARIMA"""
      try:
-         # Fit ARIMA model
-         model = ARIMA(df['Average'], order=(2, 1, 2))
-         model_fit = model.fit()

-         # Make predictions
-         forecast = model_fit.forecast(steps=periods)

-         # Create future dates
-         last_date = df['date'].max()
-         future_dates = [last_date + timedelta(days=i) for i in range(1, periods+1)]

-         return pd.DataFrame({
-             'date': future_dates,
-             'predicted_sentiment': forecast,
-             'model': 'ARIMA'
-         })
      except Exception as e:
-         st.error(f"Error with ARIMA prediction: {str(e)}")
-         return None

- def predict_sentiment_rf(df, periods=15):
-     """Predict future sentiment using Random Forest"""
      try:
-         # Prepare features
-         ts_df = prepare_time_series_data(df)
-         if ts_df is None or len(ts_df) < 10:
-             return None

-         X = ts_df[['days_since_start', 'day_of_week', 'day_of_month']]
-         y = ts_df['Average']
-
-         # Train model
-         model = RandomForestRegressor(n_estimators=100, random_state=42)
-         model.fit(X, y)
-
-         # Create future features
-         last_date = ts_df['date'].max()
-         future_dates = [last_date + timedelta(days=i) for i in range(1, periods+1)]
-         future_days_since = [(d - ts_df['date'].min()).days for d in future_dates]
-
-         future_X = pd.DataFrame({
-             'days_since_start': future_days_since,
-             'day_of_week': [d.weekday() for d in future_dates],
-             'day_of_month': [d.day for d in future_dates]
-         })

-         # Make predictions
-         predictions = model.predict(future_X)

-         return pd.DataFrame({
-             'date': future_dates,
-             'predicted_sentiment': predictions,
-             'model': 'Random Forest'
-         })
      except Exception as e:
-         st.error(f"Error with Random Forest prediction: {str(e)}")
-         return None
-
- def plot_sentiment_predictions(history_df, predictions):
-     """Plot historical data and predictions"""
-     try:
-         # Prepare historical data
-         history_df = history_df.set_index('date').resample('D')['Average'].mean().reset_index()
-
-         # Create figure
-         fig = px.line(history_df, x='date', y='Average',
-                       title='Historical Sentiment & Future Predictions',
-                       labels={'Average': 'Sentiment Score'})
-
-         # Add prediction traces
-         for model_name, pred_df in predictions.items():
-             if pred_df is not None:
-                 fig.add_scatter(x=pred_df['date'], y=pred_df['predicted_sentiment'],
-                                 mode='lines', name=f'{model_name} Prediction',
-                                 line=dict(dash='dot'))
-
-                 # Add confidence interval if available
-                 if 'lower_bound' in pred_df.columns and 'upper_bound' in pred_df.columns:
-                     fig.add_trace(px.area(pred_df, x='date',
-                                           y_upper='upper_bound',
-                                           y_lower='lower_bound',
-                                           title='').data[0])
-
-         fig.update_layout(hovermode="x unified", showlegend=True)
-         return fig
-     except Exception as e:
-         st.error(f"Error plotting predictions: {str(e)}")
-         return None
-
- # --------------------------
- # Data Fetching Functions
- # --------------------------
-
- @st.cache_data(ttl=3600)  # Cache for 1 hour
- def fetch_reddit_data(keyword, limit=50):
-     """Fetch Reddit posts containing the keyword"""
-     try:
-         subreddit = reddit.subreddit("all")
-         posts = subreddit.search(keyword, limit=limit)
-
-         data = []
-         for post in posts:
-             data.append({
-                 'date': datetime.fromtimestamp(post.created_utc),
-                 'text': f"{post.title}\n{post.selftext}",
-                 'source': 'Reddit',
-                 'url': f"https://reddit.com{post.permalink}"
-             })
-         return pd.DataFrame(data)
-     except Exception as e:
-         st.error(f"Error fetching Reddit data: {str(e)}")
          return pd.DataFrame()

- @st.cache_data(ttl=3600)  # Cache for 1 hour
- def fetch_youtube_data(keyword, limit=100):
-     """Fetch YouTube videos containing the keyword"""
      try:
-         request = youtube.search().list(
              q=keyword,
              part="snippet",
              maxResults=limit,
              type="video",
              order="relevance"
-         )
-         response = request.execute()

-         data = []
-         for item in response['items']:
-             data.append({
-                 'date': datetime.strptime(item['snippet']['publishedAt'], '%Y-%m-%dT%H:%M:%SZ'),
-                 'text': f"{item['snippet']['title']}\n{item['snippet']['description']}",
-                 'source': 'YouTube',
-                 'url': f"https://youtube.com/watch?v={item['id']['videoId']}"
-             })
-         return pd.DataFrame(data)
      except Exception as e:
-         st.error(f"Error fetching YouTube data: {str(e)}")
          return pd.DataFrame()

  # --------------------------
  # Visualization Functions
  # --------------------------

- def plot_sentiment_trends(df, keyword):
-     """Plot sentiment trends over time"""
      try:
          fig = px.line(
-             df,
              x='date',
-             y=["VADER", "BERT", "TextBlob", "Average"],
-             title=f'Sentiment Over Time for "{keyword}"',
              labels={'value': 'Sentiment Score', 'date': 'Date'},
              color_discrete_map={
-                 "VADER": "#636EFA",
-                 "BERT": "#EF553B",
-                 "TextBlob": "#00CC96",
-                 "Average": "#AB63FA"
              }
          )
-         fig.update_layout(hovermode="x unified")
-         st.plotly_chart(fig, use_container_width=True)
-     except Exception as e:
-         st.error(f"Error plotting sentiment trends: {str(e)}")
-
- def plot_sentiment_distribution(df, keyword):
-     """Plot sentiment distribution"""
-     try:
-         dist_values = [
-             sum(df['Average'] > 0.1),   # Positive
-             sum(df['Average'] < -0.1),  # Negative
-             sum((df['Average'] >= -0.1) & (df['Average'] <= 0.1))  # Neutral
-         ]
-
-         fig = px.pie(
-             values=dist_values,
-             names=['Positive', 'Negative', 'Neutral'],
-             title=f'Sentiment Distribution for "{keyword}"',
-             color=['Positive', 'Negative', 'Neutral'],
-             color_discrete_map={
-                 'Positive': '#00CC96',
-                 'Negative': '#EF553B',
-                 'Neutral': '#636EFA'
-             },
-             hole=0.3
          )
-         st.plotly_chart(fig, use_container_width=True)
      except Exception as e:
-         st.error(f"Error plotting sentiment distribution: {str(e)}")

  # --------------------------
- # Main App Interface
  # --------------------------

  def main():
-     st.title("🌟 SentimentSync: Live Sentiment Analysis & Prediction Dashboard")

      # Sidebar controls
      with st.sidebar:
-         st.header("🔍 Analysis Controls")
          analysis_mode = st.radio(
-             "Analysis Mode",
-             ["Manual Text", "Live Data (Reddit & YouTube)"],
              index=0
          )

-         if analysis_mode == "Manual Text":
              user_input = st.text_area(
-                 "Enter text for sentiment analysis",
                  height=200,
-                 placeholder="Type or paste your text here..."
              )
-             analyze_btn = st.button("Analyze Text")
          else:
              keyword = st.text_input(
-                 "Enter keyword for live data",
-                 placeholder="e.g., Tesla, Bitcoin, etc."
              )
-             analyze_btn = st.button("Fetch & Analyze Data")

          st.markdown("---")
-         st.markdown("### Settings")
-         show_raw_data = st.checkbox("Show raw data", value=False)
-         enable_prediction = st.checkbox("Enable sentiment prediction", value=True)
          st.markdown("---")
-         st.button("🔄 Reset Analysis")
-
-     # Main content area
      if analyze_btn:
-         with st.spinner("Analyzing..."):
-             if analysis_mode == "Manual Text":
-                 if not user_input or not any(c.isalpha() for c in user_input):
-                     st.warning("Please enter valid text for analysis")
-                     return

-                 # Analyze the text
-                 vader_score, bert_num, textblob_score, bert_result = analyze_text(user_input)

                  # Display results
-                 st.subheader("📊 Sentiment Analysis Results")
                  cols = st.columns(3)
-                 cols[0].metric("VADER Score", f"{vader_score:.2f}",
-                                "Positive" if vader_score > 0 else "Negative" if vader_score < 0 else "Neutral")
-                 cols[1].metric("BERT Sentiment", bert_result['label'], f"Confidence: {bert_result['score']:.2f}")
-                 cols[2].metric("TextBlob Polarity", f"{textblob_score:.2f}",
-                                "Positive" if textblob_score > 0 else "Negative" if textblob_score < 0 else "Neutral")

                  # Word cloud
-                 st.subheader("📝 Word Cloud")
                  wordcloud_img = f'data:image/png;base64,{generate_wordcloud(user_input)}'
                  st.image(wordcloud_img, use_column_width=True)

-                 # Sentence-level analysis
-                 try:
-                     sentences = nltk.sent_tokenize(user_input)
-                     if len(sentences) > 1:
-                         st.subheader("🔍 Sentence-level Analysis")
-                         dates = [datetime.now() - timedelta(minutes=len(sentences)-i) for i in range(len(sentences))]
-                         sentence_data = [analyze_text(s) for s in sentences]
-
-                         df = pd.DataFrame({
-                             "Sentence": sentences,
-                             "VADER": [d[0] for d in sentence_data],
-                             "BERT": [d[1] for d in sentence_data],
-                             "TextBlob": [d[2] for d in sentence_data]
-                         })
-                         df["Average"] = df[["VADER", "BERT", "TextBlob"]].mean(axis=1)
-
-                         st.dataframe(df.style.background_gradient(
-                             cmap='RdYlGn',
-                             subset=["VADER", "BERT", "TextBlob", "Average"],
-                             vmin=-1, vmax=1
-                         ), use_container_width=True)
-
-                         plot_sentiment_trends(df, "Your Text")
-                 except Exception as e:
-                     st.error(f"Error in sentence analysis: {str(e)}")
-
-             else:  # Live Data mode
-                 if not keyword:
-                     st.warning("Please enter a keyword to search")
-                     return

-                 # Fetch data
-                 with st.spinner(f"Fetching data for '{keyword}'..."):
-                     reddit_df = fetch_reddit_data(keyword)
-                     youtube_df = fetch_youtube_data(keyword)
-
-                 if reddit_df.empty and youtube_df.empty:
-                     st.error("No data found. Try a different keyword.")
-                     return
-
-                 df = pd.concat([reddit_df, youtube_df], ignore_index=True)

-                 # Analyze sentiment for each item
-                 with st.spinner("Analyzing sentiment..."):
-                     results = []
-                     for _, row in df.iterrows():
-                         vader, bert, textblob, _ = analyze_text(row['text'])
-                         results.append((vader, bert, textblob))

-                 df['VADER'] = [r[0] for r in results]
-                 df['BERT'] = [r[1] for r in results]
-                 df['TextBlob'] = [r[2] for r in results]
-                 df['Average'] = df[['VADER', 'BERT', 'TextBlob']].mean(axis=1)

-                 # Display results
-                 st.subheader(f"📊 Overall Sentiment for '{keyword}'")

-                 # Metrics
-                 avg_sentiment = df['Average'].mean()
-                 pos_pct = len(df[df['Average'] > 0.1]) / len(df) * 100
-                 neg_pct = len(df[df['Average'] < -0.1]) / len(df) * 100

                  cols = st.columns(3)
-                 cols[0].metric("Average Sentiment", f"{avg_sentiment:.2f}",
-                                "Positive" if avg_sentiment > 0 else "Negative" if avg_sentiment < 0 else "Neutral")
                  cols[1].metric("Positive Content", f"{pos_pct:.1f}%")
                  cols[2].metric("Negative Content", f"{neg_pct:.1f}%")

                  # Word cloud
-                 st.subheader("📝 Word Cloud")
-                 combined_text = " ".join(df['text'])
-                 wordcloud_img = f'data:image/png;base64,{generate_wordcloud(combined_text)}'
-                 st.image(wordcloud_img, use_container_width=True)

-                 # Filter recent data (last 14 days)
-                 df['date'] = pd.to_datetime(df['date'])
-                 cutoff_date = datetime.now() - timedelta(days=14)
-                 df_recent = df[df['date'] >= cutoff_date].sort_values('date')

-                 if not df_recent.empty:
                      # Sentiment trends
-                     st.subheader("📅 Sentiment Trends (Last 14 Days)")
-                     plot_sentiment_trends(df_recent, keyword)
-
-                     # Sentiment distribution
-                     st.subheader("📊 Sentiment Distribution")
-                     plot_sentiment_distribution(df_recent, keyword)

-                     # Sentiment prediction
-                     if enable_prediction and len(df_recent) >= 7:  # Need at least 7 days of data
-                         st.subheader("🔮 Sentiment Prediction (Next 15 Days)")
-
-                         with st.spinner("Training prediction models..."):
-                             # Prepare time series data
-                             ts_df = prepare_time_series_data(df_recent)
-
-                             if ts_df is not None and len(ts_df) >= 7:
-                                 # Get predictions from different models
-                                 predictions = {
-                                     'Prophet': predict_sentiment_prophet(ts_df),
-                                     'ARIMA': predict_sentiment_arima(ts_df),
-                                     'Random Forest': predict_sentiment_rf(ts_df)
-                                 }
-
-                                 # Filter out None predictions
-                                 valid_predictions = {k: v for k, v in predictions.items() if v is not None}
-
-                                 if valid_predictions:
-                                     # Plot predictions
-                                     fig = plot_sentiment_predictions(df_recent, valid_predictions)
-                                     if fig:
-                                         st.plotly_chart(fig, use_container_width=True)
-
-                                     # Show prediction details
-                                     st.subheader("📋 Prediction Details")
-                                     for model_name, pred_df in valid_predictions.items():
-                                         st.markdown(f"**{model_name} Prediction**")
-                                         st.dataframe(pred_df.set_index('date').style.format("{:.2f}"), use_container_width=True)
-                                 else:
-                                     st.warning("Could not generate predictions with the available data.")
-                             else:
-                                 st.warning("Not enough data points for reliable prediction. Need at least 7 days of data.")
-
-                 # Raw data (if enabled)
-                 if show_raw_data:
-                     st.subheader("📋 Raw Data")
-                     st.dataframe(df_recent[['date', 'source', 'text', 'Average']], use_container_width=True)
                  else:
                      st.info("No recent data found (within last 14 days).")

  if __name__ == "__main__":
      main()

  import streamlit as st
  from transformers import pipeline
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
  import pandas as pd
  from datetime import datetime, timedelta
  import plotly.express as px
+ import plotly.graph_objects as go
  from wordcloud import WordCloud
  import base64
  from io import BytesIO
  import praw
  from googleapiclient.discovery import build
  import os
+ import time
+ from functools import lru_cache

  # --------------------------
+ # Initial Setup
  # --------------------------

  st.set_page_config(
+     page_title="🚀 SentimentSync Pro",
+     page_icon="📈",
      layout="wide"
  )

  # --------------------------
+ # Performance Optimizations
  # --------------------------

  @st.cache_resource
  def load_models():
+     """Load models with progress indicators"""
+     progress = st.progress(0, text="Loading sentiment models...")
+
      try:
          # Initialize sentiment models
+         with st.spinner("Loading BERT model..."):
+             bert_sentiment = pipeline(
+                 "sentiment-analysis",
+                 model="nlptown/bert-base-multilingual-uncased-sentiment"
+             )
+         progress.progress(50)
+
+         with st.spinner("Loading VADER analyzer..."):
+             vader_analyzer = SentimentIntensityAnalyzer()
+         progress.progress(100)
+
          return bert_sentiment, vader_analyzer
      except Exception as e:
+         st.error(f"Model loading failed: {str(e)}")
          return None, None

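Review note: `st.progress` and `st.spinner` calls inside a `@st.cache_resource` function only render on the first, uncached run; on every later rerun Streamlit returns the cached objects without executing the body. A minimal sketch of keeping the cached function UI-free, with feedback at the call site (the name `load_models_quiet` is illustrative, not part of the commit):

```python
import streamlit as st
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

@st.cache_resource
def load_models_quiet():
    # No UI calls in here: this body is skipped entirely on cache hits.
    bert = pipeline(
        "sentiment-analysis",
        model="nlptown/bert-base-multilingual-uncased-sentiment"
    )
    return bert, SentimentIntensityAnalyzer()

# Feedback lives at the call site, so the spinner shows whenever loading is actually slow.
with st.spinner("Loading sentiment models..."):
    bert_sentiment, vader_analyzer = load_models_quiet()
```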
  @st.cache_resource
  def setup_api_clients():
+     """Initialize API clients with error handling"""
      try:
+         with st.spinner("Initializing Reddit API..."):
+             reddit = praw.Reddit(
+                 client_id="S7pTXhj5JDFGDb3-_zrJEA",
+                 client_secret="QP3NYN4lrAKVLrBamzLGrpFywiVg8w",
+                 user_agent="SentimentSync/1.0"
+             )

+         with st.spinner("Initializing YouTube API..."):
+             youtube = build('youtube', 'v3', developerKey="AIzaSyDcUAkcoPvkTwN_tksmiW0dVPI5Bse7qos")

          return reddit, youtube
      except Exception as e:
+         st.error(f"API initialization failed: {str(e)}")
          return None, None

  # --------------------------
+ # Core Functions (Optimized)
  # --------------------------

+ def analyze_text(text, models):
+     """Optimized text analysis with batch processing"""
+     bert_sentiment, vader_analyzer = models
+
+     # Truncate very long texts to improve performance
+     truncated_text = text[:2000]  # Process first 2000 chars only
+
      try:
+         # Parallel processing would be better here, but keeping it simple
+         vader_score = vader_analyzer.polarity_scores(truncated_text)['compound']
+         textblob_score = TextBlob(truncated_text).sentiment.polarity

+         # Batch BERT processing for better performance
+         bert_result = bert_sentiment(truncated_text[:512])[0]  # BERT has 512 token limit

+         # Convert BERT label to numerical score
+         label_map = {
+             '1 star': -1,
+             '2 stars': -0.5,
+             '3 stars': 0,
+             '4 stars': 0.5,
+             '5 stars': 1
+         }
+         bert_num = label_map.get(bert_result['label'], 0)

+         return {
+             'vader': vader_score,
+             'bert': bert_num,
+             'textblob': textblob_score,
+             'bert_label': bert_result['label'],
+             'bert_confidence': bert_result['score']
+         }
      except Exception as e:
+         st.error(f"Analysis error: {str(e)}")
+         return {
+             'vader': 0,
+             'bert': 0,
+             'textblob': 0,
+             'bert_label': 'Error',
+             'bert_confidence': 0
+         }

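Review note: the `# Parallel processing would be better here` comment above could be realized with a small thread pool, since the three scorers are independent. BERT dominates the latency, so gains are modest, but VADER and TextBlob at least stop adding to it. A hedged sketch under the same conventions (the name `analyze_text_parallel` and the pool size are assumptions, not part of the commit):

```python
from concurrent.futures import ThreadPoolExecutor
from textblob import TextBlob

def analyze_text_parallel(text, models):
    """Hypothetical variant of analyze_text: run the three scorers concurrently."""
    bert_sentiment, vader_analyzer = models
    truncated = text[:2000]

    with ThreadPoolExecutor(max_workers=3) as pool:
        vader_f = pool.submit(lambda: vader_analyzer.polarity_scores(truncated)['compound'])
        blob_f = pool.submit(lambda: TextBlob(truncated).sentiment.polarity)
        bert_f = pool.submit(lambda: bert_sentiment(truncated[:512])[0])

    bert_result = bert_f.result()
    label_map = {'1 star': -1, '2 stars': -0.5, '3 stars': 0, '4 stars': 0.5, '5 stars': 1}
    return {
        'vader': vader_f.result(),
        'bert': label_map.get(bert_result['label'], 0),
        'textblob': blob_f.result(),
        'bert_label': bert_result['label'],
        'bert_confidence': bert_result['score'],
    }
```

Exception handling is left to the caller here; wrapping each `.result()` in try/except would mirror the commit's fallback dict.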
+ @st.cache_data(ttl=3600, show_spinner="Fetching data...")
+ def fetch_reddit_data(keyword, limit=30):
+     """Optimized Reddit data fetching"""
      try:
+         reddit, _ = setup_api_clients()
+         if not reddit:
+             return pd.DataFrame()

+         posts = list(reddit.subreddit("all").search(keyword, limit=limit))

+         return pd.DataFrame([{
+             'date': datetime.fromtimestamp(post.created_utc),
+             'text': f"{post.title}\n{post.selftext}",
+             'source': 'Reddit',
+             'url': f"https://reddit.com{post.permalink}"
+         } for post in posts])
      except Exception as e:
+         st.error(f"Reddit fetch error: {str(e)}")
          return pd.DataFrame()

+ @st.cache_data(ttl=3600, show_spinner="Fetching data...")
+ def fetch_youtube_data(keyword, limit=30):
+     """Optimized YouTube data fetching"""
      try:
+         _, youtube = setup_api_clients()
+         if not youtube:
+             return pd.DataFrame()
+
+         response = youtube.search().list(
              q=keyword,
              part="snippet",
              maxResults=limit,
              type="video",
              order="relevance"
+         ).execute()
+
+         return pd.DataFrame([{
+             'date': datetime.strptime(item['snippet']['publishedAt'], '%Y-%m-%dT%H:%M:%SZ'),
+             'text': f"{item['snippet']['title']}\n{item['snippet']['description']}",
+             'source': 'YouTube',
+             'url': f"https://youtube.com/watch?v={item['id']['videoId']}"
+         } for item in response['items']])
      except Exception as e:
+         st.error(f"YouTube fetch error: {str(e)}")
          return pd.DataFrame()

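Review note: `main()` below carries a `# Parallel fetching would be better here` comment. Because the two fetchers are independent network calls, a thread pool is the direct realization; the helper name `fetch_all_sources` is hypothetical. One caveat: calling `st.cache_data`-wrapped functions (and the `st.error` inside them) from worker threads may emit missing-ScriptRunContext warnings, so error reporting might need to move to the caller.

```python
from concurrent.futures import ThreadPoolExecutor

def fetch_all_sources(keyword):
    """Hypothetical helper: run both cached fetchers concurrently."""
    with ThreadPoolExecutor(max_workers=2) as pool:
        reddit_future = pool.submit(fetch_reddit_data, keyword)
        youtube_future = pool.submit(fetch_youtube_data, keyword)
        # .result() re-raises any exception raised in the worker thread
        return reddit_future.result(), youtube_future.result()
```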
  # --------------------------
  # Visualization Functions
  # --------------------------

+ def generate_wordcloud(text):
+     """Fast word cloud generation"""
+     try:
+         wordcloud = WordCloud(
+             width=800,
+             height=400,
+             background_color='white',
+             collocations=False,  # Faster processing
+             stopwords=nltk.corpus.stopwords.words('english')
+         ).generate(text)
+
+         img = BytesIO()
+         wordcloud.to_image().save(img, format='PNG')
+         return base64.b64encode(img.getvalue()).decode()
+     except Exception as e:
+         st.error(f"Word cloud error: {str(e)}")
+         return ""
+
+ def plot_sentiment(data, keyword):
+     """Optimized plotting function"""
      try:
          fig = px.line(
+             data,
              x='date',
+             y=['vader', 'bert', 'textblob', 'average'],
+             title=f'Sentiment Analysis for "{keyword}"',
              labels={'value': 'Sentiment Score', 'date': 'Date'},
              color_discrete_map={
+                 "vader": "#636EFA",
+                 "bert": "#EF553B",
+                 "textblob": "#00CC96",
+                 "average": "#AB63FA"
              }
          )
+         fig.update_layout(
+             hovermode="x unified",
+             xaxis_title="Date",
+             yaxis_title="Sentiment Score",
+             legend_title="Metric"
+         )
+         return fig
      except Exception as e:
+         st.error(f"Plotting error: {str(e)}")
+         return None

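Review note on the `px.line` call above: passing a list of columns to `y` is Plotly Express's wide-form mode; each listed column becomes its own trace, and the long-form names `value` and `variable` are what the `labels=` dict remaps. A self-contained sketch of that behavior (toy data, not from the commit):

```python
import pandas as pd
import plotly.express as px

df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=3),
    "vader": [0.2, 0.1, 0.4],
    "bert": [0.5, 0.0, 0.5],
})

# Wide-form input: one trace per listed column, legend entries named by column.
fig = px.line(df, x="date", y=["vader", "bert"],
              labels={"value": "Sentiment Score", "date": "Date"})
fig.show()
```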
  # --------------------------
+ # Main Application
  # --------------------------

  def main():
+     st.title("🚀 SentimentSync Pro - Real-time Analysis Dashboard")

      # Sidebar controls
      with st.sidebar:
+         st.header("🔧 Analysis Controls")
          analysis_mode = st.radio(
+             "Mode",
+             ["Text Analysis", "Live Data Analysis"],
              index=0
          )

+         if analysis_mode == "Text Analysis":
              user_input = st.text_area(
+                 "Enter text to analyze",
                  height=200,
+                 placeholder="Paste your content here..."
              )
+             analyze_btn = st.button("Analyze Now")
          else:
              keyword = st.text_input(
+                 "Search keyword",
+                 placeholder="e.g., Apple, Tesla, etc."
              )
+             analyze_btn = st.button("Fetch & Analyze")

          st.markdown("---")
+         st.markdown("### Options")
+         show_details = st.checkbox("Show detailed results", value=False)
          st.markdown("---")
+
+     # Main content
      if analyze_btn:
+         models = load_models()
+         if not all(models):
+             st.error("Required models failed to load")
+             return
+
+         if analysis_mode == "Text Analysis":
+             if not user_input.strip():
+                 st.warning("Please enter some text to analyze")
+                 return
+
+             with st.spinner("Analyzing content..."):
+                 start_time = time.time()
+                 result = analyze_text(user_input, models)
+                 processing_time = time.time() - start_time

+             st.success(f"Analysis completed in {processing_time:.2f} seconds")

              # Display results
              cols = st.columns(3)
+             cols[0].metric("VADER Score", f"{result['vader']:.2f}",
+                            "Positive" if result['vader'] > 0 else "Negative" if result['vader'] < 0 else "Neutral")
+             cols[1].metric("BERT Sentiment", result['bert_label'], f"Confidence: {result['bert_confidence']:.2f}")
+             cols[2].metric("TextBlob Score", f"{result['textblob']:.2f}",
+                            "Positive" if result['textblob'] > 0 else "Negative" if result['textblob'] < 0 else "Neutral")

              # Word cloud
+             st.subheader("📊 Text Visualization")
              wordcloud_img = f'data:image/png;base64,{generate_wordcloud(user_input)}'
              st.image(wordcloud_img, use_column_width=True)

+         else:  # Live Data Analysis
+             if not keyword.strip():
+                 st.warning("Please enter a search keyword")
+                 return

+             with st.spinner(f"Gathering data for '{keyword}'..."):
+                 start_time = time.time()

+                 # Parallel fetching would be better here
+                 reddit_data = fetch_reddit_data(keyword)
+                 youtube_data = fetch_youtube_data(keyword)
+
+                 if reddit_data.empty and youtube_data.empty:
+                     st.error("No data found. Try a different keyword.")
+                     return

+                 combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)

+                 # Analyze in batches
+                 analysis_results = []
+                 for _, row in combined_data.iterrows():
+                     analysis_results.append(analyze_text(row['text'], models))
+
+                 # Add results to dataframe
+                 combined_data['vader'] = [r['vader'] for r in analysis_results]
+                 combined_data['bert'] = [r['bert'] for r in analysis_results]
+                 combined_data['textblob'] = [r['textblob'] for r in analysis_results]
+                 combined_data['average'] = combined_data[['vader', 'bert', 'textblob']].mean(axis=1)

+             processing_time = time.time() - start_time
+             st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
+
+             # Display summary
+             st.subheader(f"📈 Overall Sentiment for '{keyword}'")

              cols = st.columns(3)
+             avg_sentiment = combined_data['average'].mean()
+             pos_pct = (combined_data['average'] > 0.1).mean() * 100
+             neg_pct = (combined_data['average'] < -0.1).mean() * 100
+
+             cols[0].metric("Avg Sentiment", f"{avg_sentiment:.2f}",
+                            "Positive" if avg_sentiment > 0 else "Negative" if avg_sentiment < 0 else "Neutral")
              cols[1].metric("Positive Content", f"{pos_pct:.1f}%")
              cols[2].metric("Negative Content", f"{neg_pct:.1f}%")

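Review note: the rewritten percentage math leans on the fact that the mean of a boolean Series is the fraction of `True` values, so `(combined_data['average'] > 0.1).mean() * 100` replaces the old `len(df[...]) / len(df) * 100` and also avoids a division-by-zero on an empty frame (it yields `nan` instead of raising). A tiny check:

```python
import pandas as pd

s = pd.Series([0.5, -0.3, 0.05, 0.2])
pos_pct = (s > 0.1).mean() * 100   # 2 of the 4 values exceed 0.1
assert pos_pct == 50.0

empty = pd.Series([], dtype=float)
print((empty > 0.1).mean())        # nan, rather than ZeroDivisionError
```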
              # Word cloud
+             st.subheader("📊 Content Visualization")
+             all_text = " ".join(combined_data['text'])
+             wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
+             st.image(wordcloud_img, use_column_width=True)

+             # Filter recent data
+             combined_data['date'] = pd.to_datetime(combined_data['date'])
+             recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=14))]

+             if not recent_data.empty:
                  # Sentiment trends
+                 st.subheader("📅 Sentiment Over Time")
+                 fig = plot_sentiment(recent_data, keyword)
+                 if fig:
+                     st.plotly_chart(fig, use_container_width=True)

+                 # Show details if enabled
+                 if show_details:
+                     st.subheader("🔍 Detailed Results")
+                     st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
              else:
                  st.info("No recent data found (within last 14 days).")

  if __name__ == "__main__":
+     # Initialize NLTK data
+     try:
+         nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))
+         nltk.download('punkt', quiet=True)
+         nltk.download('stopwords', quiet=True)
+     except:
+         pass
+
      main()
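Review note: the bare `except: pass` in the `__main__` block above hides download failures that the deleted `download_nltk_data` helper used to report. A more defensive sketch that downloads only missing resources and leaves a trace on failure (the helper name `ensure_nltk_data` is an assumption, not part of the commit):

```python
import nltk

def ensure_nltk_data():
    """Hypothetical bootstrap: fetch only what is missing, and say so if it fails."""
    for resource, path in [("punkt", "tokenizers/punkt"),
                           ("stopwords", "corpora/stopwords")]:
        try:
            nltk.data.find(path)              # already installed, nothing to do
        except LookupError:
            try:
                nltk.download(resource, quiet=True)
            except Exception as exc:          # keep the app running, but leave a trace
                print(f"NLTK resource '{resource}' unavailable: {exc}")

ensure_nltk_data()
```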