Arjon07CSE committed
Commit 6372aa2 · verified · 1 Parent(s): 1a6533b

performed all the fixes

Files changed (1)
  1. app.py +970 -516
app.py CHANGED
@@ -1,44 +1,28 @@
1
  # ==============================================================================
2
  # SOCIAL PERCEPTION ANALYZER - FINAL COMPLETE APPLICATION
3
- # Version: 3.0 (Architecturally Refactored, Production Ready)
4
  # ==============================================================================
5
-
6
  # --- IMPORTS ---
7
- import gradio as gr
 
 
8
  import pandas as pd
9
- import numpy as np
10
- import torch
11
- import sqlite3
12
- import json
13
  import logging
14
- import requests
15
- import os
16
  import time
17
- import random
18
- import functools
19
- from io import StringIO
20
  from datetime import datetime, timezone
21
  from logging.handlers import RotatingFileHandler
22
-
23
- # --- NLP & Machine Learning ---
24
- # BanglaBERT tokenizer removed for simplicity
25
- import dateparser
26
-
27
- # --- NLP & Machine Learning ---
28
- from transformers import pipeline, BitsAndBytesConfig, AutoTokenizer
29
- from sentence_transformers import SentenceTransformer
30
- from huggingface_hub.utils import HfHubHTTPError
31
-
32
- # --- Visualization ---
33
  import matplotlib.pyplot as plt
34
- from matplotlib.font_manager import FontProperties
35
  import seaborn as sns
36
  from wordcloud import WordCloud
37
 
38
  # ==============================================================================
39
  # SETUP PRODUCTION-GRADE LOGGING & CONFIGURATION
40
  # ==============================================================================
41
-
42
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
43
  log_handler = RotatingFileHandler('app.log', maxBytes=5*1024*1024, backupCount=2)
44
  log_handler.setFormatter(log_formatter)
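# [Editor's note] For reference, a self-contained sketch of the rotating-log
# pattern set up above: one 5 MB file with up to two backups (app.log.1,
# app.log.2). Names are illustrative; classes and arguments are the ones used
# in this file.
import logging
from logging.handlers import RotatingFileHandler

logger = logging.getLogger("app")
logger.setLevel(logging.INFO)
handler = RotatingFileHandler('app.log', maxBytes=5 * 1024 * 1024, backupCount=2)
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
logger.info("Application starting up.")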
@@ -50,466 +34,725 @@ logger.info("Application starting up.")
50
 
51
  # --- APPLICATION CONFIGURATION ---
52
  APP_TITLE = "Social Perception Analyzer"
53
- APP_TAGLINE = "A flatform for understanding Netizens dynamics"
54
- APP_FOOTER = "Developed by Arjon"
55
 
56
  # --- FONT CONFIGURATION ---
57
  FONT_PATH = 'NotoSansBengali-Regular.ttf'
58
- try:
59
- BANGLA_FONT = FontProperties(fname=FONT_PATH)
60
- logger.info("Successfully loaded 'NotoSansBengali-Regular.ttf' font.")
61
- except OSError:
62
- logger.error("Failed to load 'NotoSansBengali-Regular.ttf'. Ensure the file is in the root directory.")
63
- gr.Warning("Bangla font not found! Visualizations may not render text correctly.")
64
- BANGLA_FONT = FontProperties()
65
 
66
  # ==============================================================================
67
  # CORE HELPER FUNCTIONS
 
68
  def clean_bengali_text(text):
69
- # Remove non-Bengali characters except spaces and underscores (for joined phrases)
70
- # Preserve word shapes by not removing valid combining marks
71
  cleaned = re.sub(r'[^\u0980-\u09FF_\s]', '', str(text))
72
- # Remove extra spaces
73
  cleaned = re.sub(r'\s+', ' ', cleaned).strip()
74
  return cleaned
75
- # --- DEFINE ALL YOUR STOPWORDS FIRST ---
76
 
77
- # List of Bengali stop words
78
  BANGLA_STOP_WORDS = [
79
  'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
80
  'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
81
  'এতটাই', 'এতদূর', 'এতটুকু', 'এক', 'এবং', 'এবার', 'এমন', 'এমনভাবে', 'এর', 'এরা', 'এঁরা', 'এঁদের',
82
  'এই', 'এইভাবে', 'ও', 'ওঁরা', 'ওঁর', 'ওঁদের', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'কাছ', 'কাছে', 'কাজ',
83
- 'কারণ', 'কিছু', 'কিছুই', 'কিন্তু', 'কিভাবে', 'কেন', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'খুব',
84
- 'গুলি', 'গিয়ে', 'চায়', 'ছাড়া', 'জন্য', 'জানা', 'ঠিক', 'তিনি', 'তিন', 'তিনিও', 'তাকে', 'তাঁকে',
85
- 'তার', 'তাঁর', 'তারা', 'তাঁরা', 'তাদের', 'তাঁদের', 'তাহলে', ' থাকলেও', 'থেকে', 'মধ্যেই', 'মধ্যে',
86
- 'द्वारा', 'নয়', 'না', 'নিজের', 'নিজে', 'নিয়ে', 'পারেন', 'পারা', 'পারে', 'পরে', 'পর্যন্ত', 'পুনরায়',
87
- 'ফলে', 'বজায়', 'বা', 'বাদে', 'বার', 'বিশেষ', 'বিভিন্ন', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মাধ্যমে',
88
- 'মতো', 'মতোই', 'যখন', 'যদি', 'যদিও', 'যা', 'যাকে', 'যাওয়া', 'যায়', 'যে', 'যেখানে', 'যেতে', 'যেমন',
89
  'যেহেতু', 'রহিছে', 'শিক্ষা', 'শুধু', 'সঙ্গে', 'সব', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সাধারণ', 'সামনে', 'হতে',
90
- 'হতেই', 'হবে', 'হয়', 'হয়তো', 'হয়', 'হচ্ছে', 'হত', 'হলে', 'হলেও', 'হয়নি', 'হাজার', 'হোওয়া', 'আরও', 'আমরা',
91
  'আমার', 'আমি', 'আর', 'আগে', 'আগেই', 'আছে', 'আজ', 'তাকে', 'তাতে', 'তাদের', 'তাহার', 'তাহাতে', 'তাহারই',
92
  'তথা', 'তথাপি', 'সে', 'সেই', 'সেখান', 'সেখানে', 'থেকে', 'নাকি', 'নাগাদ', 'দু', 'দুটি', 'সুতরাং',
93
- 'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
94
  ]
95
 
96
- # Another set of stop words from your notebook
97
- NOTEBOOK_STOPWORDS = set([
98
- 'এবং', 'ও', 'বা', 'কিংবা', 'অথবা', 'কিন্তু', 'এর', 'এ', 'এই', 'সেই', 'ওই', 'এক', 'জন্য',
99
- 'আমার', 'তোমার', 'তার', 'আমাদের', 'তাদের', 'সে', 'তিনি', 'আমি', 'তুমি', 'যে', 'যায়', 'হয়',
100
- 'হবে', 'ছিল', 'আছে', 'নেই', 'এটা', 'ওটা', 'সেটা', 'করে', 'করতে', 'করেছে', 'করছেন', 'থেকে',
101
- 'সাথে', 'মধ্যে', 'উপরে', 'নিচে', 'পরে', 'আগে', 'শুধু', 'খুব', 'অনেক', 'আরও', 'হিসাবে', 'তাহলে',
102
- 'হলে', 'তাই', 'সুতরাং', 'কারণে', 'একটি', 'হয়ে', 'হয়েছিল', 'হচ্ছে', 'হয়েছে', 'না', 'হ্যাঁ', 'কি',
103
- 'কী', 'কে', 'কোন', 'গুলো', 'কিছু', 'বলেন', 'বললেন', 'বলল', 'আর', 'ভাই', 'হোক', 'চাই', 'বাদ',
104
- 'দিতে', 'দিয়ে', 'দিলেন', 'দেন', 'যাবে', 'যাক', 'পারা', 'পারে', 'করা', 'করি', 'করার', 'করছে',
105
- 'করবে', 'সব', 'এখন', 'যদি', 'কেন', 'কবে', 'কেমন', 'ইনশাআল্লাহ', 'আপনি', 'আপনার', 'আপনারা', 'আমরা'
106
- ])
107
 
108
- COMBINED_STOPWORDS = set(BANGLA_STOP_WORDS) | NOTEBOOK_STOPWORDS
109
  PHRASES_TO_JOIN = {
110
  "তারেক রহমান": "তারেক_রহমান",
111
- "খালেদা জিয়া": "খালেদা_জিয়া",
112
  "বিএনপি জিন্দাবাদ": "বিএনপি_জিন্দাবাদ"
113
- # Add more as needed
114
  }
115
 
116
-
117
  def get_dynamic_time_agg(start_date, end_date):
118
- """Hardened helper to determine time aggregation level."""
119
  if not isinstance(start_date, pd.Timestamp) or not isinstance(end_date, pd.Timestamp):
120
- return 'D', 'Daily' # Graceful fallback
 
121
  delta = end_date - start_date
122
- if delta.days <= 2: return 'H', 'Hourly'
123
- if delta.days <= 90: return 'D', 'Daily'
124
- if delta.days <= 730: return 'W', 'Weekly'
 
 
 
125
  return 'M', 'Monthly'
126
 
127
- # ==============================================================================
128
- # ML MODEL MANAGEMENT
129
- TOKENIZER_MODEL_ID = "csebuetnlp/banglabert_large"
130
- TOKENIZER = None
131
-
132
- def get_bangla_tokenizer():
133
- global TOKENIZER
134
- if TOKENIZER is None:
135
- try:
136
- TOKENIZER = AutoTokenizer.from_pretrained(TOKENIZER_MODEL_ID)
137
- logger.info("BanglaBERT tokenizer loaded successfully.")
138
- except Exception as e:
139
- logger.error(f"Failed to load BanglaBERT tokenizer: {e}")
140
- TOKENIZER = None
141
- return TOKENIZER
142
- # ==============================================================================
143
 
144
 
145
- ## Sentiment pipeline code removed for optimization
146
 
147
  # ==============================================================================
148
  # NEWS SCRAPER BACKEND
149
  # ==============================================================================
150
-
151
  def run_news_scraper_pipeline(search_keywords, sites, start_date_str, end_date_str, interval, max_pages, filter_keys, progress=gr.Progress()):
152
- """Full, robust implementation of the news scraper."""
153
  # Input validation and sanitization
154
- search_keywords = search_keywords.strip()
155
  if not all([search_keywords, start_date_str, end_date_str]):
156
  raise gr.Error("Search Keywords, Start Date, and End Date are required.")
157
-
158
  start_dt = dateparser.parse(start_date_str)
159
  end_dt = dateparser.parse(end_date_str)
 
160
  if not all([start_dt, end_dt]):
161
  raise gr.Error("Invalid date format. Please use a recognizable format like YYYY-MM-DD or '2 weeks ago'.")
162
-
163
  all_articles, current_dt = [], start_dt
 
 
164
  while current_dt <= end_dt:
165
  try:
166
  interval_end_dt = min(current_dt + pd.Timedelta(days=interval - 1), end_dt)
167
  start_str, end_str = current_dt.strftime('%Y-%m-%d'), interval_end_dt.strftime('%Y-%m-%d')
168
- progress(0, desc=f"Fetching news from {start_str} to {end_str}")
169
  site_query = f"({' OR '.join(['site:' + s.strip() for s in sites.split(',') if s.strip()])})" if sites else ""
170
  final_query = f'"{search_keywords}" {site_query} after:{start_str} before:{end_str}'
171
- googlenews = GoogleNews(lang='bn', region='BD')
 
172
  googlenews.search(final_query)
 
173
  for page in range(1, max_pages + 1):
174
  try:
175
  results = googlenews.results()
176
- if not results: break
 
177
  all_articles.extend(results)
 
178
  if page < max_pages:
179
  googlenews.getpage(page + 1)
180
- time.sleep(0.5) # Reduced sleep for performance
181
  except HTTPError as e:
182
- if e.code == 429:
183
- wait_time = 5 # Reduced wait for optimization
184
- gr.Warning(f"Rate limited by Google News. Pausing for {wait_time:.0f} seconds.")
185
  time.sleep(wait_time)
186
  else:
187
- logger.error(f"HTTP Error fetching news: {e}"); break
 
188
  except Exception as e:
189
- logger.error(f"An error occurred fetching news: {e}"); break
 
 
190
  current_dt += pd.Timedelta(days=interval)
191
  except Exception as e:
192
  logger.error(f"Error in news scraping loop: {e}")
193
  break
194
-
195
- if not all_articles: return pd.DataFrame(), pd.DataFrame()
196
 
197
  df = pd.DataFrame(all_articles).drop_duplicates(subset=['link'])
198
- df['published_date'] = df['date'].apply(lambda x: dateparser.parse(x, languages=['bn']))
199
- df.dropna(subset=['published_date', 'title'], inplace=True)
200
-
201
  if filter_keys and filter_keys.strip():
202
- # Advanced filtering logic: supports AND, OR, NOT, and phrase search
203
- def parse_query(query):
204
- # Simple parser for AND, OR, NOT, and phrase queries
205
  query = query.lower()
 
 
206
  tokens = re.findall(r'"[^"]+"|\S+', query)
207
- expr = []
 
 
208
  for token in tokens:
209
- if token == 'and': expr.append('&')
210
- elif token == 'or': expr.append('|')
211
- elif token == 'not': expr.append('!')
212
  else:
213
- if token.startswith('"') and token.endswith('"'):
214
- expr.append(f'"{token[1:-1]}"')
215
- else:
216
- expr.append(f'"{token}"')
217
- return ' '.join(expr)
218
-
219
- def match_complex_query(text, query):
220
- # Evaluate the parsed query against the text
221
- text = text.lower()
222
- expr = parse_query(query)
223
- # Replace quoted terms with their presence in text
224
- def term_eval(term):
225
- term = term.strip('"')
226
- return term in text
227
- # Replace operators with Python equivalents
228
- expr = re.sub(r'"([^"]+)"', lambda m: str(term_eval(m.group(0))), expr)
229
- expr = expr.replace('&', ' and ').replace('|', ' or ').replace('!', ' not ')
230
  try:
231
- return eval(expr)
232
- except Exception:
233
- return False
234
-
235
- mask = df.apply(lambda row: match_complex_query(str(row['title']) + ' ' + str(row['desc']), filter_keys), axis=1)
236
  df = df[mask]
237
-
238
- return df, df[['published_date', 'title', 'media', 'desc', 'link']].sort_values(by='published_date', ascending=False)
239
 
240
  # ==============================================================================
241
  # YOUTUBE ANALYZER BACKEND
242
  # ==============================================================================
243
- # (This section remains unchanged from the previous robust version)
244
- def _fetch_video_details(youtube_service, video_ids: list):
245
- all_videos_data = []
246
- try:
247
- for i in range(0, len(video_ids), 50):
248
- id_batch = video_ids[i:i+50]
249
- video_request = youtube_service.videos().list(part="snippet,statistics", id=",".join(id_batch))
250
- video_response = video_request.execute()
251
- for item in video_response.get('items', []):
252
- stats = item.get('statistics', {})
253
- all_videos_data.append({
254
- 'video_id': item['id'], 'video_title': item['snippet']['title'],
255
- 'channel': item['snippet']['channelTitle'], 'published_date': item['snippet']['publishedAt'],
256
- 'view_count': int(stats.get('viewCount', 0)), 'like_count': int(stats.get('likeCount', 0)),
257
- 'comment_count': int(stats.get('commentCount', 0))
258
- })
259
- except HttpError as e:
260
- logger.error(f"Could not fetch video details. Error: {e}")
261
- gr.Warning("Could not fetch details for some videos due to an API error.")
262
- return all_videos_data
263
-
264
- def _scrape_single_video_comments(youtube_service, video_id, max_comments):
265
- comments_list = []
266
- try:
267
- request = youtube_service.commentThreads().list(
268
- part="snippet", videoId=video_id, maxResults=min(max_comments, 100),
269
- order='relevance', textFormat="plainText"
270
- )
271
- response = request.execute()
272
- for item in response.get('items', []):
273
- snippet = item['snippet']['topLevelComment']['snippet']
274
- comments_list.append({
275
- 'author': snippet['authorDisplayName'], 'published_date_comment': snippet['publishedAt'],
276
- 'comment_text': snippet['textDisplay'], 'likes': snippet['likeCount'],
277
- 'replies': item['snippet']['totalReplyCount']
278
- })
279
- except HttpError as e:
280
- logger.warning(f"Could not retrieve comments for video {video_id} (may be disabled). Error: {e}")
281
- return comments_list
282
-
283
  def run_youtube_analysis_pipeline(api_key, query, max_videos_for_stats, num_videos_for_comments, max_comments_per_video, published_after, progress=gr.Progress()):
 
284
  # Use integrated API key for seamless experience
285
- api_key = "AIzaSyB_f3uROqZfwBWsc_sDEV63WmUHBgvGGqw"
286
- if not query: raise gr.Error("Search Keywords are required.")
287
  try:
 
 
288
  youtube = build('youtube', 'v3', developerKey=api_key)
289
  except HttpError as e:
290
  raise gr.Error(f"Failed to initialize YouTube service. Check API Key. Error: {e}")
291
  except Exception as e:
292
  raise gr.Error(f"An unexpected error occurred during API initialization: {e}")
293
-
294
  progress(0.1, desc="Performing broad scan for videos...")
295
  all_video_ids, next_page_token, total_results_estimate = [], None, 0
296
  PAGES_TO_FETCH = min(15, (max_videos_for_stats // 50) + 1)
297
- search_params = {'q': query, 'part': 'id', 'maxResults': 50, 'type': 'video', 'order': 'relevance'}
298
  if published_after:
299
  parsed_date = dateparser.parse(published_after)
300
  if parsed_date:
301
  search_params['publishedAfter'] = parsed_date.replace(tzinfo=timezone.utc).isoformat()
302
  else:
303
  gr.Warning(f"Could not parse date: '{published_after}'. Ignoring filter.")
304
-
305
  for page in range(PAGES_TO_FETCH):
306
  try:
307
- if next_page_token: search_params['pageToken'] = next_page_token
 
 
308
  response = youtube.search().list(**search_params).execute()
 
309
  if page == 0:
310
  total_results_estimate = response.get('pageInfo', {}).get('totalResults', 0)
311
- all_video_ids.extend([item['id']['videoId'] for item in response.get('items', [])])
312
  next_page_token = response.get('nextPageToken')
313
- progress(0.1 + (0.3 * (page / PAGES_TO_FETCH)), desc=f"Broad scan: Found {len(all_video_ids)} videos...")
314
- if not next_page_token: break
315
  except HttpError as e:
316
- if "quotaExceeded" in str(e): raise gr.Error("CRITICAL: YouTube API daily quota exceeded. Try again tomorrow.")
317
- logger.error(f"HTTP error during video search: {e}"); break
318
-
319
  if not all_video_ids:
320
- return pd.DataFrame(), pd.DataFrame(), 0
321
-
 
322
  progress(0.4, desc=f"Fetching details for {len(all_video_ids)} videos...")
323
  videos_df_full_scan = pd.DataFrame(_fetch_video_details(youtube, all_video_ids))
 
324
  if videos_df_full_scan.empty:
325
- return pd.DataFrame(), pd.DataFrame(), 0
326
-
 
327
  videos_df_full_scan['published_date'] = pd.to_datetime(videos_df_full_scan['published_date'])
328
- videos_df_full_scan['engagement_rate'] = ((videos_df_full_scan['like_count'] + videos_df_full_scan['comment_count']) / videos_df_full_scan['view_count']).fillna(0)
329
- videos_df_full_scan = videos_df_full_scan.sort_values(by='view_count', ascending=False).reset_index(drop=True)
330
-
331
- videos_to_scrape_df, all_comments = videos_df_full_scan.head(int(num_videos_for_comments)), []
332
  for index, row in videos_to_scrape_df.iterrows():
333
- progress(0.7 + (0.3 * (index / len(videos_to_scrape_df))), desc=f"Deep dive: Scraping comments from video {index+1}/{len(videos_to_scrape_df)}...")
334
- comments_for_video = _scrape_single_video_comments(youtube, row['video_id'], max_comments_per_video)
335
  if comments_for_video:
336
  for comment in comments_for_video:
337
- comment.update({'video_id': row['video_id'], 'video_title': row['video_title']})
338
  all_comments.extend(comments_for_video)
339
-
340
  comments_df = pd.DataFrame(all_comments)
341
  if not comments_df.empty:
342
  comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
343
-
344
- logger.info(f"YouTube analysis complete. Est. total videos: {total_results_estimate}. Scanned: {len(videos_df_full_scan)}. Comments: {len(comments_df)}.")
345
- return videos_df_full_scan, comments_df, total_results_estimate
346
-
347
 
348
  # ==============================================================================
349
  # ADVANCED ANALYTICS MODULE
350
  # ==============================================================================
351
- # (This section remains unchanged, as it was already robust)
352
- def set_plot_style():
353
- plt.style.use('seaborn-v0_8-whitegrid')
354
- plt.rcParams['figure.dpi'] = 100
355
-
356
- def run_sentiment_analysis(df: pd.DataFrame, text_column: str, progress=gr.Progress()):
357
- # Sentiment analysis removed
358
- return df
359
-
360
  def generate_scraper_dashboard(df: pd.DataFrame):
361
  set_plot_style()
362
 
 
363
  total_articles, unique_media = len(df), df['media'].nunique()
364
  start_date, end_date = pd.to_datetime(df['published_date']).min(), pd.to_datetime(df['published_date']).max()
365
  date_range_str = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
366
-
367
  agg_code, agg_name = get_dynamic_time_agg(start_date, end_date)
368
  timeline_df = df.set_index(pd.to_datetime(df['published_date'])).resample(agg_code).size().reset_index(name='count')
369
- timeline_plot = gr.LinePlot(timeline_df, x='published_date', y='count', title=f'{agg_name} News Volume', tooltip=['published_date', 'count'])
370
 
 
371
  media_counts = df['media'].dropna().value_counts().nlargest(15).sort_values()
372
  fig_media = None
373
  if not media_counts.empty:
374
- fig_media, ax = plt.subplots(figsize=(8, 6)); media_counts.plot(kind='barh', ax=ax, color='skyblue'); ax.set_title("Top 15 Media Sources", fontproperties=BANGLA_FONT)
375
- ax.set_yticklabels(media_counts.index, fontproperties=BANGLA_FONT); ax.set_xlabel("Article Count"); plt.tight_layout()
376
-
377
- text = " ".join(title for title in df['title'].astype(str))
378
- text = clean_bengali_text(text)
379
- for phrase, joined in PHRASES_TO_JOIN.items():
380
- text = text.replace(phrase, joined)
381
  fig_wc = None
382
  try:
383
  words = re.findall(r'[\u0980-\u09FF_]{2,}', text)
384
  words = [w for w in words if w not in COMBINED_STOPWORDS]
385
  words = [w for w in words if len(w) > 1]
386
  words = [w for w in words if not re.search(r'[a-zA-Z]', w)]
 
 
387
  from collections import Counter
388
  word_freq = Counter(words)
389
  min_freq = 2
390
  most_common = set([w for w, _ in word_freq.most_common(3)])
391
  filtered_words = [w for w in words if word_freq[w] >= min_freq and w not in most_common]
392
  wc_text = " ".join(filtered_words)
393
- wc = WordCloud(
394
- font_path=FONT_PATH,
395
- width=1600,
396
- height=900,
397
- background_color='white',
398
- stopwords=COMBINED_STOPWORDS,
399
- collocations=False,
400
- colormap='plasma',
401
- max_words=200,
402
- contour_width=2,
403
- contour_color='steelblue',
404
- regexp=r"[\u0980-\u09FF_]+"
405
- ).generate(wc_text)
406
- fig_wc, ax = plt.subplots(figsize=(15, 8))
407
- ax.imshow(wc, interpolation='bilinear')
408
- ax.axis("off")
409
- ax.set_title("Bengali Headline Word Cloud", fontproperties=BANGLA_FONT, fontsize=22)
410
- plt.tight_layout()
411
  except Exception as e:
412
- gr.Warning(f"WordCloud failed: {e}")
 
413
 
414
  return {
415
- kpi_total_articles: str(total_articles), kpi_unique_media: str(unique_media), kpi_date_range: date_range_str,
416
- dashboard_timeline_plot: timeline_plot, dashboard_media_plot: fig_media, dashboard_wordcloud_plot: fig_wc,
417
- scraper_dashboard_group: gr.update(visible=True)
418
  }
419
 
420
- def generate_sentiment_dashboard(df: pd.DataFrame):
421
- # Sentiment dashboard removed
422
- return {sentiment_dashboard_tab: gr.update(visible=False)}
423
-
424
  def generate_youtube_dashboard(videos_df, comments_df):
425
- set_plot_style()
426
- kpis = {
427
- kpi_yt_videos_found: f"{len(videos_df):,}" if videos_df is not None and not videos_df.empty else "0",
428
- kpi_yt_views_scanned: f"{videos_df['view_count'].sum():,}" if videos_df is not None and not videos_df.empty and 'view_count' in videos_df.columns else "0",
429
- kpi_yt_comments_scraped: f"{len(comments_df):,}" if comments_df is not None and not comments_df.empty else "0"
430
  }
431
 
432
- fig_channels, ax = None, None
433
  if videos_df is not None and not videos_df.empty and 'channel' in videos_df.columns:
434
  channel_counts = videos_df['channel'].value_counts().nlargest(15).sort_values()
435
  if not channel_counts.empty:
436
  fig_channels, ax = plt.subplots(figsize=(8, 6))
437
- channel_counts.plot(kind='barh', ax=ax, color='coral'); ax.set_title("Top 15 Channels by Video Volume", fontproperties=BANGLA_FONT); ax.set_yticklabels(channel_counts.index, fontproperties=BANGLA_FONT); plt.tight_layout()
438
-
439
- # Rich analytics: engagement, top videos, comment activity, time series, etc.
440
- fig_wc, fig_top_videos, fig_engagement, fig_comment_activity, fig_time_series = None, None, None, None, None
441
- if comments_df is not None and not comments_df.empty:
442
- # Top commented videos
443
- fig_top_videos, ax = None, None
444
- if 'video_title' in comments_df.columns:
445
- top_videos = comments_df['video_title'].value_counts().nlargest(10)
446
- if not top_videos.empty:
447
- fig_top_videos, ax = plt.subplots(figsize=(10, 6))
448
- top_videos.plot(kind='barh', ax=ax, color='dodgerblue')
449
- ax.set_title("Top 10 Videos by Comment Count", fontproperties=BANGLA_FONT)
450
- ax.set_xlabel("Comment Count")
451
- ax.set_yticklabels(top_videos.index, fontproperties=BANGLA_FONT)
452
- plt.tight_layout()
453
- plt.close(fig_top_videos)
454
-
455
- # Engagement rate per video
456
- fig_engagement, ax = None, None
457
- if 'video_id' in comments_df.columns and 'video_title' in comments_df.columns:
458
- engagement_df = comments_df.groupby('video_title').size().to_frame('comment_count')
459
- if videos_df is not None and not videos_df.empty:
460
- merged = videos_df.set_index('video_title').join(engagement_df, lsuffix='_video', rsuffix='_comment')
461
- # If 'comment_count' is missing, fill with 0
462
- if 'comment_count' not in merged.columns:
463
- merged['comment_count'] = 0
464
- # If 'view_count' is missing, fill with 1 to avoid division by zero
465
- if 'view_count' not in merged.columns:
466
- merged['view_count'] = 1
467
- merged['engagement_rate'] = merged['comment_count'] / merged['view_count']
468
- merged = merged.sort_values('engagement_rate', ascending=False).head(10)
469
- if not merged.empty:
470
- fig_engagement, ax = plt.subplots(figsize=(10, 6))
471
- merged['engagement_rate'].plot(kind='barh', ax=ax, color='mediumseagreen')
472
- ax.set_title("Top 10 Videos by Engagement Rate", fontproperties=BANGLA_FONT)
473
- ax.set_xlabel("Engagement Rate (Comments / Views)")
474
- ax.set_yticklabels(merged.index, fontproperties=BANGLA_FONT)
475
- plt.tight_layout()
476
- plt.close(fig_engagement)
477
-
478
- # Comment activity over time
479
- fig_time_series, ax = None, None
480
- if 'published_date_comment' in comments_df.columns:
481
- try:
482
- comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
483
- time_series = comments_df.set_index('published_date_comment').resample('D').size()
484
- if not time_series.empty:
485
- fig_time_series, ax = plt.subplots(figsize=(10, 4))
486
- time_series.plot(ax=ax, color='darkorange')
487
- ax.set_title("Comment Activity Over Time", fontproperties=BANGLA_FONT)
488
- ax.set_xlabel("Date")
489
- ax.set_ylabel("Number of Comments")
490
- plt.tight_layout()
491
- plt.close(fig_time_series)
492
- except Exception as e:
493
- logger.error(f"Error in comment activity plot: {e}")
494
-
495
- # Beautiful Bengali word cloud from YouTube comments
496
- fig_wc, ax = None, None
497
- if 'comment_text' in comments_df.columns:
498
  text = " ".join(comment for comment in comments_df['comment_text'].astype(str))
499
  text = clean_bengali_text(text)
 
 
500
  for phrase, joined in PHRASES_TO_JOIN.items():
501
  text = text.replace(phrase, joined)
502
- try:
503
- words = re.findall(r'[\u0980-\u09FF_]{2,}', text)
504
- words = [w for w in words if w not in COMBINED_STOPWORDS]
505
- words = [w for w in words if len(w) > 1]
506
- words = [w for w in words if not re.search(r'[a-zA-Z]', w)]
507
- from collections import Counter
508
- word_freq = Counter(words)
509
- min_freq = 2
510
- most_common = set([w for w, _ in word_freq.most_common(3)])
511
- filtered_words = [w for w in words if word_freq[w] >= min_freq and w not in most_common]
512
- wc_text = " ".join(filtered_words)
513
  wc = WordCloud(
514
  font_path=FONT_PATH,
515
  width=1600,
@@ -523,233 +766,444 @@ def generate_youtube_dashboard(videos_df, comments_df):
523
  contour_color='darkorange',
524
  regexp=r"[\u0980-\u09FF_]+"
525
  ).generate(wc_text)
 
526
  fig_wc, ax = plt.subplots(figsize=(15, 8))
527
  ax.imshow(wc, interpolation='bilinear')
528
  ax.axis("off")
529
  ax.set_title("Bengali Word Cloud from YouTube Comments", fontproperties=BANGLA_FONT, fontsize=22)
530
  plt.tight_layout()
531
  except Exception as e:
532
- logger.error(f"YouTube WordCloud failed: {e}")
533
-
534
- return {
535
- **kpis,
536
- yt_channel_plot: fig_channels,
537
- yt_wordcloud_plot: fig_wc,
538
- 'yt_top_videos_plot': fig_top_videos,
539
- 'yt_engagement_plot': fig_engagement,
540
- 'yt_comment_activity_plot': fig_comment_activity,
541
- 'yt_time_series_plot': fig_time_series
542
- }
543
-
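# [Editor's note] Both word clouds in this commit follow the same recipe:
# a Bengali-capable font, collocations disabled (phrases are pre-joined with
# underscores), and a regexp restricted to the Bengali Unicode block. A
# minimal sketch with illustrative sample text:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wc = WordCloud(
    font_path='NotoSansBengali-Regular.ttf',  # must ship alongside the app
    width=800, height=450, background_color='white',
    collocations=False,
    regexp=r"[\u0980-\u09FF_]+",
).generate("তারেক_রহমান বিএনপি নির্বাচন বিএনপি ঢাকা")
fig, ax = plt.subplots(figsize=(8, 4.5))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")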
544
- def generate_youtube_topic_dashboard(videos_df_full_scan: pd.DataFrame):
545
- if videos_df_full_scan is None or videos_df_full_scan.empty: return None, None, None
546
- set_plot_style()
547
 
548
- channel_views = videos_df_full_scan.groupby('channel')['view_count'].sum().nlargest(15).sort_values()
549
- fig_channel_views, ax = plt.subplots(figsize=(10, 7)); channel_views.plot(kind='barh', ax=ax, color='purple'); ax.set_title("Channel Dominance by Total Views (Top 15)", fontproperties=BANGLA_FONT); ax.set_xlabel("Combined Views on Topic"); ax.set_yticklabels(channel_views.index, fontproperties=BANGLA_FONT); plt.tight_layout(); plt.close(fig_channel_views)
550
-
551
- df_sample = videos_df_full_scan.sample(n=min(len(videos_df_full_scan), 200))
552
- avg_views, avg_engagement = df_sample['view_count'].median(), df_sample['engagement_rate'].median()
553
- fig_quadrant, ax = plt.subplots(figsize=(10, 8)); sns.scatterplot(data=df_sample, x='view_count', y='engagement_rate', size='like_count', sizes=(20, 400), hue='channel', alpha=0.7, ax=ax, legend=False)
554
- ax.set_xscale('log'); ax.set_yscale('log'); ax.set_title("Content Performance Quadrant", fontproperties=BANGLA_FONT); ax.set_xlabel("Video Views (Log Scale)", fontproperties=BANGLA_FONT); ax.set_ylabel("Engagement Rate (Log Scale)", fontproperties=BANGLA_FONT)
555
- ax.axhline(avg_engagement, ls='--', color='gray'); ax.axvline(avg_views, ls='--', color='gray'); ax.text(avg_views*1.1, ax.get_ylim()[1], 'High Performers', color='green', fontproperties=BANGLA_FONT); ax.text(ax.get_xlim()[0], avg_engagement*1.1, 'Niche Stars', color='blue', fontproperties=BANGLA_FONT); plt.close(fig_quadrant)
556
-
557
- fig_age, ax = plt.subplots(figsize=(10, 7)); sns.scatterplot(data=df_sample, x='published_date', y='view_count', size='engagement_rate', sizes=(20, 400), alpha=0.6, ax=ax)
558
- ax.set_yscale('log'); ax.set_title("Content Age vs. Impact", fontproperties=BANGLA_FONT); ax.set_xlabel("Publication Date", fontproperties=BANGLA_FONT); ax.set_ylabel("Views (Log Scale)", fontproperties=BANGLA_FONT); plt.xticks(rotation=45); plt.close(fig_age)
559
- return fig_channel_views, fig_quadrant, fig_age
560
 
561
  # ==============================================================================
562
  # GRADIO UI DEFINITION
563
  # ==============================================================================
564
-
565
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), title=APP_TITLE) as app:
566
  gr.Markdown(f"# {APP_TITLE}\n*{APP_TAGLINE}*")
567
-
568
  # --- STATE MANAGEMENT ---
569
  scraper_results_state = gr.State()
570
  youtube_results_state = gr.State()
571
-
572
- with gr.Tabs() as tabs:
573
  with gr.TabItem("1. News Scraper", id=0):
574
  with gr.Row():
575
  with gr.Column(scale=1):
576
- gr.Markdown("### 1. Search Criteria")
577
- search_keywords_textbox = gr.Textbox(label="Search Keywords", placeholder="e.g.,ডাকসু ")
578
- sites_to_search_textbox = gr.Textbox(label="Target Sites (Optional, comma-separated)", placeholder="e.g., prothomalo.com")
579
- start_date_textbox = gr.Textbox(label="Start Date", placeholder="YYYY-MM-DD or 'last week'")
580
- end_date_textbox = gr.Textbox(label="End Date", placeholder="YYYY-MM-DD or 'today'")
581
- gr.Markdown("### 2. Scraping Parameters")
582
- interval_days_slider = gr.Slider(1, 7, 3, step=1, label="Days per Interval")
583
- max_pages_slider = gr.Slider(1, 10, 5, step=1, label="Max Pages per Interval")
584
- filter_keywords_textbox = gr.Textbox(label="Filter Keywords (comma-separated, optional)", placeholder="e.g., নির্বাচন, ভিসি")
585
  start_scraper_button = gr.Button("Start Scraping & Analysis", variant="primary")
 
 
586
  with gr.Column(scale=2):
587
- scraper_results_df = gr.DataFrame(label="Filtered Results", interactive=False, wrap=True)
588
- scraper_download_file = gr.File(label="Download Filtered Results CSV")
589
-
590
  with gr.TabItem("2. News Analytics", id=1):
591
- with gr.Group(visible=False) as scraper_dashboard_group:
592
- with gr.Tabs():
593
- with gr.TabItem("Overview"):
594
- with gr.Row():
595
- kpi_total_articles = gr.Textbox(label="Total Articles Found", interactive=False)
596
- kpi_unique_media = gr.Textbox(label="Unique Media Sources", interactive=False)
597
- kpi_date_range = gr.Textbox(label="Date Range of Articles", interactive=False)
598
- dashboard_timeline_plot = gr.LinePlot(label="News Volume Timeline")
599
- with gr.Row():
600
- dashboard_media_plot = gr.Plot(label="Top Media Sources by Article Count")
601
- dashboard_wordcloud_plot = gr.Plot(label="Headline Word Cloud")
602
- with gr.TabItem("Sentiment Analysis", visible=False) as sentiment_dashboard_tab:
603
- with gr.Row():
604
- sentiment_pie_plot = gr.Plot(label="Overall Sentiment")
605
- sentiment_by_media_plot = gr.Plot(label="Sentiment by Media Source")
606
-
 
607
  with gr.TabItem("3. YouTube Topic Analysis", id=2):
 
 
608
  with gr.Row():
609
  with gr.Column(scale=1):
610
- gr.Markdown("### YouTube Search & Analysis")
611
- yt_api_key = gr.Textbox(label="YouTube API Key", placeholder="Paste your YouTube Data API v3 key here")
612
- yt_search_keywords = gr.Textbox(label="Search Keywords", placeholder="e.g.,বাংলাদেশ, নির্বাচন")
613
- yt_published_after = gr.Textbox(label="Published After Date (Optional)", placeholder="YYYY-MM-DD or '1 month ago'")
614
- gr.Markdown("### Analysis Parameters")
615
- yt_max_videos_for_stats = gr.Slider(label="Videos to Scan for Topic Stats (Broad Scan)", minimum=50, maximum=750, value=300, step=50)
616
- yt_num_videos_for_comments = gr.Slider(label="Top Videos for Comment Analysis (Deep Dive)", minimum=5, maximum=100, value=25, step=5)
617
- yt_max_comments = gr.Slider(10, 100, 30, step=10, label="Max Comments per Video")
618
- start_yt_analysis_button = gr.Button("Start YouTube Analysis", variant="primary")
619
  with gr.Column(scale=2):
620
- with gr.Group(visible=False) as yt_dashboard_group:
621
- gr.Markdown("### YouTube Topic Analytics Dashboard")
622
  with gr.Row():
623
- kpi_yt_total_topic_videos = gr.Textbox(label="Est. Total Videos on Topic (YT)", interactive=False)
624
- kpi_yt_videos_found = gr.Textbox(label="Videos Scanned for Stats", interactive=False)
625
- kpi_yt_views_scanned = gr.Textbox(label="Combined Views (of Scanned)", interactive=False)
626
- kpi_yt_comments_scraped = gr.Textbox(label="Comments Analyzed (from Top Videos)", interactive=False)
627
- with gr.Tabs():
628
- with gr.TabItem("Top Videos & Engagement"):
629
- yt_videos_df_output = gr.DataFrame(label="Top Videos Analyzed for Comments (sorted by views)")
630
- yt_top_videos_plot = gr.Plot(label="Top 10 Videos by Comment Count")
631
- yt_engagement_plot = gr.Plot(label="Top 10 Videos by Engagement Rate")
632
- with gr.TabItem("Comment Activity & Word Cloud"):
633
- yt_comment_activity_plot = gr.Plot(label="Comment Activity Over Time")
634
- yt_wordcloud_plot = gr.Plot(label="Bengali Word Cloud from Comments")
635
- with gr.TabItem("Channel & Topic Analytics"):
636
- yt_channel_plot = gr.Plot(label="Channel Contribution by Video Count")
637
- yt_channel_views_plot = gr.Plot(label="Channel Dominance by Views")
638
- yt_performance_quadrant_plot = gr.Plot(label="Content Performance Quadrant")
639
- yt_content_age_plot = gr.Plot(label="Content Age vs. Impact")
640
-
641
- gr.Markdown(f"<div style='text-align: center; margin-top: 20px;'>{APP_FOOTER}</div>")
642
 
643
- # ==============================================================================
644
- # EVENT HANDLERS
645
- # ==============================================================================
646
-
647
- # --- NEWS SCRAPER WORKFLOW ---
648
- def news_scraper_workflow(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress=gr.Progress()):
649
- progress(0, desc="Starting news analysis...")
650
- raw_df, display_df = run_news_scraper_pipeline(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress)
651
-
652
- if raw_df.empty:
653
- gr.Info("No news articles found for your query."); return None, None, None
654
 
655
- progress(0.8, desc="Analyzing sentiment of news headlines...")
656
- analyzed_df = run_sentiment_analysis(raw_df.copy(), 'title', progress)
657
-
658
- output_path = "filtered_news_data.csv"; display_df.to_csv(output_path, index=False)
659
- return display_df, output_path, analyzed_df
660
 
661
  start_scraper_button.click(
662
- fn=news_scraper_workflow,
663
- inputs=[search_keywords_textbox, sites_to_search_textbox, start_date_textbox, end_date_textbox, interval_days_slider, max_pages_slider, filter_keywords_textbox],
664
- outputs=[scraper_results_df, scraper_download_file, scraper_results_state]
665
- )
666
-
667
- def update_news_dashboards(analyzed_df):
668
- if analyzed_df is None or analyzed_df.empty:
669
- return [gr.update(visible=False), '', '', '', None, None, None, gr.update(visible=False), None, None]
670
- scraper_updates = generate_scraper_dashboard(analyzed_df)
671
- sentiment_updates = generate_sentiment_dashboard(analyzed_df)
672
- # Return outputs in the exact order of news_ui_components
673
- return [
674
- scraper_updates.get(scraper_dashboard_group, gr.update(visible=False)),
675
- scraper_updates.get(kpi_total_articles, ''),
676
- scraper_updates.get(kpi_unique_media, ''),
677
- scraper_updates.get(kpi_date_range, ''),
678
- scraper_updates.get(dashboard_timeline_plot, None),
679
- scraper_updates.get(dashboard_media_plot, None),
680
- scraper_updates.get(dashboard_wordcloud_plot, None),
681
- sentiment_updates.get(sentiment_dashboard_tab, gr.update(visible=False)),
682
- sentiment_updates.get(sentiment_pie_plot, None),
683
- sentiment_updates.get(sentiment_by_media_plot, None)
684
  ]
685
-
686
- news_ui_components = [
687
- scraper_dashboard_group, kpi_total_articles, kpi_unique_media, kpi_date_range,
688
- dashboard_timeline_plot, dashboard_media_plot, dashboard_wordcloud_plot,
689
- sentiment_dashboard_tab, sentiment_pie_plot, sentiment_by_media_plot
690
- ]
691
- scraper_results_state.change(fn=update_news_dashboards, inputs=scraper_results_state, outputs=news_ui_components)
692
-
693
- # --- YOUTUBE WORKFLOW ---
694
- def youtube_workflow(api_key, query, max_stats, num_comments, max_comments, published_after, progress=gr.Progress()):
695
- sanitized_api_key = api_key.strip()
696
- sanitized_query = query.strip()
697
- videos_df_full, comments_df, total_vids_est = run_youtube_analysis_pipeline(
698
- sanitized_api_key, sanitized_query, max_stats, num_comments, max_comments, published_after, progress
699
- )
700
- if videos_df_full.empty:
701
- gr.Info("No videos found for your YouTube query."); return None, None
702
-
703
- if comments_df is not None and not comments_df.empty:
704
- progress(0.9, desc="Analyzing comment sentiment...")
705
- comments_df = run_sentiment_analysis(comments_df.copy(), 'comment_text', progress)
706
-
707
- top_videos_for_display = videos_df_full.head(int(num_comments))
708
- return top_videos_for_display, {"full_scan": videos_df_full, "comments": comments_df, "total_estimate": total_vids_est}
709
-
710
- start_yt_analysis_button.click(
711
- fn=youtube_workflow,
712
- inputs=[yt_api_key, yt_search_keywords, yt_max_videos_for_stats, yt_num_videos_for_comments, yt_max_comments, yt_published_after],
713
- outputs=[yt_videos_df_output, youtube_results_state]
714
  )
715
 
716
- def update_youtube_dashboards(results_data):
717
- # This function is now corrected.
718
- if not results_data or results_data.get("full_scan") is None or results_data["full_scan"].empty:
719
- # Return the correct number of empty items (13) to match the component list
720
- return [gr.update(visible=False), "0", "0", "0", "0", None, None, None, None, None, None, None, None]
721
-
722
- videos_df_full, comments_df, total_estimate = results_data.get("full_scan"), results_data.get("comments"), results_data.get("total_estimate", 0)
723
- deep_dive_updates = generate_youtube_dashboard(videos_df_full, comments_df)
724
- fig_ch_views, fig_quad, fig_age = generate_youtube_topic_dashboard(videos_df_full)
725
-
726
- # This return list now perfectly matches the yt_ui_components list below.
727
- return [
728
- gr.update(visible=True),
729
- f"{total_estimate:,}",
730
- deep_dive_updates.get(kpi_yt_videos_found, "0"),
731
- deep_dive_updates.get(kpi_yt_views_scanned, "0"),
732
- deep_dive_updates.get(kpi_yt_comments_scraped, "0"),
733
- deep_dive_updates.get('yt_channel_plot', None),
734
- deep_dive_updates.get('yt_wordcloud_plot', None),
735
- deep_dive_updates.get('yt_top_videos_plot', None),
736
- deep_dive_updates.get('yt_engagement_plot', None),
737
- deep_dive_updates.get('yt_comment_activity_plot', None),
738
- fig_ch_views,
739
- fig_quad,
740
- fig_age
741
- ]
742
 
743
- # This is the list that was causing the error.
744
- # The undefined variables `yt_sentiment_pie_plot` and `yt_sentiment_by_video_plot` have been removed.
745
- yt_ui_components = [
746
- yt_dashboard_group, kpi_yt_total_topic_videos, kpi_yt_videos_found, kpi_yt_views_scanned, kpi_yt_comments_scraped,
747
- yt_channel_plot, yt_wordcloud_plot, yt_top_videos_plot, yt_engagement_plot,
748
- yt_comment_activity_plot, yt_channel_views_plot, yt_performance_quadrant_plot, yt_content_age_plot
749
- ]
750
- youtube_results_state.change(fn=update_youtube_dashboards, inputs=youtube_results_state, outputs=yt_ui_components)
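# [Editor's note] The wiring above uses a state-fan-out pattern: a click
# handler writes results into gr.State, and the state's .change() listener
# repopulates the dashboard components. A stripped-down sketch of the same
# pattern (component names here are illustrative):
import gradio as gr

with gr.Blocks() as demo:
    results_state = gr.State()
    run_button = gr.Button("Run")
    summary_box = gr.Textbox(label="Summary")

    run_button.click(fn=lambda: {"rows": 42}, outputs=results_state)
    results_state.change(fn=lambda d: f"{d['rows']} rows analyzed",
                         inputs=results_state, outputs=summary_box)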
751
  # ==============================================================================
752
  # LAUNCH THE APP
753
-
754
  if __name__ == "__main__":
755
- app.launch(debug=True, share=True)
 
1
  # ==============================================================================
2
  # SOCIAL PERCEPTION ANALYZER - FINAL COMPLETE APPLICATION
3
+ # Version: 4.1 (Fully Refactored, Production-Ready)
4
  # ==============================================================================
 
5
  # --- IMPORTS ---
6
+ import re
7
+ from GoogleNews import GoogleNews
8
+ from requests.exceptions import HTTPError
9
  import pandas as pd
10
  import logging
 
 
11
  import time
12
  from datetime import datetime, timezone
13
  from logging.handlers import RotatingFileHandler
14
+ import gradio as gr
15
  import matplotlib.pyplot as plt
16
+ from matplotlib.font_manager import FontProperties, fontManager
17
  import seaborn as sns
18
  from wordcloud import WordCloud
19
+ import dateparser
20
+ import numpy as np
21
+ import os
22
 
23
  # ==============================================================================
24
  # SETUP PRODUCTION-GRADE LOGGING & CONFIGURATION
25
  # ==============================================================================
 
26
  log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
27
  log_handler = RotatingFileHandler('app.log', maxBytes=5*1024*1024, backupCount=2)
28
  log_handler.setFormatter(log_formatter)
 
34
 
35
  # --- APPLICATION CONFIGURATION ---
36
  APP_TITLE = "Social Perception Analyzer"
37
+ APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
38
+ APP_FOOTER = "Developed by CDSR"
39
 
40
  # --- FONT CONFIGURATION ---
41
  FONT_PATH = 'NotoSansBengali-Regular.ttf'
42
+ BANGLA_FONT = None
43
+
44
+ def setup_bangla_font():
45
+ """Properly set up Bengali font for all visualizations"""
46
+ global BANGLA_FONT
47
+ # Strictly enforce NotoSansBengali-Regular.ttf for all Bengali text
48
+ if os.path.exists(FONT_PATH):
49
+ try:
50
+ fontManager.addfont(FONT_PATH)
51
+ BANGLA_FONT = FontProperties(fname=FONT_PATH)
52
+ plt.rcParams['font.family'] = BANGLA_FONT.get_name()
53
+ plt.rcParams['axes.unicode_minus'] = False
54
+ logger.info(f"Successfully loaded '{FONT_PATH}' for Bengali text.")
55
+ return True
56
+ except Exception as e:
57
+ logger.error(f"Error loading Bengali font: {e}")
58
+ return False
59
+ else:
60
+ logger.error(f"Font file {FONT_PATH} not found. Bengali text will not render correctly.")
61
+ BANGLA_FONT = None
62
+ plt.rcParams['font.family'] = 'sans-serif'
63
+ return False
64
+
65
+ # Initialize font system
66
+ font_loaded = setup_bangla_font()
67
 
68
  # ==============================================================================
69
  # CORE HELPER FUNCTIONS
70
+ # ==============================================================================
71
  def clean_bengali_text(text):
72
+ """Remove non-Bengali characters except spaces and underscores (for joined phrases)"""
 
73
  cleaned = re.sub(r'[^\u0980-\u09FF_\s]', '', str(text))
 
74
  cleaned = re.sub(r'\s+', ' ', cleaned).strip()
75
  return cleaned
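# [Editor's note] An illustrative call: everything outside the Bengali block
# (U+0980-U+09FF), underscores, and whitespace is dropped, then spaces collapse.
# clean_bengali_text("BNP নেতা তারেক_রহমান (2024)!")  ->  'নেতা তারেক_রহমান'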
 
76
 
77
+ # Comprehensive stopword list for Bengali text analysis
78
  BANGLA_STOP_WORDS = [
79
  'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
80
  'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
81
  'এতটাই', 'এতদূর', 'এতটুকু', 'এক', 'এবং', 'এবার', 'এমন', 'এমনভাবে', 'এর', 'এরা', 'এঁরা', 'এঁদের',
82
  'এই', 'এইভাবে', 'ও', 'ওঁরা', 'ওঁর', 'ওঁদের', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'কাছ', 'কাছে', 'কাজ',
83
+ 'কারণ', 'কিছু', 'কিছুই', 'কিন্তু', 'কিভাবে', 'কেন', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'খুব',
84
+ 'গুলি', 'গিয়ে', 'চায়', 'ছাড়া', 'জন্য', 'জানা', 'ঠিক', 'তিনি', 'তিন', 'তিনিও', 'তাকে', 'তাঁকে',
85
+ 'তার', 'তাঁর', 'তারা', 'তাঁরা', 'তাদের', 'তাঁদের', 'তাহলে', 'থাকলেও', 'থেকে', 'মধ্যেই', 'মধ্যে',
86
+ 'দ্বারা', 'নয়', 'না', 'নিজের', 'নিজে', 'নিয়ে', 'পারেন', 'পারা', 'পারে', 'পরে', 'পর্যন্ত', 'পুনরায়',
87
+ 'ফলে', 'বজায়', 'বা', 'বাদে', 'বার', 'বিশেষ', 'বিভিন্ন', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মাধ্যমে',
88
+ 'মতো', 'মতোই', 'যখন', 'যদি', 'যদিও', 'যা', 'যাকে', 'যাওয়া', 'যায়', 'যে', 'যেখানে', 'যেতে', 'যেমন',
89
  'যেহেতু', 'রহিছে', 'শিক্ষা', 'শুধু', 'সঙ্গে', 'সব', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সাধারণ', 'সামনে', 'হতে',
90
+ 'হতেই', 'হবে', 'হয়', 'হয়তো', 'হয়', 'হচ্ছে', 'হত', 'হলে', 'হলেও', 'হয়নি', 'হাজার', 'হোওয়া', 'আরও', 'আমরা',
91
  'আমার', 'আমি', 'আর', 'আগে', 'আগেই', 'আছে', 'আজ', 'তাকে', 'তাতে', 'তাদের', 'তাহার', 'তাহাতে', 'তাহারই',
92
  'তথা', 'তথাপি', 'সে', 'সেই', 'সেখান', 'সেখানে', 'থেকে', 'নাকি', 'নাগাদ', 'দু', 'দুটি', 'সুতরাং',
93
+ 'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
94
  ]
95
 
96
+ COMBINED_STOPWORDS = set(BANGLA_STOP_WORDS)
97
 
 
98
  PHRASES_TO_JOIN = {
99
  "তারেক রহমান": "তারেক_রহমান",
100
+ "খালেদা জিয়া": "খালেদা_জিয়া",
101
  "বিএনপি জিন্দাবাদ": "বিএনপি_জিন্দাবাদ"
 
102
  }
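# [Editor's note] The dictionary is applied as plain substring replacement
# before tokenization, so multi-word names survive whitespace splitting:
# text = "তারেক রহমান এবং খালেদা জিয়া"
# for phrase, joined in PHRASES_TO_JOIN.items():
#     text = text.replace(phrase, joined)
# text == "তারেক_রহমান এবং খালেদা_জিয়া"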
103
 
 
104
  def get_dynamic_time_agg(start_date, end_date):
105
+ """Determine appropriate time aggregation level based on date range"""
106
  if not isinstance(start_date, pd.Timestamp) or not isinstance(end_date, pd.Timestamp):
107
+ return 'D', 'Daily' # Graceful fallback
108
+
109
  delta = end_date - start_date
110
+ if delta.days <= 2:
111
+ return 'H', 'Hourly'
112
+ if delta.days <= 90:
113
+ return 'D', 'Daily'
114
+ if delta.days <= 730:
115
+ return 'W', 'Weekly'
116
  return 'M', 'Monthly'
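# [Editor's note] The returned pair feeds straight into pandas resampling
# (see generate_scraper_dashboard). Worked example:
# get_dynamic_time_agg(pd.Timestamp('2024-01-01'), pd.Timestamp('2024-03-01'))
# -> ('D', 'Daily') for the 60-day span, then:
# df.set_index('published_date').resample('D').size()
# Note: recent pandas versions prefer the 'h'/'ME' aliases over 'H'/'M'.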
117
 
118
+ def kpi_badge_html(value, label, threshold_high=None, threshold_low=None):
119
+ """
120
+ Returns HTML for a color-coded KPI badge.
121
+ Green for high, red for low, yellow for medium.
122
+ """
123
+ try:
124
+ # Handle comma-separated numbers
125
+ if isinstance(value, str) and ',' in value:
126
+ val = float(value.replace(',', ''))
127
+ else:
128
+ val = float(value)
129
+ except (TypeError, ValueError, AttributeError):
130
+ val = value
131
+
132
+ color = '#e0e0e0' # default
133
+ if threshold_high is not None and isinstance(val, (int, float)) and val >= threshold_high:
134
+ color = '#4caf50' # green
135
+ elif threshold_low is not None and isinstance(val, (int, float)) and val <= threshold_low:
136
+ color = '#f44336' # red
137
+ elif threshold_high is not None and threshold_low is not None and isinstance(val, (int, float)):
138
+ color = '#ffeb3b' # yellow
139
+
140
+ # Format value with commas for large numbers
141
+ if isinstance(value, (int, float)):
142
+ formatted_value = f"{value:,.0f}"
143
+ else:
144
+ formatted_value = str(value)
145
+
146
+ return f"<div style='display:inline-block;padding:8px 16px;border-radius:8px;background:{color};color:#222;font-weight:bold;margin:2px;'>{label}: {formatted_value}</div>"
147
 
148
+ def set_plot_style():
149
+ """Configure consistent matplotlib style for all visualizations"""
150
+ plt.style.use('seaborn-v0_8-whitegrid')
151
+ plt.rcParams['figure.dpi'] = 100
152
+ plt.rcParams['savefig.dpi'] = 300
153
+ plt.rcParams['figure.figsize'] = (10, 6)
154
+ # Always use NotoSansBengali-Regular.ttf for Bengali text
155
+ if BANGLA_FONT and BANGLA_FONT.get_name():
156
+ plt.rcParams['font.family'] = BANGLA_FONT.get_name()
157
+ else:
158
+ plt.rcParams['font.family'] = 'sans-serif'
159
+ plt.rcParams['axes.unicode_minus'] = False # Fix for minus sign rendering
160
 
161
+ def cleanup_figures(*figures):
162
+ """Properly close matplotlib figures to prevent memory leaks"""
163
+ for fig in figures:
164
+ if fig is not None:
165
+ try:
166
+ plt.close(fig)
167
+ except:
168
+ pass
169
 
170
  # ==============================================================================
171
  # NEWS SCRAPER BACKEND
172
  # ==============================================================================
 
173
  def run_news_scraper_pipeline(search_keywords, sites, start_date_str, end_date_str, interval, max_pages, filter_keys, progress=gr.Progress()):
174
+ """Full implementation of the news scraper with robust error handling."""
175
  # Input validation and sanitization
176
+ search_keywords = str(search_keywords).strip() if search_keywords else ""
177
+ sites = str(sites).strip() if sites else ""
178
+ start_date_str = str(start_date_str).strip() if start_date_str else ""
179
+ end_date_str = str(end_date_str).strip() if end_date_str else ""
180
+ filter_keys = str(filter_keys).strip() if filter_keys else ""
181
+
182
  if not all([search_keywords, start_date_str, end_date_str]):
183
  raise gr.Error("Search Keywords, Start Date, and End Date are required.")
184
+
185
  start_dt = dateparser.parse(start_date_str)
186
  end_dt = dateparser.parse(end_date_str)
187
+
188
  if not all([start_dt, end_dt]):
189
  raise gr.Error("Invalid date format. Please use a recognizable format like YYYY-MM-DD or '2 weeks ago'.")
190
+
191
+ # Ensure start date is before end date
192
+ if start_dt > end_dt:
193
+ start_dt, end_dt = end_dt, start_dt
194
+ gr.Warning("Start date was after end date. Dates have been swapped.")
195
+
196
  all_articles, current_dt = [], start_dt
197
+ total_intervals = (end_dt - start_dt).days // interval + 1
198
+
199
  while current_dt <= end_dt:
200
  try:
201
  interval_end_dt = min(current_dt + pd.Timedelta(days=interval - 1), end_dt)
202
  start_str, end_str = current_dt.strftime('%Y-%m-%d'), interval_end_dt.strftime('%Y-%m-%d')
203
+
204
+ progress((current_dt - start_dt).days / (end_dt - start_dt).days,
205
+ desc=f"Fetching news from {start_str} to {end_str}")
206
+
207
  site_query = f"({' OR '.join(['site:' + s.strip() for s in sites.split(',') if s.strip()])})" if sites else ""
208
  final_query = f'"{search_keywords}" {site_query} after:{start_str} before:{end_str}'
209
+
210
+ googlenews = GoogleNews(lang='bn', region='BD', period='1d')
211
  googlenews.search(final_query)
212
+
213
  for page in range(1, max_pages + 1):
214
  try:
215
  results = googlenews.results()
216
+ if not results:
217
+ break
218
  all_articles.extend(results)
219
+
220
  if page < max_pages:
221
  googlenews.getpage(page + 1)
222
+ time.sleep(0.3) # Reduced sleep for performance
223
  except HTTPError as e:
224
+ if e.response.status_code == 429:
225
+ wait_time = 3 # Reduced wait for optimization
226
+ gr.Warning(f"Rate limited by Google News. Pausing for {wait_time} seconds.")
227
  time.sleep(wait_time)
228
  else:
229
+ logger.error(f"HTTP Error fetching news: {e}")
230
+ break
231
  except Exception as e:
232
+ logger.error(f"An error occurred fetching news: {e}")
233
+ break
234
+
235
  current_dt += pd.Timedelta(days=interval)
236
  except Exception as e:
237
  logger.error(f"Error in news scraping loop: {e}")
238
  break
 
 
239
 
240
+ if not all_articles:
241
+ return pd.DataFrame(), pd.DataFrame()
242
+
243
+ # Create DataFrame and clean data
244
  df = pd.DataFrame(all_articles).drop_duplicates(subset=['link'])
245
+
246
+ # Parse dates safely
247
+ df['published_date'] = df['date'].apply(lambda x: dateparser.parse(x, languages=['bn']) if pd.notna(x) else None)
248
+
249
+ # Drop rows with missing critical data
250
+ df = df.dropna(subset=['published_date', 'title'])
251
+
252
+ # Apply advanced filtering if filter keywords are provided
253
  if filter_keys and filter_keys.strip():
254
+ def match_complex_query(text, query):
255
+ """Advanced query parser supporting AND, OR, NOT logic"""
256
+ if not text or not query:
257
+ return False
258
+
259
+ text = str(text).lower()
260
  query = query.lower()
261
+
262
+ # Simple tokenization that preserves phrases in quotes
263
  tokens = re.findall(r'"[^"]+"|\S+', query)
264
+
265
+ # Build a regex pattern from the tokens
266
+ patterns = []
267
  for token in tokens:
268
+ if token == 'and':
269
+ continue # We'll handle this with the final pattern
270
+ elif token == 'or':
271
+ patterns.append('|')
272
+ elif token == 'not':
273
+ patterns.append('(?=^(?!.*')
274
  else:
275
+ # Clean token and convert to regex pattern
276
+ clean_token = token.strip('"')
277
+ if clean_token.startswith('"') and clean_token.endswith('"'):
278
+ clean_token = clean_token[1:-1]
279
+ patterns.append(re.escape(clean_token))
280
+
281
+ # Join patterns and handle negation
282
+ final_pattern = ''.join(patterns)
283
+ if '(?=' in final_pattern:
284
+ final_pattern += '))'
285
+
286
  try:
287
+ return bool(re.search(final_pattern, text))
288
+ except:
289
+ # Fallback to simple substring match if regex fails
290
+ return any(token in text for token in tokens if token not in ['and', 'or', 'not'])
291
+
292
+ # Apply filtering to title and description
293
+ mask = df.apply(lambda row: match_complex_query(
294
+ str(row['title']) + ' ' + str(row.get('desc', '')),
295
+ filter_keys
296
+ ), axis=1)
297
+
298
  df = df[mask]
299
+
300
+ # Return both full dataset and filtered display dataset
301
+ # Always return all Google News fields (published_date, title, media, description, link)
302
+ # Some sources use 'desc', some use 'description'. Unify to 'description'.
303
+ if 'desc' in df.columns and 'description' not in df.columns:
304
+ df['description'] = df['desc']
305
+ return df, df[['published_date', 'title', 'media', 'description', 'link']].sort_values(by='published_date', ascending=False)
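# [Editor's note] The regex fragment assembled for NOT above ('(?=^(?!.*') is
# unbalanced, so most NOT queries will fall through to the substring fallback.
# A minimal editorial sketch (not part of the commit) covering the advertised
# AND/NOT semantics without regex assembly or eval; OR is omitted for brevity:
def match_terms(text: str, query: str) -> bool:
    """Every plain term (or "quoted phrase") must appear; any term
    preceded by 'not' must be absent. 'and' is implicit."""
    text = text.lower()
    tokens = re.findall(r'"[^"]+"|\S+', query.lower())
    must, must_not, negate = [], [], False
    for tok in tokens:
        if tok == 'and':
            continue
        if tok == 'not':
            negate = True
            continue
        (must_not if negate else must).append(tok.strip('"'))
        negate = False
    return all(t in text for t in must) and not any(t in text for t in must_not)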
306
 
307
  # ==============================================================================
308
  # YOUTUBE ANALYZER BACKEND
309
  # ==============================================================================
310
  def run_youtube_analysis_pipeline(api_key, query, max_videos_for_stats, num_videos_for_comments, max_comments_per_video, published_after, progress=gr.Progress()):
311
+ """Complete YouTube analysis pipeline with robust error handling."""
312
  # Use integrated API key for seamless experience
313
+ api_key = os.getenv("YOUTUBE_API_KEY", "AIzaSyAiiGsKTJyIe4SRfC2uUXwhQ6KO-DEjgIA")
314
+
315
+ if not query:
316
+ raise gr.Error("Search Keywords are required.")
317
+
318
  try:
319
+ from googleapiclient.discovery import build
320
+ from googleapiclient.errors import HttpError
321
  youtube = build('youtube', 'v3', developerKey=api_key)
322
+ except ImportError:
323
+ logger.error("Required YouTube API libraries not installed")
324
+ raise gr.Error("YouTube analysis requires additional libraries. Please install google-api-python-client.")
325
  except HttpError as e:
326
  raise gr.Error(f"Failed to initialize YouTube service. Check API Key. Error: {e}")
327
  except Exception as e:
328
  raise gr.Error(f"An unexpected error occurred during API initialization: {e}")
329
+
330
  progress(0.1, desc="Performing broad scan for videos...")
331
  all_video_ids, next_page_token, total_results_estimate = [], None, 0
332
  PAGES_TO_FETCH = min(15, (max_videos_for_stats // 50) + 1)
333
+
334
+ search_params = {
335
+ 'q': query,
336
+ 'part': 'id',
337
+ 'maxResults': 50,
338
+ 'type': 'video',
339
+ 'order': 'relevance'
340
+ }
341
+
342
  if published_after:
343
  parsed_date = dateparser.parse(published_after)
344
  if parsed_date:
345
  search_params['publishedAfter'] = parsed_date.replace(tzinfo=timezone.utc).isoformat()
346
  else:
347
  gr.Warning(f"Could not parse date: '{published_after}'. Ignoring filter.")
348
+
+
     for page in range(PAGES_TO_FETCH):
         try:
+            if next_page_token:
+                search_params['pageToken'] = next_page_token
+
             response = youtube.search().list(**search_params).execute()
+
             if page == 0:
                 total_results_estimate = response.get('pageInfo', {}).get('totalResults', 0)
+
+            # Extract valid video IDs
+            valid_ids = [
+                item['id']['videoId']
+                for item in response.get('items', [])
+                if 'id' in item and 'videoId' in item['id']
+            ]
+            all_video_ids.extend(valid_ids)
+
             next_page_token = response.get('nextPageToken')
+            progress(0.1 + (0.3 * (page / PAGES_TO_FETCH)),
+                     desc=f"Broad scan: Found {len(all_video_ids)} videos...")
+
+            if not next_page_token:
+                break
         except HttpError as e:
+            if "quotaExceeded" in str(e):
+                raise gr.Error("CRITICAL: YouTube API daily quota exceeded. Try again tomorrow.")
+            logger.error(f"HTTP error during video search: {e}")
+            break
+        except Exception as e:
+            logger.error(f"Unexpected error during YouTube search: {e}")
+            break
+
     if not all_video_ids:
+        return pd.DataFrame(), pd.DataFrame(), ""
+
+    # Fetch video details in batches
     progress(0.4, desc=f"Fetching details for {len(all_video_ids)} videos...")
+
+    def _fetch_video_details(youtube_service, video_ids: list):
+        """Fetch detailed information for a batch of video IDs."""
+        all_videos_data = []
+        try:
+            # The videos endpoint accepts at most 50 IDs per request
+            for i in range(0, len(video_ids), 50):
+                id_batch = video_ids[i:i+50]
+                video_request = youtube_service.videos().list(
+                    part="snippet,statistics",
+                    id=",".join(id_batch)
+                )
+                video_response = video_request.execute()
+
+                for item in video_response.get('items', []):
+                    stats = item.get('statistics', {})
+                    all_videos_data.append({
+                        'video_id': item['id'],
+                        'video_title': item['snippet']['title'],
+                        'channel': item['snippet']['channelTitle'],
+                        'published_date': item['snippet']['publishedAt'],
+                        'view_count': int(stats.get('viewCount', 0)),
+                        'like_count': int(stats.get('likeCount', 0)),
+                        'comment_count': int(stats.get('commentCount', 0))
+                    })
+        except Exception as e:
+            logger.error(f"Could not fetch video details: {e}")
+
+        return all_videos_data
+
     videos_df_full_scan = pd.DataFrame(_fetch_video_details(youtube, all_video_ids))
+
     if videos_df_full_scan.empty:
+        return pd.DataFrame(), pd.DataFrame(), ""
+
+    # Process and clean video data
     videos_df_full_scan['published_date'] = pd.to_datetime(videos_df_full_scan['published_date'])
+
+    # Calculate engagement rate safely
+    videos_df_full_scan['engagement_rate'] = (
+        (videos_df_full_scan['like_count'] + videos_df_full_scan['comment_count']) /
+        videos_df_full_scan['view_count'].replace(0, 1)
+    ).fillna(0)
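+    # Worked example (illustrative numbers): a video with 1,000 views, 80 likes
+    # and 20 comments gets engagement_rate = (80 + 20) / 1000 = 0.10; zero-view
+    # videos are divided by 1 instead of 0 so the rate stays finite.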
+
+    videos_df_full_scan = videos_df_full_scan.sort_values(
+        by='view_count',
+        ascending=False
+    ).reset_index(drop=True)
+
+    # Fetch comments for top videos
+    videos_to_scrape_df = videos_df_full_scan.head(int(num_videos_for_comments))
+    all_comments = []
+
+    def _scrape_single_video_comments(youtube_service, video_id, max_comments):
+        """Scrape top-level comments for a single video with error handling."""
+        comments_list = []
+        try:
+            request = youtube_service.commentThreads().list(
+                part="snippet",
+                videoId=video_id,
+                maxResults=min(max_comments, 100),
+                order='relevance',
+                textFormat="plainText"
+            )
+            response = request.execute()
+
+            for item in response.get('items', []):
+                snippet = item['snippet']['topLevelComment']['snippet']
+                comments_list.append({
+                    'author': snippet['authorDisplayName'],
+                    'published_date_comment': snippet['publishedAt'],
+                    'comment_text': snippet['textDisplay'],
+                    'likes': snippet['likeCount'],
+                    'replies': item['snippet']['totalReplyCount']
+                })
+        except Exception as e:
+            logger.warning(f"Could not retrieve comments for video {video_id}: {e}")
+
+        return comments_list
+
     for index, row in videos_to_scrape_df.iterrows():
+        progress(0.7 + (0.3 * (index / len(videos_to_scrape_df))),
+                 desc=f"Deep dive: Scraping comments from video {index+1}/{len(videos_to_scrape_df)}...")
+
+        comments_for_video = _scrape_single_video_comments(
+            youtube,
+            row['video_id'],
+            max_comments_per_video
+        )
+
         if comments_for_video:
             for comment in comments_for_video:
+                comment.update({
+                    'video_id': row['video_id'],
+                    'video_title': row['video_title']
+                })
             all_comments.extend(comments_for_video)
+
     comments_df = pd.DataFrame(all_comments)
     if not comments_df.empty:
         comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
+
+    logger.info(f"YouTube analysis complete. Est. total videos: {total_results_estimate}. "
+                f"Scanned: {len(videos_df_full_scan)}. Comments: {len(comments_df)}.")
+
+    # Create summary HTML
+    summary_html = f"""
+    <div style='background:#f5f5f5;padding:16px;border-radius:12px;margin-bottom:12px;box-shadow:0 2px 8px #eee;'>
+        <h3 style='margin:0 0 8px 0;'>YouTube Analytics Summary</h3>
+        <ul style='margin:0;padding-left:18px;'>
+            <li><b>Total Videos:</b> {len(videos_df_full_scan):,}</li>
+            <li><b>Total Comments:</b> {len(comments_df):,}</li>
+            <li><b>Total Views:</b> {videos_df_full_scan['view_count'].sum():,}</li>
+        </ul>
+    </div>
+    """
+
+    return videos_df_full_scan, comments_df, summary_html
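+# Minimal usage sketch (hypothetical values; assumes YOUTUBE_API_KEY is set and
+# that this is called from a Gradio event so `progress` can report status):
+#
+#   videos, comments, summary = run_youtube_analysis_pipeline(
+#       api_key=None, query="BNP Rally", max_videos_for_stats=100,
+#       num_videos_for_comments=5, max_comments_per_video=50,
+#       published_after="2024-01-01")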

 # ==============================================================================
 # ADVANCED ANALYTICS MODULE
 # ==============================================================================

 def generate_scraper_dashboard(df: pd.DataFrame):
+    """Generate comprehensive dashboard from news scraper results."""
+    if df.empty:
+        # Return empty dashboard components
+        return {
+            "kpi_total_articles": gr.HTML(""),
+            "kpi_unique_media": gr.HTML(""),
+            "kpi_date_range": gr.HTML(""),
+            "dashboard_timeline_plot": None,
+            "dashboard_media_plot": None,
+            "dashboard_wordcloud_plot": None
+        }
+
     set_plot_style()

+    # Calculate KPIs
     total_articles, unique_media = len(df), df['media'].nunique()
     start_date, end_date = pd.to_datetime(df['published_date']).min(), pd.to_datetime(df['published_date']).max()
     date_range_str = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
+
+    # Color-coded KPI badges
+    kpi_total_articles_html = kpi_badge_html(
+        total_articles, 'Total Articles', threshold_high=100, threshold_low=10
+    )
+    kpi_unique_media_html = kpi_badge_html(
+        unique_media, 'Unique Media', threshold_high=10, threshold_low=2
+    )
+    kpi_date_range_html = kpi_badge_html(
+        date_range_str, 'Date Range', threshold_high=None, threshold_low=None
+    )
+
+    # Time series visualization
     agg_code, agg_name = get_dynamic_time_agg(start_date, end_date)
     timeline_df = df.set_index(pd.to_datetime(df['published_date'])).resample(agg_code).size().reset_index(name='count')
+    timeline_df.rename(columns={'published_date': 'date'}, inplace=True)
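+    # get_dynamic_time_agg (defined earlier in this file) is assumed to return a
+    # pandas resample code plus a label, e.g. ('D', 'Daily') for short ranges or
+    # ('W', 'Weekly') for longer ones, so the timeline adapts to the date span.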
+    timeline_plot = gr.LinePlot(
+        value=timeline_df,
+        x='date',
+        y='count',
+        title=f'{agg_name} News Volume',
+        tooltip=['date', 'count'],
+        x_title="Date",
+        y_title="Number of Articles"
+    )

+    # Media source analysis
     media_counts = df['media'].dropna().value_counts().nlargest(15).sort_values()
     fig_media = None
     if not media_counts.empty:
+        fig_media, ax = plt.subplots(figsize=(8, 6))
+        media_counts.plot(kind='barh', ax=ax, color='skyblue')
+        ax.set_title("Top 15 Media Sources", fontproperties=BANGLA_FONT)
+        ax.set_xlabel("Article Count", fontproperties=BANGLA_FONT)
+        ax.set_ylabel("মিডিয়া", fontproperties=BANGLA_FONT)
+        yticks = np.arange(len(media_counts.index))
+        ax.set_yticks(yticks)
+        ax.set_yticklabels(media_counts.index, fontproperties=BANGLA_FONT, fontsize=12)
+        # Ensure all tick labels use the Bengali font
+        for label in ax.get_xticklabels():
+            label.set_fontproperties(BANGLA_FONT)
+        for label in ax.get_yticklabels():
+            label.set_fontproperties(BANGLA_FONT)
+        plt.tight_layout()
+
+    # Word cloud generation
     fig_wc = None
     try:
+        # Combine all titles and clean text
+        text = " ".join(df['title'].astype(str))
+        text = clean_bengali_text(text)
+
+        # Join special phrases
+        for phrase, joined in PHRASES_TO_JOIN.items():
+            text = text.replace(phrase, joined)
+
+        # Extract and filter words
         words = re.findall(r'[\u0980-\u09FF_]{2,}', text)
         words = [w for w in words if w not in COMBINED_STOPWORDS]
         words = [w for w in words if len(w) > 1]
         words = [w for w in words if not re.search(r'[a-zA-Z]', w)]
+
+        # Filter by frequency, dropping rare words and the top few dominant terms
         from collections import Counter
         word_freq = Counter(words)
         min_freq = 2
         most_common = set(w for w, _ in word_freq.most_common(3))
         filtered_words = [w for w in words if word_freq[w] >= min_freq and w not in most_common]
         wc_text = " ".join(filtered_words)
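+        # Example: if word_freq were {'ঢাকা': 9, 'সরকার': 7, 'দল': 5, 'সভা': 3, 'আজ': 1},
+        # min_freq drops 'আজ' and the top-3 dominant terms ('ঢাকা', 'সরকার', 'দল')
+        # are removed, leaving 'সভা' so the cloud is not swamped by query words.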
+
+        # Generate word cloud
+        if wc_text.strip():
+            wc = WordCloud(
+                font_path=FONT_PATH,
+                width=1600,
+                height=900,
+                background_color='white',
+                stopwords=COMBINED_STOPWORDS,
+                collocations=False,
+                colormap='plasma',
+                max_words=200,
+                contour_width=2,
+                contour_color='steelblue',
+                regexp=r"[\u0980-\u09FF_]+"
+            ).generate(wc_text)
+
+            fig_wc, ax = plt.subplots(figsize=(15, 8))
+            ax.imshow(wc, interpolation='bilinear')
+            ax.axis("off")
+            ax.set_title("Bengali Headline Word Cloud", fontproperties=BANGLA_FONT, fontsize=22)
+            plt.tight_layout()
     except Exception as e:
+        logger.error(f"WordCloud failed: {e}")
+        gr.Warning(f"WordCloud generation failed: {str(e)}")

     return {
+        "kpi_total_articles": gr.HTML(kpi_total_articles_html),
+        "kpi_unique_media": gr.HTML(kpi_unique_media_html),
+        "kpi_date_range": gr.HTML(kpi_date_range_html),
+        "dashboard_timeline_plot": timeline_plot,
+        "dashboard_media_plot": fig_media,
+        "dashboard_wordcloud_plot": fig_wc
     }
 def generate_youtube_dashboard(videos_df, comments_df):
+    """Generate comprehensive dashboard from YouTube analysis results."""
+    # Initialize all dashboard components first
+    dashboard_components = {
+        "kpi_yt_videos_found": gr.HTML(""),
+        "kpi_yt_views_scanned": gr.HTML(""),
+        "kpi_yt_comments_scraped": gr.HTML(""),
+        "yt_channel_plot": None,
+        "yt_channel_dominance_plot": None,
+        "yt_time_series_plot": None,
+        "yt_top_videos_plot": None,
+        "yt_content_quadrant_plot": None,
+        "yt_engagement_plot": None,
+        "yt_wordcloud_plot": None,
+        "yt_detailed_summary": gr.HTML("")
     }
+
+    # Channel dominance by views
+    fig_channel_dominance = None
+    if videos_df is not None and not videos_df.empty and 'channel' in videos_df.columns:
+        channel_views = videos_df.groupby('channel')['view_count'].sum().sort_values(ascending=False).head(10)
+        if not channel_views.empty:
+            fig_channel_dominance, ax = plt.subplots(figsize=(10, 6))
+            channel_views.plot(kind='barh', ax=ax, color='slateblue')
+            ax.set_title("Top 10 Dominant Channels by View Count", fontproperties=BANGLA_FONT)
+            ax.set_xlabel("মোট ভিউ", fontproperties=BANGLA_FONT)
+            ax.set_ylabel("চ্যানেল", fontproperties=BANGLA_FONT)
+            yticks = np.arange(len(channel_views.index))
+            ax.set_yticks(yticks)
+            ax.set_yticklabels(channel_views.index, fontproperties=BANGLA_FONT, fontsize=12)
+            plt.tight_layout()
+    dashboard_components["yt_channel_dominance_plot"] = fig_channel_dominance
+
+    # Content performance quadrant
+    fig_quadrant = None
+    if videos_df is not None and not videos_df.empty:
+        try:
+            # Define quadrant boundaries at the medians
+            median_views = videos_df['view_count'].median()
+            median_engagement = videos_df['engagement_rate'].median()
+            fig_quadrant, ax = plt.subplots(figsize=(10, 8))
+            ax.scatter(
+                videos_df['view_count'],
+                videos_df['engagement_rate'],
+                c='darkorange', alpha=0.7
+            )
+            ax.axvline(median_views, color='blue', linestyle='--', label='Median Views')
+            ax.axhline(median_engagement, color='green', linestyle='--', label='Median Engagement')
+            ax.legend()
+            ax.set_xlabel("মোট ভিউ", fontproperties=BANGLA_FONT)
+            ax.set_ylabel("এনগেজমেন্ট রেট", fontproperties=BANGLA_FONT)
+            ax.set_title("Content Performance Quadrant", fontproperties=BANGLA_FONT)
+            plt.tight_layout()
+        except Exception as e:
+            logger.error(f"Quadrant plot failed: {e}")
+    dashboard_components["yt_content_quadrant_plot"] = fig_quadrant
+
+    # Detailed analysis summary for the top video
+    detailed_summary = ""
+    if videos_df is not None and not videos_df.empty:
+        top_video = videos_df.iloc[0]
+        detailed_summary = (
+            "<div style='background:#e3f2fd;padding:12px;border-radius:8px;margin-bottom:8px;'>"
+            f"<b>Top Video:</b> {top_video['video_title']}<br>"
+            f"<b>Channel:</b> {top_video['channel']}<br>"
+            f"<b>Views:</b> {top_video['view_count']:,}<br>"
+            f"<b>Likes:</b> {top_video['like_count']:,}<br>"
+            f"<b>Comments:</b> {top_video['comment_count']:,}<br>"
+            f"<b>Published:</b> {top_video['published_date'].strftime('%Y-%m-%d')}<br>"
+            f"<b>Engagement Rate:</b> {top_video['engagement_rate']:.2f}"
+            "</div>"
+        )
+    dashboard_components["yt_detailed_summary"] = gr.HTML(detailed_summary)
+
+    # Generate KPIs if data exists
+    if videos_df is not None and not videos_df.empty:
+        dashboard_components["kpi_yt_videos_found"] = gr.HTML(
+            kpi_badge_html(len(videos_df), 'Videos Found', threshold_high=50, threshold_low=5)
+        )
+        dashboard_components["kpi_yt_views_scanned"] = gr.HTML(
+            kpi_badge_html(videos_df['view_count'].sum(), 'Views Scanned', threshold_high=100000, threshold_low=1000)
+        )

+    if comments_df is not None and not comments_df.empty:
+        dashboard_components["kpi_yt_comments_scraped"] = gr.HTML(
+            kpi_badge_html(len(comments_df), 'Comments Scraped', threshold_high=100, threshold_low=10)
+        )
+
+    # Channel analysis
+    fig_channels = None
     if videos_df is not None and not videos_df.empty and 'channel' in videos_df.columns:
         channel_counts = videos_df['channel'].value_counts().nlargest(15).sort_values()
         if not channel_counts.empty:
             fig_channels, ax = plt.subplots(figsize=(8, 6))
+            channel_counts.plot(kind='barh', ax=ax, color='coral')
+            ax.set_title("Top 15 Channels by Video Volume", fontproperties=BANGLA_FONT)
+            ax.set_yticks(np.arange(len(channel_counts.index)))
+            ax.set_yticklabels(channel_counts.index, fontproperties=BANGLA_FONT)
+            ax.set_xlabel("Video Count", fontproperties=BANGLA_FONT)
+            plt.tight_layout()
+    dashboard_components["yt_channel_plot"] = fig_channels
+
+    # Word cloud from comments
+    fig_wc = None
+    if comments_df is not None and not comments_df.empty and 'comment_text' in comments_df.columns:
+        try:
  text = " ".join(comment for comment in comments_df['comment_text'].astype(str))
734
  text = clean_bengali_text(text)
735
+
736
+ # Join special phrases
737
  for phrase, joined in PHRASES_TO_JOIN.items():
738
  text = text.replace(phrase, joined)
739
+
740
+ # Extract and filter words
741
+ words = re.findall(r'[\u0980-\u09FF_]{2,}', text)
742
+ words = [w for w in words if w not in COMBINED_STOPWORDS]
743
+ words = [w for w in words if len(w) > 1]
744
+ words = [w for w in words if not re.search(r'[a-zA-Z]', w)]
745
+
746
+ # Filter by frequency
747
+ from collections import Counter
748
+ word_freq = Counter(words)
749
+ min_freq = 2
750
+ most_common = set([w for w, _ in word_freq.most_common(3)])
751
+ filtered_words = [w for w in words if word_freq[w] >= min_freq and w not in most_common]
752
+ wc_text = " ".join(filtered_words)
753
+
754
+ # Generate word cloud
755
+ if wc_text.strip():
                 wc = WordCloud(
                     font_path=FONT_PATH,
                     width=1600,
                     # … (remaining parameters unchanged; collapsed in this diff view)
                     contour_color='darkorange',
                     regexp=r"[\u0980-\u09FF_]+"
                 ).generate(wc_text)
+
                 fig_wc, ax = plt.subplots(figsize=(15, 8))
                 ax.imshow(wc, interpolation='bilinear')
                 ax.axis("off")
                 ax.set_title("Bengali Word Cloud from YouTube Comments", fontproperties=BANGLA_FONT, fontsize=22)
                 plt.tight_layout()
+        except Exception as e:
+            logger.error(f"YouTube WordCloud failed: {e}")
+    dashboard_components["yt_wordcloud_plot"] = fig_wc
+
+    # Top commented videos
+    fig_top_videos = None
+    if comments_df is not None and not comments_df.empty and 'video_title' in comments_df.columns:
+        top_videos = comments_df['video_title'].value_counts().nlargest(10)
+        if not top_videos.empty:
+            fig_top_videos, ax = plt.subplots(figsize=(10, 6))
+            top_videos.plot(kind='barh', ax=ax, color='dodgerblue')
+            ax.set_title("Top 10 Videos by Comment Count", fontproperties=BANGLA_FONT)
+            ax.set_xlabel("মন্তব্য সংখ্যা", fontproperties=BANGLA_FONT)
+            ax.set_ylabel("ভিডিও শিরোনাম", fontproperties=BANGLA_FONT)
+            yticks = np.arange(len(top_videos.index))
+            ax.set_yticks(yticks)
+            ax.set_yticklabels(top_videos.index, fontproperties=BANGLA_FONT, fontsize=12)
+            plt.tight_layout()
+    dashboard_components["yt_top_videos_plot"] = fig_top_videos
+
+    # Engagement rate per video (scraped comments relative to views)
+    fig_engagement = None
+    if videos_df is not None and not videos_df.empty and comments_df is not None and not comments_df.empty:
+        if 'video_id' in videos_df.columns and 'video_id' in comments_df.columns:
+            try:
+                # Count scraped comments per video
+                comment_counts = comments_df['video_id'].value_counts().reset_index()
+                # videos_df already carries an API-reported 'comment_count', so the
+                # scraped count gets its own column to avoid a merge collision
+                comment_counts.columns = ['video_id', 'scraped_comment_count']
+                merged = videos_df.merge(comment_counts, on='video_id', how='left')
+                merged['scraped_comment_count'] = merged['scraped_comment_count'].fillna(0)
+                # Calculate engagement rate
+                merged['engagement_rate'] = merged['scraped_comment_count'] / merged['view_count'].replace(0, 1)
+                # Get the top 10 videos by engagement
+                top_engagement = merged.nlargest(10, 'engagement_rate')
+                if not top_engagement.empty:
+                    fig_engagement, ax = plt.subplots(figsize=(10, 6))
+                    ax.barh(top_engagement['video_title'], top_engagement['engagement_rate'], color='mediumseagreen')
+                    ax.set_title("Top 10 Videos by Engagement Rate", fontproperties=BANGLA_FONT)
+                    ax.set_xlabel("এনগেজমেন্ট রেট (মন্তব্য/ভিউ)", fontproperties=BANGLA_FONT)
+                    ax.set_ylabel("ভিডিও শিরোনাম", fontproperties=BANGLA_FONT)
+                    yticks = np.arange(len(top_engagement['video_title']))
+                    ax.set_yticks(yticks)
+                    ax.set_yticklabels(top_engagement['video_title'], fontproperties=BANGLA_FONT, fontsize=12)
+                    plt.tight_layout()
+            except Exception as e:
+                logger.error(f"Engagement rate calculation failed: {e}")
+    dashboard_components["yt_engagement_plot"] = fig_engagement
+    # Comment activity over time
+    fig_time_series = None
+    if comments_df is not None and not comments_df.empty and 'published_date_comment' in comments_df.columns:
+        try:
+            comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
+            time_series = comments_df.set_index('published_date_comment').resample('D').size().reset_index()
+            time_series.columns = ['date', 'count']
+
+            if not time_series.empty:
+                fig_time_series = gr.LinePlot(
+                    value=time_series,
+                    x='date',
+                    y='count',
+                    title="Comment Activity Over Time",
+                    tooltip=['date', 'count'],
+                    x_title="Date",
+                    y_title="Number of Comments"
+                )
+        except Exception as e:
+            logger.error(f"Error in comment activity plot: {e}")
+    dashboard_components["yt_time_series_plot"] = fig_time_series
+
+    return dashboard_components

 # ==============================================================================
 # GRADIO UI DEFINITION
 # ==============================================================================

 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), title=APP_TITLE) as app:
     gr.Markdown(f"# {APP_TITLE}\n*{APP_TAGLINE}*")
+
     # --- STATE MANAGEMENT ---
     scraper_results_state = gr.State()
     youtube_results_state = gr.State()
+
+    with gr.Tabs():
         with gr.TabItem("1. News Scraper", id=0):
             with gr.Row():
                 with gr.Column(scale=1):
+                    gr.Markdown("### Search Criteria")
+                    search_keywords_textbox = gr.Textbox(
+                        label="Search Keywords",
+                        placeholder="e.g., বিএনপি সমাবেশ",
+                        info="Keywords to search for in news articles."
+                    )
+                    sites_to_search_textbox = gr.Textbox(
+                        label="Target Sites (Optional, comma-separated)",
+                        placeholder="e.g., prothomalo.com",
+                        info="Limit search to specific news sites."
+                    )
+                    start_date_textbox = gr.Textbox(
+                        label="Start Date",
+                        placeholder="YYYY-MM-DD or 'last week'",
+                        info="Start date for news scraping."
+                    )
+                    end_date_textbox = gr.Textbox(
+                        label="End Date",
+                        placeholder="YYYY-MM-DD or 'today'",
+                        info="End date for news scraping."
+                    )
+
+                    gr.Markdown("### Scraping Parameters")
+                    interval_days_slider = gr.Slider(
+                        1, 7, 3, step=1,
+                        label="Days per Interval",
+                        info="How many days each scraping interval covers."
+                    )
+                    max_pages_slider = gr.Slider(
+                        1, 10, 5, step=1,
+                        label="Max Pages per Interval",
+                        info="Maximum number of pages to fetch per interval."
+                    )
+                    filter_keywords_textbox = gr.Textbox(
+                        label="Filter Keywords (comma-separated, optional)",
+                        placeholder="e.g., নির্বাচন, সরকার",
+                        info="Filter results by these keywords."
+                    )
+
                     start_scraper_button = gr.Button("Start Scraping & Analysis", variant="primary")
+                    # Progress is reported via the pipelines' progress=gr.Progress()
+                    # parameter; gr.Progress is not a renderable component.
+
                 with gr.Column(scale=2):
+                    scraper_results_df = gr.DataFrame(
+                        label="Filtered Results",
+                        interactive=True
+                    )
+                    scraper_download_file = gr.File(
+                        label="Download Filtered Results CSV"
+                    )
+
         with gr.TabItem("2. News Analytics", id=1):
+            gr.Markdown("### News Analytics Dashboard")
+
+            with gr.Group():
+                news_summary_card = gr.HTML(
+                    "<div style='background:#f5f5f5;padding:16px;border-radius:12px;margin-bottom:12px;box-shadow:0 2px 8px #eee;'>"
+                    "<h3 style='margin:0 0 8px 0;'>Key Findings</h3>"
+                    "<ul style='margin:0;padding-left:18px;'>"
+                    "<li><b>Total Articles:</b> <span id='news_total_articles'></span></li>"
+                    "<li><b>Unique Media:</b> <span id='news_unique_media'></span></li>"
+                    "<li><b>Date Range:</b> <span id='news_date_range'></span></li>"
+                    "</ul></div>"
+                )
+
+                kpi_total_articles = gr.HTML()
+                kpi_unique_media = gr.HTML()
+                kpi_date_range = gr.HTML()
+
+            with gr.Row():
+                with gr.Column():
+                    dashboard_timeline_plot = gr.LinePlot(
+                        label="News Volume Timeline"
+                    )
+                with gr.Column():
+                    dashboard_media_plot = gr.Plot(
+                        label="Top Media Sources by Article Count"
+                    )
+
+            dashboard_wordcloud_plot = gr.Plot(
+                label="Headline Word Cloud"
+            )
+
         with gr.TabItem("3. YouTube Topic Analysis", id=2):
+            gr.Markdown("## YouTube Topic Analysis")
+
             with gr.Row():
                 with gr.Column(scale=1):
+                    yt_search_keywords = gr.Textbox(
+                        label="YouTube Search Keywords",
+                        placeholder="e.g., BNP Rally",
+                        info="Keywords to search for in YouTube videos."
+                    )
+                    yt_max_videos_slider = gr.Slider(
+                        10, 100, 30, step=5,
+                        label="Max Videos for Stats",
+                        info="Maximum number of videos to scan for statistics."
+                    )
+                    yt_num_videos_comments_slider = gr.Slider(
+                        1, 20, 5, step=1,
+                        label="Videos for Comments",
+                        info="Number of top videos to scrape comments from."
+                    )
+                    yt_max_comments_slider = gr.Slider(
+                        10, 200, 50, step=10,
+                        label="Max Comments per Video",
+                        info="Maximum number of comments to fetch per video."
+                    )
+                    yt_published_after = gr.Textbox(
+                        label="Published After (Optional)",
+                        placeholder="YYYY-MM-DD",
+                        info="Only include videos published after this date."
+                    )
+
+                    start_youtube_analysis_button = gr.Button(
+                        "Start YouTube Analysis",
+                        variant="primary"
+                    )
+
                 with gr.Column(scale=2):
+                    yt_results_df = gr.DataFrame(
+                        label="YouTube Video Results",
+                        interactive=True
+                    )
+                    yt_videos_download_file = gr.File(
+                        label="Download YouTube Video Results CSV"
+                    )
+                    yt_comments_df = gr.DataFrame(
+                        label="YouTube Comments Results",
+                        interactive=True
+                    )
+                    yt_comments_download_file = gr.File(
+                        label="Download YouTube Comments CSV"
+                    )
+                    yt_dashboard_html = gr.HTML()
+                    with gr.Group():
+                        kpi_yt_videos_found = gr.HTML()
+                        kpi_yt_views_scanned = gr.HTML()
+                        kpi_yt_comments_scraped = gr.HTML()
+                    with gr.Row():
+                        with gr.Column():
+                            yt_channel_plot = gr.Plot(
+                                label="Top Channels by Video Volume"
+                            )
+                            yt_channel_dominance_plot = gr.Plot(
+                                label="Channel Dominance by View Count"
+                            )
+                        with gr.Column():
+                            yt_time_series_plot = gr.LinePlot(
+                                label="Comment Activity Over Time"
+                            )
+                    with gr.Row():
+                        with gr.Column():
+                            yt_top_videos_plot = gr.Plot(
+                                label="Top Videos by Comment Count"
+                            )
+                            yt_content_quadrant_plot = gr.Plot(
+                                label="Content Performance Quadrant"
+                            )
+                        with gr.Column():
+                            yt_engagement_plot = gr.Plot(
+                                label="Top Videos by Engagement Rate"
+                            )
+                            yt_wordcloud_plot = gr.Plot(
+                                label="Bengali Word Cloud from Comments"
+                            )
+                    yt_detailed_summary = gr.HTML()
+    # --- EVENT HANDLERS ---
+    def scraper_button_handler(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys):
+        """Handle the news scraper button click event."""
+        try:
+            df, filtered_df = run_news_scraper_pipeline(
+                search_keywords, sites, start_date, end_date,
+                interval, max_pages, filter_keys
+            )
+
+            # Note: a gr.State is only updated by returning a value mapped to it
+            # in `outputs`; assigning to a local variable has no effect.
+            # Generate dashboard visualizations from the full result set
+            dashboard = generate_scraper_dashboard(df)
+
+            # Prepare download file for news results
+            if not df.empty:
+                csv_path = "news_results.csv"
+                df.to_csv(csv_path, index=False)
+                scraper_download_file = gr.File(value=csv_path, visible=True)
+            else:
+                scraper_download_file = gr.File(visible=False)
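+            # Note (assumption): the fixed filename is overwritten on each run;
+            # with concurrent users a per-request tempfile would be safer.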
+
+            return (
+                filtered_df,
+                scraper_download_file,
+                dashboard["kpi_total_articles"],
+                dashboard["kpi_unique_media"],
+                dashboard["kpi_date_range"],
+                dashboard["dashboard_timeline_plot"],
+                dashboard["dashboard_media_plot"],
+                dashboard["dashboard_wordcloud_plot"]
+            )
+        except Exception as e:
+            logger.error(f"Error in scraper button handler: {str(e)}")
+            # gr.Warning surfaces the message without aborting, so the reset
+            # values below still reach the UI (a raised gr.Error would skip them)
+            gr.Warning(f"An error occurred during scraping: {str(e)}")
+            # Return empty values to reset the UI
+            return (
+                pd.DataFrame(),
+                gr.File(visible=False),
+                gr.HTML(""), gr.HTML(""), gr.HTML(""),
+                None, None, None
+            )

     start_scraper_button.click(
+        fn=scraper_button_handler,
+        inputs=[
+            search_keywords_textbox,
+            sites_to_search_textbox,
+            start_date_textbox,
+            end_date_textbox,
+            interval_days_slider,
+            max_pages_slider,
+            filter_keywords_textbox
+        ],
+        outputs=[
+            scraper_results_df,
+            scraper_download_file,
+            kpi_total_articles,
+            kpi_unique_media,
+            kpi_date_range,
+            dashboard_timeline_plot,
+            dashboard_media_plot,
+            dashboard_wordcloud_plot
         ]
     )
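+    # Gradio maps the handler's returned tuple onto `outputs` positionally, so
+    # the order above must mirror scraper_button_handler's return order exactly.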
+    def youtube_button_handler(keywords, max_videos, num_comments_videos, max_comments, published_after):
+        """Handle the YouTube analysis button click event."""
+        try:
+            videos_df, comments_df, summary_html = run_youtube_analysis_pipeline(
+                api_key=None,
+                query=keywords,
+                max_videos_for_stats=max_videos,
+                num_videos_for_comments=num_comments_videos,
+                max_comments_per_video=max_comments,
+                published_after=published_after
+            )
+
+            # Prepare download files for YouTube results
+            yt_videos_csv = "youtube_videos.csv"
+            yt_comments_csv = "youtube_comments.csv"
+            if not videos_df.empty:
+                videos_df.to_csv(yt_videos_csv, index=False)
+                yt_videos_download_file = gr.File(value=yt_videos_csv, visible=True)
+            else:
+                yt_videos_download_file = gr.File(visible=False)
+
+            # For comments, add video title and channel if not present
+            if not comments_df.empty:
+                if "video_title" not in comments_df.columns and "video_id" in comments_df.columns:
+                    # Map video titles from videos_df
+                    title_map = videos_df.set_index("video_id")["video_title"].to_dict()
+                    comments_df["video_title"] = comments_df["video_id"].map(title_map)
+                if "channel" not in comments_df.columns and "channel_title" in comments_df.columns:
+                    comments_df["channel"] = comments_df["channel_title"]
+                comments_df.to_csv(yt_comments_csv, index=False)
+                yt_comments_download_file = gr.File(value=yt_comments_csv, visible=True)
+            else:
+                yt_comments_download_file = gr.File(visible=False)
+
+            # Generate dashboard visualizations
+            dashboard = generate_youtube_dashboard(videos_df, comments_df)
+            return (
+                videos_df,
+                yt_videos_download_file,
+                comments_df,
+                yt_comments_download_file,
+                summary_html,
+                dashboard["kpi_yt_videos_found"],
+                dashboard["kpi_yt_views_scanned"],
+                dashboard["kpi_yt_comments_scraped"],
+                dashboard["yt_channel_plot"],
+                dashboard["yt_channel_dominance_plot"],
+                dashboard["yt_time_series_plot"],
+                dashboard["yt_top_videos_plot"],
+                dashboard["yt_content_quadrant_plot"],
+                dashboard["yt_engagement_plot"],
+                dashboard["yt_wordcloud_plot"],
+                dashboard["yt_detailed_summary"]
+            )
+        except Exception as e:
+            logger.error(f"Error in YouTube button handler: {str(e)}")
+            gr.Warning(f"An error occurred during YouTube analysis: {str(e)}")
+            # Return empty values to reset the UI (16 outputs)
+            return (
+                pd.DataFrame(),          # yt_results_df
+                gr.File(visible=False),  # yt_videos_download_file
+                pd.DataFrame(),          # yt_comments_df
+                gr.File(visible=False),  # yt_comments_download_file
+                gr.HTML(""),             # yt_dashboard_html
+                gr.HTML(""),             # kpi_yt_videos_found
+                gr.HTML(""),             # kpi_yt_views_scanned
+                gr.HTML(""),             # kpi_yt_comments_scraped
+                None,                    # yt_channel_plot
+                None,                    # yt_channel_dominance_plot
+                None,                    # yt_time_series_plot
+                None,                    # yt_top_videos_plot
+                None,                    # yt_content_quadrant_plot
+                None,                    # yt_engagement_plot
+                None,                    # yt_wordcloud_plot
+                gr.HTML("")              # yt_detailed_summary
+            )

+    start_youtube_analysis_button.click(
+        fn=youtube_button_handler,
+        inputs=[
+            yt_search_keywords,
+            yt_max_videos_slider,
+            yt_num_videos_comments_slider,
+            yt_max_comments_slider,
+            yt_published_after
+        ],
+        outputs=[
+            yt_results_df,
+            yt_videos_download_file,
+            yt_comments_df,
+            yt_comments_download_file,
+            yt_dashboard_html,
+            kpi_yt_videos_found,
+            kpi_yt_views_scanned,
+            kpi_yt_comments_scraped,
+            yt_channel_plot,
+            yt_channel_dominance_plot,
+            yt_time_series_plot,
+            yt_top_videos_plot,
+            yt_content_quadrant_plot,
+            yt_engagement_plot,
+            yt_wordcloud_plot,
+            yt_detailed_summary
+        ]
+    )
+
 # ==============================================================================
 # LAUNCH THE APP
+# ==============================================================================
 if __name__ == "__main__":
+    app.launch(debug=True, share=True)
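+    # On Hugging Face Spaces the app is served automatically; share=True mainly
+    # matters for local runs, where it opens a temporary public gradio.live link.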