Arjon07CSE committed on
Commit
8d11382
·
verified ·
1 Parent(s): 968fea9

uploaded the complete application and requirements file

Browse files
Files changed (2) hide show
  1. app.py +618 -0
  2. requirements.txt +25 -0
app.py ADDED
@@ -0,0 +1,618 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # SOCIAL PERCEPTION ANALYZER - FINAL COMPLETE APPLICATION
3
+ # Version: 3.0 (Architecturally Refactored, Production Ready)
4
+ # ==============================================================================
5
+
6
+ # --- IMPORTS ---
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import numpy as np
10
+ import torch
11
+ import re
12
+ import sqlite3
13
+ import json
14
+ import logging
15
+ import requests
16
+ import os
17
+ import time
18
+ import random
19
+ import functools
20
+ from io import StringIO
21
+ from datetime import datetime, timezone
22
+ from logging.handlers import RotatingFileHandler
23
+
24
+ # --- APIs and Web Scraping ---
25
+ from googleapiclient.discovery import build
26
+ from googleapiclient.errors import HttpError
27
+ from GoogleNews import GoogleNews
28
+ from urllib.error import HTTPError
29
+ import dateparser
30
+
31
+ # --- NLP & Machine Learning ---
32
+ from transformers import pipeline, BitsAndBytesConfig
33
+ from sentence_transformers import SentenceTransformer
34
+ from huggingface_hub.utils import HfHubHTTPError
35
+
36
+ # --- Visualization ---
37
+ import matplotlib.pyplot as plt
38
+ from matplotlib.font_manager import FontProperties
39
+ import seaborn as sns
40
+ from wordcloud import WordCloud
41
+
42
+ # ==============================================================================
43
+ # SETUP PRODUCTION-GRADE LOGGING & CONFIGURATION
44
+ # ==============================================================================
45
+
46
# Root-logger setup: a size-rotated file log (5 MiB per file, 2 backups).
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# Explicit UTF-8: log messages may embed Bangla text (queries, titles, error
# strings); without it the file encoding is locale-dependent.
log_handler = RotatingFileHandler(
    'app.log',
    maxBytes=5 * 1024 * 1024,
    backupCount=2,
    encoding='utf-8',
)
log_handler.setFormatter(log_formatter)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Attach the file handler only when the root logger has no handler yet, so a
# re-executed module does not stack duplicate handlers.
if not logger.handlers:
    logger.addHandler(log_handler)
logger.info("Application starting up.")
54
+
55
+ # --- APPLICATION CONFIGURATION ---
56
APP_TITLE = "Social Perception Analyzer"
APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
APP_FOOTER = "Developed by CDSR"

# --- FONT CONFIGURATION ---
FONT_PATH = 'NotoSansBengali-Regular.ttf'
# FontProperties(fname=...) only records the path; it does not open the file,
# so a missing font would go unnoticed until a plot is rendered. Check the
# file explicitly so the fallback below is actually reachable.
try:
    if not os.path.isfile(FONT_PATH):
        raise OSError(f"Font file not found: {FONT_PATH}")
    BANGLA_FONT = FontProperties(fname=FONT_PATH)
    logger.info("Successfully loaded 'NotoSansBengali-Regular.ttf' font.")
except OSError:
    logger.error("Failed to load 'NotoSansBengali-Regular.ttf'. Ensure the file is in the root directory.")
    gr.Warning("Bangla font not found! Visualizations may not render text correctly.")
    # Fall back to matplotlib's default font properties.
    BANGLA_FONT = FontProperties()
69
+
70
+ # ==============================================================================
71
+ # CORE HELPER FUNCTIONS
72
+ # ==============================================================================
73
+
74
# Bangla stop words passed to WordCloud so that function words do not
# dominate the clouds. Repairs relative to the previous list: the
# mis-encoded entry is restored to 'হচ্ছে', the Devanagari entry is replaced
# by its Bangla spelling 'দ্বারা', and the stray leading space in 'থাকলেও' is
# removed (an entry with surrounding whitespace can never match a token).
BANGLA_STOP_WORDS = [
    'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
    'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
    'এতটাই', 'এতদূর', 'এতটুকু', 'এক', 'এবং', 'এবার', 'এমন', 'এমনভাবে', 'এর', 'এরা', 'এঁরা', 'এঁদের',
    'এই', 'এইভাবে', 'ও', 'ওঁরা', 'ওঁর', 'ওঁদের', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'কাছ', 'কাছে', 'কাজ',
    'কারণ', 'কিছু', 'কিছুই', 'কিন্তু', 'কিভাবে', 'কেন', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'খুব',
    'গুলি', 'গিয়ে', 'চায়', 'ছাড়া', 'জন্য', 'জানা', 'ঠিক', 'তিনি', 'তিন', 'তিনিও', 'তাকে', 'তাঁকে',
    'তার', 'তাঁর', 'তারা', 'তাঁরা', 'তাদের', 'তাঁদের', 'তাহলে', 'থাকলেও', 'থেকে', 'মধ্যেই', 'মধ্যে',
    'দ্বারা', 'নয়', 'না', 'নিজের', 'নিজে', 'নিয়ে', 'পারেন', 'পারা', 'পারে', 'পরে', 'পর্যন্ত', 'পুনরায়',
    'ফলে', 'বজায়', 'বা', 'বাদে', 'বার', 'বিশেষ', 'বিভিন্ন', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মাধ্যমে',
    'মতো', 'মতোই', 'যখন', 'যদি', 'যদিও', 'যা', 'যাকে', 'যাওয়া', 'যায়', 'যে', 'যেখানে', 'যেতে', 'যেমন',
    'যেহেতু', 'রহিছে', 'শিক্ষা', 'শুধু', 'সঙ্গে', 'সব', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সাধারণ', 'সামনে', 'হতে',
    'হতেই', 'হবে', 'হয়', 'হয়তো', 'হয়', 'হচ্ছে', 'হত', 'হলে', 'হলেও', 'হয়নি', 'হাজার', 'হোওয়া', 'আরও', 'আমরা',
    'আমার', 'আমি', 'আর', 'আগে', 'আগেই', 'আছে', 'আজ', 'তাকে', 'তাতে', 'তাদের', 'তাহার', 'তাহাতে', 'তাহারই',
    'তথা', 'তথাপি', 'সে', 'সেই', 'সেখান', 'সেখানে', 'থেকে', 'নাকি', 'নাগাদ', 'দু', 'দুটি', 'সুতরাং',
    'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক',
]
91
+
92
def get_dynamic_time_agg(start_date, end_date):
    """Pick a resampling frequency code and display label for a date span.

    Returns a ``(code, label)`` tuple. When either argument is not a
    ``pd.Timestamp`` the function falls back to ``('D', 'Daily')``.
    Otherwise the whole-day span between the two timestamps selects hourly
    (up to 2 days), daily (up to 90), weekly (up to 730) or monthly buckets.
    """
    both_are_timestamps = (
        isinstance(start_date, pd.Timestamp) and isinstance(end_date, pd.Timestamp)
    )
    if not both_are_timestamps:
        return 'D', 'Daily'  # Graceful fallback

    span_days = (end_date - start_date).days
    # Ordered thresholds: the first upper bound that covers the span wins.
    thresholds = (
        (2, 'H', 'Hourly'),
        (90, 'D', 'Daily'),
        (730, 'W', 'Weekly'),
    )
    for upper_bound, code, label in thresholds:
        if span_days <= upper_bound:
            return code, label
    return 'M', 'Monthly'
101
+
102
+ # ==============================================================================
103
+ # ML MODEL MANAGEMENT
104
+ # ==============================================================================
105
+
106
+
107
+ SENTIMENT_MODEL_ID = 'ahs95/banglabert-sentiment-analysis'
108
+ MODELS = {"sentiment_pipeline": None}
109
+
110
def _load_pipeline_with_retry(task, model_id, retries=3):
    """Build a transformers pipeline, retrying on network failures.

    Args:
        task: Pipeline task name passed to ``pipeline``.
        model_id: Model identifier passed as ``model=``.
        retries: Maximum number of load attempts.

    Returns:
        The loaded pipeline, or ``None`` when ``retries`` is not positive
        (no attempt is made).

    Raises:
        gr.Error: When every attempt fails with a network error, or on the
            first non-network error. The original exception is chained.
    """
    logger.info(f"Initializing {task} pipeline for model: {model_id}")

    # The device does not change between attempts: probe it once and warn
    # once instead of repeating the warning on every retry.
    device = 0 if torch.cuda.is_available() else -1
    if device == -1:
        gr.Warning(f"{model_id} will run on CPU and may be very slow.")

    for attempt in range(retries):
        try:
            pipe = pipeline(task, model=model_id, device=device)
        except (HfHubHTTPError, requests.exceptions.ConnectionError) as e:
            logger.warning(f"Network error on loading {model_id} (Attempt {attempt + 1}/{retries}): {e}")
            if attempt >= retries - 1:
                raise gr.Error(
                    f"Failed to download model '{model_id}' after {retries} attempts. Check network."
                ) from e
            time.sleep(5)
        except Exception as e:
            # Non-network failures are not retried.
            logger.error(f"An unexpected error occurred while loading {model_id}: {e}")
            raise gr.Error(f"Could not initialize model '{model_id}'. Error: {e}") from e
        else:
            logger.info(f"Pipeline '{task}' loaded successfully.")
            return pipe
    return None
127
+
128
def get_sentiment_pipeline():
    """Return the shared sentiment pipeline, loading it on first use.

    The pipeline is stored in ``MODELS["sentiment_pipeline"]``; while that
    slot is ``None`` a load via ``_load_pipeline_with_retry`` is attempted.
    """
    cached = MODELS["sentiment_pipeline"]
    if cached is None:
        cached = _load_pipeline_with_retry("sentiment-analysis", SENTIMENT_MODEL_ID)
        MODELS["sentiment_pipeline"] = cached
    return cached
132
+
133
+ # ==============================================================================
134
+ # NEWS SCRAPER BACKEND
135
+ # ==============================================================================
136
+
137
def run_news_scraper_pipeline(search_keywords, sites, start_date_str, end_date_str, interval, max_pages, filter_keys, progress=gr.Progress()):
    """Query Google News in date windows and return the collected articles.

    Args:
        search_keywords: Phrase to search for (quoted in the query).
        sites: Optional comma-separated site list, turned into ``site:`` terms.
        start_date_str: Start of the range, any format ``dateparser`` accepts.
        end_date_str: End of the range, any format ``dateparser`` accepts.
        interval: Number of days covered by each query window.
        max_pages: Maximum result pages requested per window.
        filter_keys: Optional comma-separated keywords; when given, only rows
            whose title or description contains one of them are kept.
        progress: Gradio progress callback.

    Returns:
        ``(df, display_df)``: the filtered article frame, and a view with the
        display columns sorted by ``published_date`` descending. Two empty
        frames when nothing was collected.

    Raises:
        gr.Error: On missing required inputs, unparseable dates, or a start
            date later than the end date.
    """
    # Input validation and sanitization
    search_keywords = (search_keywords or "").strip()
    if not all([search_keywords, start_date_str, end_date_str]):
        raise gr.Error("Search Keywords, Start Date, and End Date are required.")

    start_dt = dateparser.parse(start_date_str)
    end_dt = dateparser.parse(end_date_str)
    if not all([start_dt, end_dt]):
        raise gr.Error("Invalid date format. Please use a recognizable format like YYYY-MM-DD or '2 weeks ago'.")
    if start_dt > end_dt:
        raise gr.Error("Start Date must not be later than End Date.")

    # Slider values may arrive as floats; an interval below 1 would never
    # advance the window and loop forever.
    interval = max(1, int(interval))
    max_pages = max(1, int(max_pages))

    # The site filter is the same for every window, so build it once. Blank
    # entries are dropped so that input like " , " does not produce "()".
    site_terms = ['site:' + s.strip() for s in (sites or "").split(',') if s.strip()]
    site_query = f"({' OR '.join(site_terms)})" if site_terms else ""

    total_days = max((end_dt - start_dt).days + 1, 1)
    all_articles, current_dt = [], start_dt
    while current_dt <= end_dt:
        interval_end_dt = min(current_dt + pd.Timedelta(days=interval - 1), end_dt)
        start_str, end_str = current_dt.strftime('%Y-%m-%d'), interval_end_dt.strftime('%Y-%m-%d')
        fraction_done = min((current_dt - start_dt).days / total_days, 1.0)
        progress(fraction_done, desc=f"Fetching news from {start_str} to {end_str}")

        final_query = f'"{search_keywords}" {site_query} after:{start_str} before:{end_str}'

        googlenews = GoogleNews(lang='bn', region='BD')
        googlenews.search(final_query)

        for page in range(1, max_pages + 1):
            try:
                results = googlenews.results()
                if not results:
                    break
                all_articles.extend(results)
                if page < max_pages:
                    googlenews.getpage(page + 1)
                    # Randomized pause between page requests.
                    time.sleep(random.uniform(2, 5))
            except HTTPError as e:
                if e.code == 429:
                    wait_time = random.uniform(15, 30)
                    gr.Warning(f"Rate limited by Google News. Pausing for {wait_time:.0f} seconds.")
                    time.sleep(wait_time)
                else:
                    logger.error(f"HTTP Error fetching news: {e}")
                    break
            except Exception as e:
                logger.error(f"An error occurred fetching news: {e}")
                break

        current_dt += pd.Timedelta(days=interval)

    if not all_articles:
        return pd.DataFrame(), pd.DataFrame()

    def _parse_published(raw_value):
        # dateparser only accepts strings; treat anything else as missing.
        if not isinstance(raw_value, str) or not raw_value.strip():
            return None
        return dateparser.parse(raw_value, languages=['bn'])

    # Pages may repeat articles; keep one row per link.
    df = pd.DataFrame(all_articles).drop_duplicates(subset=['link'])
    df['published_date'] = df['date'].apply(_parse_published)
    df = df.dropna(subset=['published_date', 'title'])

    # Blank keywords (e.g. from a trailing comma) are skipped: an empty
    # string is a substring of every title and would disable the filter.
    keywords = [k.strip().lower() for k in (filter_keys or "").split(',') if k.strip()]
    if keywords:
        titles = df['title'].astype(str).str.lower()
        descriptions = df['desc'].astype(str).str.lower()
        matches = [
            any(key in title or key in desc for key in keywords)
            for title, desc in zip(titles, descriptions)
        ]
        df = df[pd.Series(matches, index=df.index, dtype=bool)]

    display_df = df[['published_date', 'title', 'media', 'desc', 'link']].sort_values(by='published_date', ascending=False)
    return df, display_df
193
+
194
+ # ==============================================================================
195
+ # YOUTUBE ANALYZER BACKEND
196
+ # ==============================================================================
197
+ # (This section remains unchanged from the previous robust version)
198
def _fetch_video_details(youtube_service, video_ids: list):
    """Fetch snippet and statistics for the given video IDs in batches of 50.

    Args:
        youtube_service: Client object exposing ``videos().list(...).execute()``.
        video_ids: Video ID strings.

    Returns:
        A list of dicts with ``video_id``, ``video_title``, ``channel``,
        ``published_date``, ``view_count``, ``like_count`` and
        ``comment_count``. Batches that fail with ``HttpError`` are skipped.
    """
    all_videos_data = []
    had_failure = False
    for i in range(0, len(video_ids), 50):
        id_batch = video_ids[i:i + 50]
        # Handle errors per batch so one failed request does not discard the
        # batches that come after it.
        try:
            video_request = youtube_service.videos().list(part="snippet,statistics", id=",".join(id_batch))
            video_response = video_request.execute()
        except HttpError as e:
            logger.error(f"Could not fetch video details. Error: {e}")
            had_failure = True
            if "quotaExceeded" in str(e):
                # Further requests would fail the same way.
                break
            continue

        for item in video_response.get('items', []):
            stats = item.get('statistics', {})
            snippet = item['snippet']
            all_videos_data.append({
                'video_id': item['id'], 'video_title': snippet['title'],
                'channel': snippet['channelTitle'], 'published_date': snippet['publishedAt'],
                # Counts absent from the statistics default to 0.
                'view_count': int(stats.get('viewCount', 0)), 'like_count': int(stats.get('likeCount', 0)),
                'comment_count': int(stats.get('commentCount', 0))
            })

    if had_failure:
        gr.Warning("Could not fetch details for some videos due to an API error.")
    return all_videos_data
217
+
218
def _scrape_single_video_comments(youtube_service, video_id, max_comments):
    """Fetch one page of top-level comments for a video.

    Requests at most ``min(max_comments, 100)`` comment threads ordered by
    relevance and returns a list of dicts with ``author``,
    ``published_date_comment``, ``comment_text``, ``likes`` and ``replies``.
    An ``HttpError`` is logged as a warning and whatever was collected so
    far (normally nothing) is returned.
    """
    collected = []
    try:
        threads = youtube_service.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=min(max_comments, 100),
            order='relevance',
            textFormat="plainText",
        ).execute()
        for thread in threads.get('items', []):
            thread_snippet = thread['snippet']
            top_comment = thread_snippet['topLevelComment']['snippet']
            record = {
                'author': top_comment['authorDisplayName'],
                'published_date_comment': top_comment['publishedAt'],
                'comment_text': top_comment['textDisplay'],
                'likes': top_comment['likeCount'],
                'replies': thread_snippet['totalReplyCount'],
            }
            collected.append(record)
    except HttpError as e:
        logger.warning(f"Could not retrieve comments for video {video_id} (may be disabled). Error: {e}")
    return collected
236
+
237
def run_youtube_analysis_pipeline(api_key, query, max_videos_for_stats, num_videos_for_comments, max_comments_per_video, published_after, progress=gr.Progress()):
    """Search YouTube, collect video statistics, and scrape top-video comments.

    Args:
        api_key: YouTube Data API key.
        query: Search keywords.
        max_videos_for_stats: Upper bound on videos scanned for statistics.
        num_videos_for_comments: Number of top-viewed videos whose comments
            are scraped.
        max_comments_per_video: Comment cap passed to the comment scraper.
        published_after: Optional date string restricting the search.
        progress: Gradio progress callback.

    Returns:
        ``(videos_df, comments_df, total_results_estimate)``. When no videos
        are found, two empty frames and 0.

    Raises:
        gr.Error: On a missing key/query, service initialization failure, or
            an exhausted API quota during search.
    """
    if not api_key:
        raise gr.Error("YouTube API Key is required.")
    if not query:
        raise gr.Error("Search Keywords are required.")
    try:
        youtube = build('youtube', 'v3', developerKey=api_key)
    except HttpError as e:
        raise gr.Error(f"Failed to initialize YouTube service. Check API Key. Error: {e}")
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred during API initialization: {e}")

    progress(0.1, desc="Performing broad scan for videos...")
    all_video_ids, next_page_token, total_results_estimate = [], None, 0

    # Ceiling division: exactly as many 50-result pages as needed. The old
    # "(n // 50) + 1" fetched one page too many whenever n was a multiple
    # of 50, spending search quota on results that were never requested.
    max_videos_for_stats = max(1, int(max_videos_for_stats))
    pages_to_fetch = min(15, -(-max_videos_for_stats // 50))

    search_params = {'q': query, 'part': 'id', 'maxResults': 50, 'type': 'video', 'order': 'relevance'}
    if published_after:
        parsed_date = dateparser.parse(published_after)
        if parsed_date:
            # Naive dates are taken as UTC; aware dates are converted so
            # their offset is respected rather than overwritten.
            if parsed_date.tzinfo is None:
                parsed_date = parsed_date.replace(tzinfo=timezone.utc)
            else:
                parsed_date = parsed_date.astimezone(timezone.utc)
            search_params['publishedAfter'] = parsed_date.isoformat()
        else:
            gr.Warning(f"Could not parse date: '{published_after}'. Ignoring filter.")

    for page in range(pages_to_fetch):
        try:
            if next_page_token:
                search_params['pageToken'] = next_page_token
            response = youtube.search().list(**search_params).execute()
            if page == 0:
                total_results_estimate = response.get('pageInfo', {}).get('totalResults', 0)
            all_video_ids.extend([item['id']['videoId'] for item in response.get('items', [])])
            next_page_token = response.get('nextPageToken')
            progress(0.1 + (0.3 * (page / pages_to_fetch)), desc=f"Broad scan: Found {len(all_video_ids)} videos...")
            if not next_page_token:
                break
        except HttpError as e:
            if "quotaExceeded" in str(e):
                raise gr.Error("CRITICAL: YouTube API daily quota exceeded. Try again tomorrow.")
            logger.error(f"HTTP error during video search: {e}")
            break

    # Drop repeated IDs (order preserved) and honor the requested cap.
    all_video_ids = list(dict.fromkeys(all_video_ids))[:max_videos_for_stats]
    if not all_video_ids:
        return pd.DataFrame(), pd.DataFrame(), 0

    progress(0.4, desc=f"Fetching details for {len(all_video_ids)} videos...")
    videos_df_full_scan = pd.DataFrame(_fetch_video_details(youtube, all_video_ids))
    if videos_df_full_scan.empty:
        return pd.DataFrame(), pd.DataFrame(), 0

    videos_df_full_scan['published_date'] = pd.to_datetime(videos_df_full_scan['published_date'])
    interactions = videos_df_full_scan['like_count'] + videos_df_full_scan['comment_count']
    engagement = interactions / videos_df_full_scan['view_count']
    # A zero view count yields inf (or NaN for 0/0); store both as 0 so
    # later statistics and plots are not distorted.
    videos_df_full_scan['engagement_rate'] = engagement.replace([np.inf, -np.inf], np.nan).fillna(0)
    videos_df_full_scan = videos_df_full_scan.sort_values(by='view_count', ascending=False).reset_index(drop=True)

    videos_to_scrape_df, all_comments = videos_df_full_scan.head(int(num_videos_for_comments)), []
    total_to_scrape = len(videos_to_scrape_df)
    for position, (_, row) in enumerate(videos_to_scrape_df.iterrows()):
        progress(0.7 + (0.3 * (position / total_to_scrape)), desc=f"Deep dive: Scraping comments from video {position + 1}/{total_to_scrape}...")
        comments_for_video = _scrape_single_video_comments(youtube, row['video_id'], max_comments_per_video)
        if comments_for_video:
            # Tag each comment with the video it belongs to.
            for comment in comments_for_video:
                comment.update({'video_id': row['video_id'], 'video_title': row['video_title']})
            all_comments.extend(comments_for_video)

    comments_df = pd.DataFrame(all_comments)
    if not comments_df.empty:
        comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])

    logger.info(f"YouTube analysis complete. Est. total videos: {total_results_estimate}. Scanned: {len(videos_df_full_scan)}. Comments: {len(comments_df)}.")
    return videos_df_full_scan, comments_df, total_results_estimate
299
+
300
+
301
+ # ==============================================================================
302
+ # ADVANCED ANALYTICS MODULE
303
+ # ==============================================================================
304
+ # (This section remains unchanged, as it was already robust)
305
def set_plot_style():
    """Apply the shared matplotlib style and figure DPI used by all plots."""
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams.update({'figure.dpi': 100})
308
+
309
def run_sentiment_analysis(df: pd.DataFrame, text_column: str, progress=gr.Progress()):
    """Add ``sentiment_label`` and ``sentiment_score`` columns to ``df``.

    Args:
        df: Frame to annotate; it is modified in place and also returned.
        text_column: Name of the column holding the text to classify.
        progress: Gradio progress callback.

    Returns:
        ``df``. It is returned unchanged when the column is missing, the
        sentiment pipeline is unavailable, or there is no non-null text.
        Rows whose text is null get ``None`` in both new columns.
    """
    if text_column not in df.columns:
        return df
    sentiment_pipeline = get_sentiment_pipeline()
    if sentiment_pipeline is None:
        gr.Warning("Sentiment model failed to load. Skipping analysis.")
        return df

    # Classify each distinct text once; results are keyed by text anyway, so
    # re-running duplicates only costs inference time. Values are coerced to
    # str because the pipeline expects strings.
    unique_texts = list(dict.fromkeys(str(value) for value in df[text_column].dropna()))
    if not unique_texts:
        return df

    progress(0, desc="Running sentiment analysis...")
    # truncation=True keeps inputs longer than the model's maximum sequence
    # length from failing the whole batch.
    results = sentiment_pipeline(unique_texts, batch_size=32, truncation=True)
    text_to_sentiment = dict(zip(unique_texts, results))

    def _lookup(value, field):
        # Null cells were never classified; give them None.
        if pd.isna(value):
            return None
        return text_to_sentiment.get(str(value), {}).get(field)

    df['sentiment_label'] = df[text_column].map(lambda value: _lookup(value, 'label'))
    df['sentiment_score'] = df[text_column].map(lambda value: _lookup(value, 'score'))
    logger.info("Sentiment analysis complete.")
    return df
327
+
328
def generate_scraper_dashboard(df: pd.DataFrame):
    """Build the news overview dashboard updates from the article frame.

    Reads the ``published_date``, ``media`` and ``title`` columns of ``df``
    and returns a dict mapping the overview UI components (KPI textboxes,
    timeline plot, media bar chart, headline word cloud, dashboard group) to
    their new values. A chart that cannot be produced is returned as ``None``.
    """
    set_plot_style()

    # Parse the dates once and reuse the result for the KPIs and timeline.
    published = pd.to_datetime(df['published_date'])
    total_articles, unique_media = len(df), df['media'].nunique()
    start_date, end_date = published.min(), published.max()
    date_range_str = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"

    agg_code, agg_name = get_dynamic_time_agg(start_date, end_date)
    timeline_df = df.set_index(published).resample(agg_code).size().reset_index(name='count')
    timeline_plot = gr.LinePlot(timeline_df, x='published_date', y='count', title=f'{agg_name} News Volume', tooltip=['published_date', 'count'])

    media_counts = df['media'].dropna().value_counts().nlargest(15).sort_values()
    fig_media = None
    if not media_counts.empty:
        fig_media, ax = plt.subplots(figsize=(8, 6))
        media_counts.plot(kind='barh', ax=ax, color='skyblue')
        ax.set_title("Top 15 Media Sources", fontproperties=BANGLA_FONT)
        ax.set_yticklabels(media_counts.index, fontproperties=BANGLA_FONT)
        ax.set_xlabel("Article Count")
        plt.tight_layout()

    text = " ".join(title for title in df['title'].astype(str))
    fig_wc = None
    # WordCloud's default token pattern is built on \w, which does not match
    # Bangla vowel signs and other combining marks, so words are split into
    # fragments. Treat the whole Bengali Unicode block as word characters.
    bangla_token_pattern = r"[\w\u0980-\u09FF]{2,}"
    try:
        wc = WordCloud(
            font_path=FONT_PATH, width=800, height=400, background_color='white',
            stopwords=BANGLA_STOP_WORDS, collocations=False, regexp=bangla_token_pattern,
        ).generate(text)
        fig_wc, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wc, interpolation='bilinear')
        ax.axis("off")
    except Exception as e:
        # Best effort: the dashboard is still shown without the word cloud.
        logger.error(f"WordCloud failed: {e}")

    return {
        kpi_total_articles: str(total_articles), kpi_unique_media: str(unique_media), kpi_date_range: date_range_str,
        dashboard_timeline_plot: timeline_plot, dashboard_media_plot: fig_media, dashboard_wordcloud_plot: fig_wc,
        scraper_dashboard_group: gr.update(visible=True)
    }
357
+
358
def generate_sentiment_dashboard(df: pd.DataFrame):
    """Build the news sentiment tab updates.

    Returns a dict of component updates. Without a ``sentiment_label``
    column only the instruction to hide the sentiment tab is returned;
    otherwise the dict carries the overall pie chart, the per-media stacked
    bar chart (either may be ``None``) and makes the tab visible.
    """
    updates = {sentiment_dashboard_tab: gr.update(visible=False)}
    set_plot_style()

    if 'sentiment_label' not in df.columns:
        return updates

    label_counts = df['sentiment_label'].value_counts()
    fig_pie = None
    fig_media_sent = None

    if not label_counts.empty:
        fig_pie, ax = plt.subplots(figsize=(6, 6))
        ax.pie(
            label_counts,
            labels=label_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            colors=['#66c2a5', '#fc8d62', '#8da0cb'],
        )
        ax.set_title("Overall Sentiment Distribution", fontproperties=BANGLA_FONT)
        ax.axis('equal')

    # Share of each sentiment label (in percent) within the ten most
    # frequent media sources.
    top_media = df['media'].value_counts().nlargest(10).index
    rows_in_top_media = df[df['media'].isin(top_media)]
    media_sentiment = pd.crosstab(
        rows_in_top_media['media'], df['sentiment_label'], normalize='index'
    ).mul(100)
    if not media_sentiment.empty:
        fig_media_sent, ax = plt.subplots(figsize=(10, 7))
        media_sentiment.plot(kind='barh', stacked=True, ax=ax, colormap='viridis')
        ax.set_title("Sentiment by Top Media Sources", fontproperties=BANGLA_FONT)
        ax.set_yticklabels(media_sentiment.index, fontproperties=BANGLA_FONT)
        plt.tight_layout()

    updates.update({
        sentiment_pie_plot: fig_pie,
        sentiment_by_media_plot: fig_media_sent,
        sentiment_dashboard_tab: gr.update(visible=True),
    })
    return updates
377
+
378
def generate_youtube_dashboard(videos_df, comments_df):
    """Build the YouTube deep-dive dashboard updates.

    Args:
        videos_df: Scanned videos frame (``channel``, ``view_count``), or
            ``None``.
        comments_df: Scraped comments frame (``comment_text``,
            ``video_title``, optionally ``sentiment_label``), or ``None``.

    Returns:
        A dict mapping the KPI textboxes and the four deep-dive plot
        components to their new values. Plots that cannot be produced are
        ``None``; the channel figure is always created and stays blank when
        there are no videos.
    """
    set_plot_style()

    # The KPIs were already None-safe, but the channel chart dereferenced
    # videos_df unconditionally; apply one consistent guard to both.
    has_videos = videos_df is not None and not videos_df.empty
    has_comments = comments_df is not None and not comments_df.empty

    kpis = {
        kpi_yt_videos_found: f"{len(videos_df):,}" if videos_df is not None else "0",
        kpi_yt_views_scanned: f"{videos_df['view_count'].sum():,}" if has_videos else "0",
        kpi_yt_comments_scraped: f"{len(comments_df):,}" if comments_df is not None else "0"
    }

    fig_channels, ax = plt.subplots(figsize=(8, 6))
    if has_videos:
        channel_counts = videos_df['channel'].value_counts().nlargest(15).sort_values()
        if not channel_counts.empty:
            channel_counts.plot(kind='barh', ax=ax, color='coral')
            ax.set_title("Top 15 Channels by Video Volume", fontproperties=BANGLA_FONT)
            ax.set_yticklabels(channel_counts.index, fontproperties=BANGLA_FONT)
            plt.tight_layout()

    fig_wc, fig_pie, fig_sentiment_video = None, None, None
    if has_comments:
        text = " ".join(comment for comment in comments_df['comment_text'].astype(str))
        # WordCloud's default \w-based token pattern splits Bangla words at
        # vowel signs; match the whole Bengali Unicode block instead.
        bangla_token_pattern = r"[\w\u0980-\u09FF]{2,}"
        try:
            wc = WordCloud(
                font_path=FONT_PATH, width=800, height=400, background_color='white',
                stopwords=BANGLA_STOP_WORDS, collocations=False, regexp=bangla_token_pattern,
            ).generate(text)
            fig_wc, ax = plt.subplots(figsize=(10, 5))
            ax.imshow(wc, interpolation='bilinear')
            ax.axis("off")
            ax.set_title("Most Common Words in Comments", fontproperties=BANGLA_FONT)
        except Exception as e:
            # Best effort: the other charts are still produced.
            logger.error(f"YouTube WordCloud failed: {e}")

        if 'sentiment_label' in comments_df.columns:
            sentiment_counts = comments_df['sentiment_label'].value_counts()
            if not sentiment_counts.empty:
                fig_pie, ax = plt.subplots(figsize=(6, 6))
                ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=['#66c2a5', '#fc8d62', '#8da0cb'])
                ax.set_title("Overall Comment Sentiment", fontproperties=BANGLA_FONT)

            # Percentage of each sentiment label within the ten videos that
            # have the most scraped comments.
            top_videos_by_comment = comments_df['video_title'].value_counts().nlargest(10).index
            video_sentiment = (
                comments_df.groupby('video_title')['sentiment_label']
                .value_counts(normalize=True)
                .unstack()
                .mul(100)
                .reindex(top_videos_by_comment)
                .dropna(how='all')
            )
            if not video_sentiment.empty:
                fig_sentiment_video, ax = plt.subplots(figsize=(10, 8))
                video_sentiment.plot(kind='barh', stacked=True, ax=ax, colormap='viridis')
                ax.set_title("Comment Sentiment by Top 10 Videos", fontproperties=BANGLA_FONT)
                ax.set_yticklabels(video_sentiment.index, fontproperties=BANGLA_FONT)
                plt.tight_layout()

    return {**kpis, yt_channel_plot: fig_channels, yt_wordcloud_plot: fig_wc, yt_sentiment_pie_plot: fig_pie, yt_sentiment_by_video_plot: fig_sentiment_video}
410
+
411
def generate_youtube_topic_dashboard(videos_df_full_scan: pd.DataFrame):
    """Build the three topic-level YouTube figures.

    Args:
        videos_df_full_scan: Scanned videos frame with ``channel``,
            ``view_count``, ``like_count``, ``engagement_rate`` and
            ``published_date`` columns.

    Returns:
        ``(fig_channel_views, fig_quadrant, fig_age)``, or
        ``(None, None, None)`` when the frame is ``None`` or empty. The two
        scatter plots use a random sample of at most 200 videos.
    """
    if videos_df_full_scan is None or videos_df_full_scan.empty:
        return None, None, None
    set_plot_style()

    channel_views = videos_df_full_scan.groupby('channel')['view_count'].sum().nlargest(15).sort_values()
    fig_channel_views, ax = plt.subplots(figsize=(10, 7))
    channel_views.plot(kind='barh', ax=ax, color='purple')
    ax.set_title("Channel Dominance by Total Views (Top 15)", fontproperties=BANGLA_FONT)
    ax.set_xlabel("Combined Views on Topic")
    ax.set_yticklabels(channel_views.index, fontproperties=BANGLA_FONT)
    plt.tight_layout()

    df_sample = videos_df_full_scan.sample(n=min(len(videos_df_full_scan), 200)).copy()
    # Videos with zero views can carry an infinite or NaN engagement rate.
    # Those values would distort the marker-size scaling and the median
    # guide line, so treat them as 0 in the plotted copy.
    df_sample['engagement_rate'] = df_sample['engagement_rate'].replace([np.inf, -np.inf], np.nan).fillna(0)

    avg_views, avg_engagement = df_sample['view_count'].median(), df_sample['engagement_rate'].median()
    fig_quadrant, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(data=df_sample, x='view_count', y='engagement_rate', size='like_count', sizes=(20, 400), hue='channel', alpha=0.7, ax=ax, legend=False)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_title("Content Performance Quadrant", fontproperties=BANGLA_FONT)
    ax.set_xlabel("Video Views (Log Scale)", fontproperties=BANGLA_FONT)
    ax.set_ylabel("Engagement Rate (Log Scale)", fontproperties=BANGLA_FONT)
    # Median lines split the plot into quadrants.
    ax.axhline(avg_engagement, ls='--', color='gray')
    ax.axvline(avg_views, ls='--', color='gray')
    ax.text(avg_views * 1.1, ax.get_ylim()[1], 'High Performers', color='green', fontproperties=BANGLA_FONT)
    ax.text(ax.get_xlim()[0], avg_engagement * 1.1, 'Niche Stars', color='blue', fontproperties=BANGLA_FONT)

    fig_age, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(data=df_sample, x='published_date', y='view_count', size='engagement_rate', sizes=(20, 400), alpha=0.6, ax=ax)
    ax.set_yscale('log')
    ax.set_title("Content Age vs. Impact", fontproperties=BANGLA_FONT)
    ax.set_xlabel("Publication Date", fontproperties=BANGLA_FONT)
    ax.set_ylabel("Views (Log Scale)", fontproperties=BANGLA_FONT)
    plt.xticks(rotation=45)

    return fig_channel_views, fig_quadrant, fig_age
428
+
429
+ # ==============================================================================
430
+ # GRADIO UI DEFINITION
431
+ # ==============================================================================
432
+
433
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), title=APP_TITLE) as app:
434
+ gr.Markdown(f"# {APP_TITLE}\n*{APP_TAGLINE}*")
435
+
436
+ # --- STATE MANAGEMENT ---
437
+ scraper_results_state = gr.State()
438
+ youtube_results_state = gr.State()
439
+
440
+ with gr.Tabs() as tabs:
441
+ with gr.TabItem("1. News Scraper", id=0):
442
+ with gr.Row():
443
+ with gr.Column(scale=1):
444
+ gr.Markdown("### 1. Search Criteria")
445
+ search_keywords_textbox = gr.Textbox(label="Search Keywords", placeholder="e.g., বিএনপি সমাবেশ")
446
+ sites_to_search_textbox = gr.Textbox(label="Target Sites (Optional, comma-separated)", placeholder="e.g., prothomalo.com")
447
+ start_date_textbox = gr.Textbox(label="Start Date", placeholder="YYYY-MM-DD or 'last week'")
448
+ end_date_textbox = gr.Textbox(label="End Date", placeholder="YYYY-MM-DD or 'today'")
449
+ gr.Markdown("### 2. Scraping Parameters")
450
+ interval_days_slider = gr.Slider(1, 7, 3, step=1, label="Days per Interval")
451
+ max_pages_slider = gr.Slider(1, 10, 5, step=1, label="Max Pages per Interval")
452
+ filter_keywords_textbox = gr.Textbox(label="Filter Keywords (comma-separated, optional)", placeholder="e.g., নির্বাচন, সরকার")
453
+ start_scraper_button = gr.Button("Start Scraping & Analysis", variant="primary")
454
+ with gr.Column(scale=2):
455
+ scraper_results_df = gr.DataFrame(label="Filtered Results", interactive=False, wrap=True)
456
+ scraper_download_file = gr.File(label="Download Filtered Results CSV")
457
+
458
+ with gr.TabItem("2. News Analytics", id=1):
459
+ with gr.Group(visible=False) as scraper_dashboard_group:
460
+ with gr.Tabs():
461
+ with gr.TabItem("Overview"):
462
+ with gr.Row():
463
+ kpi_total_articles = gr.Textbox(label="Total Articles Found", interactive=False)
464
+ kpi_unique_media = gr.Textbox(label="Unique Media Sources", interactive=False)
465
+ kpi_date_range = gr.Textbox(label="Date Range of Articles", interactive=False)
466
+ dashboard_timeline_plot = gr.LinePlot(label="News Volume Timeline")
467
+ with gr.Row():
468
+ dashboard_media_plot = gr.Plot(label="Top Media Sources by Article Count")
469
+ dashboard_wordcloud_plot = gr.Plot(label="Headline Word Cloud")
470
+ with gr.TabItem("Sentiment Analysis", visible=False) as sentiment_dashboard_tab:
471
+ with gr.Row():
472
+ sentiment_pie_plot = gr.Plot(label="Overall Sentiment")
473
+ sentiment_by_media_plot = gr.Plot(label="Sentiment by Media Source")
474
+
475
+ with gr.TabItem("3. YouTube Topic Analysis", id=2):
476
+ with gr.Row():
477
+ with gr.Column(scale=1):
478
+ gr.Markdown("### 1. YouTube API & Search")
479
+ yt_api_key = gr.Textbox(label="YouTube API Key", type="password", placeholder="Paste your API key")
480
+ yt_search_keywords = gr.Textbox(label="Search Keywords", placeholder="e.g., বিএনপি, তারেক রহমান")
481
+ yt_published_after = gr.Textbox(label="Published After Date (Optional)", placeholder="YYYY-MM-DD or '1 month ago'")
482
+ gr.Markdown("### 2. Analysis Parameters")
483
+ yt_max_videos_for_stats = gr.Slider(label="Videos to Scan for Topic Stats (Broad Scan)", minimum=50, maximum=750, value=300, step=50)
484
+ yt_num_videos_for_comments = gr.Slider(label="Top Videos for Comment Analysis (Deep Dive)", minimum=5, maximum=100, value=25, step=5)
485
+ yt_max_comments = gr.Slider(10, 100, 30, step=10, label="Max Comments per Video")
486
+ start_yt_analysis_button = gr.Button("Start YouTube Analysis", variant="primary")
487
+ with gr.Column(scale=2):
488
+ with gr.Group(visible=False) as yt_dashboard_group:
489
+ gr.Markdown("### Topic Footprint KPIs (Based on Broad Scan)")
490
+ with gr.Row():
491
+ kpi_yt_total_topic_videos = gr.Textbox(label="Est. Total Videos on Topic (YT)", interactive=False)
492
+ kpi_yt_videos_found = gr.Textbox(label="Videos Scanned for Stats", interactive=False)
493
+ kpi_yt_views_scanned = gr.Textbox(label="Combined Views (of Scanned)", interactive=False)
494
+ kpi_yt_comments_scraped = gr.Textbox(label="Comments Analyzed (from Top Videos)", interactive=False)
495
+ with gr.Tabs():
496
+ with gr.TabItem("Deep Dive Analysis (on Top Videos)"):
497
+ yt_videos_df_output = gr.DataFrame(label="Top Videos Analyzed for Comments (sorted by views)")
498
+ with gr.Row():
499
+ yt_channel_plot = gr.Plot(label="Channel Contribution by Video Count")
500
+ yt_sentiment_pie_plot = gr.Plot(label="Overall Comment Sentiment")
501
+ with gr.Row():
502
+ yt_wordcloud_plot = gr.Plot(label="Comment Word Cloud")
503
+ yt_sentiment_by_video_plot = gr.Plot(label="Comment Sentiment by Video")
504
+ with gr.TabItem("Topic-Level Analytics (on All Scanned Videos)"):
505
+ yt_channel_views_plot = gr.Plot(label="Channel Dominance by Views")
506
+ yt_performance_quadrant_plot = gr.Plot(label="Content Performance Quadrant")
507
+ yt_content_age_plot = gr.Plot(label="Content Age vs. Impact")
508
+
509
+ gr.Markdown(f"<div style='text-align: center; margin-top: 20px;'>{APP_FOOTER}</div>")
510
+
511
+ # ==============================================================================
512
+ # EVENT HANDLERS
513
+ # ==============================================================================
514
+
515
+ # --- NEWS SCRAPER WORKFLOW ---
516
def news_scraper_workflow(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress=gr.Progress()):
    """Run the news scrape, score headline sentiment, and export a CSV.

    Returns:
        ``(display_df, csv_path, analyzed_df)`` for the results table, the
        download component and the analytics state; ``(None, None, None)``
        when the scrape produced no articles.
    """
    import tempfile

    progress(0, desc="Starting news analysis...")
    raw_df, display_df = run_news_scraper_pipeline(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress)

    if raw_df.empty:
        gr.Info("No news articles found for your query.")
        return None, None, None

    progress(0.8, desc="Analyzing sentiment of news headlines...")
    # Work on a copy so the raw frame is not modified by the analysis.
    analyzed_df = run_sentiment_analysis(raw_df.copy(), 'title', progress)

    # Write each run to its own temporary file: a fixed name in the working
    # directory would be overwritten when two sessions export concurrently.
    with tempfile.NamedTemporaryFile(
        mode='w', prefix='filtered_news_data_', suffix='.csv',
        delete=False, encoding='utf-8', newline='',
    ) as csv_file:
        display_df.to_csv(csv_file, index=False)
        output_path = csv_file.name
    return display_df, output_path, analyzed_df
528
+
529
# Run news_scraper_workflow when the scraper button is clicked. The seven
# inputs are passed positionally in this order; the workflow's three return
# values map positionally onto the three outputs.
news_scraper_inputs = [
    search_keywords_textbox,
    sites_to_search_textbox,
    start_date_textbox,
    end_date_textbox,
    interval_days_slider,
    max_pages_slider,
    filter_keywords_textbox,
]
news_scraper_outputs = [
    scraper_results_df,
    scraper_download_file,
    scraper_results_state,
]
start_scraper_button.click(
    fn=news_scraper_workflow,
    inputs=news_scraper_inputs,
    outputs=news_scraper_outputs,
)
534
+
535
def update_news_dashboards(analyzed_df):
    """Build the component updates for the news dashboards.

    Returns a dict keyed by Gradio component. With no data (``None`` or an
    empty frame) only the scraper dashboard group and the sentiment tab are
    addressed, and both are hidden. Otherwise the dicts produced by
    ``generate_scraper_dashboard`` and ``generate_sentiment_dashboard`` are
    merged, with the sentiment entries winning on any shared key.
    """
    has_rows = analyzed_df is not None and not analyzed_df.empty
    if not has_rows:
        return {
            scraper_dashboard_group: gr.update(visible=False),
            sentiment_dashboard_tab: gr.update(visible=False),
        }

    # Scraper dashboard first, then sentiment, so later keys override earlier ones.
    dashboard_updates = dict(generate_scraper_dashboard(analyzed_df))
    dashboard_updates.update(generate_sentiment_dashboard(analyzed_df))
    return dashboard_updates
542
+
543
# Every component update_news_dashboards may address in its returned dict.
news_ui_components = [
    scraper_dashboard_group,
    kpi_total_articles,
    kpi_unique_media,
    kpi_date_range,
    dashboard_timeline_plot,
    dashboard_media_plot,
    dashboard_wordcloud_plot,
    sentiment_dashboard_tab,
    sentiment_pie_plot,
    sentiment_by_media_plot,
]
# Refresh the dashboards whenever the stored scraper results change.
scraper_results_state.change(
    fn=update_news_dashboards,
    inputs=scraper_results_state,
    outputs=news_ui_components,
)
549
+
550
+ # --- YOUTUBE WORKFLOW ---
551
def youtube_workflow(api_key, query, max_stats, num_comments, max_comments, published_after, progress=gr.Progress()):
    """Run the YouTube pipeline and package its results for the UI.

    ``api_key`` and ``query`` are stripped of surrounding whitespace; the
    remaining arguments are forwarded unchanged to
    ``run_youtube_analysis_pipeline``.

    Returns:
        A 2-tuple: the first ``int(num_comments)`` rows of the scanned-videos
        frame, and a dict with keys ``"full_scan"``, ``"comments"`` and
        ``"total_estimate"``. ``(None, None)`` is returned when the pipeline
        finds no videos.
    """
    pipeline_result = run_youtube_analysis_pipeline(
        api_key.strip(),
        query.strip(),
        max_stats,
        num_comments,
        max_comments,
        published_after,
        progress,
    )
    videos_df_full, comments_df, total_vids_est = pipeline_result

    if videos_df_full.empty:
        gr.Info("No videos found for your YouTube query.")
        return None, None

    # Sentiment scoring only runs when at least one comment was collected.
    has_comments = comments_df is not None and not comments_df.empty
    if has_comments:
        progress(0.9, desc="Analyzing comment sentiment...")
        comments_df = run_sentiment_analysis(comments_df.copy(), 'comment_text', progress)

    top_videos_for_display = videos_df_full.head(int(num_comments))
    analysis_state = {
        "full_scan": videos_df_full,
        "comments": comments_df,
        "total_estimate": total_vids_est,
    }
    return top_videos_for_display, analysis_state
566
+
567
# Run youtube_workflow when the analysis button is clicked. The six inputs are
# passed positionally in this order; the two return values fill the videos
# table and the results state.
yt_workflow_inputs = [
    yt_api_key,
    yt_search_keywords,
    yt_max_videos_for_stats,
    yt_num_videos_for_comments,
    yt_max_comments,
    yt_published_after,
]
yt_workflow_outputs = [
    yt_videos_df_output,
    youtube_results_state,
]
start_yt_analysis_button.click(
    fn=youtube_workflow,
    inputs=yt_workflow_inputs,
    outputs=yt_workflow_outputs,
)
572
+
573
def update_youtube_dashboards(results_data):
    """Build the component updates for the YouTube dashboards.

    Args:
        results_data: The dict stored by ``youtube_workflow`` (keys
            ``"full_scan"``, ``"comments"``, ``"total_estimate"``), or a falsy
            value when no analysis result is available.

    Returns:
        A dict keyed by Gradio component. Without a non-empty ``"full_scan"``
        frame the dashboard group is hidden, the KPI boxes are set to ``"0"``
        and every plot is cleared. Otherwise the group is shown and the
        entries from ``generate_youtube_dashboard`` are combined with the three
        figures from ``generate_youtube_topic_dashboard``.
    """
    videos_df_full = results_data.get("full_scan") if results_data else None
    if videos_df_full is None or videos_df_full.empty:
        return {
            yt_dashboard_group: gr.update(visible=False),
            kpi_yt_total_topic_videos: "0",
            kpi_yt_videos_found: "0",
            kpi_yt_views_scanned: "0",
            kpi_yt_comments_scraped: "0",
            yt_channel_plot: None,
            yt_wordcloud_plot: None,
            yt_sentiment_pie_plot: None,
            yt_sentiment_by_video_plot: None,
            yt_channel_views_plot: None,
            yt_performance_quadrant_plot: None,
            yt_content_age_plot: None,
        }

    comments_df = results_data.get("comments")
    # youtube_workflow always stores the "total_estimate" key, so a `.get`
    # default never applies. Fall back to 0 for a present-but-None value,
    # which would otherwise make the "," format spec below raise TypeError.
    total_estimate = results_data.get("total_estimate") or 0

    deep_dive_updates = generate_youtube_dashboard(videos_df_full, comments_df)
    fig_ch_views, fig_quad, fig_age = generate_youtube_topic_dashboard(videos_df_full)

    # Key order is significant: entries after the spread override any
    # duplicates coming from deep_dive_updates, and the spread overrides the
    # two entries placed before it.
    return {
        yt_dashboard_group: gr.update(visible=True),
        kpi_yt_total_topic_videos: f"{total_estimate:,}",
        **deep_dive_updates,
        yt_channel_views_plot: fig_ch_views,
        yt_performance_quadrant_plot: fig_quad,
        yt_content_age_plot: fig_age,
    }
595
+
596
# Every component update_youtube_dashboards may address in its returned dict.
yt_ui_components = [
    yt_dashboard_group,
    kpi_yt_total_topic_videos,
    kpi_yt_videos_found,
    kpi_yt_views_scanned,
    kpi_yt_comments_scraped,
    yt_channel_plot,
    yt_wordcloud_plot,
    yt_sentiment_pie_plot,
    yt_sentiment_by_video_plot,
    yt_channel_views_plot,
    yt_performance_quadrant_plot,
    yt_content_age_plot,
]
# Refresh the dashboards whenever the stored YouTube results change.
youtube_results_state.change(
    fn=update_youtube_dashboards,
    inputs=youtube_results_state,
    outputs=yt_ui_components,
)
602
+
603
+ # ==============================================================================
604
+ # LAUNCH THE APP
605
+ # ==============================================================================
606
+
607
if __name__ == "__main__":
    # Login credentials are read from AUTH_CREDENTIALS in "user:password" form.
    # Only the first ":" separates the two parts, so the password itself may
    # contain colons. Both parts must be non-empty to be accepted.
    import secrets  # local import: only needed for the fallback password

    auth_credentials = os.getenv("AUTH_CREDENTIALS") or ""
    user, separator, pwd = auth_credentials.partition(":")

    if separator and user and pwd:
        auth_tuple = (user, pwd)
        logger.info("Using authentication credentials from environment variable.")
    else:
        # Never fall back to a password that ships with the source code: anyone
        # who can read the repository could log in. Generate a fresh random
        # password for this run instead and surface it once in the log so the
        # operator can still get in.
        auth_tuple = ("bnp", secrets.token_urlsafe(16))
        logger.warning(
            "No valid AUTH_CREDENTIALS found (expected 'user:password'). "
            "Generated one-time credentials for this run - user: %s, password: %s. "
            "Set this as an environment variable for production.",
            auth_tuple[0],
            auth_tuple[1],
        )

    app.launch(debug=True, auth=auth_tuple)
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+ gradio
3
+ pandas
4
+ numpy
5
+ torch
6
+ transformers
7
+ accelerate
8
+ bitsandbytes
9
+ scipy
10
+ sentence-transformers
11
+ bertopic[visualization]
12
+ umap-learn
13
+ hdbscan
14
+ scikit-learn
15
+ bnlp-toolkit
16
+ bangla-stemmer
17
+ KeyBERT
18
+ huggingface_hub
19
+ google-api-python-client
20
+ GoogleNews
21
+ requests
22
+ dateparser
23
+ matplotlib
24
+ wordcloud
25
+ seaborn