# ==============================================================================
# SOCIAL PERCEPTION ANALYZER - FINAL COMPLETE APPLICATION
# Version: 4.1 (Fully Refactored, Production-Ready)
# ==============================================================================
# --- IMPORTS ---
import re
from GoogleNews import GoogleNews
from requests.exceptions import HTTPError
import pandas as pd
import logging
import time
from datetime import datetime, timezone
from logging.handlers import RotatingFileHandler
import gradio as gr
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties, fontManager
from collections import Counter
from wordcloud import WordCloud
import dateparser
import numpy as np
import os
# ==============================================================================
# SETUP PRODUCTION-GRADE LOGGING & CONFIGURATION
# ==============================================================================
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
log_handler = RotatingFileHandler('app.log', maxBytes=5*1024*1024, backupCount=2)
log_handler.setFormatter(log_formatter)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
if not logger.handlers:
logger.addHandler(log_handler)
logger.info("Application starting up.")
# --- APPLICATION CONFIGURATION ---
APP_TITLE = "Social Perception Analyzer"
APP_TAGLINE = "Analyze Google News coverage and YouTube video trends, engagement, and comment activity for your search topics."
APP_FOOTER = "Developed by Arjon"
# --- FONT CONFIGURATION ---
FONT_PATH = 'NotoSansBengali-Regular.ttf'
BANGLA_FONT = None  # set to a FontProperties instance by setup_bangla_font()
def setup_bangla_font():
"""Properly set up Bengali font for all visualizations"""
global BANGLA_FONT
# Strictly enforce NotoSansBengali-Regular.ttf for all Bengali text
if os.path.exists(FONT_PATH):
try:
fontManager.addfont(FONT_PATH)
BANGLA_FONT = FontProperties(fname=FONT_PATH)
plt.rcParams['font.family'] = BANGLA_FONT.get_name()
plt.rcParams['axes.unicode_minus'] = False
logger.info(f"Successfully loaded '{FONT_PATH}' for Bengali text.")
return True
        except Exception as e:
            logger.error(f"Error loading Bengali font: {e}")
            BANGLA_FONT = None
            plt.rcParams['font.family'] = 'sans-serif'
            return False
    else:
        logger.error(f"Font file {FONT_PATH} not found. Bengali text will not render correctly.")
        BANGLA_FONT = None
        plt.rcParams['font.family'] = 'sans-serif'
        return False
# Initialize font system
font_loaded = setup_bangla_font()
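# NOTE: the .ttf file above must ship alongside this script (working directory);
# without it, plot labels and word clouds cannot render Bengali glyphs.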
# ==============================================================================
# CORE HELPER FUNCTIONS
# ==============================================================================
def clean_bengali_text(text):
"""Remove non-Bengali characters except spaces and underscores (for joined phrases)"""
cleaned = re.sub(r'[^\u0980-\u09FF_\s]', '', str(text))
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
return cleaned
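# Example: clean_bengali_text("Dhaka ঢাকা 2024!") -> "ঢাকা"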
# Comprehensive stopword list for Bengali text analysis
BANGLA_STOP_WORDS = [
'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
'এতটাই', 'এতদূর', 'এতটুকু', 'এক', 'এবং', 'এবার', 'এমন', 'এমনভাবে', 'এর', 'এরা', 'এঁরা', 'এঁদের',
'এই', 'এইভাবে', 'ও', 'ওঁরা', 'ওঁর', 'ওঁদের', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'কাছ', 'কাছে', 'কাজ',
'কারণ', 'কিছু', 'কিছুই', 'কিন্তু', 'কিভাবে', 'কেন', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'খুব',
'গুলি', 'গিয়ে', 'চায়', 'ছাড়া', 'জন্য', 'জানা', 'ঠিক', 'তিনি', 'তিন', 'তিনিও', 'তাকে', 'তাঁকে',
'তার', 'তাঁর', 'তারা', 'তাঁরা', 'তাদের', 'তাঁদের', 'তাহলে', 'থাকলেও', 'থেকে', 'মধ্যেই', 'মধ্যে',
'দ্বারা', 'নয়', 'না', 'নিজের', 'নিজে', 'নিয়ে', 'পারেন', 'পারা', 'পারে', 'পরে', 'পর্যন্ত', 'পুনরায়',
'ফলে', 'বজায়', 'বা', 'বাদে', 'বার', 'বিশেষ', 'বিভিন্ন', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মাধ্যমে',
'মতো', 'মতোই', 'যখন', 'যদি', 'যদিও', 'যা', 'যাকে', 'যাওয়া', 'যায়', 'যে', 'যেখানে', 'যেতে', 'যেমন',
'যেহেতু', 'রহিছে', 'শিক্ষা', 'শুধু', 'সঙ্গে', 'সব', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সাধারণ', 'সামনে', 'হতে',
'হতেই', 'হবে', 'হয়', 'হয়তো', 'হয়', 'হচ্ছে', 'হত', 'হলে', 'হলেও', 'হয়নি', 'হাজার', 'হোওয়া', 'আরও', 'আমরা',
'আমার', 'আমি', 'আর', 'আগে', 'আগেই', 'আছে', 'আজ', 'তাকে', 'তাতে', 'তাদের', 'তাহার', 'তাহাতে', 'তাহারই',
'তথা', 'তথাপি', 'সে', 'সেই', 'সেখান', 'সেখানে', 'থেকে', 'নাকি', 'নাগাদ', 'দু', 'দুটি', 'সুতরাং',
'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
]
COMBINED_STOPWORDS = set(BANGLA_STOP_WORDS)
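# Multi-word names are joined with underscores before tokenization so the word
# clouds treat them as single tokens (the Bengali token regexes below allow '_').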
PHRASES_TO_JOIN = {
"তারেক রহমান": "তারেক_রহমান",
"খালেদা জিয়া": "খালেদা_জিয়া",
"বিএনপি জিন্দাবাদ": "বিএনপি_জিন্দাবাদ"
}
def get_dynamic_time_agg(start_date, end_date):
"""Determine appropriate time aggregation level based on date range"""
if not isinstance(start_date, pd.Timestamp) or not isinstance(end_date, pd.Timestamp):
return 'D', 'Daily' # Graceful fallback
delta = end_date - start_date
if delta.days <= 2:
return 'H', 'Hourly'
if delta.days <= 90:
return 'D', 'Daily'
if delta.days <= 730:
return 'W', 'Weekly'
return 'M', 'Monthly'
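# Example: a ~60-day window aggregates daily:
#   get_dynamic_time_agg(pd.Timestamp("2024-01-01"), pd.Timestamp("2024-03-01")) == ('D', 'Daily')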
def kpi_badge_html(value, label, threshold_high=None, threshold_low=None):
"""
Returns HTML for a color-coded KPI badge.
Green for high, red for low, yellow for medium.
"""
try:
# Handle comma-separated numbers
if isinstance(value, str) and ',' in value:
val = float(value.replace(',', ''))
else:
val = float(value)
except (TypeError, ValueError, AttributeError):
val = value
color = '#e0e0e0' # default
if threshold_high is not None and isinstance(val, (int, float)) and val >= threshold_high:
color = '#4caf50' # green
elif threshold_low is not None and isinstance(val, (int, float)) and val <= threshold_low:
color = '#f44336' # red
elif threshold_high is not None and threshold_low is not None and isinstance(val, (int, float)):
color = '#ffeb3b' # yellow
# Format value with commas for large numbers
if isinstance(value, (int, float)):
formatted_value = f"{value:,.0f}"
else:
formatted_value = str(value)
return f"<div style='display:inline-block;padding:8px 16px;border-radius:8px;background:{color};color:#222;font-weight:bold;margin:2px;'>{label}: {formatted_value}</div>"
def set_plot_style():
"""Configure consistent matplotlib style for all visualizations"""
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.figsize'] = (10, 6)
    # Prefer the bundled Bengali font when it loaded successfully
    if isinstance(BANGLA_FONT, FontProperties):
        plt.rcParams['font.family'] = BANGLA_FONT.get_name()
    else:
        plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False # Fix for minus sign rendering
def cleanup_figures(*figures):
"""Properly close matplotlib figures to prevent memory leaks"""
for fig in figures:
if fig is not None:
            try:
                plt.close(fig)
            except Exception:
                pass
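# Call cleanup_figures(fig_media, fig_wc, ...) after dashboard figures have been
# rendered; long-running sessions otherwise accumulate open matplotlib figures.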
# ==============================================================================
# NEWS SCRAPER BACKEND
# ==============================================================================
def run_news_scraper_pipeline(search_keywords, sites, start_date_str, end_date_str, interval, max_pages, filter_keys, progress=gr.Progress()):
"""Full implementation of the news scraper with robust error handling."""
# Input validation and sanitization
search_keywords = str(search_keywords).strip() if search_keywords else ""
sites = str(sites).strip() if sites else ""
start_date_str = str(start_date_str).strip() if start_date_str else ""
end_date_str = str(end_date_str).strip() if end_date_str else ""
filter_keys = str(filter_keys).strip() if filter_keys else ""
if not all([search_keywords, start_date_str, end_date_str]):
raise gr.Error("Search Keywords, Start Date, and End Date are required.")
start_dt = dateparser.parse(start_date_str)
end_dt = dateparser.parse(end_date_str)
if not all([start_dt, end_dt]):
raise gr.Error("Invalid date format. Please use a recognizable format like YYYY-MM-DD or '2 weeks ago'.")
# Ensure start date is before end date
if start_dt > end_dt:
start_dt, end_dt = end_dt, start_dt
gr.Warning("Start date was after end date. Dates have been swapped.")
    all_articles, current_dt = [], start_dt
    total_days = max((end_dt - start_dt).days, 1)  # guard against zero-length ranges
while current_dt <= end_dt:
try:
interval_end_dt = min(current_dt + pd.Timedelta(days=interval - 1), end_dt)
start_str, end_str = current_dt.strftime('%Y-%m-%d'), interval_end_dt.strftime('%Y-%m-%d')
            progress((current_dt - start_dt).days / total_days,
                     desc=f"Fetching news from {start_str} to {end_str}")
site_query = f"({' OR '.join(['site:' + s.strip() for s in sites.split(',') if s.strip()])})" if sites else ""
final_query = f'"{search_keywords}" {site_query} after:{start_str} before:{end_str}'
            googlenews = GoogleNews(lang='bn', region='BD')  # the date range comes from the after:/before: query terms
googlenews.search(final_query)
for page in range(1, max_pages + 1):
try:
results = googlenews.results()
if not results:
break
all_articles.extend(results)
if page < max_pages:
googlenews.getpage(page + 1)
                    time.sleep(0.3)  # brief pause between result pages to reduce rate-limit risk
except HTTPError as e:
if e.response.status_code == 429:
                        wait_time = 3  # back off briefly before retrying
gr.Warning(f"Rate limited by Google News. Pausing for {wait_time} seconds.")
time.sleep(wait_time)
else:
logger.error(f"HTTP Error fetching news: {e}")
break
except Exception as e:
logger.error(f"An error occurred fetching news: {e}")
break
current_dt += pd.Timedelta(days=interval)
except Exception as e:
logger.error(f"Error in news scraping loop: {e}")
break
if not all_articles:
return pd.DataFrame(), pd.DataFrame()
# Create DataFrame and clean data
df = pd.DataFrame(all_articles).drop_duplicates(subset=['link'])
# Parse dates safely
df['published_date'] = df['date'].apply(lambda x: dateparser.parse(x, languages=['bn']) if pd.notna(x) else None)
# Drop rows with missing critical data
df = df.dropna(subset=['published_date', 'title'])
# Apply advanced filtering if filter keywords are provided
    if filter_keys:
        def match_complex_query(text, query):
            """Keyword matcher supporting quoted phrases plus AND / OR / NOT.

            Whitespace-separated terms are AND-ed by default; 'or' (and commas,
            matching the UI hint) separates alternatives; 'not' negates the
            following term. Example: 'নির্বাচন not সরকার' matches text that
            contains the first word but not the second.
            """
            if not text or not query:
                return False
            text = str(text).lower()
            # Commas act as OR; quoted phrases survive tokenization intact
            tokens = re.findall(r'"[^"]+"|\S+', query.lower().replace(',', ' or '))
            groups, current, negate_next = [], [], False
            for token in tokens:
                if token == 'or':
                    groups.append(current)
                    current = []
                elif token == 'and':
                    continue  # AND is already the default between terms
                elif token == 'not':
                    negate_next = True
                else:
                    hit = token.strip('"') in text
                    current.append(not hit if negate_next else hit)
                    negate_next = False
            groups.append(current)
            # Match if every term of at least one OR-group is satisfied
            return any(all(group) for group in groups if group)
# Apply filtering to title and description
mask = df.apply(lambda row: match_complex_query(
str(row['title']) + ' ' + str(row.get('desc', '')),
filter_keys
), axis=1)
df = df[mask]
    # Return both the full dataset and a display-ready subset.
    # Some GoogleNews versions use 'desc', others 'description'; unify, and
    # guarantee the display columns exist even if a source omitted them.
    if 'desc' in df.columns and 'description' not in df.columns:
        df['description'] = df['desc']
    for col in ['media', 'description', 'link']:
        if col not in df.columns:
            df[col] = None
    return df, df[['published_date', 'title', 'media', 'description', 'link']].sort_values(by='published_date', ascending=False)
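# Minimal usage sketch (assumes network access; dates are illustrative):
#   full_df, display_df = run_news_scraper_pipeline(
#       "বাংলাদেশ", "prothomalo.com", "2024-01-01", "2024-01-07",
#       interval=3, max_pages=2, filter_keys="")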
# ==============================================================================
# YOUTUBE ANALYZER BACKEND
# ==============================================================================
def run_youtube_analysis_pipeline(api_key, query, max_videos_for_stats, num_videos_for_comments, max_comments_per_video, published_after, progress=gr.Progress()):
"""Complete YouTube analysis pipeline with robust error handling."""
    # Prefer an explicitly passed key, then the environment; never hardcode API keys in source
    api_key = api_key or os.getenv("YOUTUBE_API_KEY")
    if not api_key:
        raise gr.Error("No YouTube API key configured. Set the YOUTUBE_API_KEY environment variable or pass a key.")
    if not query:
        raise gr.Error("Search Keywords are required.")
try:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
youtube = build('youtube', 'v3', developerKey=api_key)
except ImportError:
logger.error("Required YouTube API libraries not installed")
raise gr.Error("YouTube analysis requires additional libraries. Please install google-api-python-client.")
except HttpError as e:
raise gr.Error(f"Failed to initialize YouTube service. Check API Key. Error: {e}")
except Exception as e:
raise gr.Error(f"An unexpected error occurred during API initialization: {e}")
progress(0.1, desc="Performing broad scan for videos...")
all_video_ids, next_page_token, total_results_estimate = [], None, 0
    PAGES_TO_FETCH = min(15, int(max_videos_for_stats) // 50 + 1)  # cast: slider values may arrive as floats
search_params = {
'q': query,
'part': 'id',
'maxResults': 50,
'type': 'video',
'order': 'relevance'
}
if published_after:
parsed_date = dateparser.parse(published_after)
if parsed_date:
search_params['publishedAfter'] = parsed_date.replace(tzinfo=timezone.utc).isoformat()
else:
gr.Warning(f"Could not parse date: '{published_after}'. Ignoring filter.")
for page in range(PAGES_TO_FETCH):
try:
if next_page_token:
search_params['pageToken'] = next_page_token
response = youtube.search().list(**search_params).execute()
if page == 0:
total_results_estimate = response.get('pageInfo', {}).get('totalResults', 0)
# Extract valid video IDs
valid_ids = []
for item in response.get('items', []):
if 'id' in item and 'videoId' in item['id']:
valid_ids.append(item['id']['videoId'])
all_video_ids.extend(valid_ids)
next_page_token = response.get('nextPageToken')
progress(0.1 + (0.3 * (page / PAGES_TO_FETCH)),
desc=f"Broad scan: Found {len(all_video_ids)} videos...")
if not next_page_token:
break
except HttpError as e:
if "quotaExceeded" in str(e):
raise gr.Error("CRITICAL: YouTube API daily quota exceeded. Try again tomorrow.")
logger.error(f"HTTP error during video search: {e}")
break
except Exception as e:
logger.error(f"Unexpected error during YouTube search: {e}")
break
    if not all_video_ids:
        return pd.DataFrame(), pd.DataFrame(), ""
    # Respect the user's cap before fetching details in batches
    all_video_ids = all_video_ids[:int(max_videos_for_stats)]
    progress(0.4, desc=f"Fetching details for {len(all_video_ids)} videos...")
def _fetch_video_details(youtube_service, video_ids: list):
"""Fetch detailed information for a batch of video IDs"""
all_videos_data = []
try:
for i in range(0, len(video_ids), 50):
id_batch = video_ids[i:i+50]
video_request = youtube_service.videos().list(
part="snippet,statistics",
id=",".join(id_batch)
)
video_response = video_request.execute()
for item in video_response.get('items', []):
stats = item.get('statistics', {})
all_videos_data.append({
'video_id': item['id'],
'video_title': item['snippet']['title'],
'channel': item['snippet']['channelTitle'],
'published_date': item['snippet']['publishedAt'],
'view_count': int(stats.get('viewCount', 0)),
'like_count': int(stats.get('likeCount', 0)),
'comment_count': int(stats.get('commentCount', 0))
})
except Exception as e:
logger.error(f"Could not fetch video details: {e}")
return all_videos_data
videos_df_full_scan = pd.DataFrame(_fetch_video_details(youtube, all_video_ids))
if videos_df_full_scan.empty:
return pd.DataFrame(), pd.DataFrame(), ""
# Process and clean video data
videos_df_full_scan['published_date'] = pd.to_datetime(videos_df_full_scan['published_date'])
# Calculate engagement rate safely
videos_df_full_scan['engagement_rate'] = (
(videos_df_full_scan['like_count'] + videos_df_full_scan['comment_count']) /
videos_df_full_scan['view_count'].replace(0, 1)
).fillna(0)
videos_df_full_scan = videos_df_full_scan.sort_values(
by='view_count',
ascending=False
).reset_index(drop=True)
# Fetch comments for top videos
videos_to_scrape_df = videos_df_full_scan.head(int(num_videos_for_comments))
all_comments = []
def _scrape_single_video_comments(youtube_service, video_id, max_comments):
"""Scrape comments for a single video with error handling"""
comments_list = []
try:
request = youtube_service.commentThreads().list(
part="snippet",
videoId=video_id,
                maxResults=min(int(max_comments), 100),
order='relevance',
textFormat="plainText"
)
response = request.execute()
for item in response.get('items', []):
snippet = item['snippet']['topLevelComment']['snippet']
comments_list.append({
'author': snippet['authorDisplayName'],
'published_date_comment': snippet['publishedAt'],
'comment_text': snippet['textDisplay'],
'likes': snippet['likeCount'],
'replies': item['snippet']['totalReplyCount']
})
except Exception as e:
logger.warning(f"Could not retrieve comments for video {video_id}: {e}")
return comments_list
for index, row in videos_to_scrape_df.iterrows():
progress(0.7 + (0.3 * (index / len(videos_to_scrape_df))),
desc=f"Deep dive: Scraping comments from video {index+1}/{len(videos_to_scrape_df)}...")
comments_for_video = _scrape_single_video_comments(
youtube,
row['video_id'],
max_comments_per_video
)
if comments_for_video:
for comment in comments_for_video:
comment.update({
'video_id': row['video_id'],
'video_title': row['video_title']
})
all_comments.extend(comments_for_video)
comments_df = pd.DataFrame(all_comments)
if not comments_df.empty:
comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
logger.info(f"YouTube analysis complete. Est. total videos: {total_results_estimate}. "
f"Scanned: {len(videos_df_full_scan)}. Comments: {len(comments_df)}.")
# Create summary HTML
summary_html = f"""
<div style='background:#f5f5f5;padding:16px;border-radius:12px;margin-bottom:12px;box-shadow:0 2px 8px #eee;'>
<h3 style='margin:0 0 8px 0;'>YouTube Analytics Summary</h3>
<ul style='margin:0;padding-left:18px;'>
<li><b>Total Videos:</b> {len(videos_df_full_scan):,}</li>
<li><b>Total Comments:</b> {len(comments_df):,}</li>
<li><b>Total Views:</b> {videos_df_full_scan['view_count'].sum():,}</li>
</ul>
</div>
"""
return videos_df_full_scan, comments_df, summary_html
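# Minimal usage sketch (assumes YOUTUBE_API_KEY is set in the environment):
#   videos, comments, summary = run_youtube_analysis_pipeline(
#       api_key=None, query="ক্রিকেট", max_videos_for_stats=30,
#       num_videos_for_comments=5, max_comments_per_video=50,
#       published_after="2024-01-01")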
# ==============================================================================
# ADVANCED ANALYTICS MODULE
# ==============================================================================
def generate_scraper_dashboard(df: pd.DataFrame):
"""Generate comprehensive dashboard from news scraper results."""
if df.empty:
# Return empty dashboard components
return {
"kpi_total_articles": gr.HTML(""),
"kpi_unique_media": gr.HTML(""),
"kpi_date_range": gr.HTML(""),
"dashboard_timeline_plot": None,
"dashboard_media_plot": None,
"dashboard_wordcloud_plot": None
}
set_plot_style()
# Calculate KPIs
total_articles, unique_media = len(df), df['media'].nunique()
start_date, end_date = pd.to_datetime(df['published_date']).min(), pd.to_datetime(df['published_date']).max()
date_range_str = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
# Color-coded KPI badges
kpi_total_articles_html = kpi_badge_html(
total_articles, 'Total Articles', threshold_high=100, threshold_low=10
)
kpi_unique_media_html = kpi_badge_html(
unique_media, 'Unique Media', threshold_high=10, threshold_low=2
)
kpi_date_range_html = kpi_badge_html(
date_range_str, 'Date Range', threshold_high=None, threshold_low=None
)
    # Time-series visualization at a granularity suited to the date range
agg_code, agg_name = get_dynamic_time_agg(start_date, end_date)
timeline_df = df.set_index(pd.to_datetime(df['published_date'])).resample(agg_code).size().reset_index(name='count')
timeline_df.rename(columns={'published_date': 'date'}, inplace=True)
timeline_plot = gr.LinePlot(
value=timeline_df,
x='date',
y='count',
title=f'{agg_name} News Volume',
tooltip=['date', 'count'],
x_title="Date",
y_title="Number of Articles"
)
# Media source analysis
media_counts = df['media'].dropna().value_counts().nlargest(15).sort_values()
fig_media = None
if not media_counts.empty:
fig_media, ax = plt.subplots(figsize=(8, 6))
media_counts.plot(kind='barh', ax=ax, color='skyblue')
ax.set_title("Top 15 Media Sources", fontproperties=BANGLA_FONT, fontsize=18)
ax.set_xlabel("Article Count", fontproperties=BANGLA_FONT, fontsize=14)
ax.set_ylabel("মিডিয়া", fontproperties=BANGLA_FONT, fontsize=14)
yticks = np.arange(len(media_counts.index))
ax.set_yticks(yticks)
ax.set_yticklabels(media_counts.index, fontproperties=BANGLA_FONT, fontsize=14)
for label in ax.get_xticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(12)
for label in ax.get_yticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(14)
legend = ax.get_legend()
if legend:
for text in legend.get_texts():
text.set_fontproperties(BANGLA_FONT)
plt.tight_layout()
# Word cloud generation
fig_wc = None
try:
# Combine all titles and clean text
text = " ".join(title for title in df['title'].astype(str))
text = clean_bengali_text(text)
# Join special phrases
for phrase, joined in PHRASES_TO_JOIN.items():
text = text.replace(phrase, joined)
# Extract and filter words
words = re.findall(r'[\u0980-\u09FF_]{2,}', text)
words = [w for w in words if w not in COMBINED_STOPWORDS]
words = [w for w in words if len(w) > 1]
words = [w for w in words if not re.search(r'[a-zA-Z]', w)]
        # Drop rare words and the few most frequent ones (typically the search
        # terms themselves), which would otherwise dominate the cloud
word_freq = Counter(words)
min_freq = 2
most_common = set([w for w, _ in word_freq.most_common(3)])
filtered_words = [w for w in words if word_freq[w] >= min_freq and w not in most_common]
wc_text = " ".join(filtered_words)
# Generate word cloud
if wc_text.strip():
wc = WordCloud(
font_path=FONT_PATH,
width=1600,
height=900,
background_color='white',
stopwords=COMBINED_STOPWORDS,
collocations=False,
colormap='plasma',
max_words=200,
contour_width=2,
contour_color='steelblue',
regexp=r"[\u0980-\u09FF_]+"
).generate(wc_text)
fig_wc, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
ax.set_title("Bengali Headline Word Cloud", fontproperties=BANGLA_FONT, fontsize=22)
plt.tight_layout()
except Exception as e:
logger.error(f"WordCloud failed: {e}")
gr.Warning(f"WordCloud generation failed: {str(e)}")
return {
"kpi_total_articles": gr.HTML(kpi_total_articles_html),
"kpi_unique_media": gr.HTML(kpi_unique_media_html),
"kpi_date_range": gr.HTML(kpi_date_range_html),
"dashboard_timeline_plot": timeline_plot,
"dashboard_media_plot": fig_media,
"dashboard_wordcloud_plot": fig_wc
}
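# The keys of the returned dict are unpacked by the scraper click handler into
# the matching Gradio output components defined in the UI below.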
def generate_youtube_dashboard(videos_df, comments_df):
"""Generate comprehensive dashboard from YouTube analysis results."""
# Initialize all dashboard components FIRST
dashboard_components = {
"kpi_yt_videos_found": gr.HTML(""),
"kpi_yt_views_scanned": gr.HTML(""),
"kpi_yt_comments_scraped": gr.HTML(""),
"yt_channel_plot": None,
"yt_channel_dominance_plot": None,
"yt_time_series_plot": None,
"yt_top_videos_plot": None,
"yt_content_quadrant_plot": None,
"yt_engagement_plot": None,
"yt_wordcloud_plot": None,
"yt_detailed_summary": gr.HTML("")
}
# Channel dominance by view
fig_channel_dominance = None
if videos_df is not None and not videos_df.empty and 'channel' in videos_df.columns:
channel_views = videos_df.groupby('channel')['view_count'].sum().sort_values(ascending=False).head(10)
if not channel_views.empty:
fig_channel_dominance, ax = plt.subplots(figsize=(10, 6))
channel_views.plot(kind='barh', ax=ax, color='slateblue')
ax.set_title("Top 10 Dominant Channels by View Count", fontproperties=BANGLA_FONT, fontsize=18)
ax.set_xlabel("মোট ভিউ", fontproperties=BANGLA_FONT, fontsize=14)
ax.set_ylabel("চ্যানেল", fontproperties=BANGLA_FONT, fontsize=14)
yticks = np.arange(len(channel_views.index))
ax.set_yticks(yticks)
ax.set_yticklabels(channel_views.index, fontproperties=BANGLA_FONT, fontsize=14)
for label in ax.get_xticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(12)
for label in ax.get_yticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(14)
legend = ax.get_legend()
if legend:
for text in legend.get_texts():
text.set_fontproperties(BANGLA_FONT)
plt.tight_layout()
dashboard_components["yt_channel_dominance_plot"] = fig_channel_dominance
# Content performance quadrant
fig_quadrant = None
if videos_df is not None and not videos_df.empty:
try:
# Define quadrant boundaries
median_views = videos_df['view_count'].median()
median_engagement = videos_df['engagement_rate'].median()
fig_quadrant, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
videos_df['view_count'],
videos_df['engagement_rate'],
c='darkorange', alpha=0.7
)
            ax.axvline(median_views, color='blue', linestyle='--', label='Median Views')
            ax.axhline(median_engagement, color='green', linestyle='--', label='Median Engagement')
            ax.legend()  # without an explicit call the median-line labels never appear
ax.set_xlabel("মোট ভিউ", fontproperties=BANGLA_FONT, fontsize=14)
ax.set_ylabel("এনগেজমেন্ট রেট", fontproperties=BANGLA_FONT, fontsize=14)
ax.set_title("Content Performance Quadrant", fontproperties=BANGLA_FONT, fontsize=18)
for label in ax.get_xticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(12)
for label in ax.get_yticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(14)
legend = ax.get_legend()
if legend:
for text in legend.get_texts():
text.set_fontproperties(BANGLA_FONT)
plt.tight_layout()
except Exception as e:
logger.error(f"Quadrant plot failed: {e}")
dashboard_components["yt_content_quadrant_plot"] = fig_quadrant
# Detailed analysis summary from YouTube API
detailed_summary = ""
if videos_df is not None and not videos_df.empty:
top_video = videos_df.iloc[0]
detailed_summary = f"<div style='background:#e3f2fd;padding:12px;border-radius:8px;margin-bottom:8px;'>"
detailed_summary += f"<b>Top Video:</b> {top_video['video_title']}<br>"
detailed_summary += f"<b>Channel:</b> {top_video['channel']}<br>"
detailed_summary += f"<b>Views:</b> {top_video['view_count']:,}<br>"
detailed_summary += f"<b>Likes:</b> {top_video['like_count']:,}<br>"
detailed_summary += f"<b>Comments:</b> {top_video['comment_count']:,}<br>"
detailed_summary += f"<b>Published:</b> {top_video['published_date'].strftime('%Y-%m-%d')}<br>"
detailed_summary += f"<b>Engagement Rate:</b> {top_video['engagement_rate']:.2f}"
detailed_summary += "</div>"
dashboard_components["yt_detailed_summary"] = gr.HTML(detailed_summary)
# Generate KPIs if data exists
if videos_df is not None and not videos_df.empty:
dashboard_components["kpi_yt_videos_found"] = gr.HTML(
kpi_badge_html(len(videos_df), 'Videos Found', threshold_high=50, threshold_low=5)
)
dashboard_components["kpi_yt_views_scanned"] = gr.HTML(
kpi_badge_html(videos_df['view_count'].sum(), 'Views Scanned', threshold_high=100000, threshold_low=1000)
)
if comments_df is not None and not comments_df.empty:
dashboard_components["kpi_yt_comments_scraped"] = gr.HTML(
kpi_badge_html(len(comments_df), 'Comments Scraped', threshold_high=100, threshold_low=10)
)
# Channel analysis
fig_channels = None
if videos_df is not None and not videos_df.empty and 'channel' in videos_df.columns:
channel_counts = videos_df['channel'].value_counts().nlargest(15).sort_values()
if not channel_counts.empty:
fig_channels, ax = plt.subplots(figsize=(8, 6))
channel_counts.plot(kind='barh', ax=ax, color='coral')
ax.set_title("Top 15 Channels by Video Volume", fontproperties=BANGLA_FONT, fontsize=18)
ax.set_yticklabels(channel_counts.index, fontproperties=BANGLA_FONT, fontsize=14)
ax.set_xlabel("Video Count", fontproperties=BANGLA_FONT, fontsize=14)
for label in ax.get_xticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(12)
for label in ax.get_yticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(14)
legend = ax.get_legend()
if legend:
for text in legend.get_texts():
text.set_fontproperties(BANGLA_FONT)
plt.tight_layout()
dashboard_components["yt_channel_plot"] = fig_channels
# Word cloud from comments
fig_wc = None
if comments_df is not None and not comments_df.empty and 'comment_text' in comments_df.columns:
try:
text = " ".join(comment for comment in comments_df['comment_text'].astype(str))
text = clean_bengali_text(text)
# Join special phrases
for phrase, joined in PHRASES_TO_JOIN.items():
text = text.replace(phrase, joined)
# Extract and filter words
words = re.findall(r'[\u0980-\u09FF_]{2,}', text)
words = [w for w in words if w not in COMBINED_STOPWORDS]
words = [w for w in words if len(w) > 1]
words = [w for w in words if not re.search(r'[a-zA-Z]', w)]
            # Drop rare words and the few most frequent ones (typically the search
            # terms themselves), which would otherwise dominate the cloud
word_freq = Counter(words)
min_freq = 2
most_common = set([w for w, _ in word_freq.most_common(3)])
filtered_words = [w for w in words if word_freq[w] >= min_freq and w not in most_common]
wc_text = " ".join(filtered_words)
# Generate word cloud
if wc_text.strip():
wc = WordCloud(
font_path=FONT_PATH,
width=1600,
height=900,
background_color='white',
stopwords=COMBINED_STOPWORDS,
collocations=False,
colormap='plasma',
max_words=250,
contour_width=2,
contour_color='darkorange',
regexp=r"[\u0980-\u09FF_]+"
).generate(wc_text)
fig_wc, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
ax.set_title("Bengali Word Cloud from YouTube Comments", fontproperties=BANGLA_FONT, fontsize=22)
plt.tight_layout()
except Exception as e:
logger.error(f"YouTube WordCloud failed: {e}")
dashboard_components["yt_wordcloud_plot"] = fig_wc
# Top commented videos
fig_top_videos = None
if comments_df is not None and not comments_df.empty and 'video_title' in comments_df.columns:
top_videos = comments_df['video_title'].value_counts().nlargest(10)
if not top_videos.empty:
fig_top_videos, ax = plt.subplots(figsize=(10, 6))
top_videos.plot(kind='barh', ax=ax, color='dodgerblue')
ax.set_title("Top 10 Videos by Comment Count", fontproperties=BANGLA_FONT, fontsize=18)
ax.set_xlabel("মন্তব্য সংখ্যা", fontproperties=BANGLA_FONT, fontsize=14)
ax.set_ylabel("ভিডিও শিরোনাম", fontproperties=BANGLA_FONT, fontsize=14)
yticks = np.arange(len(top_videos.index))
ax.set_yticks(yticks)
ax.set_yticklabels(top_videos.index, fontproperties=BANGLA_FONT, fontsize=14)
for label in ax.get_xticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(12)
for label in ax.get_yticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(14)
legend = ax.get_legend()
if legend:
for text in legend.get_texts():
text.set_fontproperties(BANGLA_FONT)
plt.tight_layout()
dashboard_components["yt_top_videos_plot"] = fig_top_videos
# Engagement rate per video
fig_engagement = None
if videos_df is not None and not videos_df.empty and comments_df is not None and not comments_df.empty:
if 'video_id' in videos_df.columns and 'video_id' in comments_df.columns:
try:
                # Count scraped comments per video; rename the column so it does
                # not collide with the API-reported 'comment_count' in videos_df
                # (a plain merge would produce _x/_y suffixes and zero out the rate)
                comment_counts = comments_df['video_id'].value_counts().reset_index()
                comment_counts.columns = ['video_id', 'scraped_comment_count']
                merged = videos_df.merge(comment_counts, on='video_id', how='left')
                merged['scraped_comment_count'] = merged['scraped_comment_count'].fillna(0)
                # Engagement here = scraped comments per view
                merged['engagement_rate'] = merged['scraped_comment_count'] / merged['view_count'].replace(0, 1)
# Get top 10 videos by engagement
top_engagement = merged.nlargest(10, 'engagement_rate')
if not top_engagement.empty:
fig_engagement, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_engagement['video_title'], top_engagement['engagement_rate'], color='mediumseagreen')
ax.set_title("Top 10 Videos by Engagement Rate", fontproperties=BANGLA_FONT, fontsize=18)
ax.set_xlabel("এনগেজমেন্ট রেট (মন্তব্য/ভিউ)", fontproperties=BANGLA_FONT, fontsize=14)
ax.set_ylabel("ভিডিও শিরোনাম", fontproperties=BANGLA_FONT, fontsize=14)
yticks = np.arange(len(top_engagement['video_title']))
ax.set_yticks(yticks)
ax.set_yticklabels(top_engagement['video_title'], fontproperties=BANGLA_FONT, fontsize=14)
for label in ax.get_xticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(12)
for label in ax.get_yticklabels():
label.set_fontproperties(BANGLA_FONT)
label.set_fontsize(14)
legend = ax.get_legend()
if legend:
for text in legend.get_texts():
text.set_fontproperties(BANGLA_FONT)
plt.tight_layout()
except Exception as e:
logger.error(f"Engagement rate calculation failed: {e}")
dashboard_components["yt_engagement_plot"] = fig_engagement
# Comment activity over time
fig_time_series = None
if comments_df is not None and not comments_df.empty and 'published_date_comment' in comments_df.columns:
try:
comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
time_series = comments_df.set_index('published_date_comment').resample('D').size().reset_index()
time_series.columns = ['date', 'count']
if not time_series.empty:
fig_time_series = gr.LinePlot(
value=time_series,
x='date',
y='count',
title="Comment Activity Over Time",
tooltip=['date', 'count'],
x_title="Date",
y_title="Number of Comments"
)
except Exception as e:
logger.error(f"Error in comment activity plot: {e}")
dashboard_components["yt_time_series_plot"] = fig_time_series
return dashboard_components
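# As above, the dict keys map one-to-one onto the Gradio components listed in
# the YouTube click handler's outputs.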
# ==============================================================================
# GRADIO UI DEFINITION
# ==============================================================================
custom_css = """
body, .gradio-container {
background: #181a20 !important;
font-family: 'Inter', 'Noto Sans', sans-serif;
}
.gr-card {
background: #23263a;
border-radius: 18px;
box-shadow: 0 4px 24px rgba(0,0,0,0.12);
padding: 24px;
margin-bottom: 24px;
}
.gr-title {
color: #fff;
font-size: 2.2rem;
font-weight: 700;
margin-bottom: 12px;
}
.gr-metric {
color: #22d3ee;
font-size: 2.5rem;
font-weight: 800;
}
.gr-label {
color: #94a3b8;
font-size: 1.1rem;
margin-bottom: 6px;
}
.gradio-row, .gradio-column {
background: transparent !important;
}
.gradio-button {
border-radius: 8px !important;
background: linear-gradient(90deg,#3b82f6,#22d3ee) !important;
color: #fff !important;
font-weight: 600 !important;
box-shadow: 0 2px 8px rgba(34,211,238,0.08);
transition: background 0.2s;
}
.gradio-button:hover {
background: linear-gradient(90deg,#22d3ee,#3b82f6) !important;
}
.gradio-markdown h1, .gradio-markdown h2, .gradio-markdown h3 {
color: #fff !important;
}
.gradio-markdown {
color: #cbd5e1 !important;
}
"""
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), title=APP_TITLE, css=custom_css) as app:
gr.HTML("""
<div class='gr-card' style='margin-bottom:32px;'>
<div class='gr-title'>Social Perception Analyzer</div>
<div style='color:#94a3b8;font-size:1.2rem;margin-bottom:8px;'>Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)</div>
<div style='color:#22d3ee;font-size:1rem;'>Developed by CDSR</div>
</div>
""")
    # --- STATE MANAGEMENT (reserved for cross-tab reuse; not yet wired to events) ---
    scraper_results_state = gr.State()
    youtube_results_state = gr.State()
with gr.Tabs():
with gr.TabItem("1. News Scraper", id=0):
gr.HTML("<div class='gr-card' style='margin-bottom:24px;'><h2>News Scraper</h2><p>Search and filter news articles from top Bangladeshi sources. Use advanced filters and download results.</p></div>")
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<div class='gr-card'><h3>Search Criteria</h3></div>")
search_keywords_textbox = gr.Textbox(
label="Search Keywords",
placeholder="e.g., বিএনপি সমাবেশ",
info="Keywords to search for in news articles."
)
sites_to_search_textbox = gr.Textbox(
label="Target Sites (Optional, comma-separated)",
placeholder="e.g., prothomalo.com",
info="Limit search to specific news sites."
)
start_date_textbox = gr.Textbox(
label="Start Date",
placeholder="YYYY-MM-DD or 'last week'",
info="Start date for news scraping."
)
end_date_textbox = gr.Textbox(
label="End Date",
placeholder="YYYY-MM-DD or 'today'",
info="End date for news scraping."
)
gr.HTML("<div class='gr-card'><h3>Scraping Parameters</h3></div>")
interval_days_slider = gr.Slider(
1, 7, 3, step=1,
label="Days per Interval",
info="How many days to group each scraping interval."
)
max_pages_slider = gr.Slider(
1, 10, 5, step=1,
label="Max Pages per Interval",
info="Maximum number of pages to fetch per interval."
)
filter_keywords_textbox = gr.Textbox(
label="Filter Keywords (comma-separated, optional)",
placeholder="e.g., নির্বাচন, সরকার",
info="Filter results by these keywords."
)
                    start_scraper_button = gr.Button("Start Scraping & Analysis", variant="primary")
                    # Progress is injected into the click handler via its gr.Progress() default parameter
with gr.Column(scale=2):
gr.HTML("<div class='gr-card'><h3>Filtered Results</h3></div>")
scraper_results_df = gr.DataFrame(
label="Filtered Results",
interactive=True
)
scraper_download_file = gr.File(
label="Download Filtered Results CSV"
)
with gr.TabItem("2. News Analytics", id=1):
gr.HTML("<div class='gr-card' style='margin-bottom:24px;'><h2>News Analytics Dashboard</h2><p>Visualize key metrics, trends, and top sources from scraped news data. All plots and metrics update dynamically.</p></div>")
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<div class='gr-card'><h3>Key Metrics</h3></div>")
kpi_total_articles = gr.HTML()
kpi_unique_media = gr.HTML()
kpi_date_range = gr.HTML()
with gr.Column(scale=2):
gr.HTML("<div class='gr-card'><h3>Trends</h3></div>")
dashboard_timeline_plot = gr.LinePlot(
label="News Volume Timeline"
)
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<div class='gr-card'><h3>Top Sources</h3></div>")
dashboard_media_plot = gr.Plot(
label="Top Media Sources by Article Count"
)
with gr.Column(scale=1):
gr.HTML("<div class='gr-card'><h3>Headline Word Cloud</h3></div>")
dashboard_wordcloud_plot = gr.Plot(
label="Headline Word Cloud"
)
with gr.TabItem("3. YouTube Topic Analysis", id=2):
gr.HTML("<div class='gr-card' style='margin-bottom:24px;'><h2>YouTube Topic Analysis</h2><p>Analyze YouTube video trends, engagement, and comment activity for your search topics.</p></div>")
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<div class='gr-card'><h3>Search Criteria</h3></div>")
yt_search_keywords = gr.Textbox(
label="YouTube Search Keywords",
placeholder="e.g., BNP Rally",
info="Keywords to search for in YouTube videos."
)
yt_max_videos_slider = gr.Slider(
10, 100, 30, step=5,
label="Max Videos for Stats",
info="Maximum number of videos to scan for statistics."
)
yt_num_videos_comments_slider = gr.Slider(
1, 20, 5, step=1,
label="Videos for Comments",
info="Number of top videos to scrape comments from."
)
yt_max_comments_slider = gr.Slider(
10, 200, 50, step=10,
label="Max Comments per Video",
info="Maximum number of comments to fetch per video."
)
yt_published_after = gr.Textbox(
label="Published After (Optional)",
placeholder="YYYY-MM-DD",
info="Only include videos published after this date."
)
                        start_youtube_analysis_button = gr.Button(
                            "Start YouTube Analysis",
                            variant="primary"
                        )
                        # Progress is injected into the click handler via its gr.Progress() default parameter
with gr.Column(scale=2):
gr.HTML("<div class='gr-card'><h3>Video Results</h3></div>")
yt_results_df = gr.DataFrame(
label="YouTube Video Results",
interactive=True
)
yt_videos_download_file = gr.File(
label="Download YouTube Video Results CSV"
)
yt_comments_df = gr.DataFrame(
label="YouTube Comments Results",
interactive=True
)
yt_comments_download_file = gr.File(
label="Download YouTube Comments CSV"
)
yt_dashboard_html = gr.HTML()
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<div class='gr-card'><h3>Top Channels & Engagement</h3></div>")
kpi_yt_videos_found = gr.HTML()
kpi_yt_views_scanned = gr.HTML()
kpi_yt_comments_scraped = gr.HTML()
yt_channel_plot = gr.Plot(
label="Top Channels by Video Volume"
)
yt_channel_dominance_plot = gr.Plot(
label="Channel Dominance by View Count"
)
yt_top_videos_plot = gr.Plot(
label="Top Videos by Comment Count"
)
yt_content_quadrant_plot = gr.Plot(
label="Content Performance Quadrant"
)
yt_engagement_plot = gr.Plot(
label="Top Videos by Engagement Rate"
)
with gr.Column(scale=1):
gr.HTML("<div class='gr-card'><h3>Comment Activity & Word Cloud</h3></div>")
yt_time_series_plot = gr.LinePlot(
label="Comment Activity Over Time"
)
yt_wordcloud_plot = gr.Plot(
label="Bengali Word Cloud from Comments"
)
yt_detailed_summary = gr.HTML()
# --- EVENT HANDLERS ---
    def scraper_button_handler(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress=gr.Progress()):
        """Handle the news-scraper button click."""
        try:
            df, filtered_df = run_news_scraper_pipeline(
                search_keywords, sites, start_date, end_date,
                interval, max_pages, filter_keys, progress=progress
            )
            dashboard = generate_scraper_dashboard(df)
            # Persist the full results to CSV so they can be downloaded
            if not df.empty:
                csv_path = "news_results.csv"
                df.to_csv(csv_path, index=False)
                scraper_download_file = gr.File(value=csv_path, visible=True)
            else:
                scraper_download_file = gr.File(visible=False)
return (
filtered_df,
scraper_download_file,
dashboard["kpi_total_articles"],
dashboard["kpi_unique_media"],
dashboard["kpi_date_range"],
dashboard["dashboard_timeline_plot"],
dashboard["dashboard_media_plot"],
dashboard["dashboard_wordcloud_plot"]
)
        except Exception as e:
            logger.error(f"Error in scraper button handler: {str(e)}")
            gr.Warning(f"An error occurred during scraping: {str(e)}")  # raising gr.Error would skip the UI reset below
return (
pd.DataFrame(),
gr.File(visible=False),
gr.HTML(""), gr.HTML(""), gr.HTML(""),
None, None, None
)
start_scraper_button.click(
fn=scraper_button_handler,
inputs=[
search_keywords_textbox,
sites_to_search_textbox,
start_date_textbox,
end_date_textbox,
interval_days_slider,
max_pages_slider,
filter_keywords_textbox
],
outputs=[
scraper_results_df,
scraper_download_file,
kpi_total_articles,
kpi_unique_media,
kpi_date_range,
dashboard_timeline_plot,
dashboard_media_plot,
dashboard_wordcloud_plot
]
)
    def youtube_button_handler(keywords, max_videos, num_comments_videos, max_comments, published_after, progress=gr.Progress()):
        """Handle the YouTube analysis button click."""
        try:
            videos_df, comments_df, summary_html = run_youtube_analysis_pipeline(
                api_key=None,  # falls back to the YOUTUBE_API_KEY environment variable
                query=keywords,
                max_videos_for_stats=max_videos,
                num_videos_for_comments=num_comments_videos,
                max_comments_per_video=max_comments,
                published_after=published_after,
                progress=progress
            )
yt_videos_csv = "youtube_videos.csv"
yt_comments_csv = "youtube_comments.csv"
if not videos_df.empty:
videos_df.to_csv(yt_videos_csv, index=False)
yt_videos_download_file = gr.File(value=yt_videos_csv, visible=True)
else:
yt_videos_download_file = gr.File(visible=False)
if not comments_df.empty:
if "video_title" not in comments_df.columns and "video_id" in comments_df.columns:
title_map = videos_df.set_index("video_id")["video_title"].to_dict()
comments_df["video_title"] = comments_df["video_id"].map(title_map)
if "channel" not in comments_df.columns and "channel_title" in comments_df.columns:
comments_df["channel"] = comments_df["channel_title"]
comments_df.to_csv(yt_comments_csv, index=False)
yt_comments_download_file = gr.File(value=yt_comments_csv, visible=True)
else:
yt_comments_download_file = gr.File(visible=False)
dashboard = generate_youtube_dashboard(videos_df, comments_df)
return (
videos_df,
yt_videos_download_file,
comments_df,
yt_comments_download_file,
summary_html,
dashboard["kpi_yt_videos_found"],
dashboard["kpi_yt_views_scanned"],
dashboard["kpi_yt_comments_scraped"],
dashboard["yt_channel_plot"],
dashboard["yt_channel_dominance_plot"],
dashboard["yt_time_series_plot"],
dashboard["yt_top_videos_plot"],
dashboard["yt_content_quadrant_plot"],
dashboard["yt_engagement_plot"],
dashboard["yt_wordcloud_plot"],
dashboard["yt_detailed_summary"]
)
        except Exception as e:
            logger.error(f"Error in YouTube button handler: {str(e)}")
            gr.Warning(f"An error occurred during YouTube analysis: {str(e)}")  # raising gr.Error would skip the UI reset below
return (
pd.DataFrame(), # yt_results_df
gr.File(visible=False), # yt_videos_download_file
pd.DataFrame(), # yt_comments_df
gr.File(visible=False), # yt_comments_download_file
gr.HTML(""), # yt_dashboard_html
gr.HTML(""), # kpi_yt_videos_found
gr.HTML(""), # kpi_yt_views_scanned
gr.HTML(""), # kpi_yt_comments_scraped
None, # yt_channel_plot
None, # yt_channel_dominance_plot
None, # yt_time_series_plot
None, # yt_top_videos_plot
None, # yt_content_quadrant_plot
None, # yt_engagement_plot
None, # yt_wordcloud_plot
gr.HTML("") # yt_detailed_summary
)
start_youtube_analysis_button.click(
fn=youtube_button_handler,
inputs=[
yt_search_keywords,
yt_max_videos_slider,
yt_num_videos_comments_slider,
yt_max_comments_slider,
yt_published_after
],
outputs=[
yt_results_df,
yt_videos_download_file,
yt_comments_df,
yt_comments_download_file,
yt_dashboard_html,
kpi_yt_videos_found,
kpi_yt_views_scanned,
kpi_yt_comments_scraped,
yt_channel_plot,
yt_channel_dominance_plot,
yt_time_series_plot,
yt_top_videos_plot,
yt_content_quadrant_plot,
yt_engagement_plot,
yt_wordcloud_plot,
yt_detailed_summary
]
)
# ==============================================================================
# LAUNCH THE APP
# ==============================================================================
# NOTE: demo credentials only; in a real deployment load these from environment
# variables or a secrets manager instead of hardcoding them in source.
AUTH_USERS = [
    ("admin", "admin123"),
    ("user", "user123")
]
if __name__ == "__main__":
app.launch(debug=True, share=True, auth=AUTH_USERS, ssr_mode=False)