Arjon07CSE committed on
Commit c0df0eb · verified · 1 parent: e4bb20b

Added YouTube analyzer section

Files changed (1):
app.py (+547 −658)
app.py CHANGED
@@ -1,4 +1,9 @@
1
- # --- IMPORTS & GLOBAL SETUP ---
2
  import gradio as gr
3
  import pandas as pd
4
  import numpy as np
@@ -8,41 +13,64 @@ import sqlite3
8
  import json
9
  import logging
10
  import requests
11
  from io import StringIO
12
 
13
- # Transformers and BERTopic components
14
  from transformers import pipeline, BitsAndBytesConfig
15
  from sentence_transformers import SentenceTransformer
16
- from bertopic import BERTopic
17
- from bertopic.representation import KeyBERTInspired
18
- from umap import UMAP
19
- from hdbscan import HDBSCAN
20
- from sklearn.feature_extraction.text import CountVectorizer
21
-
22
- # Hugging Face and Colab integration (optional, for LLM access)
23
- from huggingface_hub import login
24
- # from google.colab import userdata # We will disable this for HF Spaces deployment
25
-
26
- # Setup basic logging to monitor the application's health
27
- logging.basicConfig(
28
- level=logging.INFO,
29
- format='%(asctime)s - %(levelname)s - %(message)s'
30
- )
31
-
32
- # A simple dictionary to hold data between UI interactions, acting as a session state.
33
- APP_STATE = {
34
- "df": None,
35
- "bertopic_model": None,
36
- "topics_df": None,
37
- "final_df": None,
38
- }
39
-
40
- print("✅ app.py created. Initial imports written.")
41
- print("✅ Dependencies installed in Colab environment.")
42
-
43
- # --- TEXT PREPROCESSING & NORMALIZATION ---
44
-
45
- # A comprehensive list of Bangla stop words, tailored for news and general text.
46
  BANGLA_STOP_WORDS = [
47
  'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
48
  'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
@@ -61,669 +89,530 @@ BANGLA_STOP_WORDS = [
61
  'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
62
  ]
63
 
64
- def normalize_bangla_manual(text):
65
- """A robust, self-contained function to normalize Bangla text."""
66
- if not isinstance(text, str): return ""
67
- replacements = {
68
- '[\u09F7]': '\u09B0', '[\u09F2]': '\u09B2', '[\u09E4]': '\u098B', '[\u09E5]': '\u09E1',
69
- '[\u09FA]': '\u09B8\u09CD\u09AE', '[\u09FB]': '\u0995\u09CD\u09B7', '[\u0970]': '\u0966',
70
- '[\u09F3]': '\u09B0\u09C2', '[\u09F8]': '\u09A3', '[\u09F9]': '\u09B6', '[\u0984]': '',
71
- '[\u0980]': '\u0981', r'(\s)।(\s)': r'\1।\2', r'(\S)।(\S)': r'\1 । \2',
72
- '[\u0964][\u0964]': '\u0964', '[|]': '\u0964', '[\u09DC]': '\u09A1\u09BC',
73
- '[\u09DD]': '\u09A2\u09BC', '[\u09DF]': '\u09AF\u09BC',
74
- }
75
- for old, new in replacements.items():
76
- text = re.sub(old, new, text)
77
- return text
78
-
79
- def preprocess_bangla_text(text):
80
- """Cleans and normalizes a single Bangla text string for NLP tasks."""
81
- if not isinstance(text, str): return ""
82
- text = normalize_bangla_manual(text)
83
- text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
84
- text = re.sub(r'\S*@\S*\s?', '', text)
85
- text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
86
- words = text.split()
87
- words = [word for word in words if word not in BANGLA_STOP_WORDS]
88
- text = " ".join(words)
89
- return re.sub(r'\s+', ' ', text).strip()
90
-
91
- print("✅ Helper functions appended to app.py")
92
-
93
- # --- APP BRANDING & CONFIGURATION ---
94
- # Easily update the application's title, tagline, and footer here.
95
- APP_TITLE = "Social Perception Analyzer"
96
- APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
97
- APP_FOOTER = "Developed by Centre for Data Science Research (CDSR), and Strategy and Policy Forum (SPF)"
98
99
 
100
- # --- LOCAL LLM INITIALIZATION ---
101
- def initialize_local_llm(hf_token=None):
102
- """
103
- Initializes and returns a local, quantized, lightweight LLM pipeline.
104
- This model is chosen for its efficiency and Bangla language specialization.
105
- """
106
- model_id = "hishab/titulm-llama-3.2-1b-v1.1"
107
 
108
- # 4-bit quantization to reduce memory usage significantly
109
- quantization_config = BitsAndBytesConfig(
110
- load_in_4bit=True,
111
- bnb_4bit_compute_dtype=torch.bfloat16
112
- )
113
 
114
- try:
115
- # Check for GPU availability
116
- if not torch.cuda.is_available():
117
- logging.warning("GPU not available. LLM will run on CPU and be very slow.")
118
- llm_pipeline = pipeline("text-generation", model=model_id, token=hf_token)
119
- else:
120
- logging.info(f"Initializing quantized local LLM: {model_id} on GPU.")
121
- llm_pipeline = pipeline(
122
- "text-generation",
123
- model=model_id,
124
- model_kwargs={"quantization_config": quantization_config},
125
- device_map="auto",
126
- token=hf_token
127
- )
128
- return llm_pipeline
129
- except Exception as e:
130
- logging.error(f"Failed to initialize local LLM: {e}")
131
- # Add a note about potential trust issues for some models
132
- logging.info("Trying again with 'trust_remote_code=True'.")
133
- try:
134
- llm_pipeline = pipeline(
135
- "text-generation",
136
- model=model_id,
137
- model_kwargs={"trust_remote_code": True, "quantization_config": quantization_config},
138
- device_map="auto",
139
- token=hf_token
140
- )
141
- return llm_pipeline
142
- except Exception as e2:
143
- logging.error(f"Secondary attempt failed: {e2}")
144
- gr.Warning("Could not initialize the local LLM. AI features will be disabled.")
145
- return None
146
-
147
- # --- DATA LOADING HELPER ---
148
- def load_data(file_obj, gsheet_url):
149
- """Loads a DataFrame from an uploaded file or a direct Google Sheets CSV URL."""
150
- if file_obj is not None:
151
- logging.info(f"Loading data from uploaded file: {file_obj.name}")
152
- return pd.read_csv(file_obj.name)
153
- elif gsheet_url and gsheet_url.strip():
154
- logging.info(f"Loading data directly from URL: {gsheet_url}")
155
  try:
156
- # FIX: Removed the unreliable .replace() logic.
157
- # We now expect a direct CSV link from the user.
158
- response = requests.get(gsheet_url)
159
- response.raise_for_status() # Raise an exception for bad status codes
160
- return pd.read_csv(StringIO(response.text))
161
  except Exception as e:
162
- raise ValueError(f"Failed to load from URL. Please ensure it is a direct CSV link. Error: {e}")
163
- else:
164
- raise ValueError("Please upload a CSV file or provide a public Google Sheets URL.")
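# A minimal, optional sketch (not part of this commit): load_data() above expects a direct CSV
# link, so a caller could derive one from a standard Google Sheets URL. The helper name
# `to_csv_export_url` is illustrative, not an existing function in this app.
def to_csv_export_url(sheet_url: str) -> str:
    """Best-effort conversion of a .../spreadsheets/d/<ID>/edit... URL to a CSV export URL."""
    match = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", sheet_url)
    if match:
        return f"https://docs.google.com/spreadsheets/d/{match.group(1)}/export?format=csv"
    return sheet_url  # already a direct link (or unrecognized) -> pass through unchanged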
165
-
166
- # --- MAIN ANALYSIS ENGINE ---
167
-
168
- # We will define the AI agent in the next cell. For now, this is a placeholder.
169
- LLM_PIPELINE = None
170
-
171
- def run_analysis_pipeline(file_obj, gsheet_url, text_columns, analysis_mode, manual_seeds,
172
- top_n_topics_slider, enable_ai_merging, hf_token, progress=gr.Progress()):
173
- """
174
- The main orchestrator function for the analysis pipeline.
175
- This function incorporates all our agreed-upon refinements.
176
- """
177
- global LLM_PIPELINE
178
- if enable_ai_merging and LLM_PIPELINE is None:
179
- progress(0, desc="Initializing LLM...")
180
- LLM_PIPELINE = initialize_local_llm(hf_token)
181
- if LLM_PIPELINE is None:
182
- gr.Warning("AI features enabled, but LLM failed to initialize. Skipping AI steps.")
183
- enable_ai_merging = False
184
-
185
- # === STEP 1: LOAD AND VALIDATE DATA ===
186
- progress(0.1, desc="Step 1/8: Loading and Validating Data...")
187
  try:
188
- df = load_data(file_obj, gsheet_url)
189
- if not text_columns: raise ValueError("Please select at least one text column to analyze.")
190
- df['combined_text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
191
- df.dropna(subset=['combined_text'], inplace=True)
192
- df['processed_text'] = df['combined_text'].apply(preprocess_bangla_text)
193
-
194
- # REFINEMENT: Filter by word count for more robust document validation.
195
- df_analysis = df[df['processed_text'].str.split().str.len() > 2].copy()
196
- if df_analysis.empty:
197
- raise ValueError("No documents with sufficient content found after cleaning. Please check your data and column selection.")
198
- documents = df_analysis['processed_text'].tolist()
199
- APP_STATE["df"] = df_analysis # Save the analyzable dataframe
200
  except Exception as e:
201
- logging.error(f"Data Loading Error: {e}")
202
- return {log_output: f"Error during data loading: {e}"}
203
 
204
- # === STEP 2: PREPARE GUIDANCE (IF MANUAL SEEDING) ===
205
- progress(0.2, desc="Step 2/8: Preparing Analysis Mode...")
206
- y_guidance = None
207
- if analysis_mode == "Manual Seeding" and manual_seeds:
208
  try:
209
- seed_topics_dict = json.loads(manual_seeds)
210
- y_guidance = [-1] * len(documents)
211
- topic_name_to_id = {name: i for i, name in enumerate(seed_topics_dict.keys())}
212
- for i, doc in enumerate(documents):
213
- for topic_name, keywords in seed_topics_dict.items():
214
- if any(keyword in doc for keyword in keywords):
215
- y_guidance[i] = topic_name_to_id[topic_name]
216
- break # Prioritizes the first match in the JSON
217
- except Exception as e:
218
- return {log_output: f"Error: Invalid JSON in Manual Seeds. Details: {e}"}
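# Illustrative example (hypothetical seed JSON): with
#   manual_seeds = '{"সমাবেশ": ["সমাবেশ", "জনসভা"], "নির্বাচন": ["ভোট", "নির্বাচন"]}'
# a document containing "ভোট" receives y_guidance value 1 (the enumeration index of "নির্বাচন"),
# while documents matching no seed keyword keep the default guidance of -1.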
219
-
220
- # === STEP 3: EMBEDDINGS & MODEL SETUP (WITH REFINEMENTS) ===
221
- progress(0.3, desc="Step 3/8: Calculating Document Embeddings...")
222
- embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
223
- embeddings = embedding_model.encode(documents, show_progress_bar=True)
224
-
225
- # REFINEMENT: Lower min_cluster_size for more sensitive topic detection.
226
- hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
227
- # REFINEMENT: Use max_df and min_df for adaptive stop word filtering.
228
- vectorizer_model = CountVectorizer(tokenizer=lambda doc: doc.split(), ngram_range=(1, 3), max_df=0.90, min_df=5)
229
-
230
- # Other components remain robust
231
- umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
232
- representation_model = KeyBERTInspired()
233
-
234
- # === STEP 4: TRAIN TOPIC MODEL ===
235
- progress(0.5, desc="Step 4/8: Training BERTopic Model...")
236
- topic_model = BERTopic(
237
- embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
238
- vectorizer_model=vectorizer_model, representation_model=representation_model,
239
- language="multilingual", verbose=False
240
- )
241
- topics, _ = topic_model.fit_transform(documents, embeddings, y=y_guidance)
242
-
243
- # === STEP 5: AI REFINEMENT (IF ENABLED) ===
244
- if enable_ai_merging and LLM_PIPELINE:
245
- progress(0.6, desc="Step 5/8: Running AI Refinement Agent...")
246
- # We will define `run_ai_refinement` in the next cell. This is the hook.
247
- topic_model = run_ai_refinement(topic_model, LLM_PIPELINE, progress)
248
- else:
249
- progress(0.6, desc="Step 5/8: Skipping AI Refinement...")
250
- # Fallback to default naming if AI is disabled
251
- generated_labels = topic_model.generate_topic_labels(nr_words=4, separator=", ")
252
- topic_model.set_topic_labels(generated_labels)
253
-
254
- # === STEP 6: APPLY MANUAL SEED NAMES ===
255
- progress(0.7, desc="Step 6/8: Finalizing Topic Names...")
256
- if analysis_mode == "Manual Seeding" and 'seed_topics_dict' in locals():
257
- for topic_name, topic_id in topic_name_to_id.items():
258
- if topic_id in topic_model.get_topic_info()['Topic'].values:
259
- topic_model.set_topic_labels({topic_id: topic_name})
260
-
261
- # === STEP 7: PREPARE FINAL OUTPUTS & VISUALIZATIONS ===
262
- progress(0.85, desc="Step 7/8: Preparing Visualizations...")
263
- APP_STATE["bertopic_model"] = topic_model
264
- df_analysis['Topic'] = topics
265
- APP_STATE["final_df"] = df_analysis
266
- topics_df = topic_model.get_topic_info()
267
- APP_STATE["topics_df"] = topics_df
268
-
269
- # REFINEMENT: Safeguard against memory errors on very large datasets.
270
- if len(documents) > 50000:
271
- gr.Info("Dataset is large. Visualizing a sample of 50,000 documents for performance.")
272
- indices = np.random.choice(len(documents), 50000, replace=False)
273
- sampled_docs = [documents[i] for i in indices]
274
- sampled_embeddings = embeddings[indices]
275
- doc_topic_landscape_plot = topic_model.visualize_documents(sampled_docs, embeddings=sampled_embeddings)
276
- else:
277
- doc_topic_landscape_plot = topic_model.visualize_documents(documents, embeddings=embeddings)
278
-
279
- inter_topic_map_plot = topic_model.visualize_topics()
280
- # REFINEMENT: Use slider value for dynamic chart generation.
281
- num_chart_topics = int(top_n_topics_slider)
282
- top_topics_barchart_plot = topic_model.visualize_barchart(top_n_topics=num_chart_topics)
283
- topic_similarity_heatmap_plot = topic_model.visualize_heatmap(top_n_topics=num_chart_topics)
284
- topic_hierarchy_plot = topic_model.visualize_hierarchy(top_n_topics=num_chart_topics)
285
-
286
- review_topic_table = topics_df[['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'})
287
-
288
- # Check for date columns for the temporal analysis tab
289
- date_columns = [col for col in df_analysis.columns if pd.to_datetime(df_analysis[col], errors='coerce').notna().any()]
290
 
291
- # === STEP 8: UPDATE UI WITH RESULTS ===
292
- progress(1.0, desc="Step 8/8: Finalizing UI...")
293
  return {
294
- log_output: f"✅ Analysis Complete! Discovered {len(topics_df)-1} topics.",
295
- # Make result tabs visible
296
- review_tab: gr.update(visible=True),
297
- visualize_tab: gr.update(visible=True),
298
- # Populate the review tab
299
- review_topic_table_df: gr.update(value=review_topic_table),
300
- # Populate the visualization tab
301
- doc_topic_landscape_plot_ui: doc_topic_landscape_plot,
302
- inter_topic_map_plot_ui: inter_topic_map_plot, # Hook for the fixed plot
303
- top_topics_barchart_plot_ui: top_topics_barchart_plot,
304
- topic_similarity_heatmap_ui: topic_similarity_heatmap_plot,
305
- topic_hierarchy_plot_ui: topic_hierarchy_plot,
306
- # Update and enable the temporal analysis tab if date columns exist
307
- temporal_analysis_group: gr.update(visible=len(date_columns) > 0),
308
- date_column_dropdown: gr.update(choices=date_columns, value=date_columns[0] if date_columns else None),
309
- }
310
-
311
- print("✅ Main analysis pipeline function appended to app.py")
312
-
313
- # --- AI REFINEMENT AGENT ---
314
-
315
- def run_ai_refinement(topic_model, llm_pipeline, progress=gr.Progress()):
316
- """
317
- Uses a lightweight LLM to generate high-quality, contextual topic names.
318
- Includes a conceptual hook for future AI-powered topic merging.
319
- """
320
- logging.info("Starting AI Refinement Agent...")
321
-
322
- # --- Task 1: AI-Powered Topic Naming ---
323
- progress(0, desc="AI Agent: Generating Topic Names...")
324
- topic_info_df = topic_model.get_topic_info()
325
- new_labels = {}
326
-
327
- # This is the advanced, few-shot Bangla prompt we designed.
328
- # It will be used for each topic.
329
- prompt_template = """
330
- আপনি একজন পেশাদার সংবাদ সম্পাদক। আপনার কাজ হলো বাংলাদেশের রাজনৈতিক ঘটনাবলী, বিশেষ করে বিএনপির 'তারুণ্যের সমাবেশ' সংক্রান্ত সংবাদের জন্য একটি সংক্ষিপ্ত ও প্রাসঙ্গিক শিরোনাম তৈরি করা। প্রদত্ত কীওয়ার্ডগুলো ব্যবহার করে একটি (৩-৫ শব্দের) সারগর্ভ বাংলা শিরোনাম লিখুন, যেখানে সমাবেশের মূল বিষয় বা স্থান স্পষ্টভাবে ফুটে উঠবে। উদাহরণগুলো দেখুন।
331
-
332
- --- উদাহরণ ---
333
- ইনপুট কীওয়ার্ড: ['খুলনা', 'তারুণ্যের', 'সমাবেশ', 'বিএনপি']
334
- আউটপুট শিরোনাম: খুলনায় বিএনপির তারুণ্যের সমাবেশ
335
-
336
- ইনপুট কীওয়ার্ড: ['ঢাকা', 'নয়াপল্টন', 'তারুণ্যের', 'স্রোত', 'বৃষ্টি']
337
- আউটপুট শিরোনাম: ঢাকায় তারুণ্যের সমাবেশে জনতার ঢল
338
-
339
- ইনপুট কীওয়ার্ড: ['চট্টগ্রাম', 'বক্তব্য', 'মির্জা ফখরুল', 'শোডাউন']
340
- আউটপুট শিরোনাম: চট্টগ্রামে মির্জা ফখরুলের তারুণ্যের সমাবেশ
341
- --- উদাহরণের শেষ ---
342
-
343
- --- আপনার কাজ ---
344
- ইনপুট কীওয়ার্ড: {keywords}
345
- আউটপুট শিরোনাম:
346
- """
347
-
348
- # Tuned parameters for reliable, non-creative naming
349
- generation_params = {
350
- "temperature": 0.3,
351
- "max_new_tokens": 30,
352
- "repetition_penalty": 1.2,
353
- "do_sample": True
354
  }
355
 
356
- # Iterate through each topic to generate a new name
357
- for index, row in topic_info_df.iterrows():
358
- topic_id = row['Topic']
359
- if topic_id == -1:
360
- # We don't rename the outlier topic
361
- new_labels[topic_id] = "Topic -1: Outliers"
362
- continue
363
-
364
- keywords = row['Representation']
365
-
366
- # Format the prompt for the current topic
367
- prompt = prompt_template.format(keywords=keywords)
368
-
369
- try:
370
- # Call the LLM pipeline
371
- response = llm_pipeline(prompt, **generation_params)
372
- # Extract the generated text, stripping whitespace and the prompt's artifacts
373
- generated_name = response[0]['generated_text'].split("আউটপুট শিরোনাম:")[1].strip()
374
-
375
- if generated_name:
376
- new_labels[topic_id] = f"Topic {topic_id}: {generated_name}"
377
- logging.info(f"Generated name for Topic {topic_id}: {generated_name}")
378
- else:
379
- # Fallback to default name if generation fails
380
- new_labels[topic_id] = topic_model.get_topic_label(topic_id, nr_words=4)
381
- except Exception as e:
382
- logging.error(f"LLM failed for Topic {topic_id}. Error: {e}")
383
- # Fallback for safety
384
- new_labels[topic_id] = topic_model.get_topic_label(topic_id, nr_words=4)
385
-
386
- progress.update((index + 1) / len(topic_info_df))
387
-
388
- # Apply all the new, AI-generated labels at once
389
- topic_model.set_topic_labels(new_labels)
390
- logging.info("✅ AI Naming complete.")
391
-
392
- # --- Task 2: AI-Powered Merging (Conceptual Hook) ---
393
- # This section is a placeholder for a future enhancement.
394
- # The logic would be:
395
- # 1. Calculate topic similarity matrix.
396
- # 2. Identify pairs with similarity > threshold (e.g., 0.85).
397
- # 3. Use a "Judge" prompt to ask the LLM if they should be merged.
398
- # 4. If LLM says "YES", call `topic_model.merge_topics()`.
399
- logging.info("Skipping AI Topic Merging (conceptual feature).")
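# A minimal sketch of that future merging step (hedged assumptions: a similarity threshold of
# 0.85, a hypothetical helper ask_llm_should_merge() wrapping the "Judge" prompt, and the
# original `documents` list being passed into this function, since BERTopic's merge_topics()
# requires the documents as its first argument):
#
#   for id_a, id_b in candidate_pairs:  # pairs whose topic similarity exceeds 0.85
#       if ask_llm_should_merge(llm_pipeline, topic_model.get_topic(id_a), topic_model.get_topic(id_b)):
#           topic_model.merge_topics(documents, topics_to_merge=[id_a, id_b])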
400
-
401
- return topic_model
402
-
403
- print("✅ AI Refinement Agent function appended to app.py")
404
-
405
- # --- FINAL BACKEND HANDLERS & HELPERS ---
406
-
407
- def get_topic_details(topic_id: int):
408
- """Fetches details for a selected topic to display in the review tab."""
409
- empty_return = {topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
410
- model = APP_STATE.get("bertopic_model")
411
- if model is None or topic_id is None: return empty_return
412
- try:
413
- topic_id = int(topic_id)
414
- topic_info = model.get_topic_info(topic_id=topic_id)
415
- if topic_info.empty: return empty_return
416
-
417
- # Strip the "Topic X: " prefix for cleaner editing
418
- topic_name = topic_info['Name'].iloc[0]
419
- cleaned_name = re.sub(r'^Topic \d+:\s*', '', topic_name)
420
-
421
- # For the outlier topic, don't generate plots
422
- if topic_id == -1:
423
- return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
424
-
425
- word_cloud_fig = model.visualize_barchart(top_n_topics=1, topics=[topic_id])
426
- docs_df = pd.DataFrame(model.get_representative_docs(topic_id), columns=['Representative Document'])
427
- return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: word_cloud_fig, topic_docs_df: docs_df}
428
- except Exception as e:
429
- logging.error(f"Error getting topic details for ID {topic_id}: {e}")
430
- return empty_return
431
-
432
- def update_topic_name(topic_id, new_name):
433
- """Handler for manual topic renaming."""
434
- model = APP_STATE.get("bertopic_model")
435
- if model and topic_id is not None and new_name:
436
- topic_id = int(topic_id)
437
- # Add the prefix back for consistency
438
- full_name = f"Topic {topic_id}: {new_name}"
439
- model.set_topic_labels({topic_id: full_name})
440
- APP_STATE["topics_df"] = model.get_topic_info()
441
- gr.Info(f"Topic {topic_id} renamed to '{new_name}'")
442
- # Return the updated table for the UI
443
- return gr.update(value=APP_STATE["topics_df"][['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'}))
444
- return gr.update() # No change
445
-
446
- def merge_selected_topics(topics_to_merge):
447
- """Handler for manual topic merging."""
448
- model = APP_STATE.get("bertopic_model")
449
- if model and topics_to_merge and len(topics_to_merge) > 1:
450
- # Convert topic names like "Topic 0: ..." to integer IDs
451
- topic_ids = [int(re.search(r'\d+', t).group()) for t in topics_to_merge]
452
-
453
- model.merge_topics(topics_to_merge=[topic_ids])
454
-
455
- # After merging, we need to refresh the state and UI components
456
- APP_STATE["topics_df"] = model.get_topic_info()
457
- review_topic_table = APP_STATE["topics_df"][['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'})
458
-
459
- gr.Info(f"Successfully merged topics: {topic_ids}")
460
- return {
461
- review_topic_table_df: gr.update(value=review_topic_table),
462
- # Clear the selection and the details view
463
- topic_merger_checkboxgroup: gr.update(value=[]),
464
- topic_name_textbox: "",
465
- topic_word_cloud_plot: None,
466
- topic_docs_df: pd.DataFrame(),
467
- }
468
- gr.Warning("Please select at least two topics to merge.")
469
- return {review_topic_table_df: gr.update(), topic_merger_checkboxgroup: gr.update()}
470
-
471
-
472
- def generate_temporal_plot(date_column, progress=gr.Progress()):
473
- """Generates and displays the topics over time plot."""
474
- progress(0, desc="Preparing time data...")
475
- if not date_column: return None
476
- model, df = APP_STATE.get("bertopic_model"), APP_STATE.get("final_df")
477
- if model is None or df is None: return None
478
-
479
- df_temporal = df.copy()
480
- df_temporal['timestamp'] = pd.to_datetime(df_temporal[date_column], errors='coerce')
481
- df_temporal.dropna(subset=['timestamp'], inplace=True)
482
-
483
- if df_temporal.empty:
484
- gr.Warning(f"The column '{date_column}' contains no valid dates after conversion.")
485
- return None
486
-
487
- progress(0.6, desc="Generating topic trends over time...")
488
- try:
489
- # BERTopic requires the original documents and timestamps for this plot
490
- docs_temporal = df_temporal['processed_text'].tolist()
491
- timestamps_temporal = df_temporal['timestamp'].tolist()
492
- topics_over_time = model.topics_over_time(docs=docs_temporal, timestamps=timestamps_temporal)
493
- return model.visualize_topics_over_time(topics_over_time)
494
- except Exception as e:
495
- gr.Error(f"Could not generate temporal plot. This can happen if topics are not found in the selected time range. Error: {e}")
496
- return None
497
-
498
- def generate_media_analysis(media_column):
499
- """Generates a horizontal bar chart for media source analysis to prevent label overlap."""
500
- if not media_column:
501
- gr.Warning("Please select a media column to analyze.")
502
- return None
503
- df = APP_STATE.get("df")
504
- if df is None or media_column not in df.columns:
505
- return None
506
 
507
- counts = df[media_column].value_counts().nlargest(20).sort_values() # Get top 20 and sort for a nice plot
508
 
509
- plot_df = pd.DataFrame({'Media Source': counts.index, 'Article Count': counts.values})
510
 
511
- # FIX: Swapped x and y to create a horizontal plot.
512
- return gr.BarPlot(
513
- plot_df,
514
- x='Article Count', # The numeric value is now on the x-axis
515
- y='Media Source', # The categorical labels are now on the y-axis
516
- title='Top 20 Media Sources by Article Count',
517
- tooltip=['Media Source', 'Article Count'],
518
- height=500,
519
- # FIX: Changed to horizontal_guides
520
- horizontal_guides=[{'value': counts.mean(), 'label': 'Average'}]
521
- )
522
-
523
- def finalize_and_save():
524
- """Saves the final DataFrame and topic definitions to files."""
525
- if APP_STATE.get("final_df") is None or APP_STATE.get("topics_df") is None:
526
- gr.Warning("No data available to save.")
527
- return None
528
 
529
- final_df_to_save, topics_df_to_save = APP_STATE["final_df"].copy(), APP_STATE["topics_df"].copy()
530
 
531
- # Convert list columns to JSON strings for compatibility
532
- for col in ['Representation', 'Representative_Docs']:
533
- if col in topics_df_to_save.columns:
534
- topics_df_to_save[col] = topics_df_to_save[col].apply(
535
- lambda x: json.dumps(x) if isinstance(x, list) else x
536
- )
537
-
538
- db_path, csv_path = "topic_analysis_results.sqlite", "labeled_documents.csv"
539
-
540
- with sqlite3.connect(db_path) as conn:
541
- topics_df_to_save.to_sql("topic_definitions", conn, if_exists="replace", index=False)
542
- final_df_to_save.to_sql("enriched_documents", conn, if_exists="replace", index=False)
543
-
544
- topic_map = topics_df_to_save.set_index('Topic')['Name'].to_dict()
545
- final_df_to_save['Topic_Name'] = final_df_to_save['Topic'].map(topic_map)
546
- final_df_to_save.to_csv(csv_path, index=False, encoding='utf-8-sig')
547
-
548
- gr.Info(f"Results saved to {db_path} and {csv_path}")
549
- return [db_path, csv_path]
550
 
551
- print("✅ Final backend handlers appended to app.py")
552
 
553
- # --- GRADIO UI LAYOUT & EVENT HANDLERS ---
 
554
 
555
- with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE) as app:
556
- gr.Markdown(f"# {APP_TITLE}")
557
- gr.Markdown(f"*{APP_TAGLINE}*")
558
 
559
  with gr.Tabs() as tabs:
560
- # === SETUP & RUN TAB ===
561
- with gr.TabItem("1. Setup & Run Analysis", id=0):
562
  with gr.Row():
563
  with gr.Column(scale=1):
564
- gr.Markdown("### 1. Data Input")
565
- file_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
566
- gsheet_url = gr.Textbox(
567
- label="Or Paste Google Sheets URL",
568
- placeholder="https://docs.google.com/spreadsheets/d/e/.../pub?output=csv",
569
- # FIX: Using triple quotes to create a correctly terminated string
570
- # that is also more readable.
571
- info="""How to get the link: In Google Sheets, go to File > Share > Publish to web.
572
- Select 'Comma-separated values (.csv)' and copy the generated link.
573
- Example: https://docs.google.com/spreadsheets/d/e/2PACX-1vTn-mRrOCk6fww892XfziUk63pJu9g8uOdy4nHjygKXcN7oO3EAhXLMD7WZAatvoLubSPpMdQ5ymouz/pub?output=csv"""
574
- )
575
-
576
-
577
- gr.Markdown("### 2. Select Columns")
578
- text_columns_checkboxgroup = gr.CheckboxGroup(label="Select Text Columns for Analysis", interactive=True)
579
-
580
- gr.Markdown("### 3. Configure Analysis")
581
- analysis_mode_radio = gr.Radio(["Discovery Mode", "Manual Seeding"], value="Discovery Mode", label="Analysis Mode")
582
- manual_seeds_textbox = gr.Textbox(label="Manual Seed Topics (JSON format)", visible=False, lines=5)
583
- # FIX: Assign the markdown to a variable so we can target it directly
584
- manual_seeds_example = gr.Markdown("Example: `{\"Topic A\": [\"keyword1\", \"keyword2\"], \"Topic B\": [\"wordA\", \"wordB\"]}`", visible=False)
585
-
586
- top_n_topics_slider = gr.Slider(label="Number of Topics for Charts", minimum=5, maximum=50, value=15, step=1)
587
-
588
- gr.Markdown("### 4. Advanced (Optional)")
589
- enable_ai_merging_checkbox = gr.Checkbox(label="Enable AI Topic Naming (Requires GPU & HF Token)", value=False)
590
- hf_token_textbox = gr.Textbox(label="Hugging Face Token", type="password", placeholder="hf_...", info="Required if AI is enabled.")
591
-
592
- start_button = gr.Button("Start Analysis", variant="primary")
593
-
594
  with gr.Column(scale=2):
595
- log_output = gr.Textbox(label="Pipeline Progress", lines=25, interactive=False, autoscroll=True)
 
596
 
597
- # === REVIEW & FINALIZE TAB ===
598
- with gr.TabItem("2. Review & Finalize", id=1, visible=False) as review_tab:
599
- gr.Markdown("### Review, Refine, and Finalize Your Topic Model")
600
- with gr.Row():
601
- with gr.Column(scale=2):
602
- gr.Markdown("**Topics Found**")
603
- review_topic_table_df = gr.DataFrame(headers=["ID", "Topic Name", "Documents"], interactive=True, wrap=True, scale=2)
604
- with gr.Column(scale=3):
605
- gr.Markdown("**Selected Topic Details**")
606
- topic_id_state = gr.State() # Hidden state to store the selected topic ID
607
- topic_name_textbox = gr.Textbox(label="Topic Name (Editable)")
608
- update_name_button = gr.Button("Update Name")
609
- topic_word_cloud_plot = gr.Plot(label="Top Words for Selected Topic")
610
- topic_docs_df = gr.DataFrame(headers=["Representative Document"], wrap=True)
611
-
612
- with gr.Row():
613
- gr.Markdown("### Manual Topic Merging")
614
- with gr.Row():
615
- topic_merger_checkboxgroup = gr.CheckboxGroup(label="Select 2 or more topics to merge", interactive=True)
616
- merge_button = gr.Button("Merge Selected Topics", variant="stop")
617
- with gr.Row():
618
- finalize_button = gr.Button("Save Final Results to Files", variant="primary")
619
- download_link = gr.File(label="Download Results (SQLite DB and CSV)", file_count="multiple")
620
-
621
-
622
- # === VISUALIZE & EXPLORE TAB ===
623
- with gr.TabItem("3. Visualize & Explore", id=2, visible=False) as visualize_tab:
624
- with gr.Tabs():
625
- with gr.TabItem("Document Landscape"):
626
- gr.Markdown("A 2D map of every document, colored by its assigned topic. This shows the overall structure of your data.")
627
- doc_topic_landscape_plot_ui = gr.Plot()
628
- with gr.TabItem("Topic Relationships"):
629
- gr.Markdown("Visualizations showing how topics relate to each other.")
630
- inter_topic_map_plot_ui = gr.Plot(label="Inter-Topic Distance Map")
631
- topic_hierarchy_plot_ui = gr.Plot(label="Hierarchical Clustering of Topics")
632
- topic_similarity_heatmap_ui = gr.Plot(label="Topic Similarity Heatmap")
633
- with gr.TabItem("Topic Keywords"):
634
- gr.Markdown("A bar chart showing the most important keywords for the most prominent topics.")
635
- top_topics_barchart_plot_ui = gr.Plot()
636
- with gr.TabItem("Temporal Analysis"):
637
- with gr.Group(visible=False) as temporal_analysis_group:
638
- gr.Markdown("Select a date column from your data to see how topic popularity has changed over time.")
639
  with gr.Row():
640
- date_column_dropdown = gr.Dropdown(label="Select Date Column")
641
- generate_trends_button = gr.Button("Generate Trend Plot")
642
- temporal_plot_ui = gr.Plot()
643
 
644
- # === SOURCE ANALYSIS TAB ===
645
- with gr.TabItem("4. Source Analysis", id=3, visible=False) as source_tab:
646
- gr.Markdown("### Analyze the Distribution of News Sources")
647
  with gr.Row():
648
- media_column_dropdown = gr.Dropdown(label="Select Your Media/Source Column")
649
- analyze_media_button = gr.Button("Analyze Sources")
650
- with gr.Row():
651
- media_plot = gr.BarPlot()
652
 
653
- gr.Markdown(f"<div style='text-align: center;'>{APP_FOOTER}</div>")
654
 
655
- # --- EVENT HANDLERS ---
656
 
657
- def update_column_selector(file, url):
658
- """Populates column selectors after data is loaded."""
659
- # This function also makes the source analysis tab visible if data loads
660
- if file is None and not url:
661
- return {text_columns_checkboxgroup: gr.update(choices=[], value=None), media_column_dropdown: gr.update(choices=[], value=None), source_tab: gr.update(visible=False)}
662
- try:
663
- df = load_data(file, url)
664
- text_cols = [col for col in df.columns if df[col].dtype == 'object']
665
  return {
666
- text_columns_checkboxgroup: gr.update(choices=text_cols, value=text_cols if text_cols else None),
667
- media_column_dropdown: gr.update(choices=df.columns.tolist()),
668
- source_tab: gr.update(visible=True)
669
  }
670
- except Exception as e:
671
- gr.Warning(f"Failed to read columns: {e}")
672
- return {text_columns_checkboxgroup: gr.update(choices=[], value=None), media_column_dropdown: gr.update(choices=[], value=None), source_tab: gr.update(visible=False)}
673
-
674
- file_upload.upload(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
675
- gsheet_url.submit(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
676
-
677
- # FIX: A single, robust function to control the visibility of manual seeding UI elements
678
- def toggle_manual_seeding_ui(mode):
679
- is_visible = mode == "Manual Seeding"
680
  return {
681
- manual_seeds_textbox: gr.update(visible=is_visible),
682
- manual_seeds_example: gr.update(visible=is_visible)
683
  }
684
 
685
- analysis_mode_radio.change(
686
- fn=toggle_manual_seeding_ui,
687
- inputs=analysis_mode_radio,
688
- outputs=[manual_seeds_textbox, manual_seeds_example]
689
- )
690
-
691
- start_button.click(
692
- fn=run_analysis_pipeline,
693
- inputs=[file_upload, gsheet_url, text_columns_checkboxgroup, analysis_mode_radio, manual_seeds_textbox, top_n_topics_slider, enable_ai_merging_checkbox, hf_token_textbox],
694
- outputs=[log_output, review_tab, visualize_tab, review_topic_table_df, doc_topic_landscape_plot_ui, inter_topic_map_plot_ui,
695
- top_topics_barchart_plot_ui, topic_similarity_heatmap_ui, topic_hierarchy_plot_ui, temporal_analysis_group, date_column_dropdown]
696
- )
697
-
698
- def on_select_topic(evt: gr.SelectData):
699
- """Handles selecting a topic from the main review table."""
700
- if not isinstance(evt.index, tuple) or len(evt.index) == 0:
701
- return {topic_id_state: None, topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
702
- try:
703
- topic_id_val = APP_STATE["topics_df"].iloc[evt.index[0]]['ID']
704
- details = get_topic_details(topic_id_val)
705
- details[topic_id_state] = topic_id_val # Store the ID in the hidden state
706
- return details
707
- except Exception:
708
- return {topic_id_state: None, topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
709
-
710
- review_topic_table_df.select(fn=on_select_topic, outputs=[topic_id_state, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
711
-
712
- # Connect the new manual refinement buttons
713
- update_name_button.click(fn=update_topic_name, inputs=[topic_id_state, topic_name_textbox], outputs=[review_topic_table_df])
714
-
715
- # When the main results are generated, populate the topic merger checklist
716
- review_topic_table_df.change(lambda df: gr.update(choices=df['Topic Name'].tolist()), inputs=review_topic_table_df, outputs=topic_merger_checkboxgroup)
717
-
718
- merge_button.click(fn=merge_selected_topics, inputs=[topic_merger_checkboxgroup], outputs=[review_topic_table_df, topic_merger_checkboxgroup, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
719
-
720
- # Connect the new Source Analysis tab
721
- analyze_media_button.click(fn=generate_media_analysis, inputs=[media_column_dropdown], outputs=[media_plot])
722
-
723
- # Other handlers
724
- generate_trends_button.click(fn=generate_temporal_plot, inputs=[date_column_dropdown], outputs=[temporal_plot_ui])
725
- finalize_button.click(fn=finalize_and_save, inputs=[], outputs=[download_link])
726
-
727
- # --- LAUNCH THE APP ---
728
  if __name__ == "__main__":
729
- app.launch(debug=True, share=True)
1
+ # ==============================================================================
2
+ # SOCIAL PERCEPTION ANALYZER - FINAL COMPLETE APPLICATION
3
+ # Version: 3.0 (Architecturally Refactored, Production Ready)
4
+ # ==============================================================================
5
+
6
+ # --- IMPORTS ---
7
  import gradio as gr
8
  import pandas as pd
9
  import numpy as np
 
13
  import json
14
  import logging
15
  import requests
16
+ import os
17
+ import time
18
+ import random
19
+ import functools
20
  from io import StringIO
21
+ from datetime import datetime, timezone
22
+ from logging.handlers import RotatingFileHandler
23
+
24
+ # --- APIs and Web Scraping ---
25
+ from googleapiclient.discovery import build
26
+ from googleapiclient.errors import HttpError
27
+ from GoogleNews import GoogleNews
28
+ from urllib.error import HTTPError
29
+ import dateparser
30
 
31
+ # --- NLP & Machine Learning ---
32
  from transformers import pipeline, BitsAndBytesConfig
33
  from sentence_transformers import SentenceTransformer
34
+ from huggingface_hub.utils import HfHubHTTPError
35
+
36
+ # --- Visualization ---
37
+ import matplotlib.pyplot as plt
38
+ from matplotlib.font_manager import FontProperties
39
+ import seaborn as sns
40
+ from wordcloud import WordCloud
41
+
42
+ # ==============================================================================
43
+ # SETUP PRODUCTION-GRADE LOGGING & CONFIGURATION
44
+ # ==============================================================================
45
+
46
+ log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
47
+ log_handler = RotatingFileHandler('app.log', maxBytes=5*1024*1024, backupCount=2)
48
+ log_handler.setFormatter(log_formatter)
49
+ logger = logging.getLogger()
50
+ logger.setLevel(logging.INFO)
51
+ if not logger.handlers:
52
+ logger.addHandler(log_handler)
53
+ logger.info("Application starting up.")
54
+
55
+ # --- APPLICATION CONFIGURATION ---
56
+ APP_TITLE = "Social Perception Analyzer"
57
+ APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
58
+ APP_FOOTER = "Developed by CDSR"
59
+
60
+ # --- FONT CONFIGURATION ---
61
+ FONT_PATH = 'NotoSansBengali-Regular.ttf'
62
+ try:
63
+ BANGLA_FONT = FontProperties(fname=FONT_PATH)
64
+ logger.info("Successfully loaded 'NotoSansBengali-Regular.ttf' font.")
65
+ except OSError:
66
+ logger.error("Failed to load 'NotoSansBengali-Regular.ttf'. Ensure the file is in the root directory.")
67
+ gr.Warning("Bangla font not found! Visualizations may not render text correctly.")
68
+ BANGLA_FONT = FontProperties()
69
+
70
+ # ==============================================================================
71
+ # CORE HELPER FUNCTIONS
72
+ # ==============================================================================
73
+
74
  BANGLA_STOP_WORDS = [
75
  'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
76
  'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
 
89
  'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
90
  ]
91
 
92
+ def get_dynamic_time_agg(start_date, end_date):
93
+ """Hardened helper to determine time aggregation level."""
94
+ if not isinstance(start_date, pd.Timestamp) or not isinstance(end_date, pd.Timestamp):
95
+ return 'D', 'Daily' # Graceful fallback
96
+ delta = end_date - start_date
97
+ if delta.days <= 2: return 'H', 'Hourly'
98
+ if delta.days <= 90: return 'D', 'Daily'
99
+ if delta.days <= 730: return 'W', 'Weekly'
100
+ return 'M', 'Monthly'
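# Illustrative behaviour (hypothetical dates): a window from 2024-01-01 to 2024-06-30 spans
# roughly 181 days, so this returns ('W', 'Weekly'); the 'W' code is later passed to
# pandas .resample() when the news-volume timeline is built.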
101
 
102
+ # ==============================================================================
103
+ # ML MODEL MANAGEMENT
104
+ # ==============================================================================
105
 
106
 
107
+ SENTIMENT_MODEL_ID = 'ahs95/banglabert-sentiment-analysis'
108
+ MODELS = {"sentiment_pipeline": None}
109
 
110
+ def _load_pipeline_with_retry(task, model_id, retries=3):
111
+ logger.info(f"Initializing {task} pipeline for model: {model_id}")
112
+ for attempt in range(retries):
113
  try:
114
+ device = 0 if torch.cuda.is_available() else -1
115
+ if device == -1: gr.Warning(f"{model_id} will run on CPU and may be very slow.")
116
+ pipe = pipeline(task, model=model_id, device=device)
117
+ logger.info(f"Pipeline '{task}' loaded successfully.")
118
+ return pipe
119
+ except (HfHubHTTPError, requests.exceptions.ConnectionError) as e:
120
+ logger.warning(f"Network error on loading {model_id} (Attempt {attempt + 1}/{retries}): {e}")
121
+ if attempt < retries - 1: time.sleep(5)
122
+ else: raise gr.Error(f"Failed to download model '{model_id}' after {retries} attempts. Check network.")
123
  except Exception as e:
124
+ logger.error(f"An unexpected error occurred while loading {model_id}: {e}")
125
+ raise gr.Error(f"Could not initialize model '{model_id}'. Error: {e}")
126
+ return None
127
+
128
+ def get_sentiment_pipeline():
129
+ if MODELS["sentiment_pipeline"] is None:
130
+ MODELS["sentiment_pipeline"] = _load_pipeline_with_retry("sentiment-analysis", SENTIMENT_MODEL_ID)
131
+ return MODELS["sentiment_pipeline"]
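# Usage sketch: the first call downloads the model and caches the pipeline in MODELS; later
# calls return the cached object, so repeated analyses do not reload the model.
#   pipe = get_sentiment_pipeline()   # slow on first use (download + initialization)
#   pipe = get_sentiment_pipeline()   # fast: returns the cached pipeline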
132
+
133
+ # ==============================================================================
134
+ # NEWS SCRAPER BACKEND
135
+ # ==============================================================================
136
+
137
+ def run_news_scraper_pipeline(search_keywords, sites, start_date_str, end_date_str, interval, max_pages, filter_keys, progress=gr.Progress()):
138
+ """Full, robust implementation of the news scraper."""
139
+ # Input validation and sanitization
140
+ search_keywords = search_keywords.strip()
141
+ if not all([search_keywords, start_date_str, end_date_str]):
142
+ raise gr.Error("Search Keywords, Start Date, and End Date are required.")
143
+
144
+ start_dt = dateparser.parse(start_date_str)
145
+ end_dt = dateparser.parse(end_date_str)
146
+ if not all([start_dt, end_dt]):
147
+ raise gr.Error("Invalid date format. Please use a recognizable format like YYYY-MM-DD or '2 weeks ago'.")
148
+
149
+ all_articles, current_dt = [], start_dt
150
+ while current_dt <= end_dt:
151
+ interval_end_dt = min(current_dt + pd.Timedelta(days=interval - 1), end_dt)
152
+ start_str, end_str = current_dt.strftime('%Y-%m-%d'), interval_end_dt.strftime('%Y-%m-%d')
153
+ progress(0, desc=f"Fetching news from {start_str} to {end_str}")
154
+
155
+ site_query = f"({' OR '.join(['site:' + s.strip() for s in sites.split(',') if s.strip()])})" if sites else ""
156
+ final_query = f'"{search_keywords}" {site_query} after:{start_str} before:{end_str}'
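# Illustrative result (hypothetical inputs): with search_keywords='তারুণ্যের সমাবেশ',
# sites='prothomalo.com, bdnews24.com' and a 2024-07-01..2024-07-07 window, final_query becomes:
#   '"তারুণ্যের সমাবেশ" (site:prothomalo.com OR site:bdnews24.com) after:2024-07-01 before:2024-07-07'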
157
+
158
+ googlenews = GoogleNews(lang='bn', region='BD')
159
+ googlenews.search(final_query)
160
+
161
+ for page in range(1, max_pages + 1):
162
+ try:
163
+ results = googlenews.results()
164
+ if not results: break
165
+ all_articles.extend(results)
166
+ if page < max_pages:
167
+ googlenews.getpage(page + 1)
168
+ time.sleep(random.uniform(2, 5))
169
+ except HTTPError as e:
170
+ if e.code == 429:
171
+ wait_time = random.uniform(15, 30)
172
+ gr.Warning(f"Rate limited by Google News. Pausing for {wait_time:.0f} seconds.")
173
+ time.sleep(wait_time)
174
+ else:
175
+ logger.error(f"HTTP Error fetching news: {e}"); break
176
+ except Exception as e:
177
+ logger.error(f"An error occurred fetching news: {e}"); break
178
+
179
+ current_dt += pd.Timedelta(days=interval)
180
+
181
+ if not all_articles: return pd.DataFrame(), pd.DataFrame()
182
+
183
+ df = pd.DataFrame(all_articles).drop_duplicates(subset=['link'])
184
+ df['published_date'] = df['date'].apply(lambda x: dateparser.parse(x, languages=['bn']))
185
+ df.dropna(subset=['published_date', 'title'], inplace=True)
186
+
187
+ if filter_keys and filter_keys.strip():
188
+ keywords = [k.strip().lower() for k in filter_keys.split(',')]
189
+ mask = df.apply(lambda row: any(key in str(row['title']).lower() or key in str(row['desc']).lower() for key in keywords), axis=1)
190
+ df = df[mask]
191
+
192
+ return df, df[['published_date', 'title', 'media', 'desc', 'link']].sort_values(by='published_date', ascending=False)
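# Illustrative call (hypothetical values; `progress` is injected by Gradio when this is wired
# to a button):
#   raw_df, display_df = run_news_scraper_pipeline(
#       search_keywords="তারুণ্যের সমাবেশ", sites="prothomalo.com, bdnews24.com",
#       start_date_str="2024-07-01", end_date_str="2024-07-31",
#       interval=7, max_pages=3, filter_keys="বিএনপি")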
193
+
194
+ # ==============================================================================
195
+ # YOUTUBE ANALYZER BACKEND
196
+ # ==============================================================================
197
+ # (This section remains unchanged from the previous robust version)
198
+ def _fetch_video_details(youtube_service, video_ids: list):
199
+ all_videos_data = []
200
+ try:
201
+ for i in range(0, len(video_ids), 50):
202
+ id_batch = video_ids[i:i+50]
203
+ video_request = youtube_service.videos().list(part="snippet,statistics", id=",".join(id_batch))
204
+ video_response = video_request.execute()
205
+ for item in video_response.get('items', []):
206
+ stats = item.get('statistics', {})
207
+ all_videos_data.append({
208
+ 'video_id': item['id'], 'video_title': item['snippet']['title'],
209
+ 'channel': item['snippet']['channelTitle'], 'published_date': item['snippet']['publishedAt'],
210
+ 'view_count': int(stats.get('viewCount', 0)), 'like_count': int(stats.get('likeCount', 0)),
211
+ 'comment_count': int(stats.get('commentCount', 0))
212
+ })
213
+ except HttpError as e:
214
+ logger.error(f"Could not fetch video details. Error: {e}")
215
+ gr.Warning("Could not fetch details for some videos due to an API error.")
216
+ return all_videos_data
217
+
218
+ def _scrape_single_video_comments(youtube_service, video_id, max_comments):
219
+ comments_list = []
220
  try:
221
+ request = youtube_service.commentThreads().list(
222
+ part="snippet", videoId=video_id, maxResults=min(max_comments, 100),
223
+ order='relevance', textFormat="plainText"
224
+ )
225
+ response = request.execute()
226
+ for item in response.get('items', []):
227
+ snippet = item['snippet']['topLevelComment']['snippet']
228
+ comments_list.append({
229
+ 'author': snippet['authorDisplayName'], 'published_date_comment': snippet['publishedAt'],
230
+ 'comment_text': snippet['textDisplay'], 'likes': snippet['likeCount'],
231
+ 'replies': item['snippet']['totalReplyCount']
232
+ })
233
+ except HttpError as e:
234
+ logger.warning(f"Could not retrieve comments for video {video_id} (may be disabled). Error: {e}")
235
+ return comments_list
236
+
237
+ def run_youtube_analysis_pipeline(api_key, query, max_videos_for_stats, num_videos_for_comments, max_comments_per_video, published_after, progress=gr.Progress()):
238
+ if not api_key: raise gr.Error("YouTube API Key is required.")
239
+ if not query: raise gr.Error("Search Keywords are required.")
240
+ try:
241
+ youtube = build('youtube', 'v3', developerKey=api_key)
242
+ except HttpError as e:
243
+ raise gr.Error(f"Failed to initialize YouTube service. Check API Key. Error: {e}")
244
  except Exception as e:
245
+ raise gr.Error(f"An unexpected error occurred during API initialization: {e}")
246
+
247
+ progress(0.1, desc="Performing broad scan for videos...")
248
+ all_video_ids, next_page_token, total_results_estimate = [], None, 0
249
+ PAGES_TO_FETCH = min(15, (int(max_videos_for_stats) // 50) + 1)  # cast to int (as done below) so range() receives an integer
250
+ search_params = {'q': query, 'part': 'id', 'maxResults': 50, 'type': 'video', 'order': 'relevance'}
251
+ if published_after:
252
+ parsed_date = dateparser.parse(published_after)
253
+ if parsed_date:
254
+ search_params['publishedAfter'] = parsed_date.replace(tzinfo=timezone.utc).isoformat()
255
+ else:
256
+ gr.Warning(f"Could not parse date: '{published_after}'. Ignoring filter.")
257
 
258
+ for page in range(PAGES_TO_FETCH):
259
  try:
260
+ if next_page_token: search_params['pageToken'] = next_page_token
261
+ response = youtube.search().list(**search_params).execute()
262
+ if page == 0:
263
+ total_results_estimate = response.get('pageInfo', {}).get('totalResults', 0)
264
+ all_video_ids.extend([item['id']['videoId'] for item in response.get('items', [])])
265
+ next_page_token = response.get('nextPageToken')
266
+ progress(0.1 + (0.3 * (page / PAGES_TO_FETCH)), desc=f"Broad scan: Found {len(all_video_ids)} videos...")
267
+ if not next_page_token: break
268
+ except HttpError as e:
269
+ if "quotaExceeded" in str(e): raise gr.Error("CRITICAL: YouTube API daily quota exceeded. Try again tomorrow.")
270
+ logger.error(f"HTTP error during video search: {e}"); break
271
+
272
+ if not all_video_ids:
273
+ return pd.DataFrame(), pd.DataFrame(), 0
274
+
275
+ progress(0.4, desc=f"Fetching details for {len(all_video_ids)} videos...")
276
+ videos_df_full_scan = pd.DataFrame(_fetch_video_details(youtube, all_video_ids))
277
+ if videos_df_full_scan.empty:
278
+ return pd.DataFrame(), pd.DataFrame(), 0
279
+
280
+ videos_df_full_scan['published_date'] = pd.to_datetime(videos_df_full_scan['published_date'])
281
+ videos_df_full_scan['engagement_rate'] = ((videos_df_full_scan['like_count'] + videos_df_full_scan['comment_count']) / videos_df_full_scan['view_count']).fillna(0)
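# Worked example: 500 likes + 120 comments on 100,000 views -> (500 + 120) / 100000 = 0.0062.
# Note: a nonzero numerator over zero views yields inf (not NaN), which .fillna(0) does not catch.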
282
+ videos_df_full_scan = videos_df_full_scan.sort_values(by='view_count', ascending=False).reset_index(drop=True)
283
+
284
+ videos_to_scrape_df, all_comments = videos_df_full_scan.head(int(num_videos_for_comments)), []
285
+ for index, row in videos_to_scrape_df.iterrows():
286
+ progress(0.7 + (0.3 * (index / len(videos_to_scrape_df))), desc=f"Deep dive: Scraping comments from video {index+1}/{len(videos_to_scrape_df)}...")
287
+ comments_for_video = _scrape_single_video_comments(youtube, row['video_id'], max_comments_per_video)
288
+ if comments_for_video:
289
+ for comment in comments_for_video:
290
+ comment.update({'video_id': row['video_id'], 'video_title': row['video_title']})
291
+ all_comments.extend(comments_for_video)
292
+
293
+ comments_df = pd.DataFrame(all_comments)
294
+ if not comments_df.empty:
295
+ comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
296
+
297
+ logger.info(f"YouTube analysis complete. Est. total videos: {total_results_estimate}. Scanned: {len(videos_df_full_scan)}. Comments: {len(comments_df)}.")
298
+ return videos_df_full_scan, comments_df, total_results_estimate
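# Illustrative call (hypothetical values; requires a valid YouTube Data API v3 key):
#   videos_df, comments_df, total_est = run_youtube_analysis_pipeline(
#       api_key="YOUR_API_KEY", query="তারুণ্যের সমাবেশ", max_videos_for_stats=200,
#       num_videos_for_comments=10, max_comments_per_video=100, published_after="2024-01-01")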
299
+
300
+
301
+ # ==============================================================================
302
+ # ADVANCED ANALYTICS MODULE
303
+ # ==============================================================================
304
+ # (This section remains unchanged, as it was already robust)
305
+ def set_plot_style():
306
+ plt.style.use('seaborn-v0_8-whitegrid')
307
+ plt.rcParams['figure.dpi'] = 100
308
+
309
+ def run_sentiment_analysis(df: pd.DataFrame, text_column: str, progress=gr.Progress()):
310
+ if text_column not in df.columns: return df
311
+ sentiment_pipeline = get_sentiment_pipeline()
312
+ if not sentiment_pipeline:
313
+ gr.Warning("Sentiment model failed to load. Skipping analysis.")
314
+ return df
315
+
316
+ texts = df[text_column].dropna().tolist()
317
+ if not texts: return df
318
+
319
+ progress(0, desc="Running sentiment analysis...")
320
+ results = sentiment_pipeline(texts, batch_size=32)
321
+
322
+ text_to_sentiment = {text: result for text, result in zip(texts, results)}
323
+ df['sentiment_label'] = df[text_column].map(lambda x: text_to_sentiment.get(x, {}).get('label'))
324
+ df['sentiment_score'] = df[text_column].map(lambda x: text_to_sentiment.get(x, {}).get('score'))
325
+ logger.info("Sentiment analysis complete.")
326
+ return df
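# Shape of the pipeline output this mapping relies on (label names depend on the model card,
# so the values below are placeholders):
#   sentiment_pipeline(["..."]) -> [{'label': '<LABEL>', 'score': 0.97}]
# which is why .get('label') and .get('score') are used when writing results back to the frame.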
327
+
328
+ def generate_scraper_dashboard(df: pd.DataFrame):
329
+ set_plot_style()
330
+
331
+ total_articles, unique_media = len(df), df['media'].nunique()
332
+ start_date, end_date = pd.to_datetime(df['published_date']).min(), pd.to_datetime(df['published_date']).max()
333
+ date_range_str = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
334
 
335
+ agg_code, agg_name = get_dynamic_time_agg(start_date, end_date)
336
+ timeline_df = df.set_index(pd.to_datetime(df['published_date'])).resample(agg_code).size().reset_index(name='count')
337
+ timeline_plot = gr.LinePlot(timeline_df, x='published_date', y='count', title=f'{agg_name} News Volume', tooltip=['published_date', 'count'])
338
+
339
+ media_counts = df['media'].dropna().value_counts().nlargest(15).sort_values()
340
+ fig_media = None
341
+ if not media_counts.empty:
342
+ fig_media, ax = plt.subplots(figsize=(8, 6)); media_counts.plot(kind='barh', ax=ax, color='skyblue'); ax.set_title("Top 15 Media Sources", fontproperties=BANGLA_FONT)
343
+ ax.set_yticklabels(media_counts.index, fontproperties=BANGLA_FONT); ax.set_xlabel("Article Count"); plt.tight_layout()
344
+
345
+ text = " ".join(title for title in df['title'].astype(str))
346
+ fig_wc = None
347
+ try:
348
+ wc = WordCloud(font_path=FONT_PATH, width=800, height=400, background_color='white', stopwords=BANGLA_STOP_WORDS, collocations=False).generate(text)
349
+ fig_wc, ax = plt.subplots(figsize=(10, 5)); ax.imshow(wc, interpolation='bilinear'); ax.axis("off")
350
+ except Exception as e: logger.error(f"WordCloud failed: {e}")
351
+
352
  return {
353
+ kpi_total_articles: str(total_articles), kpi_unique_media: str(unique_media), kpi_date_range: date_range_str,
354
+ dashboard_timeline_plot: timeline_plot, dashboard_media_plot: fig_media, dashboard_wordcloud_plot: fig_wc,
355
+ scraper_dashboard_group: gr.update(visible=True)
356
  }
357
 
358
+ def generate_sentiment_dashboard(df: pd.DataFrame):
359
+ updates = {sentiment_dashboard_tab: gr.update(visible=False)}
360
+ set_plot_style()
361
 
362
+ if 'sentiment_label' in df.columns:
363
+ sentiment_counts = df['sentiment_label'].value_counts()
364
+ fig_pie, fig_media_sent = None, None
365
+ if not sentiment_counts.empty:
366
+ fig_pie, ax = plt.subplots(figsize=(6, 6)); ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=['#66c2a5', '#fc8d62', '#8da0cb'])
367
+ ax.set_title("Overall Sentiment Distribution", fontproperties=BANGLA_FONT); ax.axis('equal')
368
+
369
+ top_media = df['media'].value_counts().nlargest(10).index
370
+ media_sentiment = pd.crosstab(df[df['media'].isin(top_media)]['media'], df['sentiment_label'], normalize='index').mul(100)
371
+ if not media_sentiment.empty:
372
+ fig_media_sent, ax = plt.subplots(figsize=(10, 7)); media_sentiment.plot(kind='barh', stacked=True, ax=ax, colormap='viridis')
373
+ ax.set_title("Sentiment by Top Media Sources", fontproperties=BANGLA_FONT); ax.set_yticklabels(media_sentiment.index, fontproperties=BANGLA_FONT); plt.tight_layout()
374
+
375
+ updates.update({sentiment_pie_plot: fig_pie, sentiment_by_media_plot: fig_media_sent, sentiment_dashboard_tab: gr.update(visible=True)})
376
+ return updates
377
+
378
+ def generate_youtube_dashboard(videos_df, comments_df):
379
+ set_plot_style()
380
+ kpis = {
381
+ kpi_yt_videos_found: f"{len(videos_df):,}" if videos_df is not None else "0",
382
+ kpi_yt_views_scanned: f"{videos_df['view_count'].sum():,}" if videos_df is not None else "0",
383
+ kpi_yt_comments_scraped: f"{len(comments_df):,}" if comments_df is not None else "0"
384
+ }
+
+     channel_counts = videos_df['channel'].value_counts().nlargest(15).sort_values()
+     fig_channels, ax = plt.subplots(figsize=(8, 6))
+     if not channel_counts.empty:
+         channel_counts.plot(kind='barh', ax=ax, color='coral'); ax.set_title("Top 15 Channels by Video Volume", fontproperties=BANGLA_FONT); ax.set_yticklabels(channel_counts.index, fontproperties=BANGLA_FONT); plt.tight_layout()
+
+     fig_wc, fig_pie, fig_sentiment_video = None, None, None
+     if comments_df is not None and not comments_df.empty:
+         text = " ".join(comment for comment in comments_df['comment_text'].astype(str))
+         try:
+             wc = WordCloud(font_path=FONT_PATH, width=800, height=400, background_color='white', stopwords=BANGLA_STOP_WORDS, collocations=False).generate(text)
+             fig_wc, ax = plt.subplots(figsize=(10, 5)); ax.imshow(wc, interpolation='bilinear'); ax.axis("off"); ax.set_title("Most Common Words in Comments", fontproperties=BANGLA_FONT)
+         except Exception as e: logger.error(f"YouTube WordCloud failed: {e}")
+
+         if 'sentiment_label' in comments_df.columns:
+             sentiment_counts = comments_df['sentiment_label'].value_counts()
+             if not sentiment_counts.empty:
+                 fig_pie, ax = plt.subplots(figsize=(6, 6)); ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=['#66c2a5', '#fc8d62', '#8da0cb']); ax.set_title("Overall Comment Sentiment", fontproperties=BANGLA_FONT)
+
+             top_videos_by_comment = comments_df['video_title'].value_counts().nlargest(10).index
+             video_sentiment = comments_df.groupby('video_title')['sentiment_label'].value_counts(normalize=True).unstack().mul(100).reindex(top_videos_by_comment).dropna(how='all')
+             if not video_sentiment.empty:
+                 fig_sentiment_video, ax = plt.subplots(figsize=(10, 8)); video_sentiment.plot(kind='barh', stacked=True, ax=ax, colormap='viridis'); ax.set_title("Comment Sentiment by Top 10 Videos", fontproperties=BANGLA_FONT); ax.set_yticklabels(video_sentiment.index, fontproperties=BANGLA_FONT); plt.tight_layout()
+
+     return {**kpis, yt_channel_plot: fig_channels, yt_wordcloud_plot: fig_wc, yt_sentiment_pie_plot: fig_pie, yt_sentiment_by_video_plot: fig_sentiment_video}
+
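# Editor's note (not part of this commit): generate_youtube_dashboard assumes a non-empty
# videos_df; the `is not None` checks only guard the KPI strings, while the channel plot
# indexes videos_df['channel'] directly. The caller in this file only invokes it after a
# successful broad scan, which is why the looser guard is tolerated here.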
+ def generate_youtube_topic_dashboard(videos_df_full_scan: pd.DataFrame):
+     """Build topic-level plots from the broad scan: channel dominance, performance quadrant, and content age vs. impact."""
+     if videos_df_full_scan is None or videos_df_full_scan.empty: return None, None, None
+     set_plot_style()
+
+     channel_views = videos_df_full_scan.groupby('channel')['view_count'].sum().nlargest(15).sort_values()
+     fig_channel_views, ax = plt.subplots(figsize=(10, 7)); channel_views.plot(kind='barh', ax=ax, color='purple'); ax.set_title("Channel Dominance by Total Views (Top 15)", fontproperties=BANGLA_FONT); ax.set_xlabel("Combined Views on Topic"); ax.set_yticklabels(channel_views.index, fontproperties=BANGLA_FONT); plt.tight_layout()
+
+     df_sample = videos_df_full_scan.sample(n=min(len(videos_df_full_scan), 200))
+     avg_views, avg_engagement = df_sample['view_count'].median(), df_sample['engagement_rate'].median()
+     fig_quadrant, ax = plt.subplots(figsize=(10, 8)); sns.scatterplot(data=df_sample, x='view_count', y='engagement_rate', size='like_count', sizes=(20, 400), hue='channel', alpha=0.7, ax=ax, legend=False)
+     ax.set_xscale('log'); ax.set_yscale('log'); ax.set_title("Content Performance Quadrant", fontproperties=BANGLA_FONT); ax.set_xlabel("Video Views (Log Scale)", fontproperties=BANGLA_FONT); ax.set_ylabel("Engagement Rate (Log Scale)", fontproperties=BANGLA_FONT)
+     ax.axhline(avg_engagement, ls='--', color='gray'); ax.axvline(avg_views, ls='--', color='gray'); ax.text(avg_views*1.1, ax.get_ylim()[1], 'High Performers', color='green', fontproperties=BANGLA_FONT); ax.text(ax.get_xlim()[0], avg_engagement*1.1, 'Niche Stars', color='blue', fontproperties=BANGLA_FONT)
+
+     fig_age, ax = plt.subplots(figsize=(10, 7)); sns.scatterplot(data=df_sample, x='published_date', y='view_count', size='engagement_rate', sizes=(20, 400), alpha=0.6, ax=ax)
+     ax.set_yscale('log'); ax.set_title("Content Age vs. Impact", fontproperties=BANGLA_FONT); ax.set_xlabel("Publication Date", fontproperties=BANGLA_FONT); ax.set_ylabel("Views (Log Scale)", fontproperties=BANGLA_FONT); plt.xticks(rotation=45)
+
+     return fig_channel_views, fig_quadrant, fig_age
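# Editor's note (not part of this commit): the topic-level plots assume the broad-scan
# DataFrame produced by run_youtube_analysis_pipeline exposes 'channel', 'view_count',
# 'like_count', 'engagement_rate' and 'published_date' columns; if the pipeline renames
# any of these, the quadrant and content-age plots will raise a KeyError.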
+ # ==============================================================================
+ # GRADIO UI DEFINITION
+ # ==============================================================================
+
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), title=APP_TITLE) as app:
+     gr.Markdown(f"# {APP_TITLE}\n*{APP_TAGLINE}*")
+
+     # --- STATE MANAGEMENT ---
+     scraper_results_state = gr.State()
+     youtube_results_state = gr.State()
+
      with gr.Tabs() as tabs:
+         with gr.TabItem("1. News Scraper", id=0):
              with gr.Row():
                  with gr.Column(scale=1):
+                     gr.Markdown("### 1. Search Criteria")
+                     search_keywords_textbox = gr.Textbox(label="Search Keywords", placeholder="e.g., বিএনপি সমাবেশ")
+                     sites_to_search_textbox = gr.Textbox(label="Target Sites (Optional, comma-separated)", placeholder="e.g., prothomalo.com")
+                     start_date_textbox = gr.Textbox(label="Start Date", placeholder="YYYY-MM-DD or 'last week'")
+                     end_date_textbox = gr.Textbox(label="End Date", placeholder="YYYY-MM-DD or 'today'")
+                     gr.Markdown("### 2. Scraping Parameters")
+                     interval_days_slider = gr.Slider(1, 7, 3, step=1, label="Days per Interval")
+                     max_pages_slider = gr.Slider(1, 10, 5, step=1, label="Max Pages per Interval")
+                     filter_keywords_textbox = gr.Textbox(label="Filter Keywords (comma-separated, optional)", placeholder="e.g., নির্বাচন, সরকার")
+                     start_scraper_button = gr.Button("Start Scraping & Analysis", variant="primary")
                  with gr.Column(scale=2):
+                     scraper_results_df = gr.DataFrame(label="Filtered Results", interactive=False, wrap=True)
+                     scraper_download_file = gr.File(label="Download Filtered Results CSV")
+
+         with gr.TabItem("2. News Analytics", id=1):
+             with gr.Group(visible=False) as scraper_dashboard_group:
+                 with gr.Tabs():
+                     with gr.TabItem("Overview"):
+                         with gr.Row():
+                             kpi_total_articles = gr.Textbox(label="Total Articles Found", interactive=False)
+                             kpi_unique_media = gr.Textbox(label="Unique Media Sources", interactive=False)
+                             kpi_date_range = gr.Textbox(label="Date Range of Articles", interactive=False)
+                         dashboard_timeline_plot = gr.LinePlot(label="News Volume Timeline")
+                         with gr.Row():
+                             dashboard_media_plot = gr.Plot(label="Top Media Sources by Article Count")
+                             dashboard_wordcloud_plot = gr.Plot(label="Headline Word Cloud")
+                     with gr.TabItem("Sentiment Analysis", visible=False) as sentiment_dashboard_tab:
                          with gr.Row():
+                             sentiment_pie_plot = gr.Plot(label="Overall Sentiment")
+                             sentiment_by_media_plot = gr.Plot(label="Sentiment by Media Source")
+
+         with gr.TabItem("3. YouTube Topic Analysis", id=2):
              with gr.Row():
+                 with gr.Column(scale=1):
+                     gr.Markdown("### 1. YouTube API & Search")
+                     yt_api_key = gr.Textbox(label="YouTube API Key", type="password", placeholder="Paste your API key")
+                     yt_search_keywords = gr.Textbox(label="Search Keywords", placeholder="e.g., বিএনপি, তারেক রহমান")
+                     yt_published_after = gr.Textbox(label="Published After Date (Optional)", placeholder="YYYY-MM-DD or '1 month ago'")
+                     gr.Markdown("### 2. Analysis Parameters")
+                     yt_max_videos_for_stats = gr.Slider(label="Videos to Scan for Topic Stats (Broad Scan)", minimum=50, maximum=750, value=300, step=50)
+                     yt_num_videos_for_comments = gr.Slider(label="Top Videos for Comment Analysis (Deep Dive)", minimum=5, maximum=100, value=25, step=5)
+                     yt_max_comments = gr.Slider(10, 100, 30, step=10, label="Max Comments per Video")
+                     start_yt_analysis_button = gr.Button("Start YouTube Analysis", variant="primary")
+                 with gr.Column(scale=2):
+                     with gr.Group(visible=False) as yt_dashboard_group:
+                         gr.Markdown("### Topic Footprint KPIs (Based on Broad Scan)")
+                         with gr.Row():
+                             kpi_yt_total_topic_videos = gr.Textbox(label="Est. Total Videos on Topic (YT)", interactive=False)
+                             kpi_yt_videos_found = gr.Textbox(label="Videos Scanned for Stats", interactive=False)
+                             kpi_yt_views_scanned = gr.Textbox(label="Combined Views (of Scanned)", interactive=False)
+                             kpi_yt_comments_scraped = gr.Textbox(label="Comments Analyzed (from Top Videos)", interactive=False)
+                         with gr.Tabs():
+                             with gr.TabItem("Deep Dive Analysis (on Top Videos)"):
+                                 yt_videos_df_output = gr.DataFrame(label="Top Videos Analyzed for Comments (sorted by views)")
+                                 with gr.Row():
+                                     yt_channel_plot = gr.Plot(label="Channel Contribution by Video Count")
+                                     yt_sentiment_pie_plot = gr.Plot(label="Overall Comment Sentiment")
+                                 with gr.Row():
+                                     yt_wordcloud_plot = gr.Plot(label="Comment Word Cloud")
+                                     yt_sentiment_by_video_plot = gr.Plot(label="Comment Sentiment by Video")
+                             with gr.TabItem("Topic-Level Analytics (on All Scanned Videos)"):
+                                 yt_channel_views_plot = gr.Plot(label="Channel Dominance by Views")
+                                 yt_performance_quadrant_plot = gr.Plot(label="Content Performance Quadrant")
+                                 yt_content_age_plot = gr.Plot(label="Content Age vs. Impact")
+
+     gr.Markdown(f"<div style='text-align: center; margin-top: 20px;'>{APP_FOOTER}</div>")
+
+     # ==============================================================================
+     # EVENT HANDLERS
+     # ==============================================================================
+
+     # --- NEWS SCRAPER WORKFLOW ---
+     def news_scraper_workflow(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress=gr.Progress()):
+         progress(0, desc="Starting news analysis...")
+         raw_df, display_df = run_news_scraper_pipeline(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress)
+
+         if raw_df.empty:
+             gr.Info("No news articles found for your query."); return None, None, None
+
+         progress(0.8, desc="Analyzing sentiment of news headlines...")
+         analyzed_df = run_sentiment_analysis(raw_df.copy(), 'title', progress)
+
+         output_path = "filtered_news_data.csv"; display_df.to_csv(output_path, index=False)
+         return display_df, output_path, analyzed_df
+
+     start_scraper_button.click(
+         fn=news_scraper_workflow,
+         inputs=[search_keywords_textbox, sites_to_search_textbox, start_date_textbox, end_date_textbox, interval_days_slider, max_pages_slider, filter_keywords_textbox],
+         outputs=[scraper_results_df, scraper_download_file, scraper_results_state]
+     )
+
+     def update_news_dashboards(analyzed_df):
+         if analyzed_df is None or analyzed_df.empty:
+             return {scraper_dashboard_group: gr.update(visible=False), sentiment_dashboard_tab: gr.update(visible=False)}
+
+         scraper_updates = generate_scraper_dashboard(analyzed_df)
+         sentiment_updates = generate_sentiment_dashboard(analyzed_df)
+         return {**scraper_updates, **sentiment_updates}
+
+     news_ui_components = [
+         scraper_dashboard_group, kpi_total_articles, kpi_unique_media, kpi_date_range,
+         dashboard_timeline_plot, dashboard_media_plot, dashboard_wordcloud_plot,
+         sentiment_dashboard_tab, sentiment_pie_plot, sentiment_by_media_plot
+     ]
+     scraper_results_state.change(fn=update_news_dashboards, inputs=scraper_results_state, outputs=news_ui_components)
+
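# Editor's note (not part of this commit): the scrape button does not drive the dashboards
# directly. news_scraper_workflow writes the analyzed DataFrame into scraper_results_state,
# and this .change() handler then regenerates every plot, keeping the long-running scrape
# and the dashboard refresh decoupled.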
+     # --- YOUTUBE WORKFLOW ---
+     def youtube_workflow(api_key, query, max_stats, num_comments, max_comments, published_after, progress=gr.Progress()):
+         sanitized_api_key = api_key.strip()
+         sanitized_query = query.strip()
+         videos_df_full, comments_df, total_vids_est = run_youtube_analysis_pipeline(
+             sanitized_api_key, sanitized_query, max_stats, num_comments, max_comments, published_after, progress
+         )
+         if videos_df_full.empty:
+             gr.Info("No videos found for your YouTube query."); return None, None
+
+         if comments_df is not None and not comments_df.empty:
+             progress(0.9, desc="Analyzing comment sentiment...")
+             comments_df = run_sentiment_analysis(comments_df.copy(), 'comment_text', progress)
+
+         top_videos_for_display = videos_df_full.head(int(num_comments))
+         return top_videos_for_display, {"full_scan": videos_df_full, "comments": comments_df, "total_estimate": total_vids_est}
+
+     start_yt_analysis_button.click(
+         fn=youtube_workflow,
+         inputs=[yt_api_key, yt_search_keywords, yt_max_videos_for_stats, yt_num_videos_for_comments, yt_max_comments, yt_published_after],
+         outputs=[yt_videos_df_output, youtube_results_state]
+     )
+
+     def update_youtube_dashboards(results_data):
+         if not results_data or results_data.get("full_scan") is None or results_data["full_scan"].empty:
              return {
+                 yt_dashboard_group: gr.update(visible=False), kpi_yt_total_topic_videos: "0",
+                 kpi_yt_videos_found: "0", kpi_yt_views_scanned: "0", kpi_yt_comments_scraped: "0",
+                 yt_channel_plot: None, yt_wordcloud_plot: None, yt_sentiment_pie_plot: None,
+                 yt_sentiment_by_video_plot: None, yt_channel_views_plot: None,
+                 yt_performance_quadrant_plot: None, yt_content_age_plot: None
              }
+
+         videos_df_full, comments_df, total_estimate = results_data.get("full_scan"), results_data.get("comments"), results_data.get("total_estimate", 0)
+         deep_dive_updates = generate_youtube_dashboard(videos_df_full, comments_df)
+         fig_ch_views, fig_quad, fig_age = generate_youtube_topic_dashboard(videos_df_full)
+
          return {
+             yt_dashboard_group: gr.update(visible=True),
+             kpi_yt_total_topic_videos: f"{total_estimate:,}",
+             **deep_dive_updates,
+             yt_channel_views_plot: fig_ch_views,
+             yt_performance_quadrant_plot: fig_quad,
+             yt_content_age_plot: fig_age,
          }
+
+     yt_ui_components = [
+         yt_dashboard_group, kpi_yt_total_topic_videos, kpi_yt_videos_found, kpi_yt_views_scanned, kpi_yt_comments_scraped,
+         yt_channel_plot, yt_wordcloud_plot, yt_sentiment_pie_plot, yt_sentiment_by_video_plot,
+         yt_channel_views_plot, yt_performance_quadrant_plot, yt_content_age_plot
+     ]
+     youtube_results_state.change(fn=update_youtube_dashboards, inputs=youtube_results_state, outputs=yt_ui_components)
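# Editor's note (not part of this commit): yt_ui_components has to list every component key
# that update_youtube_dashboards (and the builders it calls) can return, since Gradio matches
# returned dict keys against the declared outputs. Adding a new plot therefore means extending
# both the return dict and this list.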
+
+ # ==============================================================================
+ # LAUNCH THE APP
+ # ==============================================================================
+
  if __name__ == "__main__":
+     auth_credentials = os.getenv("AUTH_CREDENTIALS")
+     auth_tuple = None
+     if auth_credentials and ":" in auth_credentials:
+         user, pwd = auth_credentials.split(":", 1)
+         auth_tuple = (user, pwd)
+         logger.info("Using authentication credentials from environment variable.")
+     else:
+         logger.warning("No AUTH_CREDENTIALS found. Using default insecure credentials. Set this as an environment variable for production.")
+         auth_tuple = ("bnp", "12345")
+
+     app.launch(debug=True, auth=auth_tuple)
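# Editor's note (not part of this commit): AUTH_CREDENTIALS is read as a single
# "username:password" string and split on the first colon. Illustrative setup only,
# with placeholder values:
#
#   AUTH_CREDENTIALS="analyst:a-strong-password"   # e.g. as a Space secret or local env var
#
# Without it, the app falls back to the insecure default pair ("bnp", "12345") above.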