Arjon07CSE committed on
Commit
db79a4d
·
verified ·
1 Parent(s): fe9fa1b

Upload 2 files

Browse files

added the main code and requirement file

Files changed (2) hide show
  1. app.py +720 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,720 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- IMPORTS & GLOBAL SETUP ---
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import numpy as np
5
+ import torch
6
+ import re
7
+ import sqlite3
8
+ import json
9
+ import logging
10
+ import requests
11
+ from io import StringIO
12
+
13
+ # Transformers and BERTopic components
14
+ from transformers import pipeline, BitsAndBytesConfig
15
+ from sentence_transformers import SentenceTransformer
16
+ from bertopic import BERTopic
17
+ from bertopic.representation import KeyBERTInspired
18
+ from umap import UMAP
19
+ from hdbscan import HDBSCAN
20
+ from sklearn.feature_extraction.text import CountVectorizer
21
+
22
+ # Hugging Face and Colab integration (optional, for LLM access)
23
+ from huggingface_hub import login
24
+ # from google.colab import userdata # We will disable this for HF Spaces deployment
25
+
26
+ # Setup basic logging to monitor the application's health
27
# Emit INFO-and-above records with a timestamp so the app's health can be monitored.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level stand-in for session state: the UI callbacks read and write
# these four slots between interactions. Every slot starts out empty (None).
APP_STATE = dict.fromkeys(("df", "bertopic_model", "topics_df", "final_df"))

print("✅ app.py created. Initial imports written.")
print("✅ Dependencies installed in Colab environment.")
42
+
43
+ # --- TEXT PREPROCESSING & NORMALIZATION ---
44
+
45
+ # A comprehensive list of Bangla stop words, tailored for news and general text.
46
# Bangla stop words removed from documents before topic modelling.
#
# Fixes relative to the previous list:
#   * 'থাকলেও' was stored with a leading space, so it could never equal a
#     whitespace-split token;
#   * 'দ্বারা' was written in Devanagari script, which the preprocessing step
#     strips from the text, so that entry could never match either;
#   * repeated entries are dropped (dict.fromkeys keeps first-seen order).
BANGLA_STOP_WORDS = list(dict.fromkeys([
    'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
    'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
    'এতটাই', 'এতদূর', 'এতটুকু', 'এক', 'এবং', 'এবার', 'এমন', 'এমনভাবে', 'এর', 'এরা', 'এঁরা', 'এঁদের',
    'এই', 'এইভাবে', 'ও', 'ওঁরা', 'ওঁর', 'ওঁদের', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'কাছ', 'কাছে', 'কাজ',
    'কারণ', 'কিছু', 'কিছুই', 'কিন্তু', 'কিভাবে', 'কেন', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'খুব',
    'গুলি', 'গিয়ে', 'চায়', 'ছাড়া', 'জন্য', 'জানা', 'ঠিক', 'তিনি', 'তিন', 'তিনিও', 'তাকে', 'তাঁকে',
    'তার', 'তাঁর', 'তারা', 'তাঁরা', 'তাদের', 'তাঁদের', 'তাহলে', 'থাকলেও', 'থেকে', 'মধ্যেই', 'মধ্যে',
    'দ্বারা', 'নয়', 'না', 'নিজের', 'নিজে', 'নিয়ে', 'পারেন', 'পারা', 'পারে', 'পরে', 'পর্যন্ত', 'পুনরায়',
    'ফলে', 'বজায়', 'বা', 'বাদে', 'বার', 'বিশেষ', 'বিভিন্ন', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মাধ্যমে',
    'মতো', 'মতোই', 'যখন', 'যদি', 'যদিও', 'যা', 'যাকে', 'যাওয়া', 'যায়', 'যে', 'যেখানে', 'যেতে', 'যেমন',
    'যেহেতু', 'রহিছে', 'শিক্ষা', 'শুধু', 'সঙ্গে', 'সব', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সাধারণ', 'সামনে', 'হতে',
    'হতেই', 'হবে', 'হয়', 'হয়তো', 'হয়', 'হচ্ছে', 'হত', 'হলে', 'হলেও', 'হয়নি', 'হাজার', 'হোওয়া', 'আরও', 'আমরা',
    'আমার', 'আমি', 'আর', 'আগে', 'আগেই', 'আছে', 'আজ', 'তাকে', 'তাতে', 'তাদের', 'তাহার', 'তাহাতে', 'তাহারই',
    'তথা', 'তথাপি', 'সে', 'সেই', 'সেখান', 'সেখানে', 'থেকে', 'নাকি', 'নাগাদ', 'দু', 'দুটি', 'সুতরাং',
    'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক',
]))
63
+
64
# One-to-one / one-to-many character rewrites. None of the replacement
# characters is itself a key, so a single str.translate pass gives the same
# result as applying the rules one after another.
_BANGLA_CHAR_MAP = str.maketrans({
    '\u09F7': '\u09B0', '\u09F2': '\u09B2', '\u09E4': '\u098B', '\u09E5': '\u09E1',
    '\u09FA': '\u09B8\u09CD\u09AE', '\u09FB': '\u0995\u09CD\u09B7', '\u0970': '\u0966',
    '\u09F3': '\u09B0\u09C2', '\u09F8': '\u09A3', '\u09F9': '\u09B6', '\u0984': '',
    '\u0980': '\u0981',
    # An ASCII pipe is treated as a danda (U+0964).
    '|': '\u0964',
    # Precomposed letters are rewritten as base letter + nukta (U+09BC).
    '\u09DC': '\u09A1\u09BC', '\u09DD': '\u09A2\u09BC', '\u09DF': '\u09AF\u09BC',
})

# Two or more consecutive dandas.
_DANDA_RUN_RE = re.compile('\u0964{2,}')
# A danda glued to non-space characters on both sides. Lookarounds (instead of
# consuming groups) let adjacent matches share a neighbour, so every danda in
# "a।b।c" is spaced, not only every second one.
_UNSPACED_DANDA_RE = re.compile(r'(?<=\S)\u0964(?=\S)')


def normalize_bangla_manual(text):
    """Normalize Bangla text with a fixed, self-contained rule set.

    Steps, in order:
      1. rewrite individual characters through ``_BANGLA_CHAR_MAP``
         (this includes turning ``|`` into a danda);
      2. collapse any run of dandas into a single danda;
      3. put a space on each side of a danda that touches non-space
         characters on both sides.

    Pipes are converted *before* steps 2 and 3 so they get the same run
    collapsing and spacing as real dandas; previously they were converted
    last and skipped both.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = text.translate(_BANGLA_CHAR_MAP)
    text = _DANDA_RUN_RE.sub('\u0964', text)
    return _UNSPACED_DANDA_RE.sub(' \u0964 ', text)
78
+
79
# Patterns are compiled once at import time instead of on every document.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_AT_TOKEN_RE = re.compile(r'\S*@\S*\s?')
_NON_BANGLA_RE = re.compile(r'[^\u0980-\u09FF\s]')

# Stop words as a set for O(1) membership tests. Each entry is stored both as
# written (stripped) and after normalize_bangla_manual, because the text is
# normalized before tokens are compared: an entry spelled with a character the
# normalizer rewrites would otherwise never equal any token.
_STOP_WORD_LOOKUP = frozenset(
    form
    for stop_word in BANGLA_STOP_WORDS
    for form in (stop_word.strip(), normalize_bangla_manual(stop_word.strip()))
    if form
)


def preprocess_bangla_text(text):
    """Clean and normalize a single Bangla string for NLP tasks.

    The text is normalized with ``normalize_bangla_manual``; URLs and any
    token containing ``@`` are removed; every character outside the Bengali
    Unicode block (U+0980-U+09FF) other than whitespace is dropped; stop
    words are filtered out; and the remaining tokens are joined with single
    spaces.

    Args:
        text: The raw document text.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = normalize_bangla_manual(text)
    text = _URL_RE.sub('', text)
    text = _AT_TOKEN_RE.sub('', text)
    text = _NON_BANGLA_RE.sub('', text)
    # split() already discards leading, trailing and repeated whitespace, so
    # joining the surviving tokens needs no further whitespace clean-up.
    return " ".join(token for token in text.split() if token not in _STOP_WORD_LOOKUP)
90
+
91
+ print("✅ Helper functions appended to app.py")
92
+
93
+ # --- APP BRANDING & CONFIGURATION ---
94
+ # Easily update the application's title, tagline, and footer here.
95
# Branding strings shown in the UI; edit here to re-brand the application.
APP_TITLE = "Social Perception Analyzer"
APP_TAGLINE = (
    "Prepared for the Policymakers of "
    "Bangladesh Nationalist Party (BNP)"
)
APP_FOOTER = (
    "Developed by Centre for Data Science Research (CDSR), "
    "and Strategy and Policy Forum (SPF)"
)
98
+
99
+
100
+ # --- LOCAL LLM INITIALIZATION ---
101
def initialize_local_llm(hf_token=None):
    """Create the local text-generation pipeline used for AI topic naming.

    With CUDA available the model is loaded 4-bit quantized with
    ``device_map="auto"``; otherwise it is loaded with default settings on
    CPU. If the first load fails, a second attempt is made with
    ``trust_remote_code=True`` using the *same* device settings (previously
    the retry always requested quantization and ``device_map`` even on a
    CPU-only host).

    Args:
        hf_token: Optional Hugging Face access token forwarded to
            ``pipeline``.

    Returns:
        The pipeline object, or ``None`` when both attempts fail (a Gradio
        warning is shown in that case).
    """
    model_id = "hishab/titulm-llama-3.2-1b-v1.1"
    pipeline_kwargs = {"model": model_id, "token": hf_token}

    if torch.cuda.is_available():
        logging.info(f"Initializing quantized local LLM: {model_id} on GPU.")
        # 4-bit quantization to reduce memory usage significantly.
        pipeline_kwargs["model_kwargs"] = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        }
        pipeline_kwargs["device_map"] = "auto"
    else:
        logging.warning("GPU not available. LLM will run on CPU and be very slow.")

    try:
        return pipeline("text-generation", **pipeline_kwargs)
    except Exception as e:
        logging.error(f"Failed to initialize local LLM: {e}")
        logging.info("Trying again with 'trust_remote_code=True'.")

    try:
        # NOTE(review): trust_remote_code executes Python shipped in the model
        # repository. It is only ever applied to the hard-coded model_id above,
        # but confirm this automatic retry is acceptable for the deployment.
        # It is passed as pipeline()'s own argument rather than inside
        # model_kwargs, where it can clash with the value pipeline() forwards
        # itself.
        return pipeline("text-generation", trust_remote_code=True, **pipeline_kwargs)
    except Exception as e2:
        logging.error(f"Secondary attempt failed: {e2}")
        gr.Warning("Could not initialize the local LLM. AI features will be disabled.")
        return None
146
+
147
+ # --- DATA LOADING HELPER ---
148
+ def load_data(file_obj, gsheet_url):
149
+ """Loads a DataFrame from either an uploaded file or a Google Sheets URL."""
150
+ if file_obj is not None:
151
+ logging.info(f"Loading data from uploaded file: {file_obj.name}")
152
+ return pd.read_csv(file_obj.name)
153
+ elif gsheet_url and gsheet_url.strip():
154
+ logging.info(f"Loading data from Google Sheets URL.")
155
+ try:
156
+ # Manipulate the URL for direct CSV export
157
+ csv_url = gsheet_url.replace('/edit?usp=sharing', '/export?format=csv&gid=0')
158
+ response = requests.get(csv_url)
159
+ response.raise_for_status() # Raise an exception for bad status codes
160
+ return pd.read_csv(StringIO(response.text))
161
+ except Exception as e:
162
+ raise ValueError(f"Failed to load from Google Sheets URL. Please ensure the link is correct and publicly accessible. Error: {e}")
163
+ else:
164
+ raise ValueError("Please upload a CSV file or provide a public Google Sheets URL.")
165
+
166
+ print("✅ App branding, LLM initialization, and data loading functions appended to app.py")
167
+
168
+ # --- MAIN ANALYSIS ENGINE ---
169
+
170
+ # We will define the AI agent in the next cell. For now, this is a placeholder.
171
+ LLM_PIPELINE = None
172
+
173
def run_analysis_pipeline(file_obj, gsheet_url, text_columns, analysis_mode, manual_seeds,
                          top_n_topics_slider, enable_ai_merging, hf_token, progress=gr.Progress()):
    """Run the full topic-analysis pipeline and return the UI updates.

    Steps: load data, clean text, optionally build semi-supervised guidance
    from manual seeds, embed documents, fit BERTopic, name topics (LLM or
    keyword labels), build the visualizations and store the results in
    ``APP_STATE``.

    Args:
        file_obj: Uploaded CSV (see ``load_data``).
        gsheet_url: Google Sheets URL used when no file is uploaded.
        text_columns: Names of the columns concatenated into the document text.
        analysis_mode: ``"Manual Seeding"`` enables seed guidance; any other
            value runs unguided.
        manual_seeds: JSON object mapping topic name -> list of keywords.
        top_n_topics_slider: Number of topics shown in the bar chart, heatmap
            and hierarchy plots.
        enable_ai_merging: Whether to name topics with the local LLM.
        hf_token: Hugging Face token forwarded to ``initialize_local_llm``.
        progress: Gradio progress tracker.

    Returns:
        A dict mapping Gradio components to their new values. On a data or
        seed error only ``log_output`` is set, carrying the error message.
    """
    global LLM_PIPELINE
    if enable_ai_merging and LLM_PIPELINE is None:
        progress(0, desc="Initializing LLM...")
        LLM_PIPELINE = initialize_local_llm(hf_token)
        if LLM_PIPELINE is None:
            gr.Warning("AI features enabled, but LLM failed to initialize. Skipping AI steps.")
            enable_ai_merging = False

    # === STEP 1: LOAD AND VALIDATE DATA ===
    progress(0.1, desc="Step 1/8: Loading and Validating Data...")
    try:
        df = load_data(file_obj, gsheet_url)
        if not text_columns:
            raise ValueError("Please select at least one text column to analyze.")
        # fillna('') + astype(str) guarantees combined_text has no missing values.
        df['combined_text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
        df['processed_text'] = df['combined_text'].apply(preprocess_bangla_text)

        # Keep only documents with more than two tokens left after cleaning.
        df_analysis = df[df['processed_text'].str.split().str.len() > 2].copy()
        if df_analysis.empty:
            raise ValueError("No documents with sufficient content found after cleaning. Please check your data and column selection.")
        documents = df_analysis['processed_text'].tolist()
        APP_STATE["df"] = df_analysis  # Save the analyzable dataframe
    except Exception as e:
        logging.error(f"Data Loading Error: {e}")
        return {log_output: f"Error during data loading: {e}"}

    # === STEP 2: PREPARE GUIDANCE (IF MANUAL SEEDING) ===
    progress(0.2, desc="Step 2/8: Preparing Analysis Mode...")
    y_guidance = None
    # Stays empty unless seeds were parsed; replaces the former
    # "'seed_topics_dict' in locals()" check in step 6.
    topic_name_to_id = {}
    if analysis_mode == "Manual Seeding" and manual_seeds:
        try:
            seed_topics_dict = json.loads(manual_seeds)
            topic_name_to_id = {name: i for i, name in enumerate(seed_topics_dict.keys())}
            y_guidance = [-1] * len(documents)
            for i, doc in enumerate(documents):
                for topic_name, keywords in seed_topics_dict.items():
                    if any(keyword in doc for keyword in keywords):
                        y_guidance[i] = topic_name_to_id[topic_name]
                        break  # Prioritizes the first match in the JSON
        except Exception as e:
            return {log_output: f"Error: Invalid JSON in Manual Seeds. Details: {e}"}

    # === STEP 3: EMBEDDINGS & MODEL SETUP ===
    progress(0.3, desc="Step 3/8: Calculating Document Embeddings...")
    embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    embeddings = embedding_model.encode(documents, show_progress_bar=True)

    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    # max_df / min_df drop n-grams that are near-universal or very rare.
    vectorizer_model = CountVectorizer(tokenizer=lambda doc: doc.split(), ngram_range=(1, 3), max_df=0.90, min_df=5)
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    representation_model = KeyBERTInspired()

    # === STEP 4: TRAIN TOPIC MODEL ===
    progress(0.5, desc="Step 4/8: Training BERTopic Model...")
    topic_model = BERTopic(
        embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model, representation_model=representation_model,
        language="multilingual", verbose=False
    )
    topics, _ = topic_model.fit_transform(documents, embeddings, y=y_guidance)

    # === STEP 5: AI REFINEMENT (IF ENABLED) ===
    if enable_ai_merging and LLM_PIPELINE:
        progress(0.6, desc="Step 5/8: Running AI Refinement Agent...")
        topic_model = run_ai_refinement(topic_model, LLM_PIPELINE, progress)
    else:
        progress(0.6, desc="Step 5/8: Skipping AI Refinement...")
        # Fallback to default naming if AI is disabled
        generated_labels = topic_model.generate_topic_labels(nr_words=4, separator=", ")
        topic_model.set_topic_labels(generated_labels)

    # === STEP 6: APPLY MANUAL SEED NAMES ===
    progress(0.7, desc="Step 6/8: Finalizing Topic Names...")
    if analysis_mode == "Manual Seeding" and topic_name_to_id:
        # NOTE(review): this assumes the fitted topic ids coincide with the
        # seed ids passed as y; verify against BERTopic's semi-supervised
        # behaviour before relying on these names.
        fitted_ids = set(topic_model.get_topic_info()['Topic'].tolist())
        seed_labels = {tid: name for name, tid in topic_name_to_id.items() if tid in fitted_ids}
        if seed_labels:
            topic_model.set_topic_labels(seed_labels)

    # === STEP 7: PREPARE FINAL OUTPUTS & VISUALIZATIONS ===
    progress(0.85, desc="Step 7/8: Preparing Visualizations...")
    APP_STATE["bertopic_model"] = topic_model
    df_analysis['Topic'] = topics
    APP_STATE["final_df"] = df_analysis
    topics_df = topic_model.get_topic_info()
    APP_STATE["topics_df"] = topics_df

    # Safeguard against memory errors on very large datasets. The full
    # document list must be passed so it stays aligned with the model's
    # per-document topics; down-sampling is delegated to `sample` (a
    # fraction) instead of handing over a shorter list.
    max_plot_docs = 50000
    if len(documents) > max_plot_docs:
        gr.Info("Dataset is large. Visualizing a sample of 50,000 documents for performance.")
        doc_topic_landscape_plot = topic_model.visualize_documents(
            documents, embeddings=embeddings, sample=max_plot_docs / len(documents)
        )
    else:
        doc_topic_landscape_plot = topic_model.visualize_documents(documents, embeddings=embeddings)

    inter_topic_map_plot = topic_model.visualize_topics()
    num_chart_topics = int(top_n_topics_slider)
    top_topics_barchart_plot = topic_model.visualize_barchart(top_n_topics=num_chart_topics)
    topic_similarity_heatmap_plot = topic_model.visualize_heatmap(top_n_topics=num_chart_topics)
    topic_hierarchy_plot = topic_model.visualize_hierarchy(top_n_topics=num_chart_topics)

    # Labels applied with set_topic_labels are exposed as 'CustomName';
    # 'Name' keeps the auto-generated label. Show the custom one when present.
    label_col = 'CustomName' if 'CustomName' in topics_df.columns else 'Name'
    review_topic_table = topics_df[['Topic', label_col, 'Count']].rename(
        columns={'Topic': 'ID', label_col: 'Topic Name', 'Count': 'Documents'}
    )

    # Candidate date columns for the temporal analysis tab. Numeric columns
    # are skipped: to_datetime reads numbers as offsets from the epoch, so
    # every numeric column (including 'Topic') would otherwise qualify.
    derived_columns = {'combined_text', 'processed_text', 'Topic'}
    date_columns = [
        col for col in df_analysis.columns
        if col not in derived_columns
        and not pd.api.types.is_numeric_dtype(df_analysis[col])
        and pd.to_datetime(df_analysis[col], errors='coerce').notna().any()
    ]

    # Count real topics only; the outlier row (-1) may or may not exist.
    discovered_topics = int((topics_df['Topic'] != -1).sum())

    # === STEP 8: UPDATE UI WITH RESULTS ===
    progress(1.0, desc="Step 8/8: Finalizing UI...")
    return {
        log_output: f"✅ Analysis Complete! Discovered {discovered_topics} topics.",
        # Make result tabs visible
        review_tab: gr.update(visible=True),
        visualize_tab: gr.update(visible=True),
        # Populate the review tab
        review_topic_table_df: gr.update(value=review_topic_table),
        # Populate the visualization tab
        doc_topic_landscape_plot_ui: doc_topic_landscape_plot,
        inter_topic_map_plot_ui: inter_topic_map_plot,
        top_topics_barchart_plot_ui: top_topics_barchart_plot,
        topic_similarity_heatmap_ui: topic_similarity_heatmap_plot,
        topic_hierarchy_plot_ui: topic_hierarchy_plot,
        # Update and enable the temporal analysis tab if date columns exist
        temporal_analysis_group: gr.update(visible=len(date_columns) > 0),
        date_column_dropdown: gr.update(choices=date_columns, value=date_columns[0] if date_columns else None),
    }
312
+
313
+ print("✅ Main analysis pipeline function appended to app.py")
314
+
315
+ # --- AI REFINEMENT AGENT ---
316
+
317
# Markers used by the few-shot prompt below.
_TITLE_MARKER = "আউটপুট শিরোনাম:"
_KEYWORDS_MARKER = "ইনপুট কীওয়ার্ড:"


def _extract_generated_title(generated_text, prompt):
    """Return the headline the model wrote for the final prompt slot, or "" if none."""
    if generated_text.startswith(prompt):
        completion = generated_text[len(prompt):]
    else:
        # The prompt is not an exact prefix: skip as many title markers as the
        # prompt itself contains, so the few-shot examples are never returned.
        pieces = generated_text.split(_TITLE_MARKER)
        skip = prompt.count(_TITLE_MARKER)
        completion = pieces[skip] if len(pieces) > skip else ""
    # Cut the completion where the model starts inventing another example.
    for stop_marker in (_KEYWORDS_MARKER, _TITLE_MARKER, "---"):
        completion = completion.split(stop_marker, 1)[0]
    non_empty_lines = [line.strip() for line in completion.splitlines() if line.strip()]
    return non_empty_lines[0] if non_empty_lines else ""


def _keyword_topic_label(topic_id, keywords, nr_words=4):
    """Build a 'Topic N: kw1, kw2, ...' label from the topic's own keywords."""
    return f"Topic {topic_id}: " + ", ".join(str(word) for word in list(keywords)[:nr_words])


def run_ai_refinement(topic_model, llm_pipeline, progress=gr.Progress()):
    """Name every topic with the LLM and apply the names to the model.

    For each topic except the outlier topic (-1) the topic's keywords are
    placed into a few-shot Bangla prompt and the model's headline becomes the
    label ``"Topic <id>: <headline>"``. If the model fails or produces no
    headline, the label is built from the topic's first four keywords.

    Fixes relative to the previous version:
      * the headline was taken from the text after the *first* title marker,
        which is the first few-shot example, so every topic received that
        example's headline; it is now taken from the model's completion;
      * the fallback called ``topic_model.get_topic_label``, which does not
        appear to be part of BERTopic's API, so a failed generation raised
        from inside the ``except`` handler; the fallback is now computed
        locally from the keywords;
      * progress was reported through ``progress.update(fraction)``, which
        does not set a fraction; the tracker is now called directly.

    Args:
        topic_model: Fitted BERTopic model.
        llm_pipeline: Callable text-generation pipeline.
        progress: Gradio progress tracker.

    Returns:
        The same ``topic_model`` with custom labels set.
    """
    logging.info("Starting AI Refinement Agent...")

    # --- Task 1: AI-Powered Topic Naming ---
    progress(0, desc="AI Agent: Generating Topic Names...")
    topic_info_df = topic_model.get_topic_info()
    new_labels = {}

    # Few-shot Bangla prompt, used once per topic.
    prompt_template = """
আপনি একজন পেশাদার সংবাদ সম্পাদক। আপনার কাজ হলো বাংলাদেশের রাজনৈতিক ঘটনাবলী, বিশেষ করে বিএনপির 'তারুণ্যের সমাবেশ' সংক্রান্ত সংবাদের জন্য একটি সংক্ষিপ্ত ও প্রাসঙ্গিক শিরোনাম তৈরি করা। প্রদত্ত কীওয়ার্ডগুলো ব্যবহার করে একটি (৩-৫ শব্দের) সারগর্ভ বাংলা শিরোনাম লিখুন, যেখানে সমাবেশের মূল বিষয় বা স্থান স্পষ্টভাবে ফুটে উঠবে। উদাহরণগুলো দেখুন।

--- উদাহরণ ---
ইনপুট কীওয়ার্ড: ['খুলনা', 'তারুণ্যের', 'সমাবেশ', 'বিএনপি']
আউটপুট শিরোনাম: খুলনায় বিএনপির তারুণ্যের সমাবেশ

ইনপুট কীওয়ার্ড: ['ঢাকা', 'নয়াপল্টন', 'তারুণ্যের', 'স্রোত', 'বৃষ্টি']
আউটপুট শিরোনাম: ঢাকায় তারুণ্যের সমাবেশে জনতার ঢল

ইনপুট কীওয়ার্ড: ['চট্টগ্রাম', 'বক্তব্য', 'মির্জা ফখরুল', 'শোডাউন']
আউটপুট শিরোনাম: চট্টগ্রামে মির্জা ফখরুলের তারুণ্যের সমাবেশ
--- উদাহরণের শেষ ---

--- আপনার কাজ ---
ইনপুট কীওয়ার্ড: {keywords}
আউটপুট শিরোনাম:
"""

    # Low temperature and a short token budget keep the names conservative.
    generation_params = {
        "temperature": 0.3,
        "max_new_tokens": 30,
        "repetition_penalty": 1.2,
        "do_sample": True
    }

    total_topics = len(topic_info_df)
    for position, (_, row) in enumerate(topic_info_df.iterrows(), start=1):
        topic_id = row['Topic']
        if topic_id == -1:
            # The outlier topic keeps a fixed name.
            new_labels[topic_id] = "Topic -1: Outliers"
        else:
            keywords = row['Representation']
            prompt = prompt_template.format(keywords=keywords)
            try:
                response = llm_pipeline(prompt, **generation_params)
                generated_name = _extract_generated_title(response[0]['generated_text'], prompt)
            except Exception as e:
                logging.error(f"LLM failed for Topic {topic_id}. Error: {e}")
                generated_name = ""

            if generated_name:
                new_labels[topic_id] = f"Topic {topic_id}: {generated_name}"
                logging.info(f"Generated name for Topic {topic_id}: {generated_name}")
            else:
                # Fall back to a keyword label if generation fails.
                new_labels[topic_id] = _keyword_topic_label(topic_id, keywords)

        progress(position / total_topics, desc="AI Agent: Generating Topic Names...")

    # Apply all the new labels at once.
    topic_model.set_topic_labels(new_labels)
    logging.info("✅ AI Naming complete.")

    # --- Task 2: AI-Powered Merging (Conceptual Hook) ---
    # Placeholder for a future enhancement. The logic would be:
    # 1. Calculate topic similarity matrix.
    # 2. Identify pairs with similarity > threshold (e.g., 0.85).
    # 3. Use a "Judge" prompt to ask the LLM if they should be merged.
    # 4. If LLM says "YES", call `topic_model.merge_topics()`.
    logging.info("Skipping AI Topic Merging (conceptual feature).")

    return topic_model
404
+
405
+ print("✅ AI Refinement Agent function appended to app.py")
406
+
407
+ # --- FINAL BACKEND HANDLERS & HELPERS ---
408
+
409
def get_topic_details(topic_id: int):
    """Fetch the details of one topic for the review tab.

    Args:
        topic_id: Id of the selected topic (anything ``int()`` accepts), or
            ``None`` when nothing is selected.

    Returns:
        A dict of Gradio updates with the editable topic name (without its
        ``"Topic N: "`` prefix), a keyword bar chart and a table of
        representative documents. Empty values are returned when no model is
        stored, the topic is unknown, or an error occurs. The outlier topic
        (-1) gets a name but no chart or documents.
    """
    empty_return = {topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
    model = APP_STATE.get("bertopic_model")
    if model is None or topic_id is None:
        return empty_return
    try:
        topic_id = int(topic_id)
        # Passed positionally: a `topic_id=` keyword is not accepted by
        # get_topic_info, so the previous call raised and always fell through
        # to the empty result below.
        topic_info = model.get_topic_info(topic_id)
        if topic_info.empty:
            return empty_return

        # Labels applied with set_topic_labels live in 'CustomName'; 'Name'
        # is the auto-generated label.
        label_col = 'CustomName' if 'CustomName' in topic_info.columns else 'Name'
        topic_name = str(topic_info[label_col].iloc[0])
        # Strip the "Topic X: " prefix for cleaner editing ("-?" also covers
        # the outlier topic's "Topic -1: ").
        cleaned_name = re.sub(r'^Topic -?\d+:\s*', '', topic_name)

        # For the outlier topic, don't generate plots
        if topic_id == -1:
            return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}

        word_cloud_fig = model.visualize_barchart(top_n_topics=1, topics=[topic_id])
        docs_df = pd.DataFrame(model.get_representative_docs(topic_id), columns=['Representative Document'])
        return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: word_cloud_fig, topic_docs_df: docs_df}
    except Exception as e:
        logging.error(f"Error getting topic details for ID {topic_id}: {e}")
        return empty_return
433
+
434
def update_topic_name(topic_id, new_name):
    """Rename one topic and return the refreshed review table.

    Args:
        topic_id: Id of the topic to rename (anything ``int()`` accepts).
        new_name: New name without the ``"Topic N: "`` prefix; the prefix is
            added back for consistency with the other labels.

    Returns:
        A Gradio update carrying the refreshed topic table, or a no-op update
        when no model is stored or an argument is missing.
    """
    model = APP_STATE.get("bertopic_model")
    if not (model and topic_id is not None and new_name):
        return gr.update()  # No change

    topic_id = int(topic_id)
    model.set_topic_labels({topic_id: f"Topic {topic_id}: {new_name}"})
    topics_df = model.get_topic_info()
    APP_STATE["topics_df"] = topics_df
    gr.Info(f"Topic {topic_id} renamed to '{new_name}'")

    # Labels applied with set_topic_labels appear in 'CustomName'; reading
    # 'Name' (the auto-generated label) would never show the rename.
    label_col = 'CustomName' if 'CustomName' in topics_df.columns else 'Name'
    review_table = topics_df[['Topic', label_col, 'Count']].rename(
        columns={'Topic': 'ID', label_col: 'Topic Name', 'Count': 'Documents'}
    )
    return gr.update(value=review_table)
447
+
448
def merge_selected_topics(topics_to_merge):
    """Merge the topics selected in the review tab.

    Fixes relative to the previous version:
      * ``merge_topics`` needs the training documents as its first argument;
        they are now taken from the stored ``final_df``;
      * ids are parsed with an optional minus sign, so ``"Topic -1: ..."`` is
        read as -1 instead of 1;
      * a label without any number no longer raises;
      * the per-document ``Topic`` column is refreshed after the merge.

    Args:
        topics_to_merge: Selected labels, each containing the topic id as its
            first number (e.g. ``"Topic 0: ..."``).

    Returns:
        A dict of Gradio updates: the refreshed table and cleared selection /
        details on success, no-op updates otherwise.
    """
    unchanged = {review_topic_table_df: gr.update(), topic_merger_checkboxgroup: gr.update()}
    model = APP_STATE.get("bertopic_model")
    if not (model and topics_to_merge and len(topics_to_merge) > 1):
        gr.Warning("Please select at least two topics to merge.")
        return unchanged

    # Convert labels like "Topic 0: ..." to integer ids, keeping selection
    # order and ignoring repeats.
    topic_ids = []
    for selected_label in topics_to_merge:
        id_match = re.search(r'-?\d+', str(selected_label))
        if id_match is None:
            gr.Warning(f"Could not read a topic ID from '{selected_label}'.")
            return unchanged
        parsed_id = int(id_match.group())
        if parsed_id not in topic_ids:
            topic_ids.append(parsed_id)
    if len(topic_ids) < 2:
        gr.Warning("Please select at least two topics to merge.")
        return unchanged

    final_df = APP_STATE.get("final_df")
    if final_df is None or 'processed_text' not in final_df.columns:
        gr.Warning("No analyzed documents are available; run the analysis before merging topics.")
        return unchanged

    # The documents must be the ones the model was fitted on, in the same order.
    model.merge_topics(final_df['processed_text'].tolist(), topics_to_merge=topic_ids)

    # After merging, refresh the stored state and the UI components.
    final_df['Topic'] = model.topics_
    topics_df = model.get_topic_info()
    APP_STATE["topics_df"] = topics_df
    # Prefer custom labels when the model still exposes them.
    label_col = 'CustomName' if 'CustomName' in topics_df.columns else 'Name'
    review_topic_table = topics_df[['Topic', label_col, 'Count']].rename(
        columns={'Topic': 'ID', label_col: 'Topic Name', 'Count': 'Documents'}
    )

    gr.Info(f"Successfully merged topics: {topic_ids}")
    return {
        review_topic_table_df: gr.update(value=review_topic_table),
        # Clear the selection and the details view
        topic_merger_checkboxgroup: gr.update(value=[]),
        topic_name_textbox: "",
        topic_word_cloud_plot: None,
        topic_docs_df: pd.DataFrame(),
    }
472
+
473
+
474
def generate_temporal_plot(date_column, progress=gr.Progress()):
    """Build the topics-over-time plot for the chosen date column.

    Args:
        date_column: Name of the column in the stored ``final_df`` to parse
            as timestamps.
        progress: Gradio progress tracker.

    Returns:
        The figure from ``visualize_topics_over_time``, or ``None`` when no
        column is chosen, no analysis is stored, the column holds no valid
        dates, or plotting fails (a warning is shown in the last two cases).
    """
    progress(0, desc="Preparing time data...")
    if not date_column:
        return None
    model, df = APP_STATE.get("bertopic_model"), APP_STATE.get("final_df")
    if model is None or df is None:
        return None

    df_temporal = df.copy()
    df_temporal['timestamp'] = pd.to_datetime(df_temporal[date_column], errors='coerce')
    df_temporal.dropna(subset=['timestamp'], inplace=True)

    if df_temporal.empty:
        gr.Warning(f"The column '{date_column}' contains no valid dates after conversion.")
        return None

    progress(0.6, desc="Generating topic trends over time...")
    try:
        docs_temporal = df_temporal['processed_text'].tolist()
        timestamps_temporal = df_temporal['timestamp'].tolist()
        temporal_kwargs = {}
        if 'Topic' in df_temporal.columns:
            # Rows without a valid date were dropped above, so these documents
            # are a subset of the training set. Their own topic assignments
            # are passed along; otherwise the model's full per-document list
            # would be paired with a shorter document list.
            temporal_kwargs["topics"] = df_temporal['Topic'].tolist()
        topics_over_time = model.topics_over_time(
            docs=docs_temporal, timestamps=timestamps_temporal, **temporal_kwargs
        )
        return model.visualize_topics_over_time(topics_over_time)
    except Exception as e:
        # gr.Error is an exception class and only shows when raised; the
        # former bare call displayed nothing. A warning surfaces the message
        # while keeping the "return None" contract.
        logging.error(f"Temporal plot failed for column '{date_column}': {e}")
        gr.Warning(f"Could not generate temporal plot. This can happen if topics are not found in the selected time range. Error: {e}")
        return None
499
+
500
def generate_media_analysis(media_column):
    """Build a bar chart of the 20 most frequent values in a source column.

    Args:
        media_column: Name of the column in the stored analysis dataframe
            that identifies the media source.

    Returns:
        A ``gr.BarPlot`` of article counts per source, or ``None`` when no
        column is chosen, no data is stored, the column is unknown, or the
        column has no values.
    """
    if not media_column:
        gr.Warning("Please select a media column to analyze.")
        return None
    df = APP_STATE.get("df")
    if df is None or media_column not in df.columns:
        return None

    counts = df[media_column].value_counts().nlargest(20)  # Get top 20 sources
    if counts.empty:
        gr.Warning(f"The column '{media_column}' contains no values to analyze.")
        return None

    plot_df = pd.DataFrame({'Media Source': counts.index, 'Article Count': counts.values})
    # NOTE(review): the former `vertical_guides=` keyword is not among
    # gr.BarPlot's documented parameters and is expected to raise a TypeError;
    # the average it was meant to mark is shown in the title instead.
    return gr.BarPlot(
        plot_df,
        x='Media Source',
        y='Article Count',
        title=f'Top 20 Media Sources by Article Count (average: {counts.mean():.1f})',
        tooltip=['Media Source', 'Article Count'],
        height=500,
    )
522
+
523
def finalize_and_save():
    """Write the labelled documents and topic definitions to disk.

    Two files are produced in the working directory:
      * ``topic_analysis_results.sqlite`` with the tables
        ``topic_definitions`` and ``enriched_documents``;
      * ``labeled_documents.csv`` (UTF-8 with BOM).

    Both outputs carry a ``Topic_Name`` column (previously only the CSV did),
    and the name is the custom label when one is available.

    Returns:
        ``[db_path, csv_path]``, or ``None`` when there is nothing to save.
    """
    from contextlib import closing

    if APP_STATE.get("final_df") is None or APP_STATE.get("topics_df") is None:
        gr.Warning("No data available to save.")
        return None

    final_df_to_save, topics_df_to_save = APP_STATE["final_df"].copy(), APP_STATE["topics_df"].copy()

    # Convert list columns to JSON strings for compatibility
    for col in ['Representation', 'Representative_Docs']:
        if col in topics_df_to_save.columns:
            topics_df_to_save[col] = topics_df_to_save[col].apply(
                lambda x: json.dumps(x) if isinstance(x, list) else x
            )

    # Labels applied with set_topic_labels live in 'CustomName'; 'Name' is the
    # auto-generated label.
    label_col = 'CustomName' if 'CustomName' in topics_df_to_save.columns else 'Name'
    topic_map = topics_df_to_save.set_index('Topic')[label_col].to_dict()
    final_df_to_save['Topic_Name'] = final_df_to_save['Topic'].map(topic_map)

    db_path, csv_path = "topic_analysis_results.sqlite", "labeled_documents.csv"

    # sqlite3's own context manager only commits or rolls back; closing()
    # makes sure the connection is also released.
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:
            topics_df_to_save.to_sql("topic_definitions", conn, if_exists="replace", index=False)
            final_df_to_save.to_sql("enriched_documents", conn, if_exists="replace", index=False)

    final_df_to_save.to_csv(csv_path, index=False, encoding='utf-8-sig')

    gr.Info(f"Results saved to {db_path} and {csv_path}")
    return [db_path, csv_path]
550
+
551
+ print("✅ Final backend handlers appended to app.py")
552
+
553
+ # --- GRADIO UI LAYOUT & EVENT HANDLERS ---
554
+
555
+ with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE) as app:
556
+ gr.Markdown(f"# {APP_TITLE}")
557
+ gr.Markdown(f"*{APP_TAGLINE}*")
558
+
559
+ with gr.Tabs() as tabs:
560
+ # === SETUP & RUN TAB ===
561
+ with gr.TabItem("1. Setup & Run Analysis", id=0):
562
+ with gr.Row():
563
+ with gr.Column(scale=1):
564
+ gr.Markdown("### 1. Data Input")
565
+ file_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
566
+ gsheet_url = gr.Textbox(label="Or Paste Google Sheets URL", placeholder="https://docs.google.com/spreadsheets/d/...")
567
+
568
+ gr.Markdown("### 2. Select Columns")
569
+ text_columns_checkboxgroup = gr.CheckboxGroup(label="Select Text Columns for Analysis", interactive=True)
570
+
571
+ gr.Markdown("### 3. Configure Analysis")
572
+ analysis_mode_radio = gr.Radio(["Discovery Mode", "Manual Seeding"], value="Discovery Mode", label="Analysis Mode")
573
+ manual_seeds_textbox = gr.Textbox(label="Manual Seed Topics (JSON format)", visible=False, lines=5)
574
+ # FIX: Assign the markdown to a variable so we can target it directly
575
+ manual_seeds_example = gr.Markdown("Example: `{\"Topic A\": [\"keyword1\", \"keyword2\"], \"Topic B\": [\"wordA\", \"wordB\"]}`", visible=False)
576
+
577
+ top_n_topics_slider = gr.Slider(label="Number of Topics for Charts", minimum=5, maximum=50, value=15, step=1)
578
+
579
+ gr.Markdown("### 4. Advanced (Optional)")
580
+ enable_ai_merging_checkbox = gr.Checkbox(label="Enable AI Topic Naming (Requires GPU & HF Token)", value=False)
581
+ hf_token_textbox = gr.Textbox(label="Hugging Face Token", type="password", placeholder="hf_...", info="Required if AI is enabled.")
582
+
583
+ start_button = gr.Button("Start Analysis", variant="primary")
584
+
585
+ with gr.Column(scale=2):
586
+ log_output = gr.Textbox(label="Pipeline Progress", lines=25, interactive=False, autoscroll=True)
587
+
588
+ # === REVIEW & FINALIZE TAB ===
589
+ with gr.TabItem("2. Review & Finalize", id=1, visible=False) as review_tab:
590
+ gr.Markdown("### Review, Refine, and Finalize Your Topic Model")
591
+ with gr.Row():
592
+ with gr.Column(scale=2):
593
+ gr.Markdown("**Topics Found**")
594
+ review_topic_table_df = gr.DataFrame(headers=["ID", "Topic Name", "Documents"], interactive=True, wrap=True, scale=2)
595
+ with gr.Column(scale=3):
596
+ gr.Markdown("**Selected Topic Details**")
597
+ topic_id_state = gr.State() # Hidden state to store the selected topic ID
598
+ topic_name_textbox = gr.Textbox(label="Topic Name (Editable)")
599
+ update_name_button = gr.Button("Update Name")
600
+ topic_word_cloud_plot = gr.Plot(label="Top Words for Selected Topic")
601
+ topic_docs_df = gr.DataFrame(headers=["Representative Document"], wrap=True)
602
+
603
+ with gr.Row():
604
+ gr.Markdown("### Manual Topic Merging")
605
+ with gr.Row():
606
+ topic_merger_checkboxgroup = gr.CheckboxGroup(label="Select 2 or more topics to merge", interactive=True)
607
+ merge_button = gr.Button("Merge Selected Topics", variant="stop")
608
+ with gr.Row():
609
+ finalize_button = gr.Button("Save Final Results to Files", variant="primary")
610
+ download_link = gr.File(label="Download Results (SQLite DB and CSV)", file_count="multiple")
611
+
612
+
613
+ # === VISUALIZE & EXPLORE TAB ===
614
+ with gr.TabItem("3. Visualize & Explore", id=2, visible=False) as visualize_tab:
615
+ with gr.Tabs():
616
+ with gr.TabItem("Document Landscape"):
617
+ gr.Markdown("A 2D map of every document, colored by its assigned topic. This shows the overall structure of your data.")
618
+ doc_topic_landscape_plot_ui = gr.Plot()
619
+ with gr.TabItem("Topic Relationships"):
620
+ gr.Markdown("Visualizations showing how topics relate to each other.")
621
+ inter_topic_map_plot_ui = gr.Plot(label="Inter-Topic Distance Map")
622
+ topic_hierarchy_plot_ui = gr.Plot(label="Hierarchical Clustering of Topics")
623
+ topic_similarity_heatmap_ui = gr.Plot(label="Topic Similarity Heatmap")
624
+ with gr.TabItem("Topic Keywords"):
625
+ gr.Markdown("A bar chart showing the most important keywords for the most prominent topics.")
626
+ top_topics_barchart_plot_ui = gr.Plot()
627
+ with gr.TabItem("Temporal Analysis"):
628
+ with gr.Group(visible=False) as temporal_analysis_group:
629
+ gr.Markdown("Select a date column from your data to see how topic popularity has changed over time.")
630
+ with gr.Row():
631
+ date_column_dropdown = gr.Dropdown(label="Select Date Column")
632
+ generate_trends_button = gr.Button("Generate Trend Plot")
633
+ temporal_plot_ui = gr.Plot()
634
+
635
+ # === SOURCE ANALYSIS TAB ===
636
+ with gr.TabItem("4. Source Analysis", id=3, visible=False) as source_tab:
637
+ gr.Markdown("### Analyze the Distribution of News Sources")
638
+ with gr.Row():
639
+ media_column_dropdown = gr.Dropdown(label="Select Your Media/Source Column")
640
+ analyze_media_button = gr.Button("Analyze Sources")
641
+ with gr.Row():
642
+ media_plot = gr.BarPlot()
643
+
644
+ gr.Markdown(f"<div style='text-align: center;'>{APP_FOOTER}</div>")
645
+
646
+ # --- EVENT HANDLERS ---
647
+
648
def update_column_selector(file, url):
    """Populate the column selectors after a data source is provided.

    Args:
        file: Value of the file-upload component, or ``None`` when nothing
            has been uploaded.
        url: Value of the Google Sheet URL textbox; any falsy value is
            treated as "not provided".

    Returns:
        A mapping from UI component to ``gr.update(...)`` covering the text
        column checkbox group, the media/source column dropdown and the
        Source Analysis tab. The tab is made visible only when the data
        loads successfully.
    """
    # Built once: the same "nothing to choose from" state is used both when
    # no source was given and when loading the source fails.
    reset_updates = {
        text_columns_checkboxgroup: gr.update(choices=[], value=None),
        media_column_dropdown: gr.update(choices=[], value=None),
        source_tab: gr.update(visible=False),
    }
    if file is None and not url:
        return reset_updates
    try:
        # NOTE(review): load_data is defined elsewhere in this file;
        # presumably it returns a pandas DataFrame - confirm.
        df = load_data(file, url)
        # Treat both legacy object columns and pandas' dedicated string
        # dtypes as text. Comparing against 'object' alone misses columns
        # stored with a string extension dtype.
        text_cols = [
            col
            for col, dtype in df.dtypes.items()
            if pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        ]
        return {
            text_columns_checkboxgroup: gr.update(choices=text_cols, value=text_cols if text_cols else None),
            media_column_dropdown: gr.update(choices=df.columns.tolist()),
            source_tab: gr.update(visible=True),
        }
    except Exception as e:
        # UI boundary: surface the problem to the user and fall back to the
        # empty state instead of crashing the event handler.
        gr.Warning(f"Failed to read columns: {e}")
        return reset_updates
664
+
665
+ file_upload.upload(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
666
+ gsheet_url.submit(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
667
+
668
+ # FIX: A single, robust function to control the visibility of manual seeding UI elements
669
def toggle_manual_seeding_ui(mode):
    """Show the manual-seeding inputs only in "Manual Seeding" mode.

    Args:
        mode: Currently selected value of the analysis-mode radio control.

    Returns:
        A mapping from each manual-seeding component to a visibility update.
    """
    show_seeding_inputs = (mode == "Manual Seeding")
    seeding_components = (manual_seeds_textbox, manual_seeds_example)
    return {
        component: gr.update(visible=show_seeding_inputs)
        for component in seeding_components
    }
675
+
676
+ analysis_mode_radio.change(
677
+ fn=toggle_manual_seeding_ui,
678
+ inputs=analysis_mode_radio,
679
+ outputs=[manual_seeds_textbox, manual_seeds_example]
680
+ )
681
+
682
+ start_button.click(
683
+ fn=run_analysis_pipeline,
684
+ inputs=[file_upload, gsheet_url, text_columns_checkboxgroup, analysis_mode_radio, manual_seeds_textbox, top_n_topics_slider, enable_ai_merging_checkbox, hf_token_textbox],
685
+ outputs=[log_output, review_tab, visualize_tab, review_topic_table_df, doc_topic_landscape_plot_ui, inter_topic_map_plot_ui,
686
+ top_topics_barchart_plot_ui, topic_similarity_heatmap_ui, topic_hierarchy_plot_ui, temporal_analysis_group, date_column_dropdown]
687
+ )
688
+
689
def on_select_topic(evt: gr.SelectData):
    """Handle a cell selection in the main topic review table.

    Looks up the selected row's topic ID in ``APP_STATE["topics_df"]`` (by
    row position), fetches that topic's details and stores the ID in the
    hidden ``topic_id_state`` so later actions know which topic is active.

    Args:
        evt: Gradio selection event; ``evt.index`` is expected to be a
            ``(row, column)`` pair.

    Returns:
        A mapping from UI component to its new value. When the selection is
        unusable or the lookup fails, the details panel is cleared.
    """
    cleared_panel = {
        topic_id_state: None,
        topic_name_textbox: "",
        topic_word_cloud_plot: None,
        topic_docs_df: pd.DataFrame(),
    }

    # The event payload is decoded from JSON, so the (row, col) index usually
    # arrives as a list rather than a tuple; accept either so that valid
    # selections are not discarded.
    selected_index = evt.index
    if not isinstance(selected_index, (list, tuple)) or len(selected_index) == 0:
        return cleared_panel

    try:
        topic_id_val = APP_STATE["topics_df"].iloc[selected_index[0]]['ID']
        # NOTE(review): get_topic_details is defined elsewhere; it is used
        # here as a mutable mapping keyed by output component - confirm.
        details = get_topic_details(topic_id_val)
        details[topic_id_state] = topic_id_val  # Store the ID in the hidden state
        return details
    except Exception:
        # Keep the UI responsive, but record the failure instead of hiding it.
        logging.exception("Failed to load details for the selected topic")
        return cleared_panel
700
+
701
+ review_topic_table_df.select(fn=on_select_topic, outputs=[topic_id_state, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
702
+
703
+ # Connect the new manual refinement buttons
704
+ update_name_button.click(fn=update_topic_name, inputs=[topic_id_state, topic_name_textbox], outputs=[review_topic_table_df])
705
+
706
+ # When the main results are generated, populate the topic merger checklist
707
+ review_topic_table_df.change(lambda df: gr.update(choices=df['Topic Name'].tolist()), inputs=review_topic_table_df, outputs=topic_merger_checkboxgroup)
708
+
709
+ merge_button.click(fn=merge_selected_topics, inputs=[topic_merger_checkboxgroup], outputs=[review_topic_table_df, topic_merger_checkboxgroup, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
710
+
711
+ # Connect the new Source Analysis tab
712
+ analyze_media_button.click(fn=generate_media_analysis, inputs=[media_column_dropdown], outputs=[media_plot])
713
+
714
+ # Other handlers
715
+ generate_trends_button.click(fn=generate_temporal_plot, inputs=[date_column_dropdown], outputs=[temporal_plot_ui])
716
+ finalize_button.click(fn=finalize_and_save, inputs=[], outputs=[download_link])
717
+
718
+ # --- LAUNCH THE APP ---
719
+ if __name__ == "__main__":
720
+ app.launch(debug=True, share=True)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ scikit-learn
4
+ bertopic[visualization]
5
+ sentence_transformers
6
+ torch
7
+ transformers
8
+ accelerate
9
+ bitsandbytes
10
+ huggingface_hub
11
+ requests