Spaces:

Arjon07CSE
/

Social-Perception-Analyzer

Sleeping

App Files Community

Arjon07CSE committed on Aug 11, 2025

Commit

db79a4d

verified ·

1 Parent(s): fe9fa1b

Upload 2 files

Browse files

Added the main code and the requirements file

Files changed (2) hide show

app.py +720 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,720 @@

+# --- IMPORTS & GLOBAL SETUP ---
+import gradio as gr
+import pandas as pd
+import numpy as np
+import torch
+import re
+import sqlite3
+import json
+import logging
+import requests
+from io import StringIO
+# Transformers and BERTopic components
+from transformers import pipeline, BitsAndBytesConfig
+from sentence_transformers import SentenceTransformer
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+from umap import UMAP
+from hdbscan import HDBSCAN
+from sklearn.feature_extraction.text import CountVectorizer
+# Hugging Face and Colab integration (optional, for LLM access)
+from huggingface_hub import login
+# from google.colab import userdata # We will disable this for HF Spaces deployment
+# Setup basic logging to monitor the application's health
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+# A simple dictionary to hold data between UI interactions, acting as a session state.
+APP_STATE = {
+    "df": None,
+    "bertopic_model": None,
+    "topics_df": None,
+    "final_df": None,
+}
+print("✅ app.py created. Initial imports written.")
+print("✅ Dependencies installed in Colab environment.")
+# --- TEXT PREPROCESSING & NORMALIZATION ---
+# A comprehensive list of Bangla stop words, tailored for news and general text.
+BANGLA_STOP_WORDS = [
+    'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
+    'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
+    'এতটাই', 'এতদূর', 'এতটুকু', 'এক', 'এবং', 'এবার', 'এমন', 'এমনভাবে', 'এর', 'এরা', 'এঁরা', 'এঁদের',
+    'এই', 'এইভাবে', 'ও', 'ওঁরা', 'ওঁর', 'ওঁদের', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'কাছ', 'কাছে', 'কাজ',
+    'কারণ', 'কিছু', 'কিছুই', 'কিন্তু', 'কিভাবে', 'কেন', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'খুব',
+    'গুলি', 'গিয়ে', 'চায়', 'ছাড়া', 'জন্য', 'জানা', 'ঠিক', 'তিনি', 'তিন', 'তিনিও', 'তাকে', 'তাঁকে',
+    'তার', 'তাঁর', 'তারা', 'তাঁরা', 'তাদের', 'তাঁদের', 'তাহলে', ' থাকলেও', 'থেকে', 'মধ্যেই', 'মধ্যে',
+    'द्वारा', 'নয়', 'না', 'নিজের', 'নিজে', 'নিয়ে', 'পারেন', 'পারা', 'পারে', 'পরে', 'পর্যন্ত', 'পুনরায়',
+    'ফলে', 'বজায়', 'বা', 'বাদে', 'বার', 'বিশেষ', 'বিভিন্ন', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মাধ্যমে',
+    'মতো', 'মতোই', 'যখন', 'যদি', 'যদিও', 'যা', 'যাকে', 'যাওয়া', 'যায়', 'যে', 'যেখানে', 'যেতে', 'যেমন',
+    'যেহেতু', 'রহিছে', 'শিক্ষা', 'শুধু', 'সঙ্গে', 'সব', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সাধারণ', 'সামনে', 'হতে',
+    'হতেই', 'হবে', 'হয়', 'হয়তো', 'হয়', 'হচ্ছে', 'হত', 'হলে', 'হলেও', 'হয়নি', 'হাজার', 'হোওয়া', 'আরও', 'আমরা',
+    'আমার', 'আমি', 'আর', 'আগে', 'আগেই', 'আছে', 'আজ', 'তাকে', 'তাতে', 'তাদের', 'তাহার', 'তাহাতে', 'তাহারই',
+    'তথা', 'তথাপি', 'সে', 'সেই', 'সেখান', 'সেখানে', 'থেকে', 'নাকি', 'নাগাদ', 'দু', 'দুটি', 'সুতরাং',
+    'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
+]
+def normalize_bangla_manual(text):
+    """A robust, self-contained function to normalize Bangla text."""
+    if not isinstance(text, str): return ""
+    replacements = {
+        '[\u09F7]': '\u09B0', '[\u09F2]': '\u09B2', '[\u09E4]': '\u098B', '[\u09E5]': '\u09E1',
+        '[\u09FA]': '\u09B8\u09CD\u09AE', '[\u09FB]': '\u0995\u09CD\u09B7', '[\u0970]': '\u0966',
+        '[\u09F3]': '\u09B0\u09C2', '[\u09F8]': '\u09A3', '[\u09F9]': '\u09B6', '[\u0984]': '',
+        '[\u0980]': '\u0981', r'(\s)।(\s)': r'\1।\2', r'(\S)।(\S)': r'\1 । \2',
+        '[\u0964][\u0964]': '\u0964', '[|]': '\u0964', '[\u09DC]': '\u09A1\u09BC',
+        '[\u09DD]': '\u09A2\u09BC', '[\u09DF]': '\u09AF\u09BC',
+    }
+    for old, new in replacements.items():
+        text = re.sub(old, new, text)
+    return text
+def preprocess_bangla_text(text):
+    """Cleans and normalizes a single Bangla text string for NLP tasks."""
+    if not isinstance(text, str): return ""
+    text = normalize_bangla_manual(text)
+    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
+    text = re.sub(r'\S*@\S*\s?', '', text)
+    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
+    words = text.split()
+    words = [word for word in words if word not in BANGLA_STOP_WORDS]
+    text = " ".join(words)
+    return re.sub(r'\s+', ' ', text).strip()
+print("✅ Helper functions appended to app.py")
+# --- APP BRANDING & CONFIGURATION ---
+# Easily update the application's title, tagline, and footer here.
+# APP_TITLE is used as the browser/page title of the Gradio Blocks app and as the top heading.
+APP_TITLE = "Social Perception Analyzer"
+# APP_TAGLINE is rendered in italics directly under the heading.
+APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
+# APP_FOOTER is rendered centered at the bottom of the page.
+APP_FOOTER = "Developed by Centre for Data Science Research (CDSR), and Strategy and Policy Forum (SPF)"
+# --- LOCAL LLM INITIALIZATION ---
+def initialize_local_llm(hf_token=None):
+    """
+    Initializes and returns a local, quantized, lightweight LLM pipeline.
+    This model is chosen for its efficiency and Bangla language specialization.
+    """
+    model_id = "hishab/titulm-llama-3.2-1b-v1.1"
+    # 4-bit quantization to reduce memory usage significantly
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )
+    try:
+        # Check for GPU availability
+        if not torch.cuda.is_available():
+            logging.warning("GPU not available. LLM will run on CPU and be very slow.")
+            llm_pipeline = pipeline("text-generation", model=model_id, token=hf_token)
+        else:
+            logging.info(f"Initializing quantized local LLM: {model_id} on GPU.")
+            llm_pipeline = pipeline(
+                "text-generation",
+                model=model_id,
+                model_kwargs={"quantization_config": quantization_config},
+                device_map="auto",
+                token=hf_token
+            )
+        return llm_pipeline
+    except Exception as e:
+        logging.error(f"Failed to initialize local LLM: {e}")
+        # Add a note about potential trust issues for some models
+        logging.info("Trying again with 'trust_remote_code=True'.")
+        try:
+             llm_pipeline = pipeline(
+                "text-generation",
+                model=model_id,
+                model_kwargs={"trust_remote_code": True, "quantization_config": quantization_config},
+                device_map="auto",
+                token=hf_token
+            )
+             return llm_pipeline
+        except Exception as e2:
+             logging.error(f"Secondary attempt failed: {e2}")
+             gr.Warning("Could not initialize the local LLM. AI features will be disabled.")
+             return None
+# --- DATA LOADING HELPER ---
+def load_data(file_obj, gsheet_url):
+    """Loads a DataFrame from either an uploaded file or a Google Sheets URL."""
+    if file_obj is not None:
+        logging.info(f"Loading data from uploaded file: {file_obj.name}")
+        return pd.read_csv(file_obj.name)
+    elif gsheet_url and gsheet_url.strip():
+        logging.info(f"Loading data from Google Sheets URL.")
+        try:
+            # Manipulate the URL for direct CSV export
+            csv_url = gsheet_url.replace('/edit?usp=sharing', '/export?format=csv&gid=0')
+            response = requests.get(csv_url)
+            response.raise_for_status() # Raise an exception for bad status codes
+            return pd.read_csv(StringIO(response.text))
+        except Exception as e:
+            raise ValueError(f"Failed to load from Google Sheets URL. Please ensure the link is correct and publicly accessible. Error: {e}")
+    else:
+        raise ValueError("Please upload a CSV file or provide a public Google Sheets URL.")
+print("✅ App branding, LLM initialization, and data loading functions appended to app.py")
+# --- MAIN ANALYSIS ENGINE ---
+# The LLM pipeline is created lazily on the first AI-enabled run; `run_ai_refinement` is defined further down in this file.
+LLM_PIPELINE = None
+def run_analysis_pipeline(file_obj, gsheet_url, text_columns, analysis_mode, manual_seeds,
+                          top_n_topics_slider, enable_ai_merging, hf_token, progress=gr.Progress()):
+    """
+    The main orchestrator function for the analysis pipeline.
+    This function incorporates all our agreed-upon refinements.
+    """
+    global LLM_PIPELINE
+    if enable_ai_merging and LLM_PIPELINE is None:
+        progress(0, desc="Initializing LLM...")
+        LLM_PIPELINE = initialize_local_llm(hf_token)
+        if LLM_PIPELINE is None:
+            gr.Warning("AI features enabled, but LLM failed to initialize. Skipping AI steps.")
+            enable_ai_merging = False
+    # === STEP 1: LOAD AND VALIDATE DATA ===
+    progress(0.1, desc="Step 1/8: Loading and Validating Data...")
+    try:
+        df = load_data(file_obj, gsheet_url)
+        if not text_columns: raise ValueError("Please select at least one text column to analyze.")
+        df['combined_text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
+        df.dropna(subset=['combined_text'], inplace=True)
+        df['processed_text'] = df['combined_text'].apply(preprocess_bangla_text)
+        # REFINEMENT: Filter by word count for more robust document validation.
+        df_analysis = df[df['processed_text'].str.split().str.len() > 2].copy()
+        if df_analysis.empty:
+            raise ValueError("No documents with sufficient content found after cleaning. Please check your data and column selection.")
+        documents = df_analysis['processed_text'].tolist()
+        APP_STATE["df"] = df_analysis # Save the analyzable dataframe
+    except Exception as e:
+        logging.error(f"Data Loading Error: {e}")
+        return {log_output: f"Error during data loading: {e}"}
+    # === STEP 2: PREPARE GUIDANCE (IF MANUAL SEEDING) ===
+    progress(0.2, desc="Step 2/8: Preparing Analysis Mode...")
+    y_guidance = None
+    if analysis_mode == "Manual Seeding" and manual_seeds:
+        try:
+            seed_topics_dict = json.loads(manual_seeds)
+            y_guidance = [-1] * len(documents)
+            topic_name_to_id = {name: i for i, name in enumerate(seed_topics_dict.keys())}
+            for i, doc in enumerate(documents):
+                for topic_name, keywords in seed_topics_dict.items():
+                    if any(keyword in doc for keyword in keywords):
+                        y_guidance[i] = topic_name_to_id[topic_name]
+                        break # Prioritizes the first match in the JSON
+        except Exception as e:
+            return {log_output: f"Error: Invalid JSON in Manual Seeds. Details: {e}"}
+    # === STEP 3: EMBEDDINGS & MODEL SETUP (WITH REFINEMENTS) ===
+    progress(0.3, desc="Step 3/8: Calculating Document Embeddings...")
+    embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
+    embeddings = embedding_model.encode(documents, show_progress_bar=True)
+    # REFINEMENT: Lower min_cluster_size for more sensitive topic detection.
+    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
+    # REFINEMENT: Use max_df and min_df for adaptive stop word filtering.
+    vectorizer_model = CountVectorizer(tokenizer=lambda doc: doc.split(), ngram_range=(1, 3), max_df=0.90, min_df=5)
+    # Other components remain robust
+    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
+    representation_model = KeyBERTInspired()
+    # === STEP 4: TRAIN TOPIC MODEL ===
+    progress(0.5, desc="Step 4/8: Training BERTopic Model...")
+    topic_model = BERTopic(
+        embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
+        vectorizer_model=vectorizer_model, representation_model=representation_model,
+        language="multilingual", verbose=False
+    )
+    topics, _ = topic_model.fit_transform(documents, embeddings, y=y_guidance)
+    # === STEP 5: AI REFINEMENT (IF ENABLED) ===
+    if enable_ai_merging and LLM_PIPELINE:
+        progress(0.6, desc="Step 5/8: Running AI Refinement Agent...")
+        # We will define `run_ai_refinement` in the next cell. This is the hook.
+        topic_model = run_ai_refinement(topic_model, LLM_PIPELINE, progress)
+    else:
+        progress(0.6, desc="Step 5/8: Skipping AI Refinement...")
+        # Fallback to default naming if AI is disabled
+        generated_labels = topic_model.generate_topic_labels(nr_words=4, separator=", ")
+        topic_model.set_topic_labels(generated_labels)
+    # === STEP 6: APPLY MANUAL SEED NAMES ===
+    progress(0.7, desc="Step 6/8: Finalizing Topic Names...")
+    if analysis_mode == "Manual Seeding" and 'seed_topics_dict' in locals():
+        for topic_name, topic_id in topic_name_to_id.items():
+            if topic_id in topic_model.get_topic_info()['Topic'].values:
+                topic_model.set_topic_labels({topic_id: topic_name})
+    # === STEP 7: PREPARE FINAL OUTPUTS & VISUALIZATIONS ===
+    progress(0.85, desc="Step 7/8: Preparing Visualizations...")
+    APP_STATE["bertopic_model"] = topic_model
+    df_analysis['Topic'] = topics
+    APP_STATE["final_df"] = df_analysis
+    topics_df = topic_model.get_topic_info()
+    APP_STATE["topics_df"] = topics_df
+    # REFINEMENT: Safeguard against memory errors on very large datasets.
+    if len(documents) > 50000:
+        gr.Info("Dataset is large. Visualizing a sample of 50,000 documents for performance.")
+        indices = np.random.choice(len(documents), 50000, replace=False)
+        sampled_docs = [documents[i] for i in indices]
+        sampled_embeddings = embeddings[indices]
+        doc_topic_landscape_plot = topic_model.visualize_documents(sampled_docs, embeddings=sampled_embeddings)
+    else:
+        doc_topic_landscape_plot = topic_model.visualize_documents(documents, embeddings=embeddings)
+    inter_topic_map_plot = topic_model.visualize_topics()
+    # REFINEMENT: Use slider value for dynamic chart generation.
+    num_chart_topics = int(top_n_topics_slider)
+    top_topics_barchart_plot = topic_model.visualize_barchart(top_n_topics=num_chart_topics)
+    topic_similarity_heatmap_plot = topic_model.visualize_heatmap(top_n_topics=num_chart_topics)
+    topic_hierarchy_plot = topic_model.visualize_hierarchy(top_n_topics=num_chart_topics)
+    review_topic_table = topics_df[['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'})
+    # Check for date columns for the temporal analysis tab
+    date_columns = [col for col in df_analysis.columns if pd.to_datetime(df_analysis[col], errors='coerce').notna().any()]
+    # === STEP 8: UPDATE UI WITH RESULTS ===
+    progress(1.0, desc="Step 8/8: Finalizing UI...")
+    return {
+        log_output: f"✅ Analysis Complete! Discovered {len(topics_df)-1} topics.",
+        # Make result tabs visible
+        review_tab: gr.update(visible=True),
+        visualize_tab: gr.update(visible=True),
+        # Populate the review tab
+        review_topic_table_df: gr.update(value=review_topic_table),
+        # Populate the visualization tab
+        doc_topic_landscape_plot_ui: doc_topic_landscape_plot,
+        inter_topic_map_plot_ui: inter_topic_map_plot, # Hook for the fixed plot
+        top_topics_barchart_plot_ui: top_topics_barchart_plot,
+        topic_similarity_heatmap_ui: topic_similarity_heatmap_plot,
+        topic_hierarchy_plot_ui: topic_hierarchy_plot,
+        # Update and enable the temporal analysis tab if date columns exist
+        temporal_analysis_group: gr.update(visible=len(date_columns) > 0),
+        date_column_dropdown: gr.update(choices=date_columns, value=date_columns[0] if date_columns else None),
+    }
+print("✅ Main analysis pipeline function appended to app.py")
+# --- AI REFINEMENT AGENT ---
+def run_ai_refinement(topic_model, llm_pipeline, progress=gr.Progress()):
+    """
+    Uses a lightweight LLM to generate high-quality, contextual topic names.
+    Includes a conceptual hook for future AI-powered topic merging.
+    """
+    logging.info("Starting AI Refinement Agent...")
+    # --- Task 1: AI-Powered Topic Naming ---
+    progress(0, desc="AI Agent: Generating Topic Names...")
+    topic_info_df = topic_model.get_topic_info()
+    new_labels = {}
+    # This is the advanced, few-shot Bangla prompt we designed.
+    # It will be used for each topic.
+    prompt_template = """
+আপনি একজন পেশাদার সংবাদ সম্পাদক। আপনার কাজ হলো বাংলাদেশের রাজনৈতিক ঘটনাবলী, বিশেষ করে বিএনপির 'তারুণ্যের সমাবেশ' সংক্রান্ত সংবাদের জন্য একটি সংক্ষিপ্ত ও প্রাসঙ্গিক শিরোনাম তৈরি করা। প্রদত্ত কীওয়ার্ডগুলো ব্যবহার করে একটি (৩-৫ শব্দের) সারগর্ভ বাংলা শিরোনাম লিখুন, যেখানে সমাবেশের মূল বিষয় বা স্থান স্পষ্টভাবে ফুটে উঠবে। উদাহরণগুলো দেখুন।
+--- উদাহরণ ---
+ইনপুট কীওয়ার্ড: ['খুলনা', 'তারুণ্যের', 'সমাবেশ', 'বিএনপি']
+আউটপুট শিরোনাম: খুলনায় বিএনপির তারুণ্যের সমাবেশ
+ইনপুট কীওয়ার্ড: ['ঢাকা', 'নয়াপল্টন', 'তারুণ্যের', 'স্রোত', 'বৃষ্টি']
+আউটপুট শিরোনাম: ঢাকায় তারুণ্যের সমাবেশে জনতার ঢল
+ইনপুট কীওয়ার্ড: ['চট্টগ্রাম', 'বক্তব্য', 'মির্জা ফখরুল', 'শোডাউন']
+আউটপুট শিরোনাম: চট্টগ্রামে মির্জা ফখরুলের তারুণ্যের সমাবেশ
+--- উদাহরণের শেষ ---
+--- আপনার কাজ ---
+ইনপুট কীওয়ার্ড: {keywords}
+আউটপুট শিরোনাম:
+"""
+    # Tuned parameters for reliable, non-creative naming
+    generation_params = {
+        "temperature": 0.3,
+        "max_new_tokens": 30,
+        "repetition_penalty": 1.2,
+        "do_sample": True
+    }
+    # Iterate through each topic to generate a new name
+    for index, row in topic_info_df.iterrows():
+        topic_id = row['Topic']
+        if topic_id == -1:
+            # We don't rename the outlier topic
+            new_labels[topic_id] = "Topic -1: Outliers"
+            continue
+        keywords = row['Representation']
+        # Format the prompt for the current topic
+        prompt = prompt_template.format(keywords=keywords)
+        try:
+            # Call the LLM pipeline
+            response = llm_pipeline(prompt, **generation_params)
+            # Extract the generated text, stripping whitespace and the prompt's artifacts
+            generated_name = response[0]['generated_text'].split("আউটপুট শিরোনাম:")[1].strip()
+            if generated_name:
+                new_labels[topic_id] = f"Topic {topic_id}: {generated_name}"
+                logging.info(f"Generated name for Topic {topic_id}: {generated_name}")
+            else:
+                # Fallback to default name if generation fails
+                new_labels[topic_id] = topic_model.get_topic_label(topic_id, nr_words=4)
+        except Exception as e:
+            logging.error(f"LLM failed for Topic {topic_id}. Error: {e}")
+            # Fallback for safety
+            new_labels[topic_id] = topic_model.get_topic_label(topic_id, nr_words=4)
+        progress.update((index + 1) / len(topic_info_df))
+    # Apply all the new, AI-generated labels at once
+    topic_model.set_topic_labels(new_labels)
+    logging.info("✅ AI Naming complete.")
+    # --- Task 2: AI-Powered Merging (Conceptual Hook) ---
+    # This section is a placeholder for a future enhancement.
+    # The logic would be:
+    # 1. Calculate topic similarity matrix.
+    # 2. Identify pairs with similarity > threshold (e.g., 0.85).
+    # 3. Use a "Judge" prompt to ask the LLM if they should be merged.
+    # 4. If LLM says "YES", call `topic_model.merge_topics()`.
+    logging.info("Skipping AI Topic Merging (conceptual feature).")
+    return topic_model
+print("✅ AI Refinement Agent function appended to app.py")
+# --- FINAL BACKEND HANDLERS & HELPERS ---
+def get_topic_details(topic_id: int):
+    """Fetches details for a selected topic to display in the review tab."""
+    empty_return = {topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
+    model = APP_STATE.get("bertopic_model")
+    if model is None or topic_id is None: return empty_return
+    try:
+        topic_id = int(topic_id)
+        topic_info = model.get_topic_info(topic_id=topic_id)
+        if topic_info.empty: return empty_return
+        # Strip the "Topic X: " prefix for cleaner editing
+        topic_name = topic_info['Name'].iloc[0]
+        cleaned_name = re.sub(r'^Topic \d+:\s*', '', topic_name)
+        # For the outlier topic, don't generate plots
+        if topic_id == -1:
+            return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
+        word_cloud_fig = model.visualize_barchart(top_n_topics=1, topics=[topic_id])
+        docs_df = pd.DataFrame(model.get_representative_docs(topic_id), columns=['Representative Document'])
+        return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: word_cloud_fig, topic_docs_df: docs_df}
+    except Exception as e:
+        logging.error(f"Error getting topic details for ID {topic_id}: {e}")
+        return empty_return
+def update_topic_name(topic_id, new_name):
+    """Handler for manual topic renaming."""
+    model = APP_STATE.get("bertopic_model")
+    if model and topic_id is not None and new_name:
+        topic_id = int(topic_id)
+        # Add the prefix back for consistency
+        full_name = f"Topic {topic_id}: {new_name}"
+        model.set_topic_labels({topic_id: full_name})
+        APP_STATE["topics_df"] = model.get_topic_info()
+        gr.Info(f"Topic {topic_id} renamed to '{new_name}'")
+        # Return the updated table for the UI
+        return gr.update(value=APP_STATE["topics_df"][['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'}))
+    return gr.update() # No change
+def merge_selected_topics(topics_to_merge):
+    """Handler for manual topic merging."""
+    model = APP_STATE.get("bertopic_model")
+    if model and topics_to_merge and len(topics_to_merge) > 1:
+        # Convert topic names like "Topic 0: ..." to integer IDs
+        topic_ids = [int(re.search(r'\d+', t).group()) for t in topics_to_merge]
+        model.merge_topics(topics_to_merge=[topic_ids])
+        # After merging, we need to refresh the state and UI components
+        APP_STATE["topics_df"] = model.get_topic_info()
+        review_topic_table = APP_STATE["topics_df"][['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'})
+        gr.Info(f"Successfully merged topics: {topic_ids}")
+        return {
+            review_topic_table_df: gr.update(value=review_topic_table),
+            # Clear the selection and the details view
+            topic_merger_checkboxgroup: gr.update(value=[]),
+            topic_name_textbox: "",
+            topic_word_cloud_plot: None,
+            topic_docs_df: pd.DataFrame(),
+        }
+    gr.Warning("Please select at least two topics to merge.")
+    return {review_topic_table_df: gr.update(), topic_merger_checkboxgroup: gr.update()}
+def generate_temporal_plot(date_column, progress=gr.Progress()):
+    """Generates and displays the topics over time plot."""
+    progress(0, desc="Preparing time data...")
+    if not date_column: return None
+    model, df = APP_STATE.get("bertopic_model"), APP_STATE.get("final_df")
+    if model is None or df is None: return None
+    df_temporal = df.copy()
+    df_temporal['timestamp'] = pd.to_datetime(df_temporal[date_column], errors='coerce')
+    df_temporal.dropna(subset=['timestamp'], inplace=True)
+    if df_temporal.empty:
+        gr.Warning(f"The column '{date_column}' contains no valid dates after conversion.")
+        return None
+    progress(0.6, desc="Generating topic trends over time...")
+    try:
+        # BERTopic requires the original documents and timestamps for this plot
+        docs_temporal = df_temporal['processed_text'].tolist()
+        timestamps_temporal = df_temporal['timestamp'].tolist()
+        topics_over_time = model.topics_over_time(docs=docs_temporal, timestamps=timestamps_temporal)
+        return model.visualize_topics_over_time(topics_over_time)
+    except Exception as e:
+        gr.Error(f"Could not generate temporal plot. This can happen if topics are not found in the selected time range. Error: {e}")
+        return None
+def generate_media_analysis(media_column):
+    """Generates a bar chart for media source analysis."""
+    if not media_column:
+        gr.Warning("Please select a media column to analyze.")
+        return None
+    df = APP_STATE.get("df")
+    if df is None or media_column not in df.columns:
+        return None
+    counts = df[media_column].value_counts().nlargest(20) # Get top 20 sources
+    # Using Gradio's built-in plotting for simplicity
+    plot_df = pd.DataFrame({'Media Source': counts.index, 'Article Count': counts.values})
+    return gr.BarPlot(
+        plot_df,
+        x='Media Source',
+        y='Article Count',
+        title=f'Top 20 Media Sources by Article Count',
+        tooltip=['Media Source', 'Article Count'],
+        height=500,
+        vertical_guides=[{'value': counts.mean(), 'label': 'Average'}]
+    )
+def finalize_and_save():
+    """Saves the final DataFrame and topic definitions to files."""
+    if APP_STATE.get("final_df") is None or APP_STATE.get("topics_df") is None:
+        gr.Warning("No data available to save.")
+        return None
+    final_df_to_save, topics_df_to_save = APP_STATE["final_df"].copy(), APP_STATE["topics_df"].copy()
+    # Convert list columns to JSON strings for compatibility
+    for col in ['Representation', 'Representative_Docs']:
+        if col in topics_df_to_save.columns:
+            topics_df_to_save[col] = topics_df_to_save[col].apply(
+                lambda x: json.dumps(x) if isinstance(x, list) else x
+            )
+    db_path, csv_path = "topic_analysis_results.sqlite", "labeled_documents.csv"
+    with sqlite3.connect(db_path) as conn:
+        topics_df_to_save.to_sql("topic_definitions", conn, if_exists="replace", index=False)
+        final_df_to_save.to_sql("enriched_documents", conn, if_exists="replace", index=False)
+    topic_map = topics_df_to_save.set_index('Topic')['Name'].to_dict()
+    final_df_to_save['Topic_Name'] = final_df_to_save['Topic'].map(topic_map)
+    final_df_to_save.to_csv(csv_path, index=False, encoding='utf-8-sig')
+    gr.Info(f"Results saved to {db_path} and {csv_path}")
+    return [db_path, csv_path]
+print("✅ Final backend handlers appended to app.py")
+# --- GRADIO UI LAYOUT & EVENT HANDLERS ---
+with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE) as app:
+    gr.Markdown(f"# {APP_TITLE}")
+    gr.Markdown(f"*{APP_TAGLINE}*")
+    with gr.Tabs() as tabs:
+        # === SETUP & RUN TAB ===
+        with gr.TabItem("1. Setup & Run Analysis", id=0):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### 1. Data Input")
+                    file_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
+                    gsheet_url = gr.Textbox(label="Or Paste Google Sheets URL", placeholder="https://docs.google.com/spreadsheets/d/...")
+                    gr.Markdown("### 2. Select Columns")
+                    text_columns_checkboxgroup = gr.CheckboxGroup(label="Select Text Columns for Analysis", interactive=True)
+                    gr.Markdown("### 3. Configure Analysis")
+                    analysis_mode_radio = gr.Radio(["Discovery Mode", "Manual Seeding"], value="Discovery Mode", label="Analysis Mode")
+                    manual_seeds_textbox = gr.Textbox(label="Manual Seed Topics (JSON format)", visible=False, lines=5)
+                    # FIX: Assign the markdown to a variable so we can target it directly
+                    manual_seeds_example = gr.Markdown("Example: `{\"Topic A\": [\"keyword1\", \"keyword2\"], \"Topic B\": [\"wordA\", \"wordB\"]}`", visible=False)
+                    top_n_topics_slider = gr.Slider(label="Number of Topics for Charts", minimum=5, maximum=50, value=15, step=1)
+                    gr.Markdown("### 4. Advanced (Optional)")
+                    enable_ai_merging_checkbox = gr.Checkbox(label="Enable AI Topic Naming (Requires GPU & HF Token)", value=False)
+                    hf_token_textbox = gr.Textbox(label="Hugging Face Token", type="password", placeholder="hf_...", info="Required if AI is enabled.")
+                    start_button = gr.Button("Start Analysis", variant="primary")
+                with gr.Column(scale=2):
+                    log_output = gr.Textbox(label="Pipeline Progress", lines=25, interactive=False, autoscroll=True)
+        # === REVIEW & FINALIZE TAB ===
+        with gr.TabItem("2. Review & Finalize", id=1, visible=False) as review_tab:
+            gr.Markdown("### Review, Refine, and Finalize Your Topic Model")
+            with gr.Row():
+                with gr.Column(scale=2):
+                    gr.Markdown("**Topics Found**")
+                    review_topic_table_df = gr.DataFrame(headers=["ID", "Topic Name", "Documents"], interactive=True, wrap=True, scale=2)
+                with gr.Column(scale=3):
+                    gr.Markdown("**Selected Topic Details**")
+                    topic_id_state = gr.State() # Hidden state to store the selected topic ID
+                    topic_name_textbox = gr.Textbox(label="Topic Name (Editable)")
+                    update_name_button = gr.Button("Update Name")
+                    topic_word_cloud_plot = gr.Plot(label="Top Words for Selected Topic")
+                    topic_docs_df = gr.DataFrame(headers=["Representative Document"], wrap=True)
+            with gr.Row():
+                gr.Markdown("### Manual Topic Merging")
+            with gr.Row():
+                topic_merger_checkboxgroup = gr.CheckboxGroup(label="Select 2 or more topics to merge", interactive=True)
+                merge_button = gr.Button("Merge Selected Topics", variant="stop")
+            with gr.Row():
+                finalize_button = gr.Button("Save Final Results to Files", variant="primary")
+                download_link = gr.File(label="Download Results (SQLite DB and CSV)", file_count="multiple")
+        # === VISUALIZE & EXPLORE TAB ===
+        with gr.TabItem("3. Visualize & Explore", id=2, visible=False) as visualize_tab:
+            with gr.Tabs():
+                with gr.TabItem("Document Landscape"):
+                    gr.Markdown("A 2D map of every document, colored by its assigned topic. This shows the overall structure of your data.")
+                    doc_topic_landscape_plot_ui = gr.Plot()
+                with gr.TabItem("Topic Relationships"):
+                    gr.Markdown("Visualizations showing how topics relate to each other.")
+                    inter_topic_map_plot_ui = gr.Plot(label="Inter-Topic Distance Map")
+                    topic_hierarchy_plot_ui = gr.Plot(label="Hierarchical Clustering of Topics")
+                    topic_similarity_heatmap_ui = gr.Plot(label="Topic Similarity Heatmap")
+                with gr.TabItem("Topic Keywords"):
+                    gr.Markdown("A bar chart showing the most important keywords for the most prominent topics.")
+                    top_topics_barchart_plot_ui = gr.Plot()
+                with gr.TabItem("Temporal Analysis"):
+                    with gr.Group(visible=False) as temporal_analysis_group:
+                        gr.Markdown("Select a date column from your data to see how topic popularity has changed over time.")
+                        with gr.Row():
+                            date_column_dropdown = gr.Dropdown(label="Select Date Column")
+                            generate_trends_button = gr.Button("Generate Trend Plot")
+                        temporal_plot_ui = gr.Plot()
+        # === SOURCE ANALYSIS TAB ===
+        # Created hidden; update_column_selector shows it once a data source loads
+        # successfully and hides it again when loading fails or no source is given.
+        with gr.TabItem("4. Source Analysis", id=3, visible=False) as source_tab:
+            gr.Markdown("### Analyze the Distribution of News Sources")
+            with gr.Row():
+                # Choices are populated by update_column_selector with all dataframe columns.
+                media_column_dropdown = gr.Dropdown(label="Select Your Media/Source Column")
+                analyze_media_button = gr.Button("Analyze Sources")
+            with gr.Row():
+                # Output of analyze_media_button.click (handler: generate_media_analysis).
+                media_plot = gr.BarPlot()
+    gr.Markdown(f"<div style='text-align: center;'>{APP_FOOTER}</div>")
+    # --- EVENT HANDLERS ---
+    def update_column_selector(file, url):
+        """Populates column selectors after data is loaded."""
+        # This function also makes the source analysis tab visible if data loads
+        if file is None and not url:
+            return {text_columns_checkboxgroup: gr.update(choices=[], value=None), media_column_dropdown: gr.update(choices=[], value=None), source_tab: gr.update(visible=False)}
+        try:
+            df = load_data(file, url)
+            text_cols = [col for col in df.columns if df[col].dtype == 'object']
+            return {
+                text_columns_checkboxgroup: gr.update(choices=text_cols, value=text_cols if text_cols else None),
+                media_column_dropdown: gr.update(choices=df.columns.tolist()),
+                source_tab: gr.update(visible=True)
+            }
+        except Exception as e:
+            gr.Warning(f"Failed to read columns: {e}")
+            return {text_columns_checkboxgroup: gr.update(choices=[], value=None), media_column_dropdown: gr.update(choices=[], value=None), source_tab: gr.update(visible=False)}
+    file_upload.upload(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
+    gsheet_url.submit(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
+    # FIX: A single, robust function to control the visibility of manual seeding UI elements
+    def toggle_manual_seeding_ui(mode):
+        is_visible = mode == "Manual Seeding"
+        return {
+            manual_seeds_textbox: gr.update(visible=is_visible),
+            manual_seeds_example: gr.update(visible=is_visible)
+        }
+    analysis_mode_radio.change(
+        fn=toggle_manual_seeding_ui,
+        inputs=analysis_mode_radio,
+        outputs=[manual_seeds_textbox, manual_seeds_example]
+    )
+    start_button.click(
+        fn=run_analysis_pipeline,
+        inputs=[file_upload, gsheet_url, text_columns_checkboxgroup, analysis_mode_radio, manual_seeds_textbox, top_n_topics_slider, enable_ai_merging_checkbox, hf_token_textbox],
+        outputs=[log_output, review_tab, visualize_tab, review_topic_table_df, doc_topic_landscape_plot_ui, inter_topic_map_plot_ui,
+                 top_topics_barchart_plot_ui, topic_similarity_heatmap_ui, topic_hierarchy_plot_ui, temporal_analysis_group, date_column_dropdown]
+    )
+    def on_select_topic(evt: gr.SelectData):
+        """Handles selecting a topic from the main review table."""
+        if not isinstance(evt.index, tuple) or len(evt.index) == 0:
+            return {topic_id_state: None, topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
+        try:
+            topic_id_val = APP_STATE["topics_df"].iloc[evt.index[0]]['ID']
+            details = get_topic_details(topic_id_val)
+            details[topic_id_state] = topic_id_val # Store the ID in the hidden state
+            return details
+        except Exception:
+            return {topic_id_state: None, topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
+    review_topic_table_df.select(fn=on_select_topic, outputs=[topic_id_state, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
+    # Connect the new manual refinement buttons
+    update_name_button.click(fn=update_topic_name, inputs=[topic_id_state, topic_name_textbox], outputs=[review_topic_table_df])
+    # When the main results are generated, populate the topic merger checklist
+    review_topic_table_df.change(lambda df: gr.update(choices=df['Topic Name'].tolist()), inputs=review_topic_table_df, outputs=topic_merger_checkboxgroup)
+    merge_button.click(fn=merge_selected_topics, inputs=[topic_merger_checkboxgroup], outputs=[review_topic_table_df, topic_merger_checkboxgroup, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
+    # Connect the new Source Analysis tab
+    analyze_media_button.click(fn=generate_media_analysis, inputs=[media_column_dropdown], outputs=[media_plot])
+    # Other handlers
+    generate_trends_button.click(fn=generate_temporal_plot, inputs=[date_column_dropdown], outputs=[temporal_plot_ui])
+    finalize_button.click(fn=finalize_and_save, inputs=[], outputs=[download_link])
+# --- LAUNCH THE APP ---
+# Start the Gradio app only when this file is executed as a script, not when imported.
+# NOTE(review): debug=True and share=True look like notebook/Colab-era settings -
+# confirm they are still wanted for the hosted deployment.
+if __name__ == "__main__":
+    app.launch(debug=True, share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio
+pandas
+scikit-learn
+bertopic[visualization]
+sentence_transformers
+torch
+transformers
+accelerate
+bitsandbytes
+huggingface_hub
+requests