added youtube analyzer section

app.py (CHANGED)

@@ -1,4 +1,9 @@
-#
 import gradio as gr
 import pandas as pd
 import numpy as np

@@ -8,41 +13,64 @@ import sqlite3
 import json
 import logging
 import requests
 from io import StringIO

-#
 from transformers import pipeline, BitsAndBytesConfig
 from sentence_transformers import SentenceTransformer
-from …
-from …
-# (old lines 16-45, the previous version's remaining imports and configuration, are truncated in this diff view)
 BANGLA_STOP_WORDS = [
     'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
     'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',

@@ -61,669 +89,530 @@ BANGLA_STOP_WORDS = [
     'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
 ]

-def normalize_bangla_manual(text):  # (name recovered from the call site below)
-    """…"""
-    if not isinstance(text, str): return ""
-    replacements = {
-        # (earlier replacement entries truncated in this diff view)
-        '[\u09DD]': '\u09A2\u09BC', '[\u09DF]': '\u09AF\u09BC',
-    }
-    for old, new in replacements.items():
-        text = re.sub(old, new, text)
-    return text
-
-def preprocess_bangla_text(text):
-    """Cleans and normalizes a single Bangla text string for NLP tasks."""
-    if not isinstance(text, str): return ""
-    text = normalize_bangla_manual(text)
-    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
-    text = re.sub(r'\S*@\S*\s?', '', text)
-    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
-    words = text.split()
-    words = [word for word in words if word not in BANGLA_STOP_WORDS]
-    text = " ".join(words)
-    return re.sub(r'\s+', ' ', text).strip()
-
-print("✅ Helper functions appended to app.py")
-
-# --- APP BRANDING & CONFIGURATION ---
-# Easily update the application's title, tagline, and footer here.
-APP_TITLE = "Social Perception Analyzer"
-APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
-APP_FOOTER = "Developed by Centre for Data Science Research (CDSR), and Strategy and Policy Forum (SPF)"

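For orientation, here is a quick usage sketch of the removed cleaner. The input string is hypothetical, and it assumes `re`, `BANGLA_STOP_WORDS`, and the two functions above are in scope:

```python
# Hypothetical example (not from the commit): URL, e-mail, and non-Bangla
# characters are stripped, stop words dropped, and whitespace collapsed.
raw = "বিএনপির তারুণ্যের সমাবেশ দেখুন https://example.com"
print(preprocess_bangla_text(raw))
```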
-# --- LOCAL LLM INITIALIZATION ---
-def initialize_local_llm(hf_token=None):
-    """
-    Initializes and returns a local, quantized, lightweight LLM pipeline.
-    This model is chosen for its efficiency and Bangla language specialization.
-    """
-    model_id = "hishab/titulm-llama-3.2-1b-v1.1"

-    quantization_config = BitsAndBytesConfig(  # (opener reconstructed; truncated in this view)
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )

-    try:  # (these two lines reconstructed; truncated in this view)
-        if not torch.cuda.is_available():
-            logging.warning("GPU not available. LLM will run on CPU and be very slow.")
-            llm_pipeline = pipeline("text-generation", model=model_id, token=hf_token)
-        else:
-            logging.info(f"Initializing quantized local LLM: {model_id} on GPU.")
-            llm_pipeline = pipeline(
-                "text-generation",
-                model=model_id,
-                model_kwargs={"quantization_config": quantization_config},
-                device_map="auto",
-                token=hf_token
-            )
-        return llm_pipeline
-    except Exception as e:
-        logging.error(f"Failed to initialize local LLM: {e}")
-        # Add a note about potential trust issues for some models
-        logging.info("Trying again with 'trust_remote_code=True'.")
-        try:
-            llm_pipeline = pipeline(
-                "text-generation",
-                model=model_id,
-                model_kwargs={"trust_remote_code": True, "quantization_config": quantization_config},
-                device_map="auto",
-                token=hf_token
-            )
-            return llm_pipeline
-        except Exception as e2:
-            logging.error(f"Secondary attempt failed: {e2}")
-            gr.Warning("Could not initialize the local LLM. AI features will be disabled.")
-            return None
-
-# --- DATA LOADING HELPER ---
-def load_data(file_obj, gsheet_url):
-    """Loads a DataFrame from an uploaded file or a direct Google Sheets CSV URL."""
-    if file_obj is not None:
-        logging.info(f"Loading data from uploaded file: {file_obj.name}")
-        return pd.read_csv(file_obj.name)
-    elif gsheet_url and gsheet_url.strip():
-        logging.info(f"Loading data directly from URL: {gsheet_url}")
         try:
-            # (old lines 156-160, which fetched and parsed the sheet, are truncated
-            # in this diff view; a hypothetical reconstruction follows below)
-            return …
         except Exception as e:
-            # (old lines 162-175 are truncated in this diff view)
-    """
-    # (old lines 177-186 are truncated in this diff view)
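The body of the URL branch is lost above. With the modules this file imports (`requests`, `StringIO`, `pandas`), the usual pattern for that step looks like the following hypothetical reconstruction, not the original code:

```python
# Hypothetical reconstruction of the truncated URL branch (not the original code).
import requests
import pandas as pd
from io import StringIO

def load_csv_from_url(gsheet_url: str) -> pd.DataFrame:
    """Fetch a published Google Sheets CSV export and parse it into a DataFrame."""
    response = requests.get(gsheet_url, timeout=30)
    response.raise_for_status()                  # surface HTTP errors early
    return pd.read_csv(StringIO(response.text))  # parse the CSV body
```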
     try:
-        # (old lines 188-199 are truncated in this diff view)
     except Exception as e:
-        # (old lines 201-202 are truncated in this diff view)

-    progress(0.2, desc="Step 2/8: Preparing Analysis Mode...")
-    y_guidance = None
-    if analysis_mode == "Manual Seeding" and manual_seeds:
         try:
-            # (old lines 209-216, which parsed the seed input, are truncated in this diff view)
-        except …
-    # (old lines 218-282, the embedding and topic-modeling steps, are truncated in this diff view)

-    topic_similarity_heatmap_plot = topic_model.visualize_heatmap(top_n_topics=num_chart_topics)
-    topic_hierarchy_plot = topic_model.visualize_hierarchy(top_n_topics=num_chart_topics)
-
-    review_topic_table = topics_df[['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'})
-
-    # Check for date columns for the temporal analysis tab
-    date_columns = [col for col in df_analysis.columns if pd.to_datetime(df_analysis[col], errors='coerce').notna().any()]

     return {
-        # (old lines 294-296 are truncated in this diff view)
-        visualize_tab: gr.update(visible=True),
-        # Populate the review tab
-        review_topic_table_df: gr.update(value=review_topic_table),
-        # Populate the visualization tab
-        doc_topic_landscape_plot_ui: doc_topic_landscape_plot,
-        inter_topic_map_plot_ui: inter_topic_map_plot,  # Hook for the fixed plot
-        top_topics_barchart_plot_ui: top_topics_barchart_plot,
-        topic_similarity_heatmap_ui: topic_similarity_heatmap_plot,
-        topic_hierarchy_plot_ui: topic_hierarchy_plot,
-        # Update and enable the temporal analysis tab if date columns exist
-        temporal_analysis_group: gr.update(visible=len(date_columns) > 0),
-        date_column_dropdown: gr.update(choices=date_columns, value=date_columns[0] if date_columns else None),
-    }
-
-print("✅ Main analysis pipeline function appended to app.py")
-
-# --- AI REFINEMENT AGENT ---
-
-def run_ai_refinement(topic_model, llm_pipeline, progress=gr.Progress()):
-    """
-    Uses a lightweight LLM to generate high-quality, contextual topic names.
-    Includes a conceptual hook for future AI-powered topic merging.
-    """
-    logging.info("Starting AI Refinement Agent...")
-
-    # --- Task 1: AI-Powered Topic Naming ---
-    progress(0, desc="AI Agent: Generating Topic Names...")
-    topic_info_df = topic_model.get_topic_info()
-    new_labels = {}
-
-    # This is the advanced, few-shot Bangla prompt we designed.
-    # It will be used for each topic.
-    prompt_template = """
-    আপনি একজন পেশাদার সংবাদ সম্পাদক। আপনার কাজ হলো বাংলাদেশের রাজনৈতিক ঘটনাবলী, বিশেষ করে বিএনপির 'তারুণ্যের সমাবেশ' সংক্রান্ত সংবাদের জন্য একটি সংক্ষিপ্ত ও প্রাসঙ্গিক শিরোনাম তৈরি করা। প্রদত্ত কীওয়ার্ডগুলো ব্যবহার করে একটি (৩-৫ শব্দের) সারগর্ভ বাংলা শিরোনাম লিখুন, যেখানে সমাবেশের মূল বিষয় বা স্থান স্পষ্টভাবে ফুটে উঠবে। উদাহরণগুলো দেখুন।
-
-    --- উদাহরণ ---
-    ইনপুট কীওয়ার্ড: ['খুলনা', 'তারুণ্যের', 'সমাবেশ', 'বিএনপি']
-    আউটপুট শিরোনাম: খুলনায় বিএনপির তারুণ্যের সমাবেশ
-
-    ইনপুট কীওয়ার্ড: ['ঢাকা', 'নয়াপল্টন', 'তারুণ্যের', 'স্রোত', 'বৃষ্টি']
-    আউটপুট শিরোনাম: ঢাকায় তারুণ্যের সমাবেশে জনতার ঢল
-
-    ইনপুট কীওয়ার্ড: ['চট্টগ্রাম', 'বক্তব্য', 'মির্জা ফখরুল', 'শোডাউন']
-    আউটপুট শিরোনাম: চট্টগ্রামে মির্জা ফখরুলের তারুণ্যের সমাবেশ
-    --- উদাহরণের শেষ ---
-
-    --- আপনার কাজ ---
-    ইনপুট কীওয়ার্ড: {keywords}
-    আউটপুট শিরোনাম:
-    """
-
-    # Tuned parameters for reliable, non-creative naming
-    generation_params = {
-        "temperature": 0.3,
-        "max_new_tokens": 30,
-        "repetition_penalty": 1.2,
-        "do_sample": True
     }

-    for index, row in topic_info_df.iterrows():  # (loop header reconstructed; truncated in this view)
-        topic_id = row['Topic']
-        if topic_id == -1:
-            # We don't rename the outlier topic
-            new_labels[topic_id] = "Topic -1: Outliers"
-            continue
-
-        keywords = row['Representation']
-
-        # Format the prompt for the current topic
-        prompt = prompt_template.format(keywords=keywords)
-
-        try:
-            # Call the LLM pipeline
-            response = llm_pipeline(prompt, **generation_params)
-            # Extract the generated text, stripping whitespace and the prompt's artifacts
-            generated_name = response[0]['generated_text'].split("আউটপুট শিরোনাম:")[1].strip()
-
-            if generated_name:
-                new_labels[topic_id] = f"Topic {topic_id}: {generated_name}"
-                logging.info(f"Generated name for Topic {topic_id}: {generated_name}")
-            else:
-                # Fallback to default name if generation fails
-                new_labels[topic_id] = topic_model.get_topic_label(topic_id, nr_words=4)
-        except Exception as e:
-            logging.error(f"LLM failed for Topic {topic_id}. Error: {e}")
-            # Fallback for safety
-            new_labels[topic_id] = topic_model.get_topic_label(topic_id, nr_words=4)
-
-        progress.update((index + 1) / len(topic_info_df))
-
-    # Apply all the new, AI-generated labels at once
-    topic_model.set_topic_labels(new_labels)
-    logging.info("✅ AI Naming complete.")
-
-    # --- Task 2: AI-Powered Merging (Conceptual Hook) ---
-    # This section is a placeholder for a future enhancement.
-    # The logic would be:
-    # 1. Calculate topic similarity matrix.
-    # 2. Identify pairs with similarity > threshold (e.g., 0.85).
-    # 3. Use a "Judge" prompt to ask the LLM if they should be merged.
-    # 4. If LLM says "YES", call `topic_model.merge_topics()` (a sketch follows after this section).
-    logging.info("Skipping AI Topic Merging (conceptual feature).")
-
-    return topic_model
-
-print("✅ AI Refinement Agent function appended to app.py")
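Since the merging hook exists only as comments, here is a minimal sketch of how those four steps could be realized with BERTopic's public API. The embedding-row indexing, prompt wording, and threshold are illustrative assumptions, not the author's implementation:

```python
# Hypothetical sketch of the conceptual merging hook (not part of the app).
from sklearn.metrics.pairwise import cosine_similarity

def propose_merges(topic_model, llm_pipeline, docs, threshold=0.85):
    """Steps 1-4 from the comments above; prompt and threshold are illustrative."""
    # 1. Similarity matrix over the topic embeddings (assumes row 0 is the -1 outlier topic).
    sims = cosine_similarity(topic_model.topic_embeddings_)
    topic_ids = sorted(t for t in topic_model.get_topics() if t != -1)
    to_merge = []
    for i, a in enumerate(topic_ids):
        for b in topic_ids[i + 1:]:
            # 2. Keep only pairs above the similarity threshold.
            if sims[a + 1][b + 1] < threshold:
                continue
            # 3. "Judge" prompt: ask the LLM whether the pair is one topic.
            keywords = lambda t: ", ".join(w for w, _ in topic_model.get_topic(t)[:5])
            prompt = (f"Keywords A: {keywords(a)}\nKeywords B: {keywords(b)}\n"
                      "Do these describe the same topic? Answer YES or NO:")
            reply = llm_pipeline(prompt, max_new_tokens=3)[0]["generated_text"]
            if "YES" in reply.upper():
                to_merge.append([a, b])
    # 4. Apply all approved merges in one call.
    if to_merge:
        topic_model.merge_topics(docs, topics_to_merge=to_merge)
```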
-# --- FINAL BACKEND HANDLERS & HELPERS ---
-
-def get_topic_details(topic_id: int):
-    """Fetches details for a selected topic to display in the review tab."""
-    empty_return = {topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
-    model = APP_STATE.get("bertopic_model")
-    if model is None or topic_id is None: return empty_return
-    try:
-        topic_id = int(topic_id)
-        topic_info = model.get_topic_info(topic_id=topic_id)
-        if topic_info.empty: return empty_return
-
-        # Strip the "Topic X: " prefix for cleaner editing
-        topic_name = topic_info['Name'].iloc[0]
-        cleaned_name = re.sub(r'^Topic \d+:\s*', '', topic_name)
-
-        # For the outlier topic, don't generate plots
-        if topic_id == -1:
-            return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
-
-        word_cloud_fig = model.visualize_barchart(top_n_topics=1, topics=[topic_id])
-        docs_df = pd.DataFrame(model.get_representative_docs(topic_id), columns=['Representative Document'])
-        return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: word_cloud_fig, topic_docs_df: docs_df}
-    except Exception as e:
-        logging.error(f"Error getting topic details for ID {topic_id}: {e}")
-        return empty_return
-
-def update_topic_name(topic_id, new_name):
-    """Handler for manual topic renaming."""
-    model = APP_STATE.get("bertopic_model")
-    if model and topic_id is not None and new_name:
-        topic_id = int(topic_id)
-        # Add the prefix back for consistency
-        full_name = f"Topic {topic_id}: {new_name}"
-        model.set_topic_labels({topic_id: full_name})
-        APP_STATE["topics_df"] = model.get_topic_info()
-        gr.Info(f"Topic {topic_id} renamed to '{new_name}'")
-        # Return the updated table for the UI
-        return gr.update(value=APP_STATE["topics_df"][['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'}))
-    return gr.update()  # No change
-
-def merge_selected_topics(topics_to_merge):
-    """Handler for manual topic merging."""
-    model = APP_STATE.get("bertopic_model")
-    if model and topics_to_merge and len(topics_to_merge) > 1:
-        # Convert topic names like "Topic 0: ..." to integer IDs
-        topic_ids = [int(re.search(r'\d+', t).group()) for t in topics_to_merge]
-
-        model.merge_topics(topics_to_merge=[topic_ids])
-
-        # After merging, we need to refresh the state and UI components
-        APP_STATE["topics_df"] = model.get_topic_info()
-        review_topic_table = APP_STATE["topics_df"][['Topic', 'Name', 'Count']].rename(columns={'Topic':'ID', 'Name':'Topic Name', 'Count':'Documents'})
-
-        gr.Info(f"Successfully merged topics: {topic_ids}")
-        return {
-            review_topic_table_df: gr.update(value=review_topic_table),
-            # Clear the selection and the details view
-            topic_merger_checkboxgroup: gr.update(value=[]),
-            topic_name_textbox: "",
-            topic_word_cloud_plot: None,
-            topic_docs_df: pd.DataFrame(),
-        }
-    gr.Warning("Please select at least two topics to merge.")
-    return {review_topic_table_df: gr.update(), topic_merger_checkboxgroup: gr.update()}
-
-
-def generate_temporal_plot(date_column, progress=gr.Progress()):
-    """Generates and displays the topics over time plot."""
-    progress(0, desc="Preparing time data...")
-    if not date_column: return None
-    model, df = APP_STATE.get("bertopic_model"), APP_STATE.get("final_df")
-    if model is None or df is None: return None
-
-    df_temporal = df.copy()
-    df_temporal['timestamp'] = pd.to_datetime(df_temporal[date_column], errors='coerce')
-    df_temporal.dropna(subset=['timestamp'], inplace=True)
-
-    if df_temporal.empty:
-        gr.Warning(f"The column '{date_column}' contains no valid dates after conversion.")
-        return None
-
-    progress(0.6, desc="Generating topic trends over time...")
-    try:
-        # BERTopic requires the original documents and timestamps for this plot
-        docs_temporal = df_temporal['processed_text'].tolist()
-        timestamps_temporal = df_temporal['timestamp'].tolist()
-        topics_over_time = model.topics_over_time(docs=docs_temporal, timestamps=timestamps_temporal)
-        return model.visualize_topics_over_time(topics_over_time)
-    except Exception as e:
-        gr.Error(f"Could not generate temporal plot. This can happen if topics are not found in the selected time range. Error: {e}")
-        return None
-
-def generate_media_analysis(media_column):
-    """Generates a horizontal bar chart for media source analysis to prevent label overlap."""
-    if not media_column:
-        gr.Warning("Please select a media column to analyze.")
-        return None
-    df = APP_STATE.get("df")
-    if df is None or media_column not in df.columns:
-        return None

-    # (old lines 507-512, which computed `counts` and built `plot_df`, are truncated in this diff view)
-    return gr.BarPlot(  # (call opener reconstructed; truncated in this view)
-        plot_df,
-        x='Article Count',  # The numeric value is now on the x-axis
-        y='Media Source',  # The categorical labels are now on the y-axis
-        title='Top 20 Media Sources by Article Count',
-        tooltip=['Media Source', 'Article Count'],
-        height=500,
-        # FIX: Changed to horizontal_guides
-        horizontal_guides=[{'value': counts.mean(), 'label': 'Average'}]
-    )
-
-def finalize_and_save():
-    """Saves the final DataFrame and topic definitions to files."""
-    if APP_STATE.get("final_df") is None or APP_STATE.get("topics_df") is None:
-        gr.Warning("No data available to save.")
-        return None

-    # (old lines 529-534, which prepared copies of the DataFrames and serialized a
-    # list-valued column, are truncated in this diff view; only the lambda survives)
-    …(
-        lambda x: json.dumps(x) if isinstance(x, list) else x
-    )
-
-    db_path, csv_path = "topic_analysis_results.sqlite", "labeled_documents.csv"
-
-    with sqlite3.connect(db_path) as conn:
-        topics_df_to_save.to_sql("topic_definitions", conn, if_exists="replace", index=False)
-        final_df_to_save.to_sql("enriched_documents", conn, if_exists="replace", index=False)
-
-    topic_map = topics_df_to_save.set_index('Topic')['Name'].to_dict()
-    final_df_to_save['Topic_Name'] = final_df_to_save['Topic'].map(topic_map)
-    final_df_to_save.to_csv(csv_path, index=False, encoding='utf-8-sig')
-
-    gr.Info(f"Results saved to {db_path} and {csv_path}")
-    return [db_path, csv_path]

-# (old lines 551-558, the opening of the old Gradio Blocks layout and two gr. calls, are truncated in this diff view)

 with gr.Tabs() as tabs:
-
-    with gr.TabItem("1. Setup & Run Analysis", id=0):
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("### 1. …")  # (heading text truncated in this diff view)
-                # (old lines 565-576, the file-upload and Google-Sheet URL inputs, are truncated in this diff view)
-
-                gr.Markdown("### 2. Select Columns")
-                text_columns_checkboxgroup = gr.CheckboxGroup(label="Select Text Columns for Analysis", interactive=True)
-
-                gr.Markdown("### 3. Configure Analysis")
-                analysis_mode_radio = gr.Radio(["Discovery Mode", "Manual Seeding"], value="Discovery Mode", label="Analysis Mode")
-                manual_seeds_textbox = gr.Textbox(label="Manual Seed Topics (JSON format)", visible=False, lines=5)
-                # FIX: Assign the markdown to a variable so we can target it directly
-                manual_seeds_example = gr.Markdown("Example: `{\"Topic A\": [\"keyword1\", \"keyword2\"], \"Topic B\": [\"wordA\", \"wordB\"]}`", visible=False)
-
-                top_n_topics_slider = gr.Slider(label="Number of Topics for Charts", minimum=5, maximum=50, value=15, step=1)
-
-                gr.Markdown("### 4. Advanced (Optional)")
-                enable_ai_merging_checkbox = gr.Checkbox(label="Enable AI Topic Naming (Requires GPU & HF Token)", value=False)
-                hf_token_textbox = gr.Textbox(label="Hugging Face Token", type="password", placeholder="hf_...", info="Required if AI is enabled.")
-
-                start_button = gr.Button("Start Analysis", variant="primary")
-
         with gr.Column(scale=2):
-            # (old lines 595-609, the log output and the "2. Review & Refine" tab contents, are truncated in this diff view)
-            topic_docs_df = gr.DataFrame(headers=["Representative Document"], wrap=True)
-
-            with gr.Row():
-                gr.Markdown("### Manual Topic Merging")
-            with gr.Row():
-                topic_merger_checkboxgroup = gr.CheckboxGroup(label="Select 2 or more topics to merge", interactive=True)
-                merge_button = gr.Button("Merge Selected Topics", variant="stop")
-            with gr.Row():
-                finalize_button = gr.Button("Save Final Results to Files", variant="primary")
-                download_link = gr.File(label="Download Results (SQLite DB and CSV)", file_count="multiple")
-
-
-    # === VISUALIZE & EXPLORE TAB ===
-    with gr.TabItem("3. Visualize & Explore", id=2, visible=False) as visualize_tab:
-        with gr.Tabs():
-            with gr.TabItem("Document Landscape"):
-                gr.Markdown("A 2D map of every document, colored by its assigned topic. This shows the overall structure of your data.")
-                doc_topic_landscape_plot_ui = gr.Plot()
-            with gr.TabItem("Topic Relationships"):
-                gr.Markdown("Visualizations showing how topics relate to each other.")
-                inter_topic_map_plot_ui = gr.Plot(label="Inter-Topic Distance Map")
-                topic_hierarchy_plot_ui = gr.Plot(label="Hierarchical Clustering of Topics")
-                topic_similarity_heatmap_ui = gr.Plot(label="Topic Similarity Heatmap")
-            with gr.TabItem("Topic Keywords"):
-                gr.Markdown("A bar chart showing the most important keywords for the most prominent topics.")
-                top_topics_barchart_plot_ui = gr.Plot()
-            with gr.TabItem("Temporal Analysis"):
-                with gr.Group(visible=False) as temporal_analysis_group:
-                    gr.Markdown("Select a date column from your data to see how topic popularity has changed over time.")
                     with gr.Row():
-                        # (old lines 640-641, the date dropdown and trends button, are truncated in this diff view)
-                    temporal_plot_ui = gr.Plot()

-    with gr.TabItem("4. Source Analysis", id=3, visible=False) as source_tab:
-        gr.Markdown("### Analyze the Distribution of News Sources")
         with gr.Row():
-            # (old lines 648-656, the media dropdown, analyze button, and plot, are truncated in this diff view)

-    def update_column_selector(file, url):  # (signature reconstructed from the .submit() wiring below)
-        # This function also makes the source analysis tab visible if data loads
-        if file is None and not url:
-            return {text_columns_checkboxgroup: gr.update(choices=[], value=None), media_column_dropdown: gr.update(choices=[], value=None), source_tab: gr.update(visible=False)}
-        try:
-            df = load_data(file, url)
-            text_cols = [col for col in df.columns if df[col].dtype == 'object']
            return {
-                # (old lines 666-668 are truncated in this diff view)
            }
-        # (old lines 670-674 are truncated in this diff view)
-    gsheet_url.submit(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
-
-    # FIX: A single, robust function to control the visibility of manual seeding UI elements
-    def toggle_manual_seeding_ui(mode):
-        is_visible = mode == "Manual Seeding"
        return {
-            # (old lines 681-682 are truncated in this diff view)
        }

-    analysis_mode_radio.change(
-        fn=toggle_manual_seeding_ui,
-        inputs=analysis_mode_radio,
-        outputs=[manual_seeds_textbox, manual_seeds_example]
-    )
-
-    start_button.click(
-        fn=run_analysis_pipeline,
-        inputs=[file_upload, gsheet_url, text_columns_checkboxgroup, analysis_mode_radio, manual_seeds_textbox, top_n_topics_slider, enable_ai_merging_checkbox, hf_token_textbox],
-        outputs=[log_output, review_tab, visualize_tab, review_topic_table_df, doc_topic_landscape_plot_ui, inter_topic_map_plot_ui,
-                 top_topics_barchart_plot_ui, topic_similarity_heatmap_ui, topic_hierarchy_plot_ui, temporal_analysis_group, date_column_dropdown]
-    )
-
-    def on_select_topic(evt: gr.SelectData):
-        """Handles selecting a topic from the main review table."""
-        if not isinstance(evt.index, tuple) or len(evt.index) == 0:
-            return {topic_id_state: None, topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
-        try:
-            topic_id_val = APP_STATE["topics_df"].iloc[evt.index[0]]['ID']
-            details = get_topic_details(topic_id_val)
-            details[topic_id_state] = topic_id_val  # Store the ID in the hidden state
-            return details
-        except Exception:
-            return {topic_id_state: None, topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
-
-    review_topic_table_df.select(fn=on_select_topic, outputs=[topic_id_state, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
-
-    # Connect the new manual refinement buttons
-    update_name_button.click(fn=update_topic_name, inputs=[topic_id_state, topic_name_textbox], outputs=[review_topic_table_df])
-
-    # When the main results are generated, populate the topic merger checklist
-    review_topic_table_df.change(lambda df: gr.update(choices=df['Topic Name'].tolist()), inputs=review_topic_table_df, outputs=topic_merger_checkboxgroup)
-
-    merge_button.click(fn=merge_selected_topics, inputs=[topic_merger_checkboxgroup], outputs=[review_topic_table_df, topic_merger_checkboxgroup, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])
-
-    # Connect the new Source Analysis tab
-    analyze_media_button.click(fn=generate_media_analysis, inputs=[media_column_dropdown], outputs=[media_plot])
-
-    # Other handlers
-    generate_trends_button.click(fn=generate_temporal_plot, inputs=[date_column_dropdown], outputs=[temporal_plot_ui])
-    finalize_button.click(fn=finalize_and_save, inputs=[], outputs=[download_link])
-
-    # --- LAUNCH THE APP ---
 if __name__ == "__main__":
-    # (old line 729, presumably the launch call, is truncated in this diff view)
+# ==============================================================================
+# SOCIAL PERCEPTION ANALYZER - FINAL COMPLETE APPLICATION
+# Version: 3.0 (Architecturally Refactored, Production Ready)
+# ==============================================================================
+
+# --- IMPORTS ---
 import gradio as gr
 import pandas as pd
 import numpy as np
 import json
 import logging
 import requests
+import os
+import time
+import random
+import functools
 from io import StringIO
+from datetime import datetime, timezone
+from logging.handlers import RotatingFileHandler
+
+# --- APIs and Web Scraping ---
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+from GoogleNews import GoogleNews
+from urllib.error import HTTPError
+import dateparser

+# --- NLP & Machine Learning ---
 from transformers import pipeline, BitsAndBytesConfig
 from sentence_transformers import SentenceTransformer
+from huggingface_hub.utils import HfHubHTTPError
+
+# --- Visualization ---
+import matplotlib.pyplot as plt
+from matplotlib.font_manager import FontProperties
+import seaborn as sns
+from wordcloud import WordCloud
+
+# ==============================================================================
+# SETUP PRODUCTION-GRADE LOGGING & CONFIGURATION
+# ==============================================================================
+
+log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+log_handler = RotatingFileHandler('app.log', maxBytes=5*1024*1024, backupCount=2)
+log_handler.setFormatter(log_formatter)
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    logger.addHandler(log_handler)
+logger.info("Application starting up.")
+
+# --- APPLICATION CONFIGURATION ---
+APP_TITLE = "Social Perception Analyzer"
+APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
+APP_FOOTER = "Developed by CDSR"
+
+# --- FONT CONFIGURATION ---
+FONT_PATH = 'NotoSansBengali-Regular.ttf'
+try:
+    BANGLA_FONT = FontProperties(fname=FONT_PATH)
+    logger.info("Successfully loaded 'NotoSansBengali-Regular.ttf' font.")
+except OSError:
+    logger.error("Failed to load 'NotoSansBengali-Regular.ttf'. Ensure the file is in the root directory.")
+    gr.Warning("Bangla font not found! Visualizations may not render text correctly.")
+    BANGLA_FONT = FontProperties()
+
+# ==============================================================================
+# CORE HELPER FUNCTIONS
+# ==============================================================================
+
 BANGLA_STOP_WORDS = [
     'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
     'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
     'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
 ]

+def get_dynamic_time_agg(start_date, end_date):
+    """Hardened helper to determine time aggregation level."""
+    if not isinstance(start_date, pd.Timestamp) or not isinstance(end_date, pd.Timestamp):
+        return 'D', 'Daily'  # Graceful fallback
+    delta = end_date - start_date
+    if delta.days <= 2: return 'H', 'Hourly'
+    if delta.days <= 90: return 'D', 'Daily'
+    if delta.days <= 730: return 'W', 'Weekly'
+    return 'M', 'Monthly'

+# ==============================================================================
+# ML MODEL MANAGEMENT
+# ==============================================================================

+SENTIMENT_MODEL_ID = 'ahs95/banglabert-sentiment-analysis'
+MODELS = {"sentiment_pipeline": None}

+def _load_pipeline_with_retry(task, model_id, retries=3):
+    logger.info(f"Initializing {task} pipeline for model: {model_id}")
+    for attempt in range(retries):
        try:
+            device = 0 if torch.cuda.is_available() else -1
+            if device == -1: gr.Warning(f"{model_id} will run on CPU and may be very slow.")
+            pipe = pipeline(task, model=model_id, device=device)
+            logger.info(f"Pipeline '{task}' loaded successfully.")
+            return pipe
+        except (HfHubHTTPError, requests.exceptions.ConnectionError) as e:
+            logger.warning(f"Network error on loading {model_id} (Attempt {attempt + 1}/{retries}): {e}")
+            if attempt < retries - 1: time.sleep(5)
+            else: raise gr.Error(f"Failed to download model '{model_id}' after {retries} attempts. Check network.")
        except Exception as e:
+            logger.error(f"An unexpected error occurred while loading {model_id}: {e}")
+            raise gr.Error(f"Could not initialize model '{model_id}'. Error: {e}")
+    return None
+
+def get_sentiment_pipeline():
+    if MODELS["sentiment_pipeline"] is None:
+        MODELS["sentiment_pipeline"] = _load_pipeline_with_retry("sentiment-analysis", SENTIMENT_MODEL_ID)
+    return MODELS["sentiment_pipeline"]
+
+# ==============================================================================
+# NEWS SCRAPER BACKEND
+# ==============================================================================
+
+def run_news_scraper_pipeline(search_keywords, sites, start_date_str, end_date_str, interval, max_pages, filter_keys, progress=gr.Progress()):
+    """Full, robust implementation of the news scraper."""
+    # Input validation and sanitization
+    search_keywords = search_keywords.strip()
+    if not all([search_keywords, start_date_str, end_date_str]):
+        raise gr.Error("Search Keywords, Start Date, and End Date are required.")
+
+    start_dt = dateparser.parse(start_date_str)
+    end_dt = dateparser.parse(end_date_str)
+    if not all([start_dt, end_dt]):
+        raise gr.Error("Invalid date format. Please use a recognizable format like YYYY-MM-DD or '2 weeks ago'.")
+
+    all_articles, current_dt = [], start_dt
+    while current_dt <= end_dt:
+        interval_end_dt = min(current_dt + pd.Timedelta(days=interval - 1), end_dt)
+        start_str, end_str = current_dt.strftime('%Y-%m-%d'), interval_end_dt.strftime('%Y-%m-%d')
+        progress(0, desc=f"Fetching news from {start_str} to {end_str}")
+
+        site_query = f"({' OR '.join(['site:' + s.strip() for s in sites.split(',') if s.strip()])})" if sites else ""
+        final_query = f'"{search_keywords}" {site_query} after:{start_str} before:{end_str}'
+
+        googlenews = GoogleNews(lang='bn', region='BD')
+        googlenews.search(final_query)
+
+        for page in range(1, max_pages + 1):
+            try:
+                results = googlenews.results()
+                if not results: break
+                all_articles.extend(results)
+                if page < max_pages:
+                    googlenews.getpage(page + 1)
+                    time.sleep(random.uniform(2, 5))
+            except HTTPError as e:
+                if e.code == 429:
+                    wait_time = random.uniform(15, 30)
+                    gr.Warning(f"Rate limited by Google News. Pausing for {wait_time:.0f} seconds.")
+                    time.sleep(wait_time)
+                else:
+                    logger.error(f"HTTP Error fetching news: {e}"); break
+            except Exception as e:
+                logger.error(f"An error occurred fetching news: {e}"); break
+
+        current_dt += pd.Timedelta(days=interval)
+
+    if not all_articles: return pd.DataFrame(), pd.DataFrame()
+
+    df = pd.DataFrame(all_articles).drop_duplicates(subset=['link'])
+    df['published_date'] = df['date'].apply(lambda x: dateparser.parse(x, languages=['bn']))
+    df.dropna(subset=['published_date', 'title'], inplace=True)
+
+    if filter_keys and filter_keys.strip():
+        keywords = [k.strip().lower() for k in filter_keys.split(',')]
+        mask = df.apply(lambda row: any(key in str(row['title']).lower() or key in str(row['desc']).lower() for key in keywords), axis=1)
+        df = df[mask]
+
+    return df, df[['published_date', 'title', 'media', 'desc', 'link']].sort_values(by='published_date', ascending=False)
+
+# ==============================================================================
+# YOUTUBE ANALYZER BACKEND
+# ==============================================================================
+# (This section remains unchanged from the previous robust version)
+def _fetch_video_details(youtube_service, video_ids: list):
+    all_videos_data = []
+    try:
+        for i in range(0, len(video_ids), 50):
+            id_batch = video_ids[i:i+50]
+            video_request = youtube_service.videos().list(part="snippet,statistics", id=",".join(id_batch))
+            video_response = video_request.execute()
+            for item in video_response.get('items', []):
+                stats = item.get('statistics', {})
+                all_videos_data.append({
+                    'video_id': item['id'], 'video_title': item['snippet']['title'],
+                    'channel': item['snippet']['channelTitle'], 'published_date': item['snippet']['publishedAt'],
+                    'view_count': int(stats.get('viewCount', 0)), 'like_count': int(stats.get('likeCount', 0)),
+                    'comment_count': int(stats.get('commentCount', 0))
+                })
+    except HttpError as e:
+        logger.error(f"Could not fetch video details. Error: {e}")
+        gr.Warning("Could not fetch details for some videos due to an API error.")
+    return all_videos_data
+
+def _scrape_single_video_comments(youtube_service, video_id, max_comments):
+    comments_list = []
     try:
+        request = youtube_service.commentThreads().list(
+            part="snippet", videoId=video_id, maxResults=min(max_comments, 100),
+            order='relevance', textFormat="plainText"
+        )
+        response = request.execute()
+        for item in response.get('items', []):
+            snippet = item['snippet']['topLevelComment']['snippet']
+            comments_list.append({
+                'author': snippet['authorDisplayName'], 'published_date_comment': snippet['publishedAt'],
+                'comment_text': snippet['textDisplay'], 'likes': snippet['likeCount'],
+                'replies': item['snippet']['totalReplyCount']
+            })
+    except HttpError as e:
+        logger.warning(f"Could not retrieve comments for video {video_id} (may be disabled). Error: {e}")
+    return comments_list
+
+def run_youtube_analysis_pipeline(api_key, query, max_videos_for_stats, num_videos_for_comments, max_comments_per_video, published_after, progress=gr.Progress()):
+    if not api_key: raise gr.Error("YouTube API Key is required.")
+    if not query: raise gr.Error("Search Keywords are required.")
+    try:
+        youtube = build('youtube', 'v3', developerKey=api_key)
+    except HttpError as e:
+        raise gr.Error(f"Failed to initialize YouTube service. Check API Key. Error: {e}")
     except Exception as e:
+        raise gr.Error(f"An unexpected error occurred during API initialization: {e}")
+
+    progress(0.1, desc="Performing broad scan for videos...")
+    all_video_ids, next_page_token, total_results_estimate = [], None, 0
+    PAGES_TO_FETCH = min(15, (max_videos_for_stats // 50) + 1)
+    search_params = {'q': query, 'part': 'id', 'maxResults': 50, 'type': 'video', 'order': 'relevance'}
+    if published_after:
+        parsed_date = dateparser.parse(published_after)
+        if parsed_date:
+            search_params['publishedAfter'] = parsed_date.replace(tzinfo=timezone.utc).isoformat()
+        else:
+            gr.Warning(f"Could not parse date: '{published_after}'. Ignoring filter.")

+    for page in range(PAGES_TO_FETCH):
        try:
+            if next_page_token: search_params['pageToken'] = next_page_token
+            response = youtube.search().list(**search_params).execute()
+            if page == 0:
+                total_results_estimate = response.get('pageInfo', {}).get('totalResults', 0)
+            all_video_ids.extend([item['id']['videoId'] for item in response.get('items', [])])
+            next_page_token = response.get('nextPageToken')
+            progress(0.1 + (0.3 * (page / PAGES_TO_FETCH)), desc=f"Broad scan: Found {len(all_video_ids)} videos...")
+            if not next_page_token: break
+        except HttpError as e:
+            if "quotaExceeded" in str(e): raise gr.Error("CRITICAL: YouTube API daily quota exceeded. Try again tomorrow.")
+            logger.error(f"HTTP error during video search: {e}"); break
+
+    if not all_video_ids:
+        return pd.DataFrame(), pd.DataFrame(), 0
+
+    progress(0.4, desc=f"Fetching details for {len(all_video_ids)} videos...")
+    videos_df_full_scan = pd.DataFrame(_fetch_video_details(youtube, all_video_ids))
+    if videos_df_full_scan.empty:
+        return pd.DataFrame(), pd.DataFrame(), 0
+
+    videos_df_full_scan['published_date'] = pd.to_datetime(videos_df_full_scan['published_date'])
+    videos_df_full_scan['engagement_rate'] = ((videos_df_full_scan['like_count'] + videos_df_full_scan['comment_count']) / videos_df_full_scan['view_count']).fillna(0)
+    videos_df_full_scan = videos_df_full_scan.sort_values(by='view_count', ascending=False).reset_index(drop=True)
+
+    videos_to_scrape_df, all_comments = videos_df_full_scan.head(int(num_videos_for_comments)), []
+    for index, row in videos_to_scrape_df.iterrows():
+        progress(0.7 + (0.3 * (index / len(videos_to_scrape_df))), desc=f"Deep dive: Scraping comments from video {index+1}/{len(videos_to_scrape_df)}...")
+        comments_for_video = _scrape_single_video_comments(youtube, row['video_id'], max_comments_per_video)
+        if comments_for_video:
+            for comment in comments_for_video:
+                comment.update({'video_id': row['video_id'], 'video_title': row['video_title']})
+            all_comments.extend(comments_for_video)
+
+    comments_df = pd.DataFrame(all_comments)
+    if not comments_df.empty:
+        comments_df['published_date_comment'] = pd.to_datetime(comments_df['published_date_comment'])
+
+    logger.info(f"YouTube analysis complete. Est. total videos: {total_results_estimate}. Scanned: {len(videos_df_full_scan)}. Comments: {len(comments_df)}.")
+    return videos_df_full_scan, comments_df, total_results_estimate
+
+
+# ==============================================================================
+# ADVANCED ANALYTICS MODULE
+# ==============================================================================
+# (This section remains unchanged, as it was already robust)
+def set_plot_style():
+    plt.style.use('seaborn-v0_8-whitegrid')
+    plt.rcParams['figure.dpi'] = 100
+
+def run_sentiment_analysis(df: pd.DataFrame, text_column: str, progress=gr.Progress()):
+    if text_column not in df.columns: return df
+    sentiment_pipeline = get_sentiment_pipeline()
+    if not sentiment_pipeline:
+        gr.Warning("Sentiment model failed to load. Skipping analysis.")
+        return df
+
+    texts = df[text_column].dropna().tolist()
+    if not texts: return df
+
+    progress(0, desc="Running sentiment analysis...")
+    results = sentiment_pipeline(texts, batch_size=32)
+
+    text_to_sentiment = {text: result for text, result in zip(texts, results)}
+    df['sentiment_label'] = df[text_column].map(lambda x: text_to_sentiment.get(x, {}).get('label'))
+    df['sentiment_score'] = df[text_column].map(lambda x: text_to_sentiment.get(x, {}).get('score'))
+    logger.info("Sentiment analysis complete.")
+    return df
+
+def generate_scraper_dashboard(df: pd.DataFrame):
+    set_plot_style()
+
+    total_articles, unique_media = len(df), df['media'].nunique()
+    start_date, end_date = pd.to_datetime(df['published_date']).min(), pd.to_datetime(df['published_date']).max()
+    date_range_str = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
+
+    agg_code, agg_name = get_dynamic_time_agg(start_date, end_date)
+    timeline_df = df.set_index(pd.to_datetime(df['published_date'])).resample(agg_code).size().reset_index(name='count')
+    timeline_plot = gr.LinePlot(timeline_df, x='published_date', y='count', title=f'{agg_name} News Volume', tooltip=['published_date', 'count'])
+
+    media_counts = df['media'].dropna().value_counts().nlargest(15).sort_values()
+    fig_media = None
+    if not media_counts.empty:
+        fig_media, ax = plt.subplots(figsize=(8, 6)); media_counts.plot(kind='barh', ax=ax, color='skyblue'); ax.set_title("Top 15 Media Sources", fontproperties=BANGLA_FONT)
+        ax.set_yticklabels(media_counts.index, fontproperties=BANGLA_FONT); ax.set_xlabel("Article Count"); plt.tight_layout()
+
+    text = " ".join(title for title in df['title'].astype(str))
+    fig_wc = None
+    try:
+        wc = WordCloud(font_path=FONT_PATH, width=800, height=400, background_color='white', stopwords=BANGLA_STOP_WORDS, collocations=False).generate(text)
+        fig_wc, ax = plt.subplots(figsize=(10, 5)); ax.imshow(wc, interpolation='bilinear'); ax.axis("off")
+    except Exception as e: logger.error(f"WordCloud failed: {e}")
+
     return {
+        kpi_total_articles: str(total_articles), kpi_unique_media: str(unique_media), kpi_date_range: date_range_str,
+        dashboard_timeline_plot: timeline_plot, dashboard_media_plot: fig_media, dashboard_wordcloud_plot: fig_wc,
+        scraper_dashboard_group: gr.update(visible=True)
     }

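The timeline above pairs `get_dynamic_time_agg` with pandas `resample`. A minimal standalone illustration of that pairing, on synthetic dates rather than app data:

```python
# Standalone illustration of the dynamic-aggregation idea (synthetic data).
import pandas as pd

dates = pd.date_range("2024-01-01", "2024-06-30", freq="D")
df = pd.DataFrame({"published_date": dates})

# A ~6-month window falls in the helper's weekly tier (more than 90 days,
# at most 730), so articles are bucketed per week before plotting.
timeline = df.set_index("published_date").resample("W").size().reset_index(name="count")
print(timeline.head())
```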
def generate_sentiment_dashboard(df: pd.DataFrame):
|
| 359 |
+
updates = {sentiment_dashboard_tab: gr.update(visible=False)}
|
| 360 |
+
set_plot_style()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
+
if 'sentiment_label' in df.columns:
|
| 363 |
+
sentiment_counts = df['sentiment_label'].value_counts()
|
| 364 |
+
fig_pie, fig_media_sent = None, None
|
| 365 |
+
if not sentiment_counts.empty:
|
| 366 |
+
fig_pie, ax = plt.subplots(figsize=(6, 6)); ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=['#66c2a5', '#fc8d62', '#8da0cb'])
|
| 367 |
+
ax.set_title("Overall Sentiment Distribution", fontproperties=BANGLA_FONT); ax.axis('equal')
|
| 368 |
+
|
| 369 |
+
top_media = df['media'].value_counts().nlargest(10).index
|
| 370 |
+
media_sentiment = pd.crosstab(df[df['media'].isin(top_media)]['media'], df['sentiment_label'], normalize='index').mul(100)
|
| 371 |
+
if not media_sentiment.empty:
|
| 372 |
+
fig_media_sent, ax = plt.subplots(figsize=(10, 7)); media_sentiment.plot(kind='barh', stacked=True, ax=ax, colormap='viridis')
|
| 373 |
+
ax.set_title("Sentiment by Top Media Sources", fontproperties=BANGLA_FONT); ax.set_yticklabels(media_sentiment.index, fontproperties=BANGLA_FONT); plt.tight_layout()
|
| 374 |
+
|
| 375 |
+
updates.update({sentiment_pie_plot: fig_pie, sentiment_by_media_plot: fig_media_sent, sentiment_dashboard_tab: gr.update(visible=True)})
|
| 376 |
+
return updates
|
| 377 |
+
|
| 378 |
+
def generate_youtube_dashboard(videos_df, comments_df):
|
| 379 |
+
set_plot_style()
|
| 380 |
+
kpis = {
|
| 381 |
+
kpi_yt_videos_found: f"{len(videos_df):,}" if videos_df is not None else "0",
|
| 382 |
+
kpi_yt_views_scanned: f"{videos_df['view_count'].sum():,}" if videos_df is not None else "0",
|
| 383 |
+
kpi_yt_comments_scraped: f"{len(comments_df):,}" if comments_df is not None else "0"
|
| 384 |
+
}
|
| 385 |
|
| 386 |
+
channel_counts = videos_df['channel'].value_counts().nlargest(15).sort_values()
|
| 387 |
+
fig_channels, ax = plt.subplots(figsize=(8, 6))
|
| 388 |
+
if not channel_counts.empty:
|
| 389 |
+
channel_counts.plot(kind='barh', ax=ax, color='coral'); ax.set_title("Top 15 Channels by Video Volume", fontproperties=BANGLA_FONT); ax.set_yticklabels(channel_counts.index, fontproperties=BANGLA_FONT); plt.tight_layout()
|
| 390 |
+
|
| 391 |
+
fig_wc, fig_pie, fig_sentiment_video = None, None, None
|
| 392 |
+
if comments_df is not None and not comments_df.empty:
|
| 393 |
+
text = " ".join(comment for comment in comments_df['comment_text'].astype(str))
|
| 394 |
+
try:
|
| 395 |
+
wc = WordCloud(font_path=FONT_PATH, width=800, height=400, background_color='white', stopwords=BANGLA_STOP_WORDS, collocations=False).generate(text)
|
| 396 |
+
fig_wc, ax = plt.subplots(figsize=(10, 5)); ax.imshow(wc, interpolation='bilinear'); ax.axis("off"); ax.set_title("Most Common Words in Comments", fontproperties=BANGLA_FONT)
|
| 397 |
+
except Exception as e: logger.error(f"YouTube WordCloud failed: {e}")
|
| 398 |
+
|
| 399 |
+
if 'sentiment_label' in comments_df.columns:
|
| 400 |
+
sentiment_counts = comments_df['sentiment_label'].value_counts()
|
| 401 |
+
if not sentiment_counts.empty:
|
| 402 |
+
fig_pie, ax = plt.subplots(figsize=(6, 6)); ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=['#66c2a5', '#fc8d62', '#8da0cb']); ax.set_title("Overall Comment Sentiment", fontproperties=BANGLA_FONT)
|
| 403 |
+
|
| 404 |
+
top_videos_by_comment = comments_df['video_title'].value_counts().nlargest(10).index
|
| 405 |
+
video_sentiment = comments_df.groupby('video_title')['sentiment_label'].value_counts(normalize=True).unstack().mul(100).reindex(top_videos_by_comment).dropna(how='all')
|
| 406 |
+
if not video_sentiment.empty:
|
| 407 |
+
fig_sentiment_video, ax = plt.subplots(figsize=(10, 8)); video_sentiment.plot(kind='barh', stacked=True, ax=ax, colormap='viridis'); ax.set_title("Comment Sentiment by Top 10 Videos", fontproperties=BANGLA_FONT); ax.set_yticklabels(video_sentiment.index, fontproperties=BANGLA_FONT); plt.tight_layout()
|
| 408 |
+
|
| 409 |
+
return {**kpis, yt_channel_plot: fig_channels, yt_wordcloud_plot: fig_wc, yt_sentiment_pie_plot: fig_pie, yt_sentiment_by_video_plot: fig_sentiment_video}
|
| 410 |
+
|
| 411 |
+
def generate_youtube_topic_dashboard(videos_df_full_scan: pd.DataFrame):
|
| 412 |
+
if videos_df_full_scan is None or videos_df_full_scan.empty: return None, None, None
|
| 413 |
+
set_plot_style()
|
| 414 |
|
| 415 |
+
channel_views = videos_df_full_scan.groupby('channel')['view_count'].sum().nlargest(15).sort_values()
|
| 416 |
+
fig_channel_views, ax = plt.subplots(figsize=(10, 7)); channel_views.plot(kind='barh', ax=ax, color='purple'); ax.set_title("Channel Dominance by Total Views (Top 15)", fontproperties=BANGLA_FONT); ax.set_xlabel("Combined Views on Topic"); ax.set_yticklabels(channel_views.index, fontproperties=BANGLA_FONT); plt.tight_layout()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
    df_sample = videos_df_full_scan.sample(n=min(len(videos_df_full_scan), 200))
    avg_views, avg_engagement = df_sample['view_count'].median(), df_sample['engagement_rate'].median()
    fig_quadrant, ax = plt.subplots(figsize=(10, 8)); sns.scatterplot(data=df_sample, x='view_count', y='engagement_rate', size='like_count', sizes=(20, 400), hue='channel', alpha=0.7, ax=ax, legend=False)
    ax.set_xscale('log'); ax.set_yscale('log'); ax.set_title("Content Performance Quadrant", fontproperties=BANGLA_FONT); ax.set_xlabel("Video Views (Log Scale)", fontproperties=BANGLA_FONT); ax.set_ylabel("Engagement Rate (Log Scale)", fontproperties=BANGLA_FONT)
    ax.axhline(avg_engagement, ls='--', color='gray'); ax.axvline(avg_views, ls='--', color='gray'); ax.text(avg_views*1.1, ax.get_ylim()[1], 'High Performers', color='green', fontproperties=BANGLA_FONT); ax.text(ax.get_xlim()[0], avg_engagement*1.1, 'Niche Stars', color='blue', fontproperties=BANGLA_FONT)

    fig_age, ax = plt.subplots(figsize=(10, 7)); sns.scatterplot(data=df_sample, x='published_date', y='view_count', size='engagement_rate', sizes=(20, 400), alpha=0.6, ax=ax)
    ax.set_yscale('log'); ax.set_title("Content Age vs. Impact", fontproperties=BANGLA_FONT); ax.set_xlabel("Publication Date", fontproperties=BANGLA_FONT); ax.set_ylabel("Views (Log Scale)", fontproperties=BANGLA_FONT); plt.xticks(rotation=45)

    return fig_channel_views, fig_quadrant, fig_age
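
# NOTE: 'engagement_rate' is a column produced upstream by the YouTube
# pipeline; its exact formula is defined there. A common definition (an
# assumption for illustration only, not necessarily what this app computes)
# would be reactions per view, e.g.:
#   df['engagement_rate'] = (df['like_count'] + df['comment_count']) / df['view_count'].clip(lower=1)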

# ==============================================================================
# GRADIO UI DEFINITION
# ==============================================================================

with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), title=APP_TITLE) as app:
    gr.Markdown(f"# {APP_TITLE}\n*{APP_TAGLINE}*")

    # --- STATE MANAGEMENT ---
    scraper_results_state = gr.State()
    youtube_results_state = gr.State()
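    # The two State objects above hold per-session results with no visible UI
    # of their own. Each workflow writes into its State, and the State's
    # .change listener (wired up in the event-handler section) re-renders the
    # matching dashboard, which decouples scraping from plotting.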

    with gr.Tabs() as tabs:
        with gr.TabItem("1. News Scraper", id=0):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 1. Search Criteria")
                    search_keywords_textbox = gr.Textbox(label="Search Keywords", placeholder="e.g., বিএনপি সমাবেশ")
                    sites_to_search_textbox = gr.Textbox(label="Target Sites (Optional, comma-separated)", placeholder="e.g., prothomalo.com")
                    start_date_textbox = gr.Textbox(label="Start Date", placeholder="YYYY-MM-DD or 'last week'")
                    end_date_textbox = gr.Textbox(label="End Date", placeholder="YYYY-MM-DD or 'today'")
                    gr.Markdown("### 2. Scraping Parameters")
                    interval_days_slider = gr.Slider(1, 7, 3, step=1, label="Days per Interval")
                    max_pages_slider = gr.Slider(1, 10, 5, step=1, label="Max Pages per Interval")
                    filter_keywords_textbox = gr.Textbox(label="Filter Keywords (comma-separated, optional)", placeholder="e.g., নির্বাচন, সরকার")
                    start_scraper_button = gr.Button("Start Scraping & Analysis", variant="primary")
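                    # As the slider labels indicate, the scraper splits the
                    # start/end range into chunks of "Days per Interval" and
                    # fetches at most "Max Pages per Interval" of results for
                    # each chunk, bounding total request volume per search.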
                with gr.Column(scale=2):
                    scraper_results_df = gr.DataFrame(label="Filtered Results", interactive=False, wrap=True)
                    scraper_download_file = gr.File(label="Download Filtered Results CSV")

with gr.TabItem("2. News Analytics", id=1):
|
| 459 |
+
with gr.Group(visible=False) as scraper_dashboard_group:
|
| 460 |
+
with gr.Tabs():
|
| 461 |
+
with gr.TabItem("Overview"):
|
| 462 |
+
with gr.Row():
|
| 463 |
+
kpi_total_articles = gr.Textbox(label="Total Articles Found", interactive=False)
|
| 464 |
+
kpi_unique_media = gr.Textbox(label="Unique Media Sources", interactive=False)
|
| 465 |
+
kpi_date_range = gr.Textbox(label="Date Range of Articles", interactive=False)
|
| 466 |
+
dashboard_timeline_plot = gr.LinePlot(label="News Volume Timeline")
|
| 467 |
+
with gr.Row():
|
| 468 |
+
dashboard_media_plot = gr.Plot(label="Top Media Sources by Article Count")
|
| 469 |
+
dashboard_wordcloud_plot = gr.Plot(label="Headline Word Cloud")
|
| 470 |
+
with gr.TabItem("Sentiment Analysis", visible=False) as sentiment_dashboard_tab:
|
                        with gr.Row():
                            sentiment_pie_plot = gr.Plot(label="Overall Sentiment")
                            sentiment_by_media_plot = gr.Plot(label="Sentiment by Media Source")

with gr.TabItem("3. YouTube Topic Analysis", id=2):
|
|
|
|
|
|
|
| 476 |
with gr.Row():
|
| 477 |
+
with gr.Column(scale=1):
|
| 478 |
+
gr.Markdown("### 1. YouTube API & Search")
|
| 479 |
+
yt_api_key = gr.Textbox(label="YouTube API Key", type="password", placeholder="Paste your API key")
|
| 480 |
+
yt_search_keywords = gr.Textbox(label="Search Keywords", placeholder="e.g., বিএনপি, তারেক রহমান")
|
| 481 |
+
yt_published_after = gr.Textbox(label="Published After Date (Optional)", placeholder="YYYY-MM-DD or '1 month ago'")
|
| 482 |
+
gr.Markdown("### 2. Analysis Parameters")
|
| 483 |
+
yt_max_videos_for_stats = gr.Slider(label="Videos to Scan for Topic Stats (Broad Scan)", minimum=50, maximum=750, value=300, step=50)
|
| 484 |
+
yt_num_videos_for_comments = gr.Slider(label="Top Videos for Comment Analysis (Deep Dive)", minimum=5, maximum=100, value=25, step=5)
|
| 485 |
+
yt_max_comments = gr.Slider(10, 100, 30, step=10, label="Max Comments per Video")
|
| 486 |
+
start_yt_analysis_button = gr.Button("Start YouTube Analysis", variant="primary")
|
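                    # Two-stage design: the broad scan only collects metadata
                    # (views, likes, dates) for topic-level statistics, while
                    # the deep dive scrapes comments from the top-viewed subset
                    # alone, which keeps YouTube Data API quota usage bounded.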
                with gr.Column(scale=2):
                    with gr.Group(visible=False) as yt_dashboard_group:
                        gr.Markdown("### Topic Footprint KPIs (Based on Broad Scan)")
                        with gr.Row():
                            kpi_yt_total_topic_videos = gr.Textbox(label="Est. Total Videos on Topic (YT)", interactive=False)
                            kpi_yt_videos_found = gr.Textbox(label="Videos Scanned for Stats", interactive=False)
                            kpi_yt_views_scanned = gr.Textbox(label="Combined Views (of Scanned)", interactive=False)
                            kpi_yt_comments_scraped = gr.Textbox(label="Comments Analyzed (from Top Videos)", interactive=False)
                        with gr.Tabs():
                            with gr.TabItem("Deep Dive Analysis (on Top Videos)"):
                                yt_videos_df_output = gr.DataFrame(label="Top Videos Analyzed for Comments (sorted by views)")
                                with gr.Row():
                                    yt_channel_plot = gr.Plot(label="Channel Contribution by Video Count")
                                    yt_sentiment_pie_plot = gr.Plot(label="Overall Comment Sentiment")
                                with gr.Row():
                                    yt_wordcloud_plot = gr.Plot(label="Comment Word Cloud")
                                    yt_sentiment_by_video_plot = gr.Plot(label="Comment Sentiment by Video")
                            with gr.TabItem("Topic-Level Analytics (on All Scanned Videos)"):
                                yt_channel_views_plot = gr.Plot(label="Channel Dominance by Views")
                                yt_performance_quadrant_plot = gr.Plot(label="Content Performance Quadrant")
                                yt_content_age_plot = gr.Plot(label="Content Age vs. Impact")

    gr.Markdown(f"<div style='text-align: center; margin-top: 20px;'>{APP_FOOTER}</div>")

    # ==============================================================================
    # EVENT HANDLERS
    # ==============================================================================

    # --- NEWS SCRAPER WORKFLOW ---
    def news_scraper_workflow(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress=gr.Progress()):
        progress(0, desc="Starting news analysis...")
        raw_df, display_df = run_news_scraper_pipeline(search_keywords, sites, start_date, end_date, interval, max_pages, filter_keys, progress)

        if raw_df.empty:
            gr.Info("No news articles found for your query."); return None, None, None

        progress(0.8, desc="Analyzing sentiment of news headlines...")
        analyzed_df = run_sentiment_analysis(raw_df.copy(), 'title', progress)

        output_path = "filtered_news_data.csv"; display_df.to_csv(output_path, index=False)
        return display_df, output_path, analyzed_df

    start_scraper_button.click(
        fn=news_scraper_workflow,
        inputs=[search_keywords_textbox, sites_to_search_textbox, start_date_textbox, end_date_textbox, interval_days_slider, max_pages_slider, filter_keywords_textbox],
        outputs=[scraper_results_df, scraper_download_file, scraper_results_state]
    )
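
    # The three return values of news_scraper_workflow map positionally onto
    # the three outputs above: the DataFrame fills the results table, the CSV
    # path feeds the download widget, and analyzed_df lands in
    # scraper_results_state, whose .change listener below rebuilds the
    # dashboards.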
|
| 535 |
+
def update_news_dashboards(analyzed_df):
|
| 536 |
+
if analyzed_df is None or analyzed_df.empty:
|
| 537 |
+
return {scraper_dashboard_group: gr.update(visible=False), sentiment_dashboard_tab: gr.update(visible=False)}
|
| 538 |
+
|
| 539 |
+
scraper_updates = generate_scraper_dashboard(analyzed_df)
|
| 540 |
+
sentiment_updates = generate_sentiment_dashboard(analyzed_df)
|
| 541 |
+
return {**scraper_updates, **sentiment_updates}
|
| 542 |
+
|
| 543 |
+
news_ui_components = [
|
| 544 |
+
scraper_dashboard_group, kpi_total_articles, kpi_unique_media, kpi_date_range,
|
| 545 |
+
dashboard_timeline_plot, dashboard_media_plot, dashboard_wordcloud_plot,
|
| 546 |
+
sentiment_dashboard_tab, sentiment_pie_plot, sentiment_by_media_plot
|
| 547 |
+
]
|
| 548 |
+
scraper_results_state.change(fn=update_news_dashboards, inputs=scraper_results_state, outputs=news_ui_components)
|
| 549 |
+

    # --- YOUTUBE WORKFLOW ---
    def youtube_workflow(api_key, query, max_stats, num_comments, max_comments, published_after, progress=gr.Progress()):
        sanitized_api_key = api_key.strip()
        sanitized_query = query.strip()
        videos_df_full, comments_df, total_vids_est = run_youtube_analysis_pipeline(
            sanitized_api_key, sanitized_query, max_stats, num_comments, max_comments, published_after, progress
        )
        if videos_df_full.empty:
            gr.Info("No videos found for your YouTube query."); return None, None

        if comments_df is not None and not comments_df.empty:
            progress(0.9, desc="Analyzing comment sentiment...")
            comments_df = run_sentiment_analysis(comments_df.copy(), 'comment_text', progress)

        top_videos_for_display = videos_df_full.head(int(num_comments))
        return top_videos_for_display, {"full_scan": videos_df_full, "comments": comments_df, "total_estimate": total_vids_est}
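
    # The second return value is a plain dict stored in youtube_results_state;
    # bundling the broad-scan DataFrame, the comments DataFrame, and the total
    # estimate into one payload lets a single .change event refresh every
    # dashboard panel at once.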

    start_yt_analysis_button.click(
        fn=youtube_workflow,
        inputs=[yt_api_key, yt_search_keywords, yt_max_videos_for_stats, yt_num_videos_for_comments, yt_max_comments, yt_published_after],
        outputs=[yt_videos_df_output, youtube_results_state]
    )

    def update_youtube_dashboards(results_data):
        if not results_data or results_data.get("full_scan") is None or results_data["full_scan"].empty:
            return {
                yt_dashboard_group: gr.update(visible=False), kpi_yt_total_topic_videos: "0",
                kpi_yt_videos_found: "0", kpi_yt_views_scanned: "0", kpi_yt_comments_scraped: "0",
                yt_channel_plot: None, yt_wordcloud_plot: None, yt_sentiment_pie_plot: None,
                yt_sentiment_by_video_plot: None, yt_channel_views_plot: None,
                yt_performance_quadrant_plot: None, yt_content_age_plot: None
            }

        videos_df_full, comments_df, total_estimate = results_data.get("full_scan"), results_data.get("comments"), results_data.get("total_estimate", 0)
        deep_dive_updates = generate_youtube_dashboard(videos_df_full, comments_df)
        fig_ch_views, fig_quad, fig_age = generate_youtube_topic_dashboard(videos_df_full)

        return {
            yt_dashboard_group: gr.update(visible=True),
            kpi_yt_total_topic_videos: f"{total_estimate:,}",
            **deep_dive_updates,
            yt_channel_views_plot: fig_ch_views,
            yt_performance_quadrant_plot: fig_quad,
            yt_content_age_plot: fig_age,
        }

    yt_ui_components = [
        yt_dashboard_group, kpi_yt_total_topic_videos, kpi_yt_videos_found, kpi_yt_views_scanned, kpi_yt_comments_scraped,
        yt_channel_plot, yt_wordcloud_plot, yt_sentiment_pie_plot, yt_sentiment_by_video_plot,
        yt_channel_views_plot, yt_performance_quadrant_plot, yt_content_age_plot
    ]
    youtube_results_state.change(fn=update_youtube_dashboards, inputs=youtube_results_state, outputs=yt_ui_components)

# ==============================================================================
# LAUNCH THE APP
# ==============================================================================

if __name__ == "__main__":
    auth_credentials = os.getenv("AUTH_CREDENTIALS")
    auth_tuple = None
    if auth_credentials and ":" in auth_credentials:
        user, pwd = auth_credentials.split(":", 1)
        auth_tuple = (user, pwd)
        logger.info("Using authentication credentials from environment variable.")
    else:
        logger.warning("No AUTH_CREDENTIALS found. Falling back to default, insecure credentials; set AUTH_CREDENTIALS as an environment variable for production.")
        auth_tuple = ("bnp", "12345")
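
    # Example (hypothetical credentials): supply a "user:password" pair via
    # the environment before launching, e.g.
    #   export AUTH_CREDENTIALS="analyst:s3cret"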

    app.launch(debug=True, auth=auth_tuple)