# Twitter Drug Crime Monitoring Dashboard

#modify_app.py
import streamlit as st
import pandas as pd
import os
import json
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta

import nltk
from nltk.corpus import stopwords  # ✅ import first

# Resolve the English stopword list, fetching the NLTK corpus on first use
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    # The corpus is not available locally yet: download it, then retry
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
from alerts import compute_dynamic_risk,assign_dynamic_risk_level
from evaluation import evaluate_model

# Run evaluation on the scraped CSV folder.
# NOTE(review): this call sits at module top level, so it executes on every
# run of the script and before st.set_page_config() further down — confirm
# that evaluate_model is cheap and emits no Streamlit elements.
evaluate_model("drug_analysis_data_3months")

import re

# Page-level setup: browser tab title and a wide layout for the charts.
st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide")

# Custom CSS for better styling.
# Defines the header banner plus the alert boxes used by the views below
# (.critical-alert and .high-priority are referenced in the Summary view).
# unsafe_allow_html is required so the raw <style> tag is injected as HTML.
st.markdown("""
<style>
    .main-header {
        background: linear-gradient(90deg, #1e3c72, #2a5298);
        color: white;
        padding: 1rem;
        border-radius: 10px;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background: #f8f9fa;
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #007bff;
    }
    .critical-alert {
        background: #f8d7da;
        border: 1px solid #f5c6cb;
        color: #721c24;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }
    .high-priority {
        background: #fff3cd;
        border: 1px solid #ffeaa7;
        color: #856404;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }
    .warning-box {
        background: #d4edda;
        border: 1px solid #c3e6cb;
        color: #155724;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)

# Configuration
# data_dirs is searched in order by load_data(); its first entry is also the
# directory load_priority_data() reads from.
# NOTE(review): refresh_interval, max_display_tweets and chart_height are not
# referenced in the part of the file reviewed here — confirm they are used.
DASHBOARD_CONFIG = {
    'data_dirs': ['drug_analysis_data_3months', 'data', 'output', '.'],
    'refresh_interval': 30,
    'max_display_tweets': 50,
    'chart_height': 400
}

# Main header (styled by the .main-header CSS class defined above)
st.markdown('<div class="main-header"><h1>Twitter Drug Crime Monitoring Dashboard</h1><p>Real-time Twitter Analysis for Drug Crime Detection</p></div>', unsafe_allow_html=True)

# ------------------------
# Enhanced Data Loading Functions
# ------------------------

def parse_dates_flexible(df):
    """Parse the ``datetime`` column using the best-matching known format.

    Every candidate format is applied to the whole column and the one that
    parses the most values wins (ties keep the earlier format in the list).
    Previously the first format that parsed *any* value was used, so e.g.
    month/day data was partially mis-read by the day/month format and the
    remaining rows were replaced with the current time.

    Args:
        df: DataFrame that may contain a ``datetime`` column. It is modified
            in place and also returned.

    Returns:
        The same DataFrame. When a ``datetime`` column exists it is converted
        to pandas datetimes; values that cannot be parsed are filled with the
        current time so downstream ``.dt`` accessors never see NaT.
    """
    if "datetime" not in df.columns:
        return df

    date_formats = [
        "%d-%m-%Y %H:%M:%S",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M:%S",
    ]

    original_datetime = df["datetime"].copy()
    parseable = int(original_datetime.notna().sum())

    best_parsed = None
    best_count = 0
    for fmt in date_formats:
        try:
            candidate = pd.to_datetime(original_datetime, format=fmt, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            # This format cannot be applied to the column at all; try the next
            continue

        parsed_count = int(candidate.notna().sum())
        if parsed_count > best_count:
            best_parsed, best_count = candidate, parsed_count

        # Nothing can beat a format that parsed every non-null value
        if best_count >= parseable:
            break

    # If no explicit format matched anything, fall back to generic parsing
    if best_parsed is None:
        best_parsed = pd.to_datetime(original_datetime, errors="coerce")

    # Fill any remaining NaT values with current time
    df["datetime"] = best_parsed.fillna(pd.Timestamp.now())

    return df

def validate_dataframe(df):
    """Check that a DataFrame is usable by the dashboard.

    Returns:
        ``(ok, message)`` where ``ok`` is False when ``df`` is None, empty,
        or lacks the ``username`` / ``content`` columns.
    """
    if df is None or df.empty:
        return False, "DataFrame is empty"

    missing_columns = []
    for required in ('username', 'content'):
        if required not in df.columns:
            missing_columns.append(required)

    if not missing_columns:
        return True, "DataFrame is valid"
    return False, f"Missing required columns: {missing_columns}"

# The cache expires after the configured refresh interval; without a TTL the
# cached result was kept for the lifetime of the app, so the sidebar
# auto-refresh never picked up newly scraped files.
@st.cache_data(ttl=DASHBOARD_CONFIG['refresh_interval'])
def load_data():
    """Load the most recent tweet dataset and its optional analysis report.

    Directories from ``DASHBOARD_CONFIG['data_dirs']`` are tried in order.
    In each one, CSV files whose name contains a known dataset keyword are
    preferred; if none match, any CSV file is accepted. The newest file (by
    ``os.path.getctime``) is loaded, its dates are parsed, and the derived
    ``date`` / ``hour`` / ``day_of_week`` / ``day`` columns are added when
    missing. Load metrics are reported in the sidebar.

    Returns:
        ``(df, report_data)`` for the first directory that yields a non-empty
        dataset; ``report_data`` is the parsed newest ``ANALYSIS_REPORT_*.json``
        or None. Returns ``(None, None)`` when no directory provides data.
    """
    start_time = time.time()
    dataset_keywords = ("karnataka_drug_tweets", "drug_tweets", "drug_analysis", "drug_crime")

    for data_dir in DASHBOARD_CONFIG['data_dirs']:
        if not os.path.exists(data_dir):
            continue

        try:
            # Look for main dataset files with flexible naming
            all_csv = [f for f in os.listdir(data_dir) if f.endswith(".csv")]
            csv_files = [
                f for f in all_csv
                if any(keyword in f.lower() for keyword in dataset_keywords)
            ]

            # Fallback to any CSV file
            if not csv_files:
                csv_files = all_csv

            if not csv_files:
                continue

            # Get the most recent file
            latest_file = max(csv_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
            file_path = os.path.join(data_dir, latest_file)

            df = pd.read_csv(file_path, encoding='utf-8')

            if df.empty:
                continue

            # Enhanced date parsing
            df = parse_dates_flexible(df)

            # Add derived columns if missing
            if "datetime" in df.columns:
                if "date" not in df.columns:
                    df["date"] = df["datetime"].dt.date
                if "hour" not in df.columns:
                    df["hour"] = df["datetime"].dt.hour
                if "day_of_week" not in df.columns:
                    df["day_of_week"] = df["datetime"].dt.day_name()
                if "day" not in df.columns:
                    df["day"] = df["datetime"].dt.day

            # Load report if available
            report_files = [f for f in os.listdir(data_dir)
                            if f.startswith("ANALYSIS_REPORT_") and f.endswith(".json")]
            report_data = None

            if report_files:
                latest_report = max(report_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
                try:
                    with open(os.path.join(data_dir, latest_report), 'r', encoding='utf-8') as f:
                        report_data = json.load(f)
                except Exception as e:
                    # The report is optional: warn and carry on without it
                    st.sidebar.warning(f"Could not load report: {e}")
                    report_data = None

            load_time = time.time() - start_time

            # Display load metrics in sidebar
            st.sidebar.success("Data loaded successfully")
            st.sidebar.metric("Load Time", f"{load_time:.2f}s")
            st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
            st.sidebar.info(f"Source: {latest_file}")

            return df, report_data

        except Exception as e:
            # Best effort: a broken directory must not block the later ones
            st.sidebar.warning(f"Failed to load from {data_dir}: {str(e)}")
            continue

    return None, None

def _load_latest_matching_csv(data_dir, marker, label):
    """Load the newest CSV in ``data_dir`` whose name contains ``marker``.

    Returns the DataFrame with parsed dates, or None when no file matches.
    Failures are reported as a sidebar warning mentioning ``label``.
    """
    frame = None
    try:
        candidates = [f for f in os.listdir(data_dir)
                      if marker in f and f.endswith(".csv")]

        if candidates:
            newest = max(candidates,
                         key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
            frame = pd.read_csv(os.path.join(data_dir, newest))
            frame = parse_dates_flexible(frame)
    except Exception as e:
        # Best effort: these datasets are optional extras for the dashboard
        st.sidebar.warning(f"Could not load {label} data: {e}")

    return frame


# The cache expires after the configured refresh interval so that newly
# written priority/contact files are picked up by the auto-refresh.
@st.cache_data(ttl=DASHBOARD_CONFIG['refresh_interval'])
def load_priority_data():
    """Load high priority and contact info datasets with fallbacks.

    Only the primary data directory (first entry of
    ``DASHBOARD_CONFIG['data_dirs']``) is searched.

    Returns:
        ``(high_priority_df, contact_df)``; each element is None when the
        directory is missing, no matching CSV exists, or loading failed.
    """
    data_dir = DASHBOARD_CONFIG['data_dirs'][0]  # Primary data directory

    if not os.path.exists(data_dir):
        return None, None

    high_priority_df = _load_latest_matching_csv(data_dir, "HIGH_PRIORITY", "high priority")
    contact_df = _load_latest_matching_csv(data_dir, "CONTACT_INFO", "contact info")

    return high_priority_df, contact_df

def safe_column_access(df, column, default=0):
    """Return ``df[column]``, or a Series of ``default`` when it is absent.

    The fallback Series has one entry per row and shares ``df``'s index.
    """
    if column not in df.columns:
        filler = [default for _ in range(len(df))]
        return pd.Series(filler, index=df.index)
    return df[column]

def safe_column_sum(df, column):
    """Return the sum of ``df[column]``, or 0 when the column is missing."""
    if column not in df.columns:
        return 0
    total = df[column].sum()
    return total

def safe_column_mean(df, column):
    """Return the mean of ``df[column]``; 0 if the column is missing or ``df`` has no rows."""
    if len(df) == 0 or column not in df.columns:
        return 0
    average = df[column].mean()
    return average

# ----------------- Helper: Calculate User Risk -----------------
def calculate_user_risk(df):
    """Calculate a risk score per user.

    Each CRITICAL tweet counts 2 points and each HIGH tweet 1 point.

    Args:
        df: Tweets with ``username`` and ``risk_level`` columns.

    Returns:
        DataFrame with columns ``username``, ``risk_score`` and
        ``tweet_count``, one row per user in order of first appearance.
        The three columns are always present, even when ``df`` is empty or
        lacks the required columns, so callers can filter on ``risk_score``
        without a KeyError. Rows with a missing username are not reported.
    """
    columns = ["username", "risk_score", "tweet_count"]

    if "username" not in df.columns or "risk_level" not in df.columns:
        return pd.DataFrame(columns=columns)
    if df.empty:
        return pd.DataFrame(columns=columns)

    levels = df["risk_level"]
    points = (levels == "HIGH").astype(int) + 2 * (levels == "CRITICAL").astype(int)

    # Single grouped pass instead of re-filtering the frame once per user
    scored = pd.DataFrame({
        "username": df["username"].to_numpy(),
        "points": points.to_numpy(),
    })
    summary = (
        scored.groupby("username", sort=False)
        .agg(risk_score=("points", "sum"), tweet_count=("points", "count"))
        .reset_index()
    )
    return summary[columns]

# ----------------- Helper: Filter Words -----------------
def get_filtered_words(text_series):
    """
    Tokenise a Series of text into lowercase words, dropping English
    stopwords and any word of two characters or fewer.
    """
    excluded = set(stopwords.words('english'))
    corpus = " ".join(text_series.astype(str)).lower()

    kept = []
    for token in re.findall(r'\b\w+\b', corpus):
        if len(token) > 2 and token not in excluded:
            kept.append(token)
    return kept


def create_heatmap_chart(df, x_col, y_col, title="Heatmap"):
    """Build a plotly heatmap of row counts for each (x_col, y_col) pair.

    Returns None when either column is missing from ``df``.
    """
    for axis_col in (x_col, y_col):
        if axis_col not in df.columns:
            return None

    # Count rows per pair, then spread into a y-by-x grid (missing pairs -> 0)
    pair_counts = df.groupby([x_col, y_col]).size().reset_index(name='count')
    grid = pair_counts.pivot(index=y_col, columns=x_col, values='count').fillna(0)

    trace = go.Heatmap(
        z=grid.values,
        x=grid.columns,
        y=grid.index,
        colorscale='Blues',
        hoverongaps=False
    )
    fig = go.Figure(data=trace)
    fig.update_layout(title=title, xaxis_title=x_col, yaxis_title=y_col, height=400)

    return fig

def create_weekly_trend_analysis(df):
    """Create the weekly and per-weekday tweet count charts.

    The ISO week number and weekday name are derived on a private frame, so
    the caller's DataFrame is left untouched (no extra ``week`` / ``weekday``
    columns are written into it).

    Args:
        df: Tweets with a parsed ``datetime`` column.

    Returns:
        ``(weekly_line_fig, weekday_bar_fig)``, or ``(None, None)`` when
        ``df`` has no ``datetime`` column.
    """
    if "datetime" not in df.columns:
        return None, None

    # Weekly aggregation on a separate frame rather than on the input
    timeline = df[["datetime"]].assign(
        week=lambda d: d["datetime"].dt.isocalendar().week,
        weekday=lambda d: d["datetime"].dt.day_name(),
    )

    weekly_counts = timeline.groupby('week').size().reset_index(name='count')
    weekday_counts = timeline.groupby('weekday').size().reset_index(name='count')

    # Reorder weekdays Monday..Sunday instead of alphabetically
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts['weekday'] = pd.Categorical(weekday_counts['weekday'], categories=weekday_order, ordered=True)
    weekday_counts = weekday_counts.sort_values('weekday')

    fig1 = px.line(weekly_counts, x='week', y='count', title="Weekly Tweet Trends")
    fig2 = px.bar(weekday_counts, x='weekday', y='count', title="Tweets by Weekday")

    return fig1, fig2

# ------------------------
# Load Data
# ------------------------
# Load the main dataset and the optional analysis report (both None on failure)
df, report_data = load_data()

# --- Compute dynamic risk for all tweets ---
if df is not None and not df.empty:
    # These names are already imported near the top of the file; the local
    # import is redundant but harmless.
    from alerts import compute_dynamic_risk, assign_dynamic_risk_level

    # Add dynamic risk fields: each row is handed to the alerts helpers as a dict.
    # Any 'risk_level' column already present in the CSV is overwritten here.
    df['dynamic_risk_score'] = df.apply(lambda row: compute_dynamic_risk(row.to_dict()), axis=1)
    df['risk_level'] = df.apply(lambda row: assign_dynamic_risk_level(row.to_dict()), axis=1)

# Without data there is nothing to render: explain what was searched and stop.
if df is None:
    st.error("No data found. Please run the drug crime scraper first.")
    
    # Enhanced debug information
    st.subheader("Debug Information")
    current_dir = os.getcwd()
    st.write(f"Current directory: {current_dir}")
    
    # List the CSV files present in every configured data directory
    for dir_name in DASHBOARD_CONFIG['data_dirs']:
        if os.path.exists(dir_name):
            files = [f for f in os.listdir(dir_name) if f.endswith('.csv')]
            st.write(f"CSV files in {dir_name}: {files}")
        else:
            st.write(f"Directory {dir_name} does not exist")
    
    st.info("Expected files: karnataka_drug_tweets_*.csv or similar drug-related CSV files")
    st.stop()

# Validate dataframe (requires the 'username' and 'content' columns)
is_valid, validation_message = validate_dataframe(df)
if not is_valid:
    st.error(f"Data validation failed: {validation_message}")
    st.write("Available columns:", list(df.columns))
    st.stop()

# Load priority data (either element may be None)
high_priority_df, contact_df = load_priority_data()

# Filter for current month data for some analyses
# (taken before the sidebar filters below are applied to df)
now = datetime.now()
if "datetime" in df.columns:
    df_month = df[(df['datetime'].dt.month == now.month) & (df['datetime'].dt.year == now.year)]
else:
    df_month = df

# ------------------------
# Sidebar Navigation & Filters
# ------------------------
st.sidebar.title("Dashboard Navigation")

# Auto-refresh option
auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)")

if auto_refresh:
    # Imported lazily so the optional dependency is only needed when used
    from streamlit_autorefresh import st_autorefresh

    st_autorefresh(interval=30*1000, key="refresh")

# Navigation tabs - ENHANCED with new options
analysis_type = st.sidebar.radio(
    "Select Analysis View",
    ["Summary", "Risk Analysis", "Actionable Insights", "📈 Predictive Analytics", "🌐 Network Analysis",
     "Geographic Analysis", "User Analysis", 
     "Content Analysis", "📊 Volume Trends", "🧠 User Behavior", 
     "📍 Heatmaps", "⚠️ Risk Patterns"]
)

# Common filters
st.sidebar.header("Data Filters")

# Date range filter
if "datetime" in df.columns and not df["datetime"].isna().all():
    try:
        min_date = df["datetime"].min().date()
        max_date = df["datetime"].max().date()

        date_range = st.sidebar.date_input(
            "Select Date Range",
            value=[min_date, max_date],
            min_value=min_date,
            max_value=max_date
        )

        # Filter dataframe by date range. Only the two bounds are applied:
        # the former extra "same year as the start date" condition silently
        # dropped every row after 31 December when the range crossed a year.
        if len(date_range) == 2:
            tweet_dates = df["datetime"].dt.date
            df = df[(tweet_dates >= date_range[0]) & (tweet_dates <= date_range[1])]

    except Exception as e:
        # Includes the case where only one date has been picked so far
        st.sidebar.warning(f"Date filtering error: {e}")

# Risk level filter
if "risk_level" in df.columns:
    available_risk_levels = df["risk_level"].unique().tolist()
    risk_levels = st.sidebar.multiselect(
        "Risk Levels",
        options=available_risk_levels,
        default=available_risk_levels
    )
    df = df[df["risk_level"].isin(risk_levels)]

# Search filter: case-insensitive literal substring match. regex=False keeps
# user input such as "(" or "*" from being interpreted as a regular
# expression (which raised an error or matched unexpectedly).
search_term = st.sidebar.text_input("Search Content", "")
if search_term:
    df = df[df["content"].str.lower().str.contains(search_term.lower(), regex=False, na=False)]

# Display current filter status
st.sidebar.info(f"Showing {len(df)} tweets")

# ------------------------
# EXECUTIVE SUMMARY
# ------------------------
if analysis_type == "Summary":
    st.header("Summary")

    # Key metrics in columns
    col1, col2, col3, col4, col5, col6 = st.columns(6)

    with col1:
        st.metric("Total Tweets", len(df))
    with col2:
        drug_related = safe_column_sum(df, "is_drug_related")
        st.metric("Drug Related", drug_related)
    with col3:
        crime_related = safe_column_sum(df, "is_crime_related")
        st.metric("Crime Related", crime_related)
    with col4:
        contact_info = safe_column_sum(df, "has_contact_info")
        st.metric("Contact Info", contact_info)
    with col5:
        st.metric("Unique Users", df["username"].nunique())
    with col6:
        avg_risk = df["dynamic_risk_score"].mean() if "dynamic_risk_score" in df.columns else 0
        st.metric("Avg. Dynamic Risk Score", f"{avg_risk:.2f}")

    # Risk level analysis
    if "risk_level" in df.columns:
        critical_count = len(df[df["risk_level"] == "CRITICAL"])
        high_count = len(df[df["risk_level"] == "HIGH"])

        if critical_count > 0:
            st.markdown(f'<div class="critical-alert"><strong>CRITICAL ALERT:</strong> {critical_count} tweets require immediate attention</div>', unsafe_allow_html=True)

        if high_count > 0:
            st.markdown(f'<div class="high-priority"><strong>HIGH PRIORITY:</strong> {high_count} tweets for investigation</div>', unsafe_allow_html=True)

        # Risk distribution pie chart
        col1, col2 = st.columns(2)

        with col1:
            risk_order = ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
            risk_dist = df["risk_level"].value_counts().reindex(risk_order).fillna(0)
            # color= must be supplied for color_discrete_map to take effect;
            # without it the slices fell back to the default colour sequence.
            fig_risk = px.pie(values=risk_dist.values, names=risk_dist.index,
                              color=risk_dist.index,
                              title="Risk Level Distribution",
                              color_discrete_map={
                                  "CRITICAL": "#dc3545",
                                  "HIGH": "#fd7e14",
                                  "MEDIUM": "#ffc107",
                                  "LOW": "#28a745"
                              })
            st.plotly_chart(fig_risk, use_container_width=True)

        with col2:
            # Sentiment analysis if available
            if "sentiment_compound" in df.columns:
                # include_lowest keeps a score of exactly -1 in "Negative"
                # instead of dropping it from the distribution
                sentiment_counts = pd.cut(df["sentiment_compound"],
                                          bins=[-1, -0.1, 0.1, 1],
                                          labels=["Negative", "Neutral", "Positive"],
                                          include_lowest=True).value_counts()
                fig_sentiment = px.bar(x=sentiment_counts.index, y=sentiment_counts.values,
                                       title="Sentiment Distribution",
                                       color=sentiment_counts.values,
                                       color_continuous_scale="RdYlGn")
                st.plotly_chart(fig_sentiment, use_container_width=True)
            else:
                st.info("Sentiment data not available")

    # Analysis report summary
    if report_data:
        st.subheader("Analysis Report Summary")

        col1, col2 = st.columns(2)

        with col1:
            if "summary_statistics" in report_data:
                st.json(report_data["summary_statistics"])

        with col2:
            if "investigation_priorities" in report_data:
                st.json(report_data["investigation_priorities"])

# ------------------------
# NEW: VOLUME TRENDS
# ------------------------
elif analysis_type == "📊 Volume Trends":
    st.header("📊 Tweet Volume: Daily,Weekly and Hourly Trends")
    
    if "datetime" in df.columns and not df["datetime"].isna().all():
        # Daily trend
        if "date" in df.columns:
            daily_counts = df.groupby("date").size().reset_index(name="count")
            fig_daily = px.line(daily_counts, x="date", y="count", 
                               title="Daily Tweet Volume")
            st.plotly_chart(fig_daily, use_container_width=True)
        
        # Hourly and weekday patterns
        col1, = st.columns(1)
        
        with col1:
            if "hour" in df.columns:
                hourly_counts = df.groupby("hour").size()
                fig_hourly = px.bar(x=hourly_counts.index, y=hourly_counts.values,
                                   title="Tweets by Hour of Day")
                st.plotly_chart(fig_hourly, use_container_width=True)
        
    # Weekly trends
    if "datetime" in df.columns:
        weekly_fig1, weekly_fig2 = create_weekly_trend_analysis(df)
        if weekly_fig1 and weekly_fig2:
            st.subheader("📅 Weekly Trends")
            col1, col2 = st.columns(2)
            with col1:
                st.plotly_chart(weekly_fig1, use_container_width=True)
            with col2:
                st.plotly_chart(weekly_fig2, use_container_width=True)
    else:
        st.info("Temporal data not available")
        
    # CSV Downloads
    st.subheader("📄 Download Data")
    col1, col2 = st.columns(2)
    
    with col1:
        if st.button("📥 Download Top Users CSV"):
            top_users = df.groupby("username").agg(
                tweet_count=("username", "count"),
                max_risk=("dynamic_risk_score", "max")
            ).sort_values("tweet_count", ascending=False).head(20).reset_index()
            csv = top_users.to_csv(index=False)
            st.download_button(
                "Download CSV", csv, "top_users.csv", "text/csv"
            )

    with col2:
        if st.button("📥 Download Top Locations CSV"):
            if "user_location" in df.columns:
                top_locations = df.groupby("user_location").agg(
                    tweet_count=("user_location", "count"),
                    max_risk=("dynamic_risk_score", "max")
                ).sort_values("tweet_count", ascending=False).head(20).reset_index()
                csv = top_locations.to_csv(index=False)
                st.download_button(
                    "Download CSV", csv, "top_locations.csv", "text/csv"
                )


# ------------------------
# NEW: USER BEHAVIOR
# ------------------------
elif analysis_type == "🧠 User Behavior":
    st.header("🧠 User Behavior Analysis")
    
    # Top repeat users
    st.subheader("🧠 Top Repeat Users")
    user_activity = df["username"].value_counts().head(15)
    
    if not user_activity.empty:
        fig_users = px.bar(x=user_activity.values, y=user_activity.index, 
                          orientation='h', title="Top 15 Most Active Users")
        fig_users.update_layout(yaxis=dict(autorange="reversed"))
        st.plotly_chart(fig_users, use_container_width=True)
        
        # Show details of top users
        with st.expander("View Top User Details"):
            for username, count in user_activity.head(10).items():
                user_tweets = df[df["username"] == username]
                
                # Safe mode extraction with proper error handling
                if "risk_level" in user_tweets.columns and not user_tweets["risk_level"].empty:
                    risk_mode = user_tweets["risk_level"].mode()
                    risk_level = risk_mode.iloc[0] if len(risk_mode) > 0 else "Unknown"
                else:
                    risk_level = "Unknown"
                
                if "user_location" in user_tweets.columns and not user_tweets["user_location"].empty:
                    location_mode = user_tweets["user_location"].mode()
                    location = location_mode.iloc[0] if len(location_mode) > 0 else "Unknown"
                else:
                    location = "Unknown"
                
                st.write(f"**@{username}**: {count} tweets | Risk: {risk_level} | Location: {location}")
    
    # User engagement patterns
    if "like_count" in df.columns or "retweet_count" in df.columns:
        st.subheader("📊 User Engagement Patterns")
        
        col1, col2 = st.columns(2)
        
        with col1:
            if "like_count" in df.columns:
                avg_likes = df.groupby("username")["like_count"].mean().sort_values(ascending=False).head(15)
                fig_likes = px.bar(x=avg_likes.values, y=avg_likes.index,
                                  orientation='h', title="Users by Average Likes")
                fig_likes.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_likes, use_container_width=True)
        
        with col2:
            if "retweet_count" in df.columns:
                avg_retweets = df.groupby("username")["retweet_count"].mean().sort_values(ascending=False).head(15)
                fig_retweets = px.bar(x=avg_retweets.values, y=avg_retweets.index,
                                     orientation='h', title="Users by Average Retweets")
                fig_retweets.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_retweets, use_container_width=True)
    
    # User location overlap analysis
    if "user_location" in df.columns and "risk_level" in df.columns:
        st.subheader("📍 User Location vs Risk Analysis")
        location_risk = df.groupby(["user_location", "risk_level"]).size().reset_index(name="count")
        location_risk = location_risk[location_risk["user_location"] != ""]
        
        if not location_risk.empty:
            fig_loc_risk = px.bar(location_risk, x="user_location", y="count", 
                                 color="risk_level", title="Risk Distribution by Location",
                                 color_discrete_map={
                                     "CRITICAL": "#dc3545",
                                     "HIGH": "#fd7e14", 
                                     "MEDIUM": "#ffc107",
                                     "LOW": "#28a745"
                                 })
            fig_loc_risk.update_xaxes(tickangle=45)
            st.plotly_chart(fig_loc_risk, use_container_width=True)

# ------------------------
# NEW: HEATMAPS
# ------------------------
elif analysis_type == "📍 Heatmaps":
    st.header("📍 Time-Based Heatmaps")
    
    # -------------------
    # Day-Hour heatmap
    # -------------------
    if "day_of_week" in df.columns and "hour" in df.columns:
        # Ensure proper order
        day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
        df['day_of_week'] = pd.Categorical(df['day_of_week'], categories=day_order, ordered=True)

        st.subheader("🔥 Day vs Hour Activity Heatmap")
        heatmap_fig = create_heatmap_chart(df, "hour", "day_of_week", "Tweet Activity: Day vs Hour")
        if heatmap_fig:
            st.plotly_chart(heatmap_fig, use_container_width=True)
    
    # Risk level heatmap
    if "risk_level" in df.columns and "hour" in df.columns:
        st.subheader("⚠️ Risk Level vs Hour Heatmap")
        risk_heatmap = create_heatmap_chart(df, "hour", "risk_level", "Risk Level Distribution by Hour")
        if risk_heatmap:
            st.plotly_chart(risk_heatmap, use_container_width=True)
    
    # -------------------
    # Top Locations Heatmap
    # -------------------
    if "user_location" in df.columns and "hour" in df.columns:
        st.subheader("📍 Location vs Hour Heatmap (Top Locations)")
        
        # Add slider in sidebar
        TOP_N_LOCATIONS = st.sidebar.slider("Top N Locations for Heatmaps", 5, 30, 10)

        # Filter top N locations
        top_locations = df["user_location"].value_counts().head(TOP_N_LOCATIONS).index
        df_top_loc = df[df["user_location"].isin(top_locations)]
        
        if not df_top_loc.empty:
            loc_heatmap = create_heatmap_chart(df_top_loc, "hour", "user_location",
                                               f"Top {TOP_N_LOCATIONS} Locations Activity by Hour")
            if loc_heatmap:
                st.plotly_chart(loc_heatmap, use_container_width=True)
    
    # Tweet location heatmap (if geographic coordinates available)
    if "latitude" in df.columns and "longitude" in df.columns:
        st.subheader("🗺️ Geographic Tweet Distribution")
        valid_coords = df.dropna(subset=["latitude", "longitude"])
        
        if not valid_coords.empty:
            fig_map = px.scatter_mapbox(
                valid_coords, lat="latitude", lon="longitude",
                color="risk_level" if "risk_level" in df.columns else None,
                size_max=15, zoom=7,
                mapbox_style="open-street-map",
                title="Geographic Distribution of Tweets"
            )
            st.plotly_chart(fig_map, use_container_width=True)
        else:
            st.info("No geographic coordinates available for mapping")


# ------------------------
# NEW: RISK PATTERNS
# ------------------------
# High-risk users analysis
elif analysis_type == "⚠️ Risk Patterns":
    st.header("⚠️ Risk Patterns and High-Risk Analysis")

    # High-risk users analysis
    if "risk_level" in df.columns:
        st.subheader("🚨 High-Risk Users")
        
        user_risk_df = calculate_user_risk(df)
        high_risk_users = user_risk_df[user_risk_df["risk_score"] > 0].sort_values("risk_score", ascending=False).head(20)
        
        if not high_risk_users.empty:
            fig_risk_users = px.bar(high_risk_users, x="risk_score", y="username",
                                    orientation='h', color="tweet_count", color_continuous_scale="Reds")
            fig_risk_users.update_layout(yaxis=dict(autorange="reversed"))
            st.plotly_chart(fig_risk_users, use_container_width=True)
            
            # Optional: show details
            with st.expander("High-Risk User Details"):
                for _, row in high_risk_users.iterrows():
                    user_data = df[df["username"] == row["username"]]
                    critical_count = (user_data["risk_level"] == "CRITICAL").sum()
                    high_count = (user_data["risk_level"] == "HIGH").sum()
                    st.write(f"**@{row['username']}**: Risk Score: {row['risk_score']} | Critical: {critical_count} | High: {high_count} | Total Tweets: {row['tweet_count']}")
    
    # Risk overlap analysis
    if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
        st.subheader("🔄 Drug-Crime Overlap Analysis")
        
        # Create overlap categories
        df_overlap = df.copy()
        df_overlap["category"] = "Other"
        df_overlap.loc[df_overlap["is_drug_related"] == 1, "category"] = "Drug Only"
        df_overlap.loc[df_overlap["is_crime_related"] == 1, "category"] = "Crime Only"
        df_overlap.loc[(df_overlap["is_drug_related"] == 1) & (df_overlap["is_crime_related"] == 1), "category"] = "Drug + Crime"
        
        overlap_counts = df_overlap["category"].value_counts()
        
        fig_overlap = px.pie(values=overlap_counts.values, names=overlap_counts.index,
                            title="Drug-Crime Content Overlap",
                            color_discrete_map={
                                "Drug + Crime": "#dc3545",
                                "Drug Only": "#fd7e14",
                                "Crime Only": "#ffc107",
                                "Other": "#28a745"
                            })
        st.plotly_chart(fig_overlap, use_container_width=True)
        
        # Show high-overlap users
        high_overlap_users = df_overlap[df_overlap["category"] == "Drug + Crime"]["username"].value_counts().head(10)
        if not high_overlap_users.empty:
            st.write("**Users with most Drug+Crime tweets:**")
            for username, count in high_overlap_users.items():
                st.write(f"- @{username}: {count} tweets")
    
    # Risk progression over time: tweets per day, one line per risk level.
    if "datetime" in df.columns and "risk_level" in df.columns:
        st.subheader("📈 Risk Level Trends Over Time")

        # Build the day key as a standalone Series instead of writing a
        # "date_str" column into df: df is reused later in the script (export,
        # debug column list), so a helper column would leak into those outputs.
        date_str = df["datetime"].dt.strftime("%Y-%m-%d").rename("date_str")
        risk_time = (
            df.groupby([date_str, "risk_level"])
            .size()
            .reset_index(name="count")
        )

        fig_risk_time = px.line(
            risk_time,
            x="date_str",
            y="count",
            color="risk_level",
            title="Risk Levels Trend Over Time",
            color_discrete_map={
                "CRITICAL": "#dc3545",
                "HIGH": "#fd7e14",
                "MEDIUM": "#ffc107",
                "LOW": "#28a745",
            },
        )
        fig_risk_time.update_xaxes(tickangle=45)
        st.plotly_chart(fig_risk_time, use_container_width=True)

# ------------------------
# RISK ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Risk Analysis":
    st.header("Risk Analysis")
    
    # High-risk tweets table
    if high_priority_df is not None and not high_priority_df.empty:
        st.subheader("High Priority Tweets")
        
        # Risk level tabs
        risk_tab1, risk_tab2 = st.tabs(["CRITICAL", "HIGH"])
        
        with risk_tab1:
            critical_tweets = high_priority_df[high_priority_df["risk_level"] == "CRITICAL"]
            if not critical_tweets.empty:
                for idx, tweet in critical_tweets.head(10).iterrows():
                    with st.expander(f"CRITICAL: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No critical risk tweets in current filter")
        
        with risk_tab2:
            high_tweets = high_priority_df[high_priority_df["risk_level"] == "HIGH"]
            if not high_tweets.empty:
                for idx, tweet in high_tweets.head(10).iterrows():
                    with st.expander(f"HIGH: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No high risk tweets in current filter")
    else:
        st.info("No high priority data available")
    
    # Side-by-side histograms of the drug and crime scores.
    if not {"drug_score", "crime_score"}.issubset(df.columns):
        st.info("Risk score data not available")
    else:
        fig_scores = make_subplots(
            rows=1,
            cols=2,
            subplot_titles=("Drug Score Distribution", "Crime Score Distribution"),
        )

        # (column, legend name) for each subplot, left to right.
        score_panels = [("drug_score", "Drug Score"), ("crime_score", "Crime Score")]
        for panel_col, (score_column, legend_name) in enumerate(score_panels, start=1):
            histogram = go.Histogram(x=df[score_column], name=legend_name, nbinsx=20)
            fig_scores.add_trace(histogram, row=1, col=panel_col)

        fig_scores.update_layout(title="Risk Score Distributions")
        st.plotly_chart(fig_scores, use_container_width=True)

# ------------------------
# Actionable Insights 
# ------------------------
elif analysis_type == "Actionable Insights":
    st.header("Actionable Insights")
    
    # Contact information tweets
    if contact_df is not None and not contact_df.empty:
        st.subheader("Tweets with Contact Information")
        st.markdown('<div class="warning-box">These tweets contain phone numbers or contact details - HIGH PRIORITY for investigation</div>', unsafe_allow_html=True)
        
        for idx, tweet in contact_df.head(20).iterrows():
            with st.expander(f"Contact Info: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                st.write(f"**Content:** {tweet['content']}")
                st.write(f"**Phone Numbers:** {tweet.get('phone_numbers', 'Not extracted')}")
                st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                st.write(f"**Risk Level:** {tweet.get('risk_level', 'Unknown')}")
                if 'tweet_url' in tweet:
                    st.write(f"**URL:** {tweet['tweet_url']}")
    else:
        st.info("No tweets with contact information found")
    
    # Bulk operation indicators: tweets whose text contains any of the
    # user-supplied quantity keywords (case-insensitive substring match).
    st.subheader("Bulk Operation Indicators")

    raw_keywords = st.sidebar.text_area("Bulk Operation Keywords (comma-separated)",
                                        "kg,gram,bulk,wholesale,kilos,ounce,pound")
    # Drop blank entries (e.g. a trailing comma): an empty alternative in the
    # regex would match every tweet.
    BULK_KEYWORDS = [kw.strip() for kw in raw_keywords.split(",") if kw.strip()]

    if BULK_KEYWORDS and "content" in df.columns:
        # Keywords are literal text, so escape them; otherwise input such as
        # "c++" or "(" is treated as regex syntax and can raise re.error.
        bulk_pattern = "|".join(re.escape(kw) for kw in BULK_KEYWORDS)
        bulk_tweets = df[df["content"].str.contains(bulk_pattern, case=False, regex=True, na=False)]
    else:
        # No usable keywords or no text column: nothing can match.
        bulk_tweets = df.iloc[0:0]

    if not bulk_tweets.empty:
        st.write(f"Found {len(bulk_tweets)} tweets mentioning bulk quantities")

        # Only the first 10 matches are rendered.
        for idx, tweet in bulk_tweets.head(10).iterrows():
            with st.expander(f"Bulk: @{tweet['username']} - Risk: {tweet.get('risk_level', 'Unknown')}"):
                st.write(f"**Content:** {tweet['content']}")
                st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                if 'tweet_url' in tweet:
                    st.write(f"**URL:** {tweet['tweet_url']}")
    else:
        st.info("No bulk operation indicators found")
    
    # Most active accounts by tweet count (top 15).
    st.subheader("High Activity Users")
    tweets_per_user = df["username"].value_counts()
    user_activity = tweets_per_user.head(15)

    if len(user_activity) > 0:
        fig_users = px.bar(
            x=user_activity.values,
            y=user_activity.index,
            orientation='h',
            title="Top 15 Most Active Users",
        )
        # Reverse the y axis so the most active user is drawn at the top.
        fig_users.update_layout(yaxis={"autorange": "reversed"})
        st.plotly_chart(fig_users, use_container_width=True)

# ------------------------
# NEW: PREDICTIVE ANALYTICS
# ------------------------
elif analysis_type == "📈 Predictive Analytics":
    st.header("📈 Predictive Analytics & Trends")
    
    st.subheader("📊 Activity Forecast")
    
    if "datetime" in df.columns and len(df) >= 7:
        # Daily activity trend
        daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count")
        daily_activity.columns = ["date", "count"]
        daily_activity["date"] = pd.to_datetime(daily_activity["date"])
        
        # Calculate moving average
        daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
        daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
        
        # Create forecast visualization
        fig_forecast = go.Figure()
        
        fig_forecast.add_trace(go.Scatter(
            x=daily_activity["date"],
            y=daily_activity["count"],
            name="Actual Activity",
            mode="lines+markers",
            line=dict(color="#1f77b4")
        ))
        
        fig_forecast.add_trace(go.Scatter(
            x=daily_activity["date"],
            y=daily_activity["7_day_ma"],
            name="7-Day Moving Average",
            mode="lines",
            line=dict(color="#ff7f0e", dash="dash")
        ))
        
        fig_forecast.update_layout(
            title="Tweet Activity Trend & Forecast",
            xaxis_title="Date",
            yaxis_title="Number of Tweets",
            hovermode="x unified"
        )
        
        st.plotly_chart(fig_forecast, use_container_width=True)
        
        # Trend analysis
        col1, col2, col3 = st.columns(3)
        
        with col1:
            recent_avg = daily_activity["count"].tail(7).mean()
            st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day")
        
        with col2:
            if len(daily_activity) >= 14:
                prev_avg = daily_activity["count"].tail(14).head(7).mean()
                change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0
                st.metric("Week-over-Week Change", f"{change:+.1f}%")
        
        with col3:
            peak_day = daily_activity.loc[daily_activity["count"].idxmax()]
            st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d"))

    
    # Per-user risk summary: tweet volume against a weighted risk score.
    st.subheader("👤 High-Risk User Patterns")

    if "username" in df.columns and "risk_level" in df.columns:
        # Count rows per user with "size" rather than counting a "tweet_id"
        # column, which this section never checks for and which would raise
        # KeyError when absent. Risk score = 2 per CRITICAL tweet + 1 per HIGH.
        user_risk_scores = (
            df.groupby("username")["risk_level"]
            .agg(
                tweet_count="size",
                risk_score=lambda levels: (levels == "CRITICAL").sum() * 2 + (levels == "HIGH").sum(),
            )
            .reset_index()
        )

        # Users with at least one risky tweet and 3+ tweets overall, highest
        # risk score first.
        escalating_users = user_risk_scores[
            (user_risk_scores["risk_score"] > 0) &
            (user_risk_scores["tweet_count"] >= 3)
        ].sort_values("risk_score", ascending=False).head(15)

        if not escalating_users.empty:
            fig_escalating = px.scatter(
                escalating_users,
                x="tweet_count",
                y="risk_score",
                size="risk_score",
                hover_data=["username"],
                title="High-Risk User Activity Matrix",
                labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"}
            )

            st.plotly_chart(fig_escalating, use_container_width=True)

            st.write("**Users to Monitor:**")
            for _, user in escalating_users.head(10).iterrows():
                st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}")
    

# ------------------------
# NEW: NETWORK ANALYSIS
# ------------------------
elif analysis_type == "🌐 Network Analysis":
    st.header("🌐 Network Analysis")
    
    st.subheader("👥 User Connection Analysis")
    
    # Mentions network
    if "mentions" in df.columns:
        st.write("### User Mention Network")
        
        mention_pairs = []
        for _, row in df.iterrows():
            if pd.notna(row.get("mentions")) and row["mentions"]:
                mentions = str(row["mentions"]).split()
                for mention in mentions:
                    mention_clean = mention.strip("@")
                    if mention_clean:
                        mention_pairs.append({
                            "from": row["username"],
                            "to": mention_clean,
                            "risk_level": row.get("risk_level", "UNKNOWN")
                        })
        
        if mention_pairs:
            mention_df = pd.DataFrame(mention_pairs)
            
            # Top mentioned users
            top_mentioned = mention_df["to"].value_counts().head(15)
            
            fig_mentioned = px.bar(
                x=top_mentioned.values,
                y=top_mentioned.index,
                orientation="h",
                title="Most Mentioned Users",
                labels={"x": "Times Mentioned", "y": "Username"}
            )
            fig_mentioned.update_layout(yaxis=dict(autorange="reversed"))
            st.plotly_chart(fig_mentioned, use_container_width=True)
            
            # Connection strength
            connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions")
            strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False)
            
            if not strong_connections.empty:
                st.write("### 🔗 Strong Connections (2+ mentions)")
                
                for _, conn in strong_connections.head(20).iterrows():
                    st.write(f"- @{conn['from']} → @{conn['to']}: {conn['mentions']} times")
        else:
            st.info("No mention data available")
    
    # Location clustering: per-location tweet volume, distinct users and
    # number of CRITICAL tweets.
    st.subheader("📍 Location-Based Clustering")

    if "user_location" in df.columns:
        has_risk_level = "risk_level" in df.columns

        # Build the aggregation spec conditionally. Referencing "risk_level" in
        # the spec when the column is absent raises KeyError, so the check has
        # to happen here rather than inside the aggregation lambda. Tweets are
        # counted by group size, so no "tweet_id" column is required.
        location_agg = {
            # dropna() keeps NaN usernames out of the displayed user list.
            "users": ("username", lambda names: list(names.dropna().unique())),
            "tweet_count": ("username", "size"),
        }
        if has_risk_level:
            location_agg["critical_count"] = (
                "risk_level",
                lambda levels: (levels == "CRITICAL").sum(),
            )

        location_users = (
            df.groupby("user_location")
            .agg(**location_agg)
            .reset_index()
            .rename(columns={"user_location": "location"})
        )
        if not has_risk_level:
            location_users["critical_count"] = 0

        # Ignore blank locations and locations with fewer than 3 tweets.
        location_users = location_users[location_users["location"] != ""]
        location_users = location_users[location_users["tweet_count"] >= 3]
        location_users["user_count"] = location_users["users"].apply(len)

        if not location_users.empty:
            fig_clusters = px.scatter(
                location_users,
                x="tweet_count",
                y="user_count",
                size="critical_count",
                hover_data=["location"],
                title="Location Clusters (Activity vs Users)",
                labels={
                    "tweet_count": "Total Tweets",
                    "user_count": "Unique Users",
                    "critical_count": "Critical Tweets"
                }
            )

            st.plotly_chart(fig_clusters, use_container_width=True)

            # Locations with the most distinct users (top 10).
            high_density = location_users.sort_values("user_count", ascending=False).head(10)

            st.write("### 🏙️ High-Density Locations")
            for _, loc in high_density.iterrows():
                with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"):
                    st.write(f"**Critical tweets:** {loc['critical_count']}")
                    # f-string formatting also copes with non-string usernames.
                    st.write(f"**Users:** {', '.join(f'@{u}' for u in loc['users'][:10])}")
                    if len(loc['users']) > 10:
                        st.write(f"... and {len(loc['users']) - 10} more")
    
    # Co-occurrence analysis: how often each drug keyword appears in the same
    # tweet as each crime keyword.
    st.subheader("🔗 Keyword Co-occurrence")

    if "content" in df.columns:
        # Matching is by substring, so overlapping keywords (e.g. "drug" and
        # "drugs", "deal" and "dealer") can both match the same text.
        drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"]
        crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"]

        # Drop missing content and coerce to str first: calling .lower() on a
        # NaN (float) value would raise AttributeError.
        lowered_contents = df["content"].dropna().astype(str).str.lower()

        cooccurrence = []
        for content_lower in lowered_contents:
            found_drug = [kw for kw in drug_keywords if kw in content_lower]
            found_crime = [kw for kw in crime_keywords if kw in content_lower]

            # One record per (drug keyword, crime keyword) pair in this tweet.
            cooccurrence.extend(
                {"drug_keyword": drug, "crime_keyword": crime}
                for drug in found_drug
                for crime in found_crime
            )

        if cooccurrence:
            cooc_df = pd.DataFrame(cooccurrence)
            cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count")
            cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20)

            if not cooc_counts.empty:
                fig_cooc = px.bar(
                    cooc_counts,
                    x="count",
                    y="drug_keyword",
                    color="crime_keyword",
                    title="Drug-Crime Keyword Co-occurrence",
                    orientation="h"
                )
                st.plotly_chart(fig_cooc, use_container_width=True)
        else:
            st.info("No significant keyword co-occurrences found")
    
    # Temporal clustering: accounts that post repeatedly between midnight and 6 AM.
    st.subheader("⏰ Temporal Activity Clusters")

    if {"datetime", "username"}.issubset(df.columns):
        # Hours 0-5 inclusive count as the late-night window.
        tweet_hour = df["datetime"].dt.hour
        night_activity = df[tweet_hour.isin(list(range(6)))]

        if not night_activity.empty:
            per_user_night = night_activity.groupby("username").size().reset_index(name="night_tweets")
            # Keep only users with at least 3 late-night tweets, busiest first.
            frequent_night = per_user_night[per_user_night["night_tweets"] >= 3]
            night_users = frequent_night.sort_values("night_tweets", ascending=False)

            if not night_users.empty:
                st.write("### 🌙 Users Active During Late Night (12 AM - 6 AM)")

                fig_night = px.bar(
                    night_users.head(15),
                    x="night_tweets",
                    y="username",
                    orientation="h",
                    title="Top Users with Late Night Activity",
                )
                fig_night.update_layout(yaxis={"autorange": "reversed"})
                st.plotly_chart(fig_night, use_container_width=True)

                st.info("⚠️ Late night activity may indicate suspicious behavior patterns")
                
# ------------------------
# GEOGRAPHIC ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Geographic Analysis":
    st.header("Geographic Analysis")
    
    # Location distribution
    locations = df["user_location"].value_counts().head(20)
    locations = locations[locations.index != ""]  # Remove empty locations
    
    if not locations.empty:
        fig_locations = px.bar(x=locations.values, y=locations.index,
                              orientation='h', title="Top 20 User Locations")
        fig_locations.update_layout(yaxis=dict(autorange="reversed"))
        st.plotly_chart(fig_locations, use_container_width=True)
    else:
        st.info("No location data available")
    
    # Karnataka relevance score distribution
    if "kar_score" in df.columns:
        fig_kar = px.histogram(df, x="kar_score", title="Karnataka Relevance Score Distribution")
        st.plotly_chart(fig_kar, use_container_width=True)
    
    # Location-based risk analysis: weighted risk per location
    # (1 per HIGH tweet, 2 per CRITICAL tweet).
    if "risk_level" in df.columns and "user_location" in df.columns:
        # Exclude blank locations, as the other location sections do, so the
        # chart never shows an unlabeled bar.
        located = df[df["user_location"] != ""]

        # After aggregation the "risk_level" column holds the weighted score
        # and "username" holds the number of non-null usernames per location.
        location_risk = located.groupby("user_location").agg({
            "risk_level": lambda x: (x == "HIGH").sum() + (x == "CRITICAL").sum() * 2,
            "username": "count"
        }).reset_index()
        location_risk = location_risk[location_risk["username"] >= 3]  # Only locations with 3+ tweets
        location_risk = location_risk.sort_values("risk_level", ascending=False).head(15)

        if not location_risk.empty:
            fig_loc_risk = px.bar(location_risk, x="risk_level", y="user_location",
                                 orientation='h', title="High-Risk Locations (3+ tweets)")
            fig_loc_risk.update_layout(yaxis=dict(autorange="reversed"))
            st.plotly_chart(fig_loc_risk, use_container_width=True)

# ------------------------
# USER ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "User Analysis":
    st.header("User Analysis")
    
    # User metrics
    col1, col2, col3 = st.columns(3)
    
    with col1:
        st.metric("Unique Users", df["username"].nunique())
    with col2:
        verified_count = safe_column_sum(df, "user_verified")
        st.metric("Verified Users", verified_count)
    with col3:
        avg_followers = safe_column_mean(df, "user_followers")
        st.metric("Avg Followers", f"{avg_followers:,.0f}")
    
    # Top 15 rows by follower count; bars are colored by verification status
    # when that column is available.
    if "user_followers" in df.columns:
        has_verified = "user_verified" in df.columns

        # Pick the display columns first so nlargest() runs only once.
        display_columns = ["username", "user_followers"]
        if has_verified:
            display_columns.append("user_verified")
        top_followers = df.nlargest(15, "user_followers")[display_columns]

        fig_followers = px.bar(top_followers, x="user_followers", y="username",
                              color="user_verified" if has_verified else None,
                              orientation='h', title="Users with Most Followers")
        fig_followers.update_layout(yaxis=dict(autorange="reversed"))
        st.plotly_chart(fig_followers, use_container_width=True)
    
    # User engagement vs risk: one point per user with 3+ tweets.
    if "risk_level" in df.columns:
        metric_columns = ["username", "risk_score", "avg_likes", "avg_retweets", "tweet_count"]

        # A single groupby pass replaces filtering the whole frame once per
        # user. sort=False keeps users in order of first appearance.
        user_metrics = []
        for username, user_data in df.groupby("username", sort=False):
            levels = user_data["risk_level"]
            user_metrics.append({
                "username": username,
                # Weighted score: 1 per HIGH tweet, 2 per CRITICAL tweet.
                "risk_score": (levels == "HIGH").sum() + (levels == "CRITICAL").sum() * 2,
                "avg_likes": safe_column_mean(user_data, "like_count"),
                "avg_retweets": safe_column_mean(user_data, "retweet_count"),
                "tweet_count": len(user_data)
            })

        # Explicit columns keep the frame well-formed when there are no users;
        # otherwise the "tweet_count" lookup below raises KeyError.
        user_risk_df = pd.DataFrame(user_metrics, columns=metric_columns)
        multi_tweet_users = user_risk_df[user_risk_df["tweet_count"] >= 3]

        if not multi_tweet_users.empty:
            fig_user_risk = px.scatter(multi_tweet_users, x="avg_likes", y="risk_score",
                                      size="tweet_count", hover_data=["username"],
                                      title="User Risk vs Engagement (3+ tweets)")
            st.plotly_chart(fig_user_risk, use_container_width=True)

# ------------------------
# CONTENT ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Content Analysis":
    st.header("Content Analysis")
    
    # Hashtag analysis
    if "hashtags" in df.columns:
        all_hashtags = df["hashtags"].dropna().str.split().explode()
        hashtag_counts = all_hashtags.value_counts().head(20)
        
        if not hashtag_counts.empty:
            fig_hashtags = px.bar(x=hashtag_counts.values, y=hashtag_counts.index,
                                 orientation='h', title="Top 20 Hashtags")
            fig_hashtags.update_layout(yaxis=dict(autorange="reversed"))
            st.plotly_chart(fig_hashtags, use_container_width=True)
    
    # Two side-by-side panels: sentiment per risk level, and drug vs crime score.
    sentiment_panel, scores_panel = st.columns(2)

    with sentiment_panel:
        if not {"sentiment_compound", "risk_level"}.issubset(df.columns):
            st.info("Sentiment analysis data not available")
        else:
            fig_sentiment_risk = px.box(
                df,
                x="risk_level",
                y="sentiment_compound",
                title="Sentiment by Risk Level",
            )
            st.plotly_chart(fig_sentiment_risk, use_container_width=True)

    with scores_panel:
        if not {"drug_score", "crime_score"}.issubset(df.columns):
            st.info("Score correlation data not available")
        else:
            # Points are colored by risk level only when that column exists.
            point_color = "risk_level" if "risk_level" in df.columns else None
            risk_palette = {
                "CRITICAL": "#dc3545",
                "HIGH": "#fd7e14",
                "MEDIUM": "#ffc107",
                "LOW": "#28a745",
            }
            fig_scores_corr = px.scatter(
                df,
                x="drug_score",
                y="crime_score",
                color=point_color,
                title="Drug Score vs Crime Score",
                color_discrete_map=risk_palette,
            )
            st.plotly_chart(fig_scores_corr, use_container_width=True)
    
    # Tweet length distribution, split by risk level when available.
    if "content" in df.columns:
        # assign() returns a new frame, so df itself gains no extra column.
        length_frame = df.assign(content_length=df["content"].str.len())

        if "risk_level" not in df.columns:
            fig_length = px.histogram(length_frame, x="content_length", title="Tweet Length Distribution")
        else:
            risk_palette = {
                "CRITICAL": "#dc3545",
                "HIGH": "#fd7e14",
                "MEDIUM": "#ffc107",
                "LOW": "#28a745",
            }
            fig_length = px.histogram(
                length_frame,
                x="content_length",
                color="risk_level",
                title="Tweet Length Distribution by Risk Level",
                color_discrete_map=risk_palette,
            )

        st.plotly_chart(fig_length, use_container_width=True)
    
    # Most frequent words in tweet text, after filtering by get_filtered_words.
    if "content" in df.columns:
        st.subheader("Content Word Analysis")

        filtered_words = get_filtered_words(df["content"])

        if not filtered_words:
            st.info("No content words available after filtering")
        else:
            word_freq = pd.Series(filtered_words).value_counts().head(30)
            fig_words = px.bar(
                x=word_freq.values,
                y=word_freq.index,
                orientation='h',
                title="Top 30 Most Frequent Words",
            )
            fig_words.update_layout(yaxis={"autorange": "reversed"})
            st.plotly_chart(fig_words, use_container_width=True)


# ------------------------
# Footer with Data Information & Export
# ------------------------
st.markdown("---")

# Data summary footer: a four-column row, of which the first two are filled.
col1, col2, col3, col4 = st.columns(4)

with col1:
    st.info(f"Showing {len(df)} tweets")

with col2:
    if "risk_level" not in df.columns:
        st.info("Risk Level: Not available")
    else:
        # Number of rows labelled HIGH or CRITICAL.
        is_high_risk = df["risk_level"].isin(["HIGH", "CRITICAL"])
        high_risk_count = int(is_high_risk.sum())
        st.info(f"High Risk: {high_risk_count} tweets")

# Sidebar export controls
st.sidebar.header("Data Export")

# Export the currently displayed data. The CSV download button is created
# only on the script run in which "Download Current View" was clicked.
if st.sidebar.button("Download Current View"):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    view_slug = analysis_type.lower().replace(' ', '_')
    st.sidebar.download_button(
        label="Download as CSV",
        data=df.to_csv(index=False),
        file_name=f"drug_crime_analysis_{view_slug}_{timestamp}.csv",
        mime="text/csv",
    )

# Export the summary report, offered only when report data was loaded.
if report_data and st.sidebar.button("Download Analysis Report"):
    # default=str stringifies any value json cannot serialize natively.
    report_json = json.dumps(report_data, indent=2, default=str)
    report_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    st.sidebar.download_button(
        label="Download Report (JSON)",
        data=report_json,
        file_name=f"analysis_report_{report_stamp}.json",
        mime="application/json",
    )

# Quick stats in sidebar (skipped entirely when there are no rows)
if len(df) > 0:
    st.sidebar.subheader("Quick Stats")

    # Count and share of each risk level
    if "risk_level" in df.columns:
        risk_counts = df["risk_level"].value_counts()
        for risk, count in risk_counts.items():
            percentage = (count / len(df)) * 100
            st.sidebar.text(f"{risk}: {count} ({percentage:.1f}%)")

    # Top location: drop blank locations first, so a blank most-common value
    # does not hide the most common real location.
    if "user_location" in df.columns:
        location_counts = df["user_location"].value_counts()
        location_counts = location_counts[location_counts.index != ""]
        if not location_counts.empty:
            st.sidebar.text(f"Top Location: {location_counts.index[0]} ({location_counts.iloc[0]})")

    # Date range covered by the data
    if "datetime" in df.columns and not df["datetime"].isna().all():
        # Best effort: the span is omitted when the column cannot be
        # subtracted as datetimes. Only those failures are caught; a bare
        # `except` would also trap KeyboardInterrupt and SystemExit.
        try:
            days_span = (df["datetime"].max() - df["datetime"].min()).days
        except (TypeError, ValueError, AttributeError):
            days_span = None
        if days_span is not None:
            st.sidebar.text(f"Data Span: {days_span} days")

# Collapsible sidebar panel with diagnostics about the loaded data
with st.sidebar.expander("Debug Info"):
    st.write("Available columns:")
    st.write(list(df.columns))
    st.write(f"DataFrame shape: {df.shape}")

    # Deep memory usage, converted from bytes to megabytes.
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    st.write(f"Memory usage: {memory_mb:.1f} MB")

    st.write("Report data available: Yes" if report_data else "Report data available: No")

    if high_priority_df is None:
        st.write("High priority tweets: Not available")
    else:
        st.write(f"High priority tweets: {len(high_priority_df)}")

    if contact_df is None:
        st.write("Contact info tweets: Not available")
    else:
        st.write(f"Contact info tweets: {len(contact_df)}")

# Page footer with the time this script run rendered the dashboard
st.markdown("---")
last_updated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.markdown(
    f"""
    <div style='text-align: center; color: #666; padding: 20px;'>
        <p><strong>Twitter Drug Crime Monitoring Dashboard</strong></p>
        <p><em>Dashboard last updated: {last_updated}</em></p>
    </div>
    """,
    unsafe_allow_html=True
)