# Spaces: lawlevisan / Reddit-Analysis
# Status: Sleeping
# File size: 35,104 Bytes

import html
import os
import re
from datetime import datetime, timedelta

import folium
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from folium.plugins import HeatMap, MarkerCluster
from streamlit_folium import st_folium
from textblob import TextBlob
from wordcloud import WordCloud

# ------------------------
# Config
# ------------------------
# Page setup: this is the first Streamlit call the script makes.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="Reddit based Drug Crime Intelligence Dashboard",
)

# Locations of the CSV inputs, relative to the working directory
POSTS_FILE = "data/processed/reddit_posts_filtered.csv"
COMMENTS_FILE = "data/processed/reddit_comments_filtered.csv"
WARD_COORDS_FILE = "data/bangalore_wards_coordinates.csv"
DISTRICT_COORDS_FILE = "data/karnataka_districts_coordinates.csv"

# Keyword groups used by the severity / substance / threat-score helpers
DRUG_KEYWORDS = dict(
    high_risk=[
        'dealing', 'dealer', 'supply', 'trafficking', 'smuggling',
        'cartel', 'seized', 'arrest', 'raid',
    ],
    substance=[
        'cocaine', 'heroin', 'mdma', 'meth', 'cannabis',
        'marijuana', 'ganja', 'weed', 'lsd', 'ecstasy',
    ],
    activity=[
        'selling', 'buying', 'distribution', 'possession',
        'consumption', 'overdose', 'addiction',
    ],
)

# ------------------------
# Enhanced Data Loading
# ------------------------
def _load_csv(path, *, key, title, unit, missing_label,
              dedupe_col=None, name_col=None, **read_kwargs):
    """Read one CSV and report the outcome in the sidebar.

    Args:
        path: CSV file to read.
        key: Lower-case dataset name used in the error message.
        title: Capitalised dataset name used in the success message.
        unit: Noun printed after the record count.
        missing_label: Description used in the "file not found" warning.
        dedupe_col: Column to de-duplicate on; skipped when the column is absent.
        name_col: Canonical name column; a plain ``name`` column is renamed
            to it when the canonical column is absent.
        **read_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        ``(frame, ok)`` where ``frame`` is an empty DataFrame and ``ok`` is
        False if the file could not be loaded.
    """
    try:
        frame = pd.read_csv(path, **read_kwargs)
        # Guard on column presence: an unguarded drop_duplicates would raise
        # KeyError and the broad handler below would throw the dataset away.
        if dedupe_col is not None and dedupe_col in frame.columns:
            frame = frame.drop_duplicates(subset=[dedupe_col], keep='first')
        if name_col is not None and name_col not in frame.columns and 'name' in frame.columns:
            frame = frame.rename(columns={'name': name_col})
    except FileNotFoundError:
        st.sidebar.warning(f"⚠️ {missing_label} file not found")
        return pd.DataFrame(), False
    except Exception as e:
        # Deliberate best-effort boundary: a broken input must not take the
        # whole dashboard down, so report it and continue without the data.
        st.sidebar.error(f"❌ Error loading {key}: {str(e)}")
        return pd.DataFrame(), False

    st.sidebar.success(f"✅ {title} loaded: {len(frame)} {unit}")
    return frame, True


@st.cache_data
def load_data(posts_file, comments_file, ward_file, district_file):
    """Load all data files with comprehensive error handling.

    Args:
        posts_file: CSV of Reddit posts (read with every column as ``str``).
        comments_file: CSV of Reddit comments.
        ward_file: CSV of ward coordinates.
        district_file: CSV of district coordinates.

    Returns:
        ``(posts, comments, wards, districts, data_status)``. Each frame is
        empty when its file could not be loaded; ``data_status`` maps
        ``"posts"``, ``"comments"``, ``"wards"`` and ``"districts"`` to a bool
        saying whether the load succeeded.
    """
    data_status = {"posts": False, "comments": False, "wards": False, "districts": False}

    posts, data_status["posts"] = _load_csv(
        posts_file, key="posts", title="Posts", unit="records",
        missing_label="Reddit posts", dedupe_col='id', dtype=str,
    )
    comments, data_status["comments"] = _load_csv(
        comments_file, key="comments", title="Comments", unit="records",
        missing_label="Reddit comments", dedupe_col='id',
    )
    wards, data_status["wards"] = _load_csv(
        ward_file, key="wards", title="Wards", unit="wards",
        missing_label="Ward coordinates", name_col='ward_name',
    )
    districts, data_status["districts"] = _load_csv(
        district_file, key="districts", title="Districts", unit="districts",
        missing_label="District coordinates", name_col='district_name',
    )

    return posts, comments, wards, districts, data_status

# ------------------------
# Crime Analysis Functions
# ------------------------
def classify_crime_severity(text):
    """Map free text to a severity label using weighted keyword hits.

    Each distinct keyword found adds its group weight (high_risk 3,
    substance 2, activity 1). Matching is a plain case-insensitive substring
    test, so a keyword also matches inside a longer word.

    Returns:
        'Critical' (total >= 5), 'High' (>= 3), 'Medium' (>= 1) or 'Low'.
    """
    haystack = str(text).lower()
    group_weights = (('high_risk', 3), ('substance', 2), ('activity', 1))

    total = sum(
        weight
        for group, weight in group_weights
        for term in DRUG_KEYWORDS[group]
        if term in haystack
    )

    # Thresholds are checked from the most to the least severe.
    for floor, label in ((5, 'Critical'), (3, 'High'), (1, 'Medium')):
        if total >= floor:
            return label
    return 'Low'

def extract_drug_mentions(text):
    """Return the substances named in *text* as a comma-separated string.

    Substances are reported capitalised, in the order they appear in
    ``DRUG_KEYWORDS['substance']``; 'Unspecified' is returned when none match.
    Matching is a case-insensitive substring test.
    """
    lowered = str(text).lower()
    hits = [name.capitalize() for name in DRUG_KEYWORDS['substance'] if name in lowered]
    if not hits:
        return 'Unspecified'
    return ', '.join(hits)

def _threat_int(raw):
    """Coerce a raw engagement value to int; missing or non-numeric input gives 0."""
    try:
        number = float(raw)
    except (TypeError, ValueError):
        return 0
    # NaN is the only float unequal to itself; infinities cannot be truncated.
    if number != number or number in (float('inf'), float('-inf')):
        return 0
    return int(number)


def _threat_text(raw):
    """Return *raw* as text, mapping None/NaN to '' instead of 'None'/'nan'."""
    if raw is None or (isinstance(raw, float) and raw != raw):
        return ''
    return str(raw)


def calculate_threat_score(row):
    """Calculate a threat score for one post.

    Args:
        row: Mapping-like record (a DataFrame row or dict). Reads ``text`` and
            ``title``, plus ``score`` and ``num_comments`` when present.

    Returns:
        A number capped at 100, built from: 10 per high-risk keyword found,
        up to 5 from ``score / 10``, up to 5 from ``num_comments / 5``, and 5
        when the TextBlob polarity of the text is below -0.2.
    """
    score = 0
    text = _threat_text(row.get('text', '')) + ' ' + _threat_text(row.get('title', ''))
    text_lower = text.lower()

    for keyword in DRUG_KEYWORDS['high_risk']:
        if keyword in text_lower:
            score += 10

    # Engagement fields may be strings, floats or NaN (posts are read with
    # dtype=str), so a bare int() would raise on missing or "12.0"-style data.
    if 'score' in row:
        score += min(_threat_int(row.get('score', 0)) / 10, 5)

    if 'num_comments' in row:
        score += min(_threat_int(row.get('num_comments', 0)) / 5, 5)

    sentiment = TextBlob(text).sentiment.polarity
    if sentiment < -0.2:
        score += 5

    return min(score, 100)

# ------------------------
# Load All Data
# ------------------------
# Load the four datasets in one (cached) call, then unpack the result.
_loaded = load_data(POSTS_FILE, COMMENTS_FILE, WARD_COORDS_FILE, DISTRICT_COORDS_FILE)
posts_df, comments_df, wards_df, districts_df, data_status = _loaded

# ------------------------
# Data Processing
# ------------------------
def process_datetime(df, datetime_col='created_utc'):
    """Add ``datetime``, ``date``, ``hour`` and ``day_of_week`` columns to *df*.

    Args:
        df: DataFrame to annotate; it is modified in place and also returned.
        datetime_col: Name of the source timestamp column.

    Returns:
        *df*, unchanged when ``datetime_col`` is absent. Values that cannot
        be parsed become NaT (and NaN/NaT in the derived columns).
    """
    if datetime_col not in df.columns:
        return df

    raw = df[datetime_col]
    if pd.api.types.is_datetime64_any_dtype(raw):
        parsed = raw
    else:
        numeric = pd.to_numeric(raw, errors='coerce')
        present = raw.notna()
        if present.any() and numeric[present].notna().all():
            # Every non-null value is a number: treat it as epoch seconds.
            # Parsing such values as date strings would give NaT or 1970 dates.
            # NOTE(review): assumes numeric timestamps are in seconds — confirm.
            parsed = pd.to_datetime(numeric, unit='s', errors='coerce')
        else:
            parsed = pd.to_datetime(raw, errors='coerce')

    df["datetime"] = parsed
    df["date"] = df["datetime"].dt.date
    df["hour"] = df["datetime"].dt.hour
    df["day_of_week"] = df["datetime"].dt.day_name()
    return df

# Normalize coordinate names
if not wards_df.empty and "ward_name" in wards_df.columns:
    wards_df["ward_name"] = wards_df["ward_name"].astype(str).str.strip().str.lower()

if not districts_df.empty and "district_name" in districts_df.columns:
    districts_df["district_name"] = districts_df["district_name"].astype(str).str.strip().str.lower()

# District mapping: alternate spelling -> canonical district name
district_mapping = {
    "bangalore": "bengaluru",
    "blr": "bengaluru",
    "mysore": "mysuru",
}


def _build_location_pattern(names):
    """Return a word-bounded alternation regex for *names*, or None when empty.

    Names are lower-cased and stripped; blank entries and stringified nulls
    are dropped because an empty alternative would match at every position.
    Longer names come first so that a short name cannot shadow a longer name
    that starts with it.
    """
    cleaned = {str(name).strip().lower() for name in names}
    cleaned -= {"", "nan", "none"}
    if not cleaned:
        return None
    ordered = sorted(cleaned, key=lambda name: (-len(name), name))
    return r'\b(' + '|'.join(re.escape(name) for name in ordered) + r')\b'


# Create patterns
ward_pattern = None
district_pattern = None

# The column checks avoid a KeyError when a coordinates file loaded but
# lacks the expected name column.
if not wards_df.empty and "ward_name" in wards_df.columns:
    ward_pattern = _build_location_pattern(wards_df["ward_name"])

if not districts_df.empty and "district_name" in districts_df.columns:
    district_names = set(districts_df["district_name"].astype(str).str.strip().str.lower())
    # Also match alias spellings whose canonical district is known, so that
    # district_mapping has something to translate.
    district_names.update(
        alias for alias, canonical in district_mapping.items()
        if canonical in district_names
    )
    district_pattern = _build_location_pattern(district_names)

def extract_locations(text_series, patterns):
    """Extract locations from text using regex patterns.

    Args:
        text_series: Series of text; nulls are treated as empty strings.
        patterns: Iterable of regex pattern strings, each with one capture
            group. They are applied to the lower-cased text.

    Returns:
        A Series aligned to ``text_series.index`` whose values are the
        distinct matches joined by ", " in order of first appearance, or ""
        when nothing matched.
    """
    compiled = [re.compile(pattern) for pattern in patterns]  # compile once, not per row
    locations = []
    for text in text_series.fillna(""):
        lowered = str(text).lower()
        # A dict de-duplicates while keeping first-seen order; a set would make
        # the joined string vary between runs.
        seen = {}
        for regex in compiled:
            for match in regex.findall(lowered):
                seen[match] = None
        locations.append(", ".join(seen))
    return pd.Series(locations, index=text_series.index)

def _text_column(frame, name):
    """Return column *name* as strings with nulls blanked; all-empty if absent."""
    if name in frame.columns:
        return frame[name].fillna("").astype(str)
    return pd.Series("", index=frame.index)


def _canonical_districts(cell):
    """Map each name in a ", "-joined cell through district_mapping, de-duplicated."""
    names = [district_mapping.get(part, part) for part in str(cell).split(", ") if part]
    return ", ".join(dict.fromkeys(names))


# Process posts
if not posts_df.empty:
    posts_df = process_datetime(posts_df)

    # Blank nulls BEFORE concatenating: str + NaN is NaN, which would discard
    # the title of every post that has no body text.
    post_text = _text_column(posts_df, "title") + " " + _text_column(posts_df, "text")

    if ward_pattern:
        posts_df["ward_location"] = extract_locations(post_text, [ward_pattern])
    else:
        posts_df["ward_location"] = ""

    if district_pattern:
        posts_df["district_location"] = extract_locations(post_text, [district_pattern])
    else:
        posts_df["district_location"] = ""

    # Canonicalise name by name; a whole-cell replace would miss cells that
    # hold several comma-joined districts.
    posts_df["district_location"] = posts_df["district_location"].apply(_canonical_districts)

    posts_df["severity"] = post_text.apply(classify_crime_severity)
    posts_df["drugs_mentioned"] = post_text.apply(extract_drug_mentions)
    posts_df["threat_score"] = posts_df.apply(calculate_threat_score, axis=1)

    posts_df["sentiment_score"] = post_text.apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    posts_df["sentiment"] = posts_df["sentiment_score"].apply(
        lambda x: "Positive" if x > 0 else ("Negative" if x < 0 else "Neutral")
    )

# Process comments
if not comments_df.empty:
    comments_df = process_datetime(comments_df)

# ------------------------
# Dashboard Header
# ------------------------
st.title("🚨 Reddit based Drug Crime Intelligence Dashboard")
st.markdown("**Real-time intelligence analysis of drug-related criminal activities from Reddit social media monitoring**")

# ------------------------
# Sidebar Filters
# ------------------------
st.sidebar.title("🔧 Intelligence Controls")

# Clearing the data cache and rerunning makes the script read the CSVs again.
refresh_requested = st.sidebar.button("🔄 Refresh Data")
if refresh_requested:
    st.cache_data.clear()
    st.rerun()

# Severity filter
if not posts_df.empty and "severity" in posts_df.columns:
    severity_filter = st.sidebar.multiselect(
        "⚠️ Crime Severity Level",
        options=['Critical', 'High', 'Medium', 'Low'],
        default=['Critical', 'High']
    )
    if severity_filter:
        posts_df = posts_df[posts_df["severity"].isin(severity_filter)]

# Date range filter (skipped when no post has a usable timestamp, because the
# widget needs real min/max dates)
if (
    not posts_df.empty
    and "datetime" in posts_df.columns
    and posts_df["datetime"].notna().any()
):
    min_date = posts_df["datetime"].min().date()
    max_date = posts_df["datetime"].max().date()

    date_range = st.sidebar.date_input(
        "📅 Select Date Range",
        value=(min_date, max_date),
        min_value=min_date,
        max_value=max_date
    )

    # The widget yields a single date while the user is still choosing the
    # second one; filter only once both ends are known.
    if isinstance(date_range, (tuple, list)) and len(date_range) == 2:
        start_date, end_date = date_range
        # Compare only rows that have a date, so null entries are excluded
        # rather than being ordered against date objects.
        dated_posts = posts_df[posts_df["date"].notna()]
        posts_df = dated_posts[
            (dated_posts["date"] >= start_date) &
            (dated_posts["date"] <= end_date)
        ]

# Subreddit filter
if not posts_df.empty and "subreddit" in posts_df.columns:
    subreddits = st.sidebar.multiselect(
        "📱 Filter by Subreddits",
        options=posts_df["subreddit"].dropna().unique(),
        default=posts_df["subreddit"].value_counts().head(5).index.tolist()
    )
    if subreddits:
        posts_df = posts_df[posts_df["subreddit"].isin(subreddits)]

# Keyword search
search_keyword = st.sidebar.text_input("🔍 Search Keywords in Content")
if search_keyword and not posts_df.empty:
    keyword_hits = pd.Series(False, index=posts_df.index)
    for text_column in ("text", "title"):
        if text_column in posts_df.columns:
            # regex=False: the user types a literal phrase, and characters
            # such as "(" or "*" would otherwise raise or match wrongly.
            keyword_hits |= posts_df[text_column].str.contains(
                search_keyword, case=False, na=False, regex=False
            )
    posts_df = posts_df[keyword_hits]

# ------------------------
# Main Dashboard Content
# ------------------------

# Stop rendering when neither dataset has any rows.
if posts_df.empty and comments_df.empty:
    st.error("🚫 No intelligence data available. Please ensure data collection is operational.")
    st.stop()

# --- Crime Intelligence Metrics
st.subheader("📊 Crime Intelligence Overview")
col1, col2, col3, col4 = st.columns(4)

with col1:
    critical_posts = len(posts_df[posts_df["severity"] == "Critical"]) if "severity" in posts_df.columns else 0
    st.metric(
        label="Critical Threats",
        value=critical_posts,
        delta=f"{(critical_posts/len(posts_df)*100):.1f}%" if len(posts_df) > 0 else "0%"
    )

with col2:
    avg_threat = posts_df["threat_score"].mean() if "threat_score" in posts_df.columns else 0
    if pd.isna(avg_threat):
        # The mean of an empty selection is NaN; show 0.0 instead of "nan".
        avg_threat = 0
    st.metric(
        label="Avg Threat Score",
        value=f"{avg_threat:.1f}",
        delta="High" if avg_threat > 50 else "Moderate"
    )

with col3:
    if "ward_location" in posts_df.columns:
        ward_exploded_temp = posts_df[posts_df["ward_location"] != ""].copy()
        ward_exploded_temp["ward_location"] = ward_exploded_temp["ward_location"].str.split(", ")
        ward_exploded_temp = ward_exploded_temp.explode("ward_location")
        unique_locations = ward_exploded_temp["ward_location"].nunique()
        st.metric(
            label="Active Locations",
            value=unique_locations
        )

with col4:
    if "drugs_mentioned" in posts_df.columns:
        drug_tokens = posts_df["drugs_mentioned"].str.split(", ").explode()
        # "Unspecified" is the placeholder for "no substance found", not a drug.
        drug_types = drug_tokens[drug_tokens != "Unspecified"].nunique()
    else:
        drug_types = 0
    st.metric(
        label="Drug Types Identified",
        value=drug_types
    )

st.markdown("---")

# --- Crime Severity Distribution
if "severity" in posts_df.columns:
    st.subheader("⚠️ Crime Severity Analysis")

    pie_col, hist_col = st.columns(2)

    # Left: share of posts per severity label.
    with pie_col:
        severity_counts = posts_df["severity"].value_counts()
        severity_palette = {
            'Critical': '#FF0000',
            'High': '#FF6B00',
            'Medium': '#FFD700',
            'Low': '#90EE90'
        }
        fig_severity = px.pie(
            values=severity_counts.values,
            names=severity_counts.index,
            title="Crime Severity Distribution",
            color=severity_counts.index,
            color_discrete_map=severity_palette
        )
        st.plotly_chart(fig_severity, use_container_width=True)

    # Right: threat-score histogram with a marker line at 50.
    with hist_col:
        threat_labels = {"threat_score": "Threat Score", "count": "Number of Posts"}
        fig_threat = px.histogram(
            posts_df,
            x="threat_score",
            nbins=20,
            title="Threat Score Distribution",
            labels=threat_labels
        )
        fig_threat.add_vline(
            x=50,
            line_dash="dash",
            line_color="red",
            annotation_text="High Threat Threshold"
        )
        st.plotly_chart(fig_threat, use_container_width=True)

st.markdown("---")

# --- Drug Type Analysis
if "drugs_mentioned" in posts_df.columns:
    st.subheader("💊 Substance Intelligence")

    # One entry per substance mention, without the "Unspecified" placeholder.
    all_drugs = posts_df["drugs_mentioned"].str.split(", ").explode()
    named_drugs = all_drugs[all_drugs != "Unspecified"]
    drug_counts = named_drugs.value_counts().head(10)

    if len(drug_counts) > 0:
        fig_drugs = px.bar(
            x=drug_counts.values,
            y=drug_counts.index,
            orientation='h',
            title="Top 10 Substances Mentioned",
            labels={"x": "Mentions", "y": "Substance"},
            color=drug_counts.values,
            color_continuous_scale="Reds"
        )
        st.plotly_chart(fig_drugs, use_container_width=True)

st.markdown("---")

# --- Timeline Analysis
if "date" in posts_df.columns:
    st.subheader("📈 Crime Activity Timeline")

    trend_col, heat_col = st.columns(2)

    # Left: posts per day, one line per severity.
    with trend_col:
        per_day = posts_df.groupby(["date", "severity"]).size()
        daily_data = per_day.reset_index(name="count")
        fig_daily = px.line(
            daily_data,
            x="date",
            y="count",
            color="severity",
            title="Daily Crime Activity by Severity",
            labels={"count": "Number of Incidents", "date": "Date"},
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90'
            }
        )
        st.plotly_chart(fig_daily, use_container_width=True)

    # Right: weekday x hour heatmap of post counts.
    with heat_col:
        has_time_parts = {"hour", "day_of_week"}.issubset(posts_df.columns)
        if has_time_parts:
            per_slot = posts_df.groupby(["day_of_week", "hour"]).size()
            hourly_activity = per_slot.reset_index(name="count")
            fig_hourly = px.density_heatmap(
                hourly_activity,
                x="hour",
                y="day_of_week",
                z="count",
                title="Activity Heatmap - High-Risk Hours",
                labels={"hour": "Hour of Day", "day_of_week": "Day", "count": "Incidents"},
                color_continuous_scale="Reds"
            )
            st.plotly_chart(fig_hourly, use_container_width=True)

st.markdown("---")

# --- Geographic Intelligence - COMBINED MAP
st.subheader("🗺️ Geographic Crime Intelligence")


def _explode_locations(frame, column):
    """Return one row per (post, location) for the ", "-joined location *column*."""
    located = frame[frame[column] != ""].copy()
    located[column] = located[column].str.split(", ")
    located = located.explode(column)
    located[column] = located[column].str.strip().str.lower()
    return located


# Process both ward and district data. The name-column checks avoid a KeyError
# in the merges below when a coordinates file lacks the expected column.
ward_data_available = (
    not wards_df.empty
    and "ward_name" in wards_df.columns
    and "ward_location" in posts_df.columns
)
district_data_available = (
    not districts_df.empty
    and "district_name" in districts_df.columns
    and "district_location" in posts_df.columns
)

if ward_data_available or district_data_available:
    st.markdown("**Crime hotspot analysis across Karnataka (Wards & Districts)**")

    # Prepare ward data
    merged_wards = pd.DataFrame()
    ward_exploded = pd.DataFrame()
    if ward_data_available:
        ward_exploded = _explode_locations(posts_df, "ward_location")
        loc_counts = ward_exploded.groupby("ward_location").size().reset_index(name="count")
        merged_wards = pd.merge(loc_counts, wards_df, left_on="ward_location", right_on="ward_name", how="inner")
        merged_wards["location_type"] = "Ward"
        merged_wards["location_name"] = merged_wards["ward_name"]

    # Prepare district data
    merged_districts = pd.DataFrame()
    district_exploded = pd.DataFrame()
    if district_data_available:
        district_exploded = _explode_locations(posts_df, "district_location")
        district_counts = district_exploded.groupby("district_location").size().reset_index(name="count")
        merged_districts = pd.merge(district_counts, districts_df, left_on="district_location", right_on="district_name", how="inner")
        merged_districts["location_type"] = "District"
        merged_districts["location_name"] = merged_districts["district_name"]

    # Combine both datasets (empty frames are left out of the concat)
    location_frames = [frame for frame in (merged_wards, merged_districts) if not frame.empty]
    all_locations = pd.concat(location_frames, ignore_index=True) if location_frames else pd.DataFrame()

    has_coordinates = {"lat", "lon"}.issubset(all_locations.columns)
    if not all_locations.empty and not has_coordinates:
        st.warning("Location coordinates (lat/lon columns) are missing; the map cannot be drawn.")
    elif has_coordinates:
        # A row without coordinates cannot be plotted.
        all_locations = all_locations.dropna(subset=["lat", "lon"]).reset_index(drop=True)

    if not all_locations.empty and has_coordinates:
        # Determine center of map
        center_lat = all_locations["lat"].mean()
        center_lon = all_locations["lon"].mean()

        # Create unified map
        m_unified = folium.Map(
            location=[center_lat, center_lon],
            zoom_start=9 if ward_data_available else 7,
            tiles="OpenStreetMap"
        )

        # Add heatmap layer
        heat_data = [[row["lat"], row["lon"], row["count"]] for _, row in all_locations.iterrows()]
        HeatMap(heat_data, radius=20, blur=15, max_zoom=13, gradient={
            0.0: 'blue', 0.5: 'yellow', 0.75: 'orange', 1.0: 'red'
        }).add_to(m_unified)

        # Determine hotspot threshold (70th percentile of incident counts)
        threshold = all_locations["count"].quantile(0.70)
        all_locations["is_hotspot"] = all_locations["count"] >= threshold

        # Add markers for each location
        for _, row in all_locations.iterrows():
            # Names and titles come from data files and Reddit posts, so they
            # are escaped before being placed in popup/tooltip HTML.
            location_name = html.escape(str(row["location_name"]).title())
            location_type = row["location_type"]
            incident_count = row["count"]

            # Get location-specific crime data by exact name on the exploded
            # rows. A substring/regex search would also match names contained
            # in other names and could raise on regex metacharacters.
            if location_type == "Ward":
                loc_data = ward_exploded[ward_exploded["ward_location"] == row["location_name"]]
            else:
                loc_data = district_exploded[district_exploded["district_location"] == row["location_name"]]

            # Severity breakdown
            severity_breakdown = loc_data["severity"].value_counts().to_dict()
            severity_html = "<br>".join([f"&nbsp;&nbsp;• {sev}: {count}" for sev, count in severity_breakdown.items()])

            # Critical incidents count
            critical_count = severity_breakdown.get("Critical", 0)

            # Top drugs in this location
            loc_drugs = loc_data["drugs_mentioned"].str.split(", ").explode()
            top_drugs = loc_drugs[loc_drugs != "Unspecified"].value_counts().head(3)
            drugs_html = "<br>".join([f"&nbsp;&nbsp;• {drug}: {count}" for drug, count in top_drugs.items()])

            # Average threat score
            loc_avg_threat = loc_data["threat_score"].mean()

            # Highest-threat incidents for this location
            recent = loc_data.nlargest(3, "threat_score")
            incident_lines = []
            for _, r in recent.iterrows():
                raw_title = r.get("title")
                # A missing title is NaN (a float), which cannot be sliced.
                title_text = "" if pd.isna(raw_title) else str(raw_title)
                incident_lines.append(
                    f"&nbsp;&nbsp;• <b>[{r['severity']}]</b> {html.escape(title_text[:50])}... "
                    f"<i>(Score: {r['threat_score']:.0f})</i>"
                )
            incidents_html = "<br>".join(incident_lines)

            # Marker color based on hotspot status and incident count
            marker_color = 'darkred' if row["is_hotspot"] else ('red' if incident_count >= 5 else ('orange' if incident_count >= 3 else 'blue'))

            # Icon based on type
            icon_symbol = 'home' if location_type == "Ward" else 'map'

            # Create detailed popup
            popup_html = f"""
            <div style='width: 350px; font-family: Arial, sans-serif;'>
                <h3 style='color: {marker_color}; margin-bottom: 8px; border-bottom: 2px solid {marker_color}; padding-bottom: 5px;'>
                    {location_type}: {location_name}
                </h3>
                <div style='margin: 10px 0;'>
                    <b>📊 Total Incidents:</b> <span style='font-size: 18px; color: {marker_color};'>{incident_count}</span><br>
                    <b>🚨 Critical Threats:</b> <span style='font-size: 18px; color: darkred;'>{critical_count}</span><br>
                    <b>📈 Avg Threat Score:</b> <span style='font-size: 16px;'>{loc_avg_threat:.1f}/100</span>
                </div>
                <hr style='border: 1px solid #ddd;'>
                <div style='margin: 10px 0;'>
                    <b>⚠️ Severity Breakdown:</b><br>
                    {severity_html if severity_html else '&nbsp;&nbsp;No data'}
                </div>
                <hr style='border: 1px solid #ddd;'>
                <div style='margin: 10px 0;'>
                    <b>💊 Top Substances Detected:</b><br>
                    {drugs_html if not top_drugs.empty else '&nbsp;&nbsp;None identified'}
                </div>
                <hr style='border: 1px solid #ddd;'>
                <div style='margin: 10px 0;'>
                    <b>🎯 Recent High-Threat Incidents:</b><br>
                    {incidents_html if not recent.empty else '&nbsp;&nbsp;None'}
                </div>
                <div style='margin-top: 10px; padding: 5px; background-color: #f0f0f0; border-radius: 5px; text-align: center; font-size: 11px;'>
                    <i>Click marker for details • Hover for quick info</i>
                </div>
            </div>
            """

            # Tooltip (hover text)
            tooltip_text = f"""
            <b>{location_type}: {location_name}</b><br>
            Total Incidents: {incident_count}<br>
            Critical: {critical_count} | Avg Threat: {loc_avg_threat:.1f}
            """

            # Add marker (radius grows with incident count, capped at 25)
            folium.CircleMarker(
                location=[row["lat"], row["lon"]],
                radius=min(incident_count * 2.5 if location_type == "Ward" else incident_count * 3.5, 25),
                color=marker_color,
                fill=True,
                fill_color=marker_color,
                fill_opacity=0.7,
                weight=2,
                popup=folium.Popup(popup_html, max_width=400),
                tooltip=folium.Tooltip(tooltip_text, sticky=True)
            ).add_to(m_unified)

        # Display map
        st_folium(m_unified, width="100%", height=700)

        # Hotspot analysis table
        st.subheader("🔥 Top Crime Hotspots")

        col1 = st.columns(1)

        with col1[0]:
            st.markdown("**High-Activity Wards**")
            if not merged_wards.empty:
                ward_display = merged_wards.sort_values("count", ascending=False).head(10)
                st.dataframe(
                    ward_display[["ward_name", "count"]].rename(columns={
                        "ward_name": "Ward Name",
                        "count": "Incidents"
                    }).reset_index(drop=True),
                    use_container_width=True,
                    height=300
                )
            else:
                st.info("No ward data available")

st.markdown("---")

# --- High-Priority Intelligence Reports
st.subheader("🚨 High-Priority Intelligence Reports")

if posts_df.empty:
    st.info("No intelligence data available")
else:
    # A post is high priority if its severity is Critical/High or its threat
    # score is at least 50; the list is ordered by threat score, highest first.
    is_severe = posts_df["severity"].isin(['Critical', 'High'])
    is_threatening = posts_df["threat_score"] >= 50
    priority_posts = posts_df[is_severe | is_threatening].sort_values("threat_score", ascending=False)

    if priority_posts.empty:
        st.info("No high-priority incidents in selected date range")
    else:
        priority_posts = priority_posts.drop_duplicates(subset=['id'], keep='first')

        # Internal column name -> heading shown in the table
        column_headings = {
            "datetime": "Timestamp",
            "title": "Intelligence Report",
            "severity": "Severity",
            "threat_score": "Threat Score",
            "drugs_mentioned": "Substances",
            "ward_location": "Location",
            "subreddit": "Source"
        }
        display_cols = ["datetime", "title", "severity", "threat_score", "drugs_mentioned", "ward_location", "subreddit"]
        available_cols = [col for col in display_cols if col in priority_posts.columns]
        priority_view = priority_posts[available_cols]

        st.dataframe(
            priority_view.head(50).rename(columns=column_headings),
            use_container_width=True,
            height=400
        )

        st.download_button(
            label="📥 Download Priority Reports (CSV)",
            data=priority_view.to_csv(index=False).encode("utf-8"),
            file_name=f"priority_intelligence_{datetime.now().strftime('%Y%m%d')}.csv",
            mime="text/csv"
        )

st.markdown("---")

# --- Advanced Analytics Section
st.subheader("🔬 Advanced Crime Analytics")

col1, col2 = st.columns(2)

with col1:
    if "hour" in posts_df.columns and "severity" in posts_df.columns:
        st.markdown("**Crime Patterns by Time of Day**")
        time_severity = posts_df.groupby(["hour", "severity"]).size().reset_index(name="count")
        fig_time = px.bar(
            time_severity,
            x="hour",
            y="count",
            color="severity",
            title="Crime Activity by Hour and Severity",
            labels={"hour": "Hour of Day", "count": "Incidents"},
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90'
            }
        )
        st.plotly_chart(fig_time, use_container_width=True)

with col2:
    if "sentiment_score" in posts_df.columns and "severity" in posts_df.columns:
        st.markdown("**Sentiment vs Crime Severity**")
        fig_sentiment_severity = px.box(
            posts_df,
            x="severity",
            y="sentiment_score",
            color="severity",
            title="Sentiment Distribution by Crime Severity",
            labels={"sentiment_score": "Sentiment Score", "severity": "Crime Severity"},
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90'
            }
        )
        st.plotly_chart(fig_sentiment_severity, use_container_width=True)

st.markdown("---")

# --- Network Analysis
if "subreddit" in posts_df.columns and "drugs_mentioned" in posts_df.columns:
    st.subheader("🕸️ Source-Substance Network Analysis")

    # Split the ", "-joined substance list so each substance is counted on its
    # own, as the other substance sections do. Grouping on the joined string
    # would treat "Cocaine, Heroin" as a separate substance.
    substance_rows = posts_df[["subreddit", "drugs_mentioned"]].copy()
    substance_rows["drugs_mentioned"] = substance_rows["drugs_mentioned"].str.split(", ")
    substance_rows = substance_rows.explode("drugs_mentioned")
    substance_rows = substance_rows[substance_rows["drugs_mentioned"] != "Unspecified"]

    source_drug = substance_rows.groupby(
        ["subreddit", "drugs_mentioned"]
    ).size().reset_index(name="mentions")

    if not source_drug.empty:
        # Keep the 15 strongest (subreddit, substance) pairs.
        top_relationships = source_drug.nlargest(15, "mentions")

        fig_network = px.bar(
            top_relationships,
            x="mentions",
            y="subreddit",
            color="drugs_mentioned",
            orientation='h',
            title="Top Source-Substance Relationships",
            labels={"mentions": "Number of Mentions", "subreddit": "Source Community"},
            height=500
        )
        st.plotly_chart(fig_network, use_container_width=True)

st.markdown("---")

# --- Emerging Threats Detection
st.subheader("⚡ Emerging Threats Detection")

if "date" in posts_df.columns and "threat_score" in posts_df.columns:
    # Work only on posts that have a date. With none left (for example after
    # filtering) there is no reference day to build the weekly windows from.
    dated_posts = posts_df[posts_df["date"].notna()]

    if dated_posts.empty:
        st.info("No dated posts available for trend analysis")
    else:
        # Windows are anchored on the newest post, not on the wall clock.
        today = dated_posts["date"].max()
        last_week = today - timedelta(days=7)
        prev_week = last_week - timedelta(days=7)

        is_recent = dated_posts["date"] >= last_week
        is_previous = (dated_posts["date"] >= prev_week) & ~is_recent

        recent_threats = dated_posts.loc[is_recent, "threat_score"].mean()
        previous_threats = dated_posts.loc[is_previous, "threat_score"].mean()

        # A NaN or zero baseline (no posts in the previous week) reports 0% change.
        threat_change = ((recent_threats - previous_threats) / previous_threats * 100) if previous_threats > 0 else 0

        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric(
                "Threat Level Trend",
                f"{recent_threats:.1f}",
                f"{threat_change:+.1f}%",
                delta_color="inverse"
            )

        with col2:
            if "ward_location" in dated_posts.columns:
                recent_tokens = dated_posts.loc[is_recent, "ward_location"].fillna("").str.split(", ").explode()
                earlier_tokens = dated_posts.loc[~is_recent, "ward_location"].fillna("").str.split(", ").explode()
                # Drop the empty string (posts with no ward), which would
                # otherwise count as a location.
                recent_locs = {name for name in recent_tokens if name}
                prev_locs = {name for name in earlier_tokens if name}
                new_locations = len(recent_locs - prev_locs)
            else:
                new_locations = 0
            st.metric("New Active Locations", new_locations)

        with col3:
            daily_avg = dated_posts.groupby("date").size().mean()
            recent_avg = dated_posts[is_recent].groupby("date").size().mean()
            # Spike: recent daily volume more than 1.5x the overall daily average.
            spike = recent_avg > daily_avg * 1.5
            st.metric("Activity Status", "⚠️ SPIKE" if spike else "✅ Normal")

st.markdown("---")

# --- Intelligence Summary Report
st.subheader("📋 Executive Intelligence Summary")

summary_col1, summary_col2 = st.columns(2)

with summary_col1:
    st.markdown("**Key Findings:**")

    if not posts_df.empty:
        # Ward with the highest mean threat score
        if "ward_location" in posts_df.columns and "threat_score" in posts_df.columns:
            ward_posts_with_location = posts_df[posts_df["ward_location"] != ""]
            if not ward_posts_with_location.empty:
                ward_exploded_threat = ward_posts_with_location.assign(
                    ward_location=ward_posts_with_location["ward_location"].str.split(", ")
                ).explode("ward_location").reset_index(drop=True)

                ward_threat = ward_exploded_threat.groupby("ward_location")["threat_score"].mean().sort_values(ascending=False)

                if not ward_threat.empty:
                    st.markdown(f"🎯 **Highest Threat Zone:** {ward_threat.index[0].title()} (Score: {ward_threat.iloc[0]:.1f})")

        # Most-mentioned substance. The "Unspecified" placeholder is removed
        # first; otherwise the finding disappears whenever it is the most
        # common value.
        if "drugs_mentioned" in posts_df.columns:
            drug_tokens = posts_df["drugs_mentioned"].str.split(", ").explode()
            top_drug = drug_tokens[drug_tokens != "Unspecified"].value_counts()
            if not top_drug.empty:
                st.markdown(f"💊 **Primary Substance:** {top_drug.index[0]} ({top_drug.iloc[0]} mentions)")

        # Busiest hour. The column is float when any timestamp is missing, so
        # cast to int, and wrap 23 -> 0 for the end of the window.
        if "hour" in posts_df.columns:
            hour_modes = posts_df["hour"].dropna().mode()
            if not hour_modes.empty:
                peak_hour = int(hour_modes.iloc[0])
                st.markdown(f"🕐 **Peak Activity Time:** {peak_hour}:00 - {(peak_hour + 1) % 24}:00")

        if "subreddit" in posts_df.columns:
            source_counts = posts_df["subreddit"].value_counts()
            if not source_counts.empty:
                top_source = source_counts.index[0]
                st.markdown(f"📱 **Primary Intelligence Source:** r/{top_source}")

with summary_col2:
    st.markdown("**Risk Assessment:**")

    if not posts_df.empty and "severity" in posts_df.columns:
        critical_pct = (len(posts_df[posts_df["severity"] == "Critical"]) / len(posts_df) * 100)

        # Risk band from the share of Critical posts
        if critical_pct > 30:
            risk_level = "🔴 CRITICAL"
            risk_desc = "Immediate action required"
        elif critical_pct > 15:
            risk_level = "🟠 HIGH"
            risk_desc = "Enhanced monitoring recommended"
        elif critical_pct > 5:
            risk_level = "🟡 MODERATE"
            risk_desc = "Standard surveillance protocols"
        else:
            risk_level = "🟢 LOW"
            risk_desc = "Routine monitoring sufficient"

        st.markdown(f"**Overall Risk Level:** {risk_level}")
        st.markdown(f"*{risk_desc}*")
        st.markdown(f"- Critical incidents: {critical_pct:.1f}%")
        st.markdown(f"- Total monitored incidents: {len(posts_df)}")
        # The date column only exists when the posts file has a timestamp column.
        if "date" in posts_df.columns:
            known_dates = posts_df["date"].dropna()
            if not known_dates.empty:
                st.markdown(f"- Date range: {known_dates.min()} to {known_dates.max()}")

st.markdown("---")

# --- Export Options
st.subheader("📤 Export Intelligence Reports")


def _dated_csv_name(prefix):
    """Return '<prefix>_<YYYYMMDD>.csv' for the current local date."""
    return f"{prefix}_{datetime.now().strftime('%Y%m%d')}.csv"


export_col1, export_col2, export_col3 = st.columns(3)

# Everything that survived the sidebar filters
with export_col1:
    if len(posts_df.index) > 0:
        full_export = posts_df.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="📊 Full Dataset",
            data=full_export,
            file_name=_dated_csv_name("intelligence_full"),
            mime="text/csv"
        )

# Only the posts labelled Critical
with export_col2:
    if "severity" in posts_df.columns:
        critical_data = posts_df[posts_df["severity"] == "Critical"]
        if len(critical_data.index) > 0:
            critical_export = critical_data.to_csv(index=False).encode("utf-8")
            st.download_button(
                label="🚨 Critical Incidents",
                data=critical_export,
                file_name=_dated_csv_name("critical_incidents"),
                mime="text/csv"
            )

# Ward-level counts; merged_wards is only defined when the map section built it
with export_col3:
    if 'merged_wards' in globals() and not merged_wards.empty:
        location_export = merged_wards.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="🗺️ Location Analysis",
            data=location_export,
            file_name=_dated_csv_name("location_analysis"),
            mime="text/csv"
        )

st.markdown("---")

# --- System Status Footer
st.markdown("**🔒 Intelligence System Status:**")
status_cols = st.columns(4)
with status_cols[0]:
    st.write("📄 Posts:", "✅ Online" if data_status["posts"] else "❌ Offline")
with status_cols[1]:
    st.write("💬 Comments:", "✅ Online" if data_status["comments"] else "❌ Offline")
with status_cols[2]:
    st.write("🏘️ Wards:", "✅ Online" if data_status["wards"] else "❌ Offline")
with status_cols[3]:
    st.write("🌍 Districts:", "✅ Online" if data_status["districts"] else "❌ Offline")

# Show when the posts file was last modified. The line is optional, so a
# missing or unreadable file just omits it. Only filesystem errors are
# caught, so unrelated bugs are not silently hidden.
try:
    file_mod_time = datetime.fromtimestamp(os.path.getmtime(POSTS_FILE))
except OSError:
    file_mod_time = None

if file_mod_time is not None:
    st.markdown(f"*Intelligence data last updated: {file_mod_time.strftime('%Y-%m-%d %H:%M:%S')}*")

st.markdown("---")