import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap, MarkerCluster
from streamlit_folium import st_folium
from datetime import datetime, timedelta
import re
import os
from textblob import TextBlob
# ------------------------
# Config
# ------------------------
# Streamlit page setup: wide layout, with the sidebar (filters + load status)
# open by default.
st.set_page_config(
    page_title="Reddit based Drug Crime Intelligence Dashboard",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Paths to data files
# NOTE(review): paths are relative to the process working directory — confirm
# the app is always launched from the project root.
POSTS_FILE = "data/processed/reddit_posts_filtered.csv"
COMMENTS_FILE = "data/processed/reddit_comments_filtered.csv"
WARD_COORDS_FILE = "data/bangalore_wards_coordinates.csv"
DISTRICT_COORDS_FILE = "data/karnataka_districts_coordinates.csv"
# Drug-related keywords for classification.
# The categories carry different weights downstream (see
# classify_crime_severity / calculate_threat_score):
# high_risk > substance > activity.
DRUG_KEYWORDS = {
    'high_risk': ['dealing', 'dealer', 'supply', 'trafficking', 'smuggling', 'cartel', 'seized', 'arrest', 'raid'],
    'substance': ['cocaine', 'heroin', 'mdma', 'meth', 'cannabis', 'marijuana', 'ganja', 'weed', 'lsd', 'ecstasy'],
    'activity': ['selling', 'buying', 'distribution', 'possession', 'consumption', 'overdose', 'addiction']
}
# ------------------------
# Enhanced Data Loading
# ------------------------
@st.cache_data
def load_data(posts_file, comments_file, ward_file, district_file):
    """Load all data files with comprehensive error handling.

    Each CSV is loaded independently, so a failure on one file never blocks
    the others; every load result is reported in the Streamlit sidebar.

    Parameters
    ----------
    posts_file, comments_file : str
        Paths to the filtered Reddit posts / comments CSVs.
    ward_file, district_file : str
        Paths to the ward / district coordinate CSVs.

    Returns
    -------
    tuple
        (posts, comments, wards, districts, data_status) where each frame is
        empty on failure and data_status maps source name -> loaded flag.
    """

    def _read_csv(path, missing_msg, error_label, success_fmt,
                  dtype=None, dedupe_col=None, rename=None):
        """Read one CSV, report status in the sidebar, return (frame, ok)."""
        try:
            frame = pd.read_csv(path, dtype=dtype)
            # Guard the dedupe column: a CSV without it is still usable
            # (previously posts crashed into the generic error branch when
            # the 'id' column was absent).
            if dedupe_col and dedupe_col in frame.columns:
                frame = frame.drop_duplicates(subset=[dedupe_col], keep='first')
            if rename:
                old_name, new_name = rename
                if new_name not in frame.columns and old_name in frame.columns:
                    frame.rename(columns={old_name: new_name}, inplace=True)
            st.sidebar.success(success_fmt.format(len(frame)))
            return frame, True
        except FileNotFoundError:
            st.sidebar.warning(missing_msg)
        except Exception as e:
            st.sidebar.error(f"❌ Error loading {error_label}: {str(e)}")
        return pd.DataFrame(), False

    data_status = {}
    # Posts are read as strings so mixed-type columns survive the round trip.
    posts, data_status["posts"] = _read_csv(
        posts_file, "⚠️ Reddit posts file not found", "posts",
        "✅ Posts loaded: {} records", dtype=str, dedupe_col='id')
    comments, data_status["comments"] = _read_csv(
        comments_file, "⚠️ Reddit comments file not found", "comments",
        "✅ Comments loaded: {} records", dedupe_col='id')
    wards, data_status["wards"] = _read_csv(
        ward_file, "⚠️ Ward coordinates file not found", "wards",
        "✅ Wards loaded: {} wards", rename=('name', 'ward_name'))
    districts, data_status["districts"] = _read_csv(
        district_file, "⚠️ District coordinates file not found", "districts",
        "✅ Districts loaded: {} districts", rename=('name', 'district_name'))
    return posts, comments, wards, districts, data_status
# ------------------------
# Crime Analysis Functions
# ------------------------
def classify_crime_severity(text):
    """Classify a post's crime severity from weighted keyword hits.

    High-risk keywords (dealing, raids, seizures) add 3 points each,
    substance names 2, general activity terms 1. Matching is whole-word to
    avoid substring false positives (the original `in` check scored "meth"
    inside "method" and "weed" inside "seaweed").

    Parameters
    ----------
    text : any
        Post text; coerced via str() so None/NaN are handled.

    Returns
    -------
    str
        One of 'Critical' (>=5), 'High' (>=3), 'Medium' (>=1), 'Low' (0).
    """
    text_lower = str(text).lower()
    severity_score = 0
    for category, weight in (('high_risk', 3), ('substance', 2), ('activity', 1)):
        for keyword in DRUG_KEYWORDS[category]:
            # \b anchors restrict matches to whole words.
            if re.search(r'\b' + re.escape(keyword) + r'\b', text_lower):
                severity_score += weight
    if severity_score >= 5:
        return 'Critical'
    elif severity_score >= 3:
        return 'High'
    elif severity_score >= 1:
        return 'Medium'
    else:
        return 'Low'
def extract_drug_mentions(text):
    """Return a comma-separated list of substances named in *text*.

    Matching is whole-word to avoid substring false positives (the original
    `in` check reported "meth" for "methodology"). Keeps the declaration
    order of DRUG_KEYWORDS['substance'].

    Parameters
    ----------
    text : any
        Post text; coerced via str() so None/NaN are handled.

    Returns
    -------
    str
        E.g. "Cocaine, Weed", or 'Unspecified' when nothing matched.
    """
    text_lower = str(text).lower()
    drugs_found = [
        drug.capitalize()
        for drug in DRUG_KEYWORDS['substance']
        if re.search(r'\b' + re.escape(drug) + r'\b', text_lower)
    ]
    return ', '.join(drugs_found) if drugs_found else 'Unspecified'
def calculate_threat_score(row):
    """Score a post's threat level on a 0-100 scale.

    Combines whole-word high-risk keyword hits (+10 each), capped engagement
    bonuses from Reddit score and comment count (up to +5 each), and a +5
    bump for strongly negative sentiment.

    Parameters
    ----------
    row : pd.Series (or mapping)
        A post row; reads optional 'text', 'title', 'score', 'num_comments'.

    Returns
    -------
    float
        Threat score clamped to at most 100.
    """

    def _as_number(value):
        """Best-effort numeric coercion. Posts are loaded with dtype=str, so
        'score' may be a numeric string, a float string, or NaN — the
        original int() cast raised on all but plain integer strings."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN != NaN — treat missing values as zero.
        return number if number == number else 0.0

    text = str(row.get('text', '')) + ' ' + str(row.get('title', ''))
    text_lower = text.lower()
    score = 0
    for keyword in DRUG_KEYWORDS['high_risk']:
        # Whole-word match avoids substring hits such as "raid" in "afraid".
        if re.search(r'\b' + re.escape(keyword) + r'\b', text_lower):
            score += 10
    if 'score' in row:
        score += min(_as_number(row.get('score', 0)) / 10, 5)
    if 'num_comments' in row:
        score += min(_as_number(row.get('num_comments', 0)) / 5, 5)
    # Strongly negative sentiment adds urgency.
    sentiment = TextBlob(text).sentiment.polarity
    if sentiment < -0.2:
        score += 5
    return min(score, 100)
# ------------------------
# Load All Data
# ------------------------
# Module-level load: runs on every Streamlit script rerun, but the heavy CSV
# work is memoized by @st.cache_data on load_data.
posts_df, comments_df, wards_df, districts_df, data_status = load_data(
    POSTS_FILE, COMMENTS_FILE, WARD_COORDS_FILE, DISTRICT_COORDS_FILE
)
# ------------------------
# Data Processing
# ------------------------
def process_datetime(df, datetime_col='created_utc'):
    """Parse a timestamp column and derive date/hour/day-of-week columns.

    Handles both numeric Unix epochs (Reddit's native `created_utc`, even
    after a CSV round trip turns it into strings) and datetime strings;
    unparseable values become NaT. The frame is returned unchanged when the
    column is absent.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to augment (modified in place and returned).
    datetime_col : str
        Name of the source timestamp column.

    Returns
    -------
    pd.DataFrame
        Same frame with "datetime", "date", "hour", "day_of_week" added.
    """
    if datetime_col not in df.columns:
        return df
    # Reddit's created_utc is epoch *seconds*; plain to_datetime would treat
    # numeric values as nanoseconds and produce 1970 dates.
    as_number = pd.to_numeric(df[datetime_col], errors='coerce')
    if as_number.notna().any():
        df["datetime"] = pd.to_datetime(as_number, unit='s', errors='coerce')
    else:
        df["datetime"] = pd.to_datetime(df[datetime_col], errors='coerce')
    df["date"] = df["datetime"].dt.date
    df["hour"] = df["datetime"].dt.hour
    df["day_of_week"] = df["datetime"].dt.day_name()
    return df
# Normalize coordinate names to lowercase so matching against lowercased post
# text is case-insensitive by construction.
if not wards_df.empty and "ward_name" in wards_df.columns:
    wards_df["ward_name"] = wards_df["ward_name"].astype(str).str.strip().str.lower()
if not districts_df.empty and "district_name" in districts_df.columns:
    districts_df["district_name"] = districts_df["district_name"].astype(str).str.strip().str.lower()
# District mapping: common colloquial names -> official district names.
district_mapping = {
    "bangalore": "bengaluru",
    "blr": "bengaluru",
    "mysore": "mysuru",
}
# Build one whole-word alternation regex per geography level. The column
# guards fix a KeyError the original raised when a coordinates CSV loaded but
# lacked the expected name column (only .empty was checked before).
ward_pattern = None
district_pattern = None
if not wards_df.empty and "ward_name" in wards_df.columns:
    # Names were already lowercased above, so no extra .str.lower() needed.
    ward_list = wards_df["ward_name"].tolist()
    ward_pattern = r'\b(' + '|'.join(re.escape(w) for w in ward_list) + r')\b'
if not districts_df.empty and "district_name" in districts_df.columns:
    district_list = districts_df["district_name"].tolist()
    district_pattern = r'\b(' + '|'.join(re.escape(d) for d in district_list) + r')\b'
def extract_locations(text_series, patterns):
    """Extract known location names from a series of free-text documents.

    Parameters
    ----------
    text_series : pd.Series
        Text to scan; NaN entries are treated as empty strings.
    patterns : list of str
        Regex patterns (alternations of location names) applied to the
        lowercased text.

    Returns
    -------
    pd.Series
        Comma-separated unique matches per row ("" when none), aligned with
        the input index.
    """
    locations = []
    for text in text_series.fillna(""):
        matches = []
        for pattern in patterns:
            matches.extend(re.findall(pattern, str(text).lower()))
        # Dedupe while keeping first-occurrence order; the original
        # list(set(...)) produced a nondeterministic ordering, which made
        # downstream exact-string grouping unstable across runs.
        unique_matches = list(dict.fromkeys(matches))
        locations.append(", ".join(unique_matches))
    return pd.Series(locations, index=text_series.index)
# Process posts
# Derive all analysis columns on the posts frame: parsed datetimes, extracted
# ward/district locations, severity class, substances, threat score and
# sentiment. Later dashboard sections read these columns.
if not posts_df.empty:
    posts_df = process_datetime(posts_df)
    # NOTE(review): DataFrame.get returns a scalar "" when a column is
    # missing; if BOTH "title" and "text" were absent the result would be a
    # plain str and .fillna would raise — confirm the posts CSV always
    # carries these columns.
    post_text = (posts_df.get("title", "") + " " + posts_df.get("text", "")).fillna("")
    if ward_pattern:
        posts_df["ward_location"] = extract_locations(post_text, [ward_pattern])
    else:
        posts_df["ward_location"] = ""
    if district_pattern:
        posts_df["district_location"] = extract_locations(post_text, [district_pattern])
    else:
        posts_df["district_location"] = ""
    # NOTE(review): Series.replace only rewrites cells whose ENTIRE value
    # equals a mapping key; comma-joined multi-location strings are left
    # untouched — verify this is intended.
    posts_df["district_location"] = posts_df["district_location"].replace(district_mapping)
    posts_df["severity"] = post_text.apply(classify_crime_severity)
    posts_df["drugs_mentioned"] = post_text.apply(extract_drug_mentions)
    # Row-wise apply plus TextBlob sentiment below is O(n) over posts and can
    # be slow on large frames.
    posts_df["threat_score"] = posts_df.apply(calculate_threat_score, axis=1)
    posts_df["sentiment_score"] = post_text.apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    posts_df["sentiment"] = posts_df["sentiment_score"].apply(
        lambda x: "Positive" if x > 0 else ("Negative" if x < 0 else "Neutral")
    )
# Process comments
if not comments_df.empty:
    comments_df = process_datetime(comments_df)
# ------------------------
# Dashboard Header
# ------------------------
st.title("🚨 Reddit based Drug Crime Intelligence Dashboard")
st.markdown("**Real-time intelligence analysis of drug-related criminal activities from Reddit social media monitoring**")
# ------------------------
# Sidebar Filters
# ------------------------
# Each filter below narrows posts_df in place, so the filters compose in
# order: severity -> date range -> subreddit -> keyword search.
st.sidebar.title("🔧 Intelligence Controls")
# Manual cache bust + rerun to pick up freshly collected data.
if st.sidebar.button("🔄 Refresh Data"):
    st.cache_data.clear()
    st.rerun()
# Severity filter
if not posts_df.empty and "severity" in posts_df.columns:
    severity_filter = st.sidebar.multiselect(
        "⚠️ Crime Severity Level",
        options=['Critical', 'High', 'Medium', 'Low'],
        default=['Critical', 'High']
    )
    if severity_filter:
        posts_df = posts_df[posts_df["severity"].isin(severity_filter)]
# Date range filter
if not posts_df.empty and "datetime" in posts_df.columns:
    # NOTE(review): min()/max() are NaT if every datetime failed to parse —
    # .date() would then raise; confirm at least one row always parses.
    min_date = posts_df["datetime"].min().date()
    max_date = posts_df["datetime"].max().date()
    date_range = st.sidebar.date_input(
        "📅 Select Date Range",
        value=(min_date, max_date),
        min_value=min_date,
        max_value=max_date
    )
    # date_input returns a single date while the user is mid-selection;
    # only filter once both endpoints exist.
    if len(date_range) == 2:
        posts_df = posts_df[
            (posts_df["date"] >= date_range[0]) &
            (posts_df["date"] <= date_range[1])
        ]
# Subreddit filter
if not posts_df.empty and "subreddit" in posts_df.columns:
    subreddits = st.sidebar.multiselect(
        "📱 Filter by Subreddits",
        options=posts_df["subreddit"].unique(),
        # Pre-select the five most active subreddits.
        default=posts_df["subreddit"].value_counts().head(5).index.tolist()
    )
    if subreddits:
        posts_df = posts_df[posts_df["subreddit"].isin(subreddits)]
# Keyword search
search_keyword = st.sidebar.text_input("🔍 Search Keywords in Content")
if search_keyword:
    # Case-insensitive substring match over body OR title; NaN rows excluded.
    # NOTE(review): assumes "text" and "title" columns exist whenever posts
    # loaded — KeyError otherwise; confirm against the CSV schema.
    posts_df = posts_df[
        posts_df["text"].str.contains(search_keyword, case=False, na=False) |
        posts_df["title"].str.contains(search_keyword, case=False, na=False)
    ]
# ------------------------
# Main Dashboard Content
# ------------------------
# Hard stop when neither posts nor comments loaded — nothing to render.
if posts_df.empty and comments_df.empty:
    st.error("🚫 No intelligence data available. Please ensure data collection is operational.")
    st.stop()
# --- Crime Intelligence Metrics
st.subheader("📊 Crime Intelligence Overview")
col1, col2, col3, col4 = st.columns(4)
with col1:
    # Count of Critical posts plus their share of all (filtered) posts.
    critical_posts = len(posts_df[posts_df["severity"] == "Critical"]) if "severity" in posts_df.columns else 0
    st.metric(
        label="Critical Threats",
        value=critical_posts,
        delta=f"{(critical_posts/len(posts_df)*100):.1f}%" if len(posts_df) > 0 else "0%"
    )
with col2:
    avg_threat = posts_df["threat_score"].mean() if "threat_score" in posts_df.columns else 0
    st.metric(
        label="Avg Threat Score",
        value=f"{avg_threat:.1f}",
        # threat_score is 0-100 (see calculate_threat_score); >50 = "High".
        delta="High" if avg_threat > 50 else "Moderate"
    )
with col3:
    if "ward_location" in posts_df.columns:
        # ward_location is a comma-joined string; split/explode to count
        # distinct wards mentioned across all posts.
        ward_exploded_temp = posts_df[posts_df["ward_location"] != ""].copy()
        ward_exploded_temp["ward_location"] = ward_exploded_temp["ward_location"].str.split(", ")
        ward_exploded_temp = ward_exploded_temp.explode("ward_location")
        unique_locations = ward_exploded_temp["ward_location"].nunique()
        st.metric(
            label="Active Locations",
            value=unique_locations
        )
with col4:
    # NOTE(review): the "Unspecified" placeholder is counted as a drug type
    # here, while the substance chart below filters it out — confirm intent.
    drug_types = posts_df["drugs_mentioned"].str.split(", ").explode().nunique() if "drugs_mentioned" in posts_df.columns else 0
    st.metric(
        label="Drug Types Identified",
        value=drug_types
    )
st.markdown("---")
# --- Crime Severity Distribution
if "severity" in posts_df.columns:
    st.subheader("⚠️ Crime Severity Analysis")
    col1, col2 = st.columns(2)
    with col1:
        # Share of posts per severity class, colored red -> green by risk.
        severity_counts = posts_df["severity"].value_counts()
        fig_severity = px.pie(
            values=severity_counts.values,
            names=severity_counts.index,
            title="Crime Severity Distribution",
            color=severity_counts.index,
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90'
            }
        )
        st.plotly_chart(fig_severity, use_container_width=True)
    with col2:
        # Histogram of the 0-100 threat score with the high-threat cutoff
        # marked at 50.
        fig_threat = px.histogram(
            posts_df,
            x="threat_score",
            nbins=20,
            title="Threat Score Distribution",
            labels={"threat_score": "Threat Score", "count": "Number of Posts"}
        )
        fig_threat.add_vline(x=50, line_dash="dash", line_color="red", annotation_text="High Threat Threshold")
        st.plotly_chart(fig_threat, use_container_width=True)
st.markdown("---")
# --- Drug Type Analysis
if "drugs_mentioned" in posts_df.columns:
    st.subheader("💊 Substance Intelligence")
    # Explode the comma-joined substance lists and drop the placeholder
    # before counting mentions.
    all_drugs = posts_df["drugs_mentioned"].str.split(", ").explode()
    drug_counts = all_drugs[all_drugs != "Unspecified"].value_counts().head(10)
    if not drug_counts.empty:
        fig_drugs = px.bar(
            x=drug_counts.values,
            y=drug_counts.index,
            orientation='h',
            title="Top 10 Substances Mentioned",
            labels={"x": "Mentions", "y": "Substance"},
            color=drug_counts.values,
            color_continuous_scale="Reds"
        )
        st.plotly_chart(fig_drugs, use_container_width=True)
st.markdown("---")
# --- Timeline Analysis
if "date" in posts_df.columns:
    st.subheader("📈 Crime Activity Timeline")
    col1, col2 = st.columns(2)
    with col1:
        # Daily incident counts, one line per severity class.
        daily_data = posts_df.groupby(["date", "severity"]).size().reset_index(name="count")
        fig_daily = px.line(
            daily_data,
            x="date",
            y="count",
            color="severity",
            title="Daily Crime Activity by Severity",
            labels={"count": "Number of Incidents", "date": "Date"},
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90'
            }
        )
        st.plotly_chart(fig_daily, use_container_width=True)
    with col2:
        if "hour" in posts_df.columns and "day_of_week" in posts_df.columns:
            # Hour-of-day x day-of-week incident heatmap.
            # NOTE(review): days plot in data order, not Mon..Sun calendar
            # order — consider an explicit category ordering.
            hourly_activity = posts_df.groupby(["day_of_week", "hour"]).size().reset_index(name="count")
            fig_hourly = px.density_heatmap(
                hourly_activity,
                x="hour",
                y="day_of_week",
                z="count",
                title="Activity Heatmap - High-Risk Hours",
                labels={"hour": "Hour of Day", "day_of_week": "Day", "count": "Incidents"},
                color_continuous_scale="Reds"
            )
            st.plotly_chart(fig_hourly, use_container_width=True)
st.markdown("---")
# --- Geographic Intelligence - COMBINED MAP
st.subheader("🗺️ Geographic Crime Intelligence")
# Process both ward and district data
# Mapping needs both the coordinate reference frame AND the locations
# extracted from post text; each level is usable independently.
ward_data_available = not wards_df.empty and "ward_location" in posts_df.columns
district_data_available = not districts_df.empty and "district_location" in posts_df.columns
if ward_data_available or district_data_available:
st.markdown("**Crime hotspot analysis across Karnataka (Wards & Districts)**")
# Prepare ward data
merged_wards = pd.DataFrame()
if ward_data_available:
ward_posts = posts_df[posts_df["ward_location"] != ""].copy()
ward_exploded = ward_posts.copy()
ward_exploded["ward_location"] = ward_posts["ward_location"].str.split(", ")
ward_exploded = ward_exploded.explode("ward_location")
ward_exploded["ward_location"] = ward_exploded["ward_location"].str.strip().str.lower()
loc_counts = ward_exploded.groupby("ward_location").size().reset_index(name="count")
merged_wards = pd.merge(loc_counts, wards_df, left_on="ward_location", right_on="ward_name", how="inner")
merged_wards["location_type"] = "Ward"
merged_wards["location_name"] = merged_wards["ward_name"]
# Prepare district data
merged_districts = pd.DataFrame()
if district_data_available:
district_posts = posts_df[posts_df["district_location"] != ""].copy()
district_exploded = district_posts.copy()
district_exploded["district_location"] = district_posts["district_location"].str.split(", ")
district_exploded = district_exploded.explode("district_location")
district_exploded["district_location"] = district_exploded["district_location"].str.strip().str.lower()
district_counts = district_exploded.groupby("district_location").size().reset_index(name="count")
merged_districts = pd.merge(district_counts, districts_df, left_on="district_location", right_on="district_name", how="inner")
merged_districts["location_type"] = "District"
merged_districts["location_name"] = merged_districts["district_name"]
# Combine both datasets
all_locations = pd.concat([merged_wards, merged_districts], ignore_index=True)
if not all_locations.empty:
# Determine center of map
center_lat = all_locations["lat"].mean()
center_lon = all_locations["lon"].mean()
# Create unified map
m_unified = folium.Map(
location=[center_lat, center_lon],
zoom_start=9 if ward_data_available else 7,
tiles="OpenStreetMap"
)
# Add heatmap layer
heat_data = [[row["lat"], row["lon"], row["count"]] for _, row in all_locations.iterrows()]
HeatMap(heat_data, radius=20, blur=15, max_zoom=13, gradient={
0.0: 'blue', 0.5: 'yellow', 0.75: 'orange', 1.0: 'red'
}).add_to(m_unified)
# Determine hotspot threshold
threshold = all_locations["count"].quantile(0.70)
all_locations["is_hotspot"] = all_locations["count"] >= threshold
# Add markers for each location
for _, row in all_locations.iterrows():
location_name = row["location_name"].title()
location_type = row["location_type"]
incident_count = row["count"]
# Get location-specific crime data
if location_type == "Ward":
loc_data = posts_df[posts_df["ward_location"].str.contains(row["location_name"], case=False, na=False)]
else:
loc_data = posts_df[posts_df["district_location"].str.contains(row["location_name"], case=False, na=False)]
# Severity breakdown
severity_breakdown = loc_data["severity"].value_counts().to_dict()
severity_html = "
".join([f" • {sev}: {count}" for sev, count in severity_breakdown.items()])
# Critical incidents count
critical_count = severity_breakdown.get("Critical", 0)
# Top drugs in this location
loc_drugs = loc_data["drugs_mentioned"].str.split(", ").explode()
top_drugs = loc_drugs[loc_drugs != "Unspecified"].value_counts().head(3)
drugs_html = "
".join([f" • {drug}: {count}" for drug, count in top_drugs.items()])
# Average threat score
avg_threat = loc_data["threat_score"].mean()
# Recent high-threat incidents
recent = loc_data.nlargest(3, "threat_score")[["title", "severity", "threat_score"]]
incidents_html = "
".join([
f" • [{r['severity']}] {r['title'][:50]}... (Score: {r['threat_score']:.0f})"
for _, r in recent.iterrows()
])
# Marker color based on severity
marker_color = 'darkred' if row["is_hotspot"] else ('red' if incident_count >= 5 else ('orange' if incident_count >= 3 else 'blue'))
# Icon based on type
icon_symbol = 'home' if location_type == "Ward" else 'map'
# Create detailed popup
popup_html = f"""