Spaces:
Sleeping
Sleeping
# modify_app.py
# Streamlit dashboard for monitoring drug-crime-related Twitter activity.
import streamlit as st
import pandas as pd
import os
import json
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import nltk
from nltk.corpus import stopwords  # import first so the LookupError probe below works

# Ensure the NLTK stopwords corpus is present; fetch it on demand the first
# time this environment runs the app.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Now you can safely use it
english_stopwords = stopwords.words('english')

import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
from alerts import compute_dynamic_risk,assign_dynamic_risk_level
from evaluation import evaluate_model

# Run evaluation on the scraped CSV folder.
# NOTE(review): this executes at import time, i.e. on every Streamlit rerun —
# confirm that re-running the evaluation each time is intended.
evaluate_model("drug_analysis_data_3months")

import re

st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide")
# Custom CSS for better styling: defines the classes used by the HTML
# snippets rendered below (.main-header banner, alert/metric boxes).
st.markdown("""
<style>
.main-header {
    background: linear-gradient(90deg, #1e3c72, #2a5298);
    color: white;
    padding: 1rem;
    border-radius: 10px;
    text-align: center;
    margin-bottom: 2rem;
}
.metric-card {
    background: #f8f9fa;
    padding: 1rem;
    border-radius: 8px;
    border-left: 4px solid #007bff;
}
.critical-alert {
    background: #f8d7da;
    border: 1px solid #f5c6cb;
    color: #721c24;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
.high-priority {
    background: #fff3cd;
    border: 1px solid #ffeaa7;
    color: #856404;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
.warning-box {
    background: #d4edda;
    border: 1px solid #c3e6cb;
    color: #155724;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)
# Configuration
DASHBOARD_CONFIG = {
    # Directories searched (in order) for exported CSV/JSON artefacts
    'data_dirs': ['drug_analysis_data_3months', 'data', 'output', '.'],
    # Auto-refresh period in seconds (see sidebar auto-refresh checkbox)
    'refresh_interval': 30,
    # Cap on the number of tweets rendered in any single list view
    'max_display_tweets': 50,
    # Default plotly chart height in pixels
    'chart_height': 400
}

# Main header banner (styled by the .main-header CSS class above)
st.markdown('<div class="main-header"><h1>Twitter Drug Crime Monitoring Dashboard</h1><p>Real-time Twitter Analysis for Drug Crime Detection</p></div>', unsafe_allow_html=True)
# ------------------------
# Enhanced Data Loading Functions
# ------------------------
def parse_dates_flexible(df):
    """Parse the 'datetime' column, trying several known formats.

    Tries each explicit format in turn and keeps the first one that parses
    at least one value; falls back to pandas' generic parser, and finally
    fills any remaining NaT with the current timestamp so downstream `.dt`
    accessors never see an all-NaT column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain a 'datetime' column (modified in place).

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'datetime' coerced to datetime64 when present.
    """
    if "datetime" not in df.columns:
        return df

    date_formats = [
        "%d-%m-%Y %H:%M:%S",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M:%S",
    ]
    original_datetime = df["datetime"].copy()
    for fmt in date_formats:
        try:
            df["datetime"] = pd.to_datetime(original_datetime, format=fmt, errors="coerce")
            if not df["datetime"].isna().all():
                break
        except (ValueError, TypeError):
            # Was a bare `except:` — narrow to parse errors so real problems
            # (KeyboardInterrupt, programming mistakes) are not swallowed.
            continue

    # If no explicit format matched anything, let pandas infer per value.
    if df["datetime"].isna().all():
        df["datetime"] = pd.to_datetime(original_datetime, errors="coerce")

    # Fill any remaining NaT values with the current time.
    df["datetime"] = df["datetime"].fillna(pd.Timestamp.now())
    return df
def validate_dataframe(df):
    """Check that *df* is usable by the dashboard.

    A frame is valid when it is non-empty and carries both the 'username'
    and 'content' columns.

    Returns
    -------
    tuple[bool, str]
        (is_valid, human-readable message).
    """
    if df is None or df.empty:
        return False, "DataFrame is empty"
    missing_columns = [col for col in ('username', 'content') if col not in df.columns]
    if missing_columns:
        return False, f"Missing required columns: {missing_columns}"
    return True, "DataFrame is valid"
def load_data():
    """Load the most recent data with robust error handling.

    Scans each directory in DASHBOARD_CONFIG['data_dirs'] in order, picks the
    most recently created matching CSV, parses dates, derives calendar helper
    columns, and loads the newest ANALYSIS_REPORT_*.json found alongside it.

    Returns:
        (df, report_data): loaded DataFrame and parsed report dict, or
        (None, None) when no usable CSV was found in any directory.
    """
    start_time = time.time()
    for data_dir in DASHBOARD_CONFIG['data_dirs']:
        if not os.path.exists(data_dir):
            continue
        try:
            # Look for main dataset files with flexible naming
            csv_files = []
            for f in os.listdir(data_dir):
                if f.endswith(".csv") and any(keyword in f.lower() for keyword in
                        ["karnataka_drug_tweets", "drug_tweets", "drug_analysis", "drug_crime"]):
                    csv_files.append(f)
            if not csv_files:
                # Fallback to any CSV file
                csv_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]
            if not csv_files:
                continue
            # Get the most recent file by creation time
            # NOTE(review): on Linux getctime is metadata-change time, not
            # creation time — acceptable proxy for "latest export"; confirm.
            latest_file = max(csv_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
            file_path = os.path.join(data_dir, latest_file)
            # Load with error handling
            df = pd.read_csv(file_path, encoding='utf-8')
            if df.empty:
                continue
            # Enhanced date parsing
            df = parse_dates_flexible(df)
            # Add derived calendar columns if missing so later views can rely on them
            if "datetime" in df.columns:
                if "date" not in df.columns:
                    df["date"] = df["datetime"].dt.date
                if "hour" not in df.columns:
                    df["hour"] = df["datetime"].dt.hour
                if "day_of_week" not in df.columns:
                    df["day_of_week"] = df["datetime"].dt.day_name()
                if "day" not in df.columns:
                    df["day"] = df["datetime"].dt.day
            # Load the newest analysis report if available (optional companion file)
            report_files = [f for f in os.listdir(data_dir)
                            if f.startswith("ANALYSIS_REPORT_") and f.endswith(".json")]
            report_data = None
            if report_files:
                latest_report = max(report_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
                try:
                    with open(os.path.join(data_dir, latest_report), 'r', encoding='utf-8') as f:
                        report_data = json.load(f)
                except Exception as e:
                    # A broken report must not block loading the main dataset
                    st.sidebar.warning(f"Could not load report: {e}")
                    report_data = None
            load_time = time.time() - start_time
            # Display load metrics in sidebar
            st.sidebar.success(f"Data loaded successfully")
            st.sidebar.metric("Load Time", f"{load_time:.2f}s")
            st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
            st.sidebar.info(f"Source: {latest_file}")
            return df, report_data
        except Exception as e:
            # Any failure in this directory falls through to the next candidate
            st.sidebar.warning(f"Failed to load from {data_dir}: {str(e)}")
            continue
    return None, None
def load_priority_data():
    """Load high priority and contact info datasets with fallbacks.

    Looks only in the primary data directory; each dataset is the most
    recently created CSV whose filename contains the matching tag.

    Returns
    -------
    tuple[pandas.DataFrame | None, pandas.DataFrame | None]
        (high_priority_df, contact_df); either may be None when missing
        or unreadable.
    """
    data_dir = DASHBOARD_CONFIG['data_dirs'][0]  # primary data directory
    if not os.path.exists(data_dir):
        return None, None

    def _newest_matching(tag):
        # Newest (by creation time) CSV in data_dir whose name contains *tag*.
        candidates = [name for name in os.listdir(data_dir)
                      if tag in name and name.endswith(".csv")]
        if not candidates:
            return None
        newest = max(candidates,
                     key=lambda name: os.path.getctime(os.path.join(data_dir, name)))
        frame = pd.read_csv(os.path.join(data_dir, newest))
        return parse_dates_flexible(frame)

    high_priority_df = None
    contact_df = None
    try:
        high_priority_df = _newest_matching("HIGH_PRIORITY")
    except Exception as e:
        st.sidebar.warning(f"Could not load high priority data: {e}")
    try:
        contact_df = _newest_matching("CONTACT_INFO")
    except Exception as e:
        st.sidebar.warning(f"Could not load contact info data: {e}")
    return high_priority_df, contact_df
def safe_column_access(df, column, default=0):
    """Return df[column], or a Series filled with *default* when absent.

    The fallback Series reuses df's index so it aligns with the frame.
    """
    if column not in df.columns:
        return pd.Series([default] * len(df), index=df.index)
    return df[column]
def safe_column_sum(df, column):
    """Sum *column* in *df*, returning 0 when the column does not exist."""
    return df[column].sum() if column in df.columns else 0
def safe_column_mean(df, column):
    """Mean of *column* in *df*; 0 when the column is missing or df is empty."""
    if column not in df.columns or len(df) == 0:
        return 0
    return df[column].mean()
# ----------------- Helper: Calculate User Risk -----------------
def calculate_user_risk(df):
    """
    Calculate a risk score per user: CRITICAL = 2 points, HIGH = 1 point.

    Vectorised with a single groupby instead of one boolean-filter pass per
    user (the original looped over df["username"].unique(), which is
    O(users * rows) on large exports).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'username' and 'risk_level' columns; otherwise an
        empty DataFrame is returned.

    Returns
    -------
    pandas.DataFrame
        Columns: username, risk_score, tweet_count — one row per user, in
        first-appearance order.
    """
    if "username" not in df.columns or "risk_level" not in df.columns:
        return pd.DataFrame()
    # Points per tweet: CRITICAL=2, HIGH=1, anything else 0.
    points = df["risk_level"].map({"CRITICAL": 2, "HIGH": 1}).fillna(0).astype(int)
    # sort=False keeps first-appearance order, matching unique()'s order.
    return (
        df.assign(_points=points)
          .groupby("username", sort=False)["_points"]
          .agg(risk_score="sum", tweet_count="size")
          .reset_index()
    )
# ----------------- Helper: Filter Words -----------------
def get_filtered_words(text_series):
    """
    Return filtered words from a Series of text, removing English
    stopwords and words of <=2 characters.

    The stopword set is built once and cached on the function object —
    the original rebuilt set(stopwords.words('english')) on every call,
    which is wasteful on each Streamlit rerun.

    Parameters
    ----------
    text_series : pandas.Series
        Series of text values (non-strings are str()-coerced).

    Returns
    -------
    list[str]
        Lowercased word tokens that survive both filters.
    """
    if not hasattr(get_filtered_words, "_stop_words"):
        get_filtered_words._stop_words = set(stopwords.words('english'))
    stop_words_set = get_filtered_words._stop_words
    all_text = " ".join(text_series.astype(str))
    words = re.findall(r'\b\w+\b', all_text.lower())
    return [w for w in words if w not in stop_words_set and len(w) > 2]
def create_heatmap_chart(df, x_col, y_col, title="Heatmap"):
    """Build a plotly count heatmap of *y_col* versus *x_col*.

    Returns None when either column is missing from *df*; otherwise a
    go.Figure with a Blues-scaled Heatmap of co-occurrence counts
    (missing combinations shown as 0).
    """
    if x_col not in df.columns or y_col not in df.columns:
        return None
    # Pivot the pairwise counts into a y-by-x grid for the heatmap.
    counts = (
        df.groupby([x_col, y_col]).size().reset_index(name='count')
          .pivot(index=y_col, columns=x_col, values='count')
          .fillna(0)
    )
    figure = go.Figure(data=go.Heatmap(
        z=counts.values,
        x=counts.columns,
        y=counts.index,
        colorscale='Blues',
        hoverongaps=False,
    ))
    figure.update_layout(
        title=title,
        xaxis_title=x_col,
        yaxis_title=y_col,
        height=400,
    )
    return figure
def create_weekly_trend_analysis(df):
    """Create weekly trend figures.

    Works on a copy so the caller's frame does not silently gain
    'week'/'weekday' columns — the original mutated *df* in place as a
    side effect, which leaked helper columns into every later view.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a parsed 'datetime' column.

    Returns
    -------
    tuple
        (weekly line figure, weekday bar figure), or (None, None) when
        the 'datetime' column is absent.
    """
    if "datetime" not in df.columns:
        return None, None

    data = df.copy()
    data['week'] = data['datetime'].dt.isocalendar().week
    data['weekday'] = data['datetime'].dt.day_name()
    weekly_counts = data.groupby('week').size().reset_index(name='count')
    weekday_counts = data.groupby('weekday').size().reset_index(name='count')

    # Present weekdays in calendar order rather than alphabetical.
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts['weekday'] = pd.Categorical(weekday_counts['weekday'], categories=weekday_order, ordered=True)
    weekday_counts = weekday_counts.sort_values('weekday')

    fig1 = px.line(weekly_counts, x='week', y='count', title="Weekly Tweet Trends")
    fig2 = px.bar(weekday_counts, x='weekday', y='count', title="Tweets by Weekday")
    return fig1, fig2
# ------------------------
# Load Data
# ------------------------
df, report_data = load_data()

# --- Compute dynamic risk for all tweets ---
if df is not None and not df.empty:
    # NOTE(review): this re-import is redundant — both names are already
    # imported at the top of the file; harmless, but could be removed.
    from alerts import compute_dynamic_risk, assign_dynamic_risk_level
    # Add per-tweet dynamic risk fields (row-wise apply over dict records)
    df['dynamic_risk_score'] = df.apply(lambda row: compute_dynamic_risk(row.to_dict()), axis=1)
    df['risk_level'] = df.apply(lambda row: assign_dynamic_risk_level(row.to_dict()), axis=1)

if df is None:
    st.error("No data found. Please run the drug crime scraper first.")
    # Enhanced debug information to help locate the expected CSV exports
    st.subheader("Debug Information")
    current_dir = os.getcwd()
    st.write(f"Current directory: {current_dir}")
    for dir_name in DASHBOARD_CONFIG['data_dirs']:
        if os.path.exists(dir_name):
            files = [f for f in os.listdir(dir_name) if f.endswith('.csv')]
            st.write(f"CSV files in {dir_name}: {files}")
        else:
            st.write(f"Directory {dir_name} does not exist")
    st.info("Expected files: karnataka_drug_tweets_*.csv or similar drug-related CSV files")
    st.stop()

# Validate dataframe before any view renders
is_valid, validation_message = validate_dataframe(df)
if not is_valid:
    st.error(f"Data validation failed: {validation_message}")
    st.write("Available columns:", list(df.columns))
    st.stop()

# Load priority data (optional companion exports; either may be None)
high_priority_df, contact_df = load_priority_data()

# Filter for current month data for some analyses
now = datetime.now()
if "datetime" in df.columns:
    df_month = df[(df['datetime'].dt.month == now.month) & (df['datetime'].dt.year == now.year)]
else:
    df_month = df
# ------------------------
# Sidebar Navigation & Filters
# ------------------------
st.sidebar.title("Dashboard Navigation")

# Auto-refresh option. The optional streamlit_autorefresh dependency is only
# imported when the feature is enabled, so the dashboard still starts on
# machines without that package installed (the original imported it
# unconditionally at this point in the script).
auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)")
if auto_refresh:
    from streamlit_autorefresh import st_autorefresh
    st_autorefresh(interval=30*1000, key="refresh")

# Navigation tabs - ENHANCED with new options
analysis_type = st.sidebar.radio(
    "Select Analysis View",
    ["Summary", "Risk Analysis", "Actionable Insights", "📈 Predictive Analytics", "🌐 Network Analysis",
     "Geographic Analysis", "User Analysis",
     "Content Analysis", "📊 Volume Trends", "🧠 User Behavior",
     "📍 Heatmaps", "⚠️ Risk Patterns"]
)

# Common filters
st.sidebar.header("Data Filters")

# Date range filter
if "datetime" in df.columns and not df["datetime"].isna().all():
    try:
        min_date = df["datetime"].min().date()
        max_date = df["datetime"].max().date()
        date_range = st.sidebar.date_input(
            "Select Date Range",
            value=[min_date, max_date],
            min_value=min_date,
            max_value=max_date
        )
        # Filter dataframe by the inclusive date range.
        # BUG FIX: the previous version additionally required
        # dt.year == date_range[0].year, which silently dropped every row
        # from later years whenever the selected range crossed a year
        # boundary; the two date bounds already constrain the range fully.
        if len(date_range) == 2:
            df = df[
                (df["datetime"].dt.date >= date_range[0]) &
                (df["datetime"].dt.date <= date_range[1])
            ]
    except Exception as e:
        st.sidebar.warning(f"Date filtering error: {e}")

# Risk level filter
if "risk_level" in df.columns:
    available_risk_levels = df["risk_level"].unique().tolist()
    risk_levels = st.sidebar.multiselect(
        "Risk Levels",
        options=available_risk_levels,
        default=available_risk_levels
    )
    df = df[df["risk_level"].isin(risk_levels)]

# Search filter (case-insensitive substring match on tweet content)
search_term = st.sidebar.text_input("Search Content", "")
if search_term:
    df = df[df["content"].str.lower().str.contains(search_term.lower(), na=False)]

# Display current filter status
st.sidebar.info(f"Showing {len(df)} tweets")
# ------------------------
# EXECUTIVE SUMMARY
# ------------------------
if analysis_type == "Summary":
    st.header("Summary")
    # Key metrics in columns (safe_* helpers tolerate missing columns)
    col1, col2, col3, col4, col5, col6 = st.columns(6)
    with col1:
        st.metric("Total Tweets", len(df))
    with col2:
        drug_related = safe_column_sum(df, "is_drug_related")
        st.metric("Drug Related", drug_related)
    with col3:
        crime_related = safe_column_sum(df, "is_crime_related")
        st.metric("Crime Related", crime_related)
    with col4:
        contact_info = safe_column_sum(df, "has_contact_info")
        st.metric("Contact Info", contact_info)
    with col5:
        st.metric("Unique Users", df["username"].nunique())
    with col6:
        # Mean of the per-tweet dynamic risk score computed at load time
        avg_risk = df["dynamic_risk_score"].mean() if "dynamic_risk_score" in df.columns else 0
        st.metric("Avg. Dynamic Risk Score", f"{avg_risk:.2f}")

    # Risk level analysis: banner alerts for the CRITICAL/HIGH buckets
    if "risk_level" in df.columns:
        critical_count = len(df[df["risk_level"] == "CRITICAL"])
        high_count = len(df[df["risk_level"] == "HIGH"])
        if critical_count > 0:
            st.markdown(f'<div class="critical-alert"><strong>CRITICAL ALERT:</strong> {critical_count} tweets require immediate attention</div>', unsafe_allow_html=True)
        if high_count > 0:
            st.markdown(f'<div class="high-priority"><strong>HIGH PRIORITY:</strong> {high_count} tweets for investigation</div>', unsafe_allow_html=True)

        # Risk distribution pie chart (fixed severity order so colors stay stable)
        col1, col2 = st.columns(2)
        with col1:
            risk_order = ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
            risk_dist = df["risk_level"].value_counts().reindex(risk_order).fillna(0)
            fig_risk = px.pie(values=risk_dist.values, names=risk_dist.index,
                              title="Risk Level Distribution",
                              color_discrete_map={
                                  "CRITICAL": "#dc3545",
                                  "HIGH": "#fd7e14",
                                  "MEDIUM": "#ffc107",
                                  "LOW": "#28a745"
                              })
            st.plotly_chart(fig_risk, use_container_width=True)
        with col2:
            # Sentiment analysis if available.
            # NOTE(review): assumes a compound score in [-1, 1] with
            # -0.1/0.1 neutrality thresholds — confirm against the scorer.
            if "sentiment_compound" in df.columns:
                sentiment_counts = pd.cut(df["sentiment_compound"],
                                          bins=[-1, -0.1, 0.1, 1],
                                          labels=["Negative", "Neutral", "Positive"]).value_counts()
                fig_sentiment = px.bar(x=sentiment_counts.index, y=sentiment_counts.values,
                                       title="Sentiment Distribution",
                                       color=sentiment_counts.values,
                                       color_continuous_scale="RdYlGn")
                st.plotly_chart(fig_sentiment, use_container_width=True)
            else:
                st.info("Sentiment data not available")

    # Analysis report summary (raw JSON sections from the scraper's report)
    if report_data:
        st.subheader("Analysis Report Summary")
        col1, col2 = st.columns(2)
        with col1:
            if "summary_statistics" in report_data:
                st.json(report_data["summary_statistics"])
        with col2:
            if "investigation_priorities" in report_data:
                st.json(report_data["investigation_priorities"])
| # ------------------------ | |
| # NEW: VOLUME TRENDS | |
| # ------------------------ | |
| elif analysis_type == "📊 Volume Trends": | |
| st.header("📊 Tweet Volume: Daily,Weekly and Hourly Trends") | |
| if "datetime" in df.columns and not df["datetime"].isna().all(): | |
| # Daily trend | |
| if "date" in df.columns: | |
| daily_counts = df.groupby("date").size().reset_index(name="count") | |
| fig_daily = px.line(daily_counts, x="date", y="count", | |
| title="Daily Tweet Volume") | |
| st.plotly_chart(fig_daily, use_container_width=True) | |
| # Hourly and weekday patterns | |
| col1, = st.columns(1) | |
| with col1: | |
| if "hour" in df.columns: | |
| hourly_counts = df.groupby("hour").size() | |
| fig_hourly = px.bar(x=hourly_counts.index, y=hourly_counts.values, | |
| title="Tweets by Hour of Day") | |
| st.plotly_chart(fig_hourly, use_container_width=True) | |
| # Weekly trends | |
| if "datetime" in df.columns: | |
| weekly_fig1, weekly_fig2 = create_weekly_trend_analysis(df) | |
| if weekly_fig1 and weekly_fig2: | |
| st.subheader("📅 Weekly Trends") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.plotly_chart(weekly_fig1, use_container_width=True) | |
| with col2: | |
| st.plotly_chart(weekly_fig2, use_container_width=True) | |
| else: | |
| st.info("Temporal data not available") | |
| # CSV Downloads | |
| st.subheader("📄 Download Data") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| if st.button("📥 Download Top Users CSV"): | |
| top_users = df.groupby("username").agg( | |
| tweet_count=("username", "count"), | |
| max_risk=("dynamic_risk_score", "max") | |
| ).sort_values("tweet_count", ascending=False).head(20).reset_index() | |
| csv = top_users.to_csv(index=False) | |
| st.download_button( | |
| "Download CSV", csv, "top_users.csv", "text/csv" | |
| ) | |
| with col2: | |
| if st.button("📥 Download Top Locations CSV"): | |
| if "user_location" in df.columns: | |
| top_locations = df.groupby("user_location").agg( | |
| tweet_count=("user_location", "count"), | |
| max_risk=("dynamic_risk_score", "max") | |
| ).sort_values("tweet_count", ascending=False).head(20).reset_index() | |
| csv = top_locations.to_csv(index=False) | |
| st.download_button( | |
| "Download CSV", csv, "top_locations.csv", "text/csv" | |
| ) | |
# ------------------------
# NEW: USER BEHAVIOR
# ------------------------
elif analysis_type == "🧠 User Behavior":
    st.header("🧠 User Behavior Analysis")
    # Top repeat users by tweet volume
    st.subheader("🧠 Top Repeat Users")
    user_activity = df["username"].value_counts().head(15)
    if not user_activity.empty:
        fig_users = px.bar(x=user_activity.values, y=user_activity.index,
                           orientation='h', title="Top 15 Most Active Users")
        # Reverse the y-axis so the most active user appears on top
        fig_users.update_layout(yaxis=dict(autorange="reversed"))
        st.plotly_chart(fig_users, use_container_width=True)
        # Show details of top users
        with st.expander("View Top User Details"):
            for username, count in user_activity.head(10).items():
                user_tweets = df[df["username"] == username]
                # Safe mode extraction: Series.mode() can return an empty Series
                if "risk_level" in user_tweets.columns and not user_tweets["risk_level"].empty:
                    risk_mode = user_tweets["risk_level"].mode()
                    risk_level = risk_mode.iloc[0] if len(risk_mode) > 0 else "Unknown"
                else:
                    risk_level = "Unknown"
                if "user_location" in user_tweets.columns and not user_tweets["user_location"].empty:
                    location_mode = user_tweets["user_location"].mode()
                    location = location_mode.iloc[0] if len(location_mode) > 0 else "Unknown"
                else:
                    location = "Unknown"
                st.write(f"**@{username}**: {count} tweets | Risk: {risk_level} | Location: {location}")

    # User engagement patterns (only when engagement metrics were scraped)
    if "like_count" in df.columns or "retweet_count" in df.columns:
        st.subheader("📊 User Engagement Patterns")
        col1, col2 = st.columns(2)
        with col1:
            if "like_count" in df.columns:
                avg_likes = df.groupby("username")["like_count"].mean().sort_values(ascending=False).head(15)
                fig_likes = px.bar(x=avg_likes.values, y=avg_likes.index,
                                   orientation='h', title="Users by Average Likes")
                fig_likes.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_likes, use_container_width=True)
        with col2:
            if "retweet_count" in df.columns:
                avg_retweets = df.groupby("username")["retweet_count"].mean().sort_values(ascending=False).head(15)
                fig_retweets = px.bar(x=avg_retweets.values, y=avg_retweets.index,
                                      orientation='h', title="Users by Average Retweets")
                fig_retweets.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_retweets, use_container_width=True)

    # User location vs risk overlap (empty-string locations excluded)
    if "user_location" in df.columns and "risk_level" in df.columns:
        st.subheader("📍 User Location vs Risk Analysis")
        location_risk = df.groupby(["user_location", "risk_level"]).size().reset_index(name="count")
        location_risk = location_risk[location_risk["user_location"] != ""]
        if not location_risk.empty:
            fig_loc_risk = px.bar(location_risk, x="user_location", y="count",
                                  color="risk_level", title="Risk Distribution by Location",
                                  color_discrete_map={
                                      "CRITICAL": "#dc3545",
                                      "HIGH": "#fd7e14",
                                      "MEDIUM": "#ffc107",
                                      "LOW": "#28a745"
                                  })
            fig_loc_risk.update_xaxes(tickangle=45)
            st.plotly_chart(fig_loc_risk, use_container_width=True)
# ------------------------
# NEW: HEATMAPS
# ------------------------
elif analysis_type == "📍 Heatmaps":
    st.header("📍 Time-Based Heatmaps")
    # -------------------
    # Day-Hour heatmap
    # -------------------
    if "day_of_week" in df.columns and "hour" in df.columns:
        # Force calendar order on the day axis rather than alphabetical
        day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
        df['day_of_week'] = pd.Categorical(df['day_of_week'], categories=day_order, ordered=True)
        st.subheader("🔥 Day vs Hour Activity Heatmap")
        heatmap_fig = create_heatmap_chart(df, "hour", "day_of_week", "Tweet Activity: Day vs Hour")
        if heatmap_fig:
            st.plotly_chart(heatmap_fig, use_container_width=True)

    # Risk level heatmap
    if "risk_level" in df.columns and "hour" in df.columns:
        st.subheader("⚠️ Risk Level vs Hour Heatmap")
        risk_heatmap = create_heatmap_chart(df, "hour", "risk_level", "Risk Level Distribution by Hour")
        if risk_heatmap:
            st.plotly_chart(risk_heatmap, use_container_width=True)

    # -------------------
    # Top Locations Heatmap
    # -------------------
    if "user_location" in df.columns and "hour" in df.columns:
        st.subheader("📍 Location vs Hour Heatmap (Top Locations)")
        # Sidebar slider controls how many locations the heatmap keeps
        TOP_N_LOCATIONS = st.sidebar.slider("Top N Locations for Heatmaps", 5, 30, 10)
        # Keep only the N most frequent locations by tweet volume
        top_locations = df["user_location"].value_counts().head(TOP_N_LOCATIONS).index
        df_top_loc = df[df["user_location"].isin(top_locations)]
        if not df_top_loc.empty:
            loc_heatmap = create_heatmap_chart(df_top_loc, "hour", "user_location",
                                               f"Top {TOP_N_LOCATIONS} Locations Activity by Hour")
            if loc_heatmap:
                st.plotly_chart(loc_heatmap, use_container_width=True)

    # Tweet location map (only when geographic coordinates are available)
    if "latitude" in df.columns and "longitude" in df.columns:
        st.subheader("🗺️ Geographic Tweet Distribution")
        valid_coords = df.dropna(subset=["latitude", "longitude"])
        if not valid_coords.empty:
            fig_map = px.scatter_mapbox(
                valid_coords, lat="latitude", lon="longitude",
                color="risk_level" if "risk_level" in df.columns else None,
                size_max=15, zoom=7,
                mapbox_style="open-street-map",
                title="Geographic Distribution of Tweets"
            )
            st.plotly_chart(fig_map, use_container_width=True)
        else:
            st.info("No geographic coordinates available for mapping")
# ------------------------
# NEW: RISK PATTERNS
# ------------------------
elif analysis_type == "⚠️ Risk Patterns":
    st.header("⚠️ Risk Patterns and High-Risk Analysis")
    # High-risk users analysis (scores via calculate_user_risk:
    # CRITICAL=2 points, HIGH=1 point per tweet)
    if "risk_level" in df.columns:
        st.subheader("🚨 High-Risk Users")
        user_risk_df = calculate_user_risk(df)
        high_risk_users = user_risk_df[user_risk_df["risk_score"] > 0].sort_values("risk_score", ascending=False).head(20)
        if not high_risk_users.empty:
            fig_risk_users = px.bar(high_risk_users, x="risk_score", y="username",
                                    orientation='h', color="tweet_count", color_continuous_scale="Reds")
            fig_risk_users.update_layout(yaxis=dict(autorange="reversed"))
            st.plotly_chart(fig_risk_users, use_container_width=True)
            # Optional: show per-user breakdown
            with st.expander("High-Risk User Details"):
                for _, row in high_risk_users.iterrows():
                    user_data = df[df["username"] == row["username"]]
                    critical_count = (user_data["risk_level"] == "CRITICAL").sum()
                    high_count = (user_data["risk_level"] == "HIGH").sum()
                    st.write(f"**@{row['username']}**: Risk Score: {row['risk_score']} | Critical: {critical_count} | High: {high_count} | Total Tweets: {row['tweet_count']}")

    # Risk overlap analysis: drug-only vs crime-only vs both
    if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
        st.subheader("🔄 Drug-Crime Overlap Analysis")
        # Assign categories; the combined condition runs last so rows flagged
        # both drug- and crime-related end up as "Drug + Crime"
        df_overlap = df.copy()
        df_overlap["category"] = "Other"
        df_overlap.loc[df_overlap["is_drug_related"] == 1, "category"] = "Drug Only"
        df_overlap.loc[df_overlap["is_crime_related"] == 1, "category"] = "Crime Only"
        df_overlap.loc[(df_overlap["is_drug_related"] == 1) & (df_overlap["is_crime_related"] == 1), "category"] = "Drug + Crime"
        overlap_counts = df_overlap["category"].value_counts()
        fig_overlap = px.pie(values=overlap_counts.values, names=overlap_counts.index,
                             title="Drug-Crime Content Overlap",
                             color_discrete_map={
                                 "Drug + Crime": "#dc3545",
                                 "Drug Only": "#fd7e14",
                                 "Crime Only": "#ffc107",
                                 "Other": "#28a745"
                             })
        st.plotly_chart(fig_overlap, use_container_width=True)
        # Show the users with the most combined drug+crime tweets
        high_overlap_users = df_overlap[df_overlap["category"] == "Drug + Crime"]["username"].value_counts().head(10)
        if not high_overlap_users.empty:
            st.write("**Users with most Drug+Crime tweets:**")
            for username, count in high_overlap_users.items():
                st.write(f"- @{username}: {count} tweets")

    # Risk progression over time (daily counts per risk level)
    if "datetime" in df.columns and "risk_level" in df.columns:
        st.subheader("📈 Risk Level Trends Over Time")
        # Daily aggregation on a string key so plotly treats dates discretely
        df["date_str"] = df["datetime"].dt.strftime("%Y-%m-%d")
        risk_time = df.groupby(["date_str", "risk_level"]).size().reset_index(name="count")
        fig_risk_time = px.line(risk_time, x="date_str", y="count", color="risk_level",
                                title="Risk Levels Trend Over Time",
                                color_discrete_map={
                                    "CRITICAL": "#dc3545",
                                    "HIGH": "#fd7e14",
                                    "MEDIUM": "#ffc107",
                                    "LOW": "#28a745"
                                })
        fig_risk_time.update_xaxes(tickangle=45)
        st.plotly_chart(fig_risk_time, use_container_width=True)
# ------------------------
# RISK ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Risk Analysis":
    st.header("Risk Analysis")
    # High-risk tweets (from the HIGH_PRIORITY companion export)
    if high_priority_df is not None and not high_priority_df.empty:
        st.subheader("High Priority Tweets")
        # One tab per severity bucket
        risk_tab1, risk_tab2 = st.tabs(["CRITICAL", "HIGH"])
        with risk_tab1:
            critical_tweets = high_priority_df[high_priority_df["risk_level"] == "CRITICAL"]
            if not critical_tweets.empty:
                for idx, tweet in critical_tweets.head(10).iterrows():
                    with st.expander(f"CRITICAL: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No critical risk tweets in current filter")
        with risk_tab2:
            high_tweets = high_priority_df[high_priority_df["risk_level"] == "HIGH"]
            if not high_tweets.empty:
                for idx, tweet in high_tweets.head(10).iterrows():
                    with st.expander(f"HIGH: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No high risk tweets in current filter")
    else:
        st.info("No high priority data available")

    # Risk score distribution histograms, side by side
    if "drug_score" in df.columns and "crime_score" in df.columns:
        fig_scores = make_subplots(rows=1, cols=2, subplot_titles=("Drug Score Distribution", "Crime Score Distribution"))
        fig_scores.add_trace(go.Histogram(x=df["drug_score"], name="Drug Score", nbinsx=20), row=1, col=1)
        fig_scores.add_trace(go.Histogram(x=df["crime_score"], name="Crime Score", nbinsx=20), row=1, col=2)
        fig_scores.update_layout(title="Risk Score Distributions")
        st.plotly_chart(fig_scores, use_container_width=True)
    else:
        st.info("Risk score data not available")
| # ------------------------ | |
| # Actionable Insights | |
| # ------------------------ | |
| elif analysis_type == "Actionable Insights": | |
| st.header("Actionable Insights") | |
| # Contact information tweets | |
| if contact_df is not None and not contact_df.empty: | |
| st.subheader("Tweets with Contact Information") | |
| st.markdown('<div class="warning-box">These tweets contain phone numbers or contact details - HIGH PRIORITY for investigation</div>', unsafe_allow_html=True) | |
| for idx, tweet in contact_df.head(20).iterrows(): | |
| with st.expander(f"Contact Info: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"): | |
| st.write(f"**Content:** {tweet['content']}") | |
| st.write(f"**Phone Numbers:** {tweet.get('phone_numbers', 'Not extracted')}") | |
| st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") | |
| st.write(f"**Risk Level:** {tweet.get('risk_level', 'Unknown')}") | |
| if 'tweet_url' in tweet: | |
| st.write(f"**URL:** {tweet['tweet_url']}") | |
| else: | |
| st.info("No tweets with contact information found") | |
| # Bulk operation indicators | |
| st.subheader("Bulk Operation Indicators") | |
| # Sidebar input | |
| BULK_KEYWORDS = st.sidebar.text_area("Bulk Operation Keywords (comma-separated)", | |
| "kg,gram,bulk,wholesale,kilos,ounce,pound").split(",") | |
| # In code | |
| bulk_pattern = "|".join([kw.strip() for kw in BULK_KEYWORDS]) | |
| bulk_regex = re.compile("|".join([kw.strip() for kw in BULK_KEYWORDS]), re.IGNORECASE) | |
| bulk_tweets = df[df["content"].str.contains(bulk_regex, na=False)] | |
| if not bulk_tweets.empty: | |
| st.write(f"Found {len(bulk_tweets)} tweets mentioning bulk quantities") | |
| for idx, tweet in bulk_tweets.head(10).iterrows(): | |
| with st.expander(f"Bulk: @{tweet['username']} - Risk: {tweet.get('risk_level', 'Unknown')}"): | |
| st.write(f"**Content:** {tweet['content']}") | |
| st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") | |
| if 'tweet_url' in tweet: | |
| st.write(f"**URL:** {tweet['tweet_url']}") | |
| else: | |
| st.info("No bulk operation indicators found") | |
| # High activity users | |
| st.subheader("High Activity Users") | |
| user_activity = df["username"].value_counts().head(15) | |
| if not user_activity.empty: | |
| fig_users = px.bar(x=user_activity.values, y=user_activity.index, | |
| orientation='h', title="Top 15 Most Active Users") | |
| fig_users.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_users, use_container_width=True) | |
| # ------------------------ | |
| # NEW: PREDICTIVE ANALYTICS | |
| # ------------------------ | |
| elif analysis_type == "📈 Predictive Analytics": | |
| st.header("📈 Predictive Analytics & Trends") | |
| st.subheader("📊 Activity Forecast") | |
| if "datetime" in df.columns and len(df) >= 7: | |
| # Daily activity trend | |
| daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count") | |
| daily_activity.columns = ["date", "count"] | |
| daily_activity["date"] = pd.to_datetime(daily_activity["date"]) | |
| # Calculate moving average | |
| daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean() | |
| daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean() | |
| # Create forecast visualization | |
| fig_forecast = go.Figure() | |
| fig_forecast.add_trace(go.Scatter( | |
| x=daily_activity["date"], | |
| y=daily_activity["count"], | |
| name="Actual Activity", | |
| mode="lines+markers", | |
| line=dict(color="#1f77b4") | |
| )) | |
| fig_forecast.add_trace(go.Scatter( | |
| x=daily_activity["date"], | |
| y=daily_activity["7_day_ma"], | |
| name="7-Day Moving Average", | |
| mode="lines", | |
| line=dict(color="#ff7f0e", dash="dash") | |
| )) | |
| fig_forecast.update_layout( | |
| title="Tweet Activity Trend & Forecast", | |
| xaxis_title="Date", | |
| yaxis_title="Number of Tweets", | |
| hovermode="x unified" | |
| ) | |
| st.plotly_chart(fig_forecast, use_container_width=True) | |
| # Trend analysis | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| recent_avg = daily_activity["count"].tail(7).mean() | |
| st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day") | |
| with col2: | |
| if len(daily_activity) >= 14: | |
| prev_avg = daily_activity["count"].tail(14).head(7).mean() | |
| change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0 | |
| st.metric("Week-over-Week Change", f"{change:+.1f}%") | |
| with col3: | |
| peak_day = daily_activity.loc[daily_activity["count"].idxmax()] | |
| st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d")) | |
| # User activity prediction | |
| st.subheader("👤 High-Risk User Patterns") | |
| if "username" in df.columns and "risk_level" in df.columns: | |
| user_risk_scores = df.groupby("username").agg({ | |
| "tweet_id": "count", | |
| "risk_level": lambda x: (x == "CRITICAL").sum() * 2 + (x == "HIGH").sum() | |
| }).reset_index() | |
| user_risk_scores.columns = ["username", "tweet_count", "risk_score"] | |
| # Identify escalating users | |
| escalating_users = user_risk_scores[ | |
| (user_risk_scores["risk_score"] > 0) & | |
| (user_risk_scores["tweet_count"] >= 3) | |
| ].sort_values("risk_score", ascending=False).head(15) | |
| if not escalating_users.empty: | |
| fig_escalating = px.scatter( | |
| escalating_users, | |
| x="tweet_count", | |
| y="risk_score", | |
| size="risk_score", | |
| hover_data=["username"], | |
| title="High-Risk User Activity Matrix", | |
| labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"} | |
| ) | |
| st.plotly_chart(fig_escalating, use_container_width=True) | |
| st.write("**Users to Monitor:**") | |
| for _, user in escalating_users.head(10).iterrows(): | |
| st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}") | |
| # ------------------------ | |
| # NEW: NETWORK ANALYSIS | |
| # ------------------------ | |
| elif analysis_type == "🌐 Network Analysis": | |
| st.header("🌐 Network Analysis") | |
| st.subheader("👥 User Connection Analysis") | |
| # Mentions network | |
| if "mentions" in df.columns: | |
| st.write("### User Mention Network") | |
| mention_pairs = [] | |
| for _, row in df.iterrows(): | |
| if pd.notna(row.get("mentions")) and row["mentions"]: | |
| mentions = str(row["mentions"]).split() | |
| for mention in mentions: | |
| mention_clean = mention.strip("@") | |
| if mention_clean: | |
| mention_pairs.append({ | |
| "from": row["username"], | |
| "to": mention_clean, | |
| "risk_level": row.get("risk_level", "UNKNOWN") | |
| }) | |
| if mention_pairs: | |
| mention_df = pd.DataFrame(mention_pairs) | |
| # Top mentioned users | |
| top_mentioned = mention_df["to"].value_counts().head(15) | |
| fig_mentioned = px.bar( | |
| x=top_mentioned.values, | |
| y=top_mentioned.index, | |
| orientation="h", | |
| title="Most Mentioned Users", | |
| labels={"x": "Times Mentioned", "y": "Username"} | |
| ) | |
| fig_mentioned.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_mentioned, use_container_width=True) | |
| # Connection strength | |
| connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions") | |
| strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False) | |
| if not strong_connections.empty: | |
| st.write("### 🔗 Strong Connections (2+ mentions)") | |
| for _, conn in strong_connections.head(20).iterrows(): | |
| st.write(f"- @{conn['from']} → @{conn['to']}: {conn['mentions']} times") | |
| else: | |
| st.info("No mention data available") | |
| # Location clustering | |
| st.subheader("📍 Location-Based Clustering") | |
| if "user_location" in df.columns: | |
| location_users = df.groupby("user_location").agg({ | |
| "username": lambda x: list(x.unique()), | |
| "tweet_id": "count", | |
| "risk_level": lambda x: (x == "CRITICAL").sum() if "risk_level" in df.columns else 0 | |
| }).reset_index() | |
| location_users.columns = ["location", "users", "tweet_count", "critical_count"] | |
| location_users = location_users[location_users["location"] != ""] | |
| location_users = location_users[location_users["tweet_count"] >= 3] | |
| location_users["user_count"] = location_users["users"].apply(len) | |
| if not location_users.empty: | |
| fig_clusters = px.scatter( | |
| location_users, | |
| x="tweet_count", | |
| y="user_count", | |
| size="critical_count", | |
| hover_data=["location"], | |
| title="Location Clusters (Activity vs Users)", | |
| labels={ | |
| "tweet_count": "Total Tweets", | |
| "user_count": "Unique Users", | |
| "critical_count": "Critical Tweets" | |
| } | |
| ) | |
| st.plotly_chart(fig_clusters, use_container_width=True) | |
| # High-density locations | |
| high_density = location_users.sort_values("user_count", ascending=False).head(10) | |
| st.write("### 🏙️ High-Density Locations") | |
| for _, loc in high_density.iterrows(): | |
| with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"): | |
| st.write(f"**Critical tweets:** {loc['critical_count']}") | |
| st.write(f"**Users:** {', '.join(['@' + u for u in loc['users'][:10]])}") | |
| if len(loc['users']) > 10: | |
| st.write(f"... and {len(loc['users']) - 10} more") | |
| # Co-occurrence analysis | |
| st.subheader("🔗 Keyword Co-occurrence") | |
| if "content" in df.columns: | |
| # Define drug/crime keywords | |
| drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"] | |
| crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"] | |
| cooccurrence = [] | |
| for _, row in df.iterrows(): | |
| content_lower = row["content"].lower() | |
| found_drug = [kw for kw in drug_keywords if kw in content_lower] | |
| found_crime = [kw for kw in crime_keywords if kw in content_lower] | |
| for drug in found_drug: | |
| for crime in found_crime: | |
| cooccurrence.append({"drug_keyword": drug, "crime_keyword": crime}) | |
| if cooccurrence: | |
| cooc_df = pd.DataFrame(cooccurrence) | |
| cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count") | |
| cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20) | |
| if not cooc_counts.empty: | |
| fig_cooc = px.bar( | |
| cooc_counts, | |
| x="count", | |
| y="drug_keyword", | |
| color="crime_keyword", | |
| title="Drug-Crime Keyword Co-occurrence", | |
| orientation="h" | |
| ) | |
| st.plotly_chart(fig_cooc, use_container_width=True) | |
| else: | |
| st.info("No significant keyword co-occurrences found") | |
| # Temporal clustering | |
| st.subheader("⏰ Temporal Activity Clusters") | |
| if "datetime" in df.columns and "username" in df.columns: | |
| df_copy = df.copy() | |
| df_copy["hour"] = df_copy["datetime"].dt.hour | |
| df_copy["day_of_week"] = df_copy["datetime"].dt.day_name() | |
| # Find users active at unusual hours (late night/early morning) | |
| unusual_hours = [0, 1, 2, 3, 4, 5] | |
| night_activity = df_copy[df_copy["hour"].isin(unusual_hours)] | |
| if len(night_activity) > 0: | |
| night_users = night_activity.groupby("username").size().reset_index(name="night_tweets") | |
| night_users = night_users[night_users["night_tweets"] >= 3].sort_values("night_tweets", ascending=False) | |
| if not night_users.empty: | |
| st.write(f"### 🌙 Users Active During Late Night (12 AM - 6 AM)") | |
| fig_night = px.bar( | |
| night_users.head(15), | |
| x="night_tweets", | |
| y="username", | |
| orientation="h", | |
| title="Top Users with Late Night Activity" | |
| ) | |
| fig_night.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_night, use_container_width=True) | |
| st.info("⚠️ Late night activity may indicate suspicious behavior patterns") | |
| # ------------------------ | |
| # GEOGRAPHIC ANALYSIS (Enhanced) | |
| # ------------------------ | |
| elif analysis_type == "Geographic Analysis": | |
| st.header("Geographic Analysis") | |
| # Location distribution | |
| locations = df["user_location"].value_counts().head(20) | |
| locations = locations[locations.index != ""] # Remove empty locations | |
| if not locations.empty: | |
| fig_locations = px.bar(x=locations.values, y=locations.index, | |
| orientation='h', title="Top 20 User Locations") | |
| fig_locations.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_locations, use_container_width=True) | |
| else: | |
| st.info("No location data available") | |
| # Karnataka relevance score distribution | |
| if "kar_score" in df.columns: | |
| fig_kar = px.histogram(df, x="kar_score", title="Karnataka Relevance Score Distribution") | |
| st.plotly_chart(fig_kar, use_container_width=True) | |
| # Location-based risk analysis | |
| if "risk_level" in df.columns and "user_location" in df.columns: | |
| location_risk = df.groupby("user_location").agg({ | |
| "risk_level": lambda x: (x == "HIGH").sum() + (x == "CRITICAL").sum() * 2, | |
| "username": "count" | |
| }).reset_index() | |
| location_risk = location_risk[location_risk["username"] >= 3] # Only locations with 3+ tweets | |
| location_risk = location_risk.sort_values("risk_level", ascending=False).head(15) | |
| if not location_risk.empty: | |
| fig_loc_risk = px.bar(location_risk, x="risk_level", y="user_location", | |
| orientation='h', title="High-Risk Locations (3+ tweets)") | |
| fig_loc_risk.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_loc_risk, use_container_width=True) | |
| # ------------------------ | |
| # USER ANALYSIS (Enhanced) | |
| # ------------------------ | |
| elif analysis_type == "User Analysis": | |
| st.header("User Analysis") | |
| # User metrics | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Unique Users", df["username"].nunique()) | |
| with col2: | |
| verified_count = safe_column_sum(df, "user_verified") | |
| st.metric("Verified Users", verified_count) | |
| with col3: | |
| avg_followers = safe_column_mean(df, "user_followers") | |
| st.metric("Avg Followers", f"{avg_followers:,.0f}") | |
| # Top users by followers | |
| if "user_followers" in df.columns: | |
| top_followers = df.nlargest(15, "user_followers")[["username", "user_followers"]] | |
| if "user_verified" in df.columns: | |
| top_followers = df.nlargest(15, "user_followers")[["username", "user_followers", "user_verified"]] | |
| fig_followers = px.bar(top_followers, x="user_followers", y="username", | |
| color="user_verified" if "user_verified" in top_followers.columns else None, | |
| orientation='h', title="Users with Most Followers") | |
| fig_followers.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_followers, use_container_width=True) | |
| # User engagement vs risk (fixed aggregation) | |
| if "risk_level" in df.columns: | |
| user_metrics = [] | |
| for username in df["username"].unique(): | |
| user_data = df[df["username"] == username] | |
| risk_score = (user_data["risk_level"] == "HIGH").sum() + (user_data["risk_level"] == "CRITICAL").sum() * 2 | |
| user_metrics.append({ | |
| "username": username, | |
| "risk_score": risk_score, | |
| "avg_likes": safe_column_mean(user_data, "like_count"), | |
| "avg_retweets": safe_column_mean(user_data, "retweet_count"), | |
| "tweet_count": len(user_data) | |
| }) | |
| user_risk_df = pd.DataFrame(user_metrics) | |
| multi_tweet_users = user_risk_df[user_risk_df["tweet_count"] >= 3] | |
| if not multi_tweet_users.empty: | |
| fig_user_risk = px.scatter(multi_tweet_users, x="avg_likes", y="risk_score", | |
| size="tweet_count", hover_data=["username"], | |
| title="User Risk vs Engagement (3+ tweets)") | |
| st.plotly_chart(fig_user_risk, use_container_width=True) | |
| # ------------------------ | |
| # CONTENT ANALYSIS (Enhanced) | |
| # ------------------------ | |
| elif analysis_type == "Content Analysis": | |
| st.header("Content Analysis") | |
| # Hashtag analysis | |
| if "hashtags" in df.columns: | |
| all_hashtags = df["hashtags"].dropna().str.split().explode() | |
| hashtag_counts = all_hashtags.value_counts().head(20) | |
| if not hashtag_counts.empty: | |
| fig_hashtags = px.bar(x=hashtag_counts.values, y=hashtag_counts.index, | |
| orientation='h', title="Top 20 Hashtags") | |
| fig_hashtags.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_hashtags, use_container_width=True) | |
| # Sentiment vs Risk correlation | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| if "sentiment_compound" in df.columns and "risk_level" in df.columns: | |
| fig_sentiment_risk = px.box(df, x="risk_level", y="sentiment_compound", | |
| title="Sentiment by Risk Level") | |
| st.plotly_chart(fig_sentiment_risk, use_container_width=True) | |
| else: | |
| st.info("Sentiment analysis data not available") | |
| with col2: | |
| if "drug_score" in df.columns and "crime_score" in df.columns: | |
| # Drug score vs Crime score correlation | |
| fig_scores_corr = px.scatter(df, x="drug_score", y="crime_score", | |
| color="risk_level" if "risk_level" in df.columns else None, | |
| title="Drug Score vs Crime Score", | |
| color_discrete_map={ | |
| "CRITICAL": "#dc3545", | |
| "HIGH": "#fd7e14", | |
| "MEDIUM": "#ffc107", | |
| "LOW": "#28a745" | |
| }) | |
| st.plotly_chart(fig_scores_corr, use_container_width=True) | |
| else: | |
| st.info("Score correlation data not available") | |
| # Content length analysis | |
| if "content" in df.columns: | |
| df_copy = df.copy() | |
| df_copy["content_length"] = df_copy["content"].str.len() | |
| if "risk_level" in df.columns: | |
| fig_length = px.histogram(df_copy, x="content_length", color="risk_level", | |
| title="Tweet Length Distribution by Risk Level", | |
| color_discrete_map={ | |
| "CRITICAL": "#dc3545", | |
| "HIGH": "#fd7e14", | |
| "MEDIUM": "#ffc107", | |
| "LOW": "#28a745" | |
| }) | |
| else: | |
| fig_length = px.histogram(df_copy, x="content_length", title="Tweet Length Distribution") | |
| st.plotly_chart(fig_length, use_container_width=True) | |
| # Word frequency analysis | |
| if "content" in df.columns: | |
| st.subheader("Content Word Analysis") | |
| filtered_words = get_filtered_words(df["content"]) | |
| if filtered_words: | |
| word_freq = pd.Series(filtered_words).value_counts().head(30) | |
| fig_words = px.bar(x=word_freq.values, y=word_freq.index, | |
| orientation='h', title="Top 30 Most Frequent Words") | |
| fig_words.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_words, use_container_width=True) | |
| else: | |
| st.info("No content words available after filtering") | |
| # ------------------------ | |
| # Footer with Data Information & Export | |
| # ------------------------ | |
| st.markdown("---") | |
| # Data summary footer | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.info(f"Showing {len(df)} tweets") | |
| with col2: | |
| if "risk_level" in df.columns: | |
| high_risk_count = len(df[df["risk_level"].isin(["HIGH", "CRITICAL"])]) | |
| st.info(f"High Risk: {high_risk_count} tweets") | |
| else: | |
| st.info("Risk Level: Not available") | |
# Enhanced export functionality.
# Bug fix: the old code nested st.download_button inside an st.button
# branch; since st.button is only True for the single rerun in which it was
# clicked, the download link vanished before it could be used. Render the
# download buttons directly instead.
st.sidebar.header("Data Export")
# Export current filtered data.
_export_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
st.sidebar.download_button(
    label="Download Current View (CSV)",
    data=df.to_csv(index=False),
    file_name=f"drug_crime_analysis_{analysis_type.lower().replace(' ', '_')}_{_export_ts}.csv",
    mime="text/csv"
)
# Export summary report (only when one was generated).
if report_data:
    st.sidebar.download_button(
        label="Download Analysis Report (JSON)",
        data=json.dumps(report_data, indent=2, default=str),
        file_name=f"analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        mime="application/json"
    )
# Quick stats in sidebar.
if len(df) > 0:
    st.sidebar.subheader("Quick Stats")
    # Per-level counts with percentage of the filtered view.
    if "risk_level" in df.columns:
        risk_counts = df["risk_level"].value_counts()
        for risk, count in risk_counts.items():
            percentage = (count / len(df)) * 100
            st.sidebar.text(f"{risk}: {count} ({percentage:.1f}%)")
    # Top location (skipped when the most common value is blank).
    if "user_location" in df.columns:
        top_location = df["user_location"].value_counts().head(1)
        if not top_location.empty and top_location.index[0] != "":
            st.sidebar.text(f"Top Location: {top_location.index[0]} ({top_location.iloc[0]})")
    # Date range. Bug fix: the old bare `except:` also swallowed
    # SystemExit/KeyboardInterrupt; catch only the plausible failures
    # from non-datetime values.
    if "datetime" in df.columns and not df["datetime"].isna().all():
        try:
            days_span = (df["datetime"].max() - df["datetime"].min()).days
            st.sidebar.text(f"Data Span: {days_span} days")
        except (TypeError, AttributeError):
            pass  # mixed/non-datetime values: silently omit the span
# Debug information (collapsible).
with st.sidebar.expander("Debug Info"):
    st.write("Available columns:")
    st.write(list(df.columns))
    st.write(f"DataFrame shape: {df.shape}")
    st.write(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    st.write("Report data available: Yes" if report_data else "Report data available: No")
    if high_priority_df is None:
        st.write("High priority tweets: Not available")
    else:
        st.write(f"High priority tweets: {len(high_priority_df)}")
    if contact_df is None:
        st.write("Contact info tweets: Not available")
    else:
        st.write(f"Contact info tweets: {len(contact_df)}")
# Footer: static branding plus a last-updated timestamp.
st.markdown("---")
_updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.markdown(
    f"""
    <div style='text-align: center; color: #666; padding: 20px;'>
    <p><strong>Twitter Drug Crime Monitoring Dashboard</strong></p>
    <p><em>Dashboard last updated: {_updated_at}</em></p>
    </div>
    """,
    unsafe_allow_html=True
)