# Twitter-Analysis / src/streamlit_app.py
# lawlevisan — commit "Update src/streamlit_app.py" (7f30852, verified)
#modify_app.py
import streamlit as st
import pandas as pd
import os
import json
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import nltk
from nltk.corpus import stopwords # ✅ import first
# Ensure stopwords data is downloaded
try:
    stopwords.words('english')
except LookupError:
    # First run on a fresh machine: fetch the corpus once.
    nltk.download('stopwords')
# Now you can safely use it
english_stopwords = stopwords.words('english')
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
from alerts import compute_dynamic_risk,assign_dynamic_risk_level
from evaluation import evaluate_model
# Run evaluation on the scraped CSV folder
# NOTE(review): this runs at import time, i.e. on every Streamlit rerun —
# consider caching or gating it behind a button. TODO confirm it is cheap.
evaluate_model("drug_analysis_data_3months")
import re
st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide")
# Custom CSS for better styling
st.markdown("""
<style>
.main-header {
background: linear-gradient(90deg, #1e3c72, #2a5298);
color: white;
padding: 1rem;
border-radius: 10px;
text-align: center;
margin-bottom: 2rem;
}
.metric-card {
background: #f8f9fa;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #007bff;
}
.critical-alert {
background: #f8d7da;
border: 1px solid #f5c6cb;
color: #721c24;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
.high-priority {
background: #fff3cd;
border: 1px solid #ffeaa7;
color: #856404;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
.warning-box {
background: #d4edda;
border: 1px solid #c3e6cb;
color: #155724;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)
# Configuration
DASHBOARD_CONFIG = {
'data_dirs': ['drug_analysis_data_3months', 'data', 'output', '.'],
'refresh_interval': 30,
'max_display_tweets': 50,
'chart_height': 400
}
# Main header
st.markdown('<div class="main-header"><h1>Twitter Drug Crime Monitoring Dashboard</h1><p>Real-time Twitter Analysis for Drug Crime Detection</p></div>', unsafe_allow_html=True)
# ------------------------
# Enhanced Data Loading Functions
# ------------------------
def parse_dates_flexible(df):
    """Parse the ``datetime`` column, trying several known string formats.

    Each explicit format is tried in turn (mismatches coerce to NaT); if none
    matches, pandas' generic parser is used as a fallback; any remaining NaT
    is filled with the current timestamp so downstream ``.dt`` accessors are
    always safe.

    Args:
        df: DataFrame that may contain a ``datetime`` column of strings.

    Returns:
        The same DataFrame with ``datetime`` converted to pandas datetimes,
        or unchanged when the column is absent.
    """
    if "datetime" not in df.columns:
        return df
    date_formats = [
        "%d-%m-%Y %H:%M:%S",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M:%S"
    ]
    original_datetime = df["datetime"].copy()
    for fmt in date_formats:
        try:
            df["datetime"] = pd.to_datetime(original_datetime, format=fmt, errors="coerce")
            # Accept the first format that parses at least one value.
            if not df["datetime"].isna().all():
                break
        except (ValueError, TypeError):
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to parsing errors.
            continue
    # If every explicit format failed, let pandas infer the format.
    if df["datetime"].isna().all():
        df["datetime"] = pd.to_datetime(original_datetime, errors="coerce")
    # Fill any remaining NaT values with "now" so .dt operations never fail.
    df["datetime"] = df["datetime"].fillna(pd.Timestamp.now())
    return df
def validate_dataframe(df):
    """Check that *df* is usable by the dashboard.

    Returns:
        (bool, str): validity flag plus a human-readable explanation.
    """
    if df is None or df.empty:
        return False, "DataFrame is empty"
    required_columns = ['username', 'content']
    missing_columns = []
    for col in required_columns:
        if col not in df.columns:
            missing_columns.append(col)
    if not missing_columns:
        return True, "DataFrame is valid"
    return False, f"Missing required columns: {missing_columns}"
@st.cache_data
def load_data():
    """Load the most recent data with robust error handling.

    Walks DASHBOARD_CONFIG['data_dirs'] in order; in each directory, picks
    the newest (by creation time) CSV whose name matches known drug-related
    keywords, falling back to any CSV. Also loads the newest matching
    ANALYSIS_REPORT_*.json, if present.

    Returns:
        (df, report_data): the parsed DataFrame and report dict, or
        (None, None) when nothing loadable is found.
    """
    start_time = time.time()
    for data_dir in DASHBOARD_CONFIG['data_dirs']:
        if not os.path.exists(data_dir):
            continue
        try:
            # Look for main dataset files with flexible naming
            csv_files = []
            for f in os.listdir(data_dir):
                if f.endswith(".csv") and any(keyword in f.lower() for keyword in
                        ["karnataka_drug_tweets", "drug_tweets", "drug_analysis", "drug_crime"]):
                    csv_files.append(f)
            if not csv_files:
                # Fallback to any CSV file
                csv_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]
            if not csv_files:
                continue
            # Get the most recent file (by filesystem creation time)
            latest_file = max(csv_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
            file_path = os.path.join(data_dir, latest_file)
            # Load with error handling
            df = pd.read_csv(file_path, encoding='utf-8')
            if df.empty:
                continue
            # Enhanced date parsing
            df = parse_dates_flexible(df)
            # Add derived columns if missing (date/hour/day_of_week/day)
            if "datetime" in df.columns:
                if "date" not in df.columns:
                    df["date"] = df["datetime"].dt.date
                if "hour" not in df.columns:
                    df["hour"] = df["datetime"].dt.hour
                if "day_of_week" not in df.columns:
                    df["day_of_week"] = df["datetime"].dt.day_name()
                if "day" not in df.columns:
                    df["day"] = df["datetime"].dt.day
            # Load report if available
            report_files = [f for f in os.listdir(data_dir)
                            if f.startswith("ANALYSIS_REPORT_") and f.endswith(".json")]
            report_data = None
            if report_files:
                latest_report = max(report_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
                try:
                    with open(os.path.join(data_dir, latest_report), 'r', encoding='utf-8') as f:
                        report_data = json.load(f)
                except Exception as e:
                    # Report is optional — warn and continue without it.
                    st.sidebar.warning(f"Could not load report: {e}")
                    report_data = None
            load_time = time.time() - start_time
            # Display load metrics in sidebar
            st.sidebar.success(f"Data loaded successfully")
            st.sidebar.metric("Load Time", f"{load_time:.2f}s")
            st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
            st.sidebar.info(f"Source: {latest_file}")
            return df, report_data
        except Exception as e:
            # A bad directory/file shouldn't kill the search — try the next dir.
            st.sidebar.warning(f"Failed to load from {data_dir}: {str(e)}")
            continue
    return None, None
@st.cache_data
def load_priority_data():
    """Load high priority and contact info datasets with fallbacks.

    Searches only the primary data directory for the newest
    ``*HIGH_PRIORITY*.csv`` and ``*CONTACT_INFO*.csv`` files.

    Returns:
        (high_priority_df, contact_df): either element may be None when the
        corresponding file is missing or unreadable.
    """
    data_dir = DASHBOARD_CONFIG['data_dirs'][0] # Primary data directory
    if not os.path.exists(data_dir):
        return None, None
    high_priority_df = None
    contact_df = None
    try:
        # Load high priority tweets (newest file by creation time wins)
        high_priority_files = [f for f in os.listdir(data_dir)
                               if "HIGH_PRIORITY" in f and f.endswith(".csv")]
        if high_priority_files:
            latest_priority = max(high_priority_files,
                                  key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
            high_priority_df = pd.read_csv(os.path.join(data_dir, latest_priority))
            high_priority_df = parse_dates_flexible(high_priority_df)
    except Exception as e:
        # Best-effort: surface the problem in the sidebar and keep going.
        st.sidebar.warning(f"Could not load high priority data: {e}")
    try:
        # Load contact info tweets
        contact_files = [f for f in os.listdir(data_dir)
                         if "CONTACT_INFO" in f and f.endswith(".csv")]
        if contact_files:
            latest_contact = max(contact_files,
                                 key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
            contact_df = pd.read_csv(os.path.join(data_dir, latest_contact))
            contact_df = parse_dates_flexible(contact_df)
    except Exception as e:
        st.sidebar.warning(f"Could not load contact info data: {e}")
    return high_priority_df, contact_df
def safe_column_access(df, column, default=0):
    """Return ``df[column]``, or a constant Series of *default* when absent."""
    if column not in df.columns:
        # Fabricate a stand-in Series aligned with df's index.
        return pd.Series([default] * len(df), index=df.index)
    return df[column]
def safe_column_sum(df, column):
    """Sum *column* in *df*, returning 0 when the column is absent."""
    return df[column].sum() if column in df.columns else 0
def safe_column_mean(df, column):
    """Mean of *column* in *df*; 0 when the column is absent or *df* is empty."""
    if column not in df.columns or len(df) == 0:
        return 0
    return df[column].mean()
# ----------------- Helper: Calculate User Risk -----------------
def calculate_user_risk(df):
    """
    Calculate risk score per user:
    CRITICAL = 2 points, HIGH = 1 point

    Vectorized with a single groupby instead of the previous per-user
    filtering loop, which re-scanned the whole frame once per unique user.

    Returns:
        DataFrame with columns username, risk_score, tweet_count
        (empty DataFrame when required columns are missing or df is empty).
    """
    if "username" not in df.columns or "risk_level" not in df.columns:
        return pd.DataFrame()
    if df.empty:
        return pd.DataFrame()
    # Map risk labels to points; anything else (MEDIUM/LOW/NaN) scores 0.
    points = df["risk_level"].map({"CRITICAL": 2, "HIGH": 1}).fillna(0).astype(int)
    summary = (
        df.assign(_risk_points=points)
        .groupby("username")
        .agg(risk_score=("_risk_points", "sum"), tweet_count=("_risk_points", "size"))
        .reset_index()
    )
    return summary[["username", "risk_score", "tweet_count"]]
# ----------------- Helper: Filter Words -----------------
def get_filtered_words(text_series):
    """
    Returns filtered words from a Series of text,
    removing English stopwords and words <=2 characters.

    Returns:
        list[str]: lowercase word tokens across all rows.
    """
    # Reuse the module-level stopword list instead of re-reading the NLTK
    # corpus from disk on every call (was: stopwords.words('english') here).
    stop_words_set = set(english_stopwords)
    all_text = " ".join(text_series.astype(str))
    words = re.findall(r'\b\w+\b', all_text.lower())
    return [w for w in words if w not in stop_words_set and len(w) > 2]
def create_heatmap_chart(df, x_col, y_col, title="Heatmap"):
    """Build a count heatmap of *y_col* vs *x_col*.

    Returns:
        A plotly Figure, or None when either column is missing.
    """
    if x_col not in df.columns or y_col not in df.columns:
        return None
    # Cross-tabulate occurrence counts; absent combinations become 0.
    counts = df.groupby([x_col, y_col]).size().reset_index(name='count')
    grid = counts.pivot(index=y_col, columns=x_col, values='count').fillna(0)
    figure = go.Figure(data=go.Heatmap(
        z=grid.values,
        x=grid.columns,
        y=grid.index,
        colorscale='Blues',
        hoverongaps=False
    ))
    figure.update_layout(
        title=title,
        xaxis_title=x_col,
        yaxis_title=y_col,
        height=400
    )
    return figure
def create_weekly_trend_analysis(df):
    """Build weekly-trend figures from the ``datetime`` column.

    Returns:
        (fig_weekly, fig_weekday): a line chart of tweets per ISO week and a
        bar chart of tweets per weekday, or ``(None, None)`` when the
        ``datetime`` column is missing.
    """
    if "datetime" not in df.columns:
        return None, None
    # BUGFIX: work on a copy so the caller's frame is not polluted with the
    # temporary 'week'/'weekday' helper columns (the original mutated df).
    work = df.copy()
    work['week'] = work['datetime'].dt.isocalendar().week
    work['weekday'] = work['datetime'].dt.day_name()
    weekly_counts = work.groupby('week').size().reset_index(name='count')
    weekday_counts = work.groupby('weekday').size().reset_index(name='count')
    # Force chronological weekday order instead of alphabetical.
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts['weekday'] = pd.Categorical(weekday_counts['weekday'], categories=weekday_order, ordered=True)
    weekday_counts = weekday_counts.sort_values('weekday')
    fig1 = px.line(weekly_counts, x='week', y='count', title="Weekly Tweet Trends")
    fig2 = px.bar(weekday_counts, x='weekday', y='count', title="Tweets by Weekday")
    return fig1, fig2
# ------------------------
# Load Data
# ------------------------
df, report_data = load_data()
# --- Compute dynamic risk for all tweets ---
if df is not None and not df.empty:
    # Re-import is redundant (already imported at the top) but harmless.
    from alerts import compute_dynamic_risk, assign_dynamic_risk_level
    # Add dynamic risk fields (row-wise apply — a Python-level loop, fine
    # for modest data volumes).
    df['dynamic_risk_score'] = df.apply(lambda row: compute_dynamic_risk(row.to_dict()), axis=1)
    df['risk_level'] = df.apply(lambda row: assign_dynamic_risk_level(row.to_dict()), axis=1)
if df is None:
    st.error("No data found. Please run the drug crime scraper first.")
    # Enhanced debug information to help locate the expected CSV files.
    st.subheader("Debug Information")
    current_dir = os.getcwd()
    st.write(f"Current directory: {current_dir}")
    for dir_name in DASHBOARD_CONFIG['data_dirs']:
        if os.path.exists(dir_name):
            files = [f for f in os.listdir(dir_name) if f.endswith('.csv')]
            st.write(f"CSV files in {dir_name}: {files}")
        else:
            st.write(f"Directory {dir_name} does not exist")
    st.info("Expected files: karnataka_drug_tweets_*.csv or similar drug-related CSV files")
    st.stop()
# Validate dataframe (must have username/content columns)
is_valid, validation_message = validate_dataframe(df)
if not is_valid:
    st.error(f"Data validation failed: {validation_message}")
    st.write("Available columns:", list(df.columns))
    st.stop()
# Load priority data
high_priority_df, contact_df = load_priority_data()
# Filter for current month data for some analyses
now = datetime.now()
if "datetime" in df.columns:
    # df_month: current-calendar-month slice kept for recency analyses.
    df_month = df[(df['datetime'].dt.month == now.month) & (df['datetime'].dt.year == now.year)]
else:
    df_month = df
# ------------------------
# Sidebar Navigation & Filters
# ------------------------
st.sidebar.title("Dashboard Navigation")
# Auto-refresh option (re-runs the script every 30s while enabled)
auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)")
from streamlit_autorefresh import st_autorefresh
if auto_refresh:
    st_autorefresh(interval=30*1000, key="refresh")
# Navigation tabs - ENHANCED with new options
analysis_type = st.sidebar.radio(
    "Select Analysis View",
    ["Summary", "Risk Analysis", "Actionable Insights", "📈 Predictive Analytics", "🌐 Network Analysis",
     "Geographic Analysis", "User Analysis",
     "Content Analysis", "📊 Volume Trends", "🧠 User Behavior",
     "📍 Heatmaps", "⚠️ Risk Patterns"]
)
# Common filters
st.sidebar.header("Data Filters")
# Date range filter
if "datetime" in df.columns and not df["datetime"].isna().all():
    try:
        min_date = df["datetime"].min().date()
        max_date = df["datetime"].max().date()
        date_range = st.sidebar.date_input(
            "Select Date Range",
            value=[min_date, max_date],
            min_value=min_date,
            max_value=max_date
        )
        # Filter dataframe by date range (inclusive on both ends).
        # BUGFIX: removed the extra `dt.year == date_range[0].year` clause —
        # it silently dropped every tweet outside the start year whenever the
        # selected range crossed a year boundary.
        if len(date_range) == 2:
            df = df[
                (df["datetime"].dt.date >= date_range[0]) &
                (df["datetime"].dt.date <= date_range[1])
            ]
    except Exception as e:
        st.sidebar.warning(f"Date filtering error: {e}")
# Risk level filter
if "risk_level" in df.columns:
    available_risk_levels = df["risk_level"].unique().tolist()
    risk_levels = st.sidebar.multiselect(
        "Risk Levels",
        options=available_risk_levels,
        default=available_risk_levels
    )
    df = df[df["risk_level"].isin(risk_levels)]
# Search filter: case-insensitive literal substring match.
# BUGFIX: regex=False so user input containing regex metacharacters
# (e.g. "(" or "+") is matched literally instead of raising/misfiring.
search_term = st.sidebar.text_input("Search Content", "")
if search_term:
    df = df[df["content"].str.lower().str.contains(search_term.lower(), na=False, regex=False)]
# Display current filter status
st.sidebar.info(f"Showing {len(df)} tweets")
# ------------------------
# EXECUTIVE SUMMARY
# ------------------------
if analysis_type == "Summary":
    st.header("Summary")
    # Key metrics in columns (safe_column_sum tolerates missing columns)
    col1, col2, col3, col4, col5, col6 = st.columns(6)
    with col1:
        st.metric("Total Tweets", len(df))
    with col2:
        drug_related = safe_column_sum(df, "is_drug_related")
        st.metric("Drug Related", drug_related)
    with col3:
        crime_related = safe_column_sum(df, "is_crime_related")
        st.metric("Crime Related", crime_related)
    with col4:
        contact_info = safe_column_sum(df, "has_contact_info")
        st.metric("Contact Info", contact_info)
    with col5:
        st.metric("Unique Users", df["username"].nunique())
    with col6: # Or create a new column if needed
        avg_risk = df["dynamic_risk_score"].mean() if "dynamic_risk_score" in df.columns else 0
        st.metric("Avg. Dynamic Risk Score", f"{avg_risk:.2f}")
    # Risk level analysis — alert banners plus distribution charts
    if "risk_level" in df.columns:
        critical_count = len(df[df["risk_level"] == "CRITICAL"])
        high_count = len(df[df["risk_level"] == "HIGH"])
        if critical_count > 0:
            st.markdown(f'<div class="critical-alert"><strong>CRITICAL ALERT:</strong> {critical_count} tweets require immediate attention</div>', unsafe_allow_html=True)
        if high_count > 0:
            st.markdown(f'<div class="high-priority"><strong>HIGH PRIORITY:</strong> {high_count} tweets for investigation</div>', unsafe_allow_html=True)
        # Risk distribution pie chart
        col1, col2 = st.columns(2)
        with col1:
            # Fixed severity order so absent levels still show as 0.
            risk_order = ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
            risk_dist = df["risk_level"].value_counts().reindex(risk_order).fillna(0)
            fig_risk = px.pie(values=risk_dist.values, names=risk_dist.index,
                              title="Risk Level Distribution",
                              color_discrete_map={
                                  "CRITICAL": "#dc3545",
                                  "HIGH": "#fd7e14",
                                  "MEDIUM": "#ffc107",
                                  "LOW": "#28a745"
                              })
            st.plotly_chart(fig_risk, use_container_width=True)
        with col2:
            # Sentiment analysis if available (VADER-style compound in [-1, 1])
            if "sentiment_compound" in df.columns:
                sentiment_counts = pd.cut(df["sentiment_compound"],
                                          bins=[-1, -0.1, 0.1, 1],
                                          labels=["Negative", "Neutral", "Positive"]).value_counts()
                fig_sentiment = px.bar(x=sentiment_counts.index, y=sentiment_counts.values,
                                       title="Sentiment Distribution",
                                       color=sentiment_counts.values,
                                       color_continuous_scale="RdYlGn")
                st.plotly_chart(fig_sentiment, use_container_width=True)
            else:
                st.info("Sentiment data not available")
    # Analysis report summary (JSON sections from ANALYSIS_REPORT_*.json)
    if report_data:
        st.subheader("Analysis Report Summary")
        col1, col2 = st.columns(2)
        with col1:
            if "summary_statistics" in report_data:
                st.json(report_data["summary_statistics"])
        with col2:
            if "investigation_priorities" in report_data:
                st.json(report_data["investigation_priorities"])
# ------------------------
# NEW: VOLUME TRENDS
# ------------------------
elif analysis_type == "📊 Volume Trends":
    st.header("📊 Tweet Volume: Daily,Weekly and Hourly Trends")
    if "datetime" in df.columns and not df["datetime"].isna().all():
        # Daily trend
        if "date" in df.columns:
            daily_counts = df.groupby("date").size().reset_index(name="count")
            fig_daily = px.line(daily_counts, x="date", y="count",
                                title="Daily Tweet Volume")
            st.plotly_chart(fig_daily, use_container_width=True)
        # Hourly and weekday patterns
        col1, = st.columns(1)
        with col1:
            if "hour" in df.columns:
                hourly_counts = df.groupby("hour").size()
                fig_hourly = px.bar(x=hourly_counts.index, y=hourly_counts.values,
                                    title="Tweets by Hour of Day")
                st.plotly_chart(fig_hourly, use_container_width=True)
        # Weekly trends (inner check is redundant — outer branch already
        # guarantees the column exists)
        if "datetime" in df.columns:
            weekly_fig1, weekly_fig2 = create_weekly_trend_analysis(df)
            if weekly_fig1 and weekly_fig2:
                st.subheader("📅 Weekly Trends")
                col1, col2 = st.columns(2)
                with col1:
                    st.plotly_chart(weekly_fig1, use_container_width=True)
                with col2:
                    st.plotly_chart(weekly_fig2, use_container_width=True)
    else:
        st.info("Temporal data not available")
    # CSV Downloads
    st.subheader("📄 Download Data")
    col1, col2 = st.columns(2)
    with col1:
        # NOTE(review): a download_button nested inside a button click only
        # survives one rerun; consider rendering download_button directly.
        if st.button("📥 Download Top Users CSV"):
            top_users = df.groupby("username").agg(
                tweet_count=("username", "count"),
                max_risk=("dynamic_risk_score", "max")
            ).sort_values("tweet_count", ascending=False).head(20).reset_index()
            csv = top_users.to_csv(index=False)
            st.download_button(
                "Download CSV", csv, "top_users.csv", "text/csv"
            )
    with col2:
        if st.button("📥 Download Top Locations CSV"):
            if "user_location" in df.columns:
                top_locations = df.groupby("user_location").agg(
                    tweet_count=("user_location", "count"),
                    max_risk=("dynamic_risk_score", "max")
                ).sort_values("tweet_count", ascending=False).head(20).reset_index()
                csv = top_locations.to_csv(index=False)
                st.download_button(
                    "Download CSV", csv, "top_locations.csv", "text/csv"
                )
# ------------------------
# NEW: USER BEHAVIOR
# ------------------------
elif analysis_type == "🧠 User Behavior":
    st.header("🧠 User Behavior Analysis")
    # Top repeat users (most active accounts by tweet count)
    st.subheader("🧠 Top Repeat Users")
    user_activity = df["username"].value_counts().head(15)
    if not user_activity.empty:
        fig_users = px.bar(x=user_activity.values, y=user_activity.index,
                           orientation='h', title="Top 15 Most Active Users")
        fig_users.update_layout(yaxis=dict(autorange="reversed"))
        st.plotly_chart(fig_users, use_container_width=True)
        # Show details of top users
        with st.expander("View Top User Details"):
            for username, count in user_activity.head(10).items():
                user_tweets = df[df["username"] == username]
                # Safe mode extraction with proper error handling
                # (mode() can return an empty Series when all values are NaN)
                if "risk_level" in user_tweets.columns and not user_tweets["risk_level"].empty:
                    risk_mode = user_tweets["risk_level"].mode()
                    risk_level = risk_mode.iloc[0] if len(risk_mode) > 0 else "Unknown"
                else:
                    risk_level = "Unknown"
                if "user_location" in user_tweets.columns and not user_tweets["user_location"].empty:
                    location_mode = user_tweets["user_location"].mode()
                    location = location_mode.iloc[0] if len(location_mode) > 0 else "Unknown"
                else:
                    location = "Unknown"
                st.write(f"**@{username}**: {count} tweets | Risk: {risk_level} | Location: {location}")
    # User engagement patterns (only when engagement columns exist)
    if "like_count" in df.columns or "retweet_count" in df.columns:
        st.subheader("📊 User Engagement Patterns")
        col1, col2 = st.columns(2)
        with col1:
            if "like_count" in df.columns:
                avg_likes = df.groupby("username")["like_count"].mean().sort_values(ascending=False).head(15)
                fig_likes = px.bar(x=avg_likes.values, y=avg_likes.index,
                                   orientation='h', title="Users by Average Likes")
                fig_likes.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_likes, use_container_width=True)
        with col2:
            if "retweet_count" in df.columns:
                avg_retweets = df.groupby("username")["retweet_count"].mean().sort_values(ascending=False).head(15)
                fig_retweets = px.bar(x=avg_retweets.values, y=avg_retweets.index,
                                      orientation='h', title="Users by Average Retweets")
                fig_retweets.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_retweets, use_container_width=True)
    # User location overlap analysis
    if "user_location" in df.columns and "risk_level" in df.columns:
        st.subheader("📍 User Location vs Risk Analysis")
        location_risk = df.groupby(["user_location", "risk_level"]).size().reset_index(name="count")
        # Drop rows whose location string is empty
        location_risk = location_risk[location_risk["user_location"] != ""]
        if not location_risk.empty:
            fig_loc_risk = px.bar(location_risk, x="user_location", y="count",
                                  color="risk_level", title="Risk Distribution by Location",
                                  color_discrete_map={
                                      "CRITICAL": "#dc3545",
                                      "HIGH": "#fd7e14",
                                      "MEDIUM": "#ffc107",
                                      "LOW": "#28a745"
                                  })
            fig_loc_risk.update_xaxes(tickangle=45)
            st.plotly_chart(fig_loc_risk, use_container_width=True)
# ------------------------
# NEW: HEATMAPS
# ------------------------
elif analysis_type == "📍 Heatmaps":
    st.header("📍 Time-Based Heatmaps")
    # -------------------
    # Day-Hour heatmap
    # -------------------
    if "day_of_week" in df.columns and "hour" in df.columns:
        # Ensure proper order (Monday first, not alphabetical)
        day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
        df['day_of_week'] = pd.Categorical(df['day_of_week'], categories=day_order, ordered=True)
        st.subheader("🔥 Day vs Hour Activity Heatmap")
        heatmap_fig = create_heatmap_chart(df, "hour", "day_of_week", "Tweet Activity: Day vs Hour")
        if heatmap_fig:
            st.plotly_chart(heatmap_fig, use_container_width=True)
    # Risk level heatmap
    if "risk_level" in df.columns and "hour" in df.columns:
        st.subheader("⚠️ Risk Level vs Hour Heatmap")
        risk_heatmap = create_heatmap_chart(df, "hour", "risk_level", "Risk Level Distribution by Hour")
        if risk_heatmap:
            st.plotly_chart(risk_heatmap, use_container_width=True)
    # -------------------
    # Top Locations Heatmap
    # -------------------
    if "user_location" in df.columns and "hour" in df.columns:
        st.subheader("📍 Location vs Hour Heatmap (Top Locations)")
        # Add slider in sidebar (only rendered while this view is active)
        TOP_N_LOCATIONS = st.sidebar.slider("Top N Locations for Heatmaps", 5, 30, 10)
        # Filter top N locations to keep the heatmap readable
        top_locations = df["user_location"].value_counts().head(TOP_N_LOCATIONS).index
        df_top_loc = df[df["user_location"].isin(top_locations)]
        if not df_top_loc.empty:
            loc_heatmap = create_heatmap_chart(df_top_loc, "hour", "user_location",
                                               f"Top {TOP_N_LOCATIONS} Locations Activity by Hour")
            if loc_heatmap:
                st.plotly_chart(loc_heatmap, use_container_width=True)
    # Tweet location heatmap (if geographic coordinates available)
    if "latitude" in df.columns and "longitude" in df.columns:
        st.subheader("🗺️ Geographic Tweet Distribution")
        valid_coords = df.dropna(subset=["latitude", "longitude"])
        if not valid_coords.empty:
            fig_map = px.scatter_mapbox(
                valid_coords, lat="latitude", lon="longitude",
                color="risk_level" if "risk_level" in df.columns else None,
                size_max=15, zoom=7,
                mapbox_style="open-street-map",
                title="Geographic Distribution of Tweets"
            )
            st.plotly_chart(fig_map, use_container_width=True)
        else:
            st.info("No geographic coordinates available for mapping")
# ------------------------
# NEW: RISK PATTERNS
# ------------------------
# High-risk users analysis
elif analysis_type == "⚠️ Risk Patterns":
    st.header("⚠️ Risk Patterns and High-Risk Analysis")
    # High-risk users analysis
    if "risk_level" in df.columns:
        st.subheader("🚨 High-Risk Users")
        user_risk_df = calculate_user_risk(df)
        # NOTE(review): if calculate_user_risk returned an empty frame
        # (required columns missing), the 'risk_score' lookup below raises
        # KeyError — consider guarding on user_risk_df.empty first.
        high_risk_users = user_risk_df[user_risk_df["risk_score"] > 0].sort_values("risk_score", ascending=False).head(20)
        if not high_risk_users.empty:
            fig_risk_users = px.bar(high_risk_users, x="risk_score", y="username",
                                    orientation='h', color="tweet_count", color_continuous_scale="Reds")
            fig_risk_users.update_layout(yaxis=dict(autorange="reversed"))
            st.plotly_chart(fig_risk_users, use_container_width=True)
            # Optional: show details
            with st.expander("High-Risk User Details"):
                for _, row in high_risk_users.iterrows():
                    user_data = df[df["username"] == row["username"]]
                    critical_count = (user_data["risk_level"] == "CRITICAL").sum()
                    high_count = (user_data["risk_level"] == "HIGH").sum()
                    st.write(f"**@{row['username']}**: Risk Score: {row['risk_score']} | Critical: {critical_count} | High: {high_count} | Total Tweets: {row['tweet_count']}")
    # Risk overlap analysis
    if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
        st.subheader("🔄 Drug-Crime Overlap Analysis")
        # Create overlap categories; later assignments overwrite earlier
        # ones, so rows flagged as both end up "Drug + Crime".
        df_overlap = df.copy()
        df_overlap["category"] = "Other"
        df_overlap.loc[df_overlap["is_drug_related"] == 1, "category"] = "Drug Only"
        df_overlap.loc[df_overlap["is_crime_related"] == 1, "category"] = "Crime Only"
        df_overlap.loc[(df_overlap["is_drug_related"] == 1) & (df_overlap["is_crime_related"] == 1), "category"] = "Drug + Crime"
        overlap_counts = df_overlap["category"].value_counts()
        fig_overlap = px.pie(values=overlap_counts.values, names=overlap_counts.index,
                             title="Drug-Crime Content Overlap",
                             color_discrete_map={
                                 "Drug + Crime": "#dc3545",
                                 "Drug Only": "#fd7e14",
                                 "Crime Only": "#ffc107",
                                 "Other": "#28a745"
                             })
        st.plotly_chart(fig_overlap, use_container_width=True)
        # Show high-overlap users
        high_overlap_users = df_overlap[df_overlap["category"] == "Drug + Crime"]["username"].value_counts().head(10)
        if not high_overlap_users.empty:
            st.write("**Users with most Drug+Crime tweets:**")
            for username, count in high_overlap_users.items():
                st.write(f"- @{username}: {count} tweets")
    # Risk progression over time
    if "datetime" in df.columns and "risk_level" in df.columns:
        st.subheader("📈 Risk Level Trends Over Time")
        # Daily risk aggregation (note: adds a 'date_str' column to df)
        df["date_str"] = df["datetime"].dt.strftime("%Y-%m-%d")
        risk_time = df.groupby(["date_str", "risk_level"]).size().reset_index(name="count")
        fig_risk_time = px.line(risk_time, x="date_str", y="count", color="risk_level",
                                title="Risk Levels Trend Over Time",
                                color_discrete_map={
                                    "CRITICAL": "#dc3545",
                                    "HIGH": "#fd7e14",
                                    "MEDIUM": "#ffc107",
                                    "LOW": "#28a745"
                                })
        fig_risk_time.update_xaxes(tickangle=45)
        st.plotly_chart(fig_risk_time, use_container_width=True)
# ------------------------
# RISK ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Risk Analysis":
    st.header("Risk Analysis")
    # High-risk tweets table (from the separate HIGH_PRIORITY CSV)
    if high_priority_df is not None and not high_priority_df.empty:
        st.subheader("High Priority Tweets")
        # Risk level tabs
        risk_tab1, risk_tab2 = st.tabs(["CRITICAL", "HIGH"])
        with risk_tab1:
            critical_tweets = high_priority_df[high_priority_df["risk_level"] == "CRITICAL"]
            if not critical_tweets.empty:
                for idx, tweet in critical_tweets.head(10).iterrows():
                    with st.expander(f"CRITICAL: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No critical risk tweets in current filter")
        with risk_tab2:
            high_tweets = high_priority_df[high_priority_df["risk_level"] == "HIGH"]
            if not high_tweets.empty:
                for idx, tweet in high_tweets.head(10).iterrows():
                    with st.expander(f"HIGH: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No high risk tweets in current filter")
    else:
        st.info("No high priority data available")
    # Risk score distribution (side-by-side histograms)
    if "drug_score" in df.columns and "crime_score" in df.columns:
        fig_scores = make_subplots(rows=1, cols=2, subplot_titles=("Drug Score Distribution", "Crime Score Distribution"))
        fig_scores.add_trace(go.Histogram(x=df["drug_score"], name="Drug Score", nbinsx=20), row=1, col=1)
        fig_scores.add_trace(go.Histogram(x=df["crime_score"], name="Crime Score", nbinsx=20), row=1, col=2)
        fig_scores.update_layout(title="Risk Score Distributions")
        st.plotly_chart(fig_scores, use_container_width=True)
    else:
        st.info("Risk score data not available")
# ------------------------
# Actionable Insights
# ------------------------
elif analysis_type == "Actionable Insights":
st.header("Actionable Insights")
# Contact information tweets
if contact_df is not None and not contact_df.empty:
st.subheader("Tweets with Contact Information")
st.markdown('<div class="warning-box">These tweets contain phone numbers or contact details - HIGH PRIORITY for investigation</div>', unsafe_allow_html=True)
for idx, tweet in contact_df.head(20).iterrows():
with st.expander(f"Contact Info: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
st.write(f"**Content:** {tweet['content']}")
st.write(f"**Phone Numbers:** {tweet.get('phone_numbers', 'Not extracted')}")
st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
st.write(f"**Risk Level:** {tweet.get('risk_level', 'Unknown')}")
if 'tweet_url' in tweet:
st.write(f"**URL:** {tweet['tweet_url']}")
else:
st.info("No tweets with contact information found")
# Bulk operation indicators
st.subheader("Bulk Operation Indicators")
# Sidebar input
BULK_KEYWORDS = st.sidebar.text_area("Bulk Operation Keywords (comma-separated)",
"kg,gram,bulk,wholesale,kilos,ounce,pound").split(",")
# In code
bulk_pattern = "|".join([kw.strip() for kw in BULK_KEYWORDS])
bulk_regex = re.compile("|".join([kw.strip() for kw in BULK_KEYWORDS]), re.IGNORECASE)
bulk_tweets = df[df["content"].str.contains(bulk_regex, na=False)]
if not bulk_tweets.empty:
st.write(f"Found {len(bulk_tweets)} tweets mentioning bulk quantities")
for idx, tweet in bulk_tweets.head(10).iterrows():
with st.expander(f"Bulk: @{tweet['username']} - Risk: {tweet.get('risk_level', 'Unknown')}"):
st.write(f"**Content:** {tweet['content']}")
st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
if 'tweet_url' in tweet:
st.write(f"**URL:** {tweet['tweet_url']}")
else:
st.info("No bulk operation indicators found")
# High activity users
st.subheader("High Activity Users")
user_activity = df["username"].value_counts().head(15)
if not user_activity.empty:
fig_users = px.bar(x=user_activity.values, y=user_activity.index,
orientation='h', title="Top 15 Most Active Users")
fig_users.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_users, use_container_width=True)
# ------------------------
# NEW: PREDICTIVE ANALYTICS
# ------------------------
elif analysis_type == "📈 Predictive Analytics":
    st.header("📈 Predictive Analytics & Trends")
    st.subheader("📊 Activity Forecast")
    if "datetime" in df.columns and len(df) >= 7:
        # Daily activity trend
        daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count")
        daily_activity.columns = ["date", "count"]
        daily_activity["date"] = pd.to_datetime(daily_activity["date"])
        # Calculate moving average
        daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
        # NOTE(review): "trend" duplicates "7_day_ma" exactly and is never
        # read below — candidate for removal.
        daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
        # Create forecast visualization (actual counts + smoothed line)
        fig_forecast = go.Figure()
        fig_forecast.add_trace(go.Scatter(
            x=daily_activity["date"],
            y=daily_activity["count"],
            name="Actual Activity",
            mode="lines+markers",
            line=dict(color="#1f77b4")
        ))
        fig_forecast.add_trace(go.Scatter(
            x=daily_activity["date"],
            y=daily_activity["7_day_ma"],
            name="7-Day Moving Average",
            mode="lines",
            line=dict(color="#ff7f0e", dash="dash")
        ))
        fig_forecast.update_layout(
            title="Tweet Activity Trend & Forecast",
            xaxis_title="Date",
            yaxis_title="Number of Tweets",
            hovermode="x unified"
        )
        st.plotly_chart(fig_forecast, use_container_width=True)
        # Trend analysis
        col1, col2, col3 = st.columns(3)
        with col1:
            recent_avg = daily_activity["count"].tail(7).mean()
            st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day")
        with col2:
            if len(daily_activity) >= 14:
                # Compare the last 7 days to the 7 days before them.
                prev_avg = daily_activity["count"].tail(14).head(7).mean()
                change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0
                st.metric("Week-over-Week Change", f"{change:+.1f}%")
        with col3:
            peak_day = daily_activity.loc[daily_activity["count"].idxmax()]
            st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d"))
    # User activity prediction
    st.subheader("👤 High-Risk User Patterns")
    if "username" in df.columns and "risk_level" in df.columns:
        # NOTE(review): assumes a "tweet_id" column exists — load_data does
        # not guarantee it, so a KeyError is possible. TODO confirm schema.
        user_risk_scores = df.groupby("username").agg({
            "tweet_id": "count",
            "risk_level": lambda x: (x == "CRITICAL").sum() * 2 + (x == "HIGH").sum()
        }).reset_index()
        user_risk_scores.columns = ["username", "tweet_count", "risk_score"]
        # Identify escalating users: some risk points and at least 3 tweets
        escalating_users = user_risk_scores[
            (user_risk_scores["risk_score"] > 0) &
            (user_risk_scores["tweet_count"] >= 3)
        ].sort_values("risk_score", ascending=False).head(15)
        if not escalating_users.empty:
            fig_escalating = px.scatter(
                escalating_users,
                x="tweet_count",
                y="risk_score",
                size="risk_score",
                hover_data=["username"],
                title="High-Risk User Activity Matrix",
                labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"}
            )
            st.plotly_chart(fig_escalating, use_container_width=True)
            st.write("**Users to Monitor:**")
            for _, user in escalating_users.head(10).iterrows():
                st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}")
# ------------------------
# NEW: NETWORK ANALYSIS
# ------------------------
elif analysis_type == "🌐 Network Analysis":
st.header("🌐 Network Analysis")
st.subheader("👥 User Connection Analysis")
# Mentions network
if "mentions" in df.columns:
st.write("### User Mention Network")
mention_pairs = []
for _, row in df.iterrows():
if pd.notna(row.get("mentions")) and row["mentions"]:
mentions = str(row["mentions"]).split()
for mention in mentions:
mention_clean = mention.strip("@")
if mention_clean:
mention_pairs.append({
"from": row["username"],
"to": mention_clean,
"risk_level": row.get("risk_level", "UNKNOWN")
})
if mention_pairs:
mention_df = pd.DataFrame(mention_pairs)
# Top mentioned users
top_mentioned = mention_df["to"].value_counts().head(15)
fig_mentioned = px.bar(
x=top_mentioned.values,
y=top_mentioned.index,
orientation="h",
title="Most Mentioned Users",
labels={"x": "Times Mentioned", "y": "Username"}
)
fig_mentioned.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_mentioned, use_container_width=True)
# Connection strength
connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions")
strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False)
if not strong_connections.empty:
st.write("### 🔗 Strong Connections (2+ mentions)")
for _, conn in strong_connections.head(20).iterrows():
st.write(f"- @{conn['from']} → @{conn['to']}: {conn['mentions']} times")
else:
st.info("No mention data available")
# Location clustering
st.subheader("📍 Location-Based Clustering")
if "user_location" in df.columns:
location_users = df.groupby("user_location").agg({
"username": lambda x: list(x.unique()),
"tweet_id": "count",
"risk_level": lambda x: (x == "CRITICAL").sum() if "risk_level" in df.columns else 0
}).reset_index()
location_users.columns = ["location", "users", "tweet_count", "critical_count"]
location_users = location_users[location_users["location"] != ""]
location_users = location_users[location_users["tweet_count"] >= 3]
location_users["user_count"] = location_users["users"].apply(len)
if not location_users.empty:
fig_clusters = px.scatter(
location_users,
x="tweet_count",
y="user_count",
size="critical_count",
hover_data=["location"],
title="Location Clusters (Activity vs Users)",
labels={
"tweet_count": "Total Tweets",
"user_count": "Unique Users",
"critical_count": "Critical Tweets"
}
)
st.plotly_chart(fig_clusters, use_container_width=True)
# High-density locations
high_density = location_users.sort_values("user_count", ascending=False).head(10)
st.write("### 🏙️ High-Density Locations")
for _, loc in high_density.iterrows():
with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"):
st.write(f"**Critical tweets:** {loc['critical_count']}")
st.write(f"**Users:** {', '.join(['@' + u for u in loc['users'][:10]])}")
if len(loc['users']) > 10:
st.write(f"... and {len(loc['users']) - 10} more")
# Co-occurrence analysis
st.subheader("🔗 Keyword Co-occurrence")
if "content" in df.columns:
# Define drug/crime keywords
drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"]
crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"]
cooccurrence = []
for _, row in df.iterrows():
content_lower = row["content"].lower()
found_drug = [kw for kw in drug_keywords if kw in content_lower]
found_crime = [kw for kw in crime_keywords if kw in content_lower]
for drug in found_drug:
for crime in found_crime:
cooccurrence.append({"drug_keyword": drug, "crime_keyword": crime})
if cooccurrence:
cooc_df = pd.DataFrame(cooccurrence)
cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count")
cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20)
if not cooc_counts.empty:
fig_cooc = px.bar(
cooc_counts,
x="count",
y="drug_keyword",
color="crime_keyword",
title="Drug-Crime Keyword Co-occurrence",
orientation="h"
)
st.plotly_chart(fig_cooc, use_container_width=True)
else:
st.info("No significant keyword co-occurrences found")
# Temporal clustering
st.subheader("⏰ Temporal Activity Clusters")
if "datetime" in df.columns and "username" in df.columns:
df_copy = df.copy()
df_copy["hour"] = df_copy["datetime"].dt.hour
df_copy["day_of_week"] = df_copy["datetime"].dt.day_name()
# Find users active at unusual hours (late night/early morning)
unusual_hours = [0, 1, 2, 3, 4, 5]
night_activity = df_copy[df_copy["hour"].isin(unusual_hours)]
if len(night_activity) > 0:
night_users = night_activity.groupby("username").size().reset_index(name="night_tweets")
night_users = night_users[night_users["night_tweets"] >= 3].sort_values("night_tweets", ascending=False)
if not night_users.empty:
st.write(f"### 🌙 Users Active During Late Night (12 AM - 6 AM)")
fig_night = px.bar(
night_users.head(15),
x="night_tweets",
y="username",
orientation="h",
title="Top Users with Late Night Activity"
)
fig_night.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_night, use_container_width=True)
st.info("⚠️ Late night activity may indicate suspicious behavior patterns")
# ------------------------
# GEOGRAPHIC ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Geographic Analysis":
st.header("Geographic Analysis")
# Location distribution
locations = df["user_location"].value_counts().head(20)
locations = locations[locations.index != ""] # Remove empty locations
if not locations.empty:
fig_locations = px.bar(x=locations.values, y=locations.index,
orientation='h', title="Top 20 User Locations")
fig_locations.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_locations, use_container_width=True)
else:
st.info("No location data available")
# Karnataka relevance score distribution
if "kar_score" in df.columns:
fig_kar = px.histogram(df, x="kar_score", title="Karnataka Relevance Score Distribution")
st.plotly_chart(fig_kar, use_container_width=True)
# Location-based risk analysis
if "risk_level" in df.columns and "user_location" in df.columns:
location_risk = df.groupby("user_location").agg({
"risk_level": lambda x: (x == "HIGH").sum() + (x == "CRITICAL").sum() * 2,
"username": "count"
}).reset_index()
location_risk = location_risk[location_risk["username"] >= 3] # Only locations with 3+ tweets
location_risk = location_risk.sort_values("risk_level", ascending=False).head(15)
if not location_risk.empty:
fig_loc_risk = px.bar(location_risk, x="risk_level", y="user_location",
orientation='h', title="High-Risk Locations (3+ tweets)")
fig_loc_risk.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_loc_risk, use_container_width=True)
# ------------------------
# USER ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "User Analysis":
st.header("User Analysis")
# User metrics
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Unique Users", df["username"].nunique())
with col2:
verified_count = safe_column_sum(df, "user_verified")
st.metric("Verified Users", verified_count)
with col3:
avg_followers = safe_column_mean(df, "user_followers")
st.metric("Avg Followers", f"{avg_followers:,.0f}")
# Top users by followers
if "user_followers" in df.columns:
top_followers = df.nlargest(15, "user_followers")[["username", "user_followers"]]
if "user_verified" in df.columns:
top_followers = df.nlargest(15, "user_followers")[["username", "user_followers", "user_verified"]]
fig_followers = px.bar(top_followers, x="user_followers", y="username",
color="user_verified" if "user_verified" in top_followers.columns else None,
orientation='h', title="Users with Most Followers")
fig_followers.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_followers, use_container_width=True)
# User engagement vs risk (fixed aggregation)
if "risk_level" in df.columns:
user_metrics = []
for username in df["username"].unique():
user_data = df[df["username"] == username]
risk_score = (user_data["risk_level"] == "HIGH").sum() + (user_data["risk_level"] == "CRITICAL").sum() * 2
user_metrics.append({
"username": username,
"risk_score": risk_score,
"avg_likes": safe_column_mean(user_data, "like_count"),
"avg_retweets": safe_column_mean(user_data, "retweet_count"),
"tweet_count": len(user_data)
})
user_risk_df = pd.DataFrame(user_metrics)
multi_tweet_users = user_risk_df[user_risk_df["tweet_count"] >= 3]
if not multi_tweet_users.empty:
fig_user_risk = px.scatter(multi_tweet_users, x="avg_likes", y="risk_score",
size="tweet_count", hover_data=["username"],
title="User Risk vs Engagement (3+ tweets)")
st.plotly_chart(fig_user_risk, use_container_width=True)
# ------------------------
# CONTENT ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Content Analysis":
st.header("Content Analysis")
# Hashtag analysis
if "hashtags" in df.columns:
all_hashtags = df["hashtags"].dropna().str.split().explode()
hashtag_counts = all_hashtags.value_counts().head(20)
if not hashtag_counts.empty:
fig_hashtags = px.bar(x=hashtag_counts.values, y=hashtag_counts.index,
orientation='h', title="Top 20 Hashtags")
fig_hashtags.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_hashtags, use_container_width=True)
# Sentiment vs Risk correlation
col1, col2 = st.columns(2)
with col1:
if "sentiment_compound" in df.columns and "risk_level" in df.columns:
fig_sentiment_risk = px.box(df, x="risk_level", y="sentiment_compound",
title="Sentiment by Risk Level")
st.plotly_chart(fig_sentiment_risk, use_container_width=True)
else:
st.info("Sentiment analysis data not available")
with col2:
if "drug_score" in df.columns and "crime_score" in df.columns:
# Drug score vs Crime score correlation
fig_scores_corr = px.scatter(df, x="drug_score", y="crime_score",
color="risk_level" if "risk_level" in df.columns else None,
title="Drug Score vs Crime Score",
color_discrete_map={
"CRITICAL": "#dc3545",
"HIGH": "#fd7e14",
"MEDIUM": "#ffc107",
"LOW": "#28a745"
})
st.plotly_chart(fig_scores_corr, use_container_width=True)
else:
st.info("Score correlation data not available")
# Content length analysis
if "content" in df.columns:
df_copy = df.copy()
df_copy["content_length"] = df_copy["content"].str.len()
if "risk_level" in df.columns:
fig_length = px.histogram(df_copy, x="content_length", color="risk_level",
title="Tweet Length Distribution by Risk Level",
color_discrete_map={
"CRITICAL": "#dc3545",
"HIGH": "#fd7e14",
"MEDIUM": "#ffc107",
"LOW": "#28a745"
})
else:
fig_length = px.histogram(df_copy, x="content_length", title="Tweet Length Distribution")
st.plotly_chart(fig_length, use_container_width=True)
# Word frequency analysis
if "content" in df.columns:
st.subheader("Content Word Analysis")
filtered_words = get_filtered_words(df["content"])
if filtered_words:
word_freq = pd.Series(filtered_words).value_counts().head(30)
fig_words = px.bar(x=word_freq.values, y=word_freq.index,
orientation='h', title="Top 30 Most Frequent Words")
fig_words.update_layout(yaxis=dict(autorange="reversed"))
st.plotly_chart(fig_words, use_container_width=True)
else:
st.info("No content words available after filtering")
# ------------------------
# Footer with Data Information & Export
# ------------------------
st.markdown("---")
# Data summary footer. col3/col4 are intentionally unused: the 4-column
# split keeps the info boxes at a consistent width.
col1, col2, col3, col4 = st.columns(4)
with col1:
    st.info(f"Showing {len(df)} tweets")
with col2:
    if "risk_level" in df.columns:
        high_risk_count = len(df[df["risk_level"].isin(["HIGH", "CRITICAL"])])
        st.info(f"High Risk: {high_risk_count} tweets")
    else:
        st.info("Risk Level: Not available")
# --- Export ---
st.sidebar.header("Data Export")
# FIX: the original nested st.download_button inside an st.button click.
# In Streamlit a button's True state lasts a single rerun, so the download
# button appeared for one frame and vanished before it could be used.
# Render the download buttons directly; they handle the click themselves.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv = df.to_csv(index=False)
st.sidebar.download_button(
    label="Download as CSV",
    data=csv,
    file_name=f"drug_crime_analysis_{analysis_type.lower().replace(' ', '_')}_{timestamp}.csv",
    mime="text/csv"
)
# Summary report export (only when a report was loaded earlier).
if report_data:
    report_json = json.dumps(report_data, indent=2, default=str)
    st.sidebar.download_button(
        label="Download Report (JSON)",
        data=report_json,
        file_name=f"analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        mime="application/json"
    )
# --- Quick stats in sidebar ---
if len(df) > 0:
    st.sidebar.subheader("Quick Stats")
    if "risk_level" in df.columns:
        risk_counts = df["risk_level"].value_counts()
        for risk, count in risk_counts.items():
            percentage = (count / len(df)) * 100
            st.sidebar.text(f"{risk}: {count} ({percentage:.1f}%)")
    # Most common (non-empty) location.
    if "user_location" in df.columns:
        top_location = df["user_location"].value_counts().head(1)
        if not top_location.empty and top_location.index[0] != "":
            st.sidebar.text(f"Top Location: {top_location.index[0]} ({top_location.iloc[0]})")
    # Date range covered by the data.
    if "datetime" in df.columns and not df["datetime"].isna().all():
        try:
            days_span = (df["datetime"].max() - df["datetime"].min()).days
            st.sidebar.text(f"Data Span: {days_span} days")
        except Exception:
            # FIX: narrowed the bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit). Best-effort stat: mixed or
            # non-datetime values simply skip the line.
            pass
# --- Debug information (collapsible) ---
with st.sidebar.expander("Debug Info"):
    st.write("Available columns:")
    st.write(list(df.columns))
    st.write(f"DataFrame shape: {df.shape}")
    st.write(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    if report_data:
        st.write("Report data available: Yes")
    else:
        st.write("Report data available: No")
    if high_priority_df is not None:
        st.write(f"High priority tweets: {len(high_priority_df)}")
    else:
        st.write("High priority tweets: Not available")
    if contact_df is not None:
        st.write(f"Contact info tweets: {len(contact_df)}")
    else:
        st.write("Contact info tweets: Not available")
# Footer banner with the render timestamp.
st.markdown("---")
st.markdown(
    """
    <div style='text-align: center; color: #666; padding: 20px;'>
    <p><strong>Twitter Drug Crime Monitoring Dashboard</strong></p>
    <p><em>Dashboard last updated: {}</em></p>
    </div>
    """.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    unsafe_allow_html=True
)