#modify_app.py import streamlit as st import pandas as pd import os import json import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots from datetime import datetime, timedelta import nltk from nltk.corpus import stopwords # βœ… import first # Ensure stopwords data is downloaded try: stopwords.words('english') except LookupError: nltk.download('stopwords') # Now you can safely use it english_stopwords = stopwords.words('english') import numpy as np import time import seaborn as sns import matplotlib.pyplot as plt from alerts import compute_dynamic_risk,assign_dynamic_risk_level from evaluation import evaluate_model # Run evaluation on the scraped CSV folder evaluate_model("drug_analysis_data_3months") import re st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide") # Custom CSS for better styling st.markdown(""" """, unsafe_allow_html=True) # Configuration DASHBOARD_CONFIG = { 'data_dirs': ['drug_analysis_data_3months', 'data', 'output', '.'], 'refresh_interval': 30, 'max_display_tweets': 50, 'chart_height': 400 } # Main header st.markdown('

Twitter Drug Crime Monitoring Dashboard

Real-time Twitter Analysis for Drug Crime Detection

', unsafe_allow_html=True) # ------------------------ # Enhanced Data Loading Functions # ------------------------ def parse_dates_flexible(df): """Parse dates with multiple format attempts.""" if "datetime" not in df.columns: return df date_formats = [ "%d-%m-%Y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d", "%d/%m/%Y %H:%M:%S", "%m/%d/%Y %H:%M:%S" ] original_datetime = df["datetime"].copy() for fmt in date_formats: try: df["datetime"] = pd.to_datetime(original_datetime, format=fmt, errors="coerce") if not df["datetime"].isna().all(): break except: continue # If parsing still failed, try generic parsing if df["datetime"].isna().all(): df["datetime"] = pd.to_datetime(original_datetime, errors="coerce") # Fill any remaining NaT values with current time df["datetime"] = df["datetime"].fillna(pd.Timestamp.now()) return df def validate_dataframe(df): """Validate that the dataframe has expected columns.""" if df is None or df.empty: return False, "DataFrame is empty" required_columns = ['username', 'content'] missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: return False, f"Missing required columns: {missing_columns}" return True, "DataFrame is valid" @st.cache_data def load_data(): """Load the most recent data with robust error handling.""" start_time = time.time() for data_dir in DASHBOARD_CONFIG['data_dirs']: if not os.path.exists(data_dir): continue try: # Look for main dataset files with flexible naming csv_files = [] for f in os.listdir(data_dir): if f.endswith(".csv") and any(keyword in f.lower() for keyword in ["karnataka_drug_tweets", "drug_tweets", "drug_analysis", "drug_crime"]): csv_files.append(f) if not csv_files: # Fallback to any CSV file csv_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")] if not csv_files: continue # Get the most recent file latest_file = max(csv_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x))) file_path = os.path.join(data_dir, latest_file) # Load with error handling df = pd.read_csv(file_path, encoding='utf-8') if df.empty: continue # Enhanced date parsing df = parse_dates_flexible(df) # Add derived columns if missing if "datetime" in df.columns: if "date" not in df.columns: df["date"] = df["datetime"].dt.date if "hour" not in df.columns: df["hour"] = df["datetime"].dt.hour if "day_of_week" not in df.columns: df["day_of_week"] = df["datetime"].dt.day_name() if "day" not in df.columns: df["day"] = df["datetime"].dt.day # Load report if available report_files = [f for f in os.listdir(data_dir) if f.startswith("ANALYSIS_REPORT_") and f.endswith(".json")] report_data = None if report_files: latest_report = max(report_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x))) try: with open(os.path.join(data_dir, latest_report), 'r', encoding='utf-8') as f: report_data = json.load(f) except Exception as e: st.sidebar.warning(f"Could not load report: {e}") report_data = None load_time = time.time() - start_time # Display load metrics in sidebar st.sidebar.success(f"Data loaded successfully") st.sidebar.metric("Load Time", f"{load_time:.2f}s") st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB") st.sidebar.info(f"Source: {latest_file}") return df, report_data except Exception as e: st.sidebar.warning(f"Failed to load from {data_dir}: {str(e)}") continue return None, None @st.cache_data def load_priority_data(): """Load high priority and contact info datasets with fallbacks.""" data_dir = DASHBOARD_CONFIG['data_dirs'][0] # Primary data directory if not os.path.exists(data_dir): return None, None high_priority_df = None contact_df = None try: # Load high priority tweets high_priority_files = [f for f in os.listdir(data_dir) if "HIGH_PRIORITY" in f and f.endswith(".csv")] if high_priority_files: latest_priority = max(high_priority_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x))) high_priority_df = pd.read_csv(os.path.join(data_dir, latest_priority)) high_priority_df = parse_dates_flexible(high_priority_df) except Exception as e: st.sidebar.warning(f"Could not load high priority data: {e}") try: # Load contact info tweets contact_files = [f for f in os.listdir(data_dir) if "CONTACT_INFO" in f and f.endswith(".csv")] if contact_files: latest_contact = max(contact_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x))) contact_df = pd.read_csv(os.path.join(data_dir, latest_contact)) contact_df = parse_dates_flexible(contact_df) except Exception as e: st.sidebar.warning(f"Could not load contact info data: {e}") return high_priority_df, contact_df def safe_column_access(df, column, default=0): """Safely access DataFrame columns with defaults.""" if column in df.columns: return df[column] else: return pd.Series([default] * len(df), index=df.index) def safe_column_sum(df, column): """Safely sum a column with fallback.""" if column in df.columns: return df[column].sum() return 0 def safe_column_mean(df, column): """Safely calculate mean of a column with fallback.""" if column in df.columns and len(df) > 0: return df[column].mean() return 0 # ----------------- Helper: Calculate User Risk ----------------- def calculate_user_risk(df): """ Calculate risk score per user: CRITICAL = 2 points, HIGH = 1 point Returns DataFrame with username, risk_score, tweet_count """ if "username" not in df.columns or "risk_level" not in df.columns: return pd.DataFrame() user_metrics = [] for username in df["username"].unique(): user_data = df[df["username"] == username] risk_score = (user_data["risk_level"] == "HIGH").sum() + \ (user_data["risk_level"] == "CRITICAL").sum() * 2 user_metrics.append({ "username": username, "risk_score": risk_score, "tweet_count": len(user_data) }) return pd.DataFrame(user_metrics) # ----------------- Helper: Filter Words ----------------- def get_filtered_words(text_series): """ Returns filtered words from a Series of text, removing English stopwords and words <=2 characters """ stop_words_set = set(stopwords.words('english')) all_text = " ".join(text_series.astype(str)) words = re.findall(r'\b\w+\b', all_text.lower()) return [w for w in words if w not in stop_words_set and len(w) > 2] def create_heatmap_chart(df, x_col, y_col, title="Heatmap"): """Create a heatmap using plotly.""" if x_col not in df.columns or y_col not in df.columns: return None # Create pivot table for heatmap heatmap_data = df.groupby([x_col, y_col]).size().reset_index(name='count') pivot_data = heatmap_data.pivot(index=y_col, columns=x_col, values='count').fillna(0) fig = go.Figure(data=go.Heatmap( z=pivot_data.values, x=pivot_data.columns, y=pivot_data.index, colorscale='Blues', hoverongaps=False )) fig.update_layout( title=title, xaxis_title=x_col, yaxis_title=y_col, height=400 ) return fig def create_weekly_trend_analysis(df): """Create weekly trend analysis.""" if "datetime" not in df.columns: return None, None # Weekly aggregation df['week'] = df['datetime'].dt.isocalendar().week df['weekday'] = df['datetime'].dt.day_name() weekly_counts = df.groupby('week').size().reset_index(name='count') weekday_counts = df.groupby('weekday').size().reset_index(name='count') # Reorder weekdays weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] weekday_counts['weekday'] = pd.Categorical(weekday_counts['weekday'], categories=weekday_order, ordered=True) weekday_counts = weekday_counts.sort_values('weekday') fig1 = px.line(weekly_counts, x='week', y='count', title="Weekly Tweet Trends") fig2 = px.bar(weekday_counts, x='weekday', y='count', title="Tweets by Weekday") return fig1, fig2 # ------------------------ # Load Data # ------------------------ df, report_data = load_data() # --- Compute dynamic risk for all tweets --- if df is not None and not df.empty: from alerts import compute_dynamic_risk, assign_dynamic_risk_level # Add dynamic risk fields df['dynamic_risk_score'] = df.apply(lambda row: compute_dynamic_risk(row.to_dict()), axis=1) df['risk_level'] = df.apply(lambda row: assign_dynamic_risk_level(row.to_dict()), axis=1) if df is None: st.error("No data found. Please run the drug crime scraper first.") # Enhanced debug information st.subheader("Debug Information") current_dir = os.getcwd() st.write(f"Current directory: {current_dir}") for dir_name in DASHBOARD_CONFIG['data_dirs']: if os.path.exists(dir_name): files = [f for f in os.listdir(dir_name) if f.endswith('.csv')] st.write(f"CSV files in {dir_name}: {files}") else: st.write(f"Directory {dir_name} does not exist") st.info("Expected files: karnataka_drug_tweets_*.csv or similar drug-related CSV files") st.stop() # Validate dataframe is_valid, validation_message = validate_dataframe(df) if not is_valid: st.error(f"Data validation failed: {validation_message}") st.write("Available columns:", list(df.columns)) st.stop() # Load priority data high_priority_df, contact_df = load_priority_data() # Filter for current month data for some analyses now = datetime.now() if "datetime" in df.columns: df_month = df[(df['datetime'].dt.month == now.month) & (df['datetime'].dt.year == now.year)] else: df_month = df # ------------------------ # Sidebar Navigation & Filters # ------------------------ st.sidebar.title("Dashboard Navigation") # Auto-refresh option auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)") from streamlit_autorefresh import st_autorefresh if auto_refresh: st_autorefresh(interval=30*1000, key="refresh") # Navigation tabs - ENHANCED with new options analysis_type = st.sidebar.radio( "Select Analysis View", ["Summary", "Risk Analysis", "Actionable Insights", "πŸ“ˆ Predictive Analytics", "🌐 Network Analysis", "Geographic Analysis", "User Analysis", "Content Analysis", "πŸ“Š Volume Trends", "🧠 User Behavior", "πŸ“ Heatmaps", "⚠️ Risk Patterns"] ) # Common filters st.sidebar.header("Data Filters") # Date range filter if "datetime" in df.columns and not df["datetime"].isna().all(): try: min_date = df["datetime"].min().date() max_date = df["datetime"].max().date() date_range = st.sidebar.date_input( "Select Date Range", value=[min_date, max_date], min_value=min_date, max_value=max_date ) # Filter dataframe by date range if len(date_range) == 2: df = df[ (df["datetime"].dt.date >= date_range[0]) & (df["datetime"].dt.date <= date_range[1]) & (df["datetime"].dt.year == date_range[0].year) # optional if needed ] except Exception as e: st.sidebar.warning(f"Date filtering error: {e}") # Risk level filter if "risk_level" in df.columns: available_risk_levels = df["risk_level"].unique().tolist() risk_levels = st.sidebar.multiselect( "Risk Levels", options=available_risk_levels, default=available_risk_levels ) df = df[df["risk_level"].isin(risk_levels)] # Search filter search_term = st.sidebar.text_input("Search Content", "") if search_term: df = df[df["content"].str.lower().str.contains(search_term.lower(), na=False)] # Display current filter status st.sidebar.info(f"Showing {len(df)} tweets") # ------------------------ # EXECUTIVE SUMMARY # ------------------------ if analysis_type == "Summary": st.header("Summary") # Key metrics in columns col1, col2, col3, col4, col5, col6 = st.columns(6) with col1: st.metric("Total Tweets", len(df)) with col2: drug_related = safe_column_sum(df, "is_drug_related") st.metric("Drug Related", drug_related) with col3: crime_related = safe_column_sum(df, "is_crime_related") st.metric("Crime Related", crime_related) with col4: contact_info = safe_column_sum(df, "has_contact_info") st.metric("Contact Info", contact_info) with col5: st.metric("Unique Users", df["username"].nunique()) with col6: # Or create a new column if needed avg_risk = df["dynamic_risk_score"].mean() if "dynamic_risk_score" in df.columns else 0 st.metric("Avg. Dynamic Risk Score", f"{avg_risk:.2f}") # Risk level analysis if "risk_level" in df.columns: critical_count = len(df[df["risk_level"] == "CRITICAL"]) high_count = len(df[df["risk_level"] == "HIGH"]) if critical_count > 0: st.markdown(f'
CRITICAL ALERT: {critical_count} tweets require immediate attention
', unsafe_allow_html=True) if high_count > 0: st.markdown(f'
HIGH PRIORITY: {high_count} tweets for investigation
', unsafe_allow_html=True) # Risk distribution pie chart col1, col2 = st.columns(2) with col1: risk_order = ["CRITICAL", "HIGH", "MEDIUM", "LOW"] risk_dist = df["risk_level"].value_counts().reindex(risk_order).fillna(0) fig_risk = px.pie(values=risk_dist.values, names=risk_dist.index, title="Risk Level Distribution", color_discrete_map={ "CRITICAL": "#dc3545", "HIGH": "#fd7e14", "MEDIUM": "#ffc107", "LOW": "#28a745" }) st.plotly_chart(fig_risk, use_container_width=True) with col2: # Sentiment analysis if available if "sentiment_compound" in df.columns: sentiment_counts = pd.cut(df["sentiment_compound"], bins=[-1, -0.1, 0.1, 1], labels=["Negative", "Neutral", "Positive"]).value_counts() fig_sentiment = px.bar(x=sentiment_counts.index, y=sentiment_counts.values, title="Sentiment Distribution", color=sentiment_counts.values, color_continuous_scale="RdYlGn") st.plotly_chart(fig_sentiment, use_container_width=True) else: st.info("Sentiment data not available") # Analysis report summary if report_data: st.subheader("Analysis Report Summary") col1, col2 = st.columns(2) with col1: if "summary_statistics" in report_data: st.json(report_data["summary_statistics"]) with col2: if "investigation_priorities" in report_data: st.json(report_data["investigation_priorities"]) # ------------------------ # NEW: VOLUME TRENDS # ------------------------ elif analysis_type == "πŸ“Š Volume Trends": st.header("πŸ“Š Tweet Volume: Daily,Weekly and Hourly Trends") if "datetime" in df.columns and not df["datetime"].isna().all(): # Daily trend if "date" in df.columns: daily_counts = df.groupby("date").size().reset_index(name="count") fig_daily = px.line(daily_counts, x="date", y="count", title="Daily Tweet Volume") st.plotly_chart(fig_daily, use_container_width=True) # Hourly and weekday patterns col1, = st.columns(1) with col1: if "hour" in df.columns: hourly_counts = df.groupby("hour").size() fig_hourly = px.bar(x=hourly_counts.index, y=hourly_counts.values, title="Tweets by Hour of Day") st.plotly_chart(fig_hourly, use_container_width=True) # Weekly trends if "datetime" in df.columns: weekly_fig1, weekly_fig2 = create_weekly_trend_analysis(df) if weekly_fig1 and weekly_fig2: st.subheader("πŸ“… Weekly Trends") col1, col2 = st.columns(2) with col1: st.plotly_chart(weekly_fig1, use_container_width=True) with col2: st.plotly_chart(weekly_fig2, use_container_width=True) else: st.info("Temporal data not available") # CSV Downloads st.subheader("πŸ“„ Download Data") col1, col2 = st.columns(2) with col1: if st.button("πŸ“₯ Download Top Users CSV"): top_users = df.groupby("username").agg( tweet_count=("username", "count"), max_risk=("dynamic_risk_score", "max") ).sort_values("tweet_count", ascending=False).head(20).reset_index() csv = top_users.to_csv(index=False) st.download_button( "Download CSV", csv, "top_users.csv", "text/csv" ) with col2: if st.button("πŸ“₯ Download Top Locations CSV"): if "user_location" in df.columns: top_locations = df.groupby("user_location").agg( tweet_count=("user_location", "count"), max_risk=("dynamic_risk_score", "max") ).sort_values("tweet_count", ascending=False).head(20).reset_index() csv = top_locations.to_csv(index=False) st.download_button( "Download CSV", csv, "top_locations.csv", "text/csv" ) # ------------------------ # NEW: USER BEHAVIOR # ------------------------ elif analysis_type == "🧠 User Behavior": st.header("🧠 User Behavior Analysis") # Top repeat users st.subheader("🧠 Top Repeat Users") user_activity = df["username"].value_counts().head(15) if not user_activity.empty: fig_users = px.bar(x=user_activity.values, y=user_activity.index, orientation='h', title="Top 15 Most Active Users") fig_users.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_users, use_container_width=True) # Show details of top users with st.expander("View Top User Details"): for username, count in user_activity.head(10).items(): user_tweets = df[df["username"] == username] # Safe mode extraction with proper error handling if "risk_level" in user_tweets.columns and not user_tweets["risk_level"].empty: risk_mode = user_tweets["risk_level"].mode() risk_level = risk_mode.iloc[0] if len(risk_mode) > 0 else "Unknown" else: risk_level = "Unknown" if "user_location" in user_tweets.columns and not user_tweets["user_location"].empty: location_mode = user_tweets["user_location"].mode() location = location_mode.iloc[0] if len(location_mode) > 0 else "Unknown" else: location = "Unknown" st.write(f"**@{username}**: {count} tweets | Risk: {risk_level} | Location: {location}") # User engagement patterns if "like_count" in df.columns or "retweet_count" in df.columns: st.subheader("πŸ“Š User Engagement Patterns") col1, col2 = st.columns(2) with col1: if "like_count" in df.columns: avg_likes = df.groupby("username")["like_count"].mean().sort_values(ascending=False).head(15) fig_likes = px.bar(x=avg_likes.values, y=avg_likes.index, orientation='h', title="Users by Average Likes") fig_likes.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_likes, use_container_width=True) with col2: if "retweet_count" in df.columns: avg_retweets = df.groupby("username")["retweet_count"].mean().sort_values(ascending=False).head(15) fig_retweets = px.bar(x=avg_retweets.values, y=avg_retweets.index, orientation='h', title="Users by Average Retweets") fig_retweets.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_retweets, use_container_width=True) # User location overlap analysis if "user_location" in df.columns and "risk_level" in df.columns: st.subheader("πŸ“ User Location vs Risk Analysis") location_risk = df.groupby(["user_location", "risk_level"]).size().reset_index(name="count") location_risk = location_risk[location_risk["user_location"] != ""] if not location_risk.empty: fig_loc_risk = px.bar(location_risk, x="user_location", y="count", color="risk_level", title="Risk Distribution by Location", color_discrete_map={ "CRITICAL": "#dc3545", "HIGH": "#fd7e14", "MEDIUM": "#ffc107", "LOW": "#28a745" }) fig_loc_risk.update_xaxes(tickangle=45) st.plotly_chart(fig_loc_risk, use_container_width=True) # ------------------------ # NEW: HEATMAPS # ------------------------ elif analysis_type == "πŸ“ Heatmaps": st.header("πŸ“ Time-Based Heatmaps") # ------------------- # Day-Hour heatmap # ------------------- if "day_of_week" in df.columns and "hour" in df.columns: # Ensure proper order day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] df['day_of_week'] = pd.Categorical(df['day_of_week'], categories=day_order, ordered=True) st.subheader("πŸ”₯ Day vs Hour Activity Heatmap") heatmap_fig = create_heatmap_chart(df, "hour", "day_of_week", "Tweet Activity: Day vs Hour") if heatmap_fig: st.plotly_chart(heatmap_fig, use_container_width=True) # Risk level heatmap if "risk_level" in df.columns and "hour" in df.columns: st.subheader("⚠️ Risk Level vs Hour Heatmap") risk_heatmap = create_heatmap_chart(df, "hour", "risk_level", "Risk Level Distribution by Hour") if risk_heatmap: st.plotly_chart(risk_heatmap, use_container_width=True) # ------------------- # Top Locations Heatmap # ------------------- if "user_location" in df.columns and "hour" in df.columns: st.subheader("πŸ“ Location vs Hour Heatmap (Top Locations)") # Add slider in sidebar TOP_N_LOCATIONS = st.sidebar.slider("Top N Locations for Heatmaps", 5, 30, 10) # Filter top N locations top_locations = df["user_location"].value_counts().head(TOP_N_LOCATIONS).index df_top_loc = df[df["user_location"].isin(top_locations)] if not df_top_loc.empty: loc_heatmap = create_heatmap_chart(df_top_loc, "hour", "user_location", f"Top {TOP_N_LOCATIONS} Locations Activity by Hour") if loc_heatmap: st.plotly_chart(loc_heatmap, use_container_width=True) # Tweet location heatmap (if geographic coordinates available) if "latitude" in df.columns and "longitude" in df.columns: st.subheader("πŸ—ΊοΈ Geographic Tweet Distribution") valid_coords = df.dropna(subset=["latitude", "longitude"]) if not valid_coords.empty: fig_map = px.scatter_mapbox( valid_coords, lat="latitude", lon="longitude", color="risk_level" if "risk_level" in df.columns else None, size_max=15, zoom=7, mapbox_style="open-street-map", title="Geographic Distribution of Tweets" ) st.plotly_chart(fig_map, use_container_width=True) else: st.info("No geographic coordinates available for mapping") # ------------------------ # NEW: RISK PATTERNS # ------------------------ # High-risk users analysis elif analysis_type == "⚠️ Risk Patterns": st.header("⚠️ Risk Patterns and High-Risk Analysis") # High-risk users analysis if "risk_level" in df.columns: st.subheader("🚨 High-Risk Users") user_risk_df = calculate_user_risk(df) high_risk_users = user_risk_df[user_risk_df["risk_score"] > 0].sort_values("risk_score", ascending=False).head(20) if not high_risk_users.empty: fig_risk_users = px.bar(high_risk_users, x="risk_score", y="username", orientation='h', color="tweet_count", color_continuous_scale="Reds") fig_risk_users.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_risk_users, use_container_width=True) # Optional: show details with st.expander("High-Risk User Details"): for _, row in high_risk_users.iterrows(): user_data = df[df["username"] == row["username"]] critical_count = (user_data["risk_level"] == "CRITICAL").sum() high_count = (user_data["risk_level"] == "HIGH").sum() st.write(f"**@{row['username']}**: Risk Score: {row['risk_score']} | Critical: {critical_count} | High: {high_count} | Total Tweets: {row['tweet_count']}") # Risk overlap analysis if "is_drug_related" in df.columns and "is_crime_related" in df.columns: st.subheader("πŸ”„ Drug-Crime Overlap Analysis") # Create overlap categories df_overlap = df.copy() df_overlap["category"] = "Other" df_overlap.loc[df_overlap["is_drug_related"] == 1, "category"] = "Drug Only" df_overlap.loc[df_overlap["is_crime_related"] == 1, "category"] = "Crime Only" df_overlap.loc[(df_overlap["is_drug_related"] == 1) & (df_overlap["is_crime_related"] == 1), "category"] = "Drug + Crime" overlap_counts = df_overlap["category"].value_counts() fig_overlap = px.pie(values=overlap_counts.values, names=overlap_counts.index, title="Drug-Crime Content Overlap", color_discrete_map={ "Drug + Crime": "#dc3545", "Drug Only": "#fd7e14", "Crime Only": "#ffc107", "Other": "#28a745" }) st.plotly_chart(fig_overlap, use_container_width=True) # Show high-overlap users high_overlap_users = df_overlap[df_overlap["category"] == "Drug + Crime"]["username"].value_counts().head(10) if not high_overlap_users.empty: st.write("**Users with most Drug+Crime tweets:**") for username, count in high_overlap_users.items(): st.write(f"- @{username}: {count} tweets") # Risk progression over time if "datetime" in df.columns and "risk_level" in df.columns: st.subheader("πŸ“ˆ Risk Level Trends Over Time") # Daily risk aggregation df["date_str"] = df["datetime"].dt.strftime("%Y-%m-%d") risk_time = df.groupby(["date_str", "risk_level"]).size().reset_index(name="count") fig_risk_time = px.line(risk_time, x="date_str", y="count", color="risk_level", title="Risk Levels Trend Over Time", color_discrete_map={ "CRITICAL": "#dc3545", "HIGH": "#fd7e14", "MEDIUM": "#ffc107", "LOW": "#28a745" }) fig_risk_time.update_xaxes(tickangle=45) st.plotly_chart(fig_risk_time, use_container_width=True) # ------------------------ # RISK ANALYSIS (Enhanced) # ------------------------ elif analysis_type == "Risk Analysis": st.header("Risk Analysis") # High-risk tweets table if high_priority_df is not None and not high_priority_df.empty: st.subheader("High Priority Tweets") # Risk level tabs risk_tab1, risk_tab2 = st.tabs(["CRITICAL", "HIGH"]) with risk_tab1: critical_tweets = high_priority_df[high_priority_df["risk_level"] == "CRITICAL"] if not critical_tweets.empty: for idx, tweet in critical_tweets.head(10).iterrows(): with st.expander(f"CRITICAL: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"): st.write(f"**Content:** {tweet['content']}") st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}") if 'tweet_url' in tweet: st.write(f"**URL:** {tweet['tweet_url']}") else: st.info("No critical risk tweets in current filter") with risk_tab2: high_tweets = high_priority_df[high_priority_df["risk_level"] == "HIGH"] if not high_tweets.empty: for idx, tweet in high_tweets.head(10).iterrows(): with st.expander(f"HIGH: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"): st.write(f"**Content:** {tweet['content']}") st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}") if 'tweet_url' in tweet: st.write(f"**URL:** {tweet['tweet_url']}") else: st.info("No high risk tweets in current filter") else: st.info("No high priority data available") # Risk score distribution if "drug_score" in df.columns and "crime_score" in df.columns: fig_scores = make_subplots(rows=1, cols=2, subplot_titles=("Drug Score Distribution", "Crime Score Distribution")) fig_scores.add_trace(go.Histogram(x=df["drug_score"], name="Drug Score", nbinsx=20), row=1, col=1) fig_scores.add_trace(go.Histogram(x=df["crime_score"], name="Crime Score", nbinsx=20), row=1, col=2) fig_scores.update_layout(title="Risk Score Distributions") st.plotly_chart(fig_scores, use_container_width=True) else: st.info("Risk score data not available") # ------------------------ # Actionable Insights # ------------------------ elif analysis_type == "Actionable Insights": st.header("Actionable Insights") # Contact information tweets if contact_df is not None and not contact_df.empty: st.subheader("Tweets with Contact Information") st.markdown('
These tweets contain phone numbers or contact details - HIGH PRIORITY for investigation
', unsafe_allow_html=True) for idx, tweet in contact_df.head(20).iterrows(): with st.expander(f"Contact Info: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"): st.write(f"**Content:** {tweet['content']}") st.write(f"**Phone Numbers:** {tweet.get('phone_numbers', 'Not extracted')}") st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") st.write(f"**Risk Level:** {tweet.get('risk_level', 'Unknown')}") if 'tweet_url' in tweet: st.write(f"**URL:** {tweet['tweet_url']}") else: st.info("No tweets with contact information found") # Bulk operation indicators st.subheader("Bulk Operation Indicators") # Sidebar input BULK_KEYWORDS = st.sidebar.text_area("Bulk Operation Keywords (comma-separated)", "kg,gram,bulk,wholesale,kilos,ounce,pound").split(",") # In code bulk_pattern = "|".join([kw.strip() for kw in BULK_KEYWORDS]) bulk_regex = re.compile("|".join([kw.strip() for kw in BULK_KEYWORDS]), re.IGNORECASE) bulk_tweets = df[df["content"].str.contains(bulk_regex, na=False)] if not bulk_tweets.empty: st.write(f"Found {len(bulk_tweets)} tweets mentioning bulk quantities") for idx, tweet in bulk_tweets.head(10).iterrows(): with st.expander(f"Bulk: @{tweet['username']} - Risk: {tweet.get('risk_level', 'Unknown')}"): st.write(f"**Content:** {tweet['content']}") st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") if 'tweet_url' in tweet: st.write(f"**URL:** {tweet['tweet_url']}") else: st.info("No bulk operation indicators found") # High activity users st.subheader("High Activity Users") user_activity = df["username"].value_counts().head(15) if not user_activity.empty: fig_users = px.bar(x=user_activity.values, y=user_activity.index, orientation='h', title="Top 15 Most Active Users") fig_users.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_users, use_container_width=True) # ------------------------ # NEW: PREDICTIVE ANALYTICS # ------------------------ elif analysis_type == "πŸ“ˆ Predictive Analytics": st.header("πŸ“ˆ Predictive Analytics & Trends") st.subheader("πŸ“Š Activity Forecast") if "datetime" in df.columns and len(df) >= 7: # Daily activity trend daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count") daily_activity.columns = ["date", "count"] daily_activity["date"] = pd.to_datetime(daily_activity["date"]) # Calculate moving average daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean() daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean() # Create forecast visualization fig_forecast = go.Figure() fig_forecast.add_trace(go.Scatter( x=daily_activity["date"], y=daily_activity["count"], name="Actual Activity", mode="lines+markers", line=dict(color="#1f77b4") )) fig_forecast.add_trace(go.Scatter( x=daily_activity["date"], y=daily_activity["7_day_ma"], name="7-Day Moving Average", mode="lines", line=dict(color="#ff7f0e", dash="dash") )) fig_forecast.update_layout( title="Tweet Activity Trend & Forecast", xaxis_title="Date", yaxis_title="Number of Tweets", hovermode="x unified" ) st.plotly_chart(fig_forecast, use_container_width=True) # Trend analysis col1, col2, col3 = st.columns(3) with col1: recent_avg = daily_activity["count"].tail(7).mean() st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day") with col2: if len(daily_activity) >= 14: prev_avg = daily_activity["count"].tail(14).head(7).mean() change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0 st.metric("Week-over-Week Change", f"{change:+.1f}%") with col3: peak_day = daily_activity.loc[daily_activity["count"].idxmax()] st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d")) # User activity prediction st.subheader("πŸ‘€ High-Risk User Patterns") if "username" in df.columns and "risk_level" in df.columns: user_risk_scores = df.groupby("username").agg({ "tweet_id": "count", "risk_level": lambda x: (x == "CRITICAL").sum() * 2 + (x == "HIGH").sum() }).reset_index() user_risk_scores.columns = ["username", "tweet_count", "risk_score"] # Identify escalating users escalating_users = user_risk_scores[ (user_risk_scores["risk_score"] > 0) & (user_risk_scores["tweet_count"] >= 3) ].sort_values("risk_score", ascending=False).head(15) if not escalating_users.empty: fig_escalating = px.scatter( escalating_users, x="tweet_count", y="risk_score", size="risk_score", hover_data=["username"], title="High-Risk User Activity Matrix", labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"} ) st.plotly_chart(fig_escalating, use_container_width=True) st.write("**Users to Monitor:**") for _, user in escalating_users.head(10).iterrows(): st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}") # ------------------------ # NEW: NETWORK ANALYSIS # ------------------------ elif analysis_type == "🌐 Network Analysis": st.header("🌐 Network Analysis") st.subheader("πŸ‘₯ User Connection Analysis") # Mentions network if "mentions" in df.columns: st.write("### User Mention Network") mention_pairs = [] for _, row in df.iterrows(): if pd.notna(row.get("mentions")) and row["mentions"]: mentions = str(row["mentions"]).split() for mention in mentions: mention_clean = mention.strip("@") if mention_clean: mention_pairs.append({ "from": row["username"], "to": mention_clean, "risk_level": row.get("risk_level", "UNKNOWN") }) if mention_pairs: mention_df = pd.DataFrame(mention_pairs) # Top mentioned users top_mentioned = mention_df["to"].value_counts().head(15) fig_mentioned = px.bar( x=top_mentioned.values, y=top_mentioned.index, orientation="h", title="Most Mentioned Users", labels={"x": "Times Mentioned", "y": "Username"} ) fig_mentioned.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_mentioned, use_container_width=True) # Connection strength connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions") strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False) if not strong_connections.empty: st.write("### πŸ”— Strong Connections (2+ mentions)") for _, conn in strong_connections.head(20).iterrows(): st.write(f"- @{conn['from']} β†’ @{conn['to']}: {conn['mentions']} times") else: st.info("No mention data available") # Location clustering st.subheader("πŸ“ Location-Based Clustering") if "user_location" in df.columns: location_users = df.groupby("user_location").agg({ "username": lambda x: list(x.unique()), "tweet_id": "count", "risk_level": lambda x: (x == "CRITICAL").sum() if "risk_level" in df.columns else 0 }).reset_index() location_users.columns = ["location", "users", "tweet_count", "critical_count"] location_users = location_users[location_users["location"] != ""] location_users = location_users[location_users["tweet_count"] >= 3] location_users["user_count"] = location_users["users"].apply(len) if not location_users.empty: fig_clusters = px.scatter( location_users, x="tweet_count", y="user_count", size="critical_count", hover_data=["location"], title="Location Clusters (Activity vs Users)", labels={ "tweet_count": "Total Tweets", "user_count": "Unique Users", "critical_count": "Critical Tweets" } ) st.plotly_chart(fig_clusters, use_container_width=True) # High-density locations high_density = location_users.sort_values("user_count", ascending=False).head(10) st.write("### πŸ™οΈ High-Density Locations") for _, loc in high_density.iterrows(): with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"): st.write(f"**Critical tweets:** {loc['critical_count']}") st.write(f"**Users:** {', '.join(['@' + u for u in loc['users'][:10]])}") if len(loc['users']) > 10: st.write(f"... and {len(loc['users']) - 10} more") # Co-occurrence analysis st.subheader("πŸ”— Keyword Co-occurrence") if "content" in df.columns: # Define drug/crime keywords drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"] crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"] cooccurrence = [] for _, row in df.iterrows(): content_lower = row["content"].lower() found_drug = [kw for kw in drug_keywords if kw in content_lower] found_crime = [kw for kw in crime_keywords if kw in content_lower] for drug in found_drug: for crime in found_crime: cooccurrence.append({"drug_keyword": drug, "crime_keyword": crime}) if cooccurrence: cooc_df = pd.DataFrame(cooccurrence) cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count") cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20) if not cooc_counts.empty: fig_cooc = px.bar( cooc_counts, x="count", y="drug_keyword", color="crime_keyword", title="Drug-Crime Keyword Co-occurrence", orientation="h" ) st.plotly_chart(fig_cooc, use_container_width=True) else: st.info("No significant keyword co-occurrences found") # Temporal clustering st.subheader("⏰ Temporal Activity Clusters") if "datetime" in df.columns and "username" in df.columns: df_copy = df.copy() df_copy["hour"] = df_copy["datetime"].dt.hour df_copy["day_of_week"] = df_copy["datetime"].dt.day_name() # Find users active at unusual hours (late night/early morning) unusual_hours = [0, 1, 2, 3, 4, 5] night_activity = df_copy[df_copy["hour"].isin(unusual_hours)] if len(night_activity) > 0: night_users = night_activity.groupby("username").size().reset_index(name="night_tweets") night_users = night_users[night_users["night_tweets"] >= 3].sort_values("night_tweets", ascending=False) if not night_users.empty: st.write(f"### πŸŒ™ Users Active During Late Night (12 AM - 6 AM)") fig_night = px.bar( night_users.head(15), x="night_tweets", y="username", orientation="h", title="Top Users with Late Night Activity" ) fig_night.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_night, use_container_width=True) st.info("⚠️ Late night activity may indicate suspicious behavior patterns") # ------------------------ # GEOGRAPHIC ANALYSIS (Enhanced) # ------------------------ elif analysis_type == "Geographic Analysis": st.header("Geographic Analysis") # Location distribution locations = df["user_location"].value_counts().head(20) locations = locations[locations.index != ""] # Remove empty locations if not locations.empty: fig_locations = px.bar(x=locations.values, y=locations.index, orientation='h', title="Top 20 User Locations") fig_locations.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_locations, use_container_width=True) else: st.info("No location data available") # Karnataka relevance score distribution if "kar_score" in df.columns: fig_kar = px.histogram(df, x="kar_score", title="Karnataka Relevance Score Distribution") st.plotly_chart(fig_kar, use_container_width=True) # Location-based risk analysis if "risk_level" in df.columns and "user_location" in df.columns: location_risk = df.groupby("user_location").agg({ "risk_level": lambda x: (x == "HIGH").sum() + (x == "CRITICAL").sum() * 2, "username": "count" }).reset_index() location_risk = location_risk[location_risk["username"] >= 3] # Only locations with 3+ tweets location_risk = location_risk.sort_values("risk_level", ascending=False).head(15) if not location_risk.empty: fig_loc_risk = px.bar(location_risk, x="risk_level", y="user_location", orientation='h', title="High-Risk Locations (3+ tweets)") fig_loc_risk.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_loc_risk, use_container_width=True) # ------------------------ # USER ANALYSIS (Enhanced) # ------------------------ elif analysis_type == "User Analysis": st.header("User Analysis") # User metrics col1, col2, col3 = st.columns(3) with col1: st.metric("Unique Users", df["username"].nunique()) with col2: verified_count = safe_column_sum(df, "user_verified") st.metric("Verified Users", verified_count) with col3: avg_followers = safe_column_mean(df, "user_followers") st.metric("Avg Followers", f"{avg_followers:,.0f}") # Top users by followers if "user_followers" in df.columns: top_followers = df.nlargest(15, "user_followers")[["username", "user_followers"]] if "user_verified" in df.columns: top_followers = df.nlargest(15, "user_followers")[["username", "user_followers", "user_verified"]] fig_followers = px.bar(top_followers, x="user_followers", y="username", color="user_verified" if "user_verified" in top_followers.columns else None, orientation='h', title="Users with Most Followers") fig_followers.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_followers, use_container_width=True) # User engagement vs risk (fixed aggregation) if "risk_level" in df.columns: user_metrics = [] for username in df["username"].unique(): user_data = df[df["username"] == username] risk_score = (user_data["risk_level"] == "HIGH").sum() + (user_data["risk_level"] == "CRITICAL").sum() * 2 user_metrics.append({ "username": username, "risk_score": risk_score, "avg_likes": safe_column_mean(user_data, "like_count"), "avg_retweets": safe_column_mean(user_data, "retweet_count"), "tweet_count": len(user_data) }) user_risk_df = pd.DataFrame(user_metrics) multi_tweet_users = user_risk_df[user_risk_df["tweet_count"] >= 3] if not multi_tweet_users.empty: fig_user_risk = px.scatter(multi_tweet_users, x="avg_likes", y="risk_score", size="tweet_count", hover_data=["username"], title="User Risk vs Engagement (3+ tweets)") st.plotly_chart(fig_user_risk, use_container_width=True) # ------------------------ # CONTENT ANALYSIS (Enhanced) # ------------------------ elif analysis_type == "Content Analysis": st.header("Content Analysis") # Hashtag analysis if "hashtags" in df.columns: all_hashtags = df["hashtags"].dropna().str.split().explode() hashtag_counts = all_hashtags.value_counts().head(20) if not hashtag_counts.empty: fig_hashtags = px.bar(x=hashtag_counts.values, y=hashtag_counts.index, orientation='h', title="Top 20 Hashtags") fig_hashtags.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_hashtags, use_container_width=True) # Sentiment vs Risk correlation col1, col2 = st.columns(2) with col1: if "sentiment_compound" in df.columns and "risk_level" in df.columns: fig_sentiment_risk = px.box(df, x="risk_level", y="sentiment_compound", title="Sentiment by Risk Level") st.plotly_chart(fig_sentiment_risk, use_container_width=True) else: st.info("Sentiment analysis data not available") with col2: if "drug_score" in df.columns and "crime_score" in df.columns: # Drug score vs Crime score correlation fig_scores_corr = px.scatter(df, x="drug_score", y="crime_score", color="risk_level" if "risk_level" in df.columns else None, title="Drug Score vs Crime Score", color_discrete_map={ "CRITICAL": "#dc3545", "HIGH": "#fd7e14", "MEDIUM": "#ffc107", "LOW": "#28a745" }) st.plotly_chart(fig_scores_corr, use_container_width=True) else: st.info("Score correlation data not available") # Content length analysis if "content" in df.columns: df_copy = df.copy() df_copy["content_length"] = df_copy["content"].str.len() if "risk_level" in df.columns: fig_length = px.histogram(df_copy, x="content_length", color="risk_level", title="Tweet Length Distribution by Risk Level", color_discrete_map={ "CRITICAL": "#dc3545", "HIGH": "#fd7e14", "MEDIUM": "#ffc107", "LOW": "#28a745" }) else: fig_length = px.histogram(df_copy, x="content_length", title="Tweet Length Distribution") st.plotly_chart(fig_length, use_container_width=True) # Word frequency analysis if "content" in df.columns: st.subheader("Content Word Analysis") filtered_words = get_filtered_words(df["content"]) if filtered_words: word_freq = pd.Series(filtered_words).value_counts().head(30) fig_words = px.bar(x=word_freq.values, y=word_freq.index, orientation='h', title="Top 30 Most Frequent Words") fig_words.update_layout(yaxis=dict(autorange="reversed")) st.plotly_chart(fig_words, use_container_width=True) else: st.info("No content words available after filtering") # ------------------------ # Footer with Data Information & Export # ------------------------ st.markdown("---") # Data summary footer col1, col2, col3, col4 = st.columns(4) with col1: st.info(f"Showing {len(df)} tweets") with col2: if "risk_level" in df.columns: high_risk_count = len(df[df["risk_level"].isin(["HIGH", "CRITICAL"])]) st.info(f"High Risk: {high_risk_count} tweets") else: st.info("Risk Level: Not available") # Enhanced export functionality st.sidebar.header("Data Export") # Export current filtered data if st.sidebar.button("Download Current View"): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") csv = df.to_csv(index=False) st.sidebar.download_button( label="Download as CSV", data=csv, file_name=f"drug_crime_analysis_{analysis_type.lower().replace(' ', '_')}_{timestamp}.csv", mime="text/csv" ) # Export summary report if report_data: if st.sidebar.button("Download Analysis Report"): report_json = json.dumps(report_data, indent=2, default=str) st.sidebar.download_button( label="Download Report (JSON)", data=report_json, file_name=f"analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", mime="application/json" ) # Quick stats in sidebar if len(df) > 0: st.sidebar.subheader("Quick Stats") if "risk_level" in df.columns: risk_counts = df["risk_level"].value_counts() for risk, count in risk_counts.items(): percentage = (count / len(df)) * 100 st.sidebar.text(f"{risk}: {count} ({percentage:.1f}%)") # Top location if "user_location" in df.columns: top_location = df["user_location"].value_counts().head(1) if not top_location.empty and top_location.index[0] != "": st.sidebar.text(f"Top Location: {top_location.index[0]} ({top_location.iloc[0]})") # Date range if "datetime" in df.columns and not df["datetime"].isna().all(): try: days_span = (df["datetime"].max() - df["datetime"].min()).days st.sidebar.text(f"Data Span: {days_span} days") except: pass # Debug information (collapsible) with st.sidebar.expander("Debug Info"): st.write("Available columns:") st.write(list(df.columns)) st.write(f"DataFrame shape: {df.shape}") st.write(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB") if report_data: st.write("Report data available: Yes") else: st.write("Report data available: No") if high_priority_df is not None: st.write(f"High priority tweets: {len(high_priority_df)}") else: st.write("High priority tweets: Not available") if contact_df is not None: st.write(f"Contact info tweets: {len(contact_df)}") else: st.write("Contact info tweets: Not available") # Footer st.markdown("---") st.markdown( """

Twitter Drug Crime Monitoring Dashboard

Dashboard last updated: {}

""".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")), unsafe_allow_html=True )