Spaces:
Sleeping
Sleeping
# modify_app.py
# Streamlit dashboard for monitoring drug-crime-related Twitter activity.
import streamlit as st
import pandas as pd
import os
import json
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import nltk
from nltk.corpus import stopwords  # import first so the LookupError probe below works

# Ensure the NLTK stopwords corpus is present; fetch it on demand the first
# time this environment runs the app.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Now you can safely use it
english_stopwords = stopwords.words('english')

import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
from alerts import compute_dynamic_risk,assign_dynamic_risk_level
from evaluation import evaluate_model

# Run evaluation on the scraped CSV folder.
# NOTE(review): this executes at import time, i.e. on every Streamlit rerun —
# confirm that re-running the evaluation each time is intended.
evaluate_model("drug_analysis_data_3months")

import re

st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide")
# Custom CSS for better styling: defines the classes used by the HTML
# snippets rendered below (.main-header banner, alert/metric boxes).
st.markdown("""
<style>
.main-header {
    background: linear-gradient(90deg, #1e3c72, #2a5298);
    color: white;
    padding: 1rem;
    border-radius: 10px;
    text-align: center;
    margin-bottom: 2rem;
}
.metric-card {
    background: #f8f9fa;
    padding: 1rem;
    border-radius: 8px;
    border-left: 4px solid #007bff;
}
.critical-alert {
    background: #f8d7da;
    border: 1px solid #f5c6cb;
    color: #721c24;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
.high-priority {
    background: #fff3cd;
    border: 1px solid #ffeaa7;
    color: #856404;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
.warning-box {
    background: #d4edda;
    border: 1px solid #c3e6cb;
    color: #155724;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)
# Configuration
DASHBOARD_CONFIG = {
    # Directories searched (in order) for exported CSV/JSON artefacts
    'data_dirs': ['drug_analysis_data_3months', 'data', 'output', '.'],
    # Auto-refresh period in seconds (see sidebar auto-refresh checkbox)
    'refresh_interval': 30,
    # Cap on the number of tweets rendered in any single list view
    'max_display_tweets': 50,
    # Default plotly chart height in pixels
    'chart_height': 400
}

# Main header banner (styled by the .main-header CSS class above)
st.markdown('<div class="main-header"><h1>Twitter Drug Crime Monitoring Dashboard</h1><p>Real-time Twitter Analysis for Drug Crime Detection</p></div>', unsafe_allow_html=True)
# ------------------------
# Enhanced Data Loading Functions
# ------------------------
def parse_dates_flexible(df):
    """Parse the 'datetime' column, trying several known formats.

    Tries each explicit format in turn and keeps the first one that parses
    at least one value; falls back to pandas' generic parser, and finally
    fills any remaining NaT with the current timestamp so downstream `.dt`
    accessors never see an all-NaT column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain a 'datetime' column (modified in place).

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'datetime' coerced to datetime64 when present.
    """
    if "datetime" not in df.columns:
        return df

    date_formats = [
        "%d-%m-%Y %H:%M:%S",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M:%S",
    ]
    original_datetime = df["datetime"].copy()
    for fmt in date_formats:
        try:
            df["datetime"] = pd.to_datetime(original_datetime, format=fmt, errors="coerce")
            if not df["datetime"].isna().all():
                break
        except (ValueError, TypeError):
            # Was a bare `except:` — narrow to parse errors so real problems
            # (KeyboardInterrupt, programming mistakes) are not swallowed.
            continue

    # If no explicit format matched anything, let pandas infer per value.
    if df["datetime"].isna().all():
        df["datetime"] = pd.to_datetime(original_datetime, errors="coerce")

    # Fill any remaining NaT values with the current time.
    df["datetime"] = df["datetime"].fillna(pd.Timestamp.now())
    return df
def validate_dataframe(df):
    """Check that *df* is usable by the dashboard.

    A frame is valid when it is non-empty and carries both the 'username'
    and 'content' columns.

    Returns
    -------
    tuple[bool, str]
        (is_valid, human-readable message).
    """
    if df is None or df.empty:
        return False, "DataFrame is empty"
    missing_columns = [col for col in ('username', 'content') if col not in df.columns]
    if missing_columns:
        return False, f"Missing required columns: {missing_columns}"
    return True, "DataFrame is valid"
def load_data():
    """Load the most recent data with robust error handling.

    Scans each directory in DASHBOARD_CONFIG['data_dirs'] in order, picks the
    most recently created matching CSV, parses dates, derives calendar helper
    columns, and loads the newest ANALYSIS_REPORT_*.json found alongside it.

    Returns:
        (df, report_data): loaded DataFrame and parsed report dict, or
        (None, None) when no usable CSV was found in any directory.
    """
    start_time = time.time()
    for data_dir in DASHBOARD_CONFIG['data_dirs']:
        if not os.path.exists(data_dir):
            continue
        try:
            # Look for main dataset files with flexible naming
            csv_files = []
            for f in os.listdir(data_dir):
                if f.endswith(".csv") and any(keyword in f.lower() for keyword in
                        ["karnataka_drug_tweets", "drug_tweets", "drug_analysis", "drug_crime"]):
                    csv_files.append(f)
            if not csv_files:
                # Fallback to any CSV file
                csv_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]
            if not csv_files:
                continue
            # Get the most recent file by creation time
            # NOTE(review): on Linux getctime is metadata-change time, not
            # creation time — acceptable proxy for "latest export"; confirm.
            latest_file = max(csv_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
            file_path = os.path.join(data_dir, latest_file)
            # Load with error handling
            df = pd.read_csv(file_path, encoding='utf-8')
            if df.empty:
                continue
            # Enhanced date parsing
            df = parse_dates_flexible(df)
            # Add derived calendar columns if missing so later views can rely on them
            if "datetime" in df.columns:
                if "date" not in df.columns:
                    df["date"] = df["datetime"].dt.date
                if "hour" not in df.columns:
                    df["hour"] = df["datetime"].dt.hour
                if "day_of_week" not in df.columns:
                    df["day_of_week"] = df["datetime"].dt.day_name()
                if "day" not in df.columns:
                    df["day"] = df["datetime"].dt.day
            # Load the newest analysis report if available (optional companion file)
            report_files = [f for f in os.listdir(data_dir)
                            if f.startswith("ANALYSIS_REPORT_") and f.endswith(".json")]
            report_data = None
            if report_files:
                latest_report = max(report_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
                try:
                    with open(os.path.join(data_dir, latest_report), 'r', encoding='utf-8') as f:
                        report_data = json.load(f)
                except Exception as e:
                    # A broken report must not block loading the main dataset
                    st.sidebar.warning(f"Could not load report: {e}")
                    report_data = None
            load_time = time.time() - start_time
            # Display load metrics in sidebar
            st.sidebar.success(f"Data loaded successfully")
            st.sidebar.metric("Load Time", f"{load_time:.2f}s")
            st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
            st.sidebar.info(f"Source: {latest_file}")
            return df, report_data
        except Exception as e:
            # Any failure in this directory falls through to the next candidate
            st.sidebar.warning(f"Failed to load from {data_dir}: {str(e)}")
            continue
    return None, None
def load_priority_data():
    """Load high priority and contact info datasets with fallbacks.

    Looks only in the primary data directory; each dataset is the most
    recently created CSV whose filename contains the matching tag.

    Returns
    -------
    tuple[pandas.DataFrame | None, pandas.DataFrame | None]
        (high_priority_df, contact_df); either may be None when missing
        or unreadable.
    """
    data_dir = DASHBOARD_CONFIG['data_dirs'][0]  # primary data directory
    if not os.path.exists(data_dir):
        return None, None

    def _newest_matching(tag):
        # Newest (by creation time) CSV in data_dir whose name contains *tag*.
        candidates = [name for name in os.listdir(data_dir)
                      if tag in name and name.endswith(".csv")]
        if not candidates:
            return None
        newest = max(candidates,
                     key=lambda name: os.path.getctime(os.path.join(data_dir, name)))
        frame = pd.read_csv(os.path.join(data_dir, newest))
        return parse_dates_flexible(frame)

    high_priority_df = None
    contact_df = None
    try:
        high_priority_df = _newest_matching("HIGH_PRIORITY")
    except Exception as e:
        st.sidebar.warning(f"Could not load high priority data: {e}")
    try:
        contact_df = _newest_matching("CONTACT_INFO")
    except Exception as e:
        st.sidebar.warning(f"Could not load contact info data: {e}")
    return high_priority_df, contact_df
def safe_column_access(df, column, default=0):
    """Return df[column], or a Series filled with *default* when absent.

    The fallback Series reuses df's index so it aligns with the frame.
    """
    if column not in df.columns:
        return pd.Series([default] * len(df), index=df.index)
    return df[column]
def safe_column_sum(df, column):
    """Sum *column* in *df*, returning 0 when the column does not exist."""
    return df[column].sum() if column in df.columns else 0
def safe_column_mean(df, column):
    """Mean of *column* in *df*; 0 when the column is missing or df is empty."""
    if column not in df.columns or len(df) == 0:
        return 0
    return df[column].mean()
# ----------------- Helper: Calculate User Risk -----------------
def calculate_user_risk(df):
    """
    Calculate a risk score per user: CRITICAL = 2 points, HIGH = 1 point.

    Vectorised with a single groupby instead of one boolean-filter pass per
    user (the original looped over df["username"].unique(), which is
    O(users * rows) on large exports).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'username' and 'risk_level' columns; otherwise an
        empty DataFrame is returned.

    Returns
    -------
    pandas.DataFrame
        Columns: username, risk_score, tweet_count — one row per user, in
        first-appearance order.
    """
    if "username" not in df.columns or "risk_level" not in df.columns:
        return pd.DataFrame()
    # Points per tweet: CRITICAL=2, HIGH=1, anything else 0.
    points = df["risk_level"].map({"CRITICAL": 2, "HIGH": 1}).fillna(0).astype(int)
    # sort=False keeps first-appearance order, matching unique()'s order.
    return (
        df.assign(_points=points)
          .groupby("username", sort=False)["_points"]
          .agg(risk_score="sum", tweet_count="size")
          .reset_index()
    )
# ----------------- Helper: Filter Words -----------------
def get_filtered_words(text_series):
    """
    Return filtered words from a Series of text, removing English
    stopwords and words of <=2 characters.

    The stopword set is built once and cached on the function object —
    the original rebuilt set(stopwords.words('english')) on every call,
    which is wasteful on each Streamlit rerun.

    Parameters
    ----------
    text_series : pandas.Series
        Series of text values (non-strings are str()-coerced).

    Returns
    -------
    list[str]
        Lowercased word tokens that survive both filters.
    """
    if not hasattr(get_filtered_words, "_stop_words"):
        get_filtered_words._stop_words = set(stopwords.words('english'))
    stop_words_set = get_filtered_words._stop_words
    all_text = " ".join(text_series.astype(str))
    words = re.findall(r'\b\w+\b', all_text.lower())
    return [w for w in words if w not in stop_words_set and len(w) > 2]
def create_heatmap_chart(df, x_col, y_col, title="Heatmap"):
    """Build a plotly count heatmap of *y_col* versus *x_col*.

    Returns None when either column is missing from *df*; otherwise a
    go.Figure with a Blues-scaled Heatmap of co-occurrence counts
    (missing combinations shown as 0).
    """
    if x_col not in df.columns or y_col not in df.columns:
        return None
    # Pivot the pairwise counts into a y-by-x grid for the heatmap.
    counts = (
        df.groupby([x_col, y_col]).size().reset_index(name='count')
          .pivot(index=y_col, columns=x_col, values='count')
          .fillna(0)
    )
    figure = go.Figure(data=go.Heatmap(
        z=counts.values,
        x=counts.columns,
        y=counts.index,
        colorscale='Blues',
        hoverongaps=False,
    ))
    figure.update_layout(
        title=title,
        xaxis_title=x_col,
        yaxis_title=y_col,
        height=400,
    )
    return figure
def create_weekly_trend_analysis(df):
    """Create weekly trend figures.

    Works on a copy so the caller's frame does not silently gain
    'week'/'weekday' columns — the original mutated *df* in place as a
    side effect, which leaked helper columns into every later view.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a parsed 'datetime' column.

    Returns
    -------
    tuple
        (weekly line figure, weekday bar figure), or (None, None) when
        the 'datetime' column is absent.
    """
    if "datetime" not in df.columns:
        return None, None

    data = df.copy()
    data['week'] = data['datetime'].dt.isocalendar().week
    data['weekday'] = data['datetime'].dt.day_name()
    weekly_counts = data.groupby('week').size().reset_index(name='count')
    weekday_counts = data.groupby('weekday').size().reset_index(name='count')

    # Present weekdays in calendar order rather than alphabetical.
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts['weekday'] = pd.Categorical(weekday_counts['weekday'], categories=weekday_order, ordered=True)
    weekday_counts = weekday_counts.sort_values('weekday')

    fig1 = px.line(weekly_counts, x='week', y='count', title="Weekly Tweet Trends")
    fig2 = px.bar(weekday_counts, x='weekday', y='count', title="Tweets by Weekday")
    return fig1, fig2
# ------------------------
# Load Data
# ------------------------
df, report_data = load_data()

# --- Compute dynamic risk for all tweets ---
if df is not None and not df.empty:
    # NOTE(review): this re-import is redundant — both names are already
    # imported at the top of the file; harmless, but could be removed.
    from alerts import compute_dynamic_risk, assign_dynamic_risk_level
    # Add per-tweet dynamic risk fields (row-wise apply over dict records)
    df['dynamic_risk_score'] = df.apply(lambda row: compute_dynamic_risk(row.to_dict()), axis=1)
    df['risk_level'] = df.apply(lambda row: assign_dynamic_risk_level(row.to_dict()), axis=1)

if df is None:
    st.error("No data found. Please run the drug crime scraper first.")
    # Enhanced debug information to help locate the expected CSV exports
    st.subheader("Debug Information")
    current_dir = os.getcwd()
    st.write(f"Current directory: {current_dir}")
    for dir_name in DASHBOARD_CONFIG['data_dirs']:
        if os.path.exists(dir_name):
            files = [f for f in os.listdir(dir_name) if f.endswith('.csv')]
            st.write(f"CSV files in {dir_name}: {files}")
        else:
            st.write(f"Directory {dir_name} does not exist")
    st.info("Expected files: karnataka_drug_tweets_*.csv or similar drug-related CSV files")
    st.stop()

# Validate dataframe before any view renders
is_valid, validation_message = validate_dataframe(df)
if not is_valid:
    st.error(f"Data validation failed: {validation_message}")
    st.write("Available columns:", list(df.columns))
    st.stop()

# Load priority data (optional companion exports; either may be None)
high_priority_df, contact_df = load_priority_data()

# Filter for current month data for some analyses
now = datetime.now()
if "datetime" in df.columns:
    df_month = df[(df['datetime'].dt.month == now.month) & (df['datetime'].dt.year == now.year)]
else:
    df_month = df
# ------------------------
# Sidebar Navigation & Filters
# ------------------------
st.sidebar.title("Dashboard Navigation")

# Auto-refresh option. The optional streamlit_autorefresh dependency is only
# imported when the feature is enabled, so the dashboard still starts on
# machines without that package installed (the original imported it
# unconditionally at this point in the script).
auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)")
if auto_refresh:
    from streamlit_autorefresh import st_autorefresh
    st_autorefresh(interval=30*1000, key="refresh")

# Navigation tabs - ENHANCED with new options
analysis_type = st.sidebar.radio(
    "Select Analysis View",
    ["Summary", "Risk Analysis", "Actionable Insights", "📈 Predictive Analytics", "🌐 Network Analysis",
     "Geographic Analysis", "User Analysis",
     "Content Analysis", "📊 Volume Trends", "🧠 User Behavior",
     "📍 Heatmaps", "⚠️ Risk Patterns"]
)

# Common filters
st.sidebar.header("Data Filters")

# Date range filter
if "datetime" in df.columns and not df["datetime"].isna().all():
    try:
        min_date = df["datetime"].min().date()
        max_date = df["datetime"].max().date()
        date_range = st.sidebar.date_input(
            "Select Date Range",
            value=[min_date, max_date],
            min_value=min_date,
            max_value=max_date
        )
        # Filter dataframe by the inclusive date range.
        # BUG FIX: the previous version additionally required
        # dt.year == date_range[0].year, which silently dropped every row
        # from later years whenever the selected range crossed a year
        # boundary; the two date bounds already constrain the range fully.
        if len(date_range) == 2:
            df = df[
                (df["datetime"].dt.date >= date_range[0]) &
                (df["datetime"].dt.date <= date_range[1])
            ]
    except Exception as e:
        st.sidebar.warning(f"Date filtering error: {e}")

# Risk level filter
if "risk_level" in df.columns:
    available_risk_levels = df["risk_level"].unique().tolist()
    risk_levels = st.sidebar.multiselect(
        "Risk Levels",
        options=available_risk_levels,
        default=available_risk_levels
    )
    df = df[df["risk_level"].isin(risk_levels)]

# Search filter (case-insensitive substring match on tweet content)
search_term = st.sidebar.text_input("Search Content", "")
if search_term:
    df = df[df["content"].str.lower().str.contains(search_term.lower(), na=False)]

# Display current filter status
st.sidebar.info(f"Showing {len(df)} tweets")
# ------------------------
# EXECUTIVE SUMMARY
# ------------------------
if analysis_type == "Summary":
    st.header("Summary")
    # Key metrics in columns (safe_* helpers tolerate missing columns)
    col1, col2, col3, col4, col5, col6 = st.columns(6)
    with col1:
        st.metric("Total Tweets", len(df))
    with col2:
        drug_related = safe_column_sum(df, "is_drug_related")
        st.metric("Drug Related", drug_related)
    with col3:
        crime_related = safe_column_sum(df, "is_crime_related")
        st.metric("Crime Related", crime_related)
    with col4:
        contact_info = safe_column_sum(df, "has_contact_info")
        st.metric("Contact Info", contact_info)
    with col5:
        st.metric("Unique Users", df["username"].nunique())
    with col6:
        # Mean of the per-tweet dynamic risk score computed at load time
        avg_risk = df["dynamic_risk_score"].mean() if "dynamic_risk_score" in df.columns else 0
        st.metric("Avg. Dynamic Risk Score", f"{avg_risk:.2f}")

    # Risk level analysis: banner alerts for the CRITICAL/HIGH buckets
    if "risk_level" in df.columns:
        critical_count = len(df[df["risk_level"] == "CRITICAL"])
        high_count = len(df[df["risk_level"] == "HIGH"])
        if critical_count > 0:
            st.markdown(f'<div class="critical-alert"><strong>CRITICAL ALERT:</strong> {critical_count} tweets require immediate attention</div>', unsafe_allow_html=True)
        if high_count > 0:
            st.markdown(f'<div class="high-priority"><strong>HIGH PRIORITY:</strong> {high_count} tweets for investigation</div>', unsafe_allow_html=True)

        # Risk distribution pie chart (fixed severity order so colors stay stable)
        col1, col2 = st.columns(2)
        with col1:
            risk_order = ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
            risk_dist = df["risk_level"].value_counts().reindex(risk_order).fillna(0)
            fig_risk = px.pie(values=risk_dist.values, names=risk_dist.index,
                              title="Risk Level Distribution",
                              color_discrete_map={
                                  "CRITICAL": "#dc3545",
                                  "HIGH": "#fd7e14",
                                  "MEDIUM": "#ffc107",
                                  "LOW": "#28a745"
                              })
            st.plotly_chart(fig_risk, use_container_width=True)
        with col2:
            # Sentiment analysis if available.
            # NOTE(review): assumes a compound score in [-1, 1] with
            # -0.1/0.1 neutrality thresholds — confirm against the scorer.
            if "sentiment_compound" in df.columns:
                sentiment_counts = pd.cut(df["sentiment_compound"],
                                          bins=[-1, -0.1, 0.1, 1],
                                          labels=["Negative", "Neutral", "Positive"]).value_counts()
                fig_sentiment = px.bar(x=sentiment_counts.index, y=sentiment_counts.values,
                                       title="Sentiment Distribution",
                                       color=sentiment_counts.values,
                                       color_continuous_scale="RdYlGn")
                st.plotly_chart(fig_sentiment, use_container_width=True)
            else:
                st.info("Sentiment data not available")

    # Analysis report summary (raw JSON sections from the scraper's report)
    if report_data:
        st.subheader("Analysis Report Summary")
        col1, col2 = st.columns(2)
        with col1:
            if "summary_statistics" in report_data:
                st.json(report_data["summary_statistics"])
        with col2:
            if "investigation_priorities" in report_data:
                st.json(report_data["investigation_priorities"])
| # ------------------------ | |
| # NEW: VOLUME TRENDS | |
| # ------------------------ | |
| elif analysis_type == "📊 Volume Trends": | |
| st.header("📊 Tweet Volume: Daily,Weekly and Hourly Trends") | |
| if "datetime" in df.columns and not df["datetime"].isna().all(): | |
| # Daily trend | |
| if "date" in df.columns: | |
| daily_counts = df.groupby("date").size().reset_index(name="count") | |
| fig_daily = px.line(daily_counts, x="date", y="count", | |
| title="Daily Tweet Volume") | |
| st.plotly_chart(fig_daily, use_container_width=True) | |
| # Hourly and weekday patterns | |
| col1, = st.columns(1) | |
| with col1: | |
| if "hour" in df.columns: | |
| hourly_counts = df.groupby("hour").size() | |
| fig_hourly = px.bar(x=hourly_counts.index, y=hourly_counts.values, | |
| title="Tweets by Hour of Day") | |
| st.plotly_chart(fig_hourly, use_container_width=True) | |
| # Weekly trends | |
| if "datetime" in df.columns: | |
| weekly_fig1, weekly_fig2 = create_weekly_trend_analysis(df) | |
| if weekly_fig1 and weekly_fig2: | |
| st.subheader("📅 Weekly Trends") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.plotly_chart(weekly_fig1, use_container_width=True) | |
| with col2: | |
| st.plotly_chart(weekly_fig2, use_container_width=True) | |
| else: | |
| st.info("Temporal data not available") | |
| # CSV Downloads | |
| st.subheader("📄 Download Data") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| if st.button("📥 Download Top Users CSV"): | |
| top_users = df.groupby("username").agg( | |
| tweet_count=("username", "count"), | |
| max_risk=("dynamic_risk_score", "max") | |
| ).sort_values("tweet_count", ascending=False).head(20).reset_index() | |
| csv = top_users.to_csv(index=False) | |
| st.download_button( | |
| "Download CSV", csv, "top_users.csv", "text/csv" | |
| ) | |
| with col2: | |
| if st.button("📥 Download Top Locations CSV"): | |
| if "user_location" in df.columns: | |
| top_locations = df.groupby("user_location").agg( | |
| tweet_count=("user_location", "count"), | |
| max_risk=("dynamic_risk_score", "max") | |
| ).sort_values("tweet_count", ascending=False).head(20).reset_index() | |
| csv = top_locations.to_csv(index=False) | |
| st.download_button( | |
| "Download CSV", csv, "top_locations.csv", "text/csv" | |
| ) | |
# ------------------------
# NEW: USER BEHAVIOR
# ------------------------
elif analysis_type == "🧠 User Behavior":
    st.header("🧠 User Behavior Analysis")
    # Top repeat users by tweet volume
    st.subheader("🧠 Top Repeat Users")
    user_activity = df["username"].value_counts().head(15)
    if not user_activity.empty:
        fig_users = px.bar(x=user_activity.values, y=user_activity.index,
                           orientation='h', title="Top 15 Most Active Users")
        # Reverse the y-axis so the most active user appears on top
        fig_users.update_layout(yaxis=dict(autorange="reversed"))
        st.plotly_chart(fig_users, use_container_width=True)
        # Show details of top users
        with st.expander("View Top User Details"):
            for username, count in user_activity.head(10).items():
                user_tweets = df[df["username"] == username]
                # Safe mode extraction: Series.mode() can return an empty Series
                if "risk_level" in user_tweets.columns and not user_tweets["risk_level"].empty:
                    risk_mode = user_tweets["risk_level"].mode()
                    risk_level = risk_mode.iloc[0] if len(risk_mode) > 0 else "Unknown"
                else:
                    risk_level = "Unknown"
                if "user_location" in user_tweets.columns and not user_tweets["user_location"].empty:
                    location_mode = user_tweets["user_location"].mode()
                    location = location_mode.iloc[0] if len(location_mode) > 0 else "Unknown"
                else:
                    location = "Unknown"
                st.write(f"**@{username}**: {count} tweets | Risk: {risk_level} | Location: {location}")

    # User engagement patterns (only when engagement metrics were scraped)
    if "like_count" in df.columns or "retweet_count" in df.columns:
        st.subheader("📊 User Engagement Patterns")
        col1, col2 = st.columns(2)
        with col1:
            if "like_count" in df.columns:
                avg_likes = df.groupby("username")["like_count"].mean().sort_values(ascending=False).head(15)
                fig_likes = px.bar(x=avg_likes.values, y=avg_likes.index,
                                   orientation='h', title="Users by Average Likes")
                fig_likes.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_likes, use_container_width=True)
        with col2:
            if "retweet_count" in df.columns:
                avg_retweets = df.groupby("username")["retweet_count"].mean().sort_values(ascending=False).head(15)
                fig_retweets = px.bar(x=avg_retweets.values, y=avg_retweets.index,
                                      orientation='h', title="Users by Average Retweets")
                fig_retweets.update_layout(yaxis=dict(autorange="reversed"))
                st.plotly_chart(fig_retweets, use_container_width=True)

    # User location vs risk overlap (empty-string locations excluded)
    if "user_location" in df.columns and "risk_level" in df.columns:
        st.subheader("📍 User Location vs Risk Analysis")
        location_risk = df.groupby(["user_location", "risk_level"]).size().reset_index(name="count")
        location_risk = location_risk[location_risk["user_location"] != ""]
        if not location_risk.empty:
            fig_loc_risk = px.bar(location_risk, x="user_location", y="count",
                                  color="risk_level", title="Risk Distribution by Location",
                                  color_discrete_map={
                                      "CRITICAL": "#dc3545",
                                      "HIGH": "#fd7e14",
                                      "MEDIUM": "#ffc107",
                                      "LOW": "#28a745"
                                  })
            fig_loc_risk.update_xaxes(tickangle=45)
            st.plotly_chart(fig_loc_risk, use_container_width=True)
# ------------------------
# NEW: HEATMAPS
# ------------------------
elif analysis_type == "📍 Heatmaps":
    st.header("📍 Time-Based Heatmaps")
    # -------------------
    # Day-Hour heatmap
    # -------------------
    if "day_of_week" in df.columns and "hour" in df.columns:
        # Force calendar order on the day axis rather than alphabetical
        day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
        df['day_of_week'] = pd.Categorical(df['day_of_week'], categories=day_order, ordered=True)
        st.subheader("🔥 Day vs Hour Activity Heatmap")
        heatmap_fig = create_heatmap_chart(df, "hour", "day_of_week", "Tweet Activity: Day vs Hour")
        if heatmap_fig:
            st.plotly_chart(heatmap_fig, use_container_width=True)

    # Risk level heatmap
    if "risk_level" in df.columns and "hour" in df.columns:
        st.subheader("⚠️ Risk Level vs Hour Heatmap")
        risk_heatmap = create_heatmap_chart(df, "hour", "risk_level", "Risk Level Distribution by Hour")
        if risk_heatmap:
            st.plotly_chart(risk_heatmap, use_container_width=True)

    # -------------------
    # Top Locations Heatmap
    # -------------------
    if "user_location" in df.columns and "hour" in df.columns:
        st.subheader("📍 Location vs Hour Heatmap (Top Locations)")
        # Sidebar slider controls how many locations the heatmap keeps
        TOP_N_LOCATIONS = st.sidebar.slider("Top N Locations for Heatmaps", 5, 30, 10)
        # Keep only the N most frequent locations by tweet volume
        top_locations = df["user_location"].value_counts().head(TOP_N_LOCATIONS).index
        df_top_loc = df[df["user_location"].isin(top_locations)]
        if not df_top_loc.empty:
            loc_heatmap = create_heatmap_chart(df_top_loc, "hour", "user_location",
                                               f"Top {TOP_N_LOCATIONS} Locations Activity by Hour")
            if loc_heatmap:
                st.plotly_chart(loc_heatmap, use_container_width=True)

    # Tweet location map (only when geographic coordinates are available)
    if "latitude" in df.columns and "longitude" in df.columns:
        st.subheader("🗺️ Geographic Tweet Distribution")
        valid_coords = df.dropna(subset=["latitude", "longitude"])
        if not valid_coords.empty:
            fig_map = px.scatter_mapbox(
                valid_coords, lat="latitude", lon="longitude",
                color="risk_level" if "risk_level" in df.columns else None,
                size_max=15, zoom=7,
                mapbox_style="open-street-map",
                title="Geographic Distribution of Tweets"
            )
            st.plotly_chart(fig_map, use_container_width=True)
        else:
            st.info("No geographic coordinates available for mapping")
# ------------------------
# NEW: RISK PATTERNS
# ------------------------
elif analysis_type == "⚠️ Risk Patterns":
    st.header("⚠️ Risk Patterns and High-Risk Analysis")
    # High-risk users analysis (scores via calculate_user_risk:
    # CRITICAL=2 points, HIGH=1 point per tweet)
    if "risk_level" in df.columns:
        st.subheader("🚨 High-Risk Users")
        user_risk_df = calculate_user_risk(df)
        high_risk_users = user_risk_df[user_risk_df["risk_score"] > 0].sort_values("risk_score", ascending=False).head(20)
        if not high_risk_users.empty:
            fig_risk_users = px.bar(high_risk_users, x="risk_score", y="username",
                                    orientation='h', color="tweet_count", color_continuous_scale="Reds")
            fig_risk_users.update_layout(yaxis=dict(autorange="reversed"))
            st.plotly_chart(fig_risk_users, use_container_width=True)
            # Optional: show per-user breakdown
            with st.expander("High-Risk User Details"):
                for _, row in high_risk_users.iterrows():
                    user_data = df[df["username"] == row["username"]]
                    critical_count = (user_data["risk_level"] == "CRITICAL").sum()
                    high_count = (user_data["risk_level"] == "HIGH").sum()
                    st.write(f"**@{row['username']}**: Risk Score: {row['risk_score']} | Critical: {critical_count} | High: {high_count} | Total Tweets: {row['tweet_count']}")

    # Risk overlap analysis: drug-only vs crime-only vs both
    if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
        st.subheader("🔄 Drug-Crime Overlap Analysis")
        # Assign categories; the combined condition runs last so rows flagged
        # both drug- and crime-related end up as "Drug + Crime"
        df_overlap = df.copy()
        df_overlap["category"] = "Other"
        df_overlap.loc[df_overlap["is_drug_related"] == 1, "category"] = "Drug Only"
        df_overlap.loc[df_overlap["is_crime_related"] == 1, "category"] = "Crime Only"
        df_overlap.loc[(df_overlap["is_drug_related"] == 1) & (df_overlap["is_crime_related"] == 1), "category"] = "Drug + Crime"
        overlap_counts = df_overlap["category"].value_counts()
        fig_overlap = px.pie(values=overlap_counts.values, names=overlap_counts.index,
                             title="Drug-Crime Content Overlap",
                             color_discrete_map={
                                 "Drug + Crime": "#dc3545",
                                 "Drug Only": "#fd7e14",
                                 "Crime Only": "#ffc107",
                                 "Other": "#28a745"
                             })
        st.plotly_chart(fig_overlap, use_container_width=True)
        # Show the users with the most combined drug+crime tweets
        high_overlap_users = df_overlap[df_overlap["category"] == "Drug + Crime"]["username"].value_counts().head(10)
        if not high_overlap_users.empty:
            st.write("**Users with most Drug+Crime tweets:**")
            for username, count in high_overlap_users.items():
                st.write(f"- @{username}: {count} tweets")

    # Risk progression over time (daily counts per risk level)
    if "datetime" in df.columns and "risk_level" in df.columns:
        st.subheader("📈 Risk Level Trends Over Time")
        # Daily aggregation on a string key so plotly treats dates discretely
        df["date_str"] = df["datetime"].dt.strftime("%Y-%m-%d")
        risk_time = df.groupby(["date_str", "risk_level"]).size().reset_index(name="count")
        fig_risk_time = px.line(risk_time, x="date_str", y="count", color="risk_level",
                                title="Risk Levels Trend Over Time",
                                color_discrete_map={
                                    "CRITICAL": "#dc3545",
                                    "HIGH": "#fd7e14",
                                    "MEDIUM": "#ffc107",
                                    "LOW": "#28a745"
                                })
        fig_risk_time.update_xaxes(tickangle=45)
        st.plotly_chart(fig_risk_time, use_container_width=True)
# ------------------------
# RISK ANALYSIS (Enhanced)
# ------------------------
elif analysis_type == "Risk Analysis":
    st.header("Risk Analysis")
    # High-risk tweets (from the HIGH_PRIORITY companion export)
    if high_priority_df is not None and not high_priority_df.empty:
        st.subheader("High Priority Tweets")
        # One tab per severity bucket
        risk_tab1, risk_tab2 = st.tabs(["CRITICAL", "HIGH"])
        with risk_tab1:
            critical_tweets = high_priority_df[high_priority_df["risk_level"] == "CRITICAL"]
            if not critical_tweets.empty:
                for idx, tweet in critical_tweets.head(10).iterrows():
                    with st.expander(f"CRITICAL: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No critical risk tweets in current filter")
        with risk_tab2:
            high_tweets = high_priority_df[high_priority_df["risk_level"] == "HIGH"]
            if not high_tweets.empty:
                for idx, tweet in high_tweets.head(10).iterrows():
                    with st.expander(f"HIGH: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"):
                        st.write(f"**Content:** {tweet['content']}")
                        st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}")
                        st.write(f"**Drug Score:** {tweet.get('drug_score', 'N/A')} | **Crime Score:** {tweet.get('crime_score', 'N/A')}")
                        if 'tweet_url' in tweet:
                            st.write(f"**URL:** {tweet['tweet_url']}")
            else:
                st.info("No high risk tweets in current filter")
    else:
        st.info("No high priority data available")

    # Risk score distribution histograms, side by side
    if "drug_score" in df.columns and "crime_score" in df.columns:
        fig_scores = make_subplots(rows=1, cols=2, subplot_titles=("Drug Score Distribution", "Crime Score Distribution"))
        fig_scores.add_trace(go.Histogram(x=df["drug_score"], name="Drug Score", nbinsx=20), row=1, col=1)
        fig_scores.add_trace(go.Histogram(x=df["crime_score"], name="Crime Score", nbinsx=20), row=1, col=2)
        fig_scores.update_layout(title="Risk Score Distributions")
        st.plotly_chart(fig_scores, use_container_width=True)
    else:
        st.info("Risk score data not available")
| # ------------------------ | |
| # Actionable Insights | |
| # ------------------------ | |
| elif analysis_type == "Actionable Insights": | |
| st.header("Actionable Insights") | |
| # Contact information tweets | |
| if contact_df is not None and not contact_df.empty: | |
| st.subheader("Tweets with Contact Information") | |
| st.markdown('<div class="warning-box">These tweets contain phone numbers or contact details - HIGH PRIORITY for investigation</div>', unsafe_allow_html=True) | |
| for idx, tweet in contact_df.head(20).iterrows(): | |
| with st.expander(f"Contact Info: @{tweet['username']} - {tweet['datetime'].strftime('%Y-%m-%d %H:%M') if pd.notna(tweet['datetime']) else 'No date'}"): | |
| st.write(f"**Content:** {tweet['content']}") | |
| st.write(f"**Phone Numbers:** {tweet.get('phone_numbers', 'Not extracted')}") | |
| st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") | |
| st.write(f"**Risk Level:** {tweet.get('risk_level', 'Unknown')}") | |
| if 'tweet_url' in tweet: | |
| st.write(f"**URL:** {tweet['tweet_url']}") | |
| else: | |
| st.info("No tweets with contact information found") | |
| # Bulk operation indicators | |
| st.subheader("Bulk Operation Indicators") | |
| # Sidebar input | |
| BULK_KEYWORDS = st.sidebar.text_area("Bulk Operation Keywords (comma-separated)", | |
| "kg,gram,bulk,wholesale,kilos,ounce,pound").split(",") | |
| # In code | |
| bulk_pattern = "|".join([kw.strip() for kw in BULK_KEYWORDS]) | |
| bulk_regex = re.compile("|".join([kw.strip() for kw in BULK_KEYWORDS]), re.IGNORECASE) | |
| bulk_tweets = df[df["content"].str.contains(bulk_regex, na=False)] | |
| if not bulk_tweets.empty: | |
| st.write(f"Found {len(bulk_tweets)} tweets mentioning bulk quantities") | |
| for idx, tweet in bulk_tweets.head(10).iterrows(): | |
| with st.expander(f"Bulk: @{tweet['username']} - Risk: {tweet.get('risk_level', 'Unknown')}"): | |
| st.write(f"**Content:** {tweet['content']}") | |
| st.write(f"**Location:** {tweet.get('user_location', 'Unknown')}") | |
| if 'tweet_url' in tweet: | |
| st.write(f"**URL:** {tweet['tweet_url']}") | |
| else: | |
| st.info("No bulk operation indicators found") | |
| # High activity users | |
| st.subheader("High Activity Users") | |
| user_activity = df["username"].value_counts().head(15) | |
| if not user_activity.empty: | |
| fig_users = px.bar(x=user_activity.values, y=user_activity.index, | |
| orientation='h', title="Top 15 Most Active Users") | |
| fig_users.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_users, use_container_width=True) | |
| # ------------------------ | |
| # NEW: PREDICTIVE ANALYTICS | |
| # ------------------------ | |
| elif analysis_type == "📈 Predictive Analytics": | |
| st.header("📈 Predictive Analytics & Trends") | |
| st.subheader("📊 Activity Forecast") | |
| if "datetime" in df.columns and len(df) >= 7: | |
| # Daily activity trend | |
| daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count") | |
| daily_activity.columns = ["date", "count"] | |
| daily_activity["date"] = pd.to_datetime(daily_activity["date"]) | |
| # Calculate moving average | |
| daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean() | |
| daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean() | |
| # Create forecast visualization | |
| fig_forecast = go.Figure() | |
| fig_forecast.add_trace(go.Scatter( | |
| x=daily_activity["date"], | |
| y=daily_activity["count"], | |
| name="Actual Activity", | |
| mode="lines+markers", | |
| line=dict(color="#1f77b4") | |
| )) | |
| fig_forecast.add_trace(go.Scatter( | |
| x=daily_activity["date"], | |
| y=daily_activity["7_day_ma"], | |
| name="7-Day Moving Average", | |
| mode="lines", | |
| line=dict(color="#ff7f0e", dash="dash") | |
| )) | |
| fig_forecast.update_layout( | |
| title="Tweet Activity Trend & Forecast", | |
| xaxis_title="Date", | |
| yaxis_title="Number of Tweets", | |
| hovermode="x unified" | |
| ) | |
| st.plotly_chart(fig_forecast, use_container_width=True) | |
| # Trend analysis | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| recent_avg = daily_activity["count"].tail(7).mean() | |
| st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day") | |
| with col2: | |
| if len(daily_activity) >= 14: | |
| prev_avg = daily_activity["count"].tail(14).head(7).mean() | |
| change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0 | |
| st.metric("Week-over-Week Change", f"{change:+.1f}%") | |
| with col3: | |
| peak_day = daily_activity.loc[daily_activity["count"].idxmax()] | |
| st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d")) | |
| # User activity prediction | |
| st.subheader("👤 High-Risk User Patterns") | |
| if "username" in df.columns and "risk_level" in df.columns: | |
| user_risk_scores = df.groupby("username").agg({ | |
| "tweet_id": "count", | |
| "risk_level": lambda x: (x == "CRITICAL").sum() * 2 + (x == "HIGH").sum() | |
| }).reset_index() | |
| user_risk_scores.columns = ["username", "tweet_count", "risk_score"] | |
| # Identify escalating users | |
| escalating_users = user_risk_scores[ | |
| (user_risk_scores["risk_score"] > 0) & | |
| (user_risk_scores["tweet_count"] >= 3) | |
| ].sort_values("risk_score", ascending=False).head(15) | |
| if not escalating_users.empty: | |
| fig_escalating = px.scatter( | |
| escalating_users, | |
| x="tweet_count", | |
| y="risk_score", | |
| size="risk_score", | |
| hover_data=["username"], | |
| title="High-Risk User Activity Matrix", | |
| labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"} | |
| ) | |
| st.plotly_chart(fig_escalating, use_container_width=True) | |
| st.write("**Users to Monitor:**") | |
| for _, user in escalating_users.head(10).iterrows(): | |
| st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}") | |
| # ------------------------ | |
| # NEW: NETWORK ANALYSIS | |
| # ------------------------ | |
| elif analysis_type == "🌐 Network Analysis": | |
| st.header("🌐 Network Analysis") | |
| st.subheader("👥 User Connection Analysis") | |
| # Mentions network | |
| if "mentions" in df.columns: | |
| st.write("### User Mention Network") | |
| mention_pairs = [] | |
| for _, row in df.iterrows(): | |
| if pd.notna(row.get("mentions")) and row["mentions"]: | |
| mentions = str(row["mentions"]).split() | |
| for mention in mentions: | |
| mention_clean = mention.strip("@") | |
| if mention_clean: | |
| mention_pairs.append({ | |
| "from": row["username"], | |
| "to": mention_clean, | |
| "risk_level": row.get("risk_level", "UNKNOWN") | |
| }) | |
| if mention_pairs: | |
| mention_df = pd.DataFrame(mention_pairs) | |
| # Top mentioned users | |
| top_mentioned = mention_df["to"].value_counts().head(15) | |
| fig_mentioned = px.bar( | |
| x=top_mentioned.values, | |
| y=top_mentioned.index, | |
| orientation="h", | |
| title="Most Mentioned Users", | |
| labels={"x": "Times Mentioned", "y": "Username"} | |
| ) | |
| fig_mentioned.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_mentioned, use_container_width=True) | |
| # Connection strength | |
| connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions") | |
| strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False) | |
| if not strong_connections.empty: | |
| st.write("### 🔗 Strong Connections (2+ mentions)") | |
| for _, conn in strong_connections.head(20).iterrows(): | |
| st.write(f"- @{conn['from']} → @{conn['to']}: {conn['mentions']} times") | |
| else: | |
| st.info("No mention data available") | |
| # Location clustering | |
| st.subheader("📍 Location-Based Clustering") | |
| if "user_location" in df.columns: | |
| location_users = df.groupby("user_location").agg({ | |
| "username": lambda x: list(x.unique()), | |
| "tweet_id": "count", | |
| "risk_level": lambda x: (x == "CRITICAL").sum() if "risk_level" in df.columns else 0 | |
| }).reset_index() | |
| location_users.columns = ["location", "users", "tweet_count", "critical_count"] | |
| location_users = location_users[location_users["location"] != ""] | |
| location_users = location_users[location_users["tweet_count"] >= 3] | |
| location_users["user_count"] = location_users["users"].apply(len) | |
| if not location_users.empty: | |
| fig_clusters = px.scatter( | |
| location_users, | |
| x="tweet_count", | |
| y="user_count", | |
| size="critical_count", | |
| hover_data=["location"], | |
| title="Location Clusters (Activity vs Users)", | |
| labels={ | |
| "tweet_count": "Total Tweets", | |
| "user_count": "Unique Users", | |
| "critical_count": "Critical Tweets" | |
| } | |
| ) | |
| st.plotly_chart(fig_clusters, use_container_width=True) | |
| # High-density locations | |
| high_density = location_users.sort_values("user_count", ascending=False).head(10) | |
| st.write("### 🏙️ High-Density Locations") | |
| for _, loc in high_density.iterrows(): | |
| with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"): | |
| st.write(f"**Critical tweets:** {loc['critical_count']}") | |
| st.write(f"**Users:** {', '.join(['@' + u for u in loc['users'][:10]])}") | |
| if len(loc['users']) > 10: | |
| st.write(f"... and {len(loc['users']) - 10} more") | |
| # Co-occurrence analysis | |
| st.subheader("🔗 Keyword Co-occurrence") | |
| if "content" in df.columns: | |
| # Define drug/crime keywords | |
| drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"] | |
| crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"] | |
| cooccurrence = [] | |
| for _, row in df.iterrows(): | |
| content_lower = row["content"].lower() | |
| found_drug = [kw for kw in drug_keywords if kw in content_lower] | |
| found_crime = [kw for kw in crime_keywords if kw in content_lower] | |
| for drug in found_drug: | |
| for crime in found_crime: | |
| cooccurrence.append({"drug_keyword": drug, "crime_keyword": crime}) | |
| if cooccurrence: | |
| cooc_df = pd.DataFrame(cooccurrence) | |
| cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count") | |
| cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20) | |
| if not cooc_counts.empty: | |
| fig_cooc = px.bar( | |
| cooc_counts, | |
| x="count", | |
| y="drug_keyword", | |
| color="crime_keyword", | |
| title="Drug-Crime Keyword Co-occurrence", | |
| orientation="h" | |
| ) | |
| st.plotly_chart(fig_cooc, use_container_width=True) | |
| else: | |
| st.info("No significant keyword co-occurrences found") | |
| # Temporal clustering | |
| st.subheader("⏰ Temporal Activity Clusters") | |
| if "datetime" in df.columns and "username" in df.columns: | |
| df_copy = df.copy() | |
| df_copy["hour"] = df_copy["datetime"].dt.hour | |
| df_copy["day_of_week"] = df_copy["datetime"].dt.day_name() | |
| # Find users active at unusual hours (late night/early morning) | |
| unusual_hours = [0, 1, 2, 3, 4, 5] | |
| night_activity = df_copy[df_copy["hour"].isin(unusual_hours)] | |
| if len(night_activity) > 0: | |
| night_users = night_activity.groupby("username").size().reset_index(name="night_tweets") | |
| night_users = night_users[night_users["night_tweets"] >= 3].sort_values("night_tweets", ascending=False) | |
| if not night_users.empty: | |
| st.write(f"### 🌙 Users Active During Late Night (12 AM - 6 AM)") | |
| fig_night = px.bar( | |
| night_users.head(15), | |
| x="night_tweets", | |
| y="username", | |
| orientation="h", | |
| title="Top Users with Late Night Activity" | |
| ) | |
| fig_night.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_night, use_container_width=True) | |
| st.info("⚠️ Late night activity may indicate suspicious behavior patterns") | |
| # ------------------------ | |
| # GEOGRAPHIC ANALYSIS (Enhanced) | |
| # ------------------------ | |
| elif analysis_type == "Geographic Analysis": | |
| st.header("Geographic Analysis") | |
| # Location distribution | |
| locations = df["user_location"].value_counts().head(20) | |
| locations = locations[locations.index != ""] # Remove empty locations | |
| if not locations.empty: | |
| fig_locations = px.bar(x=locations.values, y=locations.index, | |
| orientation='h', title="Top 20 User Locations") | |
| fig_locations.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_locations, use_container_width=True) | |
| else: | |
| st.info("No location data available") | |
| # Karnataka relevance score distribution | |
| if "kar_score" in df.columns: | |
| fig_kar = px.histogram(df, x="kar_score", title="Karnataka Relevance Score Distribution") | |
| st.plotly_chart(fig_kar, use_container_width=True) | |
| # Location-based risk analysis | |
| if "risk_level" in df.columns and "user_location" in df.columns: | |
| location_risk = df.groupby("user_location").agg({ | |
| "risk_level": lambda x: (x == "HIGH").sum() + (x == "CRITICAL").sum() * 2, | |
| "username": "count" | |
| }).reset_index() | |
| location_risk = location_risk[location_risk["username"] >= 3] # Only locations with 3+ tweets | |
| location_risk = location_risk.sort_values("risk_level", ascending=False).head(15) | |
| if not location_risk.empty: | |
| fig_loc_risk = px.bar(location_risk, x="risk_level", y="user_location", | |
| orientation='h', title="High-Risk Locations (3+ tweets)") | |
| fig_loc_risk.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_loc_risk, use_container_width=True) | |
| # ------------------------ | |
| # USER ANALYSIS (Enhanced) | |
| # ------------------------ | |
| elif analysis_type == "User Analysis": | |
| st.header("User Analysis") | |
| # User metrics | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Unique Users", df["username"].nunique()) | |
| with col2: | |
| verified_count = safe_column_sum(df, "user_verified") | |
| st.metric("Verified Users", verified_count) | |
| with col3: | |
| avg_followers = safe_column_mean(df, "user_followers") | |
| st.metric("Avg Followers", f"{avg_followers:,.0f}") | |
| # Top users by followers | |
| if "user_followers" in df.columns: | |
| top_followers = df.nlargest(15, "user_followers")[["username", "user_followers"]] | |
| if "user_verified" in df.columns: | |
| top_followers = df.nlargest(15, "user_followers")[["username", "user_followers", "user_verified"]] | |
| fig_followers = px.bar(top_followers, x="user_followers", y="username", | |
| color="user_verified" if "user_verified" in top_followers.columns else None, | |
| orientation='h', title="Users with Most Followers") | |
| fig_followers.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_followers, use_container_width=True) | |
| # User engagement vs risk (fixed aggregation) | |
| if "risk_level" in df.columns: | |
| user_metrics = [] | |
| for username in df["username"].unique(): | |
| user_data = df[df["username"] == username] | |
| risk_score = (user_data["risk_level"] == "HIGH").sum() + (user_data["risk_level"] == "CRITICAL").sum() * 2 | |
| user_metrics.append({ | |
| "username": username, | |
| "risk_score": risk_score, | |
| "avg_likes": safe_column_mean(user_data, "like_count"), | |
| "avg_retweets": safe_column_mean(user_data, "retweet_count"), | |
| "tweet_count": len(user_data) | |
| }) | |
| user_risk_df = pd.DataFrame(user_metrics) | |
| multi_tweet_users = user_risk_df[user_risk_df["tweet_count"] >= 3] | |
| if not multi_tweet_users.empty: | |
| fig_user_risk = px.scatter(multi_tweet_users, x="avg_likes", y="risk_score", | |
| size="tweet_count", hover_data=["username"], | |
| title="User Risk vs Engagement (3+ tweets)") | |
| st.plotly_chart(fig_user_risk, use_container_width=True) | |
| # ------------------------ | |
| # CONTENT ANALYSIS (Enhanced) | |
| # ------------------------ | |
| elif analysis_type == "Content Analysis": | |
| st.header("Content Analysis") | |
| # Hashtag analysis | |
| if "hashtags" in df.columns: | |
| all_hashtags = df["hashtags"].dropna().str.split().explode() | |
| hashtag_counts = all_hashtags.value_counts().head(20) | |
| if not hashtag_counts.empty: | |
| fig_hashtags = px.bar(x=hashtag_counts.values, y=hashtag_counts.index, | |
| orientation='h', title="Top 20 Hashtags") | |
| fig_hashtags.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_hashtags, use_container_width=True) | |
| # Sentiment vs Risk correlation | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| if "sentiment_compound" in df.columns and "risk_level" in df.columns: | |
| fig_sentiment_risk = px.box(df, x="risk_level", y="sentiment_compound", | |
| title="Sentiment by Risk Level") | |
| st.plotly_chart(fig_sentiment_risk, use_container_width=True) | |
| else: | |
| st.info("Sentiment analysis data not available") | |
| with col2: | |
| if "drug_score" in df.columns and "crime_score" in df.columns: | |
| # Drug score vs Crime score correlation | |
| fig_scores_corr = px.scatter(df, x="drug_score", y="crime_score", | |
| color="risk_level" if "risk_level" in df.columns else None, | |
| title="Drug Score vs Crime Score", | |
| color_discrete_map={ | |
| "CRITICAL": "#dc3545", | |
| "HIGH": "#fd7e14", | |
| "MEDIUM": "#ffc107", | |
| "LOW": "#28a745" | |
| }) | |
| st.plotly_chart(fig_scores_corr, use_container_width=True) | |
| else: | |
| st.info("Score correlation data not available") | |
| # Content length analysis | |
| if "content" in df.columns: | |
| df_copy = df.copy() | |
| df_copy["content_length"] = df_copy["content"].str.len() | |
| if "risk_level" in df.columns: | |
| fig_length = px.histogram(df_copy, x="content_length", color="risk_level", | |
| title="Tweet Length Distribution by Risk Level", | |
| color_discrete_map={ | |
| "CRITICAL": "#dc3545", | |
| "HIGH": "#fd7e14", | |
| "MEDIUM": "#ffc107", | |
| "LOW": "#28a745" | |
| }) | |
| else: | |
| fig_length = px.histogram(df_copy, x="content_length", title="Tweet Length Distribution") | |
| st.plotly_chart(fig_length, use_container_width=True) | |
| # Word frequency analysis | |
| if "content" in df.columns: | |
| st.subheader("Content Word Analysis") | |
| filtered_words = get_filtered_words(df["content"]) | |
| if filtered_words: | |
| word_freq = pd.Series(filtered_words).value_counts().head(30) | |
| fig_words = px.bar(x=word_freq.values, y=word_freq.index, | |
| orientation='h', title="Top 30 Most Frequent Words") | |
| fig_words.update_layout(yaxis=dict(autorange="reversed")) | |
| st.plotly_chart(fig_words, use_container_width=True) | |
| else: | |
| st.info("No content words available after filtering") | |
| # ------------------------ | |
| # Footer with Data Information & Export | |
| # ------------------------ | |
| st.markdown("---") | |
| # Data summary footer | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.info(f"Showing {len(df)} tweets") | |
| with col2: | |
| if "risk_level" in df.columns: | |
| high_risk_count = len(df[df["risk_level"].isin(["HIGH", "CRITICAL"])]) | |
| st.info(f"High Risk: {high_risk_count} tweets") | |
| else: | |
| st.info("Risk Level: Not available") | |
# Enhanced export functionality.
# Bug fix: the old code nested st.download_button inside an st.button
# branch; since st.button is only True for the single rerun in which it was
# clicked, the download link vanished before it could be used. Render the
# download buttons directly instead.
st.sidebar.header("Data Export")
# Export current filtered data.
_export_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
st.sidebar.download_button(
    label="Download Current View (CSV)",
    data=df.to_csv(index=False),
    file_name=f"drug_crime_analysis_{analysis_type.lower().replace(' ', '_')}_{_export_ts}.csv",
    mime="text/csv"
)
# Export summary report (only when one was generated).
if report_data:
    st.sidebar.download_button(
        label="Download Analysis Report (JSON)",
        data=json.dumps(report_data, indent=2, default=str),
        file_name=f"analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        mime="application/json"
    )
# Quick stats in sidebar.
if len(df) > 0:
    st.sidebar.subheader("Quick Stats")
    # Per-level counts with percentage of the filtered view.
    if "risk_level" in df.columns:
        risk_counts = df["risk_level"].value_counts()
        for risk, count in risk_counts.items():
            percentage = (count / len(df)) * 100
            st.sidebar.text(f"{risk}: {count} ({percentage:.1f}%)")
    # Top location (skipped when the most common value is blank).
    if "user_location" in df.columns:
        top_location = df["user_location"].value_counts().head(1)
        if not top_location.empty and top_location.index[0] != "":
            st.sidebar.text(f"Top Location: {top_location.index[0]} ({top_location.iloc[0]})")
    # Date range. Bug fix: the old bare `except:` also swallowed
    # SystemExit/KeyboardInterrupt; catch only the plausible failures
    # from non-datetime values.
    if "datetime" in df.columns and not df["datetime"].isna().all():
        try:
            days_span = (df["datetime"].max() - df["datetime"].min()).days
            st.sidebar.text(f"Data Span: {days_span} days")
        except (TypeError, AttributeError):
            pass  # mixed/non-datetime values: silently omit the span
# Debug information (collapsible).
with st.sidebar.expander("Debug Info"):
    st.write("Available columns:")
    st.write(list(df.columns))
    st.write(f"DataFrame shape: {df.shape}")
    st.write(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    st.write("Report data available: Yes" if report_data else "Report data available: No")
    if high_priority_df is None:
        st.write("High priority tweets: Not available")
    else:
        st.write(f"High priority tweets: {len(high_priority_df)}")
    if contact_df is None:
        st.write("Contact info tweets: Not available")
    else:
        st.write(f"Contact info tweets: {len(contact_df)}")
# Footer: static branding plus a last-updated timestamp.
st.markdown("---")
_updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.markdown(
    f"""
    <div style='text-align: center; color: #666; padding: 20px;'>
    <p><strong>Twitter Drug Crime Monitoring Dashboard</strong></p>
    <p><em>Dashboard last updated: {_updated_at}</em></p>
    </div>
    """,
    unsafe_allow_html=True
)