Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +299 -49
src/streamlit_app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import os
|
|
@@ -7,57 +7,32 @@ import plotly.express as px
|
|
| 7 |
import plotly.graph_objects as go
|
| 8 |
from plotly.subplots import make_subplots
|
| 9 |
from datetime import datetime, timedelta
|
| 10 |
-
import numpy as np
|
| 11 |
-
import time
|
| 12 |
-
import re
|
| 13 |
-
import seaborn as sns
|
| 14 |
-
import matplotlib.pyplot as plt
|
| 15 |
-
|
| 16 |
-
from evaluation import evaluate_model
|
| 17 |
|
| 18 |
-
# Run evaluation on the scraped CSV folder (no logs shown)
|
| 19 |
-
evaluate_model("drug_analysis_data_3months")
|
| 20 |
-
|
| 21 |
-
st.set_page_config(
|
| 22 |
-
page_title="Twitter Drug Crime Monitoring",
|
| 23 |
-
layout="wide",
|
| 24 |
-
initial_sidebar_state="expanded"
|
| 25 |
-
)
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
# Import NLTK with error handling
|
| 29 |
import nltk
|
| 30 |
-
from nltk.corpus import stopwords
|
| 31 |
|
|
|
|
| 32 |
try:
|
| 33 |
stopwords.words('english')
|
| 34 |
except LookupError:
|
| 35 |
-
nltk.download('stopwords'
|
| 36 |
-
nltk.download('vader_lexicon', quiet=True)
|
| 37 |
-
nltk.download('punkt', quiet=True)
|
| 38 |
|
| 39 |
# Now you can safely use it
|
| 40 |
english_stopwords = stopwords.words('english')
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
|
| 49 |
-
#
|
| 50 |
-
|
| 51 |
-
missing_secrets = [s for s in required_secrets if not os.getenv(s)]
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
st.warning("The dashboard will run in limited mode without database connectivity.")
|
| 57 |
-
# Don't stop - allow dashboard to work with local CSV files
|
| 58 |
-
|
| 59 |
-
# Limit dataframe size in memory
|
| 60 |
-
MAX_ROWS_IN_MEMORY = 10000
|
| 61 |
|
| 62 |
# Custom CSS for better styling
|
| 63 |
st.markdown("""
|
|
@@ -164,7 +139,7 @@ def validate_dataframe(df):
|
|
| 164 |
|
| 165 |
return True, "DataFrame is valid"
|
| 166 |
|
| 167 |
-
@st.cache_data
|
| 168 |
def load_data():
|
| 169 |
"""Load the most recent data with robust error handling."""
|
| 170 |
start_time = time.time()
|
|
@@ -233,9 +208,7 @@ def load_data():
|
|
| 233 |
st.sidebar.metric("Load Time", f"{load_time:.2f}s")
|
| 234 |
st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
|
| 235 |
st.sidebar.info(f"Source: {latest_file}")
|
| 236 |
-
|
| 237 |
-
if len(df) > MAX_ROWS_IN_MEMORY:
|
| 238 |
-
df = df.tail(MAX_ROWS_IN_MEMORY)
|
| 239 |
return df, report_data
|
| 240 |
|
| 241 |
except Exception as e:
|
|
@@ -442,15 +415,12 @@ auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)")
|
|
| 442 |
from streamlit_autorefresh import st_autorefresh
|
| 443 |
|
| 444 |
if auto_refresh:
|
| 445 |
-
|
| 446 |
-
st_autorefresh(interval=30*1000, key="refresh")
|
| 447 |
-
else:
|
| 448 |
-
st.sidebar.warning("β οΈ Auto-refresh not available. Install streamlit-autorefresh package.")
|
| 449 |
|
| 450 |
# Navigation tabs - ENHANCED with new options
|
| 451 |
analysis_type = st.sidebar.radio(
|
| 452 |
"Select Analysis View",
|
| 453 |
-
["Summary", "Risk Analysis", "Actionable Insights",
|
| 454 |
"Geographic Analysis", "User Analysis",
|
| 455 |
"Content Analysis", "π Volume Trends", "π§ User Behavior",
|
| 456 |
"π Heatmaps", "β οΈ Risk Patterns"]
|
|
@@ -501,6 +471,23 @@ if search_term:
|
|
| 501 |
# Display current filter status
|
| 502 |
st.sidebar.info(f"Showing {len(df)} tweets")
|
| 503 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
# ------------------------
|
| 505 |
# EXECUTIVE SUMMARY
|
| 506 |
# ------------------------
|
|
@@ -972,6 +959,269 @@ elif analysis_type == "Actionable Insights":
|
|
| 972 |
fig_users.update_layout(yaxis=dict(autorange="reversed"))
|
| 973 |
st.plotly_chart(fig_users, use_container_width=True)
|
| 974 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 975 |
# ------------------------
|
| 976 |
# GEOGRAPHIC ANALYSIS (Enhanced)
|
| 977 |
# ------------------------
|
|
|
|
| 1 |
+
#modify_app.py
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import os
|
|
|
|
| 7 |
import plotly.graph_objects as go
|
| 8 |
from plotly.subplots import make_subplots
|
| 9 |
from datetime import datetime, timedelta
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import nltk
|
| 12 |
+
from nltk.corpus import stopwords  # ✅ import first
|
| 13 |
|
| 14 |
+
# Ensure stopwords data is downloaded
|
| 15 |
# Load the English stopword list; if the NLTK corpus is not installed
# locally, download it once and retry the lookup.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')
|
| 22 |
|
| 23 |
+
import numpy as np
|
| 24 |
+
import time
|
| 25 |
+
import seaborn as sns
|
| 26 |
+
import matplotlib.pyplot as plt
|
| 27 |
+
from alerts import compute_dynamic_risk,assign_dynamic_risk_level,trigger_alerts
|
| 28 |
+
from evaluation import evaluate_model
|
| 29 |
|
| 30 |
+
import re

# Streamlit requires set_page_config to be the first Streamlit command
# executed on a page, so configure the page before running anything else.
st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide")

# Run evaluation on the scraped CSV folder.
# NOTE(review): evaluate_model is defined outside this file view; if it emits
# any Streamlit output it must run after set_page_config, hence the ordering.
evaluate_model("drug_analysis_data_3months")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
# Custom CSS for better styling
|
| 38 |
st.markdown("""
|
|
|
|
| 139 |
|
| 140 |
return True, "DataFrame is valid"
|
| 141 |
|
| 142 |
+
@st.cache_data
|
| 143 |
def load_data():
|
| 144 |
"""Load the most recent data with robust error handling."""
|
| 145 |
start_time = time.time()
|
|
|
|
| 208 |
st.sidebar.metric("Load Time", f"{load_time:.2f}s")
|
| 209 |
st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
|
| 210 |
st.sidebar.info(f"Source: {latest_file}")
|
| 211 |
+
|
|
|
|
|
|
|
| 212 |
return df, report_data
|
| 213 |
|
| 214 |
except Exception as e:
|
|
|
|
| 415 |
# streamlit-autorefresh is an optional dependency: keep the dashboard usable
# when it is not installed instead of failing at import time.
try:
    from streamlit_autorefresh import st_autorefresh
except ImportError:
    st_autorefresh = None

if auto_refresh:
    if st_autorefresh is not None:
        # Re-run the script every 30 seconds (interval is in milliseconds).
        st_autorefresh(interval=30 * 1000, key="refresh")
    else:
        st.sidebar.warning(
            "Auto-refresh not available. Install streamlit-autorefresh package."
        )
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
# Navigation tabs - ENHANCED with new options
|
| 421 |
analysis_type = st.sidebar.radio(
|
| 422 |
"Select Analysis View",
|
| 423 |
+
["Summary", "Risk Analysis", "Actionable Insights", "π Predictive Analytics", "π Network Analysis",
|
| 424 |
"Geographic Analysis", "User Analysis",
|
| 425 |
"Content Analysis", "π Volume Trends", "π§ User Behavior",
|
| 426 |
"π Heatmaps", "β οΈ Risk Patterns"]
|
|
|
|
| 471 |
# Display current filter status
|
| 472 |
st.sidebar.info(f"Showing {len(df)} tweets")
|
| 473 |
|
| 474 |
+
# ---------------- EMAIL ALERTS SECTION ---------------
|
| 475 |
+
|
| 476 |
+
# Sidebar controls: how many high-risk tweets to include and a send button.
st.sidebar.header("π© Email Alerts")
num_tweets = st.sidebar.number_input(
    "Number of high-risk tweets to send",
    min_value=1,
    max_value=50,
    value=5,
    step=1,
)
send_button = st.sidebar.button("Send Alerts")

if send_button:
    st.info(f"Sending top {num_tweets} high-risk tweets via email...")
    try:
        trigger_alerts(max_tweets=num_tweets)
    except Exception as exc:
        # UI boundary: report the failure instead of aborting the whole page
        # render, and never claim success when sending failed.
        st.error(f"Failed to send alerts: {exc}")
    else:
        st.success(f"✅ Alerts sent for top {num_tweets} tweets!")
|
| 490 |
+
|
| 491 |
# ------------------------
|
| 492 |
# EXECUTIVE SUMMARY
|
| 493 |
# ------------------------
|
|
|
|
| 959 |
fig_users.update_layout(yaxis=dict(autorange="reversed"))
|
| 960 |
st.plotly_chart(fig_users, use_container_width=True)
|
| 961 |
|
| 962 |
+
# ------------------------
|
| 963 |
+
# NEW: PREDICTIVE ANALYTICS
|
| 964 |
+
# ------------------------
|
| 965 |
+
elif analysis_type == "π Predictive Analytics":
|
| 966 |
+
st.header("π Predictive Analytics & Trends")
|
| 967 |
+
|
| 968 |
+
st.subheader("π Activity Forecast")
|
| 969 |
+
|
| 970 |
+
if "datetime" in df.columns and len(df) >= 7:
|
| 971 |
+
# Daily activity trend
|
| 972 |
+
daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count")
|
| 973 |
+
daily_activity.columns = ["date", "count"]
|
| 974 |
+
daily_activity["date"] = pd.to_datetime(daily_activity["date"])
|
| 975 |
+
|
| 976 |
+
# Calculate moving average
|
| 977 |
+
daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
|
| 978 |
+
daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
|
| 979 |
+
|
| 980 |
+
# Create forecast visualization
|
| 981 |
+
fig_forecast = go.Figure()
|
| 982 |
+
|
| 983 |
+
fig_forecast.add_trace(go.Scatter(
|
| 984 |
+
x=daily_activity["date"],
|
| 985 |
+
y=daily_activity["count"],
|
| 986 |
+
name="Actual Activity",
|
| 987 |
+
mode="lines+markers",
|
| 988 |
+
line=dict(color="#1f77b4")
|
| 989 |
+
))
|
| 990 |
+
|
| 991 |
+
fig_forecast.add_trace(go.Scatter(
|
| 992 |
+
x=daily_activity["date"],
|
| 993 |
+
y=daily_activity["7_day_ma"],
|
| 994 |
+
name="7-Day Moving Average",
|
| 995 |
+
mode="lines",
|
| 996 |
+
line=dict(color="#ff7f0e", dash="dash")
|
| 997 |
+
))
|
| 998 |
+
|
| 999 |
+
fig_forecast.update_layout(
|
| 1000 |
+
title="Tweet Activity Trend & Forecast",
|
| 1001 |
+
xaxis_title="Date",
|
| 1002 |
+
yaxis_title="Number of Tweets",
|
| 1003 |
+
hovermode="x unified"
|
| 1004 |
+
)
|
| 1005 |
+
|
| 1006 |
+
st.plotly_chart(fig_forecast, use_container_width=True)
|
| 1007 |
+
|
| 1008 |
+
# Trend analysis
|
| 1009 |
+
col1, col2, col3 = st.columns(3)
|
| 1010 |
+
|
| 1011 |
+
with col1:
|
| 1012 |
+
recent_avg = daily_activity["count"].tail(7).mean()
|
| 1013 |
+
st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day")
|
| 1014 |
+
|
| 1015 |
+
with col2:
|
| 1016 |
+
if len(daily_activity) >= 14:
|
| 1017 |
+
prev_avg = daily_activity["count"].tail(14).head(7).mean()
|
| 1018 |
+
change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0
|
| 1019 |
+
st.metric("Week-over-Week Change", f"{change:+.1f}%")
|
| 1020 |
+
|
| 1021 |
+
with col3:
|
| 1022 |
+
peak_day = daily_activity.loc[daily_activity["count"].idxmax()]
|
| 1023 |
+
st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d"))
|
| 1024 |
+
|
| 1025 |
+
|
| 1026 |
+
# User activity prediction
|
| 1027 |
+
st.subheader("π€ High-Risk User Patterns")
|
| 1028 |
+
|
| 1029 |
+
if "username" in df.columns and "risk_level" in df.columns:
|
| 1030 |
+
user_risk_scores = df.groupby("username").agg({
|
| 1031 |
+
"tweet_id": "count",
|
| 1032 |
+
"risk_level": lambda x: (x == "CRITICAL").sum() * 2 + (x == "HIGH").sum()
|
| 1033 |
+
}).reset_index()
|
| 1034 |
+
user_risk_scores.columns = ["username", "tweet_count", "risk_score"]
|
| 1035 |
+
|
| 1036 |
+
# Identify escalating users
|
| 1037 |
+
escalating_users = user_risk_scores[
|
| 1038 |
+
(user_risk_scores["risk_score"] > 0) &
|
| 1039 |
+
(user_risk_scores["tweet_count"] >= 3)
|
| 1040 |
+
].sort_values("risk_score", ascending=False).head(15)
|
| 1041 |
+
|
| 1042 |
+
if not escalating_users.empty:
|
| 1043 |
+
fig_escalating = px.scatter(
|
| 1044 |
+
escalating_users,
|
| 1045 |
+
x="tweet_count",
|
| 1046 |
+
y="risk_score",
|
| 1047 |
+
size="risk_score",
|
| 1048 |
+
hover_data=["username"],
|
| 1049 |
+
title="High-Risk User Activity Matrix",
|
| 1050 |
+
labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"}
|
| 1051 |
+
)
|
| 1052 |
+
|
| 1053 |
+
st.plotly_chart(fig_escalating, use_container_width=True)
|
| 1054 |
+
|
| 1055 |
+
st.write("**Users to Monitor:**")
|
| 1056 |
+
for _, user in escalating_users.head(10).iterrows():
|
| 1057 |
+
st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}")
|
| 1058 |
+
|
| 1059 |
+
|
| 1060 |
+
|
| 1061 |
+
# ------------------------
|
| 1062 |
+
# NEW: NETWORK ANALYSIS
|
| 1063 |
+
# ------------------------
|
| 1064 |
+
elif analysis_type == "π Network Analysis":
|
| 1065 |
+
st.header("π Network Analysis")
|
| 1066 |
+
|
| 1067 |
+
st.subheader("π₯ User Connection Analysis")
|
| 1068 |
+
|
| 1069 |
+
# Mentions network
|
| 1070 |
+
if "mentions" in df.columns:
|
| 1071 |
+
st.write("### User Mention Network")
|
| 1072 |
+
|
| 1073 |
+
mention_pairs = []
|
| 1074 |
+
for _, row in df.iterrows():
|
| 1075 |
+
if pd.notna(row.get("mentions")) and row["mentions"]:
|
| 1076 |
+
mentions = str(row["mentions"]).split()
|
| 1077 |
+
for mention in mentions:
|
| 1078 |
+
mention_clean = mention.strip("@")
|
| 1079 |
+
if mention_clean:
|
| 1080 |
+
mention_pairs.append({
|
| 1081 |
+
"from": row["username"],
|
| 1082 |
+
"to": mention_clean,
|
| 1083 |
+
"risk_level": row.get("risk_level", "UNKNOWN")
|
| 1084 |
+
})
|
| 1085 |
+
|
| 1086 |
+
if mention_pairs:
|
| 1087 |
+
mention_df = pd.DataFrame(mention_pairs)
|
| 1088 |
+
|
| 1089 |
+
# Top mentioned users
|
| 1090 |
+
top_mentioned = mention_df["to"].value_counts().head(15)
|
| 1091 |
+
|
| 1092 |
+
fig_mentioned = px.bar(
|
| 1093 |
+
x=top_mentioned.values,
|
| 1094 |
+
y=top_mentioned.index,
|
| 1095 |
+
orientation="h",
|
| 1096 |
+
title="Most Mentioned Users",
|
| 1097 |
+
labels={"x": "Times Mentioned", "y": "Username"}
|
| 1098 |
+
)
|
| 1099 |
+
fig_mentioned.update_layout(yaxis=dict(autorange="reversed"))
|
| 1100 |
+
st.plotly_chart(fig_mentioned, use_container_width=True)
|
| 1101 |
+
|
| 1102 |
+
# Connection strength
|
| 1103 |
+
connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions")
|
| 1104 |
+
strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False)
|
| 1105 |
+
|
| 1106 |
+
if not strong_connections.empty:
|
| 1107 |
+
st.write("### π Strong Connections (2+ mentions)")
|
| 1108 |
+
|
| 1109 |
+
for _, conn in strong_connections.head(20).iterrows():
|
| 1110 |
+
st.write(f"- @{conn['from']} β @{conn['to']}: {conn['mentions']} times")
|
| 1111 |
+
else:
|
| 1112 |
+
st.info("No mention data available")
|
| 1113 |
+
|
| 1114 |
+
# Location clustering
|
| 1115 |
+
st.subheader("π Location-Based Clustering")
|
| 1116 |
+
|
| 1117 |
+
if "user_location" in df.columns:
|
| 1118 |
+
location_users = df.groupby("user_location").agg({
|
| 1119 |
+
"username": lambda x: list(x.unique()),
|
| 1120 |
+
"tweet_id": "count",
|
| 1121 |
+
"risk_level": lambda x: (x == "CRITICAL").sum() if "risk_level" in df.columns else 0
|
| 1122 |
+
}).reset_index()
|
| 1123 |
+
|
| 1124 |
+
location_users.columns = ["location", "users", "tweet_count", "critical_count"]
|
| 1125 |
+
location_users = location_users[location_users["location"] != ""]
|
| 1126 |
+
location_users = location_users[location_users["tweet_count"] >= 3]
|
| 1127 |
+
location_users["user_count"] = location_users["users"].apply(len)
|
| 1128 |
+
|
| 1129 |
+
if not location_users.empty:
|
| 1130 |
+
fig_clusters = px.scatter(
|
| 1131 |
+
location_users,
|
| 1132 |
+
x="tweet_count",
|
| 1133 |
+
y="user_count",
|
| 1134 |
+
size="critical_count",
|
| 1135 |
+
hover_data=["location"],
|
| 1136 |
+
title="Location Clusters (Activity vs Users)",
|
| 1137 |
+
labels={
|
| 1138 |
+
"tweet_count": "Total Tweets",
|
| 1139 |
+
"user_count": "Unique Users",
|
| 1140 |
+
"critical_count": "Critical Tweets"
|
| 1141 |
+
}
|
| 1142 |
+
)
|
| 1143 |
+
|
| 1144 |
+
st.plotly_chart(fig_clusters, use_container_width=True)
|
| 1145 |
+
|
| 1146 |
+
# High-density locations
|
| 1147 |
+
high_density = location_users.sort_values("user_count", ascending=False).head(10)
|
| 1148 |
+
|
| 1149 |
+
st.write("### ποΈ High-Density Locations")
|
| 1150 |
+
for _, loc in high_density.iterrows():
|
| 1151 |
+
with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"):
|
| 1152 |
+
st.write(f"**Critical tweets:** {loc['critical_count']}")
|
| 1153 |
+
st.write(f"**Users:** {', '.join(['@' + u for u in loc['users'][:10]])}")
|
| 1154 |
+
if len(loc['users']) > 10:
|
| 1155 |
+
st.write(f"... and {len(loc['users']) - 10} more")
|
| 1156 |
+
|
| 1157 |
+
# Co-occurrence analysis
|
| 1158 |
+
st.subheader("π Keyword Co-occurrence")
|
| 1159 |
+
|
| 1160 |
+
if "content" in df.columns:
|
| 1161 |
+
# Define drug/crime keywords
|
| 1162 |
+
drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"]
|
| 1163 |
+
crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"]
|
| 1164 |
+
|
| 1165 |
+
cooccurrence = []
|
| 1166 |
+
|
| 1167 |
+
for _, row in df.iterrows():
|
| 1168 |
+
content_lower = row["content"].lower()
|
| 1169 |
+
found_drug = [kw for kw in drug_keywords if kw in content_lower]
|
| 1170 |
+
found_crime = [kw for kw in crime_keywords if kw in content_lower]
|
| 1171 |
+
|
| 1172 |
+
for drug in found_drug:
|
| 1173 |
+
for crime in found_crime:
|
| 1174 |
+
cooccurrence.append({"drug_keyword": drug, "crime_keyword": crime})
|
| 1175 |
+
|
| 1176 |
+
if cooccurrence:
|
| 1177 |
+
cooc_df = pd.DataFrame(cooccurrence)
|
| 1178 |
+
cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count")
|
| 1179 |
+
cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20)
|
| 1180 |
+
|
| 1181 |
+
if not cooc_counts.empty:
|
| 1182 |
+
fig_cooc = px.bar(
|
| 1183 |
+
cooc_counts,
|
| 1184 |
+
x="count",
|
| 1185 |
+
y="drug_keyword",
|
| 1186 |
+
color="crime_keyword",
|
| 1187 |
+
title="Drug-Crime Keyword Co-occurrence",
|
| 1188 |
+
orientation="h"
|
| 1189 |
+
)
|
| 1190 |
+
st.plotly_chart(fig_cooc, use_container_width=True)
|
| 1191 |
+
else:
|
| 1192 |
+
st.info("No significant keyword co-occurrences found")
|
| 1193 |
+
|
| 1194 |
+
# Temporal clustering
|
| 1195 |
+
st.subheader("β° Temporal Activity Clusters")
|
| 1196 |
+
|
| 1197 |
+
if "datetime" in df.columns and "username" in df.columns:
|
| 1198 |
+
df_copy = df.copy()
|
| 1199 |
+
df_copy["hour"] = df_copy["datetime"].dt.hour
|
| 1200 |
+
df_copy["day_of_week"] = df_copy["datetime"].dt.day_name()
|
| 1201 |
+
|
| 1202 |
+
# Find users active at unusual hours (late night/early morning)
|
| 1203 |
+
unusual_hours = [0, 1, 2, 3, 4, 5]
|
| 1204 |
+
night_activity = df_copy[df_copy["hour"].isin(unusual_hours)]
|
| 1205 |
+
|
| 1206 |
+
if len(night_activity) > 0:
|
| 1207 |
+
night_users = night_activity.groupby("username").size().reset_index(name="night_tweets")
|
| 1208 |
+
night_users = night_users[night_users["night_tweets"] >= 3].sort_values("night_tweets", ascending=False)
|
| 1209 |
+
|
| 1210 |
+
if not night_users.empty:
|
| 1211 |
+
st.write(f"### π Users Active During Late Night (12 AM - 6 AM)")
|
| 1212 |
+
|
| 1213 |
+
fig_night = px.bar(
|
| 1214 |
+
night_users.head(15),
|
| 1215 |
+
x="night_tweets",
|
| 1216 |
+
y="username",
|
| 1217 |
+
orientation="h",
|
| 1218 |
+
title="Top Users with Late Night Activity"
|
| 1219 |
+
)
|
| 1220 |
+
fig_night.update_layout(yaxis=dict(autorange="reversed"))
|
| 1221 |
+
st.plotly_chart(fig_night, use_container_width=True)
|
| 1222 |
+
|
| 1223 |
+
st.info("β οΈ Late night activity may indicate suspicious behavior patterns")
|
| 1224 |
+
|
| 1225 |
# ------------------------
|
| 1226 |
# GEOGRAPHIC ANALYSIS (Enhanced)
|
| 1227 |
# ------------------------
|