lawlevisan committed on
Commit
d68e0e5
·
verified ·
1 Parent(s): b1eb703

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +299 -49
src/streamlit_app.py CHANGED
@@ -1,4 +1,4 @@
1
- #streamlit_app.py
2
  import streamlit as st
3
  import pandas as pd
4
  import os
@@ -7,57 +7,32 @@ import plotly.express as px
7
  import plotly.graph_objects as go
8
  from plotly.subplots import make_subplots
9
  from datetime import datetime, timedelta
10
- import numpy as np
11
- import time
12
- import re
13
- import seaborn as sns
14
- import matplotlib.pyplot as plt
15
-
16
- from evaluation import evaluate_model
17
 
18
- # Run evaluation on the scraped CSV folder (no logs shown)
19
- evaluate_model("drug_analysis_data_3months")
20
-
21
- st.set_page_config(
22
- page_title="Twitter Drug Crime Monitoring",
23
- layout="wide",
24
- initial_sidebar_state="expanded"
25
- )
26
-
27
-
28
- # Import NLTK with error handling
29
  import nltk
30
- from nltk.corpus import stopwords
31
 
 
32
  try:
33
  stopwords.words('english')
34
  except LookupError:
35
- nltk.download('stopwords', quiet=True)
36
- nltk.download('vader_lexicon', quiet=True)
37
- nltk.download('punkt', quiet=True)
38
 
39
  # Now you can safely use it
40
  english_stopwords = stopwords.words('english')
41
 
42
- # Import autorefresh with fallback
43
- try:
44
- from streamlit_autorefresh import st_autorefresh
45
- AUTOREFRESH_AVAILABLE = True
46
- except ImportError:
47
- AUTOREFRESH_AVAILABLE = False
48
 
49
- # Check required secrets AFTER imports
50
- required_secrets = ["MONGO_URI"]
51
- missing_secrets = [s for s in required_secrets if not os.getenv(s)]
52
 
53
- if missing_secrets:
54
- st.error(f"❌ Missing required secrets: {', '.join(missing_secrets)}")
55
- st.info("Please add these in Hugging Face Space Settings β†’ Repository secrets")
56
- st.warning("The dashboard will run in limited mode without database connectivity.")
57
- # Don't stop - allow dashboard to work with local CSV files
58
-
59
- # Limit dataframe size in memory
60
- MAX_ROWS_IN_MEMORY = 10000
61
 
62
  # Custom CSS for better styling
63
  st.markdown("""
@@ -164,7 +139,7 @@ def validate_dataframe(df):
164
 
165
  return True, "DataFrame is valid"
166
 
167
- @st.cache_data(ttl=600) # Cache for 10 minutes
168
  def load_data():
169
  """Load the most recent data with robust error handling."""
170
  start_time = time.time()
@@ -233,9 +208,7 @@ def load_data():
233
  st.sidebar.metric("Load Time", f"{load_time:.2f}s")
234
  st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
235
  st.sidebar.info(f"Source: {latest_file}")
236
-
237
- if len(df) > MAX_ROWS_IN_MEMORY:
238
- df = df.tail(MAX_ROWS_IN_MEMORY)
239
  return df, report_data
240
 
241
  except Exception as e:
@@ -442,15 +415,12 @@ auto_refresh = st.sidebar.checkbox("Auto-refresh (30s)")
442
  from streamlit_autorefresh import st_autorefresh
443
 
444
  if auto_refresh:
445
- if AUTOREFRESH_AVAILABLE:
446
- st_autorefresh(interval=30*1000, key="refresh")
447
- else:
448
- st.sidebar.warning("⚠️ Auto-refresh not available. Install streamlit-autorefresh package.")
449
 
450
  # Navigation tabs - ENHANCED with new options
451
  analysis_type = st.sidebar.radio(
452
  "Select Analysis View",
453
- ["Summary", "Risk Analysis", "Actionable Insights",
454
  "Geographic Analysis", "User Analysis",
455
  "Content Analysis", "πŸ“Š Volume Trends", "🧠 User Behavior",
456
  "πŸ“ Heatmaps", "⚠️ Risk Patterns"]
@@ -501,6 +471,23 @@ if search_term:
501
  # Display current filter status
502
  st.sidebar.info(f"Showing {len(df)} tweets")
503
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
  # ------------------------
505
  # EXECUTIVE SUMMARY
506
  # ------------------------
@@ -972,6 +959,269 @@ elif analysis_type == "Actionable Insights":
972
  fig_users.update_layout(yaxis=dict(autorange="reversed"))
973
  st.plotly_chart(fig_users, use_container_width=True)
974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975
  # ------------------------
976
  # GEOGRAPHIC ANALYSIS (Enhanced)
977
  # ------------------------
 
1
+ #modify_app.py
2
  import streamlit as st
3
  import pandas as pd
4
  import os
 
7
  import plotly.graph_objects as go
8
  from plotly.subplots import make_subplots
9
  from datetime import datetime, timedelta
 
 
 
 
 
 
 
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  import nltk
12
+ from nltk.corpus import stopwords # βœ… import first
13
 
14
+ # Ensure stopwords data is downloaded
15
  try:
16
  stopwords.words('english')
17
  except LookupError:
18
+ nltk.download('stopwords')
 
 
19
 
20
  # Now you can safely use it
21
  english_stopwords = stopwords.words('english')
22
 
23
+ import numpy as np
24
+ import time
25
+ import seaborn as sns
26
+ import matplotlib.pyplot as plt
27
+ from alerts import compute_dynamic_risk,assign_dynamic_risk_level,trigger_alerts
28
+ from evaluation import evaluate_model
29
 
30
+ # Run evaluation on the scraped CSV folder
31
+ evaluate_model("drug_analysis_data_3months")
 
32
 
33
+ import re
34
+
35
+ st.set_page_config(page_title="Twitter Drug Crime Monitoring", layout="wide")
 
 
 
 
 
36
 
37
  # Custom CSS for better styling
38
  st.markdown("""
 
139
 
140
  return True, "DataFrame is valid"
141
 
142
+ @st.cache_data
143
  def load_data():
144
  """Load the most recent data with robust error handling."""
145
  start_time = time.time()
 
208
  st.sidebar.metric("Load Time", f"{load_time:.2f}s")
209
  st.sidebar.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
210
  st.sidebar.info(f"Source: {latest_file}")
211
+
 
 
212
  return df, report_data
213
 
214
  except Exception as e:
 
415
  from streamlit_autorefresh import st_autorefresh
416
 
417
  if auto_refresh:
418
+ st_autorefresh(interval=30*1000, key="refresh")
 
 
 
419
 
420
  # Navigation tabs - ENHANCED with new options
421
  analysis_type = st.sidebar.radio(
422
  "Select Analysis View",
423
+ ["Summary", "Risk Analysis", "Actionable Insights", "πŸ“ˆ Predictive Analytics", "🌐 Network Analysis",
424
  "Geographic Analysis", "User Analysis",
425
  "Content Analysis", "πŸ“Š Volume Trends", "🧠 User Behavior",
426
  "πŸ“ Heatmaps", "⚠️ Risk Patterns"]
 
471
  # Display current filter status
472
  st.sidebar.info(f"Showing {len(df)} tweets")
473
 
474
+ # ---------------- EMAIL ALERTS SECTION ---------------
475
+
476
+ st.sidebar.header("πŸ“© Email Alerts")
477
+ num_tweets = st.sidebar.number_input(
478
+ "Number of high-risk tweets to send",
479
+ min_value=1,
480
+ max_value=50,
481
+ value=5,
482
+ step=1
483
+ )
484
+ send_button = st.sidebar.button("Send Alerts")
485
+
486
+ if send_button:
487
+ st.info(f"Sending top {num_tweets} high-risk tweets via email...")
488
+ trigger_alerts(max_tweets=num_tweets)
489
+ st.success(f"βœ… Alerts sent for top {num_tweets} tweets!")
490
+
491
  # ------------------------
492
  # EXECUTIVE SUMMARY
493
  # ------------------------
 
959
  fig_users.update_layout(yaxis=dict(autorange="reversed"))
960
  st.plotly_chart(fig_users, use_container_width=True)
961
 
962
+ # ------------------------
963
+ # NEW: PREDICTIVE ANALYTICS
964
+ # ------------------------
965
+ elif analysis_type == "πŸ“ˆ Predictive Analytics":
966
+ st.header("πŸ“ˆ Predictive Analytics & Trends")
967
+
968
+ st.subheader("πŸ“Š Activity Forecast")
969
+
970
+ if "datetime" in df.columns and len(df) >= 7:
971
+ # Daily activity trend
972
+ daily_activity = df.groupby(df["datetime"].dt.date).size().reset_index(name="count")
973
+ daily_activity.columns = ["date", "count"]
974
+ daily_activity["date"] = pd.to_datetime(daily_activity["date"])
975
+
976
+ # Calculate moving average
977
+ daily_activity["7_day_ma"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
978
+ daily_activity["trend"] = daily_activity["count"].rolling(window=7, min_periods=1).mean()
979
+
980
+ # Create forecast visualization
981
+ fig_forecast = go.Figure()
982
+
983
+ fig_forecast.add_trace(go.Scatter(
984
+ x=daily_activity["date"],
985
+ y=daily_activity["count"],
986
+ name="Actual Activity",
987
+ mode="lines+markers",
988
+ line=dict(color="#1f77b4")
989
+ ))
990
+
991
+ fig_forecast.add_trace(go.Scatter(
992
+ x=daily_activity["date"],
993
+ y=daily_activity["7_day_ma"],
994
+ name="7-Day Moving Average",
995
+ mode="lines",
996
+ line=dict(color="#ff7f0e", dash="dash")
997
+ ))
998
+
999
+ fig_forecast.update_layout(
1000
+ title="Tweet Activity Trend & Forecast",
1001
+ xaxis_title="Date",
1002
+ yaxis_title="Number of Tweets",
1003
+ hovermode="x unified"
1004
+ )
1005
+
1006
+ st.plotly_chart(fig_forecast, use_container_width=True)
1007
+
1008
+ # Trend analysis
1009
+ col1, col2, col3 = st.columns(3)
1010
+
1011
+ with col1:
1012
+ recent_avg = daily_activity["count"].tail(7).mean()
1013
+ st.metric("7-Day Average", f"{recent_avg:.1f} tweets/day")
1014
+
1015
+ with col2:
1016
+ if len(daily_activity) >= 14:
1017
+ prev_avg = daily_activity["count"].tail(14).head(7).mean()
1018
+ change = ((recent_avg - prev_avg) / prev_avg * 100) if prev_avg > 0 else 0
1019
+ st.metric("Week-over-Week Change", f"{change:+.1f}%")
1020
+
1021
+ with col3:
1022
+ peak_day = daily_activity.loc[daily_activity["count"].idxmax()]
1023
+ st.metric("Peak Activity Day", peak_day["date"].strftime("%Y-%m-%d"))
1024
+
1025
+
1026
+ # User activity prediction
1027
+ st.subheader("πŸ‘€ High-Risk User Patterns")
1028
+
1029
+ if "username" in df.columns and "risk_level" in df.columns:
1030
+ user_risk_scores = df.groupby("username").agg({
1031
+ "tweet_id": "count",
1032
+ "risk_level": lambda x: (x == "CRITICAL").sum() * 2 + (x == "HIGH").sum()
1033
+ }).reset_index()
1034
+ user_risk_scores.columns = ["username", "tweet_count", "risk_score"]
1035
+
1036
+ # Identify escalating users
1037
+ escalating_users = user_risk_scores[
1038
+ (user_risk_scores["risk_score"] > 0) &
1039
+ (user_risk_scores["tweet_count"] >= 3)
1040
+ ].sort_values("risk_score", ascending=False).head(15)
1041
+
1042
+ if not escalating_users.empty:
1043
+ fig_escalating = px.scatter(
1044
+ escalating_users,
1045
+ x="tweet_count",
1046
+ y="risk_score",
1047
+ size="risk_score",
1048
+ hover_data=["username"],
1049
+ title="High-Risk User Activity Matrix",
1050
+ labels={"tweet_count": "Number of Tweets", "risk_score": "Risk Score"}
1051
+ )
1052
+
1053
+ st.plotly_chart(fig_escalating, use_container_width=True)
1054
+
1055
+ st.write("**Users to Monitor:**")
1056
+ for _, user in escalating_users.head(10).iterrows():
1057
+ st.write(f"- @{user['username']}: {user['tweet_count']} tweets, Risk Score: {user['risk_score']}")
1058
+
1059
+
1060
+
1061
+ # ------------------------
1062
+ # NEW: NETWORK ANALYSIS
1063
+ # ------------------------
1064
+ elif analysis_type == "🌐 Network Analysis":
1065
+ st.header("🌐 Network Analysis")
1066
+
1067
+ st.subheader("πŸ‘₯ User Connection Analysis")
1068
+
1069
+ # Mentions network
1070
+ if "mentions" in df.columns:
1071
+ st.write("### User Mention Network")
1072
+
1073
+ mention_pairs = []
1074
+ for _, row in df.iterrows():
1075
+ if pd.notna(row.get("mentions")) and row["mentions"]:
1076
+ mentions = str(row["mentions"]).split()
1077
+ for mention in mentions:
1078
+ mention_clean = mention.strip("@")
1079
+ if mention_clean:
1080
+ mention_pairs.append({
1081
+ "from": row["username"],
1082
+ "to": mention_clean,
1083
+ "risk_level": row.get("risk_level", "UNKNOWN")
1084
+ })
1085
+
1086
+ if mention_pairs:
1087
+ mention_df = pd.DataFrame(mention_pairs)
1088
+
1089
+ # Top mentioned users
1090
+ top_mentioned = mention_df["to"].value_counts().head(15)
1091
+
1092
+ fig_mentioned = px.bar(
1093
+ x=top_mentioned.values,
1094
+ y=top_mentioned.index,
1095
+ orientation="h",
1096
+ title="Most Mentioned Users",
1097
+ labels={"x": "Times Mentioned", "y": "Username"}
1098
+ )
1099
+ fig_mentioned.update_layout(yaxis=dict(autorange="reversed"))
1100
+ st.plotly_chart(fig_mentioned, use_container_width=True)
1101
+
1102
+ # Connection strength
1103
+ connection_strength = mention_df.groupby(["from", "to"]).size().reset_index(name="mentions")
1104
+ strong_connections = connection_strength[connection_strength["mentions"] >= 2].sort_values("mentions", ascending=False)
1105
+
1106
+ if not strong_connections.empty:
1107
+ st.write("### πŸ”— Strong Connections (2+ mentions)")
1108
+
1109
+ for _, conn in strong_connections.head(20).iterrows():
1110
+ st.write(f"- @{conn['from']} β†’ @{conn['to']}: {conn['mentions']} times")
1111
+ else:
1112
+ st.info("No mention data available")
1113
+
1114
+ # Location clustering
1115
+ st.subheader("πŸ“ Location-Based Clustering")
1116
+
1117
+ if "user_location" in df.columns:
1118
+ location_users = df.groupby("user_location").agg({
1119
+ "username": lambda x: list(x.unique()),
1120
+ "tweet_id": "count",
1121
+ "risk_level": lambda x: (x == "CRITICAL").sum() if "risk_level" in df.columns else 0
1122
+ }).reset_index()
1123
+
1124
+ location_users.columns = ["location", "users", "tweet_count", "critical_count"]
1125
+ location_users = location_users[location_users["location"] != ""]
1126
+ location_users = location_users[location_users["tweet_count"] >= 3]
1127
+ location_users["user_count"] = location_users["users"].apply(len)
1128
+
1129
+ if not location_users.empty:
1130
+ fig_clusters = px.scatter(
1131
+ location_users,
1132
+ x="tweet_count",
1133
+ y="user_count",
1134
+ size="critical_count",
1135
+ hover_data=["location"],
1136
+ title="Location Clusters (Activity vs Users)",
1137
+ labels={
1138
+ "tweet_count": "Total Tweets",
1139
+ "user_count": "Unique Users",
1140
+ "critical_count": "Critical Tweets"
1141
+ }
1142
+ )
1143
+
1144
+ st.plotly_chart(fig_clusters, use_container_width=True)
1145
+
1146
+ # High-density locations
1147
+ high_density = location_users.sort_values("user_count", ascending=False).head(10)
1148
+
1149
+ st.write("### πŸ™οΈ High-Density Locations")
1150
+ for _, loc in high_density.iterrows():
1151
+ with st.expander(f"{loc['location']} - {loc['user_count']} users, {loc['tweet_count']} tweets"):
1152
+ st.write(f"**Critical tweets:** {loc['critical_count']}")
1153
+ st.write(f"**Users:** {', '.join(['@' + u for u in loc['users'][:10]])}")
1154
+ if len(loc['users']) > 10:
1155
+ st.write(f"... and {len(loc['users']) - 10} more")
1156
+
1157
+ # Co-occurrence analysis
1158
+ st.subheader("πŸ”— Keyword Co-occurrence")
1159
+
1160
+ if "content" in df.columns:
1161
+ # Define drug/crime keywords
1162
+ drug_keywords = ["drug", "drugs", "weed", "cannabis", "cocaine", "heroin", "ganja", "charas"]
1163
+ crime_keywords = ["deal", "dealer", "selling", "supply", "smuggle", "illegal", "arrest"]
1164
+
1165
+ cooccurrence = []
1166
+
1167
+ for _, row in df.iterrows():
1168
+ content_lower = row["content"].lower()
1169
+ found_drug = [kw for kw in drug_keywords if kw in content_lower]
1170
+ found_crime = [kw for kw in crime_keywords if kw in content_lower]
1171
+
1172
+ for drug in found_drug:
1173
+ for crime in found_crime:
1174
+ cooccurrence.append({"drug_keyword": drug, "crime_keyword": crime})
1175
+
1176
+ if cooccurrence:
1177
+ cooc_df = pd.DataFrame(cooccurrence)
1178
+ cooc_counts = cooc_df.groupby(["drug_keyword", "crime_keyword"]).size().reset_index(name="count")
1179
+ cooc_counts = cooc_counts.sort_values("count", ascending=False).head(20)
1180
+
1181
+ if not cooc_counts.empty:
1182
+ fig_cooc = px.bar(
1183
+ cooc_counts,
1184
+ x="count",
1185
+ y="drug_keyword",
1186
+ color="crime_keyword",
1187
+ title="Drug-Crime Keyword Co-occurrence",
1188
+ orientation="h"
1189
+ )
1190
+ st.plotly_chart(fig_cooc, use_container_width=True)
1191
+ else:
1192
+ st.info("No significant keyword co-occurrences found")
1193
+
1194
+ # Temporal clustering
1195
+ st.subheader("⏰ Temporal Activity Clusters")
1196
+
1197
+ if "datetime" in df.columns and "username" in df.columns:
1198
+ df_copy = df.copy()
1199
+ df_copy["hour"] = df_copy["datetime"].dt.hour
1200
+ df_copy["day_of_week"] = df_copy["datetime"].dt.day_name()
1201
+
1202
+ # Find users active at unusual hours (late night/early morning)
1203
+ unusual_hours = [0, 1, 2, 3, 4, 5]
1204
+ night_activity = df_copy[df_copy["hour"].isin(unusual_hours)]
1205
+
1206
+ if len(night_activity) > 0:
1207
+ night_users = night_activity.groupby("username").size().reset_index(name="night_tweets")
1208
+ night_users = night_users[night_users["night_tweets"] >= 3].sort_values("night_tweets", ascending=False)
1209
+
1210
+ if not night_users.empty:
1211
+ st.write(f"### πŸŒ™ Users Active During Late Night (12 AM - 6 AM)")
1212
+
1213
+ fig_night = px.bar(
1214
+ night_users.head(15),
1215
+ x="night_tweets",
1216
+ y="username",
1217
+ orientation="h",
1218
+ title="Top Users with Late Night Activity"
1219
+ )
1220
+ fig_night.update_layout(yaxis=dict(autorange="reversed"))
1221
+ st.plotly_chart(fig_night, use_container_width=True)
1222
+
1223
+ st.info("⚠️ Late night activity may indicate suspicious behavior patterns")
1224
+
1225
  # ------------------------
1226
  # GEOGRAPHIC ANALYSIS (Enhanced)
1227
  # ------------------------