lawlevisan committed on
Commit
228c79c
Β·
verified Β·
1 Parent(s): 84d37de

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +902 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,904 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from wordcloud import WordCloud
6
+ import matplotlib.pyplot as plt
7
+ import folium
8
+ from folium.plugins import HeatMap, MarkerCluster
9
+ from streamlit_folium import st_folium
10
+ from datetime import datetime, timedelta
11
+ import re
12
+ import os
13
+ from textblob import TextBlob
14
+
15
# ------------------------
# Config
# ------------------------
st.set_page_config(
    page_title="Reddit based Drug Crime Intelligence Dashboard",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Paths to data files (relative to the app's working directory)
POSTS_FILE = "data/processed/reddit_posts_filtered.csv"
COMMENTS_FILE = "data/processed/reddit_comments_filtered.csv"
WARD_COORDS_FILE = "data/bangalore_wards_coordinates.csv"
DISTRICT_COORDS_FILE = "data/karnataka_districts_coordinates.csv"

# Keyword groups driving the classifiers below:
#   'high_risk'  — trade/enforcement terms (weighted highest),
#   'substance'  — specific drug names,
#   'activity'   — generic drug-related behaviour (weighted lowest).
DRUG_KEYWORDS = {
    'high_risk': ['dealing', 'dealer', 'supply', 'trafficking', 'smuggling', 'cartel', 'seized', 'arrest', 'raid'],
    'substance': ['cocaine', 'heroin', 'mdma', 'meth', 'cannabis', 'marijuana', 'ganja', 'weed', 'lsd', 'ecstasy'],
    'activity': ['selling', 'buying', 'distribution', 'possession', 'consumption', 'overdose', 'addiction'],
}
36
+
37
# ------------------------
# Enhanced Data Loading
# ------------------------
@st.cache_data
def load_data(posts_file, comments_file, ward_file, district_file):
    """Load the posts/comments/ward/district CSVs with per-file error handling.

    Args:
        posts_file, comments_file, ward_file, district_file: CSV paths.

    Returns:
        (posts, comments, wards, districts, data_status): four DataFrames
        (empty on any failure) plus a dict of per-dataset success flags.

    NOTE(review): the st.sidebar.* status messages run inside a cached
    function, so they only appear on cache misses — confirm intended.
    """

    def _load(path, warn_name, err_name, **read_kwargs):
        """Read one CSV; return (df, ok), reporting failures in the sidebar."""
        try:
            return pd.read_csv(path, **read_kwargs), True
        except FileNotFoundError:
            st.sidebar.warning(f"⚠️ {warn_name} file not found")
        except Exception as e:
            st.sidebar.error(f"❌ Error loading {err_name}: {str(e)}")
        return pd.DataFrame(), False

    data_status = {"posts": False, "comments": False, "wards": False, "districts": False}

    # Posts are read as strings so Reddit IDs/scores survive untouched.
    posts, ok = _load(posts_file, "Reddit posts", "posts", dtype=str)
    if ok:
        # Guard the dedupe: a malformed export may lack the 'id' column
        # (the comments branch already guarded; posts now matches it).
        if "id" in posts.columns:
            posts = posts.drop_duplicates(subset=["id"], keep="first")
        data_status["posts"] = True
        st.sidebar.success(f"✅ Posts loaded: {len(posts)} records")

    comments, ok = _load(comments_file, "Reddit comments", "comments")
    if ok:
        if "id" in comments.columns:
            comments = comments.drop_duplicates(subset=["id"], keep="first")
        data_status["comments"] = True
        st.sidebar.success(f"✅ Comments loaded: {len(comments)} records")

    wards, ok = _load(ward_file, "Ward coordinates", "wards")
    if ok:
        # Accept either 'ward_name' or a generic 'name' column.
        if "ward_name" not in wards.columns and "name" in wards.columns:
            wards.rename(columns={"name": "ward_name"}, inplace=True)
        data_status["wards"] = True
        st.sidebar.success(f"✅ Wards loaded: {len(wards)} wards")

    districts, ok = _load(district_file, "District coordinates", "districts")
    if ok:
        if "district_name" not in districts.columns and "name" in districts.columns:
            districts.rename(columns={"name": "district_name"}, inplace=True)
        data_status["districts"] = True
        st.sidebar.success(f"✅ Districts loaded: {len(districts)} districts")

    return posts, comments, wards, districts, data_status
101
+
102
# ------------------------
# Crime Analysis Functions
# ------------------------
def classify_crime_severity(text):
    """Bucket *text* into a severity tier from weighted keyword hits.

    'high_risk' terms count 3 points, 'substance' terms 2, 'activity'
    terms 1; the summed score maps to Critical (>=5), High (>=3),
    Medium (>=1), else Low.
    """
    lowered = str(text).lower()
    weights = (("high_risk", 3), ("substance", 2), ("activity", 1))
    total = sum(
        weight
        for group, weight in weights
        for term in DRUG_KEYWORDS[group]
        if term in lowered
    )
    if total >= 5:
        return 'Critical'
    if total >= 3:
        return 'High'
    if total >= 1:
        return 'Medium'
    return 'Low'
130
+
131
def extract_drug_mentions(text):
    """Return a comma-joined list of known substances appearing in *text*.

    Matching is case-insensitive substring search over the 'substance'
    keyword list; 'Unspecified' is returned when nothing matches.
    """
    lowered = str(text).lower()
    hits = [drug.capitalize() for drug in DRUG_KEYWORDS['substance'] if drug in lowered]
    return ', '.join(hits) if hits else 'Unspecified'
139
+
140
+ def calculate_threat_score(row):
141
+ """Calculate threat score based on multiple factors"""
142
+ score = 0
143
+ text = str(row.get('text', '')) + ' ' + str(row.get('title', ''))
144
+ text_lower = text.lower()
145
+
146
+ for keyword in DRUG_KEYWORDS['high_risk']:
147
+ if keyword in text_lower:
148
+ score += 10
149
+
150
+ if 'score' in row:
151
+ score += min(int(row.get('score', 0)) / 10, 5)
152
+
153
+ if 'num_comments' in row:
154
+ score += min(int(row.get('num_comments', 0)) / 5, 5)
155
+
156
+ sentiment = TextBlob(text).sentiment.polarity
157
+ if sentiment < -0.2:
158
+ score += 5
159
+
160
+ return min(score, 100)
161
+
162
# ------------------------
# Load All Data
# ------------------------
posts_df, comments_df, wards_df, districts_df, data_status = load_data(
    POSTS_FILE,
    COMMENTS_FILE,
    WARD_COORDS_FILE,
    DISTRICT_COORDS_FILE,
)
168
+
169
# ------------------------
# Data Processing
# ------------------------
def process_datetime(df, datetime_col='created_utc'):
    """Attach datetime/date/hour/day_of_week columns parsed from *datetime_col*.

    Returns *df* unchanged when the column is absent; unparseable
    timestamps become NaT (errors='coerce') rather than raising.
    Mutates and returns the same DataFrame.
    """
    if datetime_col not in df.columns:
        return df

    parsed = pd.to_datetime(df[datetime_col], errors='coerce')
    df["datetime"] = parsed
    df["date"] = parsed.dt.date
    df["hour"] = parsed.dt.hour
    df["day_of_week"] = parsed.dt.day_name()
    return df
182
+
183
# Normalize coordinate names for case-insensitive joins later on
if not wards_df.empty and "ward_name" in wards_df.columns:
    wards_df["ward_name"] = wards_df["ward_name"].astype(str).str.strip().str.lower()

if not districts_df.empty and "district_name" in districts_df.columns:
    districts_df["district_name"] = districts_df["district_name"].astype(str).str.strip().str.lower()

# Common aliases folded onto canonical district names
district_mapping = {
    "bangalore": "bengaluru",
    "blr": "bengaluru",
    "mysore": "mysuru",
}

# Word-boundary alternation regexes built from the known location names
ward_pattern = None
district_pattern = None

if not wards_df.empty:
    ward_names = wards_df["ward_name"].str.lower().tolist()
    ward_pattern = r'\b(' + '|'.join(re.escape(name) for name in ward_names) + r')\b'

if not districts_df.empty:
    district_names = districts_df["district_name"].str.lower().tolist()
    district_pattern = r'\b(' + '|'.join(re.escape(name) for name in district_names) + r')\b'
208
+
209
def extract_locations(text_series, patterns):
    """Extract known location names from each text in *text_series*.

    Args:
        text_series: Series of free text; NaN is treated as empty.
        patterns: list of regex patterns whose capture group yields the
            location names to look for.

    Returns:
        Series (same index) of comma-joined unique matches per text;
        empty string when nothing matched.

    Fix: de-duplication previously used ``list(set(matches))``, whose
    ordering depends on the per-process string hash seed, so identical
    data could yield "a, b" on one run and "b, a" on the next.
    ``dict.fromkeys`` dedupes deterministically in first-seen order.
    """
    locations = []
    for text in text_series.fillna(""):
        matches = []
        for pattern in patterns:
            matches.extend(re.findall(pattern, str(text).lower()))
        unique_matches = list(dict.fromkeys(matches))
        locations.append(", ".join(unique_matches))
    return pd.Series(locations, index=text_series.index)
219
+
220
# Process posts: timestamps, locations, severity, substances, sentiment
if not posts_df.empty:
    posts_df = process_datetime(posts_df)

    # One searchable text per post: title + body
    post_text = (posts_df.get("title", "") + " " + posts_df.get("text", "")).fillna("")

    posts_df["ward_location"] = (
        extract_locations(post_text, [ward_pattern]) if ward_pattern else ""
    )
    posts_df["district_location"] = (
        extract_locations(post_text, [district_pattern]) if district_pattern else ""
    )

    # Fold aliases (e.g. 'bangalore') onto canonical district names
    posts_df["district_location"] = posts_df["district_location"].replace(district_mapping)

    posts_df["severity"] = post_text.apply(classify_crime_severity)
    posts_df["drugs_mentioned"] = post_text.apply(extract_drug_mentions)
    posts_df["threat_score"] = posts_df.apply(calculate_threat_score, axis=1)

    # TextBlob polarity in [-1, 1], bucketed into a categorical label
    posts_df["sentiment_score"] = post_text.apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    posts_df["sentiment"] = posts_df["sentiment_score"].apply(
        lambda x: "Positive" if x > 0 else ("Negative" if x < 0 else "Neutral")
    )

# Process comments: timestamps only
if not comments_df.empty:
    comments_df = process_datetime(comments_df)
250
+
251
# ------------------------
# Dashboard Header
# ------------------------
st.title("🚨 Reddit based Drug Crime Intelligence Dashboard")
st.markdown("**Real-time intelligence analysis of drug-related criminal activities from Reddit social media monitoring**")

# ------------------------
# Sidebar Filters
# ------------------------
st.sidebar.title("🔧 Intelligence Controls")

# Manual cache bust + full rerun
if st.sidebar.button("🔄 Refresh Data"):
    st.cache_data.clear()
    st.rerun()

# Severity filter — defaults to the two most actionable tiers
if not posts_df.empty and "severity" in posts_df.columns:
    severity_filter = st.sidebar.multiselect(
        "⚠️ Crime Severity Level",
        options=['Critical', 'High', 'Medium', 'Low'],
        default=['Critical', 'High'],
    )
    if severity_filter:
        posts_df = posts_df[posts_df["severity"].isin(severity_filter)]

# Date range filter over the observed posting window
if not posts_df.empty and "datetime" in posts_df.columns:
    min_date = posts_df["datetime"].min().date()
    max_date = posts_df["datetime"].max().date()

    date_range = st.sidebar.date_input(
        "📅 Select Date Range",
        value=(min_date, max_date),
        min_value=min_date,
        max_value=max_date,
    )

    # date_input yields a single value while the user is mid-selection
    if len(date_range) == 2:
        start, end = date_range
        posts_df = posts_df[(posts_df["date"] >= start) & (posts_df["date"] <= end)]

# Subreddit filter, defaulting to the five busiest communities
if not posts_df.empty and "subreddit" in posts_df.columns:
    subreddits = st.sidebar.multiselect(
        "📱 Filter by Subreddits",
        options=posts_df["subreddit"].unique(),
        default=posts_df["subreddit"].value_counts().head(5).index.tolist(),
    )
    if subreddits:
        posts_df = posts_df[posts_df["subreddit"].isin(subreddits)]

# Free-text keyword search across body and title
search_keyword = st.sidebar.text_input("🔍 Search Keywords in Content")
if search_keyword:
    body_hits = posts_df["text"].str.contains(search_keyword, case=False, na=False)
    title_hits = posts_df["title"].str.contains(search_keyword, case=False, na=False)
    posts_df = posts_df[body_hits | title_hits]
311
+
312
# ------------------------
# Main Dashboard Content
# ------------------------

if posts_df.empty and comments_df.empty:
    st.error("🚫 No intelligence data available. Please ensure data collection is operational.")
    st.stop()

# --- Crime Intelligence Metrics
st.subheader("📊 Crime Intelligence Overview")
col1, col2, col3, col4 = st.columns(4)

with col1:
    # Critical-tier count, with its share of all (filtered) posts as delta
    critical_posts = 0
    if "severity" in posts_df.columns:
        critical_posts = len(posts_df[posts_df["severity"] == "Critical"])
    share = f"{(critical_posts / len(posts_df) * 100):.1f}%" if len(posts_df) > 0 else "0%"
    st.metric(label="Critical Threats", value=critical_posts, delta=share)

with col2:
    avg_threat = posts_df["threat_score"].mean() if "threat_score" in posts_df.columns else 0
    st.metric(
        label="Avg Threat Score",
        value=f"{avg_threat:.1f}",
        delta="High" if avg_threat > 50 else "Moderate",
    )

with col3:
    if "ward_location" in posts_df.columns:
        # Explode the comma-joined ward lists to count distinct wards
        located = posts_df[posts_df["ward_location"] != ""].copy()
        located["ward_location"] = located["ward_location"].str.split(", ")
        located = located.explode("ward_location")
        st.metric(label="Active Locations", value=located["ward_location"].nunique())

with col4:
    drug_types = 0
    if "drugs_mentioned" in posts_df.columns:
        drug_types = posts_df["drugs_mentioned"].str.split(", ").explode().nunique()
    st.metric(label="Drug Types Identified", value=drug_types)

st.markdown("---")
359
+
360
# --- Crime Severity Distribution
if "severity" in posts_df.columns:
    st.subheader("⚠️ Crime Severity Analysis")

    col1, col2 = st.columns(2)

    # Shared palette for the four severity tiers
    severity_palette = {
        'Critical': '#FF0000',
        'High': '#FF6B00',
        'Medium': '#FFD700',
        'Low': '#90EE90',
    }

    with col1:
        severity_counts = posts_df["severity"].value_counts()
        fig_severity = px.pie(
            values=severity_counts.values,
            names=severity_counts.index,
            title="Crime Severity Distribution",
            color=severity_counts.index,
            color_discrete_map=severity_palette,
        )
        st.plotly_chart(fig_severity, use_container_width=True)

    with col2:
        fig_threat = px.histogram(
            posts_df,
            x="threat_score",
            nbins=20,
            title="Threat Score Distribution",
            labels={"threat_score": "Threat Score", "count": "Number of Posts"},
        )
        # Visual cue at the 50-point "high threat" cut-off
        fig_threat.add_vline(x=50, line_dash="dash", line_color="red", annotation_text="High Threat Threshold")
        st.plotly_chart(fig_threat, use_container_width=True)

st.markdown("---")
394
+
395
# --- Drug Type Analysis
if "drugs_mentioned" in posts_df.columns:
    st.subheader("💊 Substance Intelligence")

    # Explode multi-substance posts and drop the 'Unspecified' placeholder
    all_drugs = posts_df["drugs_mentioned"].str.split(", ").explode()
    drug_counts = all_drugs[all_drugs != "Unspecified"].value_counts().head(10)

    if not drug_counts.empty:
        fig_drugs = px.bar(
            x=drug_counts.values,
            y=drug_counts.index,
            orientation='h',
            title="Top 10 Substances Mentioned",
            labels={"x": "Mentions", "y": "Substance"},
            color=drug_counts.values,
            color_continuous_scale="Reds",
        )
        st.plotly_chart(fig_drugs, use_container_width=True)

st.markdown("---")
415
+
416
# --- Timeline Analysis
if "date" in posts_df.columns:
    st.subheader("📈 Crime Activity Timeline")

    col1, col2 = st.columns(2)

    with col1:
        # Incidents per day, one line per severity tier
        daily_data = posts_df.groupby(["date", "severity"]).size().reset_index(name="count")
        fig_daily = px.line(
            daily_data,
            x="date",
            y="count",
            color="severity",
            title="Daily Crime Activity by Severity",
            labels={"count": "Number of Incidents", "date": "Date"},
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90',
            },
        )
        st.plotly_chart(fig_daily, use_container_width=True)

    with col2:
        if "hour" in posts_df.columns and "day_of_week" in posts_df.columns:
            # Day-of-week x hour-of-day density of incidents
            hourly_activity = posts_df.groupby(["day_of_week", "hour"]).size().reset_index(name="count")
            fig_hourly = px.density_heatmap(
                hourly_activity,
                x="hour",
                y="day_of_week",
                z="count",
                title="Activity Heatmap - High-Risk Hours",
                labels={"hour": "Hour of Day", "day_of_week": "Day", "count": "Incidents"},
                color_continuous_scale="Reds",
            )
            st.plotly_chart(fig_hourly, use_container_width=True)

st.markdown("---")
455
+
456
# --- Geographic Intelligence - COMBINED MAP
st.subheader("🗺️ Geographic Crime Intelligence")

# Process both ward and district data
ward_data_available = not wards_df.empty and "ward_location" in posts_df.columns
district_data_available = not districts_df.empty and "district_location" in posts_df.columns

if ward_data_available or district_data_available:
    st.markdown("**Crime hotspot analysis across Karnataka (Wards & Districts)**")

    # Prepare ward data: explode comma-joined ward lists, count incidents,
    # and join coordinates from the ward lookup table.
    merged_wards = pd.DataFrame()
    if ward_data_available:
        ward_posts = posts_df[posts_df["ward_location"] != ""].copy()
        ward_exploded = ward_posts.copy()
        ward_exploded["ward_location"] = ward_posts["ward_location"].str.split(", ")
        ward_exploded = ward_exploded.explode("ward_location")
        ward_exploded["ward_location"] = ward_exploded["ward_location"].str.strip().str.lower()

        loc_counts = ward_exploded.groupby("ward_location").size().reset_index(name="count")
        merged_wards = pd.merge(loc_counts, wards_df, left_on="ward_location", right_on="ward_name", how="inner")
        merged_wards["location_type"] = "Ward"
        merged_wards["location_name"] = merged_wards["ward_name"]

    # Prepare district data the same way
    merged_districts = pd.DataFrame()
    if district_data_available:
        district_posts = posts_df[posts_df["district_location"] != ""].copy()
        district_exploded = district_posts.copy()
        district_exploded["district_location"] = district_posts["district_location"].str.split(", ")
        district_exploded = district_exploded.explode("district_location")
        district_exploded["district_location"] = district_exploded["district_location"].str.strip().str.lower()

        district_counts = district_exploded.groupby("district_location").size().reset_index(name="count")
        merged_districts = pd.merge(district_counts, districts_df, left_on="district_location", right_on="district_name", how="inner")
        merged_districts["location_type"] = "District"
        merged_districts["location_name"] = merged_districts["district_name"]

    # Combine both datasets
    all_locations = pd.concat([merged_wards, merged_districts], ignore_index=True)

    if not all_locations.empty:
        # Centre the map on the mean of all plotted points
        # (assumes the coordinate CSVs provide 'lat'/'lon' columns — TODO confirm)
        center_lat = all_locations["lat"].mean()
        center_lon = all_locations["lon"].mean()

        m_unified = folium.Map(
            location=[center_lat, center_lon],
            zoom_start=9 if ward_data_available else 7,
            tiles="OpenStreetMap",
        )

        # Heatmap layer weighted by incident count
        heat_data = [[row["lat"], row["lon"], row["count"]] for _, row in all_locations.iterrows()]
        HeatMap(heat_data, radius=20, blur=15, max_zoom=13, gradient={
            0.0: 'blue', 0.5: 'yellow', 0.75: 'orange', 1.0: 'red'
        }).add_to(m_unified)

        # Top ~30% of locations by incident count are flagged as hotspots
        threshold = all_locations["count"].quantile(0.70)
        all_locations["is_hotspot"] = all_locations["count"] >= threshold

        # Add markers for each location
        for _, row in all_locations.iterrows():
            location_name = row["location_name"].title()
            location_type = row["location_type"]
            incident_count = row["count"]

            # Fix: regex=False — location names are plain substrings, and an
            # un-escaped name containing regex metacharacters would otherwise
            # mis-match or raise inside str.contains.
            if location_type == "Ward":
                loc_data = posts_df[posts_df["ward_location"].str.contains(row["location_name"], case=False, na=False, regex=False)]
            else:
                loc_data = posts_df[posts_df["district_location"].str.contains(row["location_name"], case=False, na=False, regex=False)]

            # Severity breakdown for the popup
            severity_breakdown = loc_data["severity"].value_counts().to_dict()
            severity_html = "<br>".join([f"&nbsp;&nbsp;• {sev}: {count}" for sev, count in severity_breakdown.items()])

            critical_count = severity_breakdown.get("Critical", 0)

            # Top substances seen at this location
            loc_drugs = loc_data["drugs_mentioned"].str.split(", ").explode()
            top_drugs = loc_drugs[loc_drugs != "Unspecified"].value_counts().head(3)
            drugs_html = "<br>".join([f"&nbsp;&nbsp;• {drug}: {count}" for drug, count in top_drugs.items()])

            avg_threat = loc_data["threat_score"].mean()

            # Three highest-threat incidents for the popup
            recent = loc_data.nlargest(3, "threat_score")[["title", "severity", "threat_score"]]
            incidents_html = "<br>".join([
                f"&nbsp;&nbsp;• <b>[{r['severity']}]</b> {r['title'][:50]}... <i>(Score: {r['threat_score']:.0f})</i>"
                for _, r in recent.iterrows()
            ])

            # Marker colour: hotspots darkest, then graded by raw count
            marker_color = 'darkred' if row["is_hotspot"] else ('red' if incident_count >= 5 else ('orange' if incident_count >= 3 else 'blue'))

            # Create detailed popup
            popup_html = f"""
            <div style='width: 350px; font-family: Arial, sans-serif;'>
                <h3 style='color: {marker_color}; margin-bottom: 8px; border-bottom: 2px solid {marker_color}; padding-bottom: 5px;'>
                    {location_type}: {location_name}
                </h3>
                <div style='margin: 10px 0;'>
                    <b>📊 Total Incidents:</b> <span style='font-size: 18px; color: {marker_color};'>{incident_count}</span><br>
                    <b>🚨 Critical Threats:</b> <span style='font-size: 18px; color: darkred;'>{critical_count}</span><br>
                    <b>📈 Avg Threat Score:</b> <span style='font-size: 16px;'>{avg_threat:.1f}/100</span>
                </div>
                <hr style='border: 1px solid #ddd;'>
                <div style='margin: 10px 0;'>
                    <b>⚠️ Severity Breakdown:</b><br>
                    {severity_html if severity_html else '&nbsp;&nbsp;No data'}
                </div>
                <hr style='border: 1px solid #ddd;'>
                <div style='margin: 10px 0;'>
                    <b>💊 Top Substances Detected:</b><br>
                    {drugs_html if not top_drugs.empty else '&nbsp;&nbsp;None identified'}
                </div>
                <hr style='border: 1px solid #ddd;'>
                <div style='margin: 10px 0;'>
                    <b>🎯 Recent High-Threat Incidents:</b><br>
                    {incidents_html if not recent.empty else '&nbsp;&nbsp;None'}
                </div>
                <div style='margin-top: 10px; padding: 5px; background-color: #f0f0f0; border-radius: 5px; text-align: center; font-size: 11px;'>
                    <i>Click marker for details • Hover for quick info</i>
                </div>
            </div>
            """

            # Tooltip (hover text)
            tooltip_text = f"""
            <b>{location_type}: {location_name}</b><br>
            Total Incidents: {incident_count}<br>
            Critical: {critical_count} | Avg Threat: {avg_threat:.1f}
            """

            folium.CircleMarker(
                location=[row["lat"], row["lon"]],
                radius=min(incident_count * 2.5 if location_type == "Ward" else incident_count * 3.5, 25),
                color=marker_color,
                fill=True,
                fill_color=marker_color,
                fill_opacity=0.7,
                weight=2,
                popup=folium.Popup(popup_html, max_width=400),
                tooltip=folium.Tooltip(tooltip_text, sticky=True),
            ).add_to(m_unified)

        # Display map
        st_folium(m_unified, width="100%", height=700)

        # Hotspot analysis table
        st.subheader("🔥 Top Crime Hotspots")

        col1 = st.columns(1)

        with col1[0]:
            st.markdown("**High-Activity Wards**")
            if not merged_wards.empty:
                ward_display = merged_wards.sort_values("count", ascending=False).head(10)
                st.dataframe(
                    ward_display[["ward_name", "count"]].rename(columns={
                        "ward_name": "Ward Name",
                        "count": "Incidents",
                    }).reset_index(drop=True),
                    use_container_width=True,
                    height=300,
                )
            else:
                st.info("No ward data available")

st.markdown("---")
635
+
636
# --- High-Priority Intelligence Reports
st.subheader("🚨 High-Priority Intelligence Reports")

if not posts_df.empty:
    # Critical/High severity, or anything scoring >= 50, highest threat first
    priority_posts = posts_df[
        (posts_df["severity"].isin(['Critical', 'High'])) |
        (posts_df["threat_score"] >= 50)
    ].sort_values("threat_score", ascending=False)

    if not priority_posts.empty:
        # Fix: guard the dedupe — drop_duplicates(subset=['id']) raises
        # KeyError when the 'id' column is missing (the comments loader
        # already guards this way).
        if 'id' in priority_posts.columns:
            priority_posts = priority_posts.drop_duplicates(subset=['id'], keep='first')

        display_cols = ["datetime", "title", "severity", "threat_score", "drugs_mentioned", "ward_location", "subreddit"]
        available_cols = [col for col in display_cols if col in priority_posts.columns]

        st.dataframe(
            priority_posts[available_cols].head(50).rename(columns={
                "datetime": "Timestamp",
                "title": "Intelligence Report",
                "severity": "Severity",
                "threat_score": "Threat Score",
                "drugs_mentioned": "Substances",
                "ward_location": "Location",
                "subreddit": "Source",
            }),
            use_container_width=True,
            height=400,
        )

        st.download_button(
            label="📥 Download Priority Reports (CSV)",
            data=priority_posts[available_cols].to_csv(index=False).encode("utf-8"),
            file_name=f"priority_intelligence_{datetime.now().strftime('%Y%m%d')}.csv",
            mime="text/csv",
        )
    else:
        st.info("No high-priority incidents in selected date range")
else:
    st.info("No intelligence data available")

st.markdown("---")
677
+
678
# --- Advanced Analytics Section
st.subheader("🔬 Advanced Crime Analytics")

col1, col2 = st.columns(2)

with col1:
    if "hour" in posts_df.columns and "severity" in posts_df.columns:
        st.markdown("**Crime Patterns by Time of Day**")
        # Incidents per (hour, severity) bucket, stacked by severity
        time_severity = posts_df.groupby(["hour", "severity"]).size().reset_index(name="count")
        fig_time = px.bar(
            time_severity,
            x="hour",
            y="count",
            color="severity",
            title="Crime Activity by Hour and Severity",
            labels={"hour": "Hour of Day", "count": "Incidents"},
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90',
            },
        )
        st.plotly_chart(fig_time, use_container_width=True)

with col2:
    if "sentiment_score" in posts_df.columns and "severity" in posts_df.columns:
        st.markdown("**Sentiment vs Crime Severity**")
        fig_sentiment_severity = px.box(
            posts_df,
            x="severity",
            y="sentiment_score",
            color="severity",
            title="Sentiment Distribution by Crime Severity",
            labels={"sentiment_score": "Sentiment Score", "severity": "Crime Severity"},
            color_discrete_map={
                'Critical': '#FF0000',
                'High': '#FF6B00',
                'Medium': '#FFD700',
                'Low': '#90EE90',
            },
        )
        st.plotly_chart(fig_sentiment_severity, use_container_width=True)

st.markdown("---")
723
+
724
# --- Network Analysis
if "subreddit" in posts_df.columns and "drugs_mentioned" in posts_df.columns:
    st.subheader("🕸️ Source-Substance Network Analysis")

    # Mention counts per (community, substance) pair, excluding placeholders
    source_drug = posts_df[posts_df["drugs_mentioned"] != "Unspecified"].groupby(
        ["subreddit", "drugs_mentioned"]
    ).size().reset_index(name="mentions")

    if not source_drug.empty:
        top_relationships = source_drug.nlargest(15, "mentions")

        fig_network = px.bar(
            top_relationships,
            x="mentions",
            y="subreddit",
            color="drugs_mentioned",
            orientation='h',
            title="Top Source-Substance Relationships",
            labels={"mentions": "Number of Mentions", "subreddit": "Source Community"},
            height=500,
        )
        st.plotly_chart(fig_network, use_container_width=True)

st.markdown("---")
748
+
749
# --- Emerging Threats Detection
st.subheader("⚡ Emerging Threats Detection")

if "date" in posts_df.columns and "threat_score" in posts_df.columns:
    # Compare the most recent 7-day window against the 7 days before it
    today = posts_df["date"].max()
    last_week = today - timedelta(days=7)
    prev_week = last_week - timedelta(days=7)

    recent_threats = posts_df[posts_df["date"] >= last_week]["threat_score"].mean()
    previous_threats = posts_df[(posts_df["date"] >= prev_week) & (posts_df["date"] < last_week)]["threat_score"].mean()

    threat_change = ((recent_threats - previous_threats) / previous_threats * 100) if previous_threats > 0 else 0

    col1, col2, col3 = st.columns(3)

    with col1:
        # delta_color="inverse": a rising threat score is shown as bad
        st.metric(
            "Threat Level Trend",
            f"{recent_threats:.1f}",
            f"{threat_change:+.1f}%",
            delta_color="inverse",
        )

    with col2:
        # Locations seen this week but never before
        recent_locs = set(posts_df[posts_df["date"] >= last_week]["ward_location"].str.split(", ").explode())
        prev_locs = set(posts_df[posts_df["date"] < last_week]["ward_location"].str.split(", ").explode())
        st.metric("New Active Locations", len(recent_locs - prev_locs))

    with col3:
        # Spike = recent daily volume exceeds the overall average by 50%
        daily_avg = posts_df.groupby("date").size().mean()
        recent_avg = posts_df[posts_df["date"] >= last_week].groupby("date").size().mean()
        spike = recent_avg > daily_avg * 1.5
        st.metric("Activity Status", "⚠️ SPIKE" if spike else "✅ Normal")

st.markdown("---")
785
+
786
# --- Intelligence Summary Report
st.subheader("πŸ“‹ Executive Intelligence Summary")

summary_col1, summary_col2 = st.columns(2)

with summary_col1:
    st.markdown("**Key Findings:**")

    if not posts_df.empty:
        # Highest-threat ward: explode comma-separated ward lists, then
        # average the threat score per individual ward.
        if "ward_location" in posts_df.columns and "threat_score" in posts_df.columns:
            ward_posts_with_location = posts_df[posts_df["ward_location"] != ""].copy()
            if not ward_posts_with_location.empty:
                ward_exploded_threat = ward_posts_with_location.copy()
                ward_exploded_threat["ward_location"] = ward_posts_with_location["ward_location"].str.split(", ")
                ward_exploded_threat = ward_exploded_threat.explode("ward_location").reset_index(drop=True)

                ward_threat = (
                    ward_exploded_threat.groupby("ward_location")["threat_score"]
                    .mean()
                    .sort_values(ascending=False)
                )

                if not ward_threat.empty:
                    st.markdown(f"🎯 **Highest Threat Zone:** {ward_threat.index[0].title()} (Score: {ward_threat.iloc[0]:.1f})")

        if "drugs_mentioned" in posts_df.columns:
            top_drug = posts_df["drugs_mentioned"].str.split(", ").explode().value_counts()
            if len(top_drug) > 0 and top_drug.index[0] != "Unspecified":
                st.markdown(f"πŸ’Š **Primary Substance:** {top_drug.index[0]} ({top_drug.iloc[0]} mentions)")

        if "hour" in posts_df.columns:
            # mode() returns an empty Series when the column is all-NaN;
            # indexing [0] would then raise IndexError, so check first.
            peak_hours = posts_df["hour"].mode()
            if not peak_hours.empty:
                peak_hour = peak_hours[0]
                st.markdown(f"πŸ• **Peak Activity Time:** {peak_hour}:00 - {peak_hour+1}:00")

        if "subreddit" in posts_df.columns:
            top_source = posts_df["subreddit"].value_counts().index[0]
            st.markdown(f"πŸ“± **Primary Intelligence Source:** r/{top_source}")

with summary_col2:
    st.markdown("**Risk Assessment:**")

    if not posts_df.empty and "severity" in posts_df.columns:
        critical_pct = (len(posts_df[posts_df["severity"] == "Critical"]) / len(posts_df) * 100)

        # Threshold bands over the share of "Critical" incidents.
        if critical_pct > 30:
            risk_level = "πŸ”΄ CRITICAL"
            risk_desc = "Immediate action required"
        elif critical_pct > 15:
            risk_level = "🟠 HIGH"
            risk_desc = "Enhanced monitoring recommended"
        elif critical_pct > 5:
            risk_level = "🟑 MODERATE"
            risk_desc = "Standard surveillance protocols"
        else:
            risk_level = "🟒 LOW"
            risk_desc = "Routine monitoring sufficient"

        st.markdown(f"**Overall Risk Level:** {risk_level}")
        st.markdown(f"*{risk_desc}*")
        st.markdown(f"- Critical incidents: {critical_pct:.1f}%")
        st.markdown(f"- Total monitored incidents: {len(posts_df)}")
        # Guard: the severity check above does not guarantee a "date" column,
        # so accessing it unconditionally could raise KeyError.
        if "date" in posts_df.columns:
            st.markdown(f"- Date range: {posts_df['date'].min()} to {posts_df['date'].max()}")

st.markdown("---")
# --- Export Options
st.subheader("πŸ“€ Export Intelligence Reports")

export_col1, export_col2, export_col3 = st.columns(3)

# Column 1: every monitored post as a single CSV download.
with export_col1:
    if not posts_df.empty:
        st.download_button(
            label="πŸ“Š Full Dataset",
            data=posts_df.to_csv(index=False).encode("utf-8"),
            file_name=f"intelligence_full_{datetime.now().strftime('%Y%m%d')}.csv",
            mime="text/csv"
        )

# Column 2: only the rows flagged as Critical severity.
with export_col2:
    if "severity" in posts_df.columns:
        critical_rows = posts_df[posts_df["severity"] == "Critical"]
        if not critical_rows.empty:
            st.download_button(
                label="🚨 Critical Incidents",
                data=critical_rows.to_csv(index=False).encode("utf-8"),
                file_name=f"critical_incidents_{datetime.now().strftime('%Y%m%d')}.csv",
                mime="text/csv"
            )

# Column 3: the ward-level aggregation, when it was built earlier in the page.
with export_col3:
    if 'merged_wards' in locals() and not merged_wards.empty:
        st.download_button(
            label="πŸ—ΊοΈ Location Analysis",
            data=merged_wards.to_csv(index=False).encode("utf-8"),
            file_name=f"location_analysis_{datetime.now().strftime('%Y%m%d')}.csv",
            mime="text/csv"
        )

st.markdown("---")
# --- System Status Footer
st.markdown("**πŸ”’ Intelligence System Status:**")
status_cols = st.columns(4)

# One online/offline indicator per data feed, driven by the data_status flags.
_status_items = [
    ("πŸ“„ Posts:", "posts"),
    ("πŸ’¬ Comments:", "comments"),
    ("🏘️ Wards:", "wards"),
    ("🌍 Districts:", "districts"),
]
for _col, (_label, _key) in zip(status_cols, _status_items):
    with _col:
        st.write(_label, "βœ… Online" if data_status[_key] else "❌ Offline")

try:
    file_mod_time = datetime.fromtimestamp(os.path.getmtime(POSTS_FILE))
    st.markdown(f"*Intelligence data last updated: {file_mod_time.strftime('%Y-%m-%d %H:%M:%S')}*")
except OSError:
    # Narrowed from a bare `except: pass`, which also swallowed unrelated
    # errors (even KeyboardInterrupt). os.path.getmtime raises OSError when
    # the file is missing or inaccessible; only that case should be ignored.
    pass

st.markdown("---")