lawlevisan committed on
Commit
e0324fc
·
verified ·
1 Parent(s): dc0315b

Upload 5 files

Browse files
src/alerts.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # alerts.py
2
+ import os
3
+ import time
4
+ import logging
5
+ from typing import List, Optional
6
+ from email.mime.text import MIMEText
7
+ from email.mime.multipart import MIMEMultipart
8
+ import smtplib
9
+ from dotenv import load_dotenv
10
+
11
+ # local db wrapper (your db.py must expose these functions)
12
+ from db import fetch_high_risk_unnotified, mark_as_notified
13
+
14
# Load environment overrides from a local .env file before reading config.
load_dotenv()

# --- Configuration (via env) ---
# SMTP connection settings.
SMTP_SERVER = os.getenv("ALERT_SMTP", "smtp.gmail.com")
SMTP_PORT = int(os.getenv("ALERT_SMTP_PORT", "587"))

# Credentials: ALERT_EMAIL doubles as the SMTP login user.
ALERT_EMAIL = os.getenv("ALERT_EMAIL")
ALERT_PASSWORD = os.getenv("ALERT_PASSWORD")  # SMTP or app-specific password

# Comma-separated recipient list, e.g. "a@x.com,b@x.com".
ALERT_RECIPIENTS = os.getenv("ALERT_RECIPIENTS")

# Retry/pacing knobs.
SEND_RETRY = int(os.getenv("ALERT_SEND_RETRY", "2"))        # extra attempts per email
RETRY_DELAY = float(os.getenv("ALERT_RETRY_DELAY", "2.0"))  # seconds between retries
BATCH_DELAY = float(os.getenv("ALERT_BATCH_DELAY", "0.5"))  # pause between per-tweet emails

# Fail fast at import time if credentials are missing.
if not (ALERT_EMAIL and ALERT_PASSWORD):
    raise RuntimeError("ALERT_EMAIL and ALERT_PASSWORD must be set in the environment.")
29
+
30
+ # Build recipients list
31
+ def _parse_recipients(env_val: Optional[str]) -> List[str]:
32
+ if not env_val:
33
+ return [ALERT_EMAIL]
34
+ return [r.strip() for r in env_val.split(",") if r.strip()]
35
+
36
# Resolved recipient list (falls back to the sender address when unset).
RECIPIENTS = _parse_recipients(ALERT_RECIPIENTS)

# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
40
+
41
def compute_dynamic_risk(tweet: dict) -> float:
    """
    Score a tweet's overall risk on a 0-100 scale.

    Blends four signals: content (drug/crime classifier scores), user
    influence (followers + verified flag), engagement (likes + retweets)
    and geographic relevance to Karnataka.
    """
    # Content signal: equal-weighted mix of the two classifier scores.
    content = 0.5 * float(tweet.get("drug_score", 0)) + 0.5 * float(tweet.get("crime_score", 0))

    # User influence: follower count saturates at 1k, verified adds a bonus.
    followers_part = min(int(tweet.get("followers_count", 0)) / 1000, 1)
    verified_part = 1 if tweet.get("verified", False) else 0
    influence = followers_part * 0.5 + verified_part * 0.5

    # Engagement, normalised so 50+ interactions saturate at 1.0.
    interactions = int(tweet.get("like_count", 0)) + int(tweet.get("retweet_count", 0))
    engagement = min(interactions / 50, 1)

    # Geographic relevance: any Karnataka keyword in the profile location.
    loc = str(tweet.get("user_location", "")).lower()
    geo = 1 if any(kw in loc for kw in ("bangalore", "bengaluru", "karnataka")) else 0

    # Weighted blend (content dominates), scaled to 0-100.
    blended = 0.4 * content + 0.2 * influence + 0.2 * engagement + 0.2 * geo
    return round(blended * 100, 2)
74
+
75
def assign_dynamic_risk_level(tweet: dict) -> str:
    """Map a tweet's dynamic risk score onto a categorical level."""
    score = compute_dynamic_risk(tweet)
    # Thresholds: 75+ CRITICAL, 50+ HIGH, 25+ MEDIUM, otherwise LOW.
    for threshold, label in ((75, "CRITICAL"), (50, "HIGH"), (25, "MEDIUM")):
        if score >= threshold:
            return label
    return "LOW"
85
+
86
def _tweet_score_for_sort(tweet: dict) -> float:
    """Sort key: dynamic risk (0-100) plus a small engagement tiebreaker."""
    likes = int(tweet.get("like_count", 0) or 0)
    retweets = int(tweet.get("retweet_count", 0) or 0)
    return compute_dynamic_risk(tweet) + (likes + retweets) * 0.1
91
+
92
def _select_top_tweets(tweets: List[dict], max_tweets: Optional[int], send_all: bool) -> List[dict]:
    """Sort tweets by priority and optionally cap the list length."""
    if not tweets:
        return []

    # Upper-case risk_level strings so later comparisons don't miss matches.
    for tweet in tweets:
        level = tweet.get("risk_level")
        if isinstance(level, str):
            tweet["risk_level"] = level.upper()

    # Highest combined risk/engagement score first.
    ranked = sorted(tweets, key=_tweet_score_for_sort, reverse=True)

    if send_all or max_tweets is None:
        return ranked
    return ranked[:max_tweets]
108
+
109
+ def _format_tweet_html_block(tweet: dict) -> str:
110
+ """Return an HTML block describing a tweet (for batched email)."""
111
+ tweet_id = tweet.get("tweet_id", "N/A")
112
+ user = tweet.get("username") or tweet.get("user") or tweet.get("user_name") or "N/A"
113
+ content = tweet.get("content") or tweet.get("text") or ""
114
+ timestamp = tweet.get("datetime") or tweet.get("timestamp") or "N/A"
115
+ location = tweet.get("user_location") or tweet.get("location") or "N/A"
116
+ risk = tweet.get("risk_level", "N/A")
117
+ likes = tweet.get("like_count", 0)
118
+ rts = tweet.get("retweet_count", 0)
119
+ url = tweet.get("tweet_url") or f"https://x.com/{user}/status/{tweet_id}" if tweet_id != "N/A" else "N/A"
120
+
121
+ # Bulk detection
122
+ bulk_keywords = ["kg", "gram", "bulk", "kilos", "ounce", "pound"]
123
+ bulk_indicator = "Yes" if any(k in content.lower() for k in bulk_keywords) else "No"
124
+
125
+ # Contact detection (simple digit check)
126
+ contact_indicator = "Yes" if any(c.isdigit() for c in content) else "No"
127
+
128
+ html = f"""
129
+ <div style="border:1px solid #ddd;padding:10px;margin-bottom:8px;border-radius:6px;">
130
+ <p><strong>Risk:</strong> <span style="color:#b22222">{risk}</span> &nbsp;
131
+ <strong>User:</strong> @{user} &nbsp; <strong>Time:</strong> {timestamp}</p>
132
+ <p><strong>Location:</strong> {location} &nbsp;<td>{bulk_indicator}</td><td>{contact_indicator}</td><td>{content}</td><strong>Likes:</strong> {likes} &nbsp; <strong>RTs:</strong> {rts}</p>
133
+ <p style="background:#f7f7f7;padding:8px;border-radius:4px;">{content}</p>
134
+ <p><a href="{url}">View Tweet</a> | Tweet ID: {tweet_id}</p>
135
+ </div>
136
+ """
137
+ return html
138
+
139
def _compose_batched_email(tweets: List[dict]) -> MIMEMultipart:
    """Build one multipart (plain + HTML) alert email covering *tweets*.

    Layout: an optional "Top CRITICAL" summary table, then a full metrics
    table with one row per selected tweet, plus a plain-text fallback.

    Fixes over the previous version:
    - user-controlled fields (tweet text, usernames, locations, URLs) are
      HTML-escaped, so tweet content can no longer inject markup into the
      email body;
    - an explicit ``tweet_url`` now wins even when ``tweet_id`` is missing
      (the old operator precedence discarded it).
    """
    from html import escape  # stdlib; local import keeps module imports unchanged

    def _link(t: dict, user, tweet_id) -> str:
        # Prefer the stored URL; otherwise reconstruct one from user/id.
        return t.get("tweet_url") or (
            f"https://x.com/{user}/status/{tweet_id}" if tweet_id != "N/A" else "N/A"
        )

    msg = MIMEMultipart("alternative")
    msg["Subject"] = f"🚨 {len(tweets)} High-Priority Drug Alerts"
    msg["From"] = ALERT_EMAIL
    msg["To"] = ", ".join(RECIPIENTS)

    # --- Top CRITICAL summary (up to 10, highest dynamic risk first) ---
    critical_tweets = [t for t in tweets if t.get("risk_level") == "CRITICAL"]
    top_critical = sorted(critical_tweets, key=lambda t: t.get("dynamic_risk_score", 0), reverse=True)[:10]

    summary_html = ""
    if top_critical:
        summary_html += """
        <h3 style="color:#b22222;">Top CRITICAL Tweets Summary</h3>
        <table border="1" cellpadding="5" cellspacing="0" style="border-collapse: collapse;">
        <tr>
            <th>User</th><th>Dynamic Risk</th><th>Followers</th><th>Verified</th><th>Engagement</th><th>Location</th><th>Link</th>
        </tr>
        """
        for t in top_critical:
            user = t.get("username") or t.get("user") or "N/A"
            tweet_id = t.get("tweet_id", "N/A")
            verified = "Yes" if t.get("verified", False) else "No"
            engagement = int(t.get("like_count", 0)) + int(t.get("retweet_count", 0))
            summary_html += f"""
            <tr>
                <td>@{escape(str(user))}</td>
                <td>{t.get("dynamic_risk_score", 0)}</td>
                <td>{t.get("followers_count", 0)}</td>
                <td>{verified}</td>
                <td>{engagement}</td>
                <td>{escape(str(t.get("user_location", "N/A")))}</td>
                <td><a href="{escape(str(_link(t, user, tweet_id)))}">View</a></td>
            </tr>
            """
        summary_html += "</table><br>"

    # --- Main email table with all metrics ---
    html_blocks = ["""
    <table border="1" cellpadding="5" cellspacing="0" style="border-collapse: collapse;">
    <tr>
        <th>Risk</th><th>User</th><th>Dynamic Risk</th><th>Followers</th>
        <th>Verified</th><th>Engagement</th><th>Geo Score</th><th>Location</th>
        <th>Bulk</th><th>Contact</th><th>Content</th><th>Link</th>
    </tr>
    """]

    bulk_keywords = ["kg", "gram", "bulk", "kilos", "ounce", "pound"]
    for t in tweets:
        tweet_id = t.get("tweet_id", "N/A")
        user = t.get("username") or t.get("user") or t.get("user_name") or "N/A"
        content = t.get("content") or t.get("text") or ""
        location = str(t.get("user_location") or t.get("location") or "N/A").lower()
        verified = "Yes" if t.get("verified", False) else "No"
        engagement = int(t.get("like_count", 0)) + int(t.get("retweet_count", 0))
        geo_score = 1 if any(k in location for k in ["bangalore", "bengaluru", "karnataka"]) else 0

        # Bulk and contact indicators
        bulk_indicator = "Yes" if any(k in content.lower() for k in bulk_keywords) else "No"
        contact_indicator = "Yes" if any(c.isdigit() for c in content) else "No"

        html_blocks.append(f"""
        <tr>
            <td>{escape(str(t.get("risk_level", "N/A")))}</td>
            <td>@{escape(str(user))}</td>
            <td>{t.get("dynamic_risk_score", 0)}</td>
            <td>{t.get("followers_count", 0)}</td>
            <td>{verified}</td>
            <td>{engagement}</td>
            <td>{geo_score}</td>
            <td>{escape(location)}</td>
            <td>{bulk_indicator}</td>
            <td>{contact_indicator}</td>
            <td>{escape(content)}</td>
            <td><a href="{escape(str(_link(t, user, tweet_id)))}">View</a></td>
        </tr>
        """)

    html_blocks.append("</table>")

    html_text = f"""
    <html>
    <body>
        <h2 style="color:#b22222;">High-Priority Drug Alerts</h2>
        {summary_html}
        {''.join(html_blocks)}
        <hr/>
        <p>Generated by Karnataka Drug Crime Monitoring System</p>
    </body>
    </html>
    """

    # Plain-text fallback for clients that do not render HTML.
    plain_text = "\n".join(
        f"{t.get('risk_level')} | @{t.get('username')} | {t.get('dynamic_risk_score')} | {t.get('content','')[:100]}"
        for t in tweets
    )

    msg.attach(MIMEText(plain_text, "plain"))
    msg.attach(MIMEText(html_text, "html"))
    return msg
250
+
251
+
252
# --- SMTP send with retries --- #
def _send_email_message(msg: MIMEMultipart, recipients: List[str], retry: int = SEND_RETRY) -> bool:
    """Send *msg* via SMTP; return True on success, False otherwise.

    Makes up to ``retry + 1`` attempts, sleeping RETRY_DELAY seconds
    between failures. STARTTLS is negotiated when the port is 587.
    """
    attempt = 0
    while attempt <= retry:
        try:
            with smtplib.SMTP(SMTP_SERVER, SMTP_PORT, timeout=20) as s:
                s.ehlo()
                if SMTP_PORT == 587:
                    s.starttls()
                    s.ehlo()
                s.login(ALERT_EMAIL, ALERT_PASSWORD)
                s.sendmail(ALERT_EMAIL, recipients, msg.as_string())
                logging.info(f"Email sent to {recipients}")
                return True
        except Exception as e:
            attempt += 1
            logging.warning(f"Email send attempt {attempt} failed: {e}")
            if attempt > retry:
                logging.error("Exceeded email send retries.")
                return False
            time.sleep(RETRY_DELAY)
    # Reached only when retry < 0 (misconfigured env): previously this fell
    # through and implicitly returned None, breaking `if ok:` at call sites.
    return False
274
+
275
# --- Public trigger function --- #
def trigger_alerts(max_tweets: Optional[int] = 10,
                   send_all: bool = False,
                   separate_emails: bool = False):
    """Fetch unnotified high-risk tweets, email them, and mark them notified.

    max_tweets: cap on tweets per run (ignored when send_all is True).
    send_all: send every unnotified high-risk tweet.
    separate_emails: one email per tweet instead of a single batch.
    """
    logging.info("Fetching high-risk unnotified tweets from DB...")
    tweets = fetch_high_risk_unnotified()
    if not tweets:
        logging.info("No unnotified high-risk tweets found.")
        return

    # Recompute risk dynamically; this overrides any stored risk_level.
    for tweet in tweets:
        tweet["dynamic_risk_score"] = compute_dynamic_risk(tweet)
        tweet["risk_level"] = assign_dynamic_risk_level(tweet)

    selected = _select_top_tweets(tweets, max_tweets, send_all)
    if not selected:
        logging.info("No tweets selected after filtering.")
        return

    # Compose and send emails (batch or separate)
    success_ids, failure_ids = [], []

    if separate_emails:
        for tweet in selected:
            sent = _send_email_message(_compose_batched_email([tweet]), RECIPIENTS)
            if sent:
                success_ids.append((tweet.get("tweet_id"), tweet.get("_collection_name")))
            else:
                failure_ids.append(tweet.get("tweet_id"))
            time.sleep(BATCH_DELAY)  # pace outgoing mail
    else:
        sent = _send_email_message(_compose_batched_email(selected), RECIPIENTS)
        if sent:
            success_ids.extend((t.get("tweet_id"), t.get("_collection_name")) for t in selected)
        else:
            failure_ids.extend(t.get("tweet_id") for t in selected)

    # Flag successfully emailed tweets so they are not re-alerted.
    for tid, _collection in success_ids:
        try:
            mark_as_notified(tid)
        except Exception as e:
            logging.error(f"Failed to mark {tid} as notified: {e}")

    logging.info(f"Alerts sent: {len(success_ids)}; failures: {len(failure_ids)}")
    if failure_ids:
        logging.warning(f"Failed tweet IDs: {failure_ids}")
325
+
326
def compute_risk_probability(dynamic_score: float) -> float:
    """Convert a dynamic risk score (0-100) into a probability in [0, 1]."""
    # Clamp so out-of-range scores still yield a valid probability.
    scaled = dynamic_score / 100
    return max(0.0, min(scaled, 1.0))
331
+
332
+
333
# --- CLI usage example --- #
if __name__ == "__main__":
    # Other invocation patterns:
    #   trigger_alerts(max_tweets=5)                          # top 5, one batched email
    #   trigger_alerts(send_all=True)                         # every unnotified high-risk tweet
    #   trigger_alerts(max_tweets=10, separate_emails=True)   # one email per tweet
    #
    # Default: top 10 tweets in a single batched email.
    trigger_alerts(max_tweets=10)
src/db.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# db.py
"""MongoDB helpers for the drug-monitoring pipeline."""
import json
import os
import sys

import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient, UpdateOne

# Pull MONGO_URI (and friends) from a local .env file if present.
# (Previously imports were interleaved with executable code; grouping them
# at the top per PEP 8 changes nothing at runtime.)
load_dotenv()

# MongoDB configuration
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    # Abort immediately: every helper below needs a working connection URI.
    print("❌ MONGO_URI not set in environment variables.")
    sys.exit(1)

DB_NAME = "drug_monitoring_twitter"
COLLECTION_NAME = "tweets"
FOLDER_PATH = "drug_analysis_data_3months"  # folder with scraper outputs
19
+
20
def get_db_collection():
    """Connect to MongoDB and return the tweets collection.

    NOTE(review): opens a new MongoClient on every call and never closes
    it; callers rely on this behaviour, so it is preserved here.
    """
    db = MongoClient(MONGO_URI)[DB_NAME]

    # Create the collection explicitly on first use.
    if COLLECTION_NAME not in db.list_collection_names():
        db.create_collection(COLLECTION_NAME)
        print(f"✅ Created collection: {COLLECTION_NAME}")

    return db[COLLECTION_NAME]
30
+
31
def insert_all_from_folder(folder_path=FOLDER_PATH):
    """Upsert all CSV/JSON files from the scraper folder into MongoDB.

    Bug fix: ``notified`` is now written via ``$setOnInsert`` so that
    re-running the ingest no longer resets the flag on tweets that were
    already alerted. The old code put ``notified: False`` inside ``$set``,
    which re-flagged (and re-alerted) every previously notified tweet on
    each ingest run.
    """
    collection = get_db_collection()

    if not os.path.exists(folder_path):
        print(f"❌ Folder path does not exist: {folder_path}")
        return

    def _tweet_upsert(doc):
        """Build an UpdateOne for a tweet doc, or None if it has no tweet_id."""
        if "tweet_id" not in doc:
            return None
        # Drop any incoming 'notified' value: it must not clash with
        # $setOnInsert, and the DB copy is authoritative for that flag.
        doc = {k: v for k, v in doc.items() if k != "notified"}
        return UpdateOne(
            {"tweet_id": doc["tweet_id"]},
            {"$set": doc, "$setOnInsert": {"notified": False}},
            upsert=True,
        )

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        operations = []

        try:
            if file_name.endswith(".csv"):
                df = pd.read_csv(file_path, encoding="utf-8")
                for _, row in df.iterrows():
                    op = _tweet_upsert(row.to_dict())
                    if op is not None:
                        operations.append(op)

            elif file_name.endswith(".json"):
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    for tweet in data:
                        op = _tweet_upsert(tweet)
                        if op is not None:
                            operations.append(op)
                else:
                    # single JSON report, keyed by file name
                    operations.append(
                        UpdateOne({"report_name": file_name}, {"$set": data}, upsert=True)
                    )

            if operations:
                result = collection.bulk_write(operations)
                print(f"✅ {file_name} -> inserted/updated {result.upserted_count + result.modified_count} documents.")

        except Exception as e:
            print(f"❌ Failed to process {file_name}: {e}")
76
+
77
def fetch_high_risk_unnotified():
    """Return all HIGH/CRITICAL tweets that have not been alerted yet."""
    query = {"risk_level": {"$in": ["HIGH", "CRITICAL"]}, "notified": False}
    return list(get_db_collection().find(query))
81
+
82
def mark_as_notified(tweet_id):
    """Flag one tweet as alerted so future fetches skip it."""
    get_db_collection().update_one(
        {"tweet_id": tweet_id},
        {"$set": {"notified": True}},
    )
86
+
87
if __name__ == "__main__":
    # CLI entry point: bulk-load the scraper output folder into MongoDB.
    insert_all_from_folder()
    print("✅ All scraper folder contents inserted/updated successfully.")
src/enhanced_drug_crime_scraper_3months.py ADDED
@@ -0,0 +1,774 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # enhanced_drug_crime_scraper_3months.py
2
+ # Enhanced Twitter Scraper for Karnataka Police Drug Crime Analysis - Latest 3 Months
3
+ # Automatically collects data for the most recent 3 months
4
+
5
+ import asyncio
6
+ import pandas as pd
7
+ from twscrape import API, gather
8
+ from fuzzywuzzy import fuzz
9
+ from datetime import timezone, timedelta, datetime
10
+ import logging
11
+ import sys
12
+ import os
13
+ import re
14
+ from collections import Counter
15
+ import json
16
+ import hashlib
17
+ from typing import List, Dict, Set, Tuple, Optional
18
+ import warnings
19
+ warnings.filterwarnings('ignore')
20
+
21
# Sentiment analysis (optional): NLTK is a soft dependency — if importing
# it or downloading its corpora fails for any reason, sentiment scoring is
# disabled rather than crashing the scraper.
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    SENTIMENT_AVAILABLE = True
except Exception:  # narrowed from bare `except:` so Ctrl-C / SystemExit propagate
    SENTIMENT_AVAILABLE = False
    print("Warning: NLTK not available. Sentiment analysis will be disabled.")
32
+
33
# Setup comprehensive logging: a persistent file (audit trail) plus
# stdout (live progress while the scraper runs).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('drug_crime_scraper_3months.log'),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
43
+
44
+ class DrugCrimeScraper3Months:
45
    def __init__(self):
        """Initialise API handle, optional sentiment analyzer and keyword tables."""
        # Set later by the async setup code (twscrape API client) — None until then.
        self.api = None
        self.sentiment_analyzer = None
        # Tweet ids already processed, to de-duplicate across overlapping queries.
        self.seen_ids: Set[str] = set()
        self.collected_tweets: List[Dict] = []

        # Initialize sentiment analyzer if available
        if SENTIMENT_AVAILABLE:
            try:
                self.sentiment_analyzer = SentimentIntensityAnalyzer()
            except:
                logger.warning("Sentiment analyzer initialization failed")

        # Enhanced drug-related keywords with criminal context
        # (matched fuzzily against tweet text elsewhere in this class).
        self.drug_keywords = [
            # Cannabis variants
            "weed", "ganja", "hash", "hashish", "charas", "cannabis", "marijuana", "dope", "pot", "grass",
            "bhang", "nasha", "nashe", "maal", "stuff", "green", "herb", "mary jane", "bud", "kush",

            # Hard drugs
            "acid", "lsd", "mdma", "ecstasy", "molly", "coke", "cocaine", "crack", "meth", "crystal",
            "heroin", "brown sugar", "smack", "ketamine", "special k", "oxy", "oxycodone", "percocet",
            "adderall", "xanax", "benzo", "valium", "diazepam", "fentanyl", "tramadol", "codeine",

            # Drug trade terms
            "dealer", "peddler", "supplier", "pusher", "connect", "plug", "score", "scored",
            "selling", "buying", "supply", "delivery", "pickup", "drop", "stash", "cache",
            "package", "parcel", "consignment", "shipment", "bulk", "wholesale", "retail",

            # Consumption terms
            "trip", "tripping", "high", "stoned", "blazed", "hit", "dose", "line", "bump",
            "joint", "blunt", "spliff", "bong", "pipe", "chillum", "roll", "smoke",
            "inject", "snort", "pop", "drop", "chase", "shoot up", "mainline",

            # Slang and coded terms
            "420", "710", "party supplies", "study material", "medicine", "remedy",
            "good stuff", "quality stuff", "premium", "imported", "local", "organic",
            "fresh", "clean", "pure", "top shelf", "fire", "dank",

            # Emoji and coded representations (leetspeak variants included)
            "🍁", "💨", "🔥", "🌿", "💊", "💉", "🚬", "🍄", "❄️", "💎", "🌪️", "🔋",
            "w33d", "g4nja", "h4sh", "c0ke", "m0lly", "xtc", "l$d", "m@al"
        ]

        # Criminal activity indicators (law-enforcement / trafficking vocabulary)
        self.crime_indicators = [
            "arrest", "arrested", "caught", "raid", "police", "cops", "seized", "confiscated",
            "bust", "busted", "investigation", "case", "fir", "complaint", "crime", "criminal",
            "illegal", "banned", "prohibited", "contraband", "smuggling", "trafficking",
            "possession", "distribution", "manufacturing", "cultivation", "racket", "cartel",
            "kingpin", "network", "operation", "crackdown", "surveillance", "undercover",
            "ncb", "dri", "excise", "customs", "border", "interstate", "mafia"
        ]

        # Karnataka location keywords (enhanced) — pre-normalized for matching.
        self.karnataka_keywords = self._prepare_karnataka_keywords()

        # High-risk areas in Karnataka (mostly Bengaluru neighbourhoods);
        # a match here bumps the risk score in _assess_risk_level.
        self.high_risk_areas = [
            "malleswaram", "shivaji nagar", "frazer town", "russell market", "chickpet",
            "kr market", "city market", "commercial street", "brigade road", "mg road",
            "hosur road", "electronic city", "silk board", "bommanahalli", "btm layout",
            "jayanagar", "basavanagudi", "hanumanthanagar", "banashankari", "girinagar",
            "vijayanagar", "rajajinagar", "mahalakshmi layout", "peenya", "jalahalli",
            "hebbal", "rt nagar", "kammanahalli", "banaswadi", "krishnarajapuram",
            "whitefield", "marathahalli", "brookefield", "varthur", "sarjapur",
            "koramangala", "indiranagar", "domlur", "ulsoor", "vasanth nagar",
            "sadashiva nagar", "seshadripuram", "gandhinagar", "padhmanabhanagar"
        ]
114
+
115
    def _prepare_karnataka_keywords(self) -> List[str]:
        """Prepare normalized Karnataka location keywords.

        Returns each raw keyword run through _normalize_text (lower-cased,
        punctuation collapsed) so fuzzy matching is consistent.
        """
        raw_keywords = [
            # Main cities and districts
            "karnataka", "bengaluru", "bangalore", "blr", "b'lore", "namma bengaluru", "namma blr",
            "mysore", "mysuru", "mangalore", "mangaluru", "hubli", "dharwad", "belgaum", "belagavi",
            "ballari", "bellary", "bijapur", "vijayapura", "tumkur", "davangere", "shimoga", "shivamogga",
            "hassan", "chitradurga", "gadag", "bidar", "raichur", "bagalkot", "haveri", "koppal",
            "ramanagara", "chikkamagaluru", "yadgir", "mandya", "coorg", "kodagu", "karwar",
            "kolar", "chikkaballapur", "madikeri", "udupi", "manipal",

            # Bangalore areas and neighborhoods
            "shivaji nagar", "majestic", "btm", "indiranagar", "jayanagar", "koramangala",
            "whitefield", "banashankari", "hebbal", "rajajinagar", "yeshwanthpur", "kengeri",
            "marathahalli", "hoskote", "rt nagar", "rr nagar", "bsk", "mg road", "brigade road",
            "yelhanka", "basavanagudi", "malleswaram", "seshadripuram", "gandhinagar",
            "electronic city", "silk board", "bommanahalli", "peenya", "jalahalli",
            "vijayanagar", "rajajinagar", "kammanahalli", "banaswadi", "kr puram",
            "domlur", "ulsoor", "vasanth nagar", "sadashiva nagar", "frazer town",
            "commercial street", "chickpet", "kr market", "city market", "russell market",

            # Tourist and party destinations (potential drug hotspots)
            "hampi", "gokarna", "coorg", "chikmagalur", "nandi hills", "skandagiri",
            "br hills", "kudremukh", "jog falls", "murdeshwar", "karwar beach",
            "kabini", "bandipur", "nagarhole", "sakleshpur", "kemmannagundi"
        ]

        return [self._normalize_text(kw) for kw in raw_keywords]
143
+
144
+ def _normalize_text(self, text: str) -> str:
145
+ """Normalize text for better matching."""
146
+ if not isinstance(text, str):
147
+ return ""
148
+ return re.sub(r'\W+', ' ', text.lower()).strip()
149
+
150
+ def _fuzzy_match(self, text: str, keyword_list: List[str], threshold: int = 75) -> bool:
151
+ """Check if any keyword fuzzy matches text above threshold."""
152
+ text_norm = self._normalize_text(text)
153
+ if not text_norm:
154
+ return False
155
+ return any(fuzz.partial_ratio(text_norm, kw) >= threshold for kw in keyword_list)
156
+
157
+ def _compute_relevance_score(self, text: str, keyword_list: List[str], threshold: int = 70) -> int:
158
+ """Count how many keywords match text above threshold."""
159
+ text_norm = self._normalize_text(text)
160
+ return sum(1 for kw in keyword_list if fuzz.partial_ratio(text_norm, kw) >= threshold)
161
+
162
+ def _analyze_sentiment(self, text: str) -> Dict[str, float]:
163
+ """Analyze sentiment of tweet text."""
164
+ if not self.sentiment_analyzer:
165
+ return {"compound": 0.0, "pos": 0.0, "neu": 0.0, "neg": 0.0}
166
+
167
+ try:
168
+ scores = self.sentiment_analyzer.polarity_scores(text)
169
+ return scores
170
+ except:
171
+ return {"compound": 0.0, "pos": 0.0, "neu": 0.0, "neg": 0.0}
172
+
173
+ def _extract_mentions_hashtags(self, text: str) -> Tuple[List[str], List[str]]:
174
+ """Extract mentions and hashtags from tweet text."""
175
+ mentions = re.findall(r'@(\w+)', text)
176
+ hashtags = re.findall(r'#(\w+)', text.lower())
177
+ return mentions, hashtags
178
+
179
+ def _detect_phone_numbers(self, text: str) -> List[str]:
180
+ """Detect potential phone numbers in text."""
181
+ # Indian phone number patterns
182
+ patterns = [
183
+ r'\b[6-9]\d{9}\b', # 10-digit mobile
184
+ r'\+91[6-9]\d{9}\b', # With +91
185
+ r'\b91[6-9]\d{9}\b', # With 91
186
+ r'\b[6-9]\d{4}\s?\d{5}\b', # With space
187
+ ]
188
+
189
+ phone_numbers = []
190
+ for pattern in patterns:
191
+ phone_numbers.extend(re.findall(pattern, text))
192
+
193
+ return list(set(phone_numbers)) # Remove duplicates
194
+
195
+ def _assess_risk_level(self, tweet_data: Dict) -> str:
196
+ """Assess risk level based on various factors."""
197
+ risk_score = 0
198
+
199
+ # Check for crime indicators
200
+ if self._fuzzy_match(tweet_data['content'], self.crime_indicators, 70):
201
+ risk_score += 3
202
+
203
+ # Check for high drug relevance
204
+ if tweet_data['drug_score'] > 3:
205
+ risk_score += 2
206
+
207
+ # Check for high-risk locations
208
+ location_text = f"{tweet_data['user_location']} {tweet_data['content']}".lower()
209
+ if any(area in location_text for area in self.high_risk_areas):
210
+ risk_score += 2
211
+
212
+ # Check for contact information
213
+ if self._detect_phone_numbers(tweet_data['content']):
214
+ risk_score += 3
215
+
216
+ # Check for selling/buying indicators
217
+ selling_terms = ["selling", "available", "dm me", "contact", "call", "whatsapp", "delivery", "pickup"]
218
+ if any(term in tweet_data['content'].lower() for term in selling_terms):
219
+ risk_score += 2
220
+
221
+ # Check for quantity indicators (bulk operations)
222
+ quantity_terms = ["kg", "gram", "ounce", "pound", "bulk", "wholesale", "lots", "kilos"]
223
+ if any(term in tweet_data['content'].lower() for term in quantity_terms):
224
+ risk_score += 1
225
+
226
+ # Assess sentiment - negative sentiment might indicate problems
227
+ if tweet_data['sentiment']['compound'] < -0.5:
228
+ risk_score += 1
229
+
230
+ # High engagement (viral content)
231
+ if tweet_data.get('like_count', 0) > 100 or tweet_data.get('retweet_count', 0) > 50:
232
+ risk_score += 1
233
+
234
+ # Determine risk level
235
+ if risk_score >= 8:
236
+ return "CRITICAL"
237
+ elif risk_score >= 6:
238
+ return "HIGH"
239
+ elif risk_score >= 3:
240
+ return "MEDIUM"
241
+ else:
242
+ return "LOW"
243
+
244
+ def _is_karnataka_relevant(self, tweet) -> bool:
245
+ """Enhanced Karnataka relevance check."""
246
+ # User location check
247
+ if tweet.user.location and self._fuzzy_match(tweet.user.location, self.karnataka_keywords, 80):
248
+ return True
249
+
250
+ # User description check
251
+ user_desc = getattr(tweet.user, 'description', '') or ''
252
+ if self._fuzzy_match(user_desc, self.karnataka_keywords, 80):
253
+ return True
254
+
255
+ # Tweet content check
256
+ tweet_text = tweet.rawContent or ""
257
+ if self._fuzzy_match(tweet_text, self.karnataka_keywords, 75):
258
+ return True
259
+
260
+ # Hashtags check
261
+ hashtags = getattr(tweet, 'hashtags', []) or []
262
+ hashtags_text = ' '.join(hashtags).lower()
263
+ if any(kw in hashtags_text for kw in self.karnataka_keywords):
264
+ return True
265
+
266
+ return False
267
+
268
+ async def _search_with_retry(self, query: str, limit: int, retries: int = 3, delay: int = 10) -> List:
269
+ """Search tweets with enhanced retry logic."""
270
+ for attempt in range(retries):
271
+ try:
272
+ logger.info(f"Searching with query: {query[:50]}... (Attempt {attempt + 1})")
273
+ tweets = await gather(self.api.search(query, limit=limit))
274
+ logger.info(f"Retrieved {len(tweets)} tweets")
275
+ return tweets
276
+ except Exception as e:
277
+ logger.error(f"Attempt {attempt + 1} failed for query {query[:30]}...: {e}")
278
+ if attempt < retries - 1:
279
+ await asyncio.sleep(delay * (attempt + 1)) # Exponential backoff
280
+
281
+ logger.error(f"All attempts failed for query: {query[:30]}...")
282
+ return []
283
+
284
    async def _collect_tweets_for_period(self, start_date: str, end_date: str, limit_per_query: int = 200) -> List[Dict]:
        """Collect tweets for a specific time period with enhanced search strategies for latest 3 months.

        start_date/end_date: date strings interpolated into Twitter search
        operators (since:/until:) — presumably YYYY-MM-DD; TODO confirm
        against the caller.
        Returns processed tweet dicts that passed the Karnataka filter,
        de-duplicated against self.seen_ids across all queries.
        """
        tweets_data = []

        # Enhanced search queries optimized for recent data (last 3 months).
        # Grouped by strategy: direct drug terms, criminal activity,
        # location-specific slang, coded language, and official/news sources.
        search_queries = [
            # Primary drug-related searches
            f"(ganja OR weed OR charas OR hash OR cannabis OR marijuana) (karnataka OR bengaluru OR bangalore OR mysore) lang:en since:{start_date} until:{end_date}",
            f"(cocaine OR heroin OR mdma OR ecstasy OR drugs OR narcotics) (karnataka OR bengaluru OR bangalore) lang:en since:{start_date} until:{end_date}",

            # Criminal activity searches
            f"(dealer OR supplier OR selling OR buying) (drugs OR maal OR stuff) (karnataka OR bengaluru) since:{start_date} until:{end_date}",
            f"(police OR arrest OR raid OR bust OR seized) (drugs OR narcotics OR ganja) karnataka since:{start_date} until:{end_date}",

            # Location-specific searches
            f"(drugs OR ganja OR weed) (shivaji nagar OR malleswaram OR btm OR koramangala) since:{start_date} until:{end_date}",
            f"(maal OR stuff OR hash) (indiranagar OR whitefield OR electronic city OR jayanagar) since:{start_date} until:{end_date}",

            # Coded language searches
            f"(420 OR party supplies OR green OR herb) (bengaluru OR bangalore OR karnataka) since:{start_date} until:{end_date}",
            f"(delivery OR pickup OR dm me OR contact) (stuff OR green OR maal) karnataka since:{start_date} until:{end_date}",

            # News and official searches
            f"(ncb OR narcotics control OR drug bust OR seizure) karnataka since:{start_date} until:{end_date}",
            f"(drug trafficking OR smuggling OR peddling) (karnataka OR bengaluru OR mysore) since:{start_date} until:{end_date}"
        ]

        for query in search_queries:
            logger.info(f"Executing query: {query[:100]}...")
            tweets = await self._search_with_retry(query, limit_per_query)

            for tweet in tweets:
                # Skip ids already seen in earlier (overlapping) queries.
                if tweet.id in self.seen_ids:
                    continue

                # Karnataka relevance filter
                if not self._is_karnataka_relevant(tweet):
                    continue

                self.seen_ids.add(tweet.id)
                tweet_data = self._process_tweet(tweet)
                if tweet_data:
                    tweets_data.append(tweet_data)

            # Rate limiting between queries
            await asyncio.sleep(8)

        logger.info(f"Collected {len(tweets_data)} relevant tweets for period {start_date} to {end_date}")
        return tweets_data
333
+
334
def _process_tweet(self, tweet) -> Optional[Dict]:
    """Normalize one raw tweet into the analysis record used downstream.

    Returns None for tweets with no usable text (blank or < 10 chars) or
    when processing fails for any reason; errors are logged rather than
    propagated so one malformed tweet cannot abort a collection run.
    """
    try:
        # Basic tweet information. Every optional user/tweet attribute is
        # read via getattr so schema differences between twscrape versions
        # cannot raise AttributeError (previously `tweet.user.location`
        # was accessed directly, inconsistent with the other attributes).
        user_location = getattr(tweet.user, 'location', '') or ""
        user_description = getattr(tweet.user, 'description', '') or ""
        tweet_text = tweet.rawContent or ""
        hashtags = getattr(tweet, 'hashtags', []) or []

        # Skip if no meaningful content
        if not tweet_text.strip() or len(tweet_text) < 10:
            return None

        # Karnataka relevance: profile location weighted highest, then
        # tweet text, then the user's bio.
        kar_score = (
            4 * self._compute_relevance_score(user_location, self.karnataka_keywords) +
            2 * self._compute_relevance_score(user_description, self.karnataka_keywords) +
            3 * self._compute_relevance_score(tweet_text, self.karnataka_keywords)
        )

        drug_score = self._compute_relevance_score(tweet_text, self.drug_keywords, 80)
        crime_score = self._compute_relevance_score(tweet_text, self.crime_indicators, 75)

        # Extract additional information (extracted hashtags are unused;
        # the record keeps the API-provided hashtag list instead).
        mentions, _ = self._extract_mentions_hashtags(tweet_text)
        phone_numbers = self._detect_phone_numbers(tweet_text)
        sentiment = self._analyze_sentiment(tweet_text)

        # Convert the timestamp to IST (UTC+5:30); naive datetimes are
        # assumed to be UTC before conversion — TODO confirm upstream.
        ist = timezone(timedelta(hours=5, minutes=30))
        utc_time = tweet.date if tweet.date.tzinfo else tweet.date.replace(tzinfo=timezone.utc)
        ist_time = utc_time.astimezone(ist)

        # Flat record consumed by the CSV export and risk assessment.
        tweet_data = {
            "tweet_id": str(tweet.id),
            "datetime": ist_time.strftime("%d-%m-%Y %H:%M:%S"),
            "username": tweet.user.username,
            "user_display_name": getattr(tweet.user, 'displayname', '') or '',
            "user_followers": getattr(tweet.user, 'followersCount', 0),
            "user_following": getattr(tweet.user, 'followingCount', 0),
            "user_verified": getattr(tweet.user, 'verified', False),
            "content": tweet_text,
            "user_location": user_location,
            "user_description": user_description,
            "hashtags": ' '.join(hashtags),
            "mentions": ', '.join(mentions),
            "phone_numbers": ', '.join(phone_numbers),
            "tweet_url": f"https://x.com/{tweet.user.username}/status/{tweet.id}",
            "retweet_count": getattr(tweet, 'retweetCount', 0),
            "like_count": getattr(tweet, 'likeCount', 0),
            "reply_count": getattr(tweet, 'replyCount', 0),
            "kar_score": kar_score,
            "drug_score": drug_score,
            "crime_score": crime_score,
            "sentiment": sentiment,
            "sentiment_compound": sentiment.get('compound', 0),
            "is_drug_related": drug_score > 0,
            "is_crime_related": crime_score > 0,
            "has_contact_info": len(phone_numbers) > 0,
            "risk_level": "",  # Will be calculated below
            # Non-cryptographic fingerprint used only for deduplication.
            "content_hash": hashlib.md5(tweet_text.encode()).hexdigest()
        }

        # Assess risk level
        tweet_data["risk_level"] = self._assess_risk_level(tweet_data)

        return tweet_data

    except Exception as e:
        logger.error(f"Error processing tweet {getattr(tweet, 'id', 'unknown')}: {e}")
        return None
406
+
407
async def initialize_api(self) -> bool:
    """Initialize the Twitter API connection.

    Returns True when at least one scraping account is configured in the
    twscrape pool; False (with setup instructions logged) otherwise.
    """
    try:
        self.api = API()
        pool_accounts = await self.api.pool.get_all()

        if not pool_accounts:
            for message in (
                "No Twitter accounts found. Please add accounts using: twscrape add_account",
                "Setup instructions:",
                "1. twscrape add_account username email password",
                "2. twscrape login_accounts",
            ):
                logger.error(message)
            return False

        logger.info(f"Initialized with {len(pool_accounts)} accounts: {[a.username for a in pool_accounts]}")
        return True

    except Exception as exc:
        logger.error(f"Failed to initialize API: {exc}")
        return False
426
+
427
+ def _ensure_output_directory(self, directory: str = "drug_analysis_data_3months"):
428
+ """Create output directory if it doesn't exist."""
429
+ if not os.path.exists(directory):
430
+ os.makedirs(directory)
431
+ logger.info(f"Created directory: {directory}")
432
+
433
def _save_data(self, data: List[Dict], base_filename: str):
    """Save data in multiple formats with detailed analysis.

    Writes the full dataset plus filtered views (high-risk, drug-related,
    contact-info, per-risk-level) as CSVs under the hard-coded
    ``drug_analysis_data_3months/`` directory, then triggers the summary
    report.

    NOTE(review): ``base_filename`` is currently unused — file names are
    derived from the data's date range and a timestamp instead.
    """
    if not data:
        logger.warning("No data to save")
        return

    df = pd.DataFrame(data)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Calculate the actual date range from data (the "datetime" strings
    # were written by _process_tweet in day-first IST format).
    if 'datetime' in df.columns:
        df['date_parsed'] = pd.to_datetime(df['datetime'], format="%d-%m-%Y %H:%M:%S", errors='coerce')
        # NOTE(review): if every value fails to parse, min()/max() are NaT
        # and strftime raises — assumes at least one valid timestamp.
        date_range = f"3months_{df['date_parsed'].min().strftime('%Y%m%d')}_to_{df['date_parsed'].max().strftime('%Y%m%d')}"
    else:
        date_range = "latest_3months"

    # Save main dataset (includes the helper 'date_parsed' column added above)
    main_file = f"drug_analysis_data_3months/karnataka_drug_tweets_{date_range}_{timestamp}.csv"
    df.to_csv(main_file, index=False, encoding='utf-8')
    logger.info(f"Saved main dataset: {main_file}")

    # Save high-risk tweets separately (HIGH PRIORITY FOR POLICE)
    high_risk_df = df[df['risk_level'].isin(['HIGH', 'CRITICAL'])]
    if not high_risk_df.empty:
        risk_file = f"drug_analysis_data_3months/HIGH_PRIORITY_tweets_{date_range}_{timestamp}.csv"
        high_risk_df.to_csv(risk_file, index=False, encoding='utf-8')
        logger.info(f"🚨 Saved HIGH PRIORITY tweets: {risk_file}")

    # Save drug-related tweets
    drug_df = df[df['is_drug_related'] == True]
    if not drug_df.empty:
        drug_file = f"drug_analysis_data_3months/drug_related_{date_range}_{timestamp}.csv"
        drug_df.to_csv(drug_file, index=False, encoding='utf-8')
        logger.info(f"Saved drug-related tweets: {drug_file}")

    # Save tweets with contact information (INVESTIGATION LEADS)
    contact_df = df[df['has_contact_info'] == True]
    if not contact_df.empty:
        contact_file = f"drug_analysis_data_3months/CONTACT_INFO_tweets_{date_range}_{timestamp}.csv"
        contact_df.to_csv(contact_file, index=False, encoding='utf-8')
        logger.info(f"🔍 Saved tweets with contact info: {contact_file}")

    # Save one file per risk level (CRITICAL/HIGH rows also appear in the
    # HIGH_PRIORITY file above — intentional duplication for triage).
    for risk_level in ['CRITICAL', 'HIGH', 'MEDIUM']:
        risk_df = df[df['risk_level'] == risk_level]
        if not risk_df.empty:
            risk_level_file = f"drug_analysis_data_3months/{risk_level}_RISK_{date_range}_{timestamp}.csv"
            risk_df.to_csv(risk_level_file, index=False, encoding='utf-8')
            logger.info(f"Saved {risk_level} risk tweets: {risk_level_file}")

    # Generate summary report
    self._generate_summary_report(df, timestamp, date_range)
485
+
486
def _generate_summary_report(self, df: pd.DataFrame, timestamp: str, date_range: str):
    """Generate a comprehensive summary report for police analysis.

    Builds a nested statistics dict from the collected DataFrame, writes
    it as JSON next to the CSVs, and prints an executive summary to the
    console. Assumes ``df`` is non-empty (callers only invoke this after
    data has been collected) — several percentages divide by the total
    tweet count.
    """
    report = {
        "analysis_metadata": {
            "analysis_timestamp": timestamp,
            "date_range": date_range,
            # NOTE(review): this counts unique *timestamps*, not days —
            # the key name overstates what is measured.
            "collection_period_days": len(df['datetime'].unique()) if 'datetime' in df.columns else 0,
            "scraper_version": "3_months_automatic",
            # Matches the hard-coded query list in _collect_tweets_for_period.
            "total_queries_executed": 10,
            "karnataka_police_authorization": "DRUG_CRIME_ANALYSIS_2024"
        },
        "summary_statistics": {
            "total_tweets": len(df),
            "drug_related_tweets": int(df['is_drug_related'].sum()),
            "crime_related_tweets": int(df['is_crime_related'].sum()),
            "tweets_with_contact_info": int(df['has_contact_info'].sum()),
            "unique_users": int(df['username'].nunique()),
            "verified_users": int(df['user_verified'].sum()),
            "average_followers": int(df['user_followers'].mean()) if len(df) > 0 else 0
        },
        "risk_analysis": {
            "risk_distribution": df['risk_level'].value_counts().to_dict(),
            "critical_alerts": int((df['risk_level'] == 'CRITICAL').sum()),
            "high_priority": int((df['risk_level'] == 'HIGH').sum()),
            "investigation_leads": int(df['has_contact_info'].sum())
        },
        "geographic_analysis": {
            "top_locations": df['user_location'].value_counts().head(15).to_dict(),
            # Counts how many known hotspot areas appear in at least one tweet.
            "high_risk_areas_mentioned": sum(1 for area in self.high_risk_areas
                                             if any(area in tweet.lower() for tweet in df['content']))
        },
        "content_analysis": {
            # VADER-style compound-score buckets: >0.1 positive, <-0.1 negative.
            "sentiment_distribution": {
                "positive": int((df['sentiment_compound'] > 0.1).sum()),
                "neutral": int((df['sentiment_compound'].between(-0.1, 0.1)).sum()),
                "negative": int((df['sentiment_compound'] < -0.1).sum())
            },
            "top_hashtags": df['hashtags'].str.split().explode().value_counts().head(25).to_dict(),
            "most_mentioned_users": df['mentions'].str.split(', ').explode().value_counts().head(20).to_dict(),
            "average_drug_score": float(df['drug_score'].mean()) if len(df) > 0 else 0,
            "average_crime_score": float(df['crime_score'].mean()) if len(df) > 0 else 0
        },
        "user_analysis": {
            "high_activity_users": df['username'].value_counts().head(15).to_dict(),
            "most_followed_users": df.nlargest(10, 'user_followers')[['username', 'user_followers']].to_dict('records'),
            "users_with_contact_info": df[df['has_contact_info']]['username'].tolist()
        },
        "temporal_analysis": {
            # NOTE(review): datetimes are "dd-mm-YYYY ..." strings, so the
            # [:10] prefix sorts lexicographically by day-of-month, not
            # chronologically.
            "tweets_by_date": df['datetime'].str[:10].value_counts().sort_index().to_dict() if 'datetime' in df.columns else {},
            "peak_activity_days": df['datetime'].str[:10].value_counts().head(10).to_dict() if 'datetime' in df.columns else {}
        },
        "investigation_priorities": {
            "immediate_action_required": len(df[df['risk_level'].isin(['CRITICAL', 'HIGH'])]),
            "contact_information_available": len(df[df['has_contact_info']]),
            # Keyword heuristics for quantity/trafficking indicators.
            "bulk_operation_indicators": len(df[df['content'].str.contains('kg|gram|bulk|wholesale', case=False, na=False)]),
            "cross_border_mentions": len(df[df['content'].str.contains('interstate|border|import', case=False, na=False)])
        }
    }

    # Save comprehensive report as JSON (default=str handles pandas types).
    report_file = f"drug_analysis_data_3months/POLICE_ANALYSIS_REPORT_{date_range}_{timestamp}.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False, default=str)

    logger.info(f"📊 Generated comprehensive police report: {report_file}")

    # Print executive summary to console
    print("\n" + "="*80)
    print("KARNATAKA POLICE DRUG CRIME ANALYSIS REPORT - LATEST 3 MONTHS")
    print("="*80)
    print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Period analyzed: Latest 3 months ({date_range})")
    print(f"Total tweets analyzed: {report['summary_statistics']['total_tweets']}")
    print(f"Drug-related tweets: {report['summary_statistics']['drug_related_tweets']} ({report['summary_statistics']['drug_related_tweets']/report['summary_statistics']['total_tweets']*100:.1f}%)")
    print(f"Crime-related tweets: {report['summary_statistics']['crime_related_tweets']} ({report['summary_statistics']['crime_related_tweets']/report['summary_statistics']['total_tweets']*100:.1f}%)")
    print(f"Tweets with contact info: {report['summary_statistics']['tweets_with_contact_info']} (🚨 INVESTIGATION LEADS)")
    print(f"Unique users: {report['summary_statistics']['unique_users']}")
    print(f"Verified users: {report['summary_statistics']['verified_users']}")

    print("\n🚨 RISK LEVEL DISTRIBUTION:")
    total_tweets = sum(report['risk_analysis']['risk_distribution'].values())
    for risk, count in report['risk_analysis']['risk_distribution'].items():
        percentage = (count/total_tweets*100) if total_tweets > 0 else 0
        emoji = "🔴" if risk == "CRITICAL" else "🟠" if risk == "HIGH" else "🟡" if risk == "MEDIUM" else "🟢"
        print(f"  {emoji} {risk}: {count} ({percentage:.1f}%)")

    print("\n📍 TOP LOCATIONS (First 10):")
    for i, (location, count) in enumerate(list(report['geographic_analysis']['top_locations'].items())[:10], 1):
        print(f"  {i:2d}. {location}: {count} tweets")

    print("\n💭 SENTIMENT ANALYSIS:")
    for sentiment, count in report['content_analysis']['sentiment_distribution'].items():
        percentage = (count/report['summary_statistics']['total_tweets']*100) if report['summary_statistics']['total_tweets'] > 0 else 0
        print(f"  {sentiment.capitalize()}: {count} ({percentage:.1f}%)")

    print("\n👥 HIGH ACTIVITY USERS (Top 5):")
    for i, (user, count) in enumerate(list(report['user_analysis']['high_activity_users'].items())[:5], 1):
        print(f"  {i}. @{user}: {count} tweets")

    print("\n🎯 IMMEDIATE INVESTIGATION PRIORITIES:")
    priorities = report['investigation_priorities']
    if priorities['immediate_action_required'] > 0:
        print(f"  🔴 URGENT: {priorities['immediate_action_required']} high/critical risk tweets requiring immediate review")
    if priorities['contact_information_available'] > 0:
        print(f"  🔍 LEADS: {priorities['contact_information_available']} tweets contain contact information")
    if priorities['bulk_operation_indicators'] > 0:
        print(f"  📦 BULK OPS: {priorities['bulk_operation_indicators']} tweets mention bulk operations")
    if priorities['cross_border_mentions'] > 0:
        print(f"  🌐 INTERSTATE: {priorities['cross_border_mentions']} tweets mention cross-border activity")

    print("\n📁 OUTPUT FILES GENERATED:")
    print(f"  • Main dataset: karnataka_drug_tweets_{date_range}_{timestamp}.csv")
    print(f"  • High priority: HIGH_PRIORITY_tweets_{date_range}_{timestamp}.csv")
    print(f"  • Contact info: CONTACT_INFO_tweets_{date_range}_{timestamp}.csv")
    print(f"  • Analysis report: POLICE_ANALYSIS_REPORT_{date_range}_{timestamp}.json")

    print("\n✅ NEXT STEPS:")
    print("  1. Review HIGH_PRIORITY_tweets file for immediate action")
    print("  2. Investigate users in CONTACT_INFO_tweets file")
    print("  3. Cross-reference with existing case files")
    print("  4. Monitor high-activity users for ongoing surveillance")
    print("="*80)
608
+
609
async def run_analysis(self, start_date: str, end_date: str, tweets_per_day: int = 200):
    """Run the complete drug crime analysis for latest 3 months.

    Orchestrates the full pipeline: API setup, chunked collection over
    7-day windows, content-hash deduplication, and saving/reporting.
    Returns the list of unique tweet records, or None when the API cannot
    be initialized or no tweets were collected.

    NOTE(review): ``tweets_per_day`` is actually passed through as the
    per-query result limit for each 7-day chunk, not a daily quota.
    """
    logger.info("🚔 Starting Karnataka Police Drug Crime Analysis - Latest 3 Months")
    logger.info(f"📅 Analysis period: {start_date} to {end_date}")

    # Calculate total days (dates must be ISO "YYYY-MM-DD")
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    total_days = (end_dt - start_dt).days

    logger.info(f"⏱️ Total analysis period: {total_days} days")
    logger.info(f"🎯 Expected tweets: ~{tweets_per_day * total_days}")

    # Initialize API — bail out early if no scraping accounts are configured
    if not await self.initialize_api():
        logger.error("❌ Failed to initialize Twitter API. Exiting.")
        return

    # Ensure output directory exists
    self._ensure_output_directory()

    # Generate date ranges for systematic collection (7-day chunks for better coverage)
    date_ranges = self._generate_date_ranges(start_date, end_date, days_per_chunk=7)

    logger.info(f"📊 Will process {len(date_ranges)} date ranges (7-day chunks)")

    all_tweets = []
    for i, (start, end) in enumerate(date_ranges, 1):
        logger.info(f"🔍 Analyzing period {i}/{len(date_ranges)}: {start} to {end}")

        tweets = await self._collect_tweets_for_period(start, end, tweets_per_day)
        all_tweets.extend(tweets)

        logger.info(f"✅ Collected {len(tweets)} tweets for this period. Total so far: {len(all_tweets)}")

        # Rate limiting between periods - longer delay for larger dataset
        if i < len(date_ranges):  # Don't sleep after the last iteration
            logger.info("⏳ Waiting 15 seconds before next period...")
            await asyncio.sleep(15)

    if not all_tweets:
        logger.error("❌ No relevant tweets collected for analysis")
        return

    logger.info("🔄 Removing duplicate tweets...")
    # Remove duplicates based on content hash (first occurrence wins; this
    # also catches identical text re-posted under different tweet IDs)
    unique_tweets = []
    seen_hashes = set()

    for tweet in all_tweets:
        content_hash = tweet.get('content_hash', '')
        if content_hash not in seen_hashes:
            unique_tweets.append(tweet)
            seen_hashes.add(content_hash)

    duplicates_removed = len(all_tweets) - len(unique_tweets)
    logger.info(f"🗑️ Removed {duplicates_removed} duplicate tweets")
    logger.info(f"✨ Final unique tweets: {len(unique_tweets)}")

    # Save collected data (CSV exports + JSON summary report)
    logger.info("💾 Saving collected data...")
    self._save_data(unique_tweets, "karnataka_drug_analysis_3months")

    logger.info("🎉 Analysis completed successfully!")
    logger.info(f"📈 Total unique tweets collected: {len(unique_tweets)}")
    logger.info(f"📁 Check the 'drug_analysis_data_3months' directory for results")

    return unique_tweets
677
+
678
+ def _generate_date_ranges(self, start_date: str, end_date: str, days_per_chunk: int = 7) -> List[Tuple[str, str]]:
679
+ """Generate date ranges for systematic data collection - optimized for 3 months."""
680
+ start = datetime.strptime(start_date, "%Y-%m-%d")
681
+ end = datetime.strptime(end_date, "%Y-%m-%d")
682
+
683
+ ranges = []
684
+ current = start
685
+
686
+ while current < end:
687
+ next_date = min(current + timedelta(days=days_per_chunk), end)
688
+ ranges.append((
689
+ current.strftime("%Y-%m-%d"),
690
+ next_date.strftime("%Y-%m-%d")
691
+ ))
692
+ current = next_date
693
+
694
+ return ranges
695
+
696
# Main execution function
async def main():
    """Main function to run the drug crime analysis for latest 3 months.

    Computes the last-90-days window automatically, shows an estimate,
    asks the operator for confirmation on stdin, then runs the scraper.
    All failures are reported to the console rather than raised.
    """
    scraper = DrugCrimeScraper3Months()

    # Automatic calculation for latest 3 months
    print("\n" + "="*80)
    print("🚔 KARNATAKA POLICE DRUG CRIME ANALYSIS SYSTEM")
    print("📊 LATEST 3 MONTHS DATA COLLECTION")
    print("="*80)

    # Calculate dates for last 3 months from current date
    end_date = datetime.now()
    start_date = end_date - timedelta(days=90)  # Last 3 months (approximately)

    START_DATE = start_date.strftime("%Y-%m-%d")
    END_DATE = end_date.strftime("%Y-%m-%d")
    TWEETS_PER_DAY = 200  # Increased for better coverage (per-query limit)

    print(f"📅 Automatic Date Range Calculation:")
    print(f"   Start Date: {START_DATE}")
    print(f"   End Date: {END_DATE}")
    print(f"   Total Days: {(end_date - start_date).days}")
    print(f"   Expected Tweets: ~{TWEETS_PER_DAY * (end_date - start_date).days}")
    # Rough estimate: ~2 minutes per 7-day chunk
    print(f"   Estimated Runtime: ~{((end_date - start_date).days // 7) * 2} minutes")
    print("="*80)

    # Confirm before starting (Ctrl-C during the prompt also cancels)
    try:
        response = input("\n🤔 Proceed with data collection? (y/n): ").lower().strip()
        if response != 'y' and response != 'yes':
            print("❌ Collection cancelled by user.")
            return
    except KeyboardInterrupt:
        print("\n❌ Collection cancelled by user.")
        return

    print("\n🚀 Starting data collection...")

    try:
        collected_data = await scraper.run_analysis(START_DATE, END_DATE, TWEETS_PER_DAY)

        if collected_data and len(collected_data) > 0:
            print(f"\n🎉 SUCCESS! Collected {len(collected_data)} tweets for Karnataka Police analysis.")
            print("📁 Check the 'drug_analysis_data_3months' directory for organized results.")
        else:
            print("\n⚠️ No data collected. Please check:")
            print("   1. Twitter account setup (twscrape add_account)")
            print("   2. Internet connection")
            print("   3. API rate limits")

    except KeyboardInterrupt:
        logger.info("\n⚠️ Analysis interrupted by user")
        print("\n⚠️ Collection stopped by user. Partial data may be saved.")
    except Exception as e:
        logger.error(f"❌ Analysis failed: {e}")
        print(f"\n❌ Error occurred: {e}")
        print("💡 Check the log file 'drug_crime_scraper_3months.log' for details.")
754
+
755
if __name__ == "__main__":
    # Windows compatibility: the selector event loop policy is needed
    # here — presumably because the default Proactor loop conflicts with
    # the libraries used; confirm against twscrape/aiohttp requirements.
    if sys.platform.startswith("win"):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    # Print setup instructions
    print("\n📋 SETUP REQUIREMENTS:")
    print("1. Install: pip install twscrape pandas fuzzywuzzy nltk python-levenshtein")
    print("2. Add Twitter account: twscrape add_account username email password")
    print("3. Login accounts: twscrape login_accounts")
    print("4. Run this script: python enhanced_drug_crime_scraper_3months.py")

    # Run the analysis (main() handles its own errors; this catches
    # startup failures and Ctrl-C before the loop is running)
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"\n❌ Failed to start: {e}")
        print("💡 Please check the setup requirements above.")
src/evaluate.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# evaluate.py
"""Data-quality checks and evaluation metrics for the scraper's CSV output.

Loads every CSV in the scraper output folder, prints general statistics
(missing values, duplicates, label distributions, time coverage, text
stats) and, when label columns are present, classification metrics.
"""
import pandas as pd
import os
from collections import Counter
import re
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

SCRAPER_FOLDER = "drug_analysis_data_3months"  # Folder where scraper saves CSVs

# -----------------------------
# Load CSVs
# -----------------------------
csv_files = [f for f in os.listdir(SCRAPER_FOLDER) if f.endswith(".csv")]
if not csv_files:
    print("❌ No CSV files found in scraper folder!")
    # raise SystemExit(1) instead of exit(): works outside the interactive
    # interpreter and reports a non-zero (failure) exit code.
    raise SystemExit(1)

dfs = [pd.read_csv(os.path.join(SCRAPER_FOLDER, f)) for f in csv_files]
df = pd.concat(dfs, ignore_index=True)
print(f"✅ Loaded {len(df)} rows from {len(csv_files)} CSV files.\n")

# -----------------------------
# General Stats
# -----------------------------
print("=== General Stats ===")
print("Columns:", df.columns.tolist())
print("Total rows:", len(df))
print("Missing values per column:\n", df.isna().sum())
print("\nDuplicate rows:", df.duplicated().sum())

# Sample rows with missing data
missing_rows = df[df.isna().any(axis=1)]
if not missing_rows.empty:
    print("\nSample rows with missing values:\n", missing_rows.head())

# Sample duplicate rows
duplicates = df[df.duplicated(keep=False)]
if not duplicates.empty:
    print("\nSample duplicate rows:\n", duplicates.head())

# -----------------------------
# Drug/Crime-related stats
# -----------------------------
for col in ["is_drug_related", "is_crime_related", "risk_level"]:
    if col in df.columns:
        print(f"\n=== {col} Distribution ===")
        print(df[col].value_counts())
        print("Proportion:\n", round(df[col].value_counts(normalize=True), 4))

# Risk level numeric analysis — the scraper emits categorical strings
# ("CRITICAL"/"HIGH"/...), so this branch is a safeguard for datasets
# that store risk as a numeric score instead.
if "risk_level" in df.columns and pd.api.types.is_numeric_dtype(df["risk_level"]):
    print("\n=== Risk Level Stats ===")
    print("Average risk:", round(df["risk_level"].mean(), 2))
    print("Max risk:", df["risk_level"].max())
    high_risk_count = (df["risk_level"] >= 0.7).sum()  # Threshold
    print("Number of high-risk items (risk >= 0.7):", high_risk_count)

# -----------------------------
# Time coverage
# -----------------------------
if "datetime" in df.columns:
    # The scraper writes timestamps day-first ("%d-%m-%Y %H:%M:%S").
    # Parsing without a format lets pandas guess month-first for ambiguous
    # dates, silently swapping day and month — so pin the format and only
    # fall back to day-first inference if nothing matches it.
    parsed = pd.to_datetime(df["datetime"], format="%d-%m-%Y %H:%M:%S", errors="coerce")
    if parsed.isna().all():
        parsed = pd.to_datetime(df["datetime"], dayfirst=True, errors="coerce")
    df["datetime"] = parsed
    print("\n=== Date Range ===")
    print("Earliest:", df["datetime"].min())
    print("Latest:", df["datetime"].max())

    # Daily counts
    df["date"] = df["datetime"].dt.date
    daily_counts = df.groupby("date").size()
    print("\n=== Daily Counts of Posts ===")
    print(daily_counts)

# -----------------------------
# Text Analysis
# -----------------------------
# The scraper stores tweet text in a column named "content"; accept either
# name so the text analysis is not silently skipped.
text_col = "text" if "text" in df.columns else ("content" if "content" in df.columns else None)
if text_col is not None:
    df["text"] = df[text_col].astype(str)
    df["text_length"] = df["text"].apply(len)
    print("\n=== Text Length Stats ===")
    print("Average length:", round(df["text_length"].mean(), 2))
    print("Min length:", df["text_length"].min())
    print("Max length:", df["text_length"].max())

    # Top 10 most common words
    words = Counter()
    for t in df["text"]:
        words.update(re.findall(r"\w+", t.lower()))
    print("\nTop 10 common words:", words.most_common(10))

# -----------------------------
# User / Source Analysis
# -----------------------------
if "username" in df.columns:
    print("\n=== User Analysis ===")
    print("Total unique users:", df["username"].nunique())
    top_users = df["username"].value_counts().head(10)
    print("Top 10 users by post count:\n", top_users)

# -----------------------------
# Scraper Evaluation Metrics
# -----------------------------
print("\n=== Scraper Evaluation Metrics ===")

# 1. Completeness (% of filled cells)
completeness = 1 - df.isna().mean().mean()
print(f"Completeness (all columns filled): {round(completeness*100, 2)}%")

# 2. Duplicate rate (% of duplicate rows)
duplicate_rate = df.duplicated().mean()
print(f"Duplicate rows rate: {round(duplicate_rate*100, 2)}%")

# 3. Drug/Crime relevance (if available) — len(df) > 0 is guaranteed
# because the script exits earlier when no CSVs are found.
for col in ["is_drug_related", "is_crime_related"]:
    if col in df.columns:
        relevance = df[col].sum() / len(df)
        print(f"{col} relevance rate: {round(relevance*100,2)}%")

# 4. Time coverage (active days vs total days); skipped when every
# timestamp failed to parse (all-NaT would break the subtraction).
if "datetime" in df.columns and df["datetime"].notna().any():
    total_days = (df["datetime"].max() - df["datetime"].min()).days + 1
    active_days = df["date"].nunique()
    coverage_ratio = active_days / total_days
    print(f"Time coverage ratio (active days / total days): {round(coverage_ratio*100,2)}%")

# 5. Average text length (proxy for content richness)
if "text_length" in df.columns:
    print(f"Average text length: {round(df['text_length'].mean(),2)} characters")

# 6. Classification Metrics (using scraper labels as pseudo-ground truth)
# If multiple columns available (e.g., is_drug_related vs is_crime_related), compute metrics
if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
    y_true = df["is_crime_related"]
    y_pred = df["is_drug_related"]
    print("\n=== Classification Metrics (is_drug_related vs is_crime_related) ===")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
    print("Precision:", round(precision_score(y_true, y_pred), 4))
    print("Recall:", round(recall_score(y_true, y_pred), 4))
    print("F1-score:", round(f1_score(y_true, y_pred), 4))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
else:
    print("\n⚠️ Skipping classification metrics: Not enough columns for evaluation.")

print("\n✅ Data evaluation + metrics complete!")
src/evaluation.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation.py
2
+ from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, brier_score_loss
3
+ from alerts import compute_dynamic_risk
4
+
5
def evaluate_model(test_tweets):
    """Print classification metrics for CRITICAL-vs-rest predictions.

    test_tweets: list of dicts with fields
      - true_risk_level: "CRITICAL"/"HIGH"/...
      - dynamic_risk_score: 0-100

    The dynamic risk score is mapped to a probability via
    compute_dynamic_risk; probabilities >= 0.75 are predicted CRITICAL.
    """
    # 1 = CRITICAL, 0 = everything else
    y_true = [int(t['true_risk_level'] == "CRITICAL") for t in test_tweets]
    # Predicted probabilities from the dynamic risk model
    y_prob = [compute_dynamic_risk(t["dynamic_risk_score"]) for t in test_tweets]
    # Hard labels at the CRITICAL threshold
    y_pred = [int(p >= 0.75) for p in y_prob]

    print("=== Classification Report ===")
    print(classification_report(y_true, y_pred, target_names=["Non-Critical","Critical"]))

    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_true, y_pred))

    print("ROC-AUC:", roc_auc_score(y_true, y_prob))
    print("Brier Score:", brier_score_loss(y_true, y_prob))