Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- src/alerts.py +341 -0
- src/db.py +89 -0
- src/enhanced_drug_crime_scraper_3months.py +774 -0
- src/evaluate.py +143 -0
- src/evaluation.py +29 -0
src/alerts.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# alerts.py
# Email alerting for high-risk tweets: fetches unnotified HIGH/CRITICAL tweets
# from MongoDB (via db.py), re-scores them, and emails a digest over SMTP.
import os
import time
import logging
from typing import List, Optional
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import smtplib
from dotenv import load_dotenv

# local db wrapper (your db.py must expose these functions)
from db import fetch_high_risk_unnotified, mark_as_notified

load_dotenv()

# --- Configuration (via env) ---
ALERT_EMAIL = os.getenv("ALERT_EMAIL")  # sender email (also login user)
ALERT_PASSWORD = os.getenv("ALERT_PASSWORD")  # SMTP password or app-specific password
ALERT_RECIPIENTS = os.getenv("ALERT_RECIPIENTS")  # comma-separated recipients e.g. "a@x.com,b@x.com"
SMTP_SERVER = os.getenv("ALERT_SMTP", "smtp.gmail.com")
SMTP_PORT = int(os.getenv("ALERT_SMTP_PORT", "587"))
SEND_RETRY = int(os.getenv("ALERT_SEND_RETRY", "2"))
RETRY_DELAY = float(os.getenv("ALERT_RETRY_DELAY", "2.0"))  # seconds between retries
BATCH_DELAY = float(os.getenv("ALERT_BATCH_DELAY", "0.5"))  # short wait between operations

# Basic validation — fail fast at import time if SMTP credentials are missing.
if not ALERT_EMAIL or not ALERT_PASSWORD:
    raise RuntimeError("ALERT_EMAIL and ALERT_PASSWORD must be set in the environment.")

# Build recipients list
def _parse_recipients(env_val: Optional[str]) -> List[str]:
    """Split a comma-separated recipient string; default to the sender itself."""
    if not env_val:
        return [ALERT_EMAIL]
    return [r.strip() for r in env_val.split(",") if r.strip()]

RECIPIENTS = _parse_recipients(ALERT_RECIPIENTS)

# logger (root logger; all module functions use logging.* directly)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
| 40 |
+
|
| 41 |
+
def compute_dynamic_risk(tweet: dict) -> float:
    """Return a dynamic risk score in [0, 100] for a tweet dict.

    Combines four normalized signals:
      * content    (drug_score / crime_score fields)       weight 0.4
      * user       (followers, verified flag)              weight 0.2
      * engagement (likes + retweets, capped at 50)        weight 0.2
      * geo        (Karnataka keywords in user_location)   weight 0.2

    Fix: missing OR None/empty fields count as zero. Documents coming from
    Mongo/CSV imports often carry explicit None values, which previously made
    int()/float() raise TypeError; the sibling _tweet_score_for_sort already
    guarded with `or 0`, so this makes the two consistent.
    """
    # 1) Content-based weight (`or 0` guards None values from the DB)
    drug_score = float(tweet.get("drug_score", 0) or 0)
    crime_score = float(tweet.get("crime_score", 0) or 0)
    content_score = 0.5 * drug_score + 0.5 * crime_score  # can adjust weights

    # 2) User influence weight: followers saturate at 1000, verified adds half
    followers = int(tweet.get("followers_count", 0) or 0)
    verified = 1 if tweet.get("verified", False) else 0
    user_score = min(followers / 1000, 1) * 0.5 + verified * 0.5  # normalize

    # 3) Engagement weight, saturating at 50 combined likes+retweets
    engagement = int(tweet.get("like_count", 0) or 0) + int(tweet.get("retweet_count", 0) or 0)
    engagement_score = min(engagement / 50, 1)  # normalize to 0-1

    # 4) Geo relevance: binary Karnataka match on the free-text location
    location = str(tweet.get("user_location", "") or "").lower()
    karnataka_keywords = ["bangalore", "bengaluru", "karnataka"]
    geo_score = 1 if any(k in location for k in karnataka_keywords) else 0

    # Combine weights (adjust relative contributions)
    risk_score = (
        0.4 * content_score +
        0.2 * user_score +
        0.2 * engagement_score +
        0.2 * geo_score
    )

    # Scale to 0–100
    return round(risk_score * 100, 2)
|
| 74 |
+
|
| 75 |
+
def assign_dynamic_risk_level(tweet: dict) -> str:
    """Map a tweet's dynamic risk score (0-100) onto a categorical level."""
    score = compute_dynamic_risk(tweet)
    # Thresholds checked highest-first; first match wins.
    for threshold, level in ((75, "CRITICAL"), (50, "HIGH"), (25, "MEDIUM")):
        if score >= threshold:
            return level
    return "LOW"
|
| 85 |
+
|
| 86 |
+
def _tweet_score_for_sort(tweet: dict) -> float:
    """Sort key: dynamic risk (0-100) plus a small engagement tie-breaker."""
    likes = int(tweet.get("like_count", 0) or 0)
    retweets = int(tweet.get("retweet_count", 0) or 0)
    # Engagement is weighted lightly so it only breaks ties between
    # similarly-risky tweets rather than dominating the ordering.
    return compute_dynamic_risk(tweet) + (likes + retweets) * 0.1
|
| 91 |
+
|
| 92 |
+
def _select_top_tweets(tweets: List[dict], max_tweets: Optional[int], send_all: bool) -> List[dict]:
    """Rank tweets by priority and return the slice that should be emailed."""
    if not tweets:
        return []

    # Uppercase any string risk_level so later comparisons never miss on case.
    for tweet in tweets:
        level = tweet.get("risk_level")
        if isinstance(level, str):
            tweet["risk_level"] = level.upper()

    # Highest combined risk/engagement score first.
    ranked = sorted(tweets, key=_tweet_score_for_sort, reverse=True)

    if send_all or max_tweets is None:
        return ranked
    return ranked[:max_tweets]
|
| 108 |
+
|
| 109 |
+
def _format_tweet_html_block(tweet: dict) -> str:
|
| 110 |
+
"""Return an HTML block describing a tweet (for batched email)."""
|
| 111 |
+
tweet_id = tweet.get("tweet_id", "N/A")
|
| 112 |
+
user = tweet.get("username") or tweet.get("user") or tweet.get("user_name") or "N/A"
|
| 113 |
+
content = tweet.get("content") or tweet.get("text") or ""
|
| 114 |
+
timestamp = tweet.get("datetime") or tweet.get("timestamp") or "N/A"
|
| 115 |
+
location = tweet.get("user_location") or tweet.get("location") or "N/A"
|
| 116 |
+
risk = tweet.get("risk_level", "N/A")
|
| 117 |
+
likes = tweet.get("like_count", 0)
|
| 118 |
+
rts = tweet.get("retweet_count", 0)
|
| 119 |
+
url = tweet.get("tweet_url") or f"https://x.com/{user}/status/{tweet_id}" if tweet_id != "N/A" else "N/A"
|
| 120 |
+
|
| 121 |
+
# Bulk detection
|
| 122 |
+
bulk_keywords = ["kg", "gram", "bulk", "kilos", "ounce", "pound"]
|
| 123 |
+
bulk_indicator = "Yes" if any(k in content.lower() for k in bulk_keywords) else "No"
|
| 124 |
+
|
| 125 |
+
# Contact detection (simple digit check)
|
| 126 |
+
contact_indicator = "Yes" if any(c.isdigit() for c in content) else "No"
|
| 127 |
+
|
| 128 |
+
html = f"""
|
| 129 |
+
<div style="border:1px solid #ddd;padding:10px;margin-bottom:8px;border-radius:6px;">
|
| 130 |
+
<p><strong>Risk:</strong> <span style="color:#b22222">{risk}</span>
|
| 131 |
+
<strong>User:</strong> @{user} <strong>Time:</strong> {timestamp}</p>
|
| 132 |
+
<p><strong>Location:</strong> {location} <td>{bulk_indicator}</td><td>{contact_indicator}</td><td>{content}</td><strong>Likes:</strong> {likes} <strong>RTs:</strong> {rts}</p>
|
| 133 |
+
<p style="background:#f7f7f7;padding:8px;border-radius:4px;">{content}</p>
|
| 134 |
+
<p><a href="{url}">View Tweet</a> | Tweet ID: {tweet_id}</p>
|
| 135 |
+
</div>
|
| 136 |
+
"""
|
| 137 |
+
return html
|
| 138 |
+
|
| 139 |
+
def _compose_batched_email(tweets: List[dict]) -> MIMEMultipart:
    """Build one multipart (plain-text + HTML) digest email for `tweets`.

    Assumes each tweet dict already carries `dynamic_risk_score` and
    `risk_level` (trigger_alerts sets both before composing).
    """
    msg = MIMEMultipart("alternative")
    msg["Subject"] = f"🚨 {len(tweets)} High-Priority Drug Alerts"
    msg["From"] = ALERT_EMAIL
    msg["To"] = ", ".join(RECIPIENTS)

    # --- Top CRITICAL summary ---
    # At most 10 CRITICAL tweets, highest dynamic score first.
    critical_tweets = [t for t in tweets if t.get("risk_level") == "CRITICAL"]
    top_critical = sorted(critical_tweets, key=lambda t: t.get("dynamic_risk_score", 0), reverse=True)[:10]

    summary_html = ""
    if top_critical:
        summary_html += """
        <h3 style="color:#b22222;">Top CRITICAL Tweets Summary</h3>
        <table border="1" cellpadding="5" cellspacing="0" style="border-collapse: collapse;">
        <tr>
        <th>User</th><th>Dynamic Risk</th><th>Followers</th><th>Verified</th><th>Engagement</th><th>Location</th><th>Link</th>
        </tr>
        """
        for t in top_critical:
            user = t.get("username") or t.get("user") or "N/A"
            risk_score = t.get("dynamic_risk_score", 0)
            followers = t.get("followers_count", 0)
            verified = "Yes" if t.get("verified", False) else "No"
            engagement = int(t.get("like_count", 0)) + int(t.get("retweet_count", 0))
            location = t.get("user_location", "N/A")
            tweet_id = t.get("tweet_id", "N/A")
            # NOTE(review): precedence pitfall — this parses as
            # (tweet_url or f"...") if tweet_id != "N/A" else "N/A",
            # so a stored tweet_url is discarded whenever tweet_id is missing.
            url = t.get("tweet_url") or f"https://x.com/{user}/status/{tweet_id}" if tweet_id != "N/A" else "N/A"

            summary_html += f"""
            <tr>
            <td>@{user}</td>
            <td>{risk_score}</td>
            <td>{followers}</td>
            <td>{verified}</td>
            <td>{engagement}</td>
            <td>{location}</td>
            <td><a href="{url}">View</a></td>
            </tr>
            """
        summary_html += "</table><br>"

    # --- Main email table with all metrics ---
    html_blocks = ["""
    <table border="1" cellpadding="5" cellspacing="0" style="border-collapse: collapse;">
    <tr>
    <th>Risk</th><th>User</th><th>Dynamic Risk</th><th>Followers</th>
    <th>Verified</th><th>Engagement</th><th>Geo Score</th><th>Location</th>
    <th>Bulk</th><th>Contact</th><th>Content</th><th>Link</th>
    </tr>
    """]

    for t in tweets:
        tweet_id = t.get("tweet_id", "N/A")
        user = t.get("username") or t.get("user") or t.get("user_name") or "N/A"
        content = t.get("content") or t.get("text") or ""
        timestamp = t.get("datetime") or t.get("timestamp") or "N/A"  # computed but not shown in the table
        location = str(t.get("user_location") or t.get("location") or "N/A").lower()
        risk = t.get("risk_level", "N/A")
        dyn_risk = t.get("dynamic_risk_score", 0)
        followers = t.get("followers_count", 0)
        verified = "Yes" if t.get("verified", False) else "No"
        engagement = int(t.get("like_count", 0)) + int(t.get("retweet_count", 0))
        geo_score = 1 if any(k in location for k in ["bangalore", "bengaluru", "karnataka"]) else 0
        # NOTE(review): same precedence pitfall as in the summary loop above.
        url = t.get("tweet_url") or f"https://x.com/{user}/status/{tweet_id}" if tweet_id != "N/A" else "N/A"

        # Bulk and contact indicators (crude keyword / digit heuristics)
        bulk_keywords = ["kg", "gram", "bulk", "kilos", "ounce", "pound"]
        bulk_indicator = "Yes" if any(k in content.lower() for k in bulk_keywords) else "No"
        contact_indicator = "Yes" if any(c.isdigit() for c in content) else "No"

        # NOTE(review): tweet-derived fields are interpolated into HTML without
        # escaping — tweet content is untrusted; consider html.escape().
        html_blocks.append(f"""
        <tr>
        <td>{risk}</td>
        <td>@{user}</td>
        <td>{dyn_risk}</td>
        <td>{followers}</td>
        <td>{verified}</td>
        <td>{engagement}</td>
        <td>{geo_score}</td>
        <td>{location}</td>
        <td>{bulk_indicator}</td>
        <td>{contact_indicator}</td>
        <td>{content}</td>
        <td><a href="{url}">View</a></td>
        </tr>
        """)

    html_blocks.append("</table>")

    html_text = f"""
    <html>
    <body>
    <h2 style="color:#b22222;">High-Priority Drug Alerts</h2>
    {summary_html}
    {''.join(html_blocks)}
    <hr/>
    <p>Generated by Karnataka Drug Crime Monitoring System</p>
    </body>
    </html>
    """

    # Plain-text fallback for clients that do not render HTML
    plain_text = "\n".join([
        f"{t.get('risk_level')} | @{t.get('username')} | {t.get('dynamic_risk_score')} | {t.get('content','')[:100]}"
        for t in tweets
    ])

    msg.attach(MIMEText(plain_text, "plain"))
    msg.attach(MIMEText(html_text, "html"))
    return msg
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# --- SMTP send with retries --- #
|
| 253 |
+
def _send_email_message(msg: MIMEMultipart, recipients: List[str], retry: int = SEND_RETRY) -> bool:
    """Send message via SMTP with retries; return True on success.

    Makes one initial attempt plus up to `retry` retries, sleeping
    RETRY_DELAY seconds between failures.
    """
    for attempt_no in range(1, retry + 2):
        try:
            with smtplib.SMTP(SMTP_SERVER, SMTP_PORT, timeout=20) as conn:
                conn.ehlo()
                # Port 587 means STARTTLS: upgrade, then re-identify.
                if SMTP_PORT == 587:
                    conn.starttls()
                    conn.ehlo()
                conn.login(ALERT_EMAIL, ALERT_PASSWORD)
                conn.sendmail(ALERT_EMAIL, recipients, msg.as_string())
                logging.info(f"Email sent to {recipients}")
                return True
        except Exception as exc:
            logging.warning(f"Email send attempt {attempt_no} failed: {exc}")
            if attempt_no > retry:
                logging.error("Exceeded email send retries.")
                return False
            time.sleep(RETRY_DELAY)
|
| 274 |
+
|
| 275 |
+
# --- Public trigger function --- #
|
| 276 |
+
def trigger_alerts(max_tweets: Optional[int] = 10,
                   send_all: bool = False,
                   separate_emails: bool = False):
    """Fetch unnotified high-risk tweets, email them, and mark successes.

    Args:
        max_tweets: cap on tweets per run (ignored when send_all=True).
        send_all: email every fetched tweet instead of the top slice.
        separate_emails: one email per tweet instead of a single digest.
    """
    logging.info("Fetching high-risk unnotified tweets from DB...")
    tweets = fetch_high_risk_unnotified()
    if not tweets:
        logging.info("No unnotified high-risk tweets found.")
        return

    # --- Compute dynamic risk for all fetched tweets ---
    # NOTE(review): this overwrites the stored risk_level, so a tweet fetched
    # as HIGH can be re-scored to MEDIUM/LOW here yet still be selected below.
    for t in tweets:
        t["dynamic_risk_score"] = compute_dynamic_risk(t)
        t["risk_level"] = assign_dynamic_risk_level(t)  # automatically set risk_level

    selected = _select_top_tweets(tweets, max_tweets, send_all)
    if not selected:
        logging.info("No tweets selected after filtering.")
        return

    # Compose and send emails (batch or separate)
    success_ids, failure_ids = [], []

    if separate_emails:
        for t in selected:
            msg = _compose_batched_email([t])
            ok = _send_email_message(msg, RECIPIENTS)
            if ok:
                success_ids.append((t.get("tweet_id"), t.get("_collection_name")))
            else:
                failure_ids.append(t.get("tweet_id"))
            time.sleep(BATCH_DELAY)  # be gentle with the SMTP server between sends
    else:
        msg = _compose_batched_email(selected)
        ok = _send_email_message(msg, RECIPIENTS)
        if ok:
            success_ids.extend([(t.get("tweet_id"), t.get("_collection_name")) for t in selected])
        else:
            failure_ids.extend([t.get("tweet_id") for t in selected])

    # Mark notified — only tweets that were actually emailed; failures stay
    # unnotified so the next run retries them.
    for tid, maybe_collection in success_ids:
        try:
            mark_as_notified(tid)  # maybe_collection is carried but currently unused
        except Exception as e:
            logging.error(f"Failed to mark {tid} as notified: {e}")

    logging.info(f"Alerts sent: {len(success_ids)}; failures: {len(failure_ids)}")
    if failure_ids:
        logging.warning(f"Failed tweet IDs: {failure_ids}")
|
| 325 |
+
|
| 326 |
+
def compute_risk_probability(dynamic_score: float) -> float:
    """Clamp a 0–100 dynamic risk score into a probability in [0, 1]."""
    probability = dynamic_score / 100
    if probability < 0.0:
        return 0.0
    if probability > 1.0:
        return 1.0
    return probability
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# --- CLI usage example --- #
if __name__ == "__main__":
    # Example usages:
    # - send up to 5 top tweets (batched) -> trigger_alerts(max_tweets=5)
    # - send all unnotified high-risk tweets -> trigger_alerts(send_all=True)
    # - send one email per tweet -> trigger_alerts(max_tweets=10, separate_emails=True)

    # Default example: top 10 (batched)
    trigger_alerts(max_tweets=10)
|
src/db.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# db.py
# MongoDB persistence layer for the drug-crime monitoring pipeline:
# bulk-imports scraper output files and exposes alert-queue helpers.
from pymongo import MongoClient, UpdateOne
import pandas as pd
from dotenv import load_dotenv
load_dotenv()  # called before os.getenv(MONGO_URI) below reads the env
import os
import json
import sys

# MongoDB configuration — fail fast at import time without a connection string.
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    print("❌ MONGO_URI not set in environment variables.")
    sys.exit(1)

DB_NAME = "drug_monitoring_twitter"
COLLECTION_NAME = "tweets"
FOLDER_PATH = "drug_analysis_data_3months"  # folder with scraper outputs
|
| 19 |
+
|
| 20 |
+
def get_db_collection():
    """Connect to MongoDB (once) and return the tweets collection.

    Fix: the original constructed a brand-new MongoClient on every call and
    never closed it, leaking a connection pool per call. The client is now
    created lazily on first use and cached on the function object, so all
    callers share one pooled connection.
    """
    client = getattr(get_db_collection, "_client", None)
    if client is None:
        client = MongoClient(MONGO_URI)
        get_db_collection._client = client

    db = client[DB_NAME]

    # Create the collection explicitly so its name is visible even before
    # the first insert.
    if COLLECTION_NAME not in db.list_collection_names():
        db.create_collection(COLLECTION_NAME)
        print(f"✅ Created collection: {COLLECTION_NAME}")

    return db[COLLECTION_NAME]
|
| 30 |
+
|
| 31 |
+
def insert_all_from_folder(folder_path=FOLDER_PATH):
    """Insert all CSV/JSON files from scraper folder into MongoDB.

    CSV rows and JSON-list entries are upserted per tweet_id; a non-list JSON
    file is upserted as one report document keyed by its filename. Errors are
    reported per file and do not stop the import.
    """
    collection = get_db_collection()

    if not os.path.exists(folder_path):
        print(f"❌ Folder path does not exist: {folder_path}")
        return

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        operations = []  # bulk upserts accumulated for this one file

        try:
            if file_name.endswith(".csv"):
                df = pd.read_csv(file_path, encoding="utf-8")
                for _, row in df.iterrows():
                    doc = row.to_dict()
                    # NOTE(review): re-importing resets notified=False, so
                    # already-alerted tweets can be re-alerted on re-import.
                    doc["notified"] = False
                    if "tweet_id" in doc:
                        operations.append(
                            UpdateOne({"tweet_id": doc["tweet_id"]}, {"$set": doc}, upsert=True)
                        )

            elif file_name.endswith(".json"):
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    if isinstance(data, list):
                        for tweet in data:
                            tweet["notified"] = False
                            if "tweet_id" in tweet:
                                operations.append(
                                    UpdateOne({"tweet_id": tweet["tweet_id"]}, {"$set": tweet}, upsert=True)
                                )
                    else:
                        # single JSON report
                        operations.append(
                            UpdateOne({"report_name": file_name}, {"$set": data}, upsert=True)
                        )

            if operations:
                result = collection.bulk_write(operations)
                print(f"✅ {file_name} -> inserted/updated {result.upserted_count + result.modified_count} documents.")

        except Exception as e:
            # Per-file failure (bad CSV, non-file entry, malformed JSON, ...)
            # is reported and the loop continues with the next file.
            print(f"❌ Failed to process {file_name}: {e}")
|
| 76 |
+
|
| 77 |
+
def fetch_high_risk_unnotified():
    """Return every HIGH/CRITICAL tweet that has not been alerted on yet."""
    query = {
        "risk_level": {"$in": ["HIGH", "CRITICAL"]},
        "notified": False,
    }
    return list(get_db_collection().find(query))
|
| 81 |
+
|
| 82 |
+
def mark_as_notified(tweet_id):
    """Flag a tweet as notified so future alert runs skip it."""
    get_db_collection().update_one(
        {"tweet_id": tweet_id},
        {"$set": {"notified": True}},
    )
|
| 86 |
+
|
| 87 |
+
if __name__ == "__main__":
    # One-shot import: load every scraper output file into MongoDB.
    insert_all_from_folder()
    print("✅ All scraper folder contents inserted/updated successfully.")
|
src/enhanced_drug_crime_scraper_3months.py
ADDED
|
@@ -0,0 +1,774 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# enhanced_drug_crime_scraper_3months.py
# Enhanced Twitter Scraper for Karnataka Police Drug Crime Analysis - Latest 3 Months
# Automatically collects data for the most recent 3 months

import asyncio
import pandas as pd
from twscrape import API, gather
from fuzzywuzzy import fuzz
from datetime import timezone, timedelta, datetime
import logging
import sys
import os
import re
from collections import Counter
import json
import hashlib
from typing import List, Dict, Set, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Sentiment analysis (optional): degrade gracefully when NLTK is absent.
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    SENTIMENT_AVAILABLE = True
# Fix: narrowed from a bare `except:` so SystemExit/KeyboardInterrupt raised
# during the NLTK download are no longer swallowed.
except Exception:
    SENTIMENT_AVAILABLE = False
    print("Warning: NLTK not available. Sentiment analysis will be disabled.")

# Setup comprehensive logging: log file plus stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('drug_crime_scraper_3months.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
class DrugCrimeScraper3Months:
|
| 45 |
+
def __init__(self):
|
| 46 |
+
self.api = None
|
| 47 |
+
self.sentiment_analyzer = None
|
| 48 |
+
self.seen_ids: Set[str] = set()
|
| 49 |
+
self.collected_tweets: List[Dict] = []
|
| 50 |
+
|
| 51 |
+
# Initialize sentiment analyzer if available
|
| 52 |
+
if SENTIMENT_AVAILABLE:
|
| 53 |
+
try:
|
| 54 |
+
self.sentiment_analyzer = SentimentIntensityAnalyzer()
|
| 55 |
+
except:
|
| 56 |
+
logger.warning("Sentiment analyzer initialization failed")
|
| 57 |
+
|
| 58 |
+
# Enhanced drug-related keywords with criminal context
|
| 59 |
+
self.drug_keywords = [
|
| 60 |
+
# Cannabis variants
|
| 61 |
+
"weed", "ganja", "hash", "hashish", "charas", "cannabis", "marijuana", "dope", "pot", "grass",
|
| 62 |
+
"bhang", "nasha", "nashe", "maal", "stuff", "green", "herb", "mary jane", "bud", "kush",
|
| 63 |
+
|
| 64 |
+
# Hard drugs
|
| 65 |
+
"acid", "lsd", "mdma", "ecstasy", "molly", "coke", "cocaine", "crack", "meth", "crystal",
|
| 66 |
+
"heroin", "brown sugar", "smack", "ketamine", "special k", "oxy", "oxycodone", "percocet",
|
| 67 |
+
"adderall", "xanax", "benzo", "valium", "diazepam", "fentanyl", "tramadol", "codeine",
|
| 68 |
+
|
| 69 |
+
# Drug trade terms
|
| 70 |
+
"dealer", "peddler", "supplier", "pusher", "connect", "plug", "score", "scored",
|
| 71 |
+
"selling", "buying", "supply", "delivery", "pickup", "drop", "stash", "cache",
|
| 72 |
+
"package", "parcel", "consignment", "shipment", "bulk", "wholesale", "retail",
|
| 73 |
+
|
| 74 |
+
# Consumption terms
|
| 75 |
+
"trip", "tripping", "high", "stoned", "blazed", "hit", "dose", "line", "bump",
|
| 76 |
+
"joint", "blunt", "spliff", "bong", "pipe", "chillum", "roll", "smoke",
|
| 77 |
+
"inject", "snort", "pop", "drop", "chase", "shoot up", "mainline",
|
| 78 |
+
|
| 79 |
+
# Slang and coded terms
|
| 80 |
+
"420", "710", "party supplies", "study material", "medicine", "remedy",
|
| 81 |
+
"good stuff", "quality stuff", "premium", "imported", "local", "organic",
|
| 82 |
+
"fresh", "clean", "pure", "top shelf", "fire", "dank",
|
| 83 |
+
|
| 84 |
+
# Emoji and coded representations
|
| 85 |
+
"🍁", "💨", "🔥", "🌿", "💊", "💉", "🚬", "🍄", "❄️", "💎", "🌪️", "🔋",
|
| 86 |
+
"w33d", "g4nja", "h4sh", "c0ke", "m0lly", "xtc", "l$d", "m@al"
|
| 87 |
+
]
|
| 88 |
+
|
| 89 |
+
# Criminal activity indicators
|
| 90 |
+
self.crime_indicators = [
|
| 91 |
+
"arrest", "arrested", "caught", "raid", "police", "cops", "seized", "confiscated",
|
| 92 |
+
"bust", "busted", "investigation", "case", "fir", "complaint", "crime", "criminal",
|
| 93 |
+
"illegal", "banned", "prohibited", "contraband", "smuggling", "trafficking",
|
| 94 |
+
"possession", "distribution", "manufacturing", "cultivation", "racket", "cartel",
|
| 95 |
+
"kingpin", "network", "operation", "crackdown", "surveillance", "undercover",
|
| 96 |
+
"ncb", "dri", "excise", "customs", "border", "interstate", "mafia"
|
| 97 |
+
]
|
| 98 |
+
|
| 99 |
+
# Karnataka location keywords (enhanced)
|
| 100 |
+
self.karnataka_keywords = self._prepare_karnataka_keywords()
|
| 101 |
+
|
| 102 |
+
# High-risk areas in Karnataka
|
| 103 |
+
self.high_risk_areas = [
|
| 104 |
+
"malleswaram", "shivaji nagar", "frazer town", "russell market", "chickpet",
|
| 105 |
+
"kr market", "city market", "commercial street", "brigade road", "mg road",
|
| 106 |
+
"hosur road", "electronic city", "silk board", "bommanahalli", "btm layout",
|
| 107 |
+
"jayanagar", "basavanagudi", "hanumanthanagar", "banashankari", "girinagar",
|
| 108 |
+
"vijayanagar", "rajajinagar", "mahalakshmi layout", "peenya", "jalahalli",
|
| 109 |
+
"hebbal", "rt nagar", "kammanahalli", "banaswadi", "krishnarajapuram",
|
| 110 |
+
"whitefield", "marathahalli", "brookefield", "varthur", "sarjapur",
|
| 111 |
+
"koramangala", "indiranagar", "domlur", "ulsoor", "vasanth nagar",
|
| 112 |
+
"sadashiva nagar", "seshadripuram", "gandhinagar", "padhmanabhanagar"
|
| 113 |
+
]
|
| 114 |
+
|
| 115 |
+
def _prepare_karnataka_keywords(self) -> List[str]:
    """Prepare normalized Karnataka location keywords.

    Passes every raw keyword through ``_normalize_text`` (lowercase,
    punctuation collapsed to spaces) so later fuzzy matching compares
    like with like.
    """
    raw_keywords = [
        # Main cities and districts (old and renamed spellings both kept,
        # e.g. bangalore/bengaluru, belgaum/belagavi)
        "karnataka", "bengaluru", "bangalore", "blr", "b'lore", "namma bengaluru", "namma blr",
        "mysore", "mysuru", "mangalore", "mangaluru", "hubli", "dharwad", "belgaum", "belagavi",
        "ballari", "bellary", "bijapur", "vijayapura", "tumkur", "davangere", "shimoga", "shivamogga",
        "hassan", "chitradurga", "gadag", "bidar", "raichur", "bagalkot", "haveri", "koppal",
        "ramanagara", "chikkamagaluru", "yadgir", "mandya", "coorg", "kodagu", "karwar",
        "kolar", "chikkaballapur", "madikeri", "udupi", "manipal",

        # Bangalore areas and neighborhoods
        "shivaji nagar", "majestic", "btm", "indiranagar", "jayanagar", "koramangala",
        "whitefield", "banashankari", "hebbal", "rajajinagar", "yeshwanthpur", "kengeri",
        "marathahalli", "hoskote", "rt nagar", "rr nagar", "bsk", "mg road", "brigade road",
        "yelhanka", "basavanagudi", "malleswaram", "seshadripuram", "gandhinagar",
        "electronic city", "silk board", "bommanahalli", "peenya", "jalahalli",
        "vijayanagar", "rajajinagar", "kammanahalli", "banaswadi", "kr puram",
        "domlur", "ulsoor", "vasanth nagar", "sadashiva nagar", "frazer town",
        "commercial street", "chickpet", "kr market", "city market", "russell market",

        # Tourist and party destinations (potential drug hotspots)
        "hampi", "gokarna", "coorg", "chikmagalur", "nandi hills", "skandagiri",
        "br hills", "kudremukh", "jog falls", "murdeshwar", "karwar beach",
        "kabini", "bandipur", "nagarhole", "sakleshpur", "kemmannagundi"
    ]

    return [self._normalize_text(kw) for kw in raw_keywords]
|
| 143 |
+
|
| 144 |
+
def _normalize_text(self, text: str) -> str:
|
| 145 |
+
"""Normalize text for better matching."""
|
| 146 |
+
if not isinstance(text, str):
|
| 147 |
+
return ""
|
| 148 |
+
return re.sub(r'\W+', ' ', text.lower()).strip()
|
| 149 |
+
|
| 150 |
+
def _fuzzy_match(self, text: str, keyword_list: List[str], threshold: int = 75) -> bool:
    """Return True when any keyword reaches *threshold* partial-ratio similarity with *text*.

    The text is normalized first; empty/non-string text never matches.
    """
    normalized = self._normalize_text(text)
    if not normalized:
        return False
    for keyword in keyword_list:
        if fuzz.partial_ratio(normalized, keyword) >= threshold:
            return True
    return False
|
| 156 |
+
|
| 157 |
+
def _compute_relevance_score(self, text: str, keyword_list: List[str], threshold: int = 70) -> int:
    """Count how many keywords reach *threshold* partial-ratio similarity with *text*."""
    normalized = self._normalize_text(text)
    hits = 0
    for keyword in keyword_list:
        if fuzz.partial_ratio(normalized, keyword) >= threshold:
            hits += 1
    return hits
|
| 161 |
+
|
| 162 |
+
def _analyze_sentiment(self, text: str) -> Dict[str, float]:
|
| 163 |
+
"""Analyze sentiment of tweet text."""
|
| 164 |
+
if not self.sentiment_analyzer:
|
| 165 |
+
return {"compound": 0.0, "pos": 0.0, "neu": 0.0, "neg": 0.0}
|
| 166 |
+
|
| 167 |
+
try:
|
| 168 |
+
scores = self.sentiment_analyzer.polarity_scores(text)
|
| 169 |
+
return scores
|
| 170 |
+
except:
|
| 171 |
+
return {"compound": 0.0, "pos": 0.0, "neu": 0.0, "neg": 0.0}
|
| 172 |
+
|
| 173 |
+
def _extract_mentions_hashtags(self, text: str) -> Tuple[List[str], List[str]]:
|
| 174 |
+
"""Extract mentions and hashtags from tweet text."""
|
| 175 |
+
mentions = re.findall(r'@(\w+)', text)
|
| 176 |
+
hashtags = re.findall(r'#(\w+)', text.lower())
|
| 177 |
+
return mentions, hashtags
|
| 178 |
+
|
| 179 |
+
def _detect_phone_numbers(self, text: str) -> List[str]:
|
| 180 |
+
"""Detect potential phone numbers in text."""
|
| 181 |
+
# Indian phone number patterns
|
| 182 |
+
patterns = [
|
| 183 |
+
r'\b[6-9]\d{9}\b', # 10-digit mobile
|
| 184 |
+
r'\+91[6-9]\d{9}\b', # With +91
|
| 185 |
+
r'\b91[6-9]\d{9}\b', # With 91
|
| 186 |
+
r'\b[6-9]\d{4}\s?\d{5}\b', # With space
|
| 187 |
+
]
|
| 188 |
+
|
| 189 |
+
phone_numbers = []
|
| 190 |
+
for pattern in patterns:
|
| 191 |
+
phone_numbers.extend(re.findall(pattern, text))
|
| 192 |
+
|
| 193 |
+
return list(set(phone_numbers)) # Remove duplicates
|
| 194 |
+
|
| 195 |
+
def _assess_risk_level(self, tweet_data: Dict) -> str:
    """Assess risk level based on various factors.

    Builds an additive score (maximum 15) from content and engagement
    signals, then maps it to CRITICAL (>=8), HIGH (>=6), MEDIUM (>=3)
    or LOW.  Expects the record produced by ``_process_tweet`` (keys
    read: content, drug_score, user_location, sentiment, like_count,
    retweet_count).
    """
    risk_score = 0

    # Crime vocabulary (arrest/raid/trafficking, ...): +3
    if self._fuzzy_match(tweet_data['content'], self.crime_indicators, 70):
        risk_score += 3

    # Several distinct drug keywords matched in the text: +2
    if tweet_data['drug_score'] > 3:
        risk_score += 2

    # Known high-risk Karnataka area in profile location or body: +2
    location_text = f"{tweet_data['user_location']} {tweet_data['content']}".lower()
    if any(area in location_text for area in self.high_risk_areas):
        risk_score += 2

    # A reachable phone number is a direct investigation lead: +3
    if self._detect_phone_numbers(tweet_data['content']):
        risk_score += 3

    # Explicit selling / contact-me language: +2
    selling_terms = ["selling", "available", "dm me", "contact", "call", "whatsapp", "delivery", "pickup"]
    if any(term in tweet_data['content'].lower() for term in selling_terms):
        risk_score += 2

    # Quantity/bulk vocabulary hints at supply-side activity: +1
    # (plain substring check — "kg" also matches inside longer words)
    quantity_terms = ["kg", "gram", "ounce", "pound", "bulk", "wholesale", "lots", "kilos"]
    if any(term in tweet_data['content'].lower() for term in quantity_terms):
        risk_score += 1

    # Strongly negative sentiment: +1
    if tweet_data['sentiment']['compound'] < -0.5:
        risk_score += 1

    # Viral reach amplifies impact: +1
    if tweet_data.get('like_count', 0) > 100 or tweet_data.get('retweet_count', 0) > 50:
        risk_score += 1

    # Map the accumulated score onto the four-tier label
    if risk_score >= 8:
        return "CRITICAL"
    elif risk_score >= 6:
        return "HIGH"
    elif risk_score >= 3:
        return "MEDIUM"
    else:
        return "LOW"
|
| 243 |
+
|
| 244 |
+
def _is_karnataka_relevant(self, tweet) -> bool:
    """Decide whether *tweet* is plausibly tied to Karnataka.

    Checks, in decreasing order of reliability: the profile location
    (fuzzy, threshold 80), the profile description (fuzzy, 80), the
    tweet body (fuzzy, 75), and finally a plain substring scan of the
    tweet's hashtags.
    """
    profile_location = tweet.user.location
    if profile_location and self._fuzzy_match(profile_location, self.karnataka_keywords, 80):
        return True

    profile_bio = getattr(tweet.user, 'description', '') or ''
    if self._fuzzy_match(profile_bio, self.karnataka_keywords, 80):
        return True

    body = tweet.rawContent or ""
    if self._fuzzy_match(body, self.karnataka_keywords, 75):
        return True

    joined_tags = ' '.join(getattr(tweet, 'hashtags', []) or []).lower()
    return any(kw in joined_tags for kw in self.karnataka_keywords)
|
| 267 |
+
|
| 268 |
+
async def _search_with_retry(self, query: str, limit: int, retries: int = 3, delay: int = 10) -> List:
    """Run a tweet search, retrying up to *retries* times on failure.

    Between failed attempts it sleeps ``delay * attempt_number`` seconds
    (a linearly growing back-off).  Returns the gathered tweets, or an
    empty list once every attempt has failed.
    """
    attempt = 0
    while attempt < retries:
        try:
            logger.info(f"Searching with query: {query[:50]}... (Attempt {attempt + 1})")
            tweets = await gather(self.api.search(query, limit=limit))
            logger.info(f"Retrieved {len(tweets)} tweets")
            return tweets
        except Exception as e:
            logger.error(f"Attempt {attempt + 1} failed for query {query[:30]}...: {e}")
            if attempt < retries - 1:
                # Linearly increasing back-off: delay, 2*delay, ...
                await asyncio.sleep(delay * (attempt + 1))
        attempt += 1

    logger.error(f"All attempts failed for query: {query[:30]}...")
    return []
|
| 283 |
+
|
| 284 |
+
async def _collect_tweets_for_period(self, start_date: str, end_date: str, limit_per_query: int = 200) -> List[Dict]:
    """Collect tweets for a specific time period with enhanced search strategies for latest 3 months.

    Runs ten themed search queries (drug terms, enforcement terms,
    hotspot areas, coded slang, news/agency terms) bounded by
    ``since:``/``until:`` operators, filters results for Karnataka
    relevance, de-duplicates on tweet id via ``self.seen_ids`` (shared
    across periods), and returns the processed tweet dicts.
    """
    tweets_data = []

    # Enhanced search queries optimized for recent data (last 3 months)
    search_queries = [
        # Primary drug-related searches
        f"(ganja OR weed OR charas OR hash OR cannabis OR marijuana) (karnataka OR bengaluru OR bangalore OR mysore) lang:en since:{start_date} until:{end_date}",
        f"(cocaine OR heroin OR mdma OR ecstasy OR drugs OR narcotics) (karnataka OR bengaluru OR bangalore) lang:en since:{start_date} until:{end_date}",

        # Criminal activity searches
        f"(dealer OR supplier OR selling OR buying) (drugs OR maal OR stuff) (karnataka OR bengaluru) since:{start_date} until:{end_date}",
        f"(police OR arrest OR raid OR bust OR seized) (drugs OR narcotics OR ganja) karnataka since:{start_date} until:{end_date}",

        # Location-specific searches
        f"(drugs OR ganja OR weed) (shivaji nagar OR malleswaram OR btm OR koramangala) since:{start_date} until:{end_date}",
        f"(maal OR stuff OR hash) (indiranagar OR whitefield OR electronic city OR jayanagar) since:{start_date} until:{end_date}",

        # Coded language searches
        f"(420 OR party supplies OR green OR herb) (bengaluru OR bangalore OR karnataka) since:{start_date} until:{end_date}",
        f"(delivery OR pickup OR dm me OR contact) (stuff OR green OR maal) karnataka since:{start_date} until:{end_date}",

        # News and official searches
        f"(ncb OR narcotics control OR drug bust OR seizure) karnataka since:{start_date} until:{end_date}",
        f"(drug trafficking OR smuggling OR peddling) (karnataka OR bengaluru OR mysore) since:{start_date} until:{end_date}"
    ]

    for query in search_queries:
        logger.info(f"Executing query: {query[:100]}...")
        tweets = await self._search_with_retry(query, limit_per_query)

        for tweet in tweets:
            # Skip tweets already seen in an earlier query or period
            if tweet.id in self.seen_ids:
                continue

            # Karnataka relevance filter
            if not self._is_karnataka_relevant(tweet):
                continue

            self.seen_ids.add(tweet.id)
            tweet_data = self._process_tweet(tweet)
            if tweet_data:
                tweets_data.append(tweet_data)

        # Rate limiting between queries
        await asyncio.sleep(8)

    logger.info(f"Collected {len(tweets_data)} relevant tweets for period {start_date} to {end_date}")
    return tweets_data
|
| 333 |
+
|
| 334 |
+
def _process_tweet(self, tweet) -> Optional[Dict]:
    """Process individual tweet and extract relevant information.

    Builds the flat analysis record used by the rest of the pipeline:
    weighted relevance scores, sentiment, extracted mentions/phone
    numbers, a risk label and a content hash for de-duplication.
    Returns ``None`` for tweets that are too short or that raise during
    processing.
    """
    try:
        # Basic tweet information; getattr guards fields twscrape may
        # omit depending on version
        user_location = tweet.user.location or ""
        user_description = getattr(tweet.user, 'description', '') or ""
        tweet_text = tweet.rawContent or ""
        hashtags = getattr(tweet, 'hashtags', []) or []

        # Skip if no meaningful content
        if not tweet_text.strip() or len(tweet_text) < 10:
            return None

        # Weighted Karnataka relevance: profile location (x4) is the
        # strongest signal, then tweet body (x3), then bio (x2)
        kar_score = (
            4 * self._compute_relevance_score(user_location, self.karnataka_keywords) +
            2 * self._compute_relevance_score(user_description, self.karnataka_keywords) +
            3 * self._compute_relevance_score(tweet_text, self.karnataka_keywords)
        )

        drug_score = self._compute_relevance_score(tweet_text, self.drug_keywords, 80)
        crime_score = self._compute_relevance_score(tweet_text, self.crime_indicators, 75)

        # Extract additional information
        mentions, hashtags_extracted = self._extract_mentions_hashtags(tweet_text)
        phone_numbers = self._detect_phone_numbers(tweet_text)
        sentiment = self._analyze_sentiment(tweet_text)

        # Convert tweet timestamp to IST (UTC+5:30); naive datetimes are
        # assumed to be UTC before conversion
        ist = timezone(timedelta(hours=5, minutes=30))
        utc_time = tweet.date if tweet.date.tzinfo else tweet.date.replace(tzinfo=timezone.utc)
        ist_time = utc_time.astimezone(ist)

        # Create tweet data dictionary
        # NOTE(review): 'hashtags' stores the raw tweet.hashtags field;
        # hashtags_extracted (parsed from the text above) is unused.
        tweet_data = {
            "tweet_id": str(tweet.id),
            "datetime": ist_time.strftime("%d-%m-%Y %H:%M:%S"),
            "username": tweet.user.username,
            "user_display_name": getattr(tweet.user, 'displayname', '') or '',
            "user_followers": getattr(tweet.user, 'followersCount', 0),
            "user_following": getattr(tweet.user, 'followingCount', 0),
            "user_verified": getattr(tweet.user, 'verified', False),
            "content": tweet_text,
            "user_location": user_location,
            "user_description": user_description,
            "hashtags": ' '.join(hashtags),
            "mentions": ', '.join(mentions),
            "phone_numbers": ', '.join(phone_numbers),
            "tweet_url": f"https://x.com/{tweet.user.username}/status/{tweet.id}",
            "retweet_count": getattr(tweet, 'retweetCount', 0),
            "like_count": getattr(tweet, 'likeCount', 0),
            "reply_count": getattr(tweet, 'replyCount', 0),
            "kar_score": kar_score,
            "drug_score": drug_score,
            "crime_score": crime_score,
            "sentiment": sentiment,
            "sentiment_compound": sentiment.get('compound', 0),
            "is_drug_related": drug_score > 0,
            "is_crime_related": crime_score > 0,
            "has_contact_info": len(phone_numbers) > 0,
            "risk_level": "",  # Will be calculated later
            "content_hash": hashlib.md5(tweet_text.encode()).hexdigest()  # de-duplication key
        }

        # Assess risk level (needs the assembled record above)
        tweet_data["risk_level"] = self._assess_risk_level(tweet_data)

        return tweet_data

    except Exception as e:
        logger.error(f"Error processing tweet {getattr(tweet, 'id', 'unknown')}: {e}")
        return None
|
| 406 |
+
|
| 407 |
+
async def initialize_api(self) -> bool:
    """Create the twscrape API client and verify pooled accounts exist.

    Returns True when at least one account is available; logs setup
    guidance and returns False otherwise.
    """
    try:
        self.api = API()
        accounts = await self.api.pool.get_all()
        if accounts:
            logger.info(f"Initialized with {len(accounts)} accounts: {[a.username for a in accounts]}")
            return True
        # Without pooled accounts twscrape cannot search at all.
        logger.error("No Twitter accounts found. Please add accounts using: twscrape add_account")
        logger.error("Setup instructions:")
        logger.error("1. twscrape add_account username email password")
        logger.error("2. twscrape login_accounts")
        return False
    except Exception as e:
        logger.error(f"Failed to initialize API: {e}")
        return False
|
| 426 |
+
|
| 427 |
+
def _ensure_output_directory(self, directory: str = "drug_analysis_data_3months"):
|
| 428 |
+
"""Create output directory if it doesn't exist."""
|
| 429 |
+
if not os.path.exists(directory):
|
| 430 |
+
os.makedirs(directory)
|
| 431 |
+
logger.info(f"Created directory: {directory}")
|
| 432 |
+
|
| 433 |
+
def _save_data(self, data: List[Dict], base_filename: str):
    """Save data in multiple formats with detailed analysis.

    Writes the full dataset plus filtered CSV views (high priority,
    drug-related, contact-info leads, and one file per risk level) into
    the hard-coded ``drug_analysis_data_3months`` directory, then
    triggers the JSON summary report.

    NOTE(review): *base_filename* is currently unused — every filename
    is derived from the data's date range plus a timestamp.
    """
    if not data:
        logger.warning("No data to save")
        return

    df = pd.DataFrame(data)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Calculate the actual date range from data (coerce bad rows to NaT)
    if 'datetime' in df.columns:
        df['date_parsed'] = pd.to_datetime(df['datetime'], format="%d-%m-%Y %H:%M:%S", errors='coerce')
        date_range = f"3months_{df['date_parsed'].min().strftime('%Y%m%d')}_to_{df['date_parsed'].max().strftime('%Y%m%d')}"
    else:
        date_range = "latest_3months"

    # Save main dataset
    main_file = f"drug_analysis_data_3months/karnataka_drug_tweets_{date_range}_{timestamp}.csv"
    df.to_csv(main_file, index=False, encoding='utf-8')
    logger.info(f"Saved main dataset: {main_file}")

    # Save high-risk tweets separately (HIGH PRIORITY FOR POLICE)
    high_risk_df = df[df['risk_level'].isin(['HIGH', 'CRITICAL'])]
    if not high_risk_df.empty:
        risk_file = f"drug_analysis_data_3months/HIGH_PRIORITY_tweets_{date_range}_{timestamp}.csv"
        high_risk_df.to_csv(risk_file, index=False, encoding='utf-8')
        logger.info(f"🚨 Saved HIGH PRIORITY tweets: {risk_file}")

    # Save drug-related tweets
    drug_df = df[df['is_drug_related'] == True]
    if not drug_df.empty:
        drug_file = f"drug_analysis_data_3months/drug_related_{date_range}_{timestamp}.csv"
        drug_df.to_csv(drug_file, index=False, encoding='utf-8')
        logger.info(f"Saved drug-related tweets: {drug_file}")

    # Save tweets with contact information (INVESTIGATION LEADS)
    contact_df = df[df['has_contact_info'] == True]
    if not contact_df.empty:
        contact_file = f"drug_analysis_data_3months/CONTACT_INFO_tweets_{date_range}_{timestamp}.csv"
        contact_df.to_csv(contact_file, index=False, encoding='utf-8')
        logger.info(f"🔍 Saved tweets with contact info: {contact_file}")

    # Save by risk level (LOW is deliberately not exported)
    for risk_level in ['CRITICAL', 'HIGH', 'MEDIUM']:
        risk_df = df[df['risk_level'] == risk_level]
        if not risk_df.empty:
            risk_level_file = f"drug_analysis_data_3months/{risk_level}_RISK_{date_range}_{timestamp}.csv"
            risk_df.to_csv(risk_level_file, index=False, encoding='utf-8')
            logger.info(f"Saved {risk_level} risk tweets: {risk_level_file}")

    # Generate summary report
    self._generate_summary_report(df, timestamp, date_range)
|
| 485 |
+
|
| 486 |
+
def _generate_summary_report(self, df: pd.DataFrame, timestamp: str, date_range: str):
    """Generate a comprehensive summary report for police analysis.

    Aggregates the collected DataFrame into a JSON report (metadata,
    summary stats, risk / geographic / content / user / temporal
    analysis and investigation priorities), writes it to disk, then
    prints an executive summary to the console.

    NOTE(review): the per-category percentage prints divide by
    total_tweets without a zero guard — this holds today because the
    only caller (_save_data) returns early on empty data; verify if
    callers change.
    """
    report = {
        "analysis_metadata": {
            "analysis_timestamp": timestamp,
            "date_range": date_range,
            # NOTE(review): counts unique timestamp strings, not distinct days
            "collection_period_days": len(df['datetime'].unique()) if 'datetime' in df.columns else 0,
            "scraper_version": "3_months_automatic",
            "total_queries_executed": 10,
            "karnataka_police_authorization": "DRUG_CRIME_ANALYSIS_2024"
        },
        "summary_statistics": {
            "total_tweets": len(df),
            "drug_related_tweets": int(df['is_drug_related'].sum()),
            "crime_related_tweets": int(df['is_crime_related'].sum()),
            "tweets_with_contact_info": int(df['has_contact_info'].sum()),
            "unique_users": int(df['username'].nunique()),
            "verified_users": int(df['user_verified'].sum()),
            "average_followers": int(df['user_followers'].mean()) if len(df) > 0 else 0
        },
        "risk_analysis": {
            "risk_distribution": df['risk_level'].value_counts().to_dict(),
            "critical_alerts": int((df['risk_level'] == 'CRITICAL').sum()),
            "high_priority": int((df['risk_level'] == 'HIGH').sum()),
            "investigation_leads": int(df['has_contact_info'].sum())
        },
        "geographic_analysis": {
            "top_locations": df['user_location'].value_counts().head(15).to_dict(),
            # Number of high-risk areas that appear in at least one tweet body
            "high_risk_areas_mentioned": sum(1 for area in self.high_risk_areas
                                             if any(area in tweet.lower() for tweet in df['content']))
        },
        "content_analysis": {
            "sentiment_distribution": {
                "positive": int((df['sentiment_compound'] > 0.1).sum()),
                "neutral": int((df['sentiment_compound'].between(-0.1, 0.1)).sum()),
                "negative": int((df['sentiment_compound'] < -0.1).sum())
            },
            "top_hashtags": df['hashtags'].str.split().explode().value_counts().head(25).to_dict(),
            "most_mentioned_users": df['mentions'].str.split(', ').explode().value_counts().head(20).to_dict(),
            "average_drug_score": float(df['drug_score'].mean()) if len(df) > 0 else 0,
            "average_crime_score": float(df['crime_score'].mean()) if len(df) > 0 else 0
        },
        "user_analysis": {
            "high_activity_users": df['username'].value_counts().head(15).to_dict(),
            "most_followed_users": df.nlargest(10, 'user_followers')[['username', 'user_followers']].to_dict('records'),
            "users_with_contact_info": df[df['has_contact_info']]['username'].tolist()
        },
        "temporal_analysis": {
            # datetime is dd-mm-yyyy; [:10] takes the date part only
            "tweets_by_date": df['datetime'].str[:10].value_counts().sort_index().to_dict() if 'datetime' in df.columns else {},
            "peak_activity_days": df['datetime'].str[:10].value_counts().head(10).to_dict() if 'datetime' in df.columns else {}
        },
        "investigation_priorities": {
            "immediate_action_required": len(df[df['risk_level'].isin(['CRITICAL', 'HIGH'])]),
            "contact_information_available": len(df[df['has_contact_info']]),
            "bulk_operation_indicators": len(df[df['content'].str.contains('kg|gram|bulk|wholesale', case=False, na=False)]),
            "cross_border_mentions": len(df[df['content'].str.contains('interstate|border|import', case=False, na=False)])
        }
    }

    # Save comprehensive report as JSON (default=str serializes numpy/pandas scalars)
    report_file = f"drug_analysis_data_3months/POLICE_ANALYSIS_REPORT_{date_range}_{timestamp}.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False, default=str)

    logger.info(f"📊 Generated comprehensive police report: {report_file}")

    # Print executive summary to console
    print("\n" + "="*80)
    print("KARNATAKA POLICE DRUG CRIME ANALYSIS REPORT - LATEST 3 MONTHS")
    print("="*80)
    print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Period analyzed: Latest 3 months ({date_range})")
    print(f"Total tweets analyzed: {report['summary_statistics']['total_tweets']}")
    print(f"Drug-related tweets: {report['summary_statistics']['drug_related_tweets']} ({report['summary_statistics']['drug_related_tweets']/report['summary_statistics']['total_tweets']*100:.1f}%)")
    print(f"Crime-related tweets: {report['summary_statistics']['crime_related_tweets']} ({report['summary_statistics']['crime_related_tweets']/report['summary_statistics']['total_tweets']*100:.1f}%)")
    print(f"Tweets with contact info: {report['summary_statistics']['tweets_with_contact_info']} (🚨 INVESTIGATION LEADS)")
    print(f"Unique users: {report['summary_statistics']['unique_users']}")
    print(f"Verified users: {report['summary_statistics']['verified_users']}")

    print("\n🚨 RISK LEVEL DISTRIBUTION:")
    total_tweets = sum(report['risk_analysis']['risk_distribution'].values())
    for risk, count in report['risk_analysis']['risk_distribution'].items():
        percentage = (count/total_tweets*100) if total_tweets > 0 else 0
        emoji = "🔴" if risk == "CRITICAL" else "🟠" if risk == "HIGH" else "🟡" if risk == "MEDIUM" else "🟢"
        print(f" {emoji} {risk}: {count} ({percentage:.1f}%)")

    print("\n📍 TOP LOCATIONS (First 10):")
    for i, (location, count) in enumerate(list(report['geographic_analysis']['top_locations'].items())[:10], 1):
        print(f" {i:2d}. {location}: {count} tweets")

    print("\n💭 SENTIMENT ANALYSIS:")
    for sentiment, count in report['content_analysis']['sentiment_distribution'].items():
        percentage = (count/report['summary_statistics']['total_tweets']*100) if report['summary_statistics']['total_tweets'] > 0 else 0
        print(f" {sentiment.capitalize()}: {count} ({percentage:.1f}%)")

    print("\n👥 HIGH ACTIVITY USERS (Top 5):")
    for i, (user, count) in enumerate(list(report['user_analysis']['high_activity_users'].items())[:5], 1):
        print(f" {i}. @{user}: {count} tweets")

    print("\n🎯 IMMEDIATE INVESTIGATION PRIORITIES:")
    priorities = report['investigation_priorities']
    if priorities['immediate_action_required'] > 0:
        print(f" 🔴 URGENT: {priorities['immediate_action_required']} high/critical risk tweets requiring immediate review")
    if priorities['contact_information_available'] > 0:
        print(f" 🔍 LEADS: {priorities['contact_information_available']} tweets contain contact information")
    if priorities['bulk_operation_indicators'] > 0:
        print(f" 📦 BULK OPS: {priorities['bulk_operation_indicators']} tweets mention bulk operations")
    if priorities['cross_border_mentions'] > 0:
        print(f" 🌐 INTERSTATE: {priorities['cross_border_mentions']} tweets mention cross-border activity")

    print("\n📁 OUTPUT FILES GENERATED:")
    print(f" • Main dataset: karnataka_drug_tweets_{date_range}_{timestamp}.csv")
    print(f" • High priority: HIGH_PRIORITY_tweets_{date_range}_{timestamp}.csv")
    print(f" • Contact info: CONTACT_INFO_tweets_{date_range}_{timestamp}.csv")
    print(f" • Analysis report: POLICE_ANALYSIS_REPORT_{date_range}_{timestamp}.json")

    print("\n✅ NEXT STEPS:")
    print(" 1. Review HIGH_PRIORITY_tweets file for immediate action")
    print(" 2. Investigate users in CONTACT_INFO_tweets file")
    print(" 3. Cross-reference with existing case files")
    print(" 4. Monitor high-activity users for ongoing surveillance")
    print("="*80)
|
| 608 |
+
|
| 609 |
+
async def run_analysis(self, start_date: str, end_date: str, tweets_per_day: int = 200):
    """Run the complete drug crime analysis for latest 3 months.

    Pipeline: initialize the twscrape API, split the period into 7-day
    chunks, collect and filter tweets per chunk, drop content-hash
    duplicates, then persist datasets and reports via ``_save_data``.
    Returns the de-duplicated tweet dicts, or ``None`` when API setup
    fails or nothing relevant was collected.
    """
    logger.info("🚔 Starting Karnataka Police Drug Crime Analysis - Latest 3 Months")
    logger.info(f"📅 Analysis period: {start_date} to {end_date}")

    # Calculate total days
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    total_days = (end_dt - start_dt).days

    logger.info(f"⏱️ Total analysis period: {total_days} days")
    logger.info(f"🎯 Expected tweets: ~{tweets_per_day * total_days}")

    # Initialize API
    if not await self.initialize_api():
        logger.error("❌ Failed to initialize Twitter API. Exiting.")
        return

    # Ensure output directory exists
    self._ensure_output_directory()

    # Generate date ranges for systematic collection (7-day chunks for better coverage)
    date_ranges = self._generate_date_ranges(start_date, end_date, days_per_chunk=7)

    logger.info(f"📊 Will process {len(date_ranges)} date ranges (7-day chunks)")

    all_tweets = []
    for i, (start, end) in enumerate(date_ranges, 1):
        logger.info(f"🔍 Analyzing period {i}/{len(date_ranges)}: {start} to {end}")

        # NOTE(review): tweets_per_day is forwarded as the per-query limit
        tweets = await self._collect_tweets_for_period(start, end, tweets_per_day)
        all_tweets.extend(tweets)

        logger.info(f"✅ Collected {len(tweets)} tweets for this period. Total so far: {len(all_tweets)}")

        # Rate limiting between periods - longer delay for larger dataset
        if i < len(date_ranges):  # Don't sleep after the last iteration
            logger.info("⏳ Waiting 15 seconds before next period...")
            await asyncio.sleep(15)

    if not all_tweets:
        logger.error("❌ No relevant tweets collected for analysis")
        return

    logger.info("🔄 Removing duplicate tweets...")
    # Remove duplicates based on content hash (first occurrence wins)
    unique_tweets = []
    seen_hashes = set()

    for tweet in all_tweets:
        content_hash = tweet.get('content_hash', '')
        if content_hash not in seen_hashes:
            unique_tweets.append(tweet)
            seen_hashes.add(content_hash)

    duplicates_removed = len(all_tweets) - len(unique_tweets)
    logger.info(f"🗑️ Removed {duplicates_removed} duplicate tweets")
    logger.info(f"✨ Final unique tweets: {len(unique_tweets)}")

    # Save collected data
    logger.info("💾 Saving collected data...")
    self._save_data(unique_tweets, "karnataka_drug_analysis_3months")

    logger.info("🎉 Analysis completed successfully!")
    logger.info(f"📈 Total unique tweets collected: {len(unique_tweets)}")
    logger.info(f"📁 Check the 'drug_analysis_data_3months' directory for results")

    return unique_tweets
|
| 677 |
+
|
| 678 |
+
def _generate_date_ranges(self, start_date: str, end_date: str, days_per_chunk: int = 7) -> List[Tuple[str, str]]:
|
| 679 |
+
"""Generate date ranges for systematic data collection - optimized for 3 months."""
|
| 680 |
+
start = datetime.strptime(start_date, "%Y-%m-%d")
|
| 681 |
+
end = datetime.strptime(end_date, "%Y-%m-%d")
|
| 682 |
+
|
| 683 |
+
ranges = []
|
| 684 |
+
current = start
|
| 685 |
+
|
| 686 |
+
while current < end:
|
| 687 |
+
next_date = min(current + timedelta(days=days_per_chunk), end)
|
| 688 |
+
ranges.append((
|
| 689 |
+
current.strftime("%Y-%m-%d"),
|
| 690 |
+
next_date.strftime("%Y-%m-%d")
|
| 691 |
+
))
|
| 692 |
+
current = next_date
|
| 693 |
+
|
| 694 |
+
return ranges
|
| 695 |
+
|
| 696 |
+
# Main execution function
|
| 697 |
+
async def main():
    """Interactive entry point for the latest-3-months collection run.

    Builds the scraper, computes a rolling ~90-day window ending today,
    asks the operator to confirm, then runs the analysis and reports the
    outcome on stdout (details go to the log file on failure).
    """
    scraper = DrugCrimeScraper3Months()

    banner = "=" * 80
    print("\n" + banner)
    print("🚔 KARNATAKA POLICE DRUG CRIME ANALYSIS SYSTEM")
    print("📊 LATEST 3 MONTHS DATA COLLECTION")
    print(banner)

    # Rolling window: the last ~3 months (90 days) ending now.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=90)
    total_days = (end_date - start_date).days

    START_DATE = start_date.strftime("%Y-%m-%d")
    END_DATE = end_date.strftime("%Y-%m-%d")
    TWEETS_PER_DAY = 200  # per-day collection target, raised for better coverage

    print("📅 Automatic Date Range Calculation:")
    print(f"   Start Date: {START_DATE}")
    print(f"   End Date: {END_DATE}")
    print(f"   Total Days: {total_days}")
    print(f"   Expected Tweets: ~{TWEETS_PER_DAY * total_days}")
    print(f"   Estimated Runtime: ~{(total_days // 7) * 2} minutes")
    print(banner)

    # Require explicit confirmation before starting a long-running scrape.
    try:
        answer = input("\n🤔 Proceed with data collection? (y/n): ").lower().strip()
        if answer not in ("y", "yes"):
            print("❌ Collection cancelled by user.")
            return
    except KeyboardInterrupt:
        print("\n❌ Collection cancelled by user.")
        return

    print("\n🚀 Starting data collection...")

    try:
        tweets = await scraper.run_analysis(START_DATE, END_DATE, TWEETS_PER_DAY)

        if tweets and len(tweets) > 0:
            print(f"\n🎉 SUCCESS! Collected {len(tweets)} tweets for Karnataka Police analysis.")
            print("📁 Check the 'drug_analysis_data_3months' directory for organized results.")
        else:
            print("\n⚠️ No data collected. Please check:")
            print("   1. Twitter account setup (twscrape add_account)")
            print("   2. Internet connection")
            print("   3. API rate limits")

    except KeyboardInterrupt:
        logger.info("\n⚠️ Analysis interrupted by user")
        print("\n⚠️ Collection stopped by user. Partial data may be saved.")
    except Exception as err:
        logger.error(f"❌ Analysis failed: {err}")
        print(f"\n❌ Error occurred: {err}")
        print("💡 Check the log file 'drug_crime_scraper_3months.log' for details.")
| 755 |
+
if __name__ == "__main__":
    # Windows compatibility: switch to the selector event loop policy,
    # which some async libraries require on Windows.
    if sys.platform.startswith("win"):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    # Remind the operator of the one-time setup steps before running.
    setup_steps = (
        "1. Install: pip install twscrape pandas fuzzywuzzy nltk python-levenshtein",
        "2. Add Twitter account: twscrape add_account username email password",
        "3. Login accounts: twscrape login_accounts",
        "4. Run this script: python enhanced_drug_crime_scraper_3months.py",
    )
    print("\n📋 SETUP REQUIREMENTS:")
    for step in setup_steps:
        print(step)

    # Run the analysis; keep the exit paths friendly for operators.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as exc:
        print(f"\n❌ Failed to start: {exc}")
        print("💡 Please check the setup requirements above.")
|
src/evaluate.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluate.py
#
# Offline quality report for the scraper's CSV output: loads every CSV in
# SCRAPER_FOLDER, prints completeness / duplicate / relevance / coverage
# statistics, and (when both label columns exist) cross-tabulates them
# with sklearn classification metrics.
import pandas as pd
import os
from collections import Counter
import re
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

SCRAPER_FOLDER = "drug_analysis_data_3months"  # Folder where scraper saves CSVs

# -----------------------------
# Load CSVs
# -----------------------------
csv_files = [f for f in os.listdir(SCRAPER_FOLDER) if f.endswith(".csv")]
if not csv_files:
    print("❌ No CSV files found in scraper folder!")
    # Non-zero status: nothing to evaluate is an error condition
    # (plain exit() returned status 0 here).
    raise SystemExit(1)

dfs = [pd.read_csv(os.path.join(SCRAPER_FOLDER, f)) for f in csv_files]
df = pd.concat(dfs, ignore_index=True)
if df.empty:
    # Guard: later metrics divide by len(df); empty CSVs would crash.
    print("❌ CSV files contain no rows!")
    raise SystemExit(1)
print(f"✅ Loaded {len(df)} rows from {len(csv_files)} CSV files.\n")

# -----------------------------
# General Stats
# -----------------------------
print("=== General Stats ===")
print("Columns:", df.columns.tolist())
print("Total rows:", len(df))
print("Missing values per column:\n", df.isna().sum())
print("\nDuplicate rows:", df.duplicated().sum())

# Sample rows with missing data
missing_rows = df[df.isna().any(axis=1)]
if not missing_rows.empty:
    print("\nSample rows with missing values:\n", missing_rows.head())

# Sample duplicate rows (keep=False shows every member of each duplicate group)
duplicates = df[df.duplicated(keep=False)]
if not duplicates.empty:
    print("\nSample duplicate rows:\n", duplicates.head())

# -----------------------------
# Drug/Crime-related stats
# -----------------------------
for col in ["is_drug_related", "is_crime_related", "risk_level"]:
    if col in df.columns:
        print(f"\n=== {col} Distribution ===")
        print(df[col].value_counts())
        print("Proportion:\n", round(df[col].value_counts(normalize=True), 4))

# Risk level numeric analysis (only when the scraper emitted numeric scores;
# risk_level may also be categorical text, which the dtype check skips)
if "risk_level" in df.columns and pd.api.types.is_numeric_dtype(df["risk_level"]):
    print("\n=== Risk Level Stats ===")
    print("Average risk:", round(df["risk_level"].mean(), 2))
    print("Max risk:", df["risk_level"].max())
    high_risk_count = (df["risk_level"] >= 0.7).sum()  # Threshold
    print("Number of high-risk items (risk >= 0.7):", high_risk_count)

# -----------------------------
# Time coverage
# -----------------------------
if "datetime" in df.columns:
    # errors="coerce": unparseable timestamps become NaT instead of raising
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
    print("\n=== Date Range ===")
    print("Earliest:", df["datetime"].min())
    print("Latest:", df["datetime"].max())

    # Daily counts
    df["date"] = df["datetime"].dt.date
    daily_counts = df.groupby("date").size()
    print("\n=== Daily Counts of Posts ===")
    print(daily_counts)

# -----------------------------
# Text Analysis
# -----------------------------
if "text" in df.columns:
    df["text"] = df["text"].astype(str)
    df["text_length"] = df["text"].apply(len)
    print("\n=== Text Length Stats ===")
    print("Average length:", round(df["text_length"].mean(), 2))
    print("Min length:", df["text_length"].min())
    print("Max length:", df["text_length"].max())

    # Top 10 most common words
    words = Counter()
    for t in df["text"]:
        words.update(re.findall(r"\w+", t.lower()))
    print("\nTop 10 common words:", words.most_common(10))

# -----------------------------
# User / Source Analysis
# -----------------------------
if "username" in df.columns:
    print("\n=== User Analysis ===")
    print("Total unique users:", df["username"].nunique())
    top_users = df["username"].value_counts().head(10)
    print("Top 10 users by post count:\n", top_users)

# -----------------------------
# Scraper Evaluation Metrics
# -----------------------------
print("\n=== Scraper Evaluation Metrics ===")

# 1. Completeness (% of filled cells)
completeness = 1 - df.isna().mean().mean()
print(f"Completeness (all columns filled): {round(completeness*100, 2)}%")

# 2. Duplicate rate (% of duplicate rows)
duplicate_rate = df.duplicated().mean()
print(f"Duplicate rows rate: {round(duplicate_rate*100, 2)}%")

# 3. Drug/Crime relevance (if available)
for col in ["is_drug_related", "is_crime_related"]:
    if col in df.columns:
        relevance = df[col].sum() / len(df)
        print(f"{col} relevance rate: {round(relevance*100,2)}%")

# 4. Time coverage (active days vs total days)
# The notna() guard matters: if every timestamp failed to parse, both
# min() and max() are NaT and the subtraction below would raise.
if "datetime" in df.columns and df["datetime"].notna().any():
    total_days = (df["datetime"].max() - df["datetime"].min()).days + 1
    active_days = df["date"].nunique()
    coverage_ratio = active_days / total_days
    print(f"Time coverage ratio (active days / total days): {round(coverage_ratio*100,2)}%")

# 5. Average text length (proxy for content richness)
if "text" in df.columns:
    print(f"Average text length: {round(df['text_length'].mean(),2)} characters")

# 6. Classification Metrics (using scraper labels as pseudo-ground truth)
# NOTE(review): this treats is_crime_related as ground truth for
# is_drug_related — a consistency check between the two labelers, not a
# true accuracy measurement.
if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
    y_true = df["is_crime_related"]
    y_pred = df["is_drug_related"]
    print("\n=== Classification Metrics (is_drug_related vs is_crime_related) ===")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
    # zero_division=0: report 0 instead of emitting UndefinedMetricWarning
    # when a class has no predicted (or true) samples
    print("Precision:", round(precision_score(y_true, y_pred, zero_division=0), 4))
    print("Recall:", round(recall_score(y_true, y_pred, zero_division=0), 4))
    print("F1-score:", round(f1_score(y_true, y_pred, zero_division=0), 4))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, zero_division=0))
else:
    print("\n⚠️ Skipping classification metrics: Not enough columns for evaluation.")

print("\n✅ Data evaluation + metrics complete!")
|
src/evaluation.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluation.py
|
| 2 |
+
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, brier_score_loss
|
| 3 |
+
from alerts import compute_dynamic_risk
|
| 4 |
+
|
| 5 |
+
def evaluate_model(test_tweets):
    """Print classification-quality metrics for the CRITICAL-risk detector.

    test_tweets: list of dicts with fields
        - true_risk_level: "CRITICAL"/"HIGH"/...
        - dynamic_risk_score: 0-100
    """
    # Build ground truth, predicted probabilities, and hard predictions
    # in a single pass; CRITICAL is the positive class.
    y_true = []
    y_prob = []
    y_pred = []

    for tweet in test_tweets:
        y_true.append(1 if tweet["true_risk_level"] == "CRITICAL" else 0)
        probability = compute_dynamic_risk(tweet["dynamic_risk_score"])
        y_prob.append(probability)
        y_pred.append(1 if probability >= 0.75 else 0)  # threshold for CRITICAL

    print("=== Classification Report ===")
    print(classification_report(y_true, y_pred, target_names=["Non-Critical","Critical"]))

    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_true, y_pred))

    print("ROC-AUC:", roc_auc_score(y_true, y_prob))
    print("Brier Score:", brier_score_loss(y_true, y_prob))
|