lawlevisan committed on
Commit
e0324fc
·
verified ·
1 Parent(s): dc0315b

Upload 5 files

Browse files
src/alerts.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # alerts.py
2
+ import os
3
+ import time
4
+ import logging
5
+ from typing import List, Optional
6
+ from email.mime.text import MIMEText
7
+ from email.mime.multipart import MIMEMultipart
8
+ import smtplib
9
+ from dotenv import load_dotenv
10
+
11
+ # local db wrapper (your db.py must expose these functions)
12
+ from db import fetch_high_risk_unnotified, mark_as_notified
13
+
14
# Load environment overrides from a local .env file before reading config.
load_dotenv()

# --- Configuration (via env) ---
# SMTP connection settings.
SMTP_SERVER = os.getenv("ALERT_SMTP", "smtp.gmail.com")
SMTP_PORT = int(os.getenv("ALERT_SMTP_PORT", "587"))

# Credentials: ALERT_EMAIL doubles as the SMTP login user.
ALERT_EMAIL = os.getenv("ALERT_EMAIL")
ALERT_PASSWORD = os.getenv("ALERT_PASSWORD")  # SMTP or app-specific password

# Comma-separated recipient list, e.g. "a@x.com,b@x.com".
ALERT_RECIPIENTS = os.getenv("ALERT_RECIPIENTS")

# Retry/pacing knobs.
SEND_RETRY = int(os.getenv("ALERT_SEND_RETRY", "2"))        # extra attempts per email
RETRY_DELAY = float(os.getenv("ALERT_RETRY_DELAY", "2.0"))  # seconds between retries
BATCH_DELAY = float(os.getenv("ALERT_BATCH_DELAY", "0.5"))  # pause between per-tweet emails

# Fail fast at import time if credentials are missing.
if not (ALERT_EMAIL and ALERT_PASSWORD):
    raise RuntimeError("ALERT_EMAIL and ALERT_PASSWORD must be set in the environment.")
29
+
30
+ # Build recipients list
31
+ def _parse_recipients(env_val: Optional[str]) -> List[str]:
32
+ if not env_val:
33
+ return [ALERT_EMAIL]
34
+ return [r.strip() for r in env_val.split(",") if r.strip()]
35
+
36
# Resolved recipient list (falls back to the sender address when unset).
RECIPIENTS = _parse_recipients(ALERT_RECIPIENTS)

# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
40
+
41
def compute_dynamic_risk(tweet: dict) -> float:
    """
    Score a tweet's overall risk on a 0-100 scale.

    Blends four signals: content (drug/crime classifier scores), user
    influence (followers + verified flag), engagement (likes + retweets)
    and geographic relevance to Karnataka.
    """
    # Content signal: equal-weighted mix of the two classifier scores.
    content = 0.5 * float(tweet.get("drug_score", 0)) + 0.5 * float(tweet.get("crime_score", 0))

    # User influence: follower count saturates at 1k, verified adds a bonus.
    followers_part = min(int(tweet.get("followers_count", 0)) / 1000, 1)
    verified_part = 1 if tweet.get("verified", False) else 0
    influence = followers_part * 0.5 + verified_part * 0.5

    # Engagement, normalised so 50+ interactions saturate at 1.0.
    interactions = int(tweet.get("like_count", 0)) + int(tweet.get("retweet_count", 0))
    engagement = min(interactions / 50, 1)

    # Geographic relevance: any Karnataka keyword in the profile location.
    loc = str(tweet.get("user_location", "")).lower()
    geo = 1 if any(kw in loc for kw in ("bangalore", "bengaluru", "karnataka")) else 0

    # Weighted blend (content dominates), scaled to 0-100.
    blended = 0.4 * content + 0.2 * influence + 0.2 * engagement + 0.2 * geo
    return round(blended * 100, 2)
74
+
75
def assign_dynamic_risk_level(tweet: dict) -> str:
    """Map a tweet's dynamic risk score onto a categorical level."""
    score = compute_dynamic_risk(tweet)
    # Thresholds: 75+ CRITICAL, 50+ HIGH, 25+ MEDIUM, otherwise LOW.
    for threshold, label in ((75, "CRITICAL"), (50, "HIGH"), (25, "MEDIUM")):
        if score >= threshold:
            return label
    return "LOW"
85
+
86
def _tweet_score_for_sort(tweet: dict) -> float:
    """Sort key: dynamic risk (0-100) plus a small engagement tiebreaker."""
    likes = int(tweet.get("like_count", 0) or 0)
    retweets = int(tweet.get("retweet_count", 0) or 0)
    return compute_dynamic_risk(tweet) + (likes + retweets) * 0.1
91
+
92
def _select_top_tweets(tweets: List[dict], max_tweets: Optional[int], send_all: bool) -> List[dict]:
    """Sort tweets by priority and optionally cap the list length."""
    if not tweets:
        return []

    # Upper-case risk_level strings so later comparisons don't miss matches.
    for tweet in tweets:
        level = tweet.get("risk_level")
        if isinstance(level, str):
            tweet["risk_level"] = level.upper()

    # Highest combined risk/engagement score first.
    ranked = sorted(tweets, key=_tweet_score_for_sort, reverse=True)

    if send_all or max_tweets is None:
        return ranked
    return ranked[:max_tweets]
108
+
109
+ def _format_tweet_html_block(tweet: dict) -> str:
110
+ """Return an HTML block describing a tweet (for batched email)."""
111
+ tweet_id = tweet.get("tweet_id", "N/A")
112
+ user = tweet.get("username") or tweet.get("user") or tweet.get("user_name") or "N/A"
113
+ content = tweet.get("content") or tweet.get("text") or ""
114
+ timestamp = tweet.get("datetime") or tweet.get("timestamp") or "N/A"
115
+ location = tweet.get("user_location") or tweet.get("location") or "N/A"
116
+ risk = tweet.get("risk_level", "N/A")
117
+ likes = tweet.get("like_count", 0)
118
+ rts = tweet.get("retweet_count", 0)
119
+ url = tweet.get("tweet_url") or f"https://x.com/{user}/status/{tweet_id}" if tweet_id != "N/A" else "N/A"
120
+
121
+ # Bulk detection
122
+ bulk_keywords = ["kg", "gram", "bulk", "kilos", "ounce", "pound"]
123
+ bulk_indicator = "Yes" if any(k in content.lower() for k in bulk_keywords) else "No"
124
+
125
+ # Contact detection (simple digit check)
126
+ contact_indicator = "Yes" if any(c.isdigit() for c in content) else "No"
127
+
128
+ html = f"""
129
+ <div style="border:1px solid #ddd;padding:10px;margin-bottom:8px;border-radius:6px;">
130
+ <p><strong>Risk:</strong> <span style="color:#b22222">{risk}</span> &nbsp;
131
+ <strong>User:</strong> @{user} &nbsp; <strong>Time:</strong> {timestamp}</p>
132
+ <p><strong>Location:</strong> {location} &nbsp;<td>{bulk_indicator}</td><td>{contact_indicator}</td><td>{content}</td><strong>Likes:</strong> {likes} &nbsp; <strong>RTs:</strong> {rts}</p>
133
+ <p style="background:#f7f7f7;padding:8px;border-radius:4px;">{content}</p>
134
+ <p><a href="{url}">View Tweet</a> | Tweet ID: {tweet_id}</p>
135
+ </div>
136
+ """
137
+ return html
138
+
139
def _compose_batched_email(tweets: List[dict]) -> MIMEMultipart:
    """Build one multipart (plain + HTML) alert email covering *tweets*.

    Layout: an optional "Top CRITICAL" summary table, then a full metrics
    table with one row per selected tweet, plus a plain-text fallback.

    Fixes over the previous version:
    - user-controlled fields (tweet text, usernames, locations, URLs) are
      HTML-escaped, so tweet content can no longer inject markup into the
      email body;
    - an explicit ``tweet_url`` now wins even when ``tweet_id`` is missing
      (the old operator precedence discarded it).
    """
    from html import escape  # stdlib; local import keeps module imports unchanged

    def _link(t: dict, user, tweet_id) -> str:
        # Prefer the stored URL; otherwise reconstruct one from user/id.
        return t.get("tweet_url") or (
            f"https://x.com/{user}/status/{tweet_id}" if tweet_id != "N/A" else "N/A"
        )

    msg = MIMEMultipart("alternative")
    msg["Subject"] = f"🚨 {len(tweets)} High-Priority Drug Alerts"
    msg["From"] = ALERT_EMAIL
    msg["To"] = ", ".join(RECIPIENTS)

    # --- Top CRITICAL summary (up to 10, highest dynamic risk first) ---
    critical_tweets = [t for t in tweets if t.get("risk_level") == "CRITICAL"]
    top_critical = sorted(critical_tweets, key=lambda t: t.get("dynamic_risk_score", 0), reverse=True)[:10]

    summary_html = ""
    if top_critical:
        summary_html += """
        <h3 style="color:#b22222;">Top CRITICAL Tweets Summary</h3>
        <table border="1" cellpadding="5" cellspacing="0" style="border-collapse: collapse;">
        <tr>
            <th>User</th><th>Dynamic Risk</th><th>Followers</th><th>Verified</th><th>Engagement</th><th>Location</th><th>Link</th>
        </tr>
        """
        for t in top_critical:
            user = t.get("username") or t.get("user") or "N/A"
            tweet_id = t.get("tweet_id", "N/A")
            verified = "Yes" if t.get("verified", False) else "No"
            engagement = int(t.get("like_count", 0)) + int(t.get("retweet_count", 0))
            summary_html += f"""
            <tr>
                <td>@{escape(str(user))}</td>
                <td>{t.get("dynamic_risk_score", 0)}</td>
                <td>{t.get("followers_count", 0)}</td>
                <td>{verified}</td>
                <td>{engagement}</td>
                <td>{escape(str(t.get("user_location", "N/A")))}</td>
                <td><a href="{escape(str(_link(t, user, tweet_id)))}">View</a></td>
            </tr>
            """
        summary_html += "</table><br>"

    # --- Main email table with all metrics ---
    html_blocks = ["""
    <table border="1" cellpadding="5" cellspacing="0" style="border-collapse: collapse;">
    <tr>
        <th>Risk</th><th>User</th><th>Dynamic Risk</th><th>Followers</th>
        <th>Verified</th><th>Engagement</th><th>Geo Score</th><th>Location</th>
        <th>Bulk</th><th>Contact</th><th>Content</th><th>Link</th>
    </tr>
    """]

    bulk_keywords = ["kg", "gram", "bulk", "kilos", "ounce", "pound"]
    for t in tweets:
        tweet_id = t.get("tweet_id", "N/A")
        user = t.get("username") or t.get("user") or t.get("user_name") or "N/A"
        content = t.get("content") or t.get("text") or ""
        location = str(t.get("user_location") or t.get("location") or "N/A").lower()
        verified = "Yes" if t.get("verified", False) else "No"
        engagement = int(t.get("like_count", 0)) + int(t.get("retweet_count", 0))
        geo_score = 1 if any(k in location for k in ["bangalore", "bengaluru", "karnataka"]) else 0

        # Bulk and contact indicators
        bulk_indicator = "Yes" if any(k in content.lower() for k in bulk_keywords) else "No"
        contact_indicator = "Yes" if any(c.isdigit() for c in content) else "No"

        html_blocks.append(f"""
        <tr>
            <td>{escape(str(t.get("risk_level", "N/A")))}</td>
            <td>@{escape(str(user))}</td>
            <td>{t.get("dynamic_risk_score", 0)}</td>
            <td>{t.get("followers_count", 0)}</td>
            <td>{verified}</td>
            <td>{engagement}</td>
            <td>{geo_score}</td>
            <td>{escape(location)}</td>
            <td>{bulk_indicator}</td>
            <td>{contact_indicator}</td>
            <td>{escape(content)}</td>
            <td><a href="{escape(str(_link(t, user, tweet_id)))}">View</a></td>
        </tr>
        """)

    html_blocks.append("</table>")

    html_text = f"""
    <html>
    <body>
        <h2 style="color:#b22222;">High-Priority Drug Alerts</h2>
        {summary_html}
        {''.join(html_blocks)}
        <hr/>
        <p>Generated by Karnataka Drug Crime Monitoring System</p>
    </body>
    </html>
    """

    # Plain-text fallback for clients that do not render HTML.
    plain_text = "\n".join(
        f"{t.get('risk_level')} | @{t.get('username')} | {t.get('dynamic_risk_score')} | {t.get('content','')[:100]}"
        for t in tweets
    )

    msg.attach(MIMEText(plain_text, "plain"))
    msg.attach(MIMEText(html_text, "html"))
    return msg
250
+
251
+
252
# --- SMTP send with retries --- #
def _send_email_message(msg: MIMEMultipart, recipients: List[str], retry: int = SEND_RETRY) -> bool:
    """Send *msg* via SMTP; return True on success, False otherwise.

    Makes up to ``retry + 1`` attempts, sleeping RETRY_DELAY seconds
    between failures. STARTTLS is negotiated when the port is 587.
    """
    attempt = 0
    while attempt <= retry:
        try:
            with smtplib.SMTP(SMTP_SERVER, SMTP_PORT, timeout=20) as s:
                s.ehlo()
                if SMTP_PORT == 587:
                    s.starttls()
                    s.ehlo()
                s.login(ALERT_EMAIL, ALERT_PASSWORD)
                s.sendmail(ALERT_EMAIL, recipients, msg.as_string())
                logging.info(f"Email sent to {recipients}")
                return True
        except Exception as e:
            attempt += 1
            logging.warning(f"Email send attempt {attempt} failed: {e}")
            if attempt > retry:
                logging.error("Exceeded email send retries.")
                return False
            time.sleep(RETRY_DELAY)
    # Reached only when retry < 0 (misconfigured env): previously this fell
    # through and implicitly returned None, breaking `if ok:` at call sites.
    return False
274
+
275
# --- Public trigger function --- #
def trigger_alerts(max_tweets: Optional[int] = 10,
                   send_all: bool = False,
                   separate_emails: bool = False):
    """Fetch unnotified high-risk tweets, email them, and mark them notified.

    max_tweets: cap on tweets per run (ignored when send_all is True).
    send_all: send every unnotified high-risk tweet.
    separate_emails: one email per tweet instead of a single batch.
    """
    logging.info("Fetching high-risk unnotified tweets from DB...")
    tweets = fetch_high_risk_unnotified()
    if not tweets:
        logging.info("No unnotified high-risk tweets found.")
        return

    # Recompute risk dynamically; this overrides any stored risk_level.
    for tweet in tweets:
        tweet["dynamic_risk_score"] = compute_dynamic_risk(tweet)
        tweet["risk_level"] = assign_dynamic_risk_level(tweet)

    selected = _select_top_tweets(tweets, max_tweets, send_all)
    if not selected:
        logging.info("No tweets selected after filtering.")
        return

    # Compose and send emails (batch or separate)
    success_ids, failure_ids = [], []

    if separate_emails:
        for tweet in selected:
            sent = _send_email_message(_compose_batched_email([tweet]), RECIPIENTS)
            if sent:
                success_ids.append((tweet.get("tweet_id"), tweet.get("_collection_name")))
            else:
                failure_ids.append(tweet.get("tweet_id"))
            time.sleep(BATCH_DELAY)  # pace outgoing mail
    else:
        sent = _send_email_message(_compose_batched_email(selected), RECIPIENTS)
        if sent:
            success_ids.extend((t.get("tweet_id"), t.get("_collection_name")) for t in selected)
        else:
            failure_ids.extend(t.get("tweet_id") for t in selected)

    # Flag successfully emailed tweets so they are not re-alerted.
    for tid, _collection in success_ids:
        try:
            mark_as_notified(tid)
        except Exception as e:
            logging.error(f"Failed to mark {tid} as notified: {e}")

    logging.info(f"Alerts sent: {len(success_ids)}; failures: {len(failure_ids)}")
    if failure_ids:
        logging.warning(f"Failed tweet IDs: {failure_ids}")
325
+
326
def compute_risk_probability(dynamic_score: float) -> float:
    """Convert a dynamic risk score (0-100) into a probability in [0, 1]."""
    # Clamp so out-of-range scores still yield a valid probability.
    scaled = dynamic_score / 100
    return max(0.0, min(scaled, 1.0))
331
+
332
+
333
# --- CLI usage example --- #
if __name__ == "__main__":
    # Other invocation patterns:
    #   trigger_alerts(max_tweets=5)                          # top 5, one batched email
    #   trigger_alerts(send_all=True)                         # every unnotified high-risk tweet
    #   trigger_alerts(max_tweets=10, separate_emails=True)   # one email per tweet
    #
    # Default: top 10 tweets in a single batched email.
    trigger_alerts(max_tweets=10)
src/db.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# db.py
"""MongoDB helpers for the drug-monitoring pipeline."""
import json
import os
import sys

import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient, UpdateOne

# Pull MONGO_URI (and friends) from a local .env file if present.
# (Previously imports were interleaved with executable code; grouping them
# at the top per PEP 8 changes nothing at runtime.)
load_dotenv()

# MongoDB configuration
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    # Abort immediately: every helper below needs a working connection URI.
    print("❌ MONGO_URI not set in environment variables.")
    sys.exit(1)

DB_NAME = "drug_monitoring_twitter"
COLLECTION_NAME = "tweets"
FOLDER_PATH = "drug_analysis_data_3months"  # folder with scraper outputs
19
+
20
def get_db_collection():
    """Connect to MongoDB and return the tweets collection.

    NOTE(review): opens a new MongoClient on every call and never closes
    it; callers rely on this behaviour, so it is preserved here.
    """
    db = MongoClient(MONGO_URI)[DB_NAME]

    # Create the collection explicitly on first use.
    if COLLECTION_NAME not in db.list_collection_names():
        db.create_collection(COLLECTION_NAME)
        print(f"✅ Created collection: {COLLECTION_NAME}")

    return db[COLLECTION_NAME]
30
+
31
def insert_all_from_folder(folder_path=FOLDER_PATH):
    """Upsert all CSV/JSON files from the scraper folder into MongoDB.

    Bug fix: ``notified`` is now written via ``$setOnInsert`` so that
    re-running the ingest no longer resets the flag on tweets that were
    already alerted. The old code put ``notified: False`` inside ``$set``,
    which re-flagged (and re-alerted) every previously notified tweet on
    each ingest run.
    """
    collection = get_db_collection()

    if not os.path.exists(folder_path):
        print(f"❌ Folder path does not exist: {folder_path}")
        return

    def _tweet_upsert(doc):
        """Build an UpdateOne for a tweet doc, or None if it has no tweet_id."""
        if "tweet_id" not in doc:
            return None
        # Drop any incoming 'notified' value: it must not clash with
        # $setOnInsert, and the DB copy is authoritative for that flag.
        doc = {k: v for k, v in doc.items() if k != "notified"}
        return UpdateOne(
            {"tweet_id": doc["tweet_id"]},
            {"$set": doc, "$setOnInsert": {"notified": False}},
            upsert=True,
        )

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        operations = []

        try:
            if file_name.endswith(".csv"):
                df = pd.read_csv(file_path, encoding="utf-8")
                for _, row in df.iterrows():
                    op = _tweet_upsert(row.to_dict())
                    if op is not None:
                        operations.append(op)

            elif file_name.endswith(".json"):
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    for tweet in data:
                        op = _tweet_upsert(tweet)
                        if op is not None:
                            operations.append(op)
                else:
                    # single JSON report, keyed by file name
                    operations.append(
                        UpdateOne({"report_name": file_name}, {"$set": data}, upsert=True)
                    )

            if operations:
                result = collection.bulk_write(operations)
                print(f"✅ {file_name} -> inserted/updated {result.upserted_count + result.modified_count} documents.")

        except Exception as e:
            print(f"❌ Failed to process {file_name}: {e}")
76
+
77
def fetch_high_risk_unnotified():
    """Return all HIGH/CRITICAL tweets that have not been alerted yet."""
    query = {"risk_level": {"$in": ["HIGH", "CRITICAL"]}, "notified": False}
    return list(get_db_collection().find(query))
81
+
82
def mark_as_notified(tweet_id):
    """Flag one tweet as alerted so future fetches skip it."""
    get_db_collection().update_one(
        {"tweet_id": tweet_id},
        {"$set": {"notified": True}},
    )
86
+
87
if __name__ == "__main__":
    # CLI entry point: bulk-load the scraper output folder into MongoDB.
    insert_all_from_folder()
    print("✅ All scraper folder contents inserted/updated successfully.")
src/enhanced_drug_crime_scraper_3months.py ADDED
@@ -0,0 +1,774 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # enhanced_drug_crime_scraper_3months.py
2
+ # Enhanced Twitter Scraper for Karnataka Police Drug Crime Analysis - Latest 3 Months
3
+ # Automatically collects data for the most recent 3 months
4
+
5
+ import asyncio
6
+ import pandas as pd
7
+ from twscrape import API, gather
8
+ from fuzzywuzzy import fuzz
9
+ from datetime import timezone, timedelta, datetime
10
+ import logging
11
+ import sys
12
+ import os
13
+ import re
14
+ from collections import Counter
15
+ import json
16
+ import hashlib
17
+ from typing import List, Dict, Set, Tuple, Optional
18
+ import warnings
19
+ warnings.filterwarnings('ignore')
20
+
21
# Sentiment analysis (optional): NLTK is a soft dependency — if importing
# it or downloading its corpora fails for any reason, sentiment scoring is
# disabled rather than crashing the scraper.
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    SENTIMENT_AVAILABLE = True
except Exception:  # narrowed from bare `except:` so Ctrl-C / SystemExit propagate
    SENTIMENT_AVAILABLE = False
    print("Warning: NLTK not available. Sentiment analysis will be disabled.")
32
+
33
# Setup comprehensive logging: a persistent file (audit trail) plus
# stdout (live progress while the scraper runs).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('drug_crime_scraper_3months.log'),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
43
+
44
+ class DrugCrimeScraper3Months:
45
    def __init__(self):
        """Initialise API handle, optional sentiment analyzer and keyword tables."""
        # Set later by the async setup code (twscrape API client) — None until then.
        self.api = None
        self.sentiment_analyzer = None
        # Tweet ids already processed, to de-duplicate across overlapping queries.
        self.seen_ids: Set[str] = set()
        self.collected_tweets: List[Dict] = []

        # Initialize sentiment analyzer if available
        if SENTIMENT_AVAILABLE:
            try:
                self.sentiment_analyzer = SentimentIntensityAnalyzer()
            except:
                logger.warning("Sentiment analyzer initialization failed")

        # Enhanced drug-related keywords with criminal context
        # (matched fuzzily against tweet text elsewhere in this class).
        self.drug_keywords = [
            # Cannabis variants
            "weed", "ganja", "hash", "hashish", "charas", "cannabis", "marijuana", "dope", "pot", "grass",
            "bhang", "nasha", "nashe", "maal", "stuff", "green", "herb", "mary jane", "bud", "kush",

            # Hard drugs
            "acid", "lsd", "mdma", "ecstasy", "molly", "coke", "cocaine", "crack", "meth", "crystal",
            "heroin", "brown sugar", "smack", "ketamine", "special k", "oxy", "oxycodone", "percocet",
            "adderall", "xanax", "benzo", "valium", "diazepam", "fentanyl", "tramadol", "codeine",

            # Drug trade terms
            "dealer", "peddler", "supplier", "pusher", "connect", "plug", "score", "scored",
            "selling", "buying", "supply", "delivery", "pickup", "drop", "stash", "cache",
            "package", "parcel", "consignment", "shipment", "bulk", "wholesale", "retail",

            # Consumption terms
            "trip", "tripping", "high", "stoned", "blazed", "hit", "dose", "line", "bump",
            "joint", "blunt", "spliff", "bong", "pipe", "chillum", "roll", "smoke",
            "inject", "snort", "pop", "drop", "chase", "shoot up", "mainline",

            # Slang and coded terms
            "420", "710", "party supplies", "study material", "medicine", "remedy",
            "good stuff", "quality stuff", "premium", "imported", "local", "organic",
            "fresh", "clean", "pure", "top shelf", "fire", "dank",

            # Emoji and coded representations (leetspeak variants included)
            "🍁", "💨", "🔥", "🌿", "💊", "💉", "🚬", "🍄", "❄️", "💎", "🌪️", "🔋",
            "w33d", "g4nja", "h4sh", "c0ke", "m0lly", "xtc", "l$d", "m@al"
        ]

        # Criminal activity indicators (law-enforcement / trafficking vocabulary)
        self.crime_indicators = [
            "arrest", "arrested", "caught", "raid", "police", "cops", "seized", "confiscated",
            "bust", "busted", "investigation", "case", "fir", "complaint", "crime", "criminal",
            "illegal", "banned", "prohibited", "contraband", "smuggling", "trafficking",
            "possession", "distribution", "manufacturing", "cultivation", "racket", "cartel",
            "kingpin", "network", "operation", "crackdown", "surveillance", "undercover",
            "ncb", "dri", "excise", "customs", "border", "interstate", "mafia"
        ]

        # Karnataka location keywords (enhanced) — pre-normalized for matching.
        self.karnataka_keywords = self._prepare_karnataka_keywords()

        # High-risk areas in Karnataka (mostly Bengaluru neighbourhoods);
        # a match here bumps the risk score in _assess_risk_level.
        self.high_risk_areas = [
            "malleswaram", "shivaji nagar", "frazer town", "russell market", "chickpet",
            "kr market", "city market", "commercial street", "brigade road", "mg road",
            "hosur road", "electronic city", "silk board", "bommanahalli", "btm layout",
            "jayanagar", "basavanagudi", "hanumanthanagar", "banashankari", "girinagar",
            "vijayanagar", "rajajinagar", "mahalakshmi layout", "peenya", "jalahalli",
            "hebbal", "rt nagar", "kammanahalli", "banaswadi", "krishnarajapuram",
            "whitefield", "marathahalli", "brookefield", "varthur", "sarjapur",
            "koramangala", "indiranagar", "domlur", "ulsoor", "vasanth nagar",
            "sadashiva nagar", "seshadripuram", "gandhinagar", "padhmanabhanagar"
        ]
114
+
115
    def _prepare_karnataka_keywords(self) -> List[str]:
        """Prepare normalized Karnataka location keywords.

        Returns each raw keyword run through _normalize_text (lower-cased,
        punctuation collapsed) so fuzzy matching is consistent.
        """
        raw_keywords = [
            # Main cities and districts
            "karnataka", "bengaluru", "bangalore", "blr", "b'lore", "namma bengaluru", "namma blr",
            "mysore", "mysuru", "mangalore", "mangaluru", "hubli", "dharwad", "belgaum", "belagavi",
            "ballari", "bellary", "bijapur", "vijayapura", "tumkur", "davangere", "shimoga", "shivamogga",
            "hassan", "chitradurga", "gadag", "bidar", "raichur", "bagalkot", "haveri", "koppal",
            "ramanagara", "chikkamagaluru", "yadgir", "mandya", "coorg", "kodagu", "karwar",
            "kolar", "chikkaballapur", "madikeri", "udupi", "manipal",

            # Bangalore areas and neighborhoods
            "shivaji nagar", "majestic", "btm", "indiranagar", "jayanagar", "koramangala",
            "whitefield", "banashankari", "hebbal", "rajajinagar", "yeshwanthpur", "kengeri",
            "marathahalli", "hoskote", "rt nagar", "rr nagar", "bsk", "mg road", "brigade road",
            "yelhanka", "basavanagudi", "malleswaram", "seshadripuram", "gandhinagar",
            "electronic city", "silk board", "bommanahalli", "peenya", "jalahalli",
            "vijayanagar", "rajajinagar", "kammanahalli", "banaswadi", "kr puram",
            "domlur", "ulsoor", "vasanth nagar", "sadashiva nagar", "frazer town",
            "commercial street", "chickpet", "kr market", "city market", "russell market",

            # Tourist and party destinations (potential drug hotspots)
            "hampi", "gokarna", "coorg", "chikmagalur", "nandi hills", "skandagiri",
            "br hills", "kudremukh", "jog falls", "murdeshwar", "karwar beach",
            "kabini", "bandipur", "nagarhole", "sakleshpur", "kemmannagundi"
        ]

        return [self._normalize_text(kw) for kw in raw_keywords]
143
+
144
+ def _normalize_text(self, text: str) -> str:
145
+ """Normalize text for better matching."""
146
+ if not isinstance(text, str):
147
+ return ""
148
+ return re.sub(r'\W+', ' ', text.lower()).strip()
149
+
150
+ def _fuzzy_match(self, text: str, keyword_list: List[str], threshold: int = 75) -> bool:
151
+ """Check if any keyword fuzzy matches text above threshold."""
152
+ text_norm = self._normalize_text(text)
153
+ if not text_norm:
154
+ return False
155
+ return any(fuzz.partial_ratio(text_norm, kw) >= threshold for kw in keyword_list)
156
+
157
+ def _compute_relevance_score(self, text: str, keyword_list: List[str], threshold: int = 70) -> int:
158
+ """Count how many keywords match text above threshold."""
159
+ text_norm = self._normalize_text(text)
160
+ return sum(1 for kw in keyword_list if fuzz.partial_ratio(text_norm, kw) >= threshold)
161
+
162
+ def _analyze_sentiment(self, text: str) -> Dict[str, float]:
163
+ """Analyze sentiment of tweet text."""
164
+ if not self.sentiment_analyzer:
165
+ return {"compound": 0.0, "pos": 0.0, "neu": 0.0, "neg": 0.0}
166
+
167
+ try:
168
+ scores = self.sentiment_analyzer.polarity_scores(text)
169
+ return scores
170
+ except:
171
+ return {"compound": 0.0, "pos": 0.0, "neu": 0.0, "neg": 0.0}
172
+
173
+ def _extract_mentions_hashtags(self, text: str) -> Tuple[List[str], List[str]]:
174
+ """Extract mentions and hashtags from tweet text."""
175
+ mentions = re.findall(r'@(\w+)', text)
176
+ hashtags = re.findall(r'#(\w+)', text.lower())
177
+ return mentions, hashtags
178
+
179
+ def _detect_phone_numbers(self, text: str) -> List[str]:
180
+ """Detect potential phone numbers in text."""
181
+ # Indian phone number patterns
182
+ patterns = [
183
+ r'\b[6-9]\d{9}\b', # 10-digit mobile
184
+ r'\+91[6-9]\d{9}\b', # With +91
185
+ r'\b91[6-9]\d{9}\b', # With 91
186
+ r'\b[6-9]\d{4}\s?\d{5}\b', # With space
187
+ ]
188
+
189
+ phone_numbers = []
190
+ for pattern in patterns:
191
+ phone_numbers.extend(re.findall(pattern, text))
192
+
193
+ return list(set(phone_numbers)) # Remove duplicates
194
+
195
+ def _assess_risk_level(self, tweet_data: Dict) -> str:
196
+ """Assess risk level based on various factors."""
197
+ risk_score = 0
198
+
199
+ # Check for crime indicators
200
+ if self._fuzzy_match(tweet_data['content'], self.crime_indicators, 70):
201
+ risk_score += 3
202
+
203
+ # Check for high drug relevance
204
+ if tweet_data['drug_score'] > 3:
205
+ risk_score += 2
206
+
207
+ # Check for high-risk locations
208
+ location_text = f"{tweet_data['user_location']} {tweet_data['content']}".lower()
209
+ if any(area in location_text for area in self.high_risk_areas):
210
+ risk_score += 2
211
+
212
+ # Check for contact information
213
+ if self._detect_phone_numbers(tweet_data['content']):
214
+ risk_score += 3
215
+
216
+ # Check for selling/buying indicators
217
+ selling_terms = ["selling", "available", "dm me", "contact", "call", "whatsapp", "delivery", "pickup"]
218
+ if any(term in tweet_data['content'].lower() for term in selling_terms):
219
+ risk_score += 2
220
+
221
+ # Check for quantity indicators (bulk operations)
222
+ quantity_terms = ["kg", "gram", "ounce", "pound", "bulk", "wholesale", "lots", "kilos"]
223
+ if any(term in tweet_data['content'].lower() for term in quantity_terms):
224
+ risk_score += 1
225
+
226
+ # Assess sentiment - negative sentiment might indicate problems
227
+ if tweet_data['sentiment']['compound'] < -0.5:
228
+ risk_score += 1
229
+
230
+ # High engagement (viral content)
231
+ if tweet_data.get('like_count', 0) > 100 or tweet_data.get('retweet_count', 0) > 50:
232
+ risk_score += 1
233
+
234
+ # Determine risk level
235
+ if risk_score >= 8:
236
+ return "CRITICAL"
237
+ elif risk_score >= 6:
238
+ return "HIGH"
239
+ elif risk_score >= 3:
240
+ return "MEDIUM"
241
+ else:
242
+ return "LOW"
243
+
244
+ def _is_karnataka_relevant(self, tweet) -> bool:
245
+ """Enhanced Karnataka relevance check."""
246
+ # User location check
247
+ if tweet.user.location and self._fuzzy_match(tweet.user.location, self.karnataka_keywords, 80):
248
+ return True
249
+
250
+ # User description check
251
+ user_desc = getattr(tweet.user, 'description', '') or ''
252
+ if self._fuzzy_match(user_desc, self.karnataka_keywords, 80):
253
+ return True
254
+
255
+ # Tweet content check
256
+ tweet_text = tweet.rawContent or ""
257
+ if self._fuzzy_match(tweet_text, self.karnataka_keywords, 75):
258
+ return True
259
+
260
+ # Hashtags check
261
+ hashtags = getattr(tweet, 'hashtags', []) or []
262
+ hashtags_text = ' '.join(hashtags).lower()
263
+ if any(kw in hashtags_text for kw in self.karnataka_keywords):
264
+ return True
265
+
266
+ return False
267
+
268
+ async def _search_with_retry(self, query: str, limit: int, retries: int = 3, delay: int = 10) -> List:
269
+ """Search tweets with enhanced retry logic."""
270
+ for attempt in range(retries):
271
+ try:
272
+ logger.info(f"Searching with query: {query[:50]}... (Attempt {attempt + 1})")
273
+ tweets = await gather(self.api.search(query, limit=limit))
274
+ logger.info(f"Retrieved {len(tweets)} tweets")
275
+ return tweets
276
+ except Exception as e:
277
+ logger.error(f"Attempt {attempt + 1} failed for query {query[:30]}...: {e}")
278
+ if attempt < retries - 1:
279
+ await asyncio.sleep(delay * (attempt + 1)) # Exponential backoff
280
+
281
+ logger.error(f"All attempts failed for query: {query[:30]}...")
282
+ return []
283
+
284
    async def _collect_tweets_for_period(self, start_date: str, end_date: str, limit_per_query: int = 200) -> List[Dict]:
        """Collect tweets for a specific time period with enhanced search strategies for latest 3 months.

        start_date/end_date: date strings interpolated into Twitter search
        operators (since:/until:) — presumably YYYY-MM-DD; TODO confirm
        against the caller.
        Returns processed tweet dicts that passed the Karnataka filter,
        de-duplicated against self.seen_ids across all queries.
        """
        tweets_data = []

        # Enhanced search queries optimized for recent data (last 3 months).
        # Grouped by strategy: direct drug terms, criminal activity,
        # location-specific slang, coded language, and official/news sources.
        search_queries = [
            # Primary drug-related searches
            f"(ganja OR weed OR charas OR hash OR cannabis OR marijuana) (karnataka OR bengaluru OR bangalore OR mysore) lang:en since:{start_date} until:{end_date}",
            f"(cocaine OR heroin OR mdma OR ecstasy OR drugs OR narcotics) (karnataka OR bengaluru OR bangalore) lang:en since:{start_date} until:{end_date}",

            # Criminal activity searches
            f"(dealer OR supplier OR selling OR buying) (drugs OR maal OR stuff) (karnataka OR bengaluru) since:{start_date} until:{end_date}",
            f"(police OR arrest OR raid OR bust OR seized) (drugs OR narcotics OR ganja) karnataka since:{start_date} until:{end_date}",

            # Location-specific searches
            f"(drugs OR ganja OR weed) (shivaji nagar OR malleswaram OR btm OR koramangala) since:{start_date} until:{end_date}",
            f"(maal OR stuff OR hash) (indiranagar OR whitefield OR electronic city OR jayanagar) since:{start_date} until:{end_date}",

            # Coded language searches
            f"(420 OR party supplies OR green OR herb) (bengaluru OR bangalore OR karnataka) since:{start_date} until:{end_date}",
            f"(delivery OR pickup OR dm me OR contact) (stuff OR green OR maal) karnataka since:{start_date} until:{end_date}",

            # News and official searches
            f"(ncb OR narcotics control OR drug bust OR seizure) karnataka since:{start_date} until:{end_date}",
            f"(drug trafficking OR smuggling OR peddling) (karnataka OR bengaluru OR mysore) since:{start_date} until:{end_date}"
        ]

        for query in search_queries:
            logger.info(f"Executing query: {query[:100]}...")
            tweets = await self._search_with_retry(query, limit_per_query)

            for tweet in tweets:
                # Skip ids already seen in earlier (overlapping) queries.
                if tweet.id in self.seen_ids:
                    continue

                # Karnataka relevance filter
                if not self._is_karnataka_relevant(tweet):
                    continue

                self.seen_ids.add(tweet.id)
                tweet_data = self._process_tweet(tweet)
                if tweet_data:
                    tweets_data.append(tweet_data)

            # Rate limiting between queries
            await asyncio.sleep(8)

        logger.info(f"Collected {len(tweets_data)} relevant tweets for period {start_date} to {end_date}")
        return tweets_data
333
+
334
def _process_tweet(self, tweet) -> Optional[Dict]:
    """Normalize one raw tweet into the analysis record used downstream.

    Returns None for tweets with no usable text (blank or < 10 chars) or
    when processing fails for any reason; errors are logged rather than
    propagated so one malformed tweet cannot abort a collection run.
    """
    try:
        # Basic tweet information. Every optional user/tweet attribute is
        # read via getattr so schema differences between twscrape versions
        # cannot raise AttributeError (previously `tweet.user.location`
        # was accessed directly, inconsistent with the other attributes).
        user_location = getattr(tweet.user, 'location', '') or ""
        user_description = getattr(tweet.user, 'description', '') or ""
        tweet_text = tweet.rawContent or ""
        hashtags = getattr(tweet, 'hashtags', []) or []

        # Skip if no meaningful content
        if not tweet_text.strip() or len(tweet_text) < 10:
            return None

        # Karnataka relevance: profile location weighted highest, then
        # tweet text, then the user's bio.
        kar_score = (
            4 * self._compute_relevance_score(user_location, self.karnataka_keywords) +
            2 * self._compute_relevance_score(user_description, self.karnataka_keywords) +
            3 * self._compute_relevance_score(tweet_text, self.karnataka_keywords)
        )

        drug_score = self._compute_relevance_score(tweet_text, self.drug_keywords, 80)
        crime_score = self._compute_relevance_score(tweet_text, self.crime_indicators, 75)

        # Extract additional information (extracted hashtags are unused;
        # the record keeps the API-provided hashtag list instead).
        mentions, _ = self._extract_mentions_hashtags(tweet_text)
        phone_numbers = self._detect_phone_numbers(tweet_text)
        sentiment = self._analyze_sentiment(tweet_text)

        # Convert the timestamp to IST (UTC+5:30); naive datetimes are
        # assumed to be UTC before conversion — TODO confirm upstream.
        ist = timezone(timedelta(hours=5, minutes=30))
        utc_time = tweet.date if tweet.date.tzinfo else tweet.date.replace(tzinfo=timezone.utc)
        ist_time = utc_time.astimezone(ist)

        # Flat record consumed by the CSV export and risk assessment.
        tweet_data = {
            "tweet_id": str(tweet.id),
            "datetime": ist_time.strftime("%d-%m-%Y %H:%M:%S"),
            "username": tweet.user.username,
            "user_display_name": getattr(tweet.user, 'displayname', '') or '',
            "user_followers": getattr(tweet.user, 'followersCount', 0),
            "user_following": getattr(tweet.user, 'followingCount', 0),
            "user_verified": getattr(tweet.user, 'verified', False),
            "content": tweet_text,
            "user_location": user_location,
            "user_description": user_description,
            "hashtags": ' '.join(hashtags),
            "mentions": ', '.join(mentions),
            "phone_numbers": ', '.join(phone_numbers),
            "tweet_url": f"https://x.com/{tweet.user.username}/status/{tweet.id}",
            "retweet_count": getattr(tweet, 'retweetCount', 0),
            "like_count": getattr(tweet, 'likeCount', 0),
            "reply_count": getattr(tweet, 'replyCount', 0),
            "kar_score": kar_score,
            "drug_score": drug_score,
            "crime_score": crime_score,
            "sentiment": sentiment,
            "sentiment_compound": sentiment.get('compound', 0),
            "is_drug_related": drug_score > 0,
            "is_crime_related": crime_score > 0,
            "has_contact_info": len(phone_numbers) > 0,
            "risk_level": "",  # Will be calculated below
            # Non-cryptographic fingerprint used only for deduplication.
            "content_hash": hashlib.md5(tweet_text.encode()).hexdigest()
        }

        # Assess risk level
        tweet_data["risk_level"] = self._assess_risk_level(tweet_data)

        return tweet_data

    except Exception as e:
        logger.error(f"Error processing tweet {getattr(tweet, 'id', 'unknown')}: {e}")
        return None
406
+
407
async def initialize_api(self) -> bool:
    """Initialize the Twitter API connection.

    Returns True when at least one scraping account is configured in the
    twscrape pool; False (with setup instructions logged) otherwise.
    """
    try:
        self.api = API()
        pool_accounts = await self.api.pool.get_all()

        if not pool_accounts:
            for message in (
                "No Twitter accounts found. Please add accounts using: twscrape add_account",
                "Setup instructions:",
                "1. twscrape add_account username email password",
                "2. twscrape login_accounts",
            ):
                logger.error(message)
            return False

        logger.info(f"Initialized with {len(pool_accounts)} accounts: {[a.username for a in pool_accounts]}")
        return True

    except Exception as exc:
        logger.error(f"Failed to initialize API: {exc}")
        return False
426
+
427
+ def _ensure_output_directory(self, directory: str = "drug_analysis_data_3months"):
428
+ """Create output directory if it doesn't exist."""
429
+ if not os.path.exists(directory):
430
+ os.makedirs(directory)
431
+ logger.info(f"Created directory: {directory}")
432
+
433
def _save_data(self, data: List[Dict], base_filename: str):
    """Save data in multiple formats with detailed analysis.

    Writes the full dataset plus filtered views (high-risk, drug-related,
    contact-info, per-risk-level) as CSVs under the hard-coded
    ``drug_analysis_data_3months/`` directory, then triggers the summary
    report.

    NOTE(review): ``base_filename`` is currently unused — file names are
    derived from the data's date range and a timestamp instead.
    """
    if not data:
        logger.warning("No data to save")
        return

    df = pd.DataFrame(data)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Calculate the actual date range from data (the "datetime" strings
    # were written by _process_tweet in day-first IST format).
    if 'datetime' in df.columns:
        df['date_parsed'] = pd.to_datetime(df['datetime'], format="%d-%m-%Y %H:%M:%S", errors='coerce')
        # NOTE(review): if every value fails to parse, min()/max() are NaT
        # and strftime raises — assumes at least one valid timestamp.
        date_range = f"3months_{df['date_parsed'].min().strftime('%Y%m%d')}_to_{df['date_parsed'].max().strftime('%Y%m%d')}"
    else:
        date_range = "latest_3months"

    # Save main dataset (includes the helper 'date_parsed' column added above)
    main_file = f"drug_analysis_data_3months/karnataka_drug_tweets_{date_range}_{timestamp}.csv"
    df.to_csv(main_file, index=False, encoding='utf-8')
    logger.info(f"Saved main dataset: {main_file}")

    # Save high-risk tweets separately (HIGH PRIORITY FOR POLICE)
    high_risk_df = df[df['risk_level'].isin(['HIGH', 'CRITICAL'])]
    if not high_risk_df.empty:
        risk_file = f"drug_analysis_data_3months/HIGH_PRIORITY_tweets_{date_range}_{timestamp}.csv"
        high_risk_df.to_csv(risk_file, index=False, encoding='utf-8')
        logger.info(f"🚨 Saved HIGH PRIORITY tweets: {risk_file}")

    # Save drug-related tweets
    drug_df = df[df['is_drug_related'] == True]
    if not drug_df.empty:
        drug_file = f"drug_analysis_data_3months/drug_related_{date_range}_{timestamp}.csv"
        drug_df.to_csv(drug_file, index=False, encoding='utf-8')
        logger.info(f"Saved drug-related tweets: {drug_file}")

    # Save tweets with contact information (INVESTIGATION LEADS)
    contact_df = df[df['has_contact_info'] == True]
    if not contact_df.empty:
        contact_file = f"drug_analysis_data_3months/CONTACT_INFO_tweets_{date_range}_{timestamp}.csv"
        contact_df.to_csv(contact_file, index=False, encoding='utf-8')
        logger.info(f"🔍 Saved tweets with contact info: {contact_file}")

    # Save one file per risk level (CRITICAL/HIGH rows also appear in the
    # HIGH_PRIORITY file above — intentional duplication for triage).
    for risk_level in ['CRITICAL', 'HIGH', 'MEDIUM']:
        risk_df = df[df['risk_level'] == risk_level]
        if not risk_df.empty:
            risk_level_file = f"drug_analysis_data_3months/{risk_level}_RISK_{date_range}_{timestamp}.csv"
            risk_df.to_csv(risk_level_file, index=False, encoding='utf-8')
            logger.info(f"Saved {risk_level} risk tweets: {risk_level_file}")

    # Generate summary report
    self._generate_summary_report(df, timestamp, date_range)
485
+
486
def _generate_summary_report(self, df: pd.DataFrame, timestamp: str, date_range: str):
    """Generate a comprehensive summary report for police analysis.

    Builds a nested statistics dict from the collected DataFrame, writes
    it as JSON next to the CSVs, and prints an executive summary to the
    console. Assumes ``df`` is non-empty (callers only invoke this after
    data has been collected) — several percentages divide by the total
    tweet count.
    """
    report = {
        "analysis_metadata": {
            "analysis_timestamp": timestamp,
            "date_range": date_range,
            # NOTE(review): this counts unique *timestamps*, not days —
            # the key name overstates what is measured.
            "collection_period_days": len(df['datetime'].unique()) if 'datetime' in df.columns else 0,
            "scraper_version": "3_months_automatic",
            # Matches the hard-coded query list in _collect_tweets_for_period.
            "total_queries_executed": 10,
            "karnataka_police_authorization": "DRUG_CRIME_ANALYSIS_2024"
        },
        "summary_statistics": {
            "total_tweets": len(df),
            "drug_related_tweets": int(df['is_drug_related'].sum()),
            "crime_related_tweets": int(df['is_crime_related'].sum()),
            "tweets_with_contact_info": int(df['has_contact_info'].sum()),
            "unique_users": int(df['username'].nunique()),
            "verified_users": int(df['user_verified'].sum()),
            "average_followers": int(df['user_followers'].mean()) if len(df) > 0 else 0
        },
        "risk_analysis": {
            "risk_distribution": df['risk_level'].value_counts().to_dict(),
            "critical_alerts": int((df['risk_level'] == 'CRITICAL').sum()),
            "high_priority": int((df['risk_level'] == 'HIGH').sum()),
            "investigation_leads": int(df['has_contact_info'].sum())
        },
        "geographic_analysis": {
            "top_locations": df['user_location'].value_counts().head(15).to_dict(),
            # Counts how many known hotspot areas appear in at least one tweet.
            "high_risk_areas_mentioned": sum(1 for area in self.high_risk_areas
                                             if any(area in tweet.lower() for tweet in df['content']))
        },
        "content_analysis": {
            # VADER-style compound-score buckets: >0.1 positive, <-0.1 negative.
            "sentiment_distribution": {
                "positive": int((df['sentiment_compound'] > 0.1).sum()),
                "neutral": int((df['sentiment_compound'].between(-0.1, 0.1)).sum()),
                "negative": int((df['sentiment_compound'] < -0.1).sum())
            },
            "top_hashtags": df['hashtags'].str.split().explode().value_counts().head(25).to_dict(),
            "most_mentioned_users": df['mentions'].str.split(', ').explode().value_counts().head(20).to_dict(),
            "average_drug_score": float(df['drug_score'].mean()) if len(df) > 0 else 0,
            "average_crime_score": float(df['crime_score'].mean()) if len(df) > 0 else 0
        },
        "user_analysis": {
            "high_activity_users": df['username'].value_counts().head(15).to_dict(),
            "most_followed_users": df.nlargest(10, 'user_followers')[['username', 'user_followers']].to_dict('records'),
            "users_with_contact_info": df[df['has_contact_info']]['username'].tolist()
        },
        "temporal_analysis": {
            # NOTE(review): datetimes are "dd-mm-YYYY ..." strings, so the
            # [:10] prefix sorts lexicographically by day-of-month, not
            # chronologically.
            "tweets_by_date": df['datetime'].str[:10].value_counts().sort_index().to_dict() if 'datetime' in df.columns else {},
            "peak_activity_days": df['datetime'].str[:10].value_counts().head(10).to_dict() if 'datetime' in df.columns else {}
        },
        "investigation_priorities": {
            "immediate_action_required": len(df[df['risk_level'].isin(['CRITICAL', 'HIGH'])]),
            "contact_information_available": len(df[df['has_contact_info']]),
            # Keyword heuristics for quantity/trafficking indicators.
            "bulk_operation_indicators": len(df[df['content'].str.contains('kg|gram|bulk|wholesale', case=False, na=False)]),
            "cross_border_mentions": len(df[df['content'].str.contains('interstate|border|import', case=False, na=False)])
        }
    }

    # Save comprehensive report as JSON (default=str handles pandas types).
    report_file = f"drug_analysis_data_3months/POLICE_ANALYSIS_REPORT_{date_range}_{timestamp}.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False, default=str)

    logger.info(f"📊 Generated comprehensive police report: {report_file}")

    # Print executive summary to console
    print("\n" + "="*80)
    print("KARNATAKA POLICE DRUG CRIME ANALYSIS REPORT - LATEST 3 MONTHS")
    print("="*80)
    print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Period analyzed: Latest 3 months ({date_range})")
    print(f"Total tweets analyzed: {report['summary_statistics']['total_tweets']}")
    print(f"Drug-related tweets: {report['summary_statistics']['drug_related_tweets']} ({report['summary_statistics']['drug_related_tweets']/report['summary_statistics']['total_tweets']*100:.1f}%)")
    print(f"Crime-related tweets: {report['summary_statistics']['crime_related_tweets']} ({report['summary_statistics']['crime_related_tweets']/report['summary_statistics']['total_tweets']*100:.1f}%)")
    print(f"Tweets with contact info: {report['summary_statistics']['tweets_with_contact_info']} (🚨 INVESTIGATION LEADS)")
    print(f"Unique users: {report['summary_statistics']['unique_users']}")
    print(f"Verified users: {report['summary_statistics']['verified_users']}")

    print("\n🚨 RISK LEVEL DISTRIBUTION:")
    total_tweets = sum(report['risk_analysis']['risk_distribution'].values())
    for risk, count in report['risk_analysis']['risk_distribution'].items():
        percentage = (count/total_tweets*100) if total_tweets > 0 else 0
        emoji = "🔴" if risk == "CRITICAL" else "🟠" if risk == "HIGH" else "🟡" if risk == "MEDIUM" else "🟢"
        print(f"  {emoji} {risk}: {count} ({percentage:.1f}%)")

    print("\n📍 TOP LOCATIONS (First 10):")
    for i, (location, count) in enumerate(list(report['geographic_analysis']['top_locations'].items())[:10], 1):
        print(f"  {i:2d}. {location}: {count} tweets")

    print("\n💭 SENTIMENT ANALYSIS:")
    for sentiment, count in report['content_analysis']['sentiment_distribution'].items():
        percentage = (count/report['summary_statistics']['total_tweets']*100) if report['summary_statistics']['total_tweets'] > 0 else 0
        print(f"  {sentiment.capitalize()}: {count} ({percentage:.1f}%)")

    print("\n👥 HIGH ACTIVITY USERS (Top 5):")
    for i, (user, count) in enumerate(list(report['user_analysis']['high_activity_users'].items())[:5], 1):
        print(f"  {i}. @{user}: {count} tweets")

    print("\n🎯 IMMEDIATE INVESTIGATION PRIORITIES:")
    priorities = report['investigation_priorities']
    if priorities['immediate_action_required'] > 0:
        print(f"  🔴 URGENT: {priorities['immediate_action_required']} high/critical risk tweets requiring immediate review")
    if priorities['contact_information_available'] > 0:
        print(f"  🔍 LEADS: {priorities['contact_information_available']} tweets contain contact information")
    if priorities['bulk_operation_indicators'] > 0:
        print(f"  📦 BULK OPS: {priorities['bulk_operation_indicators']} tweets mention bulk operations")
    if priorities['cross_border_mentions'] > 0:
        print(f"  🌐 INTERSTATE: {priorities['cross_border_mentions']} tweets mention cross-border activity")

    print("\n📁 OUTPUT FILES GENERATED:")
    print(f"  • Main dataset: karnataka_drug_tweets_{date_range}_{timestamp}.csv")
    print(f"  • High priority: HIGH_PRIORITY_tweets_{date_range}_{timestamp}.csv")
    print(f"  • Contact info: CONTACT_INFO_tweets_{date_range}_{timestamp}.csv")
    print(f"  • Analysis report: POLICE_ANALYSIS_REPORT_{date_range}_{timestamp}.json")

    print("\n✅ NEXT STEPS:")
    print("  1. Review HIGH_PRIORITY_tweets file for immediate action")
    print("  2. Investigate users in CONTACT_INFO_tweets file")
    print("  3. Cross-reference with existing case files")
    print("  4. Monitor high-activity users for ongoing surveillance")
    print("="*80)
608
+
609
async def run_analysis(self, start_date: str, end_date: str, tweets_per_day: int = 200):
    """Run the complete drug crime analysis for latest 3 months.

    Orchestrates the full pipeline: API setup, chunked collection over
    7-day windows, content-hash deduplication, and saving/reporting.
    Returns the list of unique tweet records, or None when the API cannot
    be initialized or no tweets were collected.

    NOTE(review): ``tweets_per_day`` is actually passed through as the
    per-query result limit for each 7-day chunk, not a daily quota.
    """
    logger.info("🚔 Starting Karnataka Police Drug Crime Analysis - Latest 3 Months")
    logger.info(f"📅 Analysis period: {start_date} to {end_date}")

    # Calculate total days (dates must be ISO "YYYY-MM-DD")
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    total_days = (end_dt - start_dt).days

    logger.info(f"⏱️ Total analysis period: {total_days} days")
    logger.info(f"🎯 Expected tweets: ~{tweets_per_day * total_days}")

    # Initialize API — bail out early if no scraping accounts are configured
    if not await self.initialize_api():
        logger.error("❌ Failed to initialize Twitter API. Exiting.")
        return

    # Ensure output directory exists
    self._ensure_output_directory()

    # Generate date ranges for systematic collection (7-day chunks for better coverage)
    date_ranges = self._generate_date_ranges(start_date, end_date, days_per_chunk=7)

    logger.info(f"📊 Will process {len(date_ranges)} date ranges (7-day chunks)")

    all_tweets = []
    for i, (start, end) in enumerate(date_ranges, 1):
        logger.info(f"🔍 Analyzing period {i}/{len(date_ranges)}: {start} to {end}")

        tweets = await self._collect_tweets_for_period(start, end, tweets_per_day)
        all_tweets.extend(tweets)

        logger.info(f"✅ Collected {len(tweets)} tweets for this period. Total so far: {len(all_tweets)}")

        # Rate limiting between periods - longer delay for larger dataset
        if i < len(date_ranges):  # Don't sleep after the last iteration
            logger.info("⏳ Waiting 15 seconds before next period...")
            await asyncio.sleep(15)

    if not all_tweets:
        logger.error("❌ No relevant tweets collected for analysis")
        return

    logger.info("🔄 Removing duplicate tweets...")
    # Remove duplicates based on content hash (first occurrence wins; this
    # also catches identical text re-posted under different tweet IDs)
    unique_tweets = []
    seen_hashes = set()

    for tweet in all_tweets:
        content_hash = tweet.get('content_hash', '')
        if content_hash not in seen_hashes:
            unique_tweets.append(tweet)
            seen_hashes.add(content_hash)

    duplicates_removed = len(all_tweets) - len(unique_tweets)
    logger.info(f"🗑️ Removed {duplicates_removed} duplicate tweets")
    logger.info(f"✨ Final unique tweets: {len(unique_tweets)}")

    # Save collected data (CSV exports + JSON summary report)
    logger.info("💾 Saving collected data...")
    self._save_data(unique_tweets, "karnataka_drug_analysis_3months")

    logger.info("🎉 Analysis completed successfully!")
    logger.info(f"📈 Total unique tweets collected: {len(unique_tweets)}")
    logger.info(f"📁 Check the 'drug_analysis_data_3months' directory for results")

    return unique_tweets
677
+
678
+ def _generate_date_ranges(self, start_date: str, end_date: str, days_per_chunk: int = 7) -> List[Tuple[str, str]]:
679
+ """Generate date ranges for systematic data collection - optimized for 3 months."""
680
+ start = datetime.strptime(start_date, "%Y-%m-%d")
681
+ end = datetime.strptime(end_date, "%Y-%m-%d")
682
+
683
+ ranges = []
684
+ current = start
685
+
686
+ while current < end:
687
+ next_date = min(current + timedelta(days=days_per_chunk), end)
688
+ ranges.append((
689
+ current.strftime("%Y-%m-%d"),
690
+ next_date.strftime("%Y-%m-%d")
691
+ ))
692
+ current = next_date
693
+
694
+ return ranges
695
+
696
# Main execution function
async def main():
    """Main function to run the drug crime analysis for latest 3 months.

    Computes the last-90-days window automatically, shows an estimate,
    asks the operator for confirmation on stdin, then runs the scraper.
    All failures are reported to the console rather than raised.
    """
    scraper = DrugCrimeScraper3Months()

    # Automatic calculation for latest 3 months
    print("\n" + "="*80)
    print("🚔 KARNATAKA POLICE DRUG CRIME ANALYSIS SYSTEM")
    print("📊 LATEST 3 MONTHS DATA COLLECTION")
    print("="*80)

    # Calculate dates for last 3 months from current date
    end_date = datetime.now()
    start_date = end_date - timedelta(days=90)  # Last 3 months (approximately)

    START_DATE = start_date.strftime("%Y-%m-%d")
    END_DATE = end_date.strftime("%Y-%m-%d")
    TWEETS_PER_DAY = 200  # Increased for better coverage (per-query limit)

    print(f"📅 Automatic Date Range Calculation:")
    print(f"   Start Date: {START_DATE}")
    print(f"   End Date: {END_DATE}")
    print(f"   Total Days: {(end_date - start_date).days}")
    print(f"   Expected Tweets: ~{TWEETS_PER_DAY * (end_date - start_date).days}")
    # Rough estimate: ~2 minutes per 7-day chunk
    print(f"   Estimated Runtime: ~{((end_date - start_date).days // 7) * 2} minutes")
    print("="*80)

    # Confirm before starting (Ctrl-C during the prompt also cancels)
    try:
        response = input("\n🤔 Proceed with data collection? (y/n): ").lower().strip()
        if response != 'y' and response != 'yes':
            print("❌ Collection cancelled by user.")
            return
    except KeyboardInterrupt:
        print("\n❌ Collection cancelled by user.")
        return

    print("\n🚀 Starting data collection...")

    try:
        collected_data = await scraper.run_analysis(START_DATE, END_DATE, TWEETS_PER_DAY)

        if collected_data and len(collected_data) > 0:
            print(f"\n🎉 SUCCESS! Collected {len(collected_data)} tweets for Karnataka Police analysis.")
            print("📁 Check the 'drug_analysis_data_3months' directory for organized results.")
        else:
            print("\n⚠️ No data collected. Please check:")
            print("   1. Twitter account setup (twscrape add_account)")
            print("   2. Internet connection")
            print("   3. API rate limits")

    except KeyboardInterrupt:
        logger.info("\n⚠️ Analysis interrupted by user")
        print("\n⚠️ Collection stopped by user. Partial data may be saved.")
    except Exception as e:
        logger.error(f"❌ Analysis failed: {e}")
        print(f"\n❌ Error occurred: {e}")
        print("💡 Check the log file 'drug_crime_scraper_3months.log' for details.")
754
+
755
if __name__ == "__main__":
    # Windows compatibility: the selector event loop policy is needed
    # here — presumably because the default Proactor loop conflicts with
    # the libraries used; confirm against twscrape/aiohttp requirements.
    if sys.platform.startswith("win"):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    # Print setup instructions
    print("\n📋 SETUP REQUIREMENTS:")
    print("1. Install: pip install twscrape pandas fuzzywuzzy nltk python-levenshtein")
    print("2. Add Twitter account: twscrape add_account username email password")
    print("3. Login accounts: twscrape login_accounts")
    print("4. Run this script: python enhanced_drug_crime_scraper_3months.py")

    # Run the analysis (main() handles its own errors; this catches
    # startup failures and Ctrl-C before the loop is running)
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"\n❌ Failed to start: {e}")
        print("💡 Please check the setup requirements above.")
src/evaluate.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# evaluate.py
"""Data-quality checks and evaluation metrics for the scraper's CSV output.

Loads every CSV in the scraper output folder, prints general statistics
(missing values, duplicates, label distributions, time coverage, text
stats) and, when label columns are present, classification metrics.
"""
import pandas as pd
import os
from collections import Counter
import re
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

SCRAPER_FOLDER = "drug_analysis_data_3months"  # Folder where scraper saves CSVs

# -----------------------------
# Load CSVs
# -----------------------------
csv_files = [f for f in os.listdir(SCRAPER_FOLDER) if f.endswith(".csv")]
if not csv_files:
    print("❌ No CSV files found in scraper folder!")
    # raise SystemExit(1) instead of exit(): works outside the interactive
    # interpreter and reports a non-zero (failure) exit code.
    raise SystemExit(1)

dfs = [pd.read_csv(os.path.join(SCRAPER_FOLDER, f)) for f in csv_files]
df = pd.concat(dfs, ignore_index=True)
print(f"✅ Loaded {len(df)} rows from {len(csv_files)} CSV files.\n")

# -----------------------------
# General Stats
# -----------------------------
print("=== General Stats ===")
print("Columns:", df.columns.tolist())
print("Total rows:", len(df))
print("Missing values per column:\n", df.isna().sum())
print("\nDuplicate rows:", df.duplicated().sum())

# Sample rows with missing data
missing_rows = df[df.isna().any(axis=1)]
if not missing_rows.empty:
    print("\nSample rows with missing values:\n", missing_rows.head())

# Sample duplicate rows
duplicates = df[df.duplicated(keep=False)]
if not duplicates.empty:
    print("\nSample duplicate rows:\n", duplicates.head())

# -----------------------------
# Drug/Crime-related stats
# -----------------------------
for col in ["is_drug_related", "is_crime_related", "risk_level"]:
    if col in df.columns:
        print(f"\n=== {col} Distribution ===")
        print(df[col].value_counts())
        print("Proportion:\n", round(df[col].value_counts(normalize=True), 4))

# Risk level numeric analysis — the scraper emits categorical strings
# ("CRITICAL"/"HIGH"/...), so this branch is a safeguard for datasets
# that store risk as a numeric score instead.
if "risk_level" in df.columns and pd.api.types.is_numeric_dtype(df["risk_level"]):
    print("\n=== Risk Level Stats ===")
    print("Average risk:", round(df["risk_level"].mean(), 2))
    print("Max risk:", df["risk_level"].max())
    high_risk_count = (df["risk_level"] >= 0.7).sum()  # Threshold
    print("Number of high-risk items (risk >= 0.7):", high_risk_count)

# -----------------------------
# Time coverage
# -----------------------------
if "datetime" in df.columns:
    # The scraper writes timestamps day-first ("%d-%m-%Y %H:%M:%S").
    # Parsing without a format lets pandas guess month-first for ambiguous
    # dates, silently swapping day and month — so pin the format and only
    # fall back to day-first inference if nothing matches it.
    parsed = pd.to_datetime(df["datetime"], format="%d-%m-%Y %H:%M:%S", errors="coerce")
    if parsed.isna().all():
        parsed = pd.to_datetime(df["datetime"], dayfirst=True, errors="coerce")
    df["datetime"] = parsed
    print("\n=== Date Range ===")
    print("Earliest:", df["datetime"].min())
    print("Latest:", df["datetime"].max())

    # Daily counts
    df["date"] = df["datetime"].dt.date
    daily_counts = df.groupby("date").size()
    print("\n=== Daily Counts of Posts ===")
    print(daily_counts)

# -----------------------------
# Text Analysis
# -----------------------------
# The scraper stores tweet text in a column named "content"; accept either
# name so the text analysis is not silently skipped.
text_col = "text" if "text" in df.columns else ("content" if "content" in df.columns else None)
if text_col is not None:
    df["text"] = df[text_col].astype(str)
    df["text_length"] = df["text"].apply(len)
    print("\n=== Text Length Stats ===")
    print("Average length:", round(df["text_length"].mean(), 2))
    print("Min length:", df["text_length"].min())
    print("Max length:", df["text_length"].max())

    # Top 10 most common words
    words = Counter()
    for t in df["text"]:
        words.update(re.findall(r"\w+", t.lower()))
    print("\nTop 10 common words:", words.most_common(10))

# -----------------------------
# User / Source Analysis
# -----------------------------
if "username" in df.columns:
    print("\n=== User Analysis ===")
    print("Total unique users:", df["username"].nunique())
    top_users = df["username"].value_counts().head(10)
    print("Top 10 users by post count:\n", top_users)

# -----------------------------
# Scraper Evaluation Metrics
# -----------------------------
print("\n=== Scraper Evaluation Metrics ===")

# 1. Completeness (% of filled cells)
completeness = 1 - df.isna().mean().mean()
print(f"Completeness (all columns filled): {round(completeness*100, 2)}%")

# 2. Duplicate rate (% of duplicate rows)
duplicate_rate = df.duplicated().mean()
print(f"Duplicate rows rate: {round(duplicate_rate*100, 2)}%")

# 3. Drug/Crime relevance (if available) — len(df) > 0 is guaranteed
# because the script exits earlier when no CSVs are found.
for col in ["is_drug_related", "is_crime_related"]:
    if col in df.columns:
        relevance = df[col].sum() / len(df)
        print(f"{col} relevance rate: {round(relevance*100,2)}%")

# 4. Time coverage (active days vs total days); skipped when every
# timestamp failed to parse (all-NaT would break the subtraction).
if "datetime" in df.columns and df["datetime"].notna().any():
    total_days = (df["datetime"].max() - df["datetime"].min()).days + 1
    active_days = df["date"].nunique()
    coverage_ratio = active_days / total_days
    print(f"Time coverage ratio (active days / total days): {round(coverage_ratio*100,2)}%")

# 5. Average text length (proxy for content richness)
if "text_length" in df.columns:
    print(f"Average text length: {round(df['text_length'].mean(),2)} characters")

# 6. Classification Metrics (using scraper labels as pseudo-ground truth)
# If multiple columns available (e.g., is_drug_related vs is_crime_related), compute metrics
if "is_drug_related" in df.columns and "is_crime_related" in df.columns:
    y_true = df["is_crime_related"]
    y_pred = df["is_drug_related"]
    print("\n=== Classification Metrics (is_drug_related vs is_crime_related) ===")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
    print("Precision:", round(precision_score(y_true, y_pred), 4))
    print("Recall:", round(recall_score(y_true, y_pred), 4))
    print("F1-score:", round(f1_score(y_true, y_pred), 4))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
else:
    print("\n⚠️ Skipping classification metrics: Not enough columns for evaluation.")

print("\n✅ Data evaluation + metrics complete!")
src/evaluation.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation.py
2
+ from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, brier_score_loss
3
+ from alerts import compute_dynamic_risk
4
+
5
def evaluate_model(test_tweets):
    """Print classification metrics for CRITICAL-vs-rest predictions.

    test_tweets: list of dicts with fields
      - true_risk_level: "CRITICAL"/"HIGH"/...
      - dynamic_risk_score: 0-100

    The dynamic risk score is mapped to a probability via
    compute_dynamic_risk; probabilities >= 0.75 are predicted CRITICAL.
    """
    # 1 = CRITICAL, 0 = everything else
    y_true = [int(t['true_risk_level'] == "CRITICAL") for t in test_tweets]
    # Predicted probabilities from the dynamic risk model
    y_prob = [compute_dynamic_risk(t["dynamic_risk_score"]) for t in test_tweets]
    # Hard labels at the CRITICAL threshold
    y_pred = [int(p >= 0.75) for p in y_prob]

    print("=== Classification Report ===")
    print(classification_report(y_true, y_pred, target_names=["Non-Critical","Critical"]))

    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_true, y_pred))

    print("ROC-AUC:", roc_auc_score(y_true, y_prob))
    print("Brier Score:", brier_score_loss(y_true, y_prob))