"""Generate data/scam_dataset.json — curated synthetic scenarios + validation.""" from __future__ import annotations import json import sys from pathlib import Path def main() -> None: rows: list[dict] = [] def add(**kwargs: object) -> None: rows.append(dict(kwargs)) # --- Easy scams (obvious) --- easy_scam_templates = [ ("Your KYC expired. Send OTP immediately to avoid blocking.", "kyc", True, 0.95), ("URGENT: Click http://bit.ly/fakebank to verify your account.", "phishing", True, 0.9), ("Dear user, your ATM card is blocked. Share CVV to unlock.", "phishing", False, 0.85), ("You won Rs 5 lakh! Pay processing fee via UPI to claim.", "investment", False, 0.8), ("NetBanking access suspended. Reply with MPIN to restore.", "phishing", False, 0.92), ("Your account will be debited in 1 hour unless you confirm OTP here.", "otp", False, 0.88), ("HDFC Security: unusual login. Verify at hdfc-secure-login.xyz", "phishing", True, 0.87), ("SBI: Your YONO session expired. Enter OTP to continue.", "impersonation", False, 0.9), ("Income Tax refund pending. Submit bank details on itrefund.co.in", "phishing", True, 0.82), ("Your wallet will be closed. Send Aadhaar OTP now.", "kyc", False, 0.91), ("Crypto doubling: send 1 ETH get 2 back in 10 minutes.", "investment", True, 0.78), ("WhatsApp: Your payment failed. Tap wa.me/fakepay to fix.", "phishing", True, 0.84), ("Amazon: Order on hold. Confirm payment link amzn-security.ru", "phishing", True, 0.83), ("IRCTC: Ticket cancelled unless you pay rebooking fee via link.", "phishing", True, 0.8), ("Paytm KYC incomplete. Share OTP to complete in 30 minutes.", "kyc", False, 0.89), ("Your credit score dropped. Pay Rs 499 to fix instantly.", "scam_fee", True, 0.76), ("FedEx: Package held at customs. Pay duty at fake-fedex.net", "phishing", True, 0.81), ("LinkedIn: Someone viewed your profile 50 times. Click to see.", "phishing", True, 0.7), ("Your SIM will be deactivated. Send last 4 digits of Aadhaar.", "phishing", False, 0.86), ("Bank server upgrade: validate credentials on secure-portal.tk", "phishing", True, 0.88), ] for i, (msg, tag, link, urg) in enumerate(easy_scam_templates, start=1): add( id=f"easy_scam_{i:02d}", difficulty="easy", true_label="scam", channel="sms" if i % 2 else "email", language="en" if i % 3 else "hinglish", sender_type="unknown", message=msg, messages=[msg], link_present=link, urgency_score=urg, tags=[tag, "easy"], ) easy_scam_variants = [ ("RBI Notice: Your PAN is linked to suspicious UPI. Validate on rbisecure.in now.", True, 0.93), ("Electricity dept: Bill unpaid. Pay Rs 2,499 via link or power cut in 2 hours.", True, 0.9), ("DHL: Customs hold on parcel. Pay clearance fee to dhl-track-support.net", True, 0.84), ("Job offer: Work from home, earn 50k/week. Pay registration Rs 999 first.", False, 0.77), ("Your Netflix payment failed. Update card on netflix-billing-support.com", True, 0.82), ("WINNER! You got iPhone 15 in lucky draw. Pay GST Rs 2,500 to claim.", False, 0.8), ("Aadhaar eKYC failed. Re-verify on uidai-helpdesk.co to avoid SIM block.", True, 0.91), ("SEBI alert: Your demat will freeze. Confirm PAN on sebi-verifyportal.xyz", True, 0.86), ("Mother needs surgery funds urgently — wrong account sent, please refund to this UPI.", False, 0.88), ("Your Flipkart refund of Rs 1,200 is pending. Approve on flipkart-refund.co", True, 0.83), ("Instagram: Copyright strike on your reel. Appeal within 24h at meta-appeals.io", True, 0.74), ("Truecaller Premium renewal failed. Tap tc-premium-billing.ru to fix.", True, 0.72), ("Gas subsidy credited. Claim Rs 3,200 on pm-subsidy-portal.in (last date today).", True, 0.85), ("Your child’s school fee link expired. Pay immediately on schoolpay-verify.com", True, 0.79), ("Bank merger: re-KYC mandatory. Complete on merger-kyc-bank.net by tonight.", True, 0.9), ] for j, (msg, link, urg) in enumerate(easy_scam_variants, start=21): add( id=f"easy_scam_{j:02d}", difficulty="easy", true_label="scam", channel="whatsapp" if j % 4 == 0 else "sms", language="en" if j % 3 else "hinglish", sender_type="unknown", message=msg, messages=[msg], link_present=link, urgency_score=urg, tags=["phishing", "easy", "variant"], ) easy_scam_subtle = [ ("Reminder: Your free trial ends tomorrow. Add card on billing-portal-now.net to avoid loss of data.", True, 0.62), ("We noticed login from Windows PC. If this was you, no action needed.", False, 0.45), ("Security check: confirm your mobile number is still +91-XXXXXX7890 by tapping link.", True, 0.7), ("Your reward points expire in 48 hours. Redeem on rewards-bank-partner.co", True, 0.66), ("Voice mailbox full. Listen to urgent bank voicemail: call this premium-rate number.", False, 0.72), ("Linked bank account sync failed. Re-link on upi-sync-verify.in", True, 0.81), ("Minor statement mismatch of Rs 1.00 — confirm via link to avoid freeze.", True, 0.74), ("Your cheque book dispatch delayed. Track on cheque-dispatch-fake.com", True, 0.69), ("NEFT return: beneficiary name mismatch. Fix on neft-correction-portal.net", True, 0.77), ("ATM card found at branch. Claim by paying courier fee via UPI.", False, 0.71), ("Loan pre-closure quote generated. Pay processing fee to download PDF.", True, 0.78), ("CIBIL score updated. View full report on cibil-free-score.in", True, 0.73), ("International roaming pack activated without request. Cancel via roaming-cancel.ru", True, 0.68), ("Your UPI PIN will expire. Reset on pin-reset-upi.co", True, 0.89), ("Bank holiday list 2026 attached. Open macro-enabled Excel from link.", True, 0.64), ] for n, (msg, link, urg) in enumerate(easy_scam_subtle, start=36): add( id=f"easy_scam_{n:02d}", difficulty="easy", true_label="scam", channel="email" if n % 3 == 0 else "sms", language="en", sender_type="unknown", message=msg, messages=[msg], link_present=link, urgency_score=urg, tags=["phishing", "easy", "subtle"], ) # Easy legitimate easy_legit = [ "Your salary of INR 85,000 has been credited to A/c XX1234.", "Reminder: EMI of Rs 12,400 due on 10th. Pay via official app only.", "Your debit card ending 4242 was used at SWIGGY for Rs 320.", "NetBanking: Scheduled transfer of Rs 5,000 to Rahul completed.", "Welcome to mobile banking. Download the official app from Play Store.", "Your FD receipt is available in the secure inbox of internet banking.", "Loan statement for March generated. Login to official portal to view.", "UPI payment of Rs 150 to ZOMATO successful. Ref: 998877.", "Your meeting with branch manager is confirmed for Tuesday 3 PM.", "Security tip: Never share OTP. Bank staff will never ask for it.", ] for i, msg in enumerate(easy_legit, start=1): add( id=f"easy_legit_{i:02d}", difficulty="easy", true_label="legitimate", channel="email" if i % 2 else "in_app", language="en", sender_type="bank_official", message=msg, messages=[msg], link_present=False, urgency_score=0.2 + (i % 5) * 0.05, tags=["legitimate", "easy"], ) easy_legit_unknown = [ ("Blinkit: Your order #8821 arrives by 8 PM. Show the in-app rider code only on your phone.", "delivery_partner", 0.35), ("Swiggy: Rider is nearby. Never share your delivery PIN outside the Swiggy app.", "delivery_partner", 0.3), ("IRCTC: PNR confirmed. Ticket details in official app only. No payment link via SMS.", "service_provider", 0.25), ("Government: GST filing reminder for GSTR-3B due date. File on gst.gov.in portal only.", "government_notice", 0.4), ("Airtel: Data pack renewed. For queries use 121 or official app.", "telco_official", 0.22), ("NPS: Contribution of Rs 5,000 received for PRAN XX****12. Statement in CRA login.", "pension_authority", 0.28), ] for i, (msg, stype, urg) in enumerate(easy_legit_unknown, start=1): add( id=f"easy_legit_u_{i:02d}", difficulty="easy", true_label="legitimate", channel="sms", language="en", sender_type=stype, message=msg, messages=[msg], link_present=False, urgency_score=urg, tags=["legitimate", "easy", "unknown_brand_ok"], ) # Medium: two-part social engineering medium_pairs = [ ( "Hello, I am calling from SBI support regarding your account activity.", "Our system shows an unverified device login. Please confirm you received this SMS.", ), ( "This is ICICI fraud desk. We noticed a suspicious UPI attempt.", "To secure your account, do not share OTP; we only need you to acknowledge this alert.", ), ( "Axis Bank priority support: unusual transaction flagged.", "If this was not you, we recommend warning any joint account holders immediately.", ), ( "Kotak 811: We could not reach you on call about a high-value transfer.", "Reply only if you initiated a transfer above Rs 50,000 in the last hour.", ), ( "HDFC: Potential phishing attempt detected on your registered email.", "Please verify sender identity using official app chat before clicking any links.", ), ( "Federal Bank: New beneficiary added from unknown IP.", "If you did not add this beneficiary, warn your family and contact branch.", ), ( "Bandhan Bank: SIM swap alert on your registered mobile.", "We have paused high-risk transactions until you verify identity in branch or app.", ), ( "IDFC FIRST: Credit card used abroad while phone location shows India.", "Confirm travel or warn user via registered channels — do not use links in SMS.", ), ] for i, (m1, m2) in enumerate(medium_pairs, start=1): add( id=f"medium_scam_{i:02d}", difficulty="medium", true_label="scam", channel="sms", language="en" if i % 2 else "hinglish", sender_type="unknown", message=m1, messages=[m1, m2], link_present=False, urgency_score=0.55 + i * 0.02, tags=["impersonation", "social_engineering"], ) medium_scam_extra = [ ( "BOB World: We detected a login from a new city.", "If this was not you, warn linked family accounts and call 1800-XXX from number printed on your card.", ), ( "PNB: UPI collect request spike on your account.", "Do not approve unknown requests. Acknowledge if you want us to pause UPI temporarily.", ), ( "Union Bank: Salary credit reversal risk due to name mismatch.", "Please verify with HR; meanwhile warn beneficiaries of delayed transfers.", ), ( "Yes Bank: Premium card used at high-risk merchant category.", "Confirm purchase or warn household cards on same profile.", ), ( "IndusInd: Device binding changed for mobile banking.", "If unexpected, warn joint holders and visit branch with ID.", ), ( "RBL: International POS attempt while card is domestic-only.", "Reply ONLY if you are traveling; otherwise warn and block via app.", ), ( "CSB Bank: Net banking password reset initiated from new browser.", "Warn anyone with access to your email; we never ask for OTP on call.", ), ] for idx, (m1, m2) in enumerate(medium_scam_extra, start=9): add( id=f"medium_scam_{idx:02d}", difficulty="medium", true_label="scam", channel="sms", language="en" if idx % 2 else "hinglish", sender_type="unknown", message=m1, messages=[m1, m2], link_present=False, urgency_score=0.58 + (idx % 5) * 0.02, tags=["impersonation", "bank_brand_spoof"], ) medium_legit = [ ( "Your branch relationship manager will call from official 1800 number today.", "No action needed if you did not request a callback.", ), ( "Annual privacy notice: how we use your data is updated on our website.", "Log in to the official site only; ignore forwarded PDFs claiming to be us.", ), ] for i, (m1, m2) in enumerate(medium_legit, start=1): add( id=f"medium_legit_{i:02d}", difficulty="medium", true_label="legitimate", channel="email", language="en", sender_type="bank_official", message=m1, messages=[m1, m2], link_present=True, urgency_score=0.25, tags=["legitimate"], ) # Hard: multi-step attack chains hard_chains = [ [ "Hi, your salary credit is delayed due to payroll sync. We will update shortly.", "Please confirm your employee ID by replying YES to validate HR records.", "Now share the OTP you received to complete HR verification.", ], [ "Netflix: Payment failed for renewal.", "Update payment on our secure page within 24h.", "Enter OTP from bank to authorize Rs 649 debit.", ], [ "Courier: Package arriving today. Track with reference 8821.", "Customs fee Rs 499 required — pay via link for express clearance.", "Urgent: OTP needed to release shipment or it will be returned.", ], [ "Investment tip: Bluechip fund up 12% this quarter.", "Open demat on partner portal to get free research.", "Transfer token amount via UPI to activate premium alerts.", ], [ "Bank: Low balance alert on salary account.", "Quick loan pre-approved — tap to see offer (no OTP needed to view).", "To disburse, confirm OTP sent to mobile for Rs 1 verification debit.", ], [ "Govt subsidy scheme: you are eligible for LPG cashback.", "Upload bank proof on portal (official-looking domain).", "Final step: share OTP to link Aadhaar for instant credit.", ], [ "Friend hi, wrong number pe message aa gaya — delete mat karna.", "Actually I need small help, bank server down hai.", "Mere ko OTP forward kar do, main tumhara recharge kar dunga.", ], [ "HR: Please join new attendance app before Monday.", "Use temp password Attend2024!", "Sync bank for payroll — enter OTP when prompted to verify account.", ], ] stage_sets = [ ["lure", "harvest", "pressure"], ["lure", "harvest", "pressure"], ["lure", "harvest", "pressure"], ["lure", "harvest", "pressure"], ["lure", "harvest", "pressure"], ["lure", "harvest", "pressure"], ["lure", "harvest", "pressure"], ["lure", "harvest", "pressure"], ] for i, (msgs, stages) in enumerate(zip(hard_chains, stage_sets), start=1): add( id=f"hard_scam_{i:02d}", difficulty="hard", true_label="scam", channel="whatsapp" if i % 3 == 0 else "sms", language="en" if i != 7 else "hinglish", sender_type="unknown", message=msgs[0], messages=msgs, link_present=True, urgency_score=0.65 + 0.03 * i, tags=["multi_step", "otp"], stage_labels=stages, otp_message_index=2, ) hard_scam_extra_chains = [ [ "Team: Standup notes for today attached (no action needed).", "IT: Please approve MFA push for new VPN profile (looks like routine SSO).", "IT-Helpdesk: Enter the OTP shown in email to finish VPN setup — urgent before audit.", ], [ "Utility bill autopay succeeded for March. Thank you.", "Correction: duplicate debit detected. Refund will be processed after OTP confirmation.", "Send OTP you receive from bank to this thread to speed up refund.", ], [ "Dating app match: Hi! I trade forex part-time, happy to share tips.", "I can guide you on Binance — first deposit small USDT to test withdrawal.", "Share your wallet seed screenshot privately so I can check if your wallet is compatible.", ], [ "Zoom: Meeting invite updated — join link unchanged.", "Host requests pre-install of remote support plugin for screen share.", "Run AnyDesk quick support code 9 8 7 6 5 so we can fix your audio driver.", ], [ "Charity: Thank you for donating Rs 500 last month.", "Matching grant: double impact if you add Rs 500 via UPI collect request.", "UPI collect sent — approve and share transaction OTP for receipt automation.", ], ] for k, msgs in enumerate(hard_scam_extra_chains, start=9): add( id=f"hard_scam_{k:02d}", difficulty="hard", true_label="scam", channel="whatsapp" if k % 2 == 0 else "sms", language="en", sender_type="unknown", message=msgs[0], messages=msgs, link_present=True, urgency_score=0.68 + 0.02 * k, tags=["multi_step", "social_engineering"], stage_labels=["lure", "harvest", "pressure"], otp_message_index=2, ) for i in range(1, 6): add( id=f"hard_legit_{i:02d}", difficulty="hard", true_label="legitimate", channel="in_app", language="en", sender_type="bank_official", message=f"Secure message thread {i}: loan discussion continues.", messages=[ f"Secure message thread {i}: loan discussion continues.", "Please upload documents only inside the mobile banking secure vault.", "No OTP is required for document upload; ignore any SMS asking for OTP.", ], link_present=False, urgency_score=0.35, tags=["legitimate"], stage_labels=["info", "info", "info"], otp_message_index=None, ) scripts_dir = Path(__file__).resolve().parent if str(scripts_dir) not in sys.path: sys.path.insert(0, str(scripts_dir)) from validate_dataset import assert_dataset_ok assert_dataset_ok(rows) out = Path(__file__).resolve().parent.parent / "data" / "scam_dataset.json" out.parent.mkdir(parents=True, exist_ok=True) out.write_text(json.dumps(rows, indent=2, ensure_ascii=False), encoding="utf-8") print(f"Wrote {len(rows)} validated scenarios to {out}") if __name__ == "__main__": main()