"""Gradio app that rates a pasted SMS/e-mail as low, medium or high scam risk.

At import time the module loads ``scam_features.csv``, trains a
``RandomForestClassifier`` on its feature columns and builds the Gradio
interface.  The web server is only started when the file is run as a script.
"""
import re

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ======================
# Load dataset
# ======================
df = pd.read_csv("scam_features.csv")

# Column order used both for training and for single-message inference.
feature_cols = [
    "urgency_count",
    "suspicious_count",
    "link_count",
    "has_money",
    "msg_length",
    "caps_ratio",
    "exclamation_count",
    "sentiment_score",
]

X = df[feature_cols]
y = df["risk_level"]

# ======================
# Train model
# ======================
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
)
model.fit(X, y)

# ======================
# Feature logic
# ======================
analyzer = SentimentIntensityAnalyzer()

URGENCY_WORDS = [
    "urgent", "act now", "hurry", "limited time", "expires", "immediately",
    "today only", "last chance", "asap", "quick", "now",
]

SUSPICIOUS_KEYWORDS = [
    "bank", "password", "verify", "account", "login", "click", "winner",
    "prize", "congratulations", "claim", "free", "cash", "loan", "refund",
]

MONEY_PATTERNS = [r"\$", r"€", r"£", r"\bfree\b", r"\bprize\b", r"\bcash\b"]

# Compiled once at import instead of on every call.
_LINK_RE = re.compile(r"(http|www|\.com|\.net|\.org)")
_MONEY_RES = [re.compile(pattern) for pattern in MONEY_PATTERNS]

# "won" as a whole word only: a plain substring test also fires on
# "wonderful" and "won't", which would wrongly force a HIGH rating.
_WON_RE = re.compile(r"\bwon\b(?!['’]t)")

# Display strings per predicted label; any other label is shown as low risk.
_RISK_LABELS = {"high": "🔴 HIGH RISK", "medium": "🟠 MEDIUM RISK"}
_DEFAULT_RISK_LABEL = "🟢 LOW RISK"


def extract_features(text):
    """Compute the numeric features listed in ``feature_cols`` for one message.

    NOTE(review): the model is trained on precomputed columns from
    ``scam_features.csv``; the code that produced that file is not visible
    here, so the matching rules below are intentionally left unchanged to
    avoid train/serve skew.  Known quirks of those rules:

    * urgency/suspicious keywords are plain substring matches on the
      lower-cased text, so ``"now"`` is also counted inside ``"know"``;
    * ``link_count`` counts every link-like token, so
      ``"http://www.a.com"`` contributes 3.

    Args:
        text: The message; it is converted with ``str()`` first.

    Returns:
        A dict with one entry per name in ``feature_cols``.
    """
    text = str(text)
    text_low = text.lower()

    letters = [ch for ch in text if ch.isalpha()]
    # Share of alphabetic characters that are upper-case; 0 when there are none.
    caps_ratio = (
        sum(1 for ch in letters if ch.isupper()) / len(letters) if letters else 0
    )

    return {
        "urgency_count": sum(1 for word in URGENCY_WORDS if word in text_low),
        "suspicious_count": sum(
            1 for word in SUSPICIOUS_KEYWORDS if word in text_low
        ),
        "link_count": len(_LINK_RE.findall(text_low)),
        "has_money": int(any(rx.search(text_low) for rx in _MONEY_RES)),
        "msg_length": len(text),
        "caps_ratio": caps_ratio,
        "exclamation_count": text.count("!"),
        # Sentiment is scored on the original-case text.
        "sentiment_score": analyzer.polarity_scores(text)["compound"],
    }


# ======================
# Main function
# ======================
def analyze_message(text):
    """Classify a message and describe why it looks risky.

    The random-forest prediction is overridden to ``"high"`` whenever the
    message contains the word "won" or the substring "click".

    Args:
        text: The message typed into the interface.  ``None`` and
            whitespace-only input are treated as "nothing entered".

    Returns:
        A 3-tuple of strings: the risk banner, the confidence line and the
        newline-separated list of red flags.
    """
    # The input may arrive as None rather than ""; guard before .strip().
    if text is None or not str(text).strip():
        return "Please enter a message.", "", ""

    text = str(text)
    text_low = text.lower()
    feats = extract_features(text)
    X_new = pd.DataFrame([feats])[feature_cols]

    # One forest pass: predict_proba columns follow model.classes_, so the
    # arg-max gives both the predicted label and its probability.
    proba = model.predict_proba(X_new)[0]
    top = int(np.argmax(proba))
    model_pred = str(model.classes_[top])
    confidence = float(proba[top])

    forced_high = bool(_WON_RE.search(text_low)) or "click" in text_low
    pred = "high" if forced_high else model_pred
    overridden = forced_high and model_pred != "high"

    checks = [
        (feats["suspicious_count"] >= 2, "suspicious keywords detected"),
        # Recorded so a forced HIGH rating is never shown with "no red flags".
        (forced_high, "prize-win or click-through wording detected"),
        (feats["urgency_count"] >= 1, "urgency language detected"),
        (feats["link_count"] >= 1, "link detected"),
        (feats["has_money"], "money-related content"),
        (feats["caps_ratio"] > 0.3, "excessive capital letters"),
        (feats["exclamation_count"] >= 3, "too many exclamation marks"),
    ]
    red_flags = [message for hit, message in checks if hit]
    if not red_flags:
        red_flags = ["no obvious red flags"]

    risk_display = _RISK_LABELS.get(pred, _DEFAULT_RISK_LABEL)

    if overridden:
        # The model's probability belongs to a different class than the one
        # displayed, so say so instead of presenting it as confidence in HIGH.
        confidence_text = (
            "Confidence: rule-based override "
            f"(model predicted {model_pred} at {confidence:.0%})"
        )
    else:
        confidence_text = f"Confidence: {confidence:.0%}"

    flags_text = "\n".join(f"⚠️ {flag}" for flag in red_flags)

    return risk_display, confidence_text, flags_text


# ======================
# Interface
# ======================
demo = gr.Interface(
    fn=analyze_message,
    inputs=gr.Textbox(
        lines=6,
        placeholder="Paste a suspicious SMS or email here...",
    ),
    outputs=[
        gr.Textbox(label="Risk Assessment"),
        gr.Textbox(label="Confidence"),
        gr.Textbox(label="Scam Indicators"),
    ],
    title="AI Scam Detection Tool",
    description=(
        "Assess scam messages using machine learning and risk indicators "
        "to support fraud prevention."
    ),
)

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse analyze_message) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()