Spaces:

ESCP
/

sms-scam-detector

Sleeping

File size: 3,932 Bytes

import gradio as gr
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier

# ======================
# Load dataset
# ======================
# CSV that supplies the feature columns listed below plus a "risk_level"
# label column.
df = pd.read_csv("scam_features.csv")

# Model input columns. The same list (and order) is used to select columns
# for training here and for single-message prediction in analyze_message.
feature_cols = [
    "urgency_count",
    "suspicious_count",
    "link_count",
    "has_money",
    "msg_length",
    "caps_ratio",
    "exclamation_count",
    "sentiment_score",
]

# Feature matrix and target labels.
X, y = df[feature_cols], df["risk_level"]

# ======================
# Train model
# ======================
# The classifier is fitted once, when the module is loaded. fit() returns the
# estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=100, max_depth=10, class_weight="balanced", random_state=42
).fit(X, y)

# ======================
# Feature logic
# ======================
# Single shared sentiment analyzer; extract_features() reads the "compound"
# value of its polarity_scores() result for each message.
analyzer = SentimentIntensityAnalyzer()

# Lower-case words/phrases signalling time pressure. extract_features() tests
# each one as a substring of the lower-cased message.
URGENCY_WORDS = [
    "urgent",
    "act now",
    "hurry",
    "limited time",
    "expires",
    "immediately",
    "today only",
    "last chance",
    "asap",
    "quick",
    "now",
]

# Lower-case keywords counted (as substrings) toward "suspicious_count".
SUSPICIOUS_KEYWORDS = [
    "bank",
    "password",
    "verify",
    "account",
    "login",
    "click",
    "winner",
    "prize",
    "congratulations",
    "claim",
    "free",
    "cash",
    "loan",
    "refund",
]

# Regular expressions; a match of any one of them sets "has_money".
MONEY_PATTERNS = [
    r"\$",
    r"€",
    r"£",
    r"\bfree\b",
    r"\bprize\b",
    r"\bcash\b",
]

def extract_features(text):
    """Convert a message into the numeric features used by the classifier.

    Args:
        text: The message. It is coerced with ``str()`` before processing.

    Returns:
        dict with the keys ``urgency_count``, ``suspicious_count``,
        ``link_count``, ``has_money``, ``msg_length``, ``caps_ratio``,
        ``exclamation_count`` and ``sentiment_score``.
    """
    raw = str(text)
    lowered = raw.lower()

    # Fraction of alphabetic characters that are upper-case; 0 when the
    # message contains no letters at all.
    alpha_total = 0
    upper_total = 0
    for ch in raw:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1
    caps_ratio = upper_total / alpha_total if alpha_total else 0

    # Substring matching: each keyword contributes at most 1, and a keyword
    # such as "now" also matches inside longer words (e.g. "know").
    urgency_hits = [kw for kw in URGENCY_WORDS if kw in lowered]
    suspicious_hits = [kw for kw in SUSPICIOUS_KEYWORDS if kw in lowered]

    # Every URL-like fragment is counted separately, so a single address such
    # as "http://www.x.com" contributes more than one hit.
    link_fragments = re.findall(r"(http|www|\.com|\.net|\.org)", lowered)

    # Stop at the first money pattern that matches.
    mentions_money = False
    for pattern in MONEY_PATTERNS:
        if re.search(pattern, lowered):
            mentions_money = True
            break

    # Sentiment is computed on the original-case text.
    sentiment = analyzer.polarity_scores(raw)

    features = {}
    features["urgency_count"] = len(urgency_hits)
    features["suspicious_count"] = len(suspicious_hits)
    features["link_count"] = len(link_fragments)
    features["has_money"] = int(mentions_money)
    features["msg_length"] = len(raw)
    features["caps_ratio"] = caps_ratio
    features["exclamation_count"] = raw.count("!")
    features["sentiment_score"] = sentiment["compound"]
    return features

# ======================
# Main function
# ======================
def analyze_message(text):
    """Classify a message's scam risk and list the indicators that were found.

    The risk label comes from the trained classifier, except that a keyword
    rule (the standalone word "won", or any occurrence of "click") forces the
    label to HIGH.

    Args:
        text: Message body from the UI. ``None`` and blank or whitespace-only
            input are both treated as "nothing entered".

    Returns:
        A 3-tuple of strings ``(risk_summary, confidence_text, flags_text)``,
        one per output textbox. For empty input the first element asks the
        user to enter a message and the other two are empty strings.
    """
    # None has no .strip(); handle it the same way as a blank message instead
    # of raising AttributeError.
    if text is None or not str(text).strip():
        return "Please enter a message.", "", ""
    text = str(text)
    text_low = text.lower()

    feats = extract_features(text)
    X_new = pd.DataFrame([feats])[feature_cols]

    model_pred = model.predict(X_new)[0]
    proba = model.predict_proba(X_new)[0]
    confidence = max(proba)

    # Keyword rule that overrides the model. "won" must be a standalone word:
    # a plain substring test also fired on "won't", "wonderful", "wonder", ...
    # The lookahead rejects "won't" (straight or curly apostrophe), where a
    # bare \b would still match before the apostrophe.
    rule_hit = (
        re.search(r"\bwon\b(?!['’]t)", text_low) is not None
        or "click" in text_low
    )
    pred = "high" if rule_hit else model_pred

    red_flags = []
    if feats["suspicious_count"] >= 2:
        red_flags.append("suspicious keywords detected")
    if rule_hit:
        # Surface the override so HIGH RISK is never shown together with
        # "no obvious red flags".
        red_flags.append("prize-win or click-bait wording detected")
    if feats["urgency_count"] >= 1:
        red_flags.append("urgency language detected")
    if feats["link_count"] >= 1:
        red_flags.append("link detected")
    if feats["has_money"]:
        red_flags.append("money-related content")
    if feats["caps_ratio"] > 0.3:
        red_flags.append("excessive capital letters")
    if feats["exclamation_count"] >= 3:
        red_flags.append("too many exclamation marks")

    if not red_flags:
        red_flags = ["no obvious red flags"]

    if pred == "high":
        risk_display = "🔴 HIGH RISK"
    elif pred == "medium":
        risk_display = "🟠 MEDIUM RISK"
    else:
        risk_display = "🟢 LOW RISK"

    summary = f"{risk_display}"

    # max(proba) is the model's confidence in *its own* prediction. When the
    # keyword rule replaced that prediction, presenting the number next to
    # the overridden label would be misleading, so say what actually happened.
    if rule_hit and model_pred != "high":
        confidence_text = (
            "Confidence: n/a (keyword rule override; "
            f"model predicted {model_pred} at {confidence:.0%})"
        )
    else:
        confidence_text = f"Confidence: {confidence:.0%}"

    flags_text = "\n".join([f"⚠️ {flag}" for flag in red_flags])

    return summary, confidence_text, flags_text

      
# ======================
# Interface
# ======================
# One multi-line input box and three text outputs; the outputs correspond, in
# order, to the three strings returned by analyze_message.
demo = gr.Interface(
    fn=analyze_message,
    inputs=gr.Textbox(
        lines=6,
        placeholder="Paste a suspicious SMS or email here...",
    ),
    outputs=[
        gr.Textbox(label=caption)
        for caption in ("Risk Assessment", "Confidence", "Scam Indicators")
    ],
    title="AI Scam Detection Tool",
    description="Assess scam messages using machine learning and risk indicators to support fraud prevention.",
)

# Start serving the app when this script is executed.
demo.launch()