import gradio as gr
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
# ======================
# Load dataset
# ======================
# Feature table, read from the working directory when the module is imported.
df = pd.read_csv("scam_features.csv")

# Model input columns. The same list selects and orders the columns when a
# new message is scored, so training and prediction stay aligned.
feature_cols = [
    "urgency_count",
    "suspicious_count",
    "link_count",
    "has_money",
    "msg_length",
    "caps_ratio",
    "exclamation_count",
    "sentiment_score",
]

X = df[feature_cols]  # feature matrix
y = df["risk_level"]  # target labels
# ======================
# Train model
# ======================
# Random forest with a fixed seed so repeated fits give the same model;
# class_weight="balanced" re-weights classes by their frequency in y.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
)

# Fit on the whole table at import time; no hold-out split is made here.
model.fit(X, y)
# ======================
# Feature logic
# ======================
# Shared VADER sentiment analyzer: built once at import time and reused by
# extract_features() for every message.
analyzer = SentimentIntensityAnalyzer()
# Phrases that signal time pressure. Matched as lowercase substrings.
URGENCY_WORDS = [
    "urgent",
    "act now",
    "hurry",
    "limited time",
    "expires",
    "immediately",
    "today only",
    "last chance",
    "asap",
    "quick",
    "now",
]

# Words that commonly appear in phishing / prize scams. Matched as lowercase
# substrings.
SUSPICIOUS_KEYWORDS = [
    # credentials and accounts
    "bank",
    "password",
    "verify",
    "account",
    "login",
    "click",
    # prizes and giveaways
    "winner",
    "prize",
    "congratulations",
    "claim",
    "free",
    # money
    "cash",
    "loan",
    "refund",
]

# Regular expressions: currency symbols plus whole-word money bait terms.
MONEY_PATTERNS = [
    r"\$",
    r"€",
    r"£",
    r"\bfree\b",
    r"\bprize\b",
    r"\bcash\b",
]
def extract_features(text):
    """Turn one message into the numeric features the classifier consumes.

    Args:
        text: The raw message. ``None`` is treated as an empty message; any
            other non-string value is converted with ``str``.

    Returns:
        dict: One value per feature name:

        * ``urgency_count`` - number of URGENCY_WORDS entries found as
          substrings of the lowercased text (each entry counts at most once).
        * ``suspicious_count`` - same, for SUSPICIOUS_KEYWORDS.
        * ``link_count`` - number of ``http`` / ``www`` / ``.com`` / ``.net`` /
          ``.org`` matches in the lowercased text.
        * ``has_money`` - 1 if any MONEY_PATTERNS regex matches the lowercased
          text, else 0.
        * ``msg_length`` - length of the message in characters.
        * ``caps_ratio`` - share of alphabetic characters that are uppercase
          (0 when the message has no letters).
        * ``exclamation_count`` - number of ``!`` characters.
        * ``sentiment_score`` - the analyzer's ``compound`` polarity score.
    """
    # str(None) would be scored as the 4-letter message "None".
    text = "" if text is None else str(text)
    text_low = text.lower()

    letters = [c for c in text if c.isalpha()]
    if letters:
        caps_ratio = sum(1 for c in letters if c.isupper()) / len(letters)
    else:
        caps_ratio = 0

    # NOTE(review): the definitions below are kept exactly as they were
    # (substring matching, and link tokens rather than whole URLs, so one URL
    # such as "http://www.x.com" counts three times). Presumably
    # scam_features.csv was produced with these same rules - confirm before
    # tightening any of them, otherwise training and scoring would disagree.
    return {
        "urgency_count": sum(1 for w in URGENCY_WORDS if w in text_low),
        "suspicious_count": sum(1 for w in SUSPICIOUS_KEYWORDS if w in text_low),
        "link_count": len(re.findall(r"(http|www|\.com|\.net|\.org)", text_low)),
        "has_money": int(any(re.search(p, text_low) for p in MONEY_PATTERNS)),
        "msg_length": len(text),
        "caps_ratio": caps_ratio,
        "exclamation_count": text.count("!"),
        "sentiment_score": analyzer.polarity_scores(text)["compound"],
    }
# ======================
# Main function
# ======================
def analyze_message(text):
    """Classify one message and build the three text outputs for the UI.

    Args:
        text: The message entered by the user. ``None`` and empty or
            whitespace-only input are treated as "nothing entered".

    Returns:
        tuple[str, str, str]: ``(risk label, confidence line, red-flag lines)``.
        When nothing was entered, the first element is a prompt and the other
        two are empty strings.
    """
    # Guard against None as well as blank input; None has no .strip().
    if text is None or not str(text).strip():
        return "Please enter a message.", "", ""

    text = str(text)
    text_low = text.lower()

    feats = extract_features(text)
    X_new = pd.DataFrame([feats])[feature_cols]
    label = str(model.predict(X_new)[0])
    proba = model.predict_proba(X_new)[0]

    red_flags = []
    if feats["suspicious_count"] >= 2:
        red_flags.append("suspicious keywords detected")
        # Rule-based escalation on top of the model's prediction.
        # "won" is matched as a whole word so that "wonderful" and "won't"
        # no longer trigger it; "click" stays a substring match so that
        # "clicking" / "clicked" still count.
        # NOTE(review): the source this was recovered from had lost its
        # indentation; nesting this override under the keyword check above is
        # the reading assumed here - confirm against the deployed app.
        if re.search(r"\bwon\b(?!['’]t)|click", text_low):
            label = "high"
    if feats["urgency_count"] >= 1:
        red_flags.append("urgency language detected")
    if feats["link_count"] >= 1:
        red_flags.append("link detected")
    if feats["has_money"]:
        red_flags.append("money-related content")
    if feats["caps_ratio"] > 0.3:
        red_flags.append("excessive capital letters")
    if feats["exclamation_count"] >= 3:
        red_flags.append("too many exclamation marks")
    if not red_flags:
        red_flags = ["no obvious red flags"]

    # Report the probability of the label that is actually displayed.
    # predict_proba columns follow model.classes_; taking max(proba) after the
    # override above would show the score of a different class.
    classes = [str(c) for c in model.classes_]
    confidence = proba[classes.index(label)] if label in classes else max(proba)

    risk_labels = {"high": "🔴 HIGH RISK", "medium": "🟠 MEDIUM RISK"}
    summary = risk_labels.get(label, "🟢 LOW RISK")
    confidence_text = f"Confidence: {confidence:.0%}"
    flags_text = "\n".join(f"⚠️ {flag}" for flag in red_flags)
    return summary, confidence_text, flags_text
# ======================
# Interface
# ======================
# One text input, three text outputs, in the same order as the tuple returned
# by analyze_message: risk label, confidence line, red-flag lines.
demo = gr.Interface(
    fn=analyze_message,
    inputs=gr.Textbox(lines=6, placeholder="Paste a suspicious SMS or email here..."),
    outputs=[
        gr.Textbox(label="Risk Assessment"),
        gr.Textbox(label="Confidence"),
        gr.Textbox(label="Scam Indicators"),
    ],
    title="AI Scam Detection Tool",
    description="Assess scam messages using machine learning and risk indicators to support fraud prevention.",
)

# Start the web app. (A stray trailing "|" after this call made the line a
# syntax error and has been removed.)
demo.launch()