# charlotteeeeeeeeeeeee's picture
# Update app.py
# d4a0860 verified
import gradio as gr
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
# ======================
# Load dataset
# ======================
# Feature table read from disk; it must contain every column named in
# `feature_cols` plus the `risk_level` label column.
df = pd.read_csv("scam_features.csv")

# The same list is used to select training columns here and to order the
# single-row frame built in analyze_message before prediction.
feature_cols = [
    "urgency_count",
    "suspicious_count",
    "link_count",
    "has_money",
    "msg_length",
    "caps_ratio",
    "exclamation_count",
    "sentiment_score",
]

# Feature matrix and target labels.
X, y = df[feature_cols], df["risk_level"]
# ======================
# Train model
# ======================
# Built and fitted once when the module is loaded. `fit` returns the
# estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
).fit(X, y)
# ======================
# Feature logic
# ======================
# Shared VADER sentiment analyzer; extract_features reads the "compound"
# value of its polarity scores for each message.
analyzer = SentimentIntensityAnalyzer()
# Phrases looked up as substrings of the lower-cased message; each phrase
# found adds one to "urgency_count".
URGENCY_WORDS = [
    "urgent",
    "act now",
    "hurry",
    "limited time",
    "expires",
    "immediately",
    "today only",
    "last chance",
    "asap",
    "quick",
    "now",
]

# Keywords looked up the same way; each one found adds one to
# "suspicious_count".
SUSPICIOUS_KEYWORDS = [
    "bank",
    "password",
    "verify",
    "account",
    "login",
    "click",
    "winner",
    "prize",
    "congratulations",
    "claim",
    "free",
    "cash",
    "loan",
    "refund",
]

# Regular expressions searched in the lower-cased message; any match sets
# "has_money" to 1.
MONEY_PATTERNS = [
    r"\$",
    r"€",
    r"£",
    r"\bfree\b",
    r"\bprize\b",
    r"\bcash\b",
]
def extract_features(text):
    """Turn one message into a dict of numeric features.

    The input is coerced with ``str``. Keyword, link and money checks run on
    the lower-cased text; length, capitalisation, exclamation marks and the
    VADER compound score use the text as given.

    Returns:
        A dict with the keys ``urgency_count``, ``suspicious_count``,
        ``link_count``, ``has_money``, ``msg_length``, ``caps_ratio``,
        ``exclamation_count`` and ``sentiment_score``.
    """
    message = str(text)
    lowered = message.lower()

    # Share of alphabetic characters that are upper-case; 0 if there are
    # no alphabetic characters at all.
    alpha_total = 0
    upper_total = 0
    for ch in message:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1
    caps_ratio = upper_total / alpha_total if alpha_total else 0

    # Each list entry counts at most once, however often it occurs.
    urgency_hits = [phrase for phrase in URGENCY_WORDS if phrase in lowered]
    suspicious_hits = [word for word in SUSPICIOUS_KEYWORDS if word in lowered]

    # Every fragment match counts, so a single URL can contribute several.
    link_fragments = re.findall(r"(http|www|\.com|\.net|\.org)", lowered)

    money_found = False
    for pattern in MONEY_PATTERNS:
        if re.search(pattern, lowered):
            money_found = True
            break

    sentiment = analyzer.polarity_scores(message)

    return {
        "urgency_count": len(urgency_hits),
        "suspicious_count": len(suspicious_hits),
        "link_count": len(link_fragments),
        "has_money": int(money_found),
        "msg_length": len(message),
        "caps_ratio": caps_ratio,
        "exclamation_count": message.count("!"),
        "sentiment_score": sentiment["compound"],
    }
# ======================
# Main function
# ======================
def analyze_message(text):
    """Classify a message and list the scam indicators found in it.

    The label comes from the trained model, except that a keyword rule
    ("won" as a whole word, or "click" anywhere) forces it to "high".

    Args:
        text: Message entered by the user. ``None`` and empty or
            whitespace-only input are treated as "nothing entered".

    Returns:
        A 3-tuple of strings: the risk summary, a confidence line, and the
        detected indicators (one per line). For empty input the tuple is
        ``("Please enter a message.", "", "")``.
    """
    # Guard against None as well as blank strings so the callback never
    # raises AttributeError on a missing value.
    if text is None or not str(text).strip():
        return "Please enter a message.", "", ""

    text = str(text)
    lowered = text.lower()

    feats = extract_features(text)
    X_new = pd.DataFrame([feats])[feature_cols]
    pred = model.predict(X_new)[0]
    proba = model.predict_proba(X_new)[0]

    red_flags = []
    if feats["suspicious_count"] >= 2:
        red_flags.append("suspicious keywords detected")

    # Rule-based override. "won" must be a whole word and not the start of
    # "won't", so "wonderful" / "won't" no longer force a high rating;
    # "click" stays a substring match so "clicked" / "clicking" still count.
    if re.search(r"\bwon\b(?!['’]t\b)", lowered) or "click" in lowered:
        pred = "high"
        # Surface the reason, otherwise the result could read
        # "HIGH RISK" next to "no obvious red flags".
        red_flags.append("high-risk keyword detected")

    # Report the model's probability for the label that is actually shown.
    # predict_proba columns follow model.classes_; without an override this
    # is the same value as max(proba).
    classes = model.classes_.tolist()
    confidence = proba[classes.index(pred)] if pred in classes else max(proba)

    indicator_checks = (
        (feats["urgency_count"] >= 1, "urgency language detected"),
        (feats["link_count"] >= 1, "link detected"),
        (feats["has_money"], "money-related content"),
        (feats["caps_ratio"] > 0.3, "excessive capital letters"),
        (feats["exclamation_count"] >= 3, "too many exclamation marks"),
    )
    red_flags.extend(message for hit, message in indicator_checks if hit)

    if not red_flags:
        red_flags = ["no obvious red flags"]

    if pred == "high":
        risk_display = "🔴 HIGH RISK"
    elif pred == "medium":
        risk_display = "🟠 MEDIUM RISK"
    else:
        risk_display = "🟢 LOW RISK"

    confidence_text = f"Confidence: {confidence:.0%}"
    flags_text = "\n".join(f"⚠️ {flag}" for flag in red_flags)
    return risk_display, confidence_text, flags_text
# ======================
# Interface
# ======================
# One multi-line input box feeds analyze_message; the three strings it
# returns fill the three labelled output boxes in order.
demo = gr.Interface(
    fn=analyze_message,
    inputs=gr.Textbox(
        lines=6,
        placeholder="Paste a suspicious SMS or email here...",
    ),
    outputs=[
        gr.Textbox(label=caption)
        for caption in ("Risk Assessment", "Confidence", "Scam Indicators")
    ],
    title="AI Scam Detection Tool",
    description="Assess scam messages using machine learning and risk indicators to support fraud prevention.",
)
demo.launch()