Spaces:

ESCP
/

sms-scam-detector

Sleeping

App Files Files Community

sms-scam-detector / app.py

charlotteeeeeeeeeeeee

Update app.py

d4a0860 verified about 1 month ago

raw

history blame contribute delete

3.93 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import re
	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
	from sklearn.ensemble import RandomForestClassifier

	# ======================
	# Load dataset
	# ======================
	# Feature table read from disk; it must provide every column listed in
	# feature_cols plus the "risk_level" label column.
	df = pd.read_csv("scam_features.csv")

	# Ordered list of model inputs; the column order here is the order the
	# classifier is fitted on.
	feature_cols = [
	    "urgency_count",
	    "suspicious_count",
	    "link_count",
	    "has_money",
	    "msg_length",
	    "caps_ratio",
	    "exclamation_count",
	    "sentiment_score",
	]

	X = df[feature_cols]
	y = df["risk_level"]

	# ======================
	# Train model
	# ======================
	# Hyperparameters are collected in one place; random_state is pinned so the
	# forest is fitted with the same seed on every start.
	rf_params = {
	    "n_estimators": 100,
	    "max_depth": 10,
	    "class_weight": "balanced",
	    "random_state": 42,
	}
	model = RandomForestClassifier(**rf_params)
	model.fit(X, y)

	# ======================
	# Feature logic
	# ======================
	# Module-level sentiment analyzer from the vaderSentiment package, created
	# once at import time; extract_features reads its "compound" polarity score.
	analyzer = SentimentIntensityAnalyzer()

	# Lower-case words/phrases counted (as substrings) toward "urgency_count".
	URGENCY_WORDS = [
	    "urgent",
	    "act now",
	    "hurry",
	    "limited time",
	    "expires",
	    "immediately",
	    "today only",
	    "last chance",
	    "asap",
	    "quick",
	    "now",
	]

	# Lower-case words counted (as substrings) toward "suspicious_count".
	SUSPICIOUS_KEYWORDS = [
	    "bank",
	    "password",
	    "verify",
	    "account",
	    "login",
	    "click",
	    "winner",
	    "prize",
	    "congratulations",
	    "claim",
	    "free",
	    "cash",
	    "loan",
	    "refund",
	]

	# Regex fragments; a hit on any one of them sets the "has_money" flag.
	MONEY_PATTERNS = [
	    r"\$",
	    r"€",
	    r"£",
	    r"\bfree\b",
	    r"\bprize\b",
	    r"\bcash\b",
	]

	def extract_features(text):
	    """Convert a raw message into the feature dict consumed by the classifier.

	    Args:
	        text: The message to analyse. It is coerced with ``str()``, so
	            non-string values are accepted.

	    Returns:
	        dict with these keys:
	            urgency_count: number of URGENCY_WORDS entries found in the
	                lower-cased text.
	            suspicious_count: number of SUSPICIOUS_KEYWORDS entries found in
	                the lower-cased text.
	            link_count: number of link-like fragments ("http", "www",
	                ".com", ".net", ".org") in the lower-cased text.
	            has_money: 1 if any MONEY_PATTERNS regex matches, else 0.
	            msg_length: character length of the message.
	            caps_ratio: share of alphabetic characters that are upper-case
	                (0 when the message has no letters).
	            exclamation_count: number of "!" characters.
	            sentiment_score: the analyzer's "compound" polarity score.

	    Note:
	        Keyword checks are plain substring tests, so e.g. "now" also matches
	        inside "know".
	    """
	    text = str(text)
	    text_low = text.lower()

	    letters = [c for c in text if c.isalpha()]
	    # Guard the division: messages without any letters get a ratio of 0.
	    caps = sum(1 for c in letters if c.isupper()) / len(letters) if letters else 0

	    # The alternation must use a bare "|". The earlier pattern escaped it as
	    # "\|", which matches a literal pipe character, so no link was ever counted.
	    link_hits = re.findall(r"(http|www|\.com|\.net|\.org)", text_low)

	    return {
	        "urgency_count": sum(1 for w in URGENCY_WORDS if w in text_low),
	        "suspicious_count": sum(1 for w in SUSPICIOUS_KEYWORDS if w in text_low),
	        "link_count": len(link_hits),
	        "has_money": int(any(re.search(p, text_low) for p in MONEY_PATTERNS)),
	        "msg_length": len(text),
	        "caps_ratio": caps,
	        "exclamation_count": text.count("!"),
	        "sentiment_score": analyzer.polarity_scores(text)["compound"],
	    }

	# ======================
	# Main function
	# ======================
	def analyze_message(text):
	    """Classify a message and list the heuristic red flags it triggers.

	    Args:
	        text: Message text from the UI. ``None``, empty, and
	            whitespace-only values are all treated as "no input".

	    Returns:
	        A tuple of three strings: the risk label, a "Confidence: NN%" line,
	        and the red flags joined by newlines. When there is no input, the
	        tuple is ``("Please enter a message.", "", "")``.
	    """
	    # A missing value must not reach .strip(), which would raise
	    # AttributeError on None.
	    if not text or not text.strip():
	        return "Please enter a message.", "", ""

	    feats = extract_features(text)
	    # Re-select by feature_cols so the column order matches the fitted model.
	    X_new = pd.DataFrame([feats])[feature_cols]

	    pred = model.predict(X_new)[0]
	    proba = model.predict_proba(X_new)[0]
	    confidence = max(proba)

	    text_low = text.lower()

	    red_flags = []
	    if feats["suspicious_count"] >= 2:
	        red_flags.append("suspicious keywords detected")
	        # NOTE(review): the original indentation was lost; this override is
	        # assumed to be nested under the suspicious-keyword check -- confirm.
	        # NOTE(review): `confidence` still reflects the model's own top class,
	        # not this rule-based override.
	        if "won" in text_low or "click" in text_low:
	            pred = "high"
	    if feats["urgency_count"] >= 1:
	        red_flags.append("urgency language detected")
	    if feats["link_count"] >= 1:
	        red_flags.append("link detected")
	    if feats["has_money"]:
	        red_flags.append("money-related content")
	    if feats["caps_ratio"] > 0.3:
	        red_flags.append("excessive capital letters")
	    if feats["exclamation_count"] >= 3:
	        red_flags.append("too many exclamation marks")

	    if not red_flags:
	        red_flags = ["no obvious red flags"]

	    # Any label other than "high"/"medium" is displayed as low risk.
	    if pred == "high":
	        risk_display = "🔴 HIGH RISK"
	    elif pred == "medium":
	        risk_display = "🟠 MEDIUM RISK"
	    else:
	        risk_display = "🟢 LOW RISK"

	    summary = f"{risk_display}"
	    confidence_text = f"Confidence: {confidence:.0%}"
	    flags_text = "\n".join(f"⚠️ {flag}" for flag in red_flags)

	    return summary, confidence_text, flags_text


	# ======================
	# Interface
	# ======================
	# One free-text input box for the message under review.
	message_box = gr.Textbox(
	    lines=6,
	    placeholder="Paste a suspicious SMS or email here...",
	)

	# Three read-only result boxes, in the same order as the tuple returned by
	# analyze_message.
	result_boxes = [
	    gr.Textbox(label=caption)
	    for caption in ("Risk Assessment", "Confidence", "Scam Indicators")
	]

	demo = gr.Interface(
	    fn=analyze_message,
	    inputs=message_box,
	    outputs=result_boxes,
	    title="AI Scam Detection Tool",
	    description="Assess scam messages using machine learning and risk indicators to support fraud prevention.",
	)

	# Start the web app as soon as the script is executed.
	demo.launch()