Update app.py
Browse files
app.py
CHANGED
|
@@ -3,105 +3,146 @@ import torch
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import numpy as np
|
| 5 |
import re
|
| 6 |
-
from sklearn.preprocessing import StandardScaler
|
| 7 |
import gradio as gr
|
|
|
|
| 8 |
|
| 9 |
# ---------------- DEVICE ----------------
|
| 10 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 11 |
|
| 12 |
# ---------------- MODEL DEFINITION ----------------
|
| 13 |
-
class
|
| 14 |
-
def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=
|
| 15 |
super().__init__()
|
| 16 |
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
|
| 17 |
-
self.lstm = nn.LSTM(
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
self.attention_fc = nn.Linear(hidden_dim * 2, 1)
|
| 20 |
-
self.
|
| 21 |
-
self.
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
attn_weights = torch.softmax(self.attention_fc(lstm_out), dim=1)
|
| 27 |
-
context = torch.sum(attn_weights * lstm_out, dim=1)
|
| 28 |
-
context = self.dropout(context)
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
with open("vocab.json", "r") as f:
|
| 38 |
vocab_data = json.load(f)
|
| 39 |
-
word2idx = vocab_data["word2idx"]
|
| 40 |
-
|
| 41 |
|
| 42 |
with open("labels.json", "r") as f:
|
| 43 |
labels = json.load(f)
|
| 44 |
|
| 45 |
with open("scaler.json", "r") as f:
|
| 46 |
scaler_data = json.load(f)
|
|
|
|
| 47 |
scaler = StandardScaler()
|
| 48 |
scaler.mean_ = np.array(scaler_data["mean"])
|
| 49 |
scaler.scale_ = np.array(scaler_data["scale"])
|
| 50 |
|
| 51 |
-
|
| 52 |
-
vocab_size
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
output_dim
|
| 56 |
-
extra_features_dim
|
| 57 |
-
|
| 58 |
-
model = BiLSTMAttention(
|
| 59 |
-
vocab_size=vocab_size,
|
| 60 |
-
embed_dim=embed_dim,
|
| 61 |
-
hidden_dim=hidden_dim,
|
| 62 |
-
output_dim=output_dim,
|
| 63 |
-
num_layers=2,
|
| 64 |
-
dropout=0.3,
|
| 65 |
-
extra_features_dim=extra_features_dim
|
| 66 |
).to(device)
|
| 67 |
|
| 68 |
model.load_state_dict(torch.load("model.pt", map_location=device))
|
| 69 |
model.eval()
|
| 70 |
|
| 71 |
-
# ----------------
|
| 72 |
-
def
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
text_len = len(text)
|
|
|
|
| 81 |
num_digits = sum(c.isdigit() for c in text)
|
| 82 |
num_upper = sum(c.isupper() for c in text)
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
def predict(text):
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
with torch.no_grad():
|
| 92 |
-
logits = model(
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
# ---------------- GRADIO
|
| 97 |
-
|
| 98 |
fn=predict,
|
| 99 |
-
inputs=gr.Textbox(lines=
|
| 100 |
-
outputs=gr.
|
| 101 |
-
title="Scam Message Detector",
|
| 102 |
-
description="Detect
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
)
|
| 104 |
|
| 105 |
if __name__ == "__main__":
|
| 106 |
-
|
| 107 |
-
|
|
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import numpy as np
|
| 5 |
import re
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
+
from sklearn.preprocessing import StandardScaler
|
| 8 |
|
# ---------------- DEVICE ----------------
# Prefer the GPU when one is available; model and inputs are moved here below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
| 11 |
|
| 12 |
# ---------------- MODEL DEFINITION ----------------
|
| 13 |
+
class ImprovedBiLSTMAttention(nn.Module):
|
| 14 |
+
def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=3, dropout=0.4, extra_features_dim=10):
|
| 15 |
super().__init__()
|
| 16 |
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
|
| 17 |
+
self.lstm = nn.LSTM(
|
| 18 |
+
embed_dim,
|
| 19 |
+
hidden_dim,
|
| 20 |
+
num_layers=num_layers,
|
| 21 |
+
bidirectional=True,
|
| 22 |
+
batch_first=True,
|
| 23 |
+
dropout=dropout if num_layers > 1 else 0
|
| 24 |
+
)
|
| 25 |
self.attention_fc = nn.Linear(hidden_dim * 2, 1)
|
| 26 |
+
self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)
|
| 27 |
+
self.dropout1 = nn.Dropout(dropout)
|
| 28 |
+
self.dropout2 = nn.Dropout(dropout * 0.5)
|
| 29 |
|
| 30 |
+
self.fc1 = nn.Linear(hidden_dim * 2 + extra_features_dim, hidden_dim)
|
| 31 |
+
self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
|
| 32 |
+
self.fc3 = nn.Linear(hidden_dim // 2, output_dim)
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
self.relu = nn.ReLU()
|
| 35 |
+
self.layer_norm = nn.LayerNorm(hidden_dim)
|
| 36 |
|
| 37 |
+
def forward(self, x, extra_features):
|
| 38 |
+
embedded = self.embedding(x)
|
| 39 |
+
lstm_out, _ = self.lstm(embedded)
|
| 40 |
+
attn = torch.softmax(self.attention_fc(lstm_out), dim=1)
|
| 41 |
+
context = torch.sum(attn * lstm_out, dim=1)
|
| 42 |
+
context = self.batch_norm(context)
|
| 43 |
+
context = self.dropout1(context)
|
| 44 |
+
context = torch.cat([context, extra_features], dim=1)
|
| 45 |
+
|
| 46 |
+
x = self.relu(self.fc1(context))
|
| 47 |
+
x = self.layer_norm(x)
|
| 48 |
+
x = self.dropout2(x)
|
| 49 |
+
x = self.relu(self.fc2(x))
|
| 50 |
+
x = self.dropout2(x)
|
| 51 |
+
return self.fc3(x)
|
| 52 |
+
|
| 53 |
+
# ---------------- LOAD FILES ----------------
|
| 54 |
with open("vocab.json", "r") as f:
|
| 55 |
vocab_data = json.load(f)
|
| 56 |
+
word2idx = vocab_data["word2idx"]
|
| 57 |
+
MAX_LEN = vocab_data["max_len"]
|
| 58 |
|
| 59 |
with open("labels.json", "r") as f:
|
| 60 |
labels = json.load(f)
|
| 61 |
|
| 62 |
with open("scaler.json", "r") as f:
|
| 63 |
scaler_data = json.load(f)
|
| 64 |
+
|
| 65 |
scaler = StandardScaler()
|
| 66 |
scaler.mean_ = np.array(scaler_data["mean"])
|
| 67 |
scaler.scale_ = np.array(scaler_data["scale"])
|
| 68 |
|
| 69 |
+
model = ImprovedBiLSTMAttention(
|
| 70 |
+
vocab_size=len(word2idx) + 1,
|
| 71 |
+
embed_dim=200,
|
| 72 |
+
hidden_dim=256,
|
| 73 |
+
output_dim=len(labels),
|
| 74 |
+
extra_features_dim=10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
).to(device)
|
| 76 |
|
| 77 |
model.load_state_dict(torch.load("model.pt", map_location=device))
|
| 78 |
model.eval()
|
| 79 |
|
# ---------------- TEXT PREPROCESS ----------------
def clean_text(text):
    """Lowercase *text*, drop unwanted punctuation, collapse whitespace.

    Word characters, whitespace and the scam-salient symbols ! ? $ % are
    kept; everything else becomes a space, then runs of whitespace are
    squeezed and the result is stripped.
    """
    lowered = text.lower()
    kept = re.sub(r'[^\w\s!?$%]', ' ', lowered)
    return re.sub(r'\s+', ' ', kept).strip()
| 86 |
+
|
| 87 |
+
def tokenize(text):
|
| 88 |
+
words = clean_text(text).split()
|
| 89 |
+
seq = [word2idx.get(w, 0) for w in words][:MAX_LEN]
|
| 90 |
+
seq += [0] * (MAX_LEN - len(seq))
|
| 91 |
+
return torch.tensor([seq], dtype=torch.long)
|
def extract_features(text):
    """Build the 10 hand-crafted features for *text* as a (1, 10) float tensor.

    Features: raw counts (chars, words, digits, uppercase, special chars),
    four smoothed ratios, and a scam-keyword hit count — scaled with the
    StandardScaler reconstructed at module load.
    """
    n_chars = len(text)
    n_words = len(text.split())
    digits = sum(ch.isdigit() for ch in text)
    uppers = sum(ch.isupper() for ch in text)
    specials = sum((not ch.isalnum()) and (not ch.isspace()) for ch in text)

    lowered = text.lower()
    scam_keywords = ("urgent", "click", "free", "winner", "prize",
                     "claim", "verify", "bank", "password")
    keyword_hits = sum(kw in lowered for kw in scam_keywords)

    # Raw counts plus ratios; the +1 denominators avoid division by zero.
    raw = [
        n_chars,
        n_words,
        digits,
        uppers,
        specials,
        n_chars / (n_words + 1),
        digits / (n_chars + 1),
        uppers / (n_chars + 1),
        specials / (n_chars + 1),
        keyword_hits,
    ]
    scaled = scaler.transform(np.array([raw]))
    return torch.tensor(scaled, dtype=torch.float)
| 118 |
+
|
| 119 |
+
# ---------------- PREDICTION ----------------
|
| 120 |
def predict(text):
|
| 121 |
+
tokens = tokenize(text).to(device)
|
| 122 |
+
features = extract_features(text).to(device)
|
| 123 |
+
|
| 124 |
with torch.no_grad():
|
| 125 |
+
logits = model(tokens, features)
|
| 126 |
+
probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
|
| 127 |
+
idx = int(np.argmax(probs))
|
| 128 |
+
|
| 129 |
+
return {
|
| 130 |
+
"Prediction": labels[idx],
|
| 131 |
+
"Confidence": f"{probs[idx]*100:.2f}%"
|
| 132 |
+
}
|
| 133 |
|
| 134 |
+
# ---------------- GRADIO UI ----------------
|
| 135 |
+
app = gr.Interface(
|
| 136 |
fn=predict,
|
| 137 |
+
inputs=gr.Textbox(lines=5, placeholder="Paste SMS / Email / Message here"),
|
| 138 |
+
outputs=gr.JSON(),
|
| 139 |
+
title="🚨 AI Scam Message Detector",
|
| 140 |
+
description="Detect scam vs legitimate messages using BiLSTM + Attention + NLP features",
|
| 141 |
+
examples=[
|
| 142 |
+
["Urgent! Your bank account is suspended. Click here to verify now"],
|
| 143 |
+
["Hey, are we meeting tomorrow at 5 PM?"]
|
| 144 |
+
]
|
| 145 |
)
|
| 146 |
|
| 147 |
if __name__ == "__main__":
|
| 148 |
+
app.launch()
|
|
|