Update app.py
Browse files
app.py
CHANGED
|
@@ -3,105 +3,146 @@ import torch
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import numpy as np
|
| 5 |
import re
|
| 6 |
-
from sklearn.preprocessing import StandardScaler
|
| 7 |
import gradio as gr
|
|
|
|
| 8 |
|
| 9 |
# ---------------- DEVICE ----------------
|
| 10 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 11 |
|
| 12 |
# ---------------- MODEL DEFINITION ----------------
|
| 13 |
-
class
|
| 14 |
-
def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=
|
| 15 |
super().__init__()
|
| 16 |
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
|
| 17 |
-
self.lstm = nn.LSTM(
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
self.attention_fc = nn.Linear(hidden_dim * 2, 1)
|
| 20 |
-
self.
|
| 21 |
-
self.
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
attn_weights = torch.softmax(self.attention_fc(lstm_out), dim=1)
|
| 27 |
-
context = torch.sum(attn_weights * lstm_out, dim=1)
|
| 28 |
-
context = self.dropout(context)
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
with open("vocab.json", "r") as f:
|
| 38 |
vocab_data = json.load(f)
|
| 39 |
-
word2idx = vocab_data["word2idx"]
|
| 40 |
-
|
| 41 |
|
| 42 |
with open("labels.json", "r") as f:
|
| 43 |
labels = json.load(f)
|
| 44 |
|
| 45 |
with open("scaler.json", "r") as f:
|
| 46 |
scaler_data = json.load(f)
|
|
|
|
| 47 |
scaler = StandardScaler()
|
| 48 |
scaler.mean_ = np.array(scaler_data["mean"])
|
| 49 |
scaler.scale_ = np.array(scaler_data["scale"])
|
| 50 |
|
| 51 |
-
|
| 52 |
-
vocab_size
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
output_dim
|
| 56 |
-
extra_features_dim
|
| 57 |
-
|
| 58 |
-
model = BiLSTMAttention(
|
| 59 |
-
vocab_size=vocab_size,
|
| 60 |
-
embed_dim=embed_dim,
|
| 61 |
-
hidden_dim=hidden_dim,
|
| 62 |
-
output_dim=output_dim,
|
| 63 |
-
num_layers=2,
|
| 64 |
-
dropout=0.3,
|
| 65 |
-
extra_features_dim=extra_features_dim
|
| 66 |
).to(device)
|
| 67 |
|
| 68 |
model.load_state_dict(torch.load("model.pt", map_location=device))
|
| 69 |
model.eval()
|
| 70 |
|
| 71 |
-
# ----------------
|
| 72 |
-
def
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
text_len = len(text)
|
|
|
|
| 81 |
num_digits = sum(c.isdigit() for c in text)
|
| 82 |
num_upper = sum(c.isupper() for c in text)
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
def predict(text):
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
with torch.no_grad():
|
| 92 |
-
logits = model(
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
# ---------------- GRADIO
|
| 97 |
-
|
| 98 |
fn=predict,
|
| 99 |
-
inputs=gr.Textbox(lines=
|
| 100 |
-
outputs=gr.
|
| 101 |
-
title="Scam Message Detector",
|
| 102 |
-
description="Detect
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
)
|
| 104 |
|
| 105 |
if __name__ == "__main__":
|
| 106 |
-
|
| 107 |
-
|
|
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import numpy as np
|
| 5 |
import re
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
+
from sklearn.preprocessing import StandardScaler
|
| 8 |
|
# ---------------- DEVICE ----------------
# Prefer the GPU when one is available; model and inputs are moved here below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
| 11 |
|
| 12 |
# ---------------- MODEL DEFINITION ----------------
|
| 13 |
+
class ImprovedBiLSTMAttention(nn.Module):
|
| 14 |
+
def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=3, dropout=0.4, extra_features_dim=10):
|
| 15 |
super().__init__()
|
| 16 |
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
|
| 17 |
+
self.lstm = nn.LSTM(
|
| 18 |
+
embed_dim,
|
| 19 |
+
hidden_dim,
|
| 20 |
+
num_layers=num_layers,
|
| 21 |
+
bidirectional=True,
|
| 22 |
+
batch_first=True,
|
| 23 |
+
dropout=dropout if num_layers > 1 else 0
|
| 24 |
+
)
|
| 25 |
self.attention_fc = nn.Linear(hidden_dim * 2, 1)
|
| 26 |
+
self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)
|
| 27 |
+
self.dropout1 = nn.Dropout(dropout)
|
| 28 |
+
self.dropout2 = nn.Dropout(dropout * 0.5)
|
| 29 |
|
| 30 |
+
self.fc1 = nn.Linear(hidden_dim * 2 + extra_features_dim, hidden_dim)
|
| 31 |
+
self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
|
| 32 |
+
self.fc3 = nn.Linear(hidden_dim // 2, output_dim)
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
self.relu = nn.ReLU()
|
| 35 |
+
self.layer_norm = nn.LayerNorm(hidden_dim)
|
| 36 |
|
| 37 |
+
def forward(self, x, extra_features):
|
| 38 |
+
embedded = self.embedding(x)
|
| 39 |
+
lstm_out, _ = self.lstm(embedded)
|
| 40 |
+
attn = torch.softmax(self.attention_fc(lstm_out), dim=1)
|
| 41 |
+
context = torch.sum(attn * lstm_out, dim=1)
|
| 42 |
+
context = self.batch_norm(context)
|
| 43 |
+
context = self.dropout1(context)
|
| 44 |
+
context = torch.cat([context, extra_features], dim=1)
|
| 45 |
+
|
| 46 |
+
x = self.relu(self.fc1(context))
|
| 47 |
+
x = self.layer_norm(x)
|
| 48 |
+
x = self.dropout2(x)
|
| 49 |
+
x = self.relu(self.fc2(x))
|
| 50 |
+
x = self.dropout2(x)
|
| 51 |
+
return self.fc3(x)
|
| 52 |
+
|
| 53 |
+
# ---------------- LOAD FILES ----------------
|
| 54 |
with open("vocab.json", "r") as f:
|
| 55 |
vocab_data = json.load(f)
|
| 56 |
+
word2idx = vocab_data["word2idx"]
|
| 57 |
+
MAX_LEN = vocab_data["max_len"]
|
| 58 |
|
| 59 |
with open("labels.json", "r") as f:
|
| 60 |
labels = json.load(f)
|
| 61 |
|
| 62 |
with open("scaler.json", "r") as f:
|
| 63 |
scaler_data = json.load(f)
|
| 64 |
+
|
| 65 |
scaler = StandardScaler()
|
| 66 |
scaler.mean_ = np.array(scaler_data["mean"])
|
| 67 |
scaler.scale_ = np.array(scaler_data["scale"])
|
| 68 |
|
| 69 |
+
model = ImprovedBiLSTMAttention(
|
| 70 |
+
vocab_size=len(word2idx) + 1,
|
| 71 |
+
embed_dim=200,
|
| 72 |
+
hidden_dim=256,
|
| 73 |
+
output_dim=len(labels),
|
| 74 |
+
extra_features_dim=10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
).to(device)
|
| 76 |
|
| 77 |
model.load_state_dict(torch.load("model.pt", map_location=device))
|
| 78 |
model.eval()
|
| 79 |
|
# ---------------- TEXT PREPROCESS ----------------
def clean_text(text):
    """Lowercase *text*, drop unwanted punctuation, collapse whitespace.

    Word characters, whitespace and the scam-salient symbols ! ? $ % are
    kept; everything else becomes a space, then runs of whitespace are
    squeezed and the result is stripped.
    """
    lowered = text.lower()
    kept = re.sub(r'[^\w\s!?$%]', ' ', lowered)
    return re.sub(r'\s+', ' ', kept).strip()
| 86 |
+
|
| 87 |
+
def tokenize(text):
|
| 88 |
+
words = clean_text(text).split()
|
| 89 |
+
seq = [word2idx.get(w, 0) for w in words][:MAX_LEN]
|
| 90 |
+
seq += [0] * (MAX_LEN - len(seq))
|
| 91 |
+
return torch.tensor([seq], dtype=torch.long)
|
def extract_features(text):
    """Build the 10 hand-crafted features for *text* as a (1, 10) float tensor.

    Features: raw counts (chars, words, digits, uppercase, special chars),
    four smoothed ratios, and a scam-keyword hit count — scaled with the
    StandardScaler reconstructed at module load.
    """
    n_chars = len(text)
    n_words = len(text.split())
    digits = sum(ch.isdigit() for ch in text)
    uppers = sum(ch.isupper() for ch in text)
    specials = sum((not ch.isalnum()) and (not ch.isspace()) for ch in text)

    lowered = text.lower()
    scam_keywords = ("urgent", "click", "free", "winner", "prize",
                     "claim", "verify", "bank", "password")
    keyword_hits = sum(kw in lowered for kw in scam_keywords)

    # Raw counts plus ratios; the +1 denominators avoid division by zero.
    raw = [
        n_chars,
        n_words,
        digits,
        uppers,
        specials,
        n_chars / (n_words + 1),
        digits / (n_chars + 1),
        uppers / (n_chars + 1),
        specials / (n_chars + 1),
        keyword_hits,
    ]
    scaled = scaler.transform(np.array([raw]))
    return torch.tensor(scaled, dtype=torch.float)
| 118 |
+
|
| 119 |
+
# ---------------- PREDICTION ----------------
|
| 120 |
def predict(text):
|
| 121 |
+
tokens = tokenize(text).to(device)
|
| 122 |
+
features = extract_features(text).to(device)
|
| 123 |
+
|
| 124 |
with torch.no_grad():
|
| 125 |
+
logits = model(tokens, features)
|
| 126 |
+
probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
|
| 127 |
+
idx = int(np.argmax(probs))
|
| 128 |
+
|
| 129 |
+
return {
|
| 130 |
+
"Prediction": labels[idx],
|
| 131 |
+
"Confidence": f"{probs[idx]*100:.2f}%"
|
| 132 |
+
}
|
| 133 |
|
| 134 |
+
# ---------------- GRADIO UI ----------------
|
| 135 |
+
app = gr.Interface(
|
| 136 |
fn=predict,
|
| 137 |
+
inputs=gr.Textbox(lines=5, placeholder="Paste SMS / Email / Message here"),
|
| 138 |
+
outputs=gr.JSON(),
|
| 139 |
+
title="🚨 AI Scam Message Detector",
|
| 140 |
+
description="Detect scam vs legitimate messages using BiLSTM + Attention + NLP features",
|
| 141 |
+
examples=[
|
| 142 |
+
["Urgent! Your bank account is suspended. Click here to verify now"],
|
| 143 |
+
["Hey, are we meeting tomorrow at 5 PM?"]
|
| 144 |
+
]
|
| 145 |
)
|
| 146 |
|
| 147 |
if __name__ == "__main__":
|
| 148 |
+
app.launch()
|
|
|