|
|
import torch |
|
|
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification |
|
|
import gradio as gr |
|
|
import re |
|
|
import nltk |
|
|
from nltk.tokenize import word_tokenize |
|
|
from nltk.corpus import stopwords |
|
|
from nltk.stem import WordNetLemmatizer |
|
|
|
|
|
|
|
|
# Fetch the NLTK resources that the preprocessing step relies on:
# the Punkt tokenizer tables, the stop-word lists and the WordNet corpus.
# These calls run at import time, every time the module is loaded.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words, held in a set so membership tests are O(1).
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()
|
|
|
|
|
def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps, in order:

    1. Drop every character that is not an ASCII letter or whitespace.
    2. Remove tokens that start with ``http``/``https``/``www``.
    3. Collapse whitespace runs to a single space and trim the ends.
    4. Lower-case the text.
    5. Tokenise, drop English stop words, and lemmatise each token.

    Args:
        text: The raw input string. ``None`` and the empty string are
            accepted and produce an empty result.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when nothing
        is left after cleaning.
    """
    # Guard against None / empty input so re.sub() never sees a non-string.
    if not text:
        return ""

    # NOTE(review): punctuation is stripped *before* URL removal, so a URL
    # such as "https://a.com" has already become "httpsacom" by the time the
    # URL pattern runs, and the match is case-sensitive because lower-casing
    # happens afterwards. The order is kept as-is because the classifier was
    # presumably trained on text cleaned this way -- confirm before changing.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()

    # Nothing survived cleaning: skip tokenisation. The result is the same
    # empty string that tokenising and re-joining would produce.
    if not text:
        return ""

    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
|
|
|
|
|
|
|
|
# Tokenizer and two-label classification head built on the base uncased
# DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Overwrite the weights with the local checkpoint, mapped onto the CPU.
# NOTE(review): torch.load() unpickles the file, so "best_model.pth" must
# come from a trusted source; consider weights_only=True if the installed
# torch version supports it.
model.load_state_dict(
    torch.load("best_model.pth", map_location=torch.device("cpu"))
)

# Switch to evaluation mode for inference.
model.eval()
|
|
|
|
|
|
|
|
# Maps the model's output index to a human-readable class name.
# NOTE(review): this must match the label encoding used when the checkpoint
# was fine-tuned -- confirm against the training code.
idx2label = {
    0: "phishing",
    1: "legitimate",
}
|
|
|
|
|
|
|
|
def predict(text):
    """Classify a message and return per-label probabilities.

    The text is cleaned with ``preprocess_text``, tokenised (truncated to
    128 tokens) and run through the model without gradient tracking. The
    logits are turned into probabilities with a softmax.

    Args:
        text: The raw message or account description to classify.

    Returns:
        A dict mapping each label name in ``idx2label`` to its probability,
        rounded to four decimal places.
    """
    clean_text = preprocess_text(text)
    inputs = tokenizer(
        clean_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    with torch.no_grad():
        outputs = model(**inputs)
        # Take the single row of the batch and convert it to plain Python
        # floats. Rounding a float32 value and widening it afterwards
        # re-introduces trailing noise (e.g. 0.9876999855), so the values
        # are converted first and rounded second.
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0].tolist()

    # Drive the output from the label table rather than a hard-coded range.
    return {label: round(probs[index], 4) for index, label in idx2label.items()}
|
|
|
|
|
|
|
|
# Gradio UI: a multi-line text box feeding predict(), with the returned
# label/probability dict rendered by a Label component.
interface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        lines=4,
        placeholder="Enter a suspicious message or account description...",
    ),
    outputs=gr.Label(num_top_classes=2),
    title="🛡️ Phishing Account Detector",
    description="Detects whether an account or message is likely phishing or legitimate using a custom DistilBERT model.",
)
|
|
|
|
|
# Start the web UI only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    interface.launch()
|
|
|