Spaces:

nijatmammadov
/

pda

Sleeping

App Files Files Community

nijatmammadov committed on Apr 22, 2025

Commit

c54099a

1 Parent(s): 018e8fc

initial cmmt

Browse files

Files changed (8) hide show

app.py +77 -0
evaluate.py +37 -0
load_data.py +9 -0
main.py +230 -0
model.pth +3 -0
model.py +34 -0
preprocess_data.py +30 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gdown
+import torch
+from fastapi import FastAPI
+from transformers import AutoModel, BertTokenizerFast
+from pydantic import BaseModel
+from model import BERT_Arch
+from preprocess_data import remove_html,remove_links
+import gradio as gr
+# Request body schema accepted by the POST /predict/ endpoint.
+class TextRequest(BaseModel):
+    # Raw text to classify; predict() strips HTML and links from it before tokenizing.
+    text: str
+# Download model from Google Drive
+#link:https://drive.google.com/drive/folders/102UPd446eHCCENR58EC3UxnJfcYkBa8U?usp=sharing
+model_url = "https://drive.google.com/uc?id=16ZWVa0d2V0T3s11Oq86rLOTA6bOR0DnR"
+model_path = "model.pth"
+gdown.download(model_url, model_path, quiet=False)
+# Load pre-trained BERT model
+bert = AutoModel.from_pretrained("bert-base-uncased")
+for param in bert.parameters():
+    param.requires_grad = False
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load custom model
+model = BERT_Arch(bert)
+model.load_state_dict(torch.load(model_path, map_location=device))
+model.to(device)
+model.eval()
+# Load tokenizer
+tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+# Initialize FastAPI
+app = FastAPI()
+@app.get("/")
+def home():
+    return {"message": "Phishing Detection API is running!"}
+@app.post("/predict/")
+def predict(request: TextRequest):
+    try:
+        text = request.text.strip()
+        # Preprocess text
+        text = remove_html(text)
+        text = remove_links(text)
+        # Tokenize input text
+        tokens = tokenizer(
+            text, return_tensors="pt", truncation=True, padding="max_length", max_length=512
+        )
+        input_ids = tokens["input_ids"].to(device)
+        attention_mask = tokens["attention_mask"].to(device)
+        # Perform inference
+        with torch.no_grad():
+            output = model(input_ids, attention_mask)
+        prediction = torch.argmax(output.cpu(), dim=1).item()
+        return {"prediction": "Phishing" if prediction == 1 else "Not Phishing"}
+    except Exception as e:
+        return {"error": str(e)}
+def greet(name):
+    return "Hello " + name + "!"
+gr.Interface(
+    fn=greet,
+    inputs="text",
+    outputs="text",
+    allow_flagging="never"
+).launch()

evaluate.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from preprocess_data import remove_html,remove_links
+import torch
+from model import BERT_Arch
+from transformers import AutoModel,BertTokenizerFast
+import numpy as np
+def evaluate(data,device):
+    bert = AutoModel.from_pretrained('bert-base-uncased')
+    data = [data]
+    for param in bert.parameters():
+        param.requires_grad = False
+    model = BERT_Arch(bert)
+    map_location = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model.load_state_dict(torch.load("/content/model.pth", weights_only=True,map_location=map_location))
+    model = model.to(device)
+    data = [remove_html(i) for i in data]
+    data = [remove_links(i) for i in data]
+    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+    tokenized = tokenizer.batch_encode_plus(data,
+                                            max_length = 25,
+                                            pad_to_max_length=True,
+                                            truncation=True
+                                            )
+    tokenized_seq = torch.tensor(tokenized['input_ids'])
+    tokenized_mask = torch.tensor(tokenized['attention_mask'])
+    with torch.no_grad():
+        preds = model(tokenized_seq.to(device), tokenized_mask.to(device))
+        preds = preds.detach().cpu().numpy()
+    pred = np.argmax(preds, axis = 1)
+    return pred

load_data.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from datasets import load_dataset
+def load_dataset_():
+    dataset_id ="huynq3Cyradar/Phishing_Detection_Dataset"
+    dataset = load_dataset(dataset_id)
+    return dataset

main.py ADDED Viewed

	@@ -0,0 +1,230 @@

+from preprocess_data import preprocess
+from load_data import load_dataset_
+from model import BERT_Arch
+import pandas as pd
+import numpy as np
+import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch import nn
+from torch.optim import AdamW
+from sklearn.utils.class_weight import compute_class_weight
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+from imblearn.under_sampling import RandomUnderSampler
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
+import transformers
+from transformers import AutoModel, BertTokenizerFast
+from transformers import AutoTokenizer
+# texts, labels = preprocess()
+dataset = load_dataset_()
+texts,labels = preprocess(dataset)
+df = pd.DataFrame({"texts":texts, "labels":labels})
+df = df.iloc[:-40000][["texts","labels"]]
+rus = RandomUnderSampler(random_state=42)
+X_res, y_res = rus.fit_resample(pd.DataFrame(df['texts']), pd.DataFrame(df['labels']))
+train_text, temp_text, train_labels, temp_labels = train_test_split(X_res,y_res,
+                                                                    random_state=2018,
+                                                                    test_size=0.3,
+                                                                    stratify=y_res)
+val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
+                                                                random_state=2018,
+                                                                test_size=0.5,
+                                                                stratify=temp_labels)
+bert = AutoModel.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+tokens_train = tokenizer.batch_encode_plus(
+    train_text['texts'].tolist(),
+    max_length = 25,
+    pad_to_max_length=True,
+    truncation=True
+)
+tokens_val = tokenizer.batch_encode_plus(
+    val_text['texts'].tolist(),
+    max_length = 25,
+    pad_to_max_length=True,
+    truncation=True
+)
+tokens_test = tokenizer.batch_encode_plus(
+    test_text['texts'].tolist(),
+    max_length = 25,
+    pad_to_max_length=True,
+    truncation=True
+)
+train_seq = torch.tensor(tokens_train['input_ids'])
+train_mask = torch.tensor(tokens_train['attention_mask'])
+train_y = torch.tensor(train_labels['labels'].tolist())
+val_seq = torch.tensor(tokens_val['input_ids'])
+val_mask = torch.tensor(tokens_val['attention_mask'])
+val_y = torch.tensor(val_labels['labels'].tolist())
+test_seq = torch.tensor(tokens_test['input_ids'])
+test_mask = torch.tensor(tokens_test['attention_mask'])
+test_y = torch.tensor(test_labels['labels'].tolist())
+batch_size = 32
+train_data = TensorDataset(train_seq, train_mask, train_y)
+train_sampler = RandomSampler(train_data)
+train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
+val_data = TensorDataset(val_seq, val_mask, val_y)
+val_sampler = SequentialSampler(val_data)
+val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)
+for param in bert.parameters():
+    param.requires_grad = False
+model = BERT_Arch(bert)
+model = model.to(device)
+optimizer = AdamW(model.parameters(),lr = 1e-5)
+class_weights = compute_class_weight("balanced",classes = np.unique(train_labels),y =train_labels['labels'] )
+weights= torch.tensor(class_weights,dtype=torch.float)
+weights = weights.to(device)
+cross_entropy  = nn.NLLLoss(weight=weights)
+epochs = 10
+def train():
+    model.train()
+    total_loss, total_accuracy = 0, 0
+    total_preds=[]
+    for step,batch in enumerate(train_dataloader):
+        if step % 50 == 0 and not step == 0:
+            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
+        batch = [r.to(device) for r in batch]
+        sent_id, mask, labels = batch
+        model.zero_grad()
+        preds = model(sent_id, mask)
+        loss = cross_entropy(preds, labels)
+        total_loss = total_loss + loss.item()
+        loss.backward()
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        optimizer.step()
+        preds=preds.detach().cpu().numpy()
+    total_preds.append(preds)
+    avg_loss = total_loss / len(train_dataloader)
+    total_preds  = np.concatenate(total_preds, axis=0)
+    return avg_loss, total_preds
+def evaluate():
+    print("\nEvaluating...")
+    model.eval()
+    total_loss, total_accuracy = 0, 0
+    total_preds = []
+    for step,batch in enumerate(val_dataloader):
+        batch = [t.to(device) for t in batch]
+        sent_id, mask, labels = batch
+        with torch.no_grad():
+            preds = model(sent_id, mask)
+            loss = cross_entropy(preds,labels)
+            total_loss = total_loss + loss.item()
+            preds = preds.detach().cpu().numpy()
+            total_preds.append(preds)
+    avg_loss = total_loss / len(val_dataloader)
+    total_preds  = np.concatenate(total_preds, axis=0)
+    return avg_loss, total_preds
+best_valid_loss = float('inf')
+epochs = 50
+train_losses=[]
+valid_losses=[]
+for epoch in range(epochs):
+    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
+    train_loss, _ = train()
+    valid_loss, _ = evaluate()
+    if valid_loss < best_valid_loss:
+        best_valid_loss = valid_loss
+        torch.save(model.state_dict(), 'saved_weights.pt')
+    train_losses.append(train_loss)
+    valid_losses.append(valid_loss)
+    print(f'\nTraining Loss: {train_loss:.3f}')
+    print(f'Validation Loss: {valid_loss:.3f}')
+# get predictions for test data
+with torch.no_grad():
+    preds = model(test_seq.to(device), test_mask.to(device))
+    preds = preds.detach().cpu().numpy()
+# model's performance
+preds = np.argmax(preds, axis = 1)
+print(classification_report(test_y, preds))
+torch.save(model.state_dict(),'model.pth')

model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43e1830a4a951434b7b91356e84d598e0f13f6bd3060cb99e98584330160455a
+size 439576894

model.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from torch import nn
+class BERT_Arch(nn.Module):
+    def __init__(self, bert):
+        super(BERT_Arch, self).__init__()
+        self.bert = bert
+        self.dropout = nn.Dropout(0.1)
+        self.relu =  nn.ReLU()
+        self.fc1 = nn.Linear(768,512)
+        self.fc2 = nn.Linear(512,2)
+        self.softmax = nn.LogSoftmax(dim=1)
+    def forward(self, sent_id, mask):
+        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
+        x = self.fc1(cls_hs)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        x = self.softmax(x)
+        return x

preprocess_data.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from load_data import load_dataset_
+from bs4 import BeautifulSoup as bs4
+import re
+def remove_html(text):
+    if text is None:
+      return None
+    if "<" not in text and ">" not in text:
+        return text
+    # Otherwise, parse and clean the HTML
+    soup = bs4(text, "html.parser")
+    return soup.get_text()
+def remove_links(text):
+    if text is None:
+      return None
+    pattern = r'https?://\S+|www\.\S+'
+    clean_text = re.sub(pattern, '', text).lower().strip()
+    return clean_text
+def preprocess(dataset):
+   texts, labels = zip(*[
+    (remove_links(remove_html(i['text'])).lower().strip(), i['label'])
+    for i in dataset['train']
+    if i and i.get('text') and remove_links(remove_html(i['text'])).strip()
+    ])
+   return texts, labels

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+fastapi
+uvicorn
+torch
+transformers
+# Imported directly by app.py
+gdown
+gradio
+# Imported by preprocess_data.py / load_data.py, which app.py imports
+beautifulsoup4
+datasets
+# Imported by the training / evaluation scripts (main.py, evaluate.py)
+numpy
+pandas
+scikit-learn
+imbalanced-learn