nijatmammadov commited on
Commit
c54099a
·
1 Parent(s): 018e8fc

initial commit

Browse files
Files changed (8) hide show
  1. app.py +77 -0
  2. evaluate.py +37 -0
  3. load_data.py +9 -0
  4. main.py +230 -0
  5. model.pth +3 -0
  6. model.py +34 -0
  7. preprocess_data.py +30 -0
  8. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gdown
2
+ import torch
3
+ from fastapi import FastAPI
4
+ from transformers import AutoModel, BertTokenizerFast
5
+ from pydantic import BaseModel
6
+ from model import BERT_Arch
7
+ from preprocess_data import remove_html,remove_links
8
+ import gradio as gr
9
+
10
class TextRequest(BaseModel):
    """Request body for the /predict/ endpoint."""

    # Raw text to classify; may contain HTML and URLs (stripped before inference).
    text: str
12
+
13
# --- One-time startup: fetch the fine-tuned weights and build the model ---

# Download model from Google Drive.
# Source folder: https://drive.google.com/drive/folders/102UPd446eHCCENR58EC3UxnJfcYkBa8U?usp=sharing
model_url = "https://drive.google.com/uc?id=16ZWVa0d2V0T3s11Oq86rLOTA6bOR0DnR"
model_path = "model.pth"
gdown.download(model_url, model_path, quiet=False)

# Load the pre-trained BERT encoder and freeze it: only the classifier
# head in BERT_Arch carries trained parameters, so the encoder needs no
# gradients at inference time.
bert = AutoModel.from_pretrained("bert-base-uncased")
for param in bert.parameters():
    param.requires_grad = False

# Run on GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the frozen encoder with the classification head and restore the
# fine-tuned weights downloaded above.
model = BERT_Arch(bert)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # inference only: disables dropout

# Tokenizer matching the encoder checkpoint.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Initialize FastAPI
app = FastAPI()
38
+
39
@app.get("/")
def home():
    """Health-check endpoint confirming the service is up."""
    status = {"message": "Phishing Detection API is running!"}
    return status
42
+
43
@app.post("/predict/")
def predict(request: TextRequest):
    """Classify the request text as phishing or not.

    Returns {"prediction": "Phishing" | "Not Phishing"} on success, or
    {"error": <message>} on failure (kept as a 200-status body to
    preserve the existing client contract).
    """
    try:
        text = request.text.strip()
        if not text:
            # Nothing to classify: reject rather than score an empty string.
            return {"error": "Empty text provided."}

        # Mirror the cleaning applied at training time.
        text = remove_html(text)
        text = remove_links(text)

        # NOTE(review): training and evaluate.py tokenize with max_length=25,
        # but 512 is used here — confirm which length the saved weights expect.
        tokens = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512,
        )

        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            output = model(input_ids, attention_mask)

        # Model emits log-probabilities over two classes; index 1 = phishing.
        prediction = torch.argmax(output.cpu(), dim=1).item()

        return {"prediction": "Phishing" if prediction == 1 else "Not Phishing"}

    except Exception as e:
        # Best-effort error reporting: the original API returns the message
        # in the response body rather than raising an HTTP error.
        return {"error": str(e)}
70
def greet(name):
    """Return a friendly greeting for *name*."""
    return f"Hello {name}!"
72
# Minimal Gradio demo UI wired to greet().
# NOTE(review): .launch() blocks, and this module also defines a FastAPI
# app above — when run directly, only the Gradio interface is served.
# Confirm which frontend is actually intended for deployment.
gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
    allow_flagging="never"
).launch()
evaluate.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from preprocess_data import remove_html,remove_links
2
+ import torch
3
+ from model import BERT_Arch
4
+ from transformers import AutoModel,BertTokenizerFast
5
+ import numpy as np
6
+
7
+
8
+
9
+
10
def evaluate(data, device, model_path="/content/model.pth"):
    """Classify a single text with the saved BERT phishing model.

    Args:
        data: raw input text (a single string).
        device: torch.device to run inference on.
        model_path: path to the saved state dict. The default keeps the
            original Colab location; pass e.g. "model.pth" for local runs.

    Returns:
        numpy array with one element: the predicted class index
        (1 = phishing, 0 = not phishing).
    """
    # Frozen encoder: only the classification head carries trained weights.
    bert = AutoModel.from_pretrained('bert-base-uncased')
    for param in bert.parameters():
        param.requires_grad = False

    model = BERT_Arch(bert)
    map_location = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(
        torch.load(model_path, weights_only=True, map_location=map_location)
    )
    model = model.to(device)
    # Bug fix: the original never switched to eval mode, leaving dropout
    # active during inference and making predictions non-deterministic.
    model.eval()

    # Same cleaning as at training time.
    texts = [remove_links(remove_html(data))]

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    # max_length=25 matches the training-time tokenization.
    tokenized = tokenizer.batch_encode_plus(
        texts,
        max_length=25,
        padding='max_length',  # modern spelling of deprecated pad_to_max_length=True
        truncation=True,
    )
    tokenized_seq = torch.tensor(tokenized['input_ids'])
    tokenized_mask = torch.tensor(tokenized['attention_mask'])

    with torch.no_grad():
        preds = model(tokenized_seq.to(device), tokenized_mask.to(device))
        preds = preds.detach().cpu().numpy()
    return np.argmax(preds, axis=1)
load_data.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+
3
def load_dataset_():
    """Fetch the phishing-detection dataset from the Hugging Face Hub."""
    return load_dataset("huynq3Cyradar/Phishing_Detection_Dataset")
main.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from preprocess_data import preprocess
2
+ from load_data import load_dataset_
3
+ from model import BERT_Arch
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
9
+ from torch import nn
10
+ from torch.optim import AdamW
11
+ from sklearn.utils.class_weight import compute_class_weight
12
+
13
+
14
+
15
+ from sklearn.model_selection import train_test_split
16
+ from sklearn.metrics import classification_report
17
+ from imblearn.under_sampling import RandomUnderSampler
18
+
19
+
20
# Train on GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

# NOTE(review): mid-file imports; conventionally these belong at the top.
import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import AutoTokenizer

# texts, labels = preprocess()

# --- Data: load, clean, balance, split ---
dataset = load_dataset_()

texts,labels = preprocess(dataset)

df = pd.DataFrame({"texts":texts, "labels":labels})
# Drop the last 40k rows to shrink the working set.
# NOTE(review): assumes len(df) > 40000 — confirm for this dataset.
df = df.iloc[:-40000][["texts","labels"]]

# Balance classes by undersampling the majority class (fixed seed).
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(pd.DataFrame(df['texts']), pd.DataFrame(df['labels']))

# Stratified 70% train / 15% validation / 15% test split.
train_text, temp_text, train_labels, temp_labels = train_test_split(X_res,y_res,
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=y_res)


val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
                                                                stratify=temp_labels)

bert = AutoModel.from_pretrained('bert-base-uncased')

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize each split to fixed length 25 (longer texts are truncated).
# NOTE(review): pad_to_max_length is deprecated in transformers; the
# equivalent modern argument is padding='max_length'.
tokens_train = tokenizer.batch_encode_plus(
    train_text['texts'].tolist(),
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)

tokens_val = tokenizer.batch_encode_plus(
    val_text['texts'].tolist(),
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)

tokens_test = tokenizer.batch_encode_plus(
    test_text['texts'].tolist(),
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)

# Convert token ids / attention masks / labels to tensors per split.
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels['labels'].tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels['labels'].tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels['labels'].tolist())

batch_size = 32

# Training loader shuffles; validation loader keeps order.
train_data = TensorDataset(train_seq, train_mask, train_y)

train_sampler = RandomSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)

val_sampler = SequentialSampler(val_data)

val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

# Freeze the encoder: only the BERT_Arch classification head is trained.
for param in bert.parameters():
    param.requires_grad = False

model = BERT_Arch(bert)

model = model.to(device)


optimizer = AdamW(model.parameters(),lr = 1e-5)

# Class weights compensate any residual imbalance after undersampling.
# NOTE(review): np.unique(train_labels) is given the whole DataFrame;
# np.unique(train_labels['labels']) would be the clearer spelling —
# verify both produce the same class ordering.
class_weights = compute_class_weight("balanced",classes = np.unique(train_labels),y =train_labels['labels'] )

weights= torch.tensor(class_weights,dtype=torch.float)

weights = weights.to(device)

# NLLLoss pairs with the LogSoftmax output of BERT_Arch.
cross_entropy = nn.NLLLoss(weight=weights)

# NOTE(review): this value is shadowed by `epochs = 50` further below.
epochs = 10
119
+
120
def train():
    """Run one training epoch over train_dataloader.

    Uses the module-level model, optimizer, cross_entropy and device.

    Returns:
        (avg_loss, total_preds): mean batch loss and the stacked
        per-sample log-probabilities as a numpy array.
    """
    model.train()
    total_loss = 0  # unused total_accuracy removed

    total_preds = []

    for step, batch in enumerate(train_dataloader):

        # Progress report every 50 batches.
        if step % 50 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # Move the batch to the training device.
        batch = [r.to(device) for r in batch]

        sent_id, mask, labels = batch

        model.zero_grad()

        preds = model(sent_id, mask)

        # NLLLoss over log-probabilities, weighted for class imbalance.
        loss = cross_entropy(preds, labels)

        total_loss = total_loss + loss.item()

        loss.backward()

        # Clip gradients to stabilise training.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        preds = preds.detach().cpu().numpy()

        total_preds.append(preds)

    avg_loss = total_loss / len(train_dataloader)

    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds
159
+
160
def evaluate():
    """Run one validation pass over val_dataloader.

    Uses the module-level model, cross_entropy and device. Leaves the
    model in eval mode.

    Returns:
        (avg_loss, total_preds): mean batch loss and the stacked
        per-sample log-probabilities as a numpy array.
    """
    print("\nEvaluating...")

    # Disable dropout for deterministic validation.
    model.eval()

    total_loss = 0  # unused total_accuracy removed

    total_preds = []

    # enumerate() dropped: the step index was never used.
    for batch in val_dataloader:

        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # No gradients needed for validation.
        with torch.no_grad():

            preds = model(sent_id, mask)

            loss = cross_entropy(preds, labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    avg_loss = total_loss / len(val_dataloader)

    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds
194
+
195
# --- Training loop with best-checkpoint tracking ---
best_valid_loss = float('inf')

epochs = 50

train_losses=[]
valid_losses=[]

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, _ = train()

    valid_loss, _ = evaluate()

    # Checkpoint the best model (by validation loss) seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

# get predictions for test data
# (the model is in eval mode here: evaluate() was the last call to set it)
# NOTE(review): the best checkpoint saved to saved_weights.pt is never
# reloaded — the test evaluation below uses the last-epoch weights.
# NOTE(review): the whole test split is fed as one batch; verify it fits
# in device memory.
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()


# model's performance
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

# NOTE(review): this saves the last-epoch weights (not the best ones)
# as the deployable model.pth.
torch.save(model.state_dict(),'model.pth')
model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43e1830a4a951434b7b91356e84d598e0f13f6bd3060cb99e98584330160455a
3
+ size 439576894
model.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+
3
class BERT_Arch(nn.Module):
    """Binary classifier head on top of a (typically frozen) BERT encoder.

    Maps BERT's pooled [CLS] representation (768-d) through a small MLP
    to log-probabilities over two classes.
    """

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        # Attribute names are part of the saved state_dict — do not rename.
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch, 2)."""
        # return_dict=False yields (sequence_output, pooled_output);
        # only the pooled [CLS] vector feeds the classifier head.
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        # fc1 -> ReLU -> dropout -> fc2 -> log-softmax as one pipeline.
        return self.softmax(self.fc2(self.dropout(self.relu(self.fc1(cls_hs)))))
preprocess_data.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from load_data import load_dataset_
2
+ from bs4 import BeautifulSoup as bs4
3
+ import re
4
+
5
def remove_html(text):
    """Strip HTML markup from *text*; None passes through unchanged."""
    if text is None:
        return None

    # Fast path: no angle brackets means there is nothing to parse.
    has_markup = "<" in text or ">" in text
    if not has_markup:
        return text

    # Parse the HTML and keep only the visible text.
    return bs4(text, "html.parser").get_text()
14
+
15
def remove_links(text):
    """Remove http(s)/www URLs from *text*, then lowercase and trim.

    None passes through unchanged.
    """
    if text is None:
        return None
    url_pattern = r'https?://\S+|www\.\S+'
    stripped = re.sub(url_pattern, '', text)
    return stripped.lower().strip()
22
+
23
def preprocess(dataset):
    """Clean the 'train' split into parallel (texts, labels) tuples.

    Each record's text has HTML and URLs removed (remove_links also
    lowercases and trims); records with missing or empty cleaned text
    are dropped.

    Args:
        dataset: mapping with a 'train' iterable of {'text', 'label'} records.

    Returns:
        (texts, labels): two parallel tuples.
    """
    texts = []
    labels = []
    for record in dataset['train']:
        if not record or not record.get('text'):
            continue
        # Clean once per record; the original ran the full cleaning chain
        # twice (once in the filter, once in the map).
        cleaned = remove_links(remove_html(record['text']))
        if not cleaned.strip():
            continue
        texts.append(cleaned)
        labels.append(record['label'])
    # Tuples match the original zip(*) output; unlike zip(*[]), an empty
    # result yields ((), ()) instead of raising ValueError.
    return tuple(texts), tuple(labels)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
fastapi
uvicorn
torch
transformers
gdown
gradio
pydantic
beautifulsoup4
datasets