RYU-KASH committed on
Commit
fa16300
·
verified ·
1 Parent(s): b277089

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +728 -0
app.py ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np # linear algebra
3
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
4
+
5
+ import matplotlib.pyplot as plt
6
+ #%matplotlib inline
7
+ import seaborn as sns
8
+ from sklearn import metrics
9
+ import warnings
10
+ warnings.filterwarnings('ignore')
11
+
12
# Load the labelled dataset. "class" is the target column and "Index" is
# excluded from the model inputs below.
data = pd.read_csv('phishing.csv')

X = data.drop(["class", "Index"], axis=1)
y = data["class"]

# Pairwise correlation of every column; computed once and reused for both the
# heatmap and the feature ranking.
corr = data.corr()

fig, ax = plt.subplots(1, 1, figsize=(15, 9))
sns.heatmap(corr, annot=True, cmap='viridis')
plt.title('Correlation between different features', fontsize = 15, c='black')
plt.show()

# Rank columns by the absolute strength of their correlation with the target.
corr['class'] = abs(corr['class'])
incCorr = corr.sort_values(by='class', ascending=False)

# Drop the target and "Index" by label rather than by position: neither is a
# column of X, so letting either into the selection would make
# X[tenfeatures] / X[twenfeatures] raise a KeyError.
ranked_features = incCorr.index.drop(['class', 'Index'], errors='ignore')
tenfeatures = ranked_features[:10]
twenfeatures = ranked_features[:20]
39
+
40
# Structures that accumulate one entry per evaluated model; the four lists
# stay index-aligned because storeResults() appends to all of them together.
ML_Model = []
accuracy = []
f1_score = []
precision = []


def storeResults(model, a, b, c):
    """Record one model's scores in the module-level result lists.

    model: label appended to ML_Model.
    a, b, c: accuracy, F1 and precision scores; each is rounded to three
        decimals before being appended to its list.
    """
    ML_Model.append(model)
    for bucket, score in ((accuracy, a), (f1_score, b), (precision, c)):
        bucket.append(round(score, 3))
51
+
52
def KNN(X):
    """Print and plot k-NN train/test accuracy for k = 1, 3, 5, 7, 9.

    X is split 80/20 (random_state=42) against the module-level target `y`.
    For each k a KNeighborsClassifier is fitted on the training part and its
    accuracy on both parts is printed; the two accuracy curves are then
    plotted against k.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    import matplotlib.pyplot as plt

    neighbour_counts = list(range(1, 10, 2))
    train_scores = []
    test_scores = []

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    for k in neighbour_counts:
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)

        train_acc = metrics.accuracy_score(y_train, model.predict(X_train))
        test_acc = metrics.accuracy_score(y_test, model.predict(X_test))

        print("K-Nearest Neighbors with k={}: Accuracy on training Data: {:.3f}".format(k, train_acc))
        print("K-Nearest Neighbors with k={}: Accuracy on test Data: {:.3f}".format(k, test_acc))
        train_scores.append(train_acc)
        test_scores.append(test_acc)
        print()

    plt.plot(neighbour_counts, train_scores, label="Train accuracy")
    plt.plot(neighbour_counts, test_scores, label="Test accuracy")
    plt.legend()
    plt.show()
77
+
78
# Three candidate feature sets: every column, and the 10 / 20 columns ranked
# highest by absolute correlation with the target.
Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]

KNN(Xmain)

KNN(Xten)

KNN(Xtwen)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Final k-NN model: all features, k=5, same 80/20 split as the sweeps above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_train_knn = knn.predict(X_train)
y_test_knn = knn.predict(X_test)

acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)

f1_score_train_knn = metrics.f1_score(y_train, y_train_knn)
f1_score_test_knn = metrics.f1_score(y_test, y_test_knn)

precision_score_train_knn = metrics.precision_score(y_train, y_train_knn)
precision_score_test_knn = metrics.precision_score(y_test, y_test_knn)

# Store held-out (test) scores only. The training precision was previously
# recorded next to test accuracy and test F1, which made the comparison
# table mix two different data splits.
storeResults('K-Nearest Neighbors', acc_test_knn, f1_score_test_knn, precision_score_test_knn)
109
+
110
def SVM(X, y):
    """Print and plot linear-SVM train/test accuracy for C = 1, 3, 5, 7, 9.

    X and y are split 80/20 (random_state=42). For each C a linear-kernel SVC
    is fitted on the training part and its accuracy on both parts is printed;
    the two accuracy curves are then plotted against C.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt

    penalties = list(range(1, 10, 2))
    train_scores = []
    test_scores = []

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    for penalty in penalties:
        model = SVC(kernel='linear', C=penalty)
        model.fit(X_train, y_train)

        train_acc = metrics.accuracy_score(y_train, model.predict(X_train))
        test_acc = metrics.accuracy_score(y_test, model.predict(X_test))

        print("SVM with C={}: Accuracy on training Data: {:.3f}".format(penalty, train_acc))
        print("SVM with C={}: Accuracy on test Data: {:.3f}".format(penalty, test_acc))
        train_scores.append(train_acc)
        test_scores.append(test_acc)
        print()

    plt.plot(penalties, train_scores, label="Train accuracy")
    plt.plot(penalties, test_scores, label="Test accuracy")
    plt.legend()
    plt.show()
135
+
136
+
137
# Same three candidate feature sets as used for the k-NN sweeps.
Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]

SVM(Xmain, y)
SVM(Xten, y)
SVM(Xtwen, y)

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

# Final SVM model: all features, linear kernel, C=1, 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVC(kernel='linear', C=1, random_state=42)
svm.fit(X_train, y_train)

y_train_svm = svm.predict(X_train)
y_test_svm = svm.predict(X_test)

acc_train_svm = metrics.accuracy_score(y_train, y_train_svm)
acc_test_svm = metrics.accuracy_score(y_test, y_test_svm)

f1_score_train_svm = metrics.f1_score(y_train, y_train_svm)
f1_score_test_svm = metrics.f1_score(y_test, y_test_svm)

precision_score_train_svm = metrics.precision_score(y_train, y_train_svm)
precision_score_test_svm = metrics.precision_score(y_test, y_test_svm)

print("SVM with C={}: Accuracy on training data: {:.3f}".format(1, acc_train_svm))
print("SVM with C={}: Accuracy on test data: {:.3f}".format(1, acc_test_svm))
print("SVM with C={}: F1 score on training data: {:.3f}".format(1, f1_score_train_svm))
print("SVM with C={}: F1 score on test data: {:.3f}".format(1, f1_score_test_svm))
print("SVM with C={}: Precision on training data: {:.3f}".format(1, precision_score_train_svm))
print("SVM with C={}: Precision on test data: {:.3f}".format(1, precision_score_test_svm))

# Store held-out (test) scores only. The training precision was previously
# recorded next to test accuracy and test F1, which made the comparison
# table mix two different data splits.
storeResults('Support Vector Machines', acc_test_svm, f1_score_test_svm, precision_score_test_svm)
177
+
178
from sklearn.model_selection import train_test_split

# Gradient boosting model: all features, 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7)
gbc.fit(X_train, y_train)

y_train_gbc = gbc.predict(X_train)
y_test_gbc = gbc.predict(X_test)

acc_train_gbc = metrics.accuracy_score(y_train, y_train_gbc)
acc_test_gbc = metrics.accuracy_score(y_test, y_test_gbc)
print("Gradient Boosting Classifier : Accuracy on training Data: {:.3f}".format(acc_train_gbc))
print("Gradient Boosting Classifier : Accuracy on test Data: {:.3f}".format(acc_test_gbc))
print()

f1_score_train_gbc = metrics.f1_score(y_train, y_train_gbc)
f1_score_test_gbc = metrics.f1_score(y_test, y_test_gbc)

precision_score_train_gbc = metrics.precision_score(y_train, y_train_gbc)
precision_score_test_gbc = metrics.precision_score(y_test, y_test_gbc)

# Store held-out (test) scores only. The training precision was previously
# recorded next to test accuracy and test F1, which made the comparison
# table mix two different data splits.
storeResults('Gradient Boosting Classifier', acc_test_gbc, f1_score_test_gbc, precision_score_test_gbc)
202
+
203
# Gather the recorded scores into one table, one row per model.
score_columns = {
    'Modelname': ML_Model,
    'Accuracy Score': accuracy,
    'F1 Score': f1_score,
    'Precision Score': precision,
}
df = pd.DataFrame(score_columns)
df.set_index('Modelname', inplace=True)

# Grouped bar chart of the scores for each model. The y-axis is restricted to
# 0.9-1.0 with a tick every 0.01.
fig, ax = plt.subplots(figsize=(10,10))
df.plot(kind='bar', ax=ax)
ax.set_xticklabels(df.index, rotation=0)
ax.set_ylim([0.9, 1])
ax.set_yticks([0.9,0.91,0.92,0.93,0.94,0.95,0.96,0.97,0.98,0.99,1])
ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.set_title('Model Scores')
plt.show()
222
+
223
+ import whois
224
+
225
+ import googlesearch
226
+
227
+ import ipaddress
228
+ import re
229
+ import urllib.request
230
+ from bs4 import BeautifulSoup
231
+ import socket
232
+ import requests
233
+ import google
234
+ import whois
235
+ from datetime import date, datetime
236
+ import time
237
+ from dateutil.parser import parse as date_parse
238
+ from urllib.parse import urlparse
239
+
240
class FeatureExtraction:
    """Compute a 30-element feature vector for a single URL.

    Constructing an instance fetches the page, parses the URL and performs a
    WHOIS lookup (each step is best-effort), then runs the 30 extractor
    methods in a fixed order and stores their results in `self.features`.
    Every extractor returns 1, 0 or -1 and never raises: when the data it
    needs is unavailable it returns its documented fallback value.

    NOTE(review): the order of the extractors presumably has to match the
    column order of the training data - confirm before reordering.
    """

    # Class-level attribute kept for backward compatibility; every instance
    # replaces it with its own list in __init__.
    features = []

    # Seconds to wait for any outbound HTTP request, so that an unresponsive
    # host cannot block feature extraction forever.
    REQUEST_TIMEOUT = 10

    # Known URL-shortening services (checked by shortUrl).
    _SHORTENER_PATTERN = (
        r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
        r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
        r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
        r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
        r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
        r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
        r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net'
    )

    # Host-name fragments and IP addresses checked by StatsReport.
    _REPORTED_URL_PATTERN = (
        r'at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|sweddy\.com|myjino\.ru|96\.lt|ow\.ly'
    )
    _REPORTED_IP_PATTERN = (
        r'146\.112\.61\.108|213\.174\.157\.151|121\.50\.168\.88|192\.185\.217\.116|78\.46\.211\.158|181\.174\.165\.13|46\.242\.145\.103|121\.50\.168\.40|83\.125\.22\.219|46\.242\.145\.98|'
        r'107\.151\.148\.44|107\.151\.148\.107|64\.70\.19\.203|199\.184\.144\.27|107\.151\.148\.108|107\.151\.148\.109|119\.28\.52\.61|54\.83\.43\.69|52\.69\.166\.231|216\.58\.192\.225|'
        r'118\.184\.25\.86|67\.208\.74\.71|23\.253\.126\.58|104\.239\.157\.210|175\.126\.123\.219|141\.8\.224\.221|10\.10\.10\.10|43\.229\.108\.32|103\.232\.215\.140|69\.172\.201\.153|'
        r'216\.218\.185\.162|54\.225\.104\.146|103\.243\.24\.98|199\.59\.243\.120|31\.170\.160\.61|213\.19\.128\.77|62\.113\.226\.131|208\.100\.26\.234|195\.16\.127\.102|195\.16\.127\.157|'
        r'34\.196\.13\.28|103\.224\.212\.222|172\.217\.4\.225|54\.72\.9\.51|192\.64\.147\.141|198\.200\.56\.183|23\.253\.164\.103|52\.48\.191\.26|52\.214\.197\.72|87\.98\.255\.18|209\.99\.17\.27|'
        r'216\.38\.62\.18|104\.130\.124\.96|47\.89\.58\.141|78\.46\.211\.158|54\.86\.225\.156|54\.82\.156\.19|37\.157\.192\.102|204\.11\.56\.48|110\.34\.231\.42'
    )

    def __init__(self, url):
        """Fetch everything the extractors need for `url`, then run them all."""
        self.features = []
        self.url = url
        self.domain = ""
        self.whois_response = ""
        self.urlparse = ""
        self.response = ""
        self.soup = ""

        # Each lookup is best-effort: on failure the attribute keeps its
        # empty-string placeholder and the dependent extractors fall back.
        try:
            self.response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            # Fixed: previously referenced an undefined name `response`, so
            # the page was never parsed and self.soup stayed "".
            self.soup = BeautifulSoup(self.response.text, 'html.parser')
        except Exception:
            pass

        try:
            self.urlparse = urlparse(url)
            self.domain = self.urlparse.netloc
        except Exception:
            pass

        try:
            self.whois_response = whois.whois(self.domain)
        except Exception:
            pass

        extractors = (
            self.UsingIp,
            self.longUrl,
            self.shortUrl,
            self.symbol,
            self.redirecting,
            self.prefixSuffix,
            self.SubDomains,
            self.Hppts,
            self.DomainRegLen,
            self.Favicon,
            self.NonStdPort,
            self.HTTPSDomainURL,
            self.RequestURL,
            self.AnchorURL,
            self.LinksInScriptTags,
            self.ServerFormHandler,
            self.InfoEmail,
            self.AbnormalURL,
            self.WebsiteForwarding,
            self.StatusBarCust,
            self.DisableRightClick,
            self.UsingPopupWindow,
            self.IframeRedirection,
            self.AgeofDomain,
            self.DNSRecording,
            self.WebsiteTraffic,
            self.PageRank,
            self.GoogleIndex,
            self.LinksPointingToPage,
            self.StatsReport,
        )
        for extract in extractors:
            self.features.append(extract())

    # ----------------------------------------------------------- helpers

    def _hostname(self):
        """Return the bare host of the URL (no scheme, port or path).

        Falls back to the netloc and then to the raw URL when the URL could
        not be parsed or has no scheme.
        """
        host = getattr(self.urlparse, "hostname", None)
        return host or self.domain or self.url

    def _is_internal(self, link):
        """Return True when `link` contains the page URL or domain, or has exactly one dot."""
        return self.url in link or self.domain in link or link.count('.') == 1

    @staticmethod
    def _first(value):
        """Return the first element of a non-empty list/tuple, else `value` itself.

        The WHOIS date fields are handled as either a single value or a
        sequence of values.
        """
        if isinstance(value, (list, tuple)) and value:
            return value[0]
        return value

    def _domain_age_score(self):
        """Return 1 when the WHOIS creation date is at least 6 months old, else -1."""
        try:
            created = self._first(self.whois_response.creation_date)
            today = date.today()
            age = (today.year - created.year) * 12 + (today.month - created.month)
            return 1 if age >= 6 else -1
        except Exception:
            return -1

    # -------------------------------------------------------- extractors

    # 1.UsingIp
    def UsingIp(self):
        """Return -1 when the host part of the URL is a literal IP address, else 1."""
        try:
            # Fixed: the whole URL (scheme, path, ...) was passed to
            # ip_address(), which can never parse once a scheme is present.
            ipaddress.ip_address(self._hostname())
            return -1
        except ValueError:
            return 1

    # 2.longUrl
    def longUrl(self):
        """Return 1 for URLs shorter than 54 chars, 0 for 54-75, -1 for longer."""
        length = len(self.url)
        if length < 54:
            return 1
        if length <= 75:
            return 0
        return -1

    # 3.shortUrl
    def shortUrl(self):
        """Return -1 when the URL matches a known shortening service, else 1."""
        if re.search(self._SHORTENER_PATTERN, self.url):
            return -1
        return 1

    # 4.Symbol@
    def symbol(self):
        """Return -1 when the URL contains '@', else 1."""
        return -1 if "@" in self.url else 1

    # 5.Redirecting//
    def redirecting(self):
        """Return -1 when '//' occurs after index 6 of the URL, else 1."""
        return -1 if self.url.rfind('//') > 6 else 1

    # 6.prefixSuffix
    def prefixSuffix(self):
        """Return -1 when the domain contains '-', else 1 (-1 on error)."""
        try:
            return -1 if '-' in self.domain else 1
        except Exception:
            return -1

    # 7.SubDomains
    def SubDomains(self):
        """Return 1 / 0 / -1 for one / two / any other number of dots in the URL."""
        dot_count = self.url.count(".")
        if dot_count == 1:
            return 1
        if dot_count == 2:
            return 0
        return -1

    # 8.HTTPS
    def Hppts(self):
        """Return 1 when the URL scheme contains 'https', else -1 (1 if unparsed)."""
        try:
            return 1 if 'https' in self.urlparse.scheme else -1
        except Exception:
            return 1

    # 9.DomainRegLen
    def DomainRegLen(self):
        """Return 1 when WHOIS shows at least 12 months from creation to expiry, else -1."""
        try:
            expiration_date = self._first(self.whois_response.expiration_date)
            creation_date = self._first(self.whois_response.creation_date)
            age = ((expiration_date.year - creation_date.year) * 12
                   + (expiration_date.month - creation_date.month))
            return 1 if age >= 12 else -1
        except Exception:
            return -1

    # 10. Favicon
    def Favicon(self):
        """Return 1 when the page has a <head> and some <link href> is internal, else -1."""
        try:
            if not self.soup.find_all('head'):
                return -1
            for link in self.soup.find_all('link', href=True):
                # Fixed: previously tested an undefined name `domain`.
                if self._is_internal(link['href']):
                    return 1
            return -1
        except Exception:
            return -1

    # 11. NonStdPort
    def NonStdPort(self):
        """Return -1 when the netloc carries an explicit ':port', else 1."""
        try:
            return -1 if len(self.domain.split(":")) > 1 else 1
        except Exception:
            return -1

    # 12. HTTPSDomainURL
    def HTTPSDomainURL(self):
        """Return -1 when the text 'https' appears inside the domain, else 1."""
        try:
            return -1 if 'https' in self.domain else 1
        except Exception:
            return -1

    # 13. RequestURL
    def RequestURL(self):
        """Score the share of img/audio/embed/iframe sources that are internal.

        Returns 1 below 22%, 0 from 22% up to 61%, -1 otherwise; 0 when the
        page has no such tags and -1 when the page could not be parsed.
        """
        try:
            # Fixed: the counters were never initialised, so this method
            # could not produce a percentage-based result.
            i, success = 0, 0
            for tag_name in ('img', 'audio', 'embed', 'iframe'):
                for tag in self.soup.find_all(tag_name, src=True):
                    if self._is_internal(tag['src']):
                        success = success + 1
                    i = i + 1

            try:
                percentage = success / float(i) * 100
            except ZeroDivisionError:
                return 0
            if percentage < 22.0:
                return 1
            if percentage < 61.0:
                return 0
            return -1
        except Exception:
            return -1

    # 14. AnchorURL
    def AnchorURL(self):
        """Score the share of <a href> targets treated as unsafe.

        An anchor counts as unsafe when it contains '#', 'javascript' or
        'mailto', or mentions neither the page URL nor its domain. Returns 1
        below 31%, 0 from 31% up to 67%, -1 otherwise (also -1 when there are
        no anchors or the page could not be parsed).
        """
        try:
            i, unsafe = 0, 0
            for a in self.soup.find_all('a', href=True):
                href = a['href']
                lowered = href.lower()
                # Fixed: previously read a global `url` instead of self.url.
                is_external = not (self.url in href or self.domain in href)
                if "#" in href or "javascript" in lowered or "mailto" in lowered or is_external:
                    unsafe = unsafe + 1
                i = i + 1

            try:
                percentage = unsafe / float(i) * 100
            except ZeroDivisionError:
                return -1
            if percentage < 31.0:
                return 1
            if percentage < 67.0:
                return 0
            return -1
        except Exception:
            return -1

    # 15. LinksInScriptTags
    def LinksInScriptTags(self):
        """Score the share of <link href> / <script src> targets that are internal.

        Returns 1 below 17%, 0 from 17% up to 81%, -1 otherwise; 0 when the
        page has no such tags and -1 when the page could not be parsed.
        """
        try:
            i, success = 0, 0
            for tag_name, attribute in (('link', 'href'), ('script', 'src')):
                for tag in self.soup.find_all(tag_name, **{attribute: True}):
                    if self._is_internal(tag[attribute]):
                        success = success + 1
                    i = i + 1

            try:
                percentage = success / float(i) * 100
            except ZeroDivisionError:
                return 0
            if percentage < 17.0:
                return 1
            if percentage < 81.0:
                return 0
            return -1
        except Exception:
            return -1

    # 16. ServerFormHandler
    def ServerFormHandler(self):
        """Score the action of the first <form action> on the page.

        Returns 1 when there is no such form or its action is internal, -1
        for an empty or 'about:blank' action, 0 for an external action.
        Only the first form is examined, as in the original loop, which
        returned on its first iteration.
        """
        try:
            forms = self.soup.find_all('form', action=True)
            if not forms:
                return 1
            action = forms[0]['action']
            if action == "" or action == "about:blank":
                return -1
            if self.url not in action and self.domain not in action:
                return 0
            return 1
        except Exception:
            return -1

    # 17. InfoEmail
    def InfoEmail(self):
        """Return -1 when the page source uses mail() or a mailto: link, else 1."""
        try:
            # Fixed: read a misspelled attribute (`self.soap`) and used a
            # character class where an alternation was intended.
            if re.findall(r"mail\(\)|mailto:", self.response.text):
                return -1
            return 1
        except Exception:
            return -1

    # 18. AbnormalURL
    def AbnormalURL(self):
        """Return 1 when the page text equals the WHOIS response, else -1.

        NOTE(review): this compares the page body with the WHOIS result, so
        it looks like it can only ever return -1; the intended check is not
        recoverable from this file, so the behaviour is left unchanged.
        """
        try:
            if self.response.text == self.whois_response:
                return 1
            return -1
        except Exception:
            return -1

    # 19. WebsiteForwarding
    def WebsiteForwarding(self):
        """Return 1 for at most one redirect, 0 for up to four, -1 for more."""
        try:
            hops = len(self.response.history)
            if hops <= 1:
                return 1
            if hops <= 4:
                return 0
            return -1
        except Exception:
            return -1

    # 20. StatusBarCust
    def StatusBarCust(self):
        """Return 1 when a <script> block on one line mentions onmouseover, else -1."""
        try:
            if re.findall("<script>.+onmouseover.+</script>", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 21. DisableRightClick
    def DisableRightClick(self):
        """Return 1 when the page source tests `event.button == 2`, else -1."""
        try:
            if re.findall(r"event.button ?== ?2", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 22. UsingPopupWindow
    def UsingPopupWindow(self):
        """Return 1 when the page source calls alert(, else -1."""
        try:
            if re.findall(r"alert\(", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 23. IframeRedirection
    def IframeRedirection(self):
        """Return 1 when the page source matches the iframe pattern, else -1.

        NOTE(review): the pattern is a character class, so it matches any
        single one of the characters inside it rather than the literal tags.
        Left unchanged because correcting it would flip the value for most
        pages and the intended polarity cannot be confirmed from this file.
        """
        try:
            if re.findall(r"[<iframe>|<frameBorder>]", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 24. AgeofDomain
    def AgeofDomain(self):
        """Return 1 when the domain was created at least 6 months ago, else -1."""
        return self._domain_age_score()

    # 25. DNSRecording
    def DNSRecording(self):
        """Return the same creation-date age check as AgeofDomain."""
        return self._domain_age_score()

    # 26. WebsiteTraffic
    def WebsiteTraffic(self):
        """Return 1 for an Alexa rank below 100000, 0 otherwise, -1 on any failure."""
        try:
            # Fixed: previously read a global `url` instead of self.url.
            endpoint = "http://data.alexa.com/data?cli=10&dat=s&url=" + self.url
            with urllib.request.urlopen(endpoint, timeout=self.REQUEST_TIMEOUT) as reply:
                payload = reply.read()
            rank = BeautifulSoup(payload, "xml").find("REACH")['RANK']
            if int(rank) < 100000:
                return 1
            return 0
        except Exception:
            return -1

    # 27. PageRank
    def PageRank(self):
        """Return 1 when checkpagerank.net reports a global rank in (0, 100000), else -1."""
        try:
            # Fixed: the response was stored under one name and read under
            # another, so this method always returned -1.
            rank_checker_response = requests.post(
                "https://www.checkpagerank.net/index.php",
                {"name": self.domain},
                timeout=self.REQUEST_TIMEOUT,
            )
            global_rank = int(re.findall(r"Global Rank: ([0-9]+)", rank_checker_response.text)[0])
            if 0 < global_rank < 100000:
                return 1
            return -1
        except Exception:
            return -1

    # 28. GoogleIndex
    def GoogleIndex(self):
        """Return 1 when a search for the URL yields a truthy result, else -1 (1 on error).

        NOTE(review): no bare `search` name is imported in the visible part
        of this file (only the `googlesearch` module is), so this call
        presumably fails and takes the fallback branch - confirm the
        intended API before changing it.
        """
        try:
            site = search(self.url, 5)
            if site:
                return 1
            return -1
        except Exception:
            return 1

    # 29. LinksPointingToPage
    def LinksPointingToPage(self):
        """Return 1 / 0 / -1 for zero / one-or-two / more `<a href=` occurrences in the page."""
        try:
            number_of_links = len(re.findall(r"<a href=", self.response.text))
            if number_of_links == 0:
                return 1
            if number_of_links <= 2:
                return 0
            return -1
        except Exception:
            return -1

    # 30. StatsReport
    def StatsReport(self):
        """Return -1 when the URL or its resolved IP is on the built-in lists, else 1."""
        try:
            # Fixed: previously read a global `url`, and a DNS failure
            # discarded a URL match that had already been found.
            if re.search(self._REPORTED_URL_PATTERN, self.url):
                return -1
            ip_address = socket.gethostbyname(self._hostname())
            if re.search(self._REPORTED_IP_PATTERN, ip_address):
                return -1
            return 1
        except Exception:
            return 1

    def getFeaturesList(self):
        """Return the list of feature values in extraction order."""
        return self.features
716
+
717
# Refit the gradient boosting model on the training split used above.
gbc = GradientBoostingClassifier(max_depth=4,learning_rate=0.7)
gbc.fit(X_train,y_train)

# Any URL can be provided here; the original example was taken from PhishTank.
url=input("Enter the Url:")
obj = FeatureExtraction(url)

# The classifier expects a single row holding the 30 extracted features.
x = np.array(obj.getFeaturesList()).reshape(1,30)
y_pred =gbc.predict(x)[0]

# Predicted class 1 is reported as safe; anything else as suspicious.
verdict = "We guess it is a safe website" if y_pred==1 else "Caution! Suspicious website detected"
print(verdict)