quantum-drive committed on
Commit
e5d904c
·
verified ·
1 Parent(s): e4a0cc7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +302 -0
app.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import joblib
3
+ import pandas as pd
4
+ import re
5
+ import string
6
+ import socket
7
+ import ssl
8
+ import whois
9
+ import dns.resolver
10
+ from urllib.parse import urlparse
11
+ from datetime import datetime
12
+
13
+ # -------------------------------
14
+ # Load Trained Models
15
+ # -------------------------------
16
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only ship model artifacts that come from a trusted source.
PHISHING_MODEL_PATH = "phishing_stack.pkl"
MALWARE_MODEL_PATH = "new_malware_stack.pkl"

# Both models are loaded once at import time and shared by every request.
phishing_model = joblib.load(PHISHING_MODEL_PATH)
malware_model = joblib.load(MALWARE_MODEL_PATH)
18
+
19
+ # -------------------------------
20
+ # Enhanced Feature Extraction
21
+ # -------------------------------
22
def extract_phishing_features(url):
    """Build the lexical feature dict consumed by the phishing model.

    Only the URL text is inspected; no network access is performed.

    Args:
        url: The URL to analyse, ideally including its scheme.

    Returns:
        A dict with the keys ``url_length``, ``hostname_length``, ``num_dots``,
        ``num_hyphens``, ``num_digits``, ``num_special_chars``,
        ``has_ip_address``, ``has_https``, ``has_suspicious_words``,
        ``is_shortened`` (all ints) and ``tld`` (str).

    NOTE(review): ``is_shortened`` and ``has_ip_address`` are now computed from
    the parsed host instead of by substring/prefix matching. If the model was
    trained with the looser definitions, re-validate (or retrain) it so that
    training and serving features agree.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    tld = hostname.split('.')[-1] if '.' in hostname else ""
    lowered = url.lower()

    # Host used for shortener detection. Scheme-less input ("bit.ly/x") has no
    # parsed hostname, so fall back to the text before the first slash.
    match_host = hostname or lowered.split('/', 1)[0]

    # A plain substring test for "t.co" also fires on "microsoft.com",
    # "reddit.com", ...; compare whole host names (or their subdomains) instead.
    shortener_domains = ("bit.ly", "tinyurl.com", "goo.gl", "t.co", "ow.ly", "is.gd")
    is_shortened = any(
        match_host == domain or match_host.endswith("." + domain)
        for domain in shortener_domains
    )

    suspicious_words = ("login", "secure", "update", "verify", "account", "banking", "paypal")

    return {
        "url_length": len(url),
        "hostname_length": len(hostname),
        "num_dots": url.count('.'),
        "num_hyphens": url.count('-'),
        "num_digits": sum(char.isdigit() for char in url),
        # Every non-word character except the path separator.
        "num_special_chars": len(re.findall(r"[^\w]", url)) - url.count('/'),
        # fullmatch: "1.2.3.4.example.com" is a domain name, not an IP literal.
        "has_ip_address": 1 if re.fullmatch(r"\d+\.\d+\.\d+\.\d+", hostname) else 0,
        "has_https": 1 if parsed.scheme == "https" else 0,
        "has_suspicious_words": 1 if any(word in lowered for word in suspicious_words) else 0,
        "is_shortened": 1 if is_shortened else 0,
        "tld": tld,
    }
42
+
43
def _whois_day_counts(url):
    """Return ``(domain_age_days, domain_expiry_days)``; -1 marks a missing value."""

    def first_datetime(value):
        # The WHOIS client may hand back one datetime or a list of them.
        if isinstance(value, (list, tuple)):
            value = value[0] if value else None
        return value if isinstance(value, datetime) else None

    try:
        record = whois.whois(url)
        created = first_datetime(record.creation_date)
        expires = first_datetime(record.expiration_date)
    except Exception:
        # Best-effort lookup: any WHOIS/network failure means "unknown".
        return -1, -1

    # Use the same timezone-awareness as the parsed date so naive and aware
    # datetimes are never mixed in the subtraction.
    age = (datetime.now(created.tzinfo) - created).days if created else -1
    remaining = (expires - datetime.now(expires.tzinfo)).days if expires else -1
    return age, remaining


def _dns_a_ttl(hostname):
    """Return the TTL of the host's A record set, or -1 when it cannot be resolved."""
    if not hostname:
        return -1
    try:
        return dns.resolver.resolve(hostname, 'A').rrset.ttl
    except Exception:
        # Best-effort lookup: NXDOMAIN, timeouts, etc. all map to "unknown".
        return -1


def _ssl_certificate_info(hostname, timeout=3):
    """Return ``(issuer_organization, is_unexpired)`` for the host's port-443 certificate."""
    try:
        context = ssl.create_default_context()
        with socket.create_connection((hostname, 443), timeout=timeout) as raw_sock:
            with context.wrap_socket(raw_sock, server_hostname=hostname) as tls_sock:
                cert = tls_sock.getpeercert()
    except (OSError, ValueError):
        # OSError covers socket, timeout and ssl.SSLError failures; ValueError
        # covers host names that cannot be encoded.
        return "Unknown", False

    # The issuer is a sequence of RDNs, each a sequence of (name, value) pairs.
    issuer_fields = {name: value for rdn in cert.get('issuer', ()) for name, value in rdn}
    issuer = issuer_fields.get('organizationName') or "Unknown"

    not_after = cert.get('notAfter')
    # cert_time_to_seconds interprets the GMT timestamp correctly; compare it
    # with the current epoch time rather than with naive local time.
    unexpired = bool(not_after) and ssl.cert_time_to_seconds(not_after) > datetime.now().timestamp()
    return issuer, unexpired


def extract_malware_features(url):
    """Build the feature dict consumed by the malware model.

    Combines lexical URL features with best-effort WHOIS, DNS and TLS lookups.
    A lookup that fails leaves its features at their defaults (-1, "Unknown",
    0) instead of raising.

    NOTE(review): the host comes from user input and this function opens
    outbound WHOIS/DNS/TLS connections to it; restrict egress if the app runs
    inside a network with private services.

    NOTE(review): ``is_shortened`` and ``has_ip_address`` are now computed from
    the parsed host instead of by substring/prefix matching. If the model was
    trained with the looser definitions, re-validate (or retrain) it.

    Args:
        url: The URL to analyse, ideally including its scheme.

    Returns:
        A dict with the keys ``url_length``, ``hostname_length``, ``num_dots``,
        ``num_hyphens``, ``num_digits``, ``num_special_chars``,
        ``has_suspicious_keyword``, ``has_ip_address``, ``is_https``,
        ``is_shortened``, ``tld``, ``domain_age_days``, ``domain_expiry_days``,
        ``dns_ttl``, ``ssl_issuer`` and ``ssl_valid``.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    lowered = url.lower()
    is_https = parsed.scheme == 'https'

    # Lexical features.
    counted_punctuation = set(string.punctuation) - {'/'}
    num_specials = sum(1 for c in url if c in counted_punctuation)
    suspicious_keywords = ('login', 'secure', 'verify', 'update', 'download', 'install', 'free')
    has_suspicious_keyword = any(k in lowered for k in suspicious_keywords)
    # fullmatch on the host: "1.2.3.4.example.com" is a domain, not an IP literal.
    has_ip = bool(re.fullmatch(r'(\d{1,3}\.){3}\d{1,3}', hostname))

    # A substring test for "t.co" also fires on "microsoft.com"; compare whole
    # host names (or their subdomains). Scheme-less input has no parsed
    # hostname, so fall back to the text before the first slash.
    match_host = hostname or lowered.split('/', 1)[0]
    shortener_domains = ('bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'shorte.st')
    is_shortened = any(
        match_host == domain or match_host.endswith('.' + domain)
        for domain in shortener_domains
    )
    tld = hostname.split('.')[-1] if '.' in hostname else ''

    # Network-derived features (each helper is best-effort).
    domain_age, domain_expiry = _whois_day_counts(url)
    ttl = _dns_a_ttl(hostname)
    ssl_issuer, ssl_valid = "Unknown", False
    if is_https and hostname:
        ssl_issuer, ssl_valid = _ssl_certificate_info(hostname)

    return {
        "url_length": len(url),
        "hostname_length": len(hostname),
        "num_dots": url.count('.'),
        "num_hyphens": url.count('-'),
        "num_digits": len(re.findall(r'\d', url)),
        "num_special_chars": num_specials,
        "has_suspicious_keyword": int(has_suspicious_keyword),
        "has_ip_address": int(has_ip),
        "is_https": int(is_https),
        "is_shortened": int(is_shortened),
        "tld": tld,
        "domain_age_days": domain_age,
        "domain_expiry_days": domain_expiry,
        "dns_ttl": ttl,
        "ssl_issuer": ssl_issuer,
        "ssl_valid": int(ssl_valid),
    }
119
+
120
+ # -------------------------------
121
+ # Prepare Model Inputs
122
+ # -------------------------------
123
def prepare_phishing_input(url):
    """Return a one-row DataFrame laid out as the phishing model's input.

    The extracted features are one-hot encoded on ``tld`` and then reindexed
    to the model's ``feature_names_in_``; columns the model expects but this
    URL did not produce are filled with 0, and extra columns are dropped.
    """
    row = pd.DataFrame([extract_phishing_features(url)])
    encoded = pd.get_dummies(row, columns=["tld"], prefix="tld")
    return encoded.reindex(columns=phishing_model.feature_names_in_, fill_value=0)
129
+
130
def prepare_malware_input(url):
    """Return a one-row DataFrame laid out as the malware model's input.

    The extracted features are one-hot encoded on ``tld`` and ``ssl_issuer``
    and then reindexed to the model's ``feature_names_in_``; columns the model
    expects but this URL did not produce are filled with 0, and extra columns
    are dropped.
    """
    categorical = ["tld", "ssl_issuer"]
    row = pd.DataFrame([extract_malware_features(url)])
    encoded = pd.get_dummies(row, columns=categorical, prefix=categorical)
    return encoded.reindex(columns=malware_model.feature_names_in_, fill_value=0)
136
+
137
+ # -------------------------------
138
+ # PREDICTION NORMALIZATION
139
+ # -------------------------------
140
def normalize_prediction(prediction):
    """Map a raw model output onto ``'threat'``, ``'benign'`` or ``'unknown'``.

    The value is stringified, lower-cased and stripped, then matched against
    known label spellings. Numeric outputs that stringify as floats (for
    example ``1.0``) and booleans are recognised as well, so a model that
    emits floats or bools is not silently reported as ``'unknown'``.

    Args:
        prediction: A single prediction value (str, int, float, bool, or a
            NumPy scalar of one of those).

    Returns:
        ``'threat'``, ``'benign'`` or ``'unknown'``.
    """
    label = str(prediction).lower().strip()

    threat_labels = {'phishing', 'malware', '1', 'malicious', 'threat', 'bad', 'true'}
    benign_labels = {'benign', '0', 'safe', 'good', 'legitimate', 'false'}

    if label in threat_labels:
        return 'threat'
    if label in benign_labels:
        return 'benign'

    # Fall back to numeric interpretation so "1.0" / "0.0" behave like "1" / "0".
    try:
        number = float(label)
    except ValueError:
        return 'unknown'
    if number == 1:
        return 'threat'
    if number == 0:
        return 'benign'
    return 'unknown'
151
+
152
+ # -------------------------------
153
+ # IMPROVED TRUTH TABLE DECISION LOGIC
154
+ # -------------------------------
155
def _looks_like_legitimate_site(url):
    """Return True when the host ends in a common TLD and the URL has no phishing keywords."""
    hostname = urlparse(url).hostname or ""
    legitimate_tlds = ('.com', '.org', '.net', '.edu', '.gov', '.co.uk', '.ca', '.au')
    # Suffix match, not substring: "paypal.com.evil.ru" contains ".com" but is
    # not a .com site. Also require a non-empty leading label.
    if not hostname.endswith(legitimate_tlds) or not hostname.split('.')[0]:
        return False
    suspicious_words = ('login', 'signin', 'verify', 'update', 'secure', 'account', 'banking')
    lowered = url.lower()
    return not any(word in lowered for word in suspicious_words)


def analyze_url(url):
    """Run both models on ``url`` and combine their verdicts.

    Decision table (after normalising each prediction):

    * malware=threat,  phishing=threat  -> "Malicious"
    * malware=threat,  phishing=benign  -> "Malicious"
    * malware=benign,  phishing=benign  -> "Benign"
    * malware=benign,  phishing=threat  -> "Benign" if the URL looks like a
      legitimate site (common TLD suffix, no phishing keywords), otherwise
      "Phishing"
    * anything else (an "unknown" from either model) -> "Suspicious"

    Args:
        url: The URL to analyse, including its scheme.

    Returns:
        On success, a dict with ``url``, ``final_result``, ``decision_reason``,
        the raw predictions as strings and the normalised predictions. On any
        failure, ``{"error": <message>}``.
    """
    try:
        # Get model predictions
        phishing_pred_raw = phishing_model.predict(prepare_phishing_input(url))[0]
        malware_pred_raw = malware_model.predict(prepare_malware_input(url))[0]

        # Normalize predictions
        phishing_pred = normalize_prediction(phishing_pred_raw)
        malware_pred = normalize_prediction(malware_pred_raw)

        if malware_pred == "threat" and phishing_pred == "threat":
            final_result = "Malicious"
            reason = "Both models detected threats - High risk malware and phishing"

        elif malware_pred == "threat" and phishing_pred == "benign":
            final_result = "Malicious"
            reason = "Malware model detected malicious content"

        elif malware_pred == "benign" and phishing_pred == "benign":
            final_result = "Benign"
            reason = "Both models confirm URL is safe"

        elif malware_pred == "benign" and phishing_pred == "threat":
            # Benign bias: a lone phishing hit on an ordinary-looking site is
            # treated as a false positive.
            if _looks_like_legitimate_site(url):
                final_result = "Benign"
                reason = "Legitimate website structure detected, overriding phishing model false positive"
            else:
                final_result = "Phishing"
                reason = "Phishing model detected phishing attempt"

        else:
            # At least one prediction normalised to "unknown".
            final_result = "Suspicious"
            reason = f"Inconclusive results - Malware: {malware_pred}, Phishing: {phishing_pred}"

        return {
            "url": url,
            "final_result": final_result,
            "decision_reason": reason,
            "phishing_model_prediction": str(phishing_pred_raw),
            "malware_model_prediction": str(malware_pred_raw),
            "normalized_phishing": phishing_pred,
            "normalized_malware": malware_pred,
        }

    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        return {"error": str(e)}
212
+
213
+ # -------------------------------
214
+ # GRADIO INTERFACE
215
+ # -------------------------------
216
def interface_fn(url):
    """Gradio callback: analyse ``url`` and return a formatted text report.

    The input is stripped of surrounding whitespace; ``https://`` is prepended
    when no http/https scheme is present (checked case-insensitively).

    Args:
        url: Raw text from the input box (may be empty or ``None``).

    Returns:
        A human-readable report string, or an error message prefixed with a
        cross mark when the input is empty or the analysis fails.
    """
    # Normalise first so that validation and analysis see the same value.
    url = (url or "").strip()
    if not url:
        return "❌ Please enter a valid URL"

    # Add protocol if missing
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    result = analyze_url(url)

    if "error" in result:
        return f"❌ Error analyzing URL: {result['error']}"

    # Headline matching the verdict.
    verdict = result['final_result']
    if verdict == "Benign":
        banner = "✅ SAFE "
    elif verdict in ("Phishing", "Malicious"):
        banner = "❌ DANGEROUS "
    else:
        banner = "⚠️ SUSPICIOUS "

    # The report starts and ends with a newline, as the headline is prepended
    # directly to it.
    report_lines = [
        "",
        f"🔍 Analysis Report for: {result['url']}",
        "",
        f"⚠️ Final Verdict: {verdict}",
        f"📌 Decision Reason: {result['decision_reason']}",
        "",
        f"🔒 Phishing Model: {result['phishing_model_prediction']} (normalized: {result['normalized_phishing']})",
        f"🛡️ Malware Model: {result['malware_model_prediction']} (normalized: {result['normalized_malware']})",
        "",
        "=" * 50,
        "",
    ]
    return banner + "\n".join(report_lines)
251
+
252
+ # -------------------------------
253
+ # GRADIO APP
254
+ # -------------------------------
255
# Single-textbox UI: the URL goes to interface_fn and the returned report is
# shown in a read-only textbox.
demo = gr.Interface(
    fn=interface_fn,
    inputs=gr.Text(
        label="Enter URL to Analyze",
        placeholder="https://example.com or just example.com",
        lines=1,
    ),
    outputs=gr.Textbox(
        label="🛡️ Threat Analysis Report",
        lines=10,
        max_lines=15,
    ),
    title="🛡️ AI-Powered URL Threat Analyzer",
    description="""
**Advanced URL Security Scanner**

This tool uses dual AI models to detect:
• 🎣 Phishing attacks
• 🦠 Malware threats
• 🔒 Overall URL safety

Enter any URL to get a comprehensive security analysis.
""",
    examples=[
        ["https://www.google.com"],
        ["https://www.paypal.com/signin"],
        ["https://www.bbc.com/news"],
        ["bit.ly/suspicious-link"],
        ["http://malware-site.ru/download.exe"],
    ],
    theme=gr.themes.Soft(),
    # Keep the app in a centred, fixed-width column.
    css="""
.gradio-container {
    max-width: 800px;
    margin: auto;
}
""",
)
293
+
294
if __name__ == "__main__":
    # Listen on all interfaces at port 7860, request a public share link, and
    # surface Python errors in the UI.
    # NOTE(review): share=True publishes the app through a public tunnel;
    # confirm that is intended for this deployment.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)
301
+
302
+