| | import numpy as np |
| | import pandas as pd |
| | import re |
| | import gradio as gr |
| | import requests |
| | import json |
| | import joblib |
| | from urllib.parse import urlparse |
| | import base64 |
| |
|
# ---------------------------------------------------------------------------
# Load persisted preprocessing artifacts and the trained classifier.
# These joblib files are produced by the offline training pipeline and must
# sit next to this script; loading fails fast (uncaught) if they are absent.
# ---------------------------------------------------------------------------
feature_names = joblib.load('feature_names.joblib')              # column order the model expects
non_numeric_columns = joblib.load('non_numeric_columns.joblib')  # columns needing label encoding
scaler = joblib.load('scaler.joblib')                            # fitted feature scaler

# One LabelEncoder per categorical column.  Encoders whose file is missing
# or unreadable are skipped on purpose (best effort); predict_with_model()
# only applies encoders that were actually loaded.
label_encoders = {}
for col in non_numeric_columns:
    try:
        label_encoders[col] = joblib.load(f'le_{col}.joblib')
    except Exception:
        # Was a bare `except:` — that also trapped KeyboardInterrupt and
        # SystemExit.  Keep the deliberate skip-on-failure behaviour but
        # stop swallowing interpreter-exit exceptions.
        pass

model = joblib.load('AdaBoost_best_model.joblib')
| |
|
| | |
# ---------------------------------------------------------------------------
# Third-party reputation-service credentials.
# SECURITY NOTE(review): these API keys are hard-coded and committed to
# source control.  They should be rotated and loaded from environment
# variables or a secrets manager instead.
# ---------------------------------------------------------------------------
GOOGLE_API_KEY = "AIzaSyCmXigpl6t7MZyvq8Rv37XW_lZVCi0oMSM"  # Google Safe Browsing v4
PHISHTANK_USER_AGENT = "phishtank/PHISH"  # User-Agent string sent to the PhishTank API
VIRUSTOTAL_API_KEY = "7c38bbc8b0f461b25b38d1cda16404fc8c3997c6b569a257707505759da7a996"
VIRUSTOTAL_BASE_URL = "https://www.virustotal.com/api/v3/"

# Default headers attached to every VirusTotal v3 request.
vt_headers = {
    "x-apikey": VIRUSTOTAL_API_KEY,
    "Accept": "application/json"
}
| |
|
def check_url_virustotal(target_url):
    """
    Query VirusTotal v3 for an existing analysis of *target_url*.

    Returns:
        "Phishing" when at least 5 vendors labelled the URL with a result
        containing "phishing" or "malicious"; otherwise "Legitimate".
        Any HTTP error, unknown URL (non-200), malformed payload, or
        network failure also yields "Legitimate" — this checker is
        best-effort and deliberately fails open.
    """
    try:
        # VT identifies a URL by the unpadded URL-safe base64 of the URL itself.
        url_id = base64.urlsafe_b64encode(target_url.encode()).decode().strip("=")

        response = requests.get(f"{VIRUSTOTAL_BASE_URL}urls/{url_id}",
                                headers=vt_headers, timeout=10)
        if response.status_code != 200:
            # URL not yet in the VT corpus (404) or API problem: fail open.
            return "Legitimate"

        data = response.json()
        # .get chain instead of direct indexing: one malformed vendor entry
        # no longer aborts the whole count via an exception.
        analyses = data.get('data', {}).get('attributes', {}).get('last_analysis_results', {})
        phishing_keywords = ("phishing", "malicious")
        malicious_count = sum(
            1
            for result_data in analyses.values()
            if result_data.get('result')
            and any(kw in result_data['result'].lower() for kw in phishing_keywords)
        )
        return "Phishing" if malicious_count >= 5 else "Legitimate"
    except Exception:
        # Was a bare `except:` (which also caught KeyboardInterrupt /
        # SystemExit).  Keep the fail-open contract, narrower.
        return "Legitimate"
| |
|
def check_url_phishtank(url):
    """
    Ask the PhishTank database whether *url* is a known, verified phish.

    Returns:
        "Phishing" when the URL is both in the database and verified;
        "Legitimate" otherwise, including on any request/parse failure
        (best-effort, fail-open).
    """
    try:
        resp = requests.post(
            "https://checkurl.phishtank.com/checkurl/",
            data={"url": url, "format": "json"},
            headers={"User-Agent": PHISHTANK_USER_AGENT},
            timeout=5,
        )
        resp.raise_for_status()
        results = resp.json().get('results', {})
        # Only a database hit that PhishTank has verified counts.
        if results.get('in_database', False) and results.get('verified', False):
            return "Phishing"
        return "Legitimate"
    except Exception:
        return "Legitimate"
| |
|
def check_url_safebrowsing(url):
    """
    Look up *url* against the Google Safe Browsing v4 threatMatches API.

    Returns:
        "Phishing" when the response contains any threat match;
        "Legitimate" otherwise, including on any request failure
        (best-effort, fail-open).
    """
    request_body = {
        "client": {"clientId": "PhishShield", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}],
        },
    }
    try:
        resp = requests.post(
            f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={GOOGLE_API_KEY}",
            json=request_body,
            timeout=5,
        )
        resp.raise_for_status()
        # An empty body ({}) means no threats were matched.
        return "Phishing" if "matches" in resp.json() else "Legitimate"
    except Exception:
        return "Legitimate"
| |
|
def extract_features(url):
    """
    Build the lexical feature dictionary the phishing classifier expects.

    Every feature is derived from the URL string alone (no network access):
    character counts, token-length statistics over the raw URL / hostname /
    path, a handful of heuristic flags, and the Shannon entropy of the
    URL's character distribution.

    Args:
        url: Full URL string, ideally including the scheme.

    Returns:
        dict mapping feature name -> numeric value.
    """
    components = urlparse(url)
    host = components.hostname or ''
    url_path = components.path or ''

    # Shannon entropy of the character distribution of the whole URL.
    shannon = 0.0
    if len(url) > 0:
        for ch in set(url):
            freq = url.count(ch) / len(url)
            shannon -= freq * np.log2(freq) if freq > 0 else 0

    # Tokenizations reused by several features below.
    raw_words = url.split()
    host_labels = host.split('.')
    path_segments = url_path.split('/')
    known_tlds = ['.com', '.net', '.org', '.gov', '.edu']

    return {
        "length_url": len(url),
        "length_hostname": len(host),
        # NOTE: despite the name, this only flags "hostname contains a digit".
        "ip": 1 if any(ch.isdigit() for ch in host) else 0,
        "nb_dots": url.count('.'),
        "nb_hyphens": url.count('-'),
        "nb_at": url.count('@'),
        "nb_qm": url.count('?'),
        "nb_and": url.count('&'),
        "nb_or": url.count('|'),
        "nb_eq": url.count('='),
        "nb_underscore": url.count('_'),
        "nb_tilde": url.count('~'),
        "nb_percent": url.count('%'),
        "nb_slash": url.count('/'),
        "nb_star": url.count('*'),
        "nb_colon": url.count(':'),
        "nb_comma": url.count(','),
        "nb_semicolumn": url.count(';'),
        "nb_dollar": url.count('$'),
        "nb_space": url.count(' '),
        "nb_www": 1 if "www" in url else 0,
        "nb_com": 1 if ".com" in url else 0,
        # Includes the scheme's "//" when a scheme is present.
        "nb_dslash": url.count('//'),
        "http_in_path": 1 if "http" in url_path else 0,
        "https_token": 1 if "https" in url else 0,
        "ratio_digits_url": sum(ch.isdigit() for ch in url) / len(url) if len(url) > 0 else 0,
        "ratio_digits_host": sum(ch.isdigit() for ch in host) / len(host) if host else 0,
        "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
        "port": components.port if components.port else 0,
        "tld_in_path": 1 if any(tld in url_path for tld in known_tlds) else 0,
        "tld_in_subdomain": 1 if any(tld in host for tld in known_tlds) else 0,
        "abnormal_subdomain": 1 if len(host_labels) > 3 else 0,
        "nb_subdomains": len(host_labels) - 1,
        "prefix_suffix": 1 if url.startswith("www") else 0,
        "shortening_service": 1 if any(short in url for short in ['bit.ly', 'goo.gl', 'tinyurl.com']) else 0,
        "path_extension": 1 if any(ext in url_path for ext in ['.exe', '.zip', '.rar', '.tar', '.pdf']) else 0,
        "length_words_raw": len(raw_words),
        # NOTE: despite the name, this is the number of DISTINCT characters.
        "char_repeat": len(set(url)),
        "shortest_words_raw": min(len(w) for w in raw_words) if raw_words else 0,
        "longest_words_raw": max(len(w) for w in raw_words) if raw_words else 0,
        "shortest_word_host": min(len(w) for w in host_labels) if host else 0,
        "longest_word_host": max(len(w) for w in host_labels) if host else 0,
        # A leading "/" yields an empty first segment, so the minimum is
        # usually 0 for any non-empty path.
        "shortest_word_path": min(len(w) for w in path_segments) if url_path else 0,
        "longest_word_path": max(len(w) for w in path_segments) if url_path else 0,
        "avg_words_raw": np.mean([len(w) for w in raw_words]) if raw_words else 0,
        "avg_word_host": np.mean([len(w) for w in host_labels]) if host else 0,
        "avg_word_path": np.mean([len(w) for w in path_segments]) if url_path else 0,
        "phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,
        # Brand checks are hard-coded to "apple" only.
        "domain_in_brand": 1 if 'apple' in host.lower() else 0,
        "brand_in_subdomain": 1 if 'apple' in (host_labels[0] if host else '') else 0,
        "brand_in_path": 1 if 'apple' in url_path.lower() else 0,
        "suspicious_tld": 1 if host.endswith(('.xyz', '.top', '.club', '.gq', '.cf', '.tk')) else 0,
        "entropy": shannon,
    }
| |
|
def predict_with_model(url):
    """
    Classify *url* with the locally trained AdaBoost model.

    The URL is converted to a lexical feature vector by extract_features(),
    aligned to the training-time column order (feature_names), passed
    through the persisted label encoders and scaler, and scored with
    predict_proba.

    Returns:
        "Phishing" when the model's phishing probability is >= 0.95,
        otherwise "Legitimate".  Any failure anywhere in the pipeline
        also yields "Legitimate" — this function deliberately fails open.
    """
    try:
        features = extract_features(url)

        # Single-row frame reindexed to the exact training column order;
        # any name in feature_names not produced by extract_features
        # becomes NaN.
        input_data = pd.DataFrame([features], columns=feature_names)

        # Apply the persisted per-column label encoders.  A value unseen
        # at training time raises ValueError and is mapped to a sentinel
        # one past the last known class index.
        for col in non_numeric_columns:
            if col in input_data.columns and col in label_encoders:
                try:
                    input_data[col] = label_encoders[col].transform(input_data[col].astype(str))
                except ValueError:
                    input_data[col] = len(label_encoders[col].classes_)

        input_scaled = scaler.transform(input_data)

        # Column 1 of predict_proba is assumed to be the phishing class —
        # TODO(review): confirm against the training labels.
        phish_prob = model.predict_proba(input_scaled)[0][1]

        # Very high threshold: only near-certain predictions are flagged.
        if phish_prob >= 0.95:
            return "Phishing"
        else:
            return "Legitimate"
    except Exception as e:
        # Fail open: any error is treated as "Legitimate".
        return "Legitimate"
| |
|
def predict_url(url):
    """
    Top-level verdict for a URL: external reputation services first,
    ML model as fallback.

    The URL is normalized to carry a scheme, then checked against
    VirusTotal, PhishTank, and Google Safe Browsing in turn; the first
    "Phishing" verdict short-circuits.  Any unexpected error fails open
    to "Legitimate".

    Returns:
        "Phishing" or "Legitimate".
    """
    try:
        # Default to https:// when the user typed a bare domain.
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        verdicts = []
        for checker in (check_url_virustotal, check_url_phishtank, check_url_safebrowsing):
            verdict = checker(url)
            if verdict == "Phishing":
                return "Phishing"
            verdicts.append(verdict)

        # NOTE(review): every checker returns only "Phishing" or
        # "Legitimate", so this condition is always true here and the ML
        # fallback below is currently unreachable.  Making the fallback
        # live would require the checkers to report an "unknown" state.
        if all(v == "Legitimate" for v in verdicts):
            return "Legitimate"

        return predict_with_model(url)

    except Exception:
        return "Legitimate"
| |
|
| | |
# ---------------------------------------------------------------------------
# Gradio UI: one textbox in, one verdict string out.
# ---------------------------------------------------------------------------
demo_examples = [
    ["https://www.apple.com"],
    ["https://login-facebook-secure.xyz/login.php"],
    ["https://bit.ly/suspicious-download"],
]

interface = gr.Interface(
    fn=predict_url,
    inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com"),
    outputs=gr.Textbox(label="Result"),
    title="Phishing URL Detector",
    description="Check if a URL is phishing or legitimate",
    examples=demo_examples,
)

# Bind to all interfaces so the app is reachable from outside a container.
interface.launch(server_name="0.0.0.0", server_port=7860)