"""Phishing URL detector.

Combines three reputation services (VirusTotal, PhishTank, Google Safe
Browsing) with a pre-trained AdaBoost classifier over lexical URL
features, and serves the result through a Gradio web UI.

Every check is deliberately fail-open: any network/parsing error yields
"Legitimate" so an outage never blocks the user.  Failures are logged
instead of silently swallowed.
"""

import base64
import json
import logging
import os
import re
from urllib.parse import urlparse

import gradio as gr
import joblib
import numpy as np
import pandas as pd
import requests

logger = logging.getLogger(__name__)

# --------------------------------------------------------------------------
# Model artefacts (produced by the offline training pipeline).
# --------------------------------------------------------------------------
feature_names = joblib.load('feature_names.joblib')
non_numeric_columns = joblib.load('non_numeric_columns.joblib')
scaler = joblib.load('scaler.joblib')

label_encoders = {}
for col in non_numeric_columns:
    try:
        label_encoders[col] = joblib.load(f'le_{col}.joblib')
    except (FileNotFoundError, OSError):
        # Encoder was not exported for this column; the column is simply
        # skipped at predict time (see predict_with_model).
        logger.warning("Label encoder for column %r not found; skipping", col)

# Load AdaBoost model
model = joblib.load('AdaBoost_best_model.joblib')

# --------------------------------------------------------------------------
# API credentials.
# SECURITY: these keys were committed to source control and should be
# rotated.  Environment variables take precedence; the literals remain only
# as a backward-compatible fallback.
# --------------------------------------------------------------------------
GOOGLE_API_KEY = os.environ.get(
    "GOOGLE_API_KEY", "AIzaSyCmXigpl6t7MZyvq8Rv37XW_lZVCi0oMSM")
PHISHTANK_USER_AGENT = "phishtank/PHISH"
VIRUSTOTAL_API_KEY = os.environ.get(
    "VIRUSTOTAL_API_KEY",
    "7c38bbc8b0f461b25b38d1cda16404fc8c3997c6b569a257707505759da7a996")
VIRUSTOTAL_BASE_URL = "https://www.virustotal.com/api/v3/"

vt_headers = {
    "x-apikey": VIRUSTOTAL_API_KEY,
    "Accept": "application/json",
}


def check_url_virustotal(target_url):
    """Query the VirusTotal v3 URL report for *target_url*.

    Returns "Phishing" when at least 5 vendors label the URL with a
    phishing/malicious verdict, otherwise "Legitimate".  Any failure
    (network, HTTP error, unexpected payload) is logged and treated as
    "Legitimate" (fail-open).
    """
    try:
        # VT identifies a URL by its unpadded urlsafe-base64 encoding.
        url_id = base64.urlsafe_b64encode(target_url.encode()).decode().strip("=")
        response = requests.get(
            f"{VIRUSTOTAL_BASE_URL}urls/{url_id}", headers=vt_headers, timeout=10)
        if response.status_code != 200:
            # 404 = unknown URL, 429 = quota; either way we cannot condemn it.
            return "Legitimate"
        results = response.json()['data']['attributes']['last_analysis_results']
        phishing_keywords = ("phishing", "malicious")
        malicious_count = sum(
            1 for vendor_result in results.values()
            if vendor_result['result']
            and any(kw in vendor_result['result'].lower() for kw in phishing_keywords)
        )
        return "Phishing" if malicious_count >= 5 else "Legitimate"
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        logger.warning("VirusTotal check failed for %s: %s", target_url, e)
        return "Legitimate"


def check_url_phishtank(url):
    """Ask PhishTank whether *url* is a verified phish.

    Returns "Phishing" only when the URL is both in the database and
    verified; "Legitimate" otherwise or on any error (fail-open).
    """
    endpoint = "https://checkurl.phishtank.com/checkurl/"
    payload = {"url": url, "format": "json"}
    headers = {"User-Agent": PHISHTANK_USER_AGENT}
    try:
        response = requests.post(endpoint, data=payload, headers=headers, timeout=5)
        response.raise_for_status()
        results = response.json().get('results', {})
        if results.get('in_database', False) and results.get('verified', False):
            return "Phishing"
        return "Legitimate"
    except (requests.RequestException, ValueError) as e:
        logger.warning("PhishTank check failed for %s: %s", url, e)
        return "Legitimate"


def check_url_safebrowsing(url):
    """Query Google Safe Browsing v4 threatMatches.find for *url*.

    Returns "Phishing" when any threat match is reported (malware, social
    engineering, or unwanted software), otherwise "Legitimate"; errors are
    logged and treated as "Legitimate" (fail-open).
    """
    endpoint = (
        "https://safebrowsing.googleapis.com/v4/threatMatches:find"
        f"?key={GOOGLE_API_KEY}"
    )
    payload = {
        "client": {"clientId": "PhishShield", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}],
        },
    }
    try:
        response = requests.post(endpoint, json=payload, timeout=5)
        response.raise_for_status()
        return "Phishing" if "matches" in response.json() else "Legitimate"
    except (requests.RequestException, ValueError) as e:
        logger.warning("Safe Browsing check failed for %s: %s", url, e)
        return "Legitimate"


def extract_features(url):
    """Compute the lexical feature dict the model was trained on.

    NOTE: several features are misleadingly named (e.g. "ip" only tests
    whether the hostname contains a digit, "https_token" matches the
    substring "https" anywhere in the URL, and "prefix_suffix" can never
    fire after a scheme is prepended).  They are kept exactly as defined
    here because the serialized model was trained on these definitions —
    do not "fix" them without retraining.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname if parsed.hostname else ''
    path = parsed.path if parsed.path else ''

    # Shannon entropy of the URL's character distribution (bits/char).
    entropy = 0.0
    if len(url) > 0:
        for c in set(url):
            p = url.count(c) / len(url)
            entropy -= p * np.log2(p) if p > 0 else 0

    features = {
        "length_url": len(url),
        "length_hostname": len(hostname),
        # NOTE: not a real IP test — true when the hostname has any digit.
        "ip": 1 if any(char.isdigit() for char in hostname) else 0,
        "nb_dots": url.count('.'),
        "nb_hyphens": url.count('-'),
        "nb_at": url.count('@'),
        "nb_qm": url.count('?'),
        "nb_and": url.count('&'),
        "nb_or": url.count('|'),
        "nb_eq": url.count('='),
        "nb_underscore": url.count('_'),
        "nb_tilde": url.count('~'),
        "nb_percent": url.count('%'),
        "nb_slash": url.count('/'),
        "nb_star": url.count('*'),
        "nb_colon": url.count(':'),
        "nb_comma": url.count(','),
        "nb_semicolumn": url.count(';'),
        "nb_dollar": url.count('$'),
        "nb_space": url.count(' '),
        "nb_www": 1 if "www" in url else 0,
        "nb_com": 1 if ".com" in url else 0,
        # Includes the scheme's "//" — kept to match training.
        "nb_dslash": url.count('//'),
        "http_in_path": 1 if "http" in path else 0,
        # Matches the substring "https" anywhere, including the scheme.
        "https_token": 1 if "https" in url else 0,
        "ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
        "ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
        "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
        "port": parsed.port if parsed.port else 0,
        "tld_in_path": 1 if any(tld in path for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
        "tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
        "abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
        "nb_subdomains": len(hostname.split('.')) - 1,
        # Can never be 1 once predict_url prepends "https://" — kept as trained.
        "prefix_suffix": 1 if url.startswith("www") else 0,
        "shortening_service": 1 if any(short in url for short in ['bit.ly', 'goo.gl', 'tinyurl.com']) else 0,
        "path_extension": 1 if any(ext in path for ext in ['.exe', '.zip', '.rar', '.tar', '.pdf']) else 0,
        "length_words_raw": len(url.split()),
        # NOTE: number of distinct characters, not a repetition count.
        "char_repeat": len(set(url)),
        "shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
        "longest_words_raw": max(len(word) for word in url.split()) if url.split() else 0,
        "shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
        "longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
        # A leading "/" yields an empty first segment, so this is often 0.
        "shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
        "longest_word_path": max(len(word) for word in path.split('/')) if path else 0,
        "avg_words_raw": np.mean([len(word) for word in url.split()]) if url.split() else 0,
        "avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
        "avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
        "phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,
        # Brand checks are hard-coded to 'apple' — kept as trained.
        "domain_in_brand": 1 if 'apple' in hostname.lower() else 0,
        "brand_in_subdomain": 1 if 'apple' in (hostname.split('.')[0] if hostname else '') else 0,
        "brand_in_path": 1 if 'apple' in path.lower() else 0,
        "suspicious_tld": 1 if hostname.endswith(('.xyz', '.top', '.club', '.gq', '.cf', '.tk')) else 0,
        "entropy": entropy,
    }
    return features


def predict_with_model(url):
    """Classify *url* with the AdaBoost model.

    Returns "Phishing" only when the predicted phishing probability is at
    least 0.95 (a deliberately high bar, since the model is the last
    resort after three reputation services).  Any failure is logged and
    reported as "Legitimate" (fail-open).
    """
    try:
        features = extract_features(url)
        input_data = pd.DataFrame([features], columns=feature_names)
        for col in non_numeric_columns:
            if col in input_data.columns and col in label_encoders:
                try:
                    input_data[col] = label_encoders[col].transform(
                        input_data[col].astype(str))
                except ValueError:
                    # Unseen category: map to one past the known classes.
                    input_data[col] = len(label_encoders[col].classes_)
        input_scaled = scaler.transform(input_data)
        # AdaBoost model prediction (probability of class 1 = phishing).
        phish_prob = model.predict_proba(input_scaled)[0][1]
        return "Phishing" if phish_prob >= 0.95 else "Legitimate"
    except Exception as e:
        logger.exception("Model prediction failed for %s: %s", url, e)
        return "Legitimate"


def predict_url(url):
    """Top-level verdict for *url*: reputation services first, model last.

    A "Phishing" verdict from any service wins immediately.  If all three
    services answer "Legitimate" the model is skipped entirely; the model
    only runs in the (currently unreachable, defensive) case where a
    service returned something other than the two known labels.
    """
    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Check VirusTotal
        vt_result = check_url_virustotal(url)
        if vt_result == "Phishing":
            return "Phishing"

        # Check PhishTank
        pt_result = check_url_phishtank(url)
        if pt_result == "Phishing":
            return "Phishing"

        # Check Google Safe Browsing
        gsb_result = check_url_safebrowsing(url)
        if gsb_result == "Phishing":
            return "Phishing"

        # If all three services return "Legitimate", skip the model.
        if (vt_result == "Legitimate" and pt_result == "Legitimate"
                and gsb_result == "Legitimate"):
            return "Legitimate"

        return predict_with_model(url)
    except Exception as e:
        logger.exception("predict_url failed for %s: %s", url, e)
        return "Legitimate"


# Gradio Interface
interface = gr.Interface(
    fn=predict_url,
    inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com"),
    outputs=gr.Textbox(label="Result"),
    title="Phishing URL Detector",
    description="Check if a URL is phishing or legitimate",
    examples=[
        ["https://www.apple.com"],
        ["https://login-facebook-secure.xyz/login.php"],
        ["https://bit.ly/suspicious-download"],
    ],
)

if __name__ == "__main__":
    interface.launch(server_name="0.0.0.0", server_port=7860)