# Spaces: Sleeping (page-header residue captured with the source; not part of the program)
import base64
import json
import logging
import os
import re
from urllib.parse import urlparse

import gradio as gr
import joblib
import numpy as np
import pandas as pd
import requests
import xgboost as xgb
# Preprocessing artifacts (presumably written by the training pipeline):
# the column order used to build the model input, the names of the
# non-numeric columns, and the fitted scaler.
feature_names = joblib.load('feature_names.joblib')
non_numeric_columns = joblib.load('non_numeric_columns.joblib')
scaler = joblib.load('scaler.joblib')

# Label encoders are loaded on a best-effort basis: a column whose encoder
# cannot be loaded is simply absent from ``label_encoders``. The failure is
# logged instead of being silently discarded so a missing or corrupt file
# is visible at startup.
label_encoders = {}
for col in non_numeric_columns:
    try:
        label_encoders[col] = joblib.load(f'le_{col}.joblib')
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Label encoder for column %r could not be loaded: %s", col, exc
        )

# Load XGBoost model
model = xgb.Booster()
model.load_model('xgboost_best_model.json')
# API credentials and endpoints.
#
# SECURITY: the keys below were committed in plain text and must be treated
# as compromised. Each key is now read from an environment variable of the
# same name; the embedded literal is kept only as a backward-compatible
# fallback. TODO: rotate both keys and delete the literals.
GOOGLE_API_KEY = os.environ.get(
    "GOOGLE_API_KEY", "AIzaSyCmXigpl6t7MZyvq8Rv37XW_lZVCi0oMSM"
)
PHISHTANK_USER_AGENT = "phishtank/PHISH"
VIRUSTOTAL_API_KEY = os.environ.get(
    "VIRUSTOTAL_API_KEY",
    "7c38bbc8b0f461b25b38d1cda16404fc8c3997c6b569a257707505759da7a996",
)
VIRUSTOTAL_BASE_URL = "https://www.virustotal.com/api/v3/"

# Headers sent with every VirusTotal request.
vt_headers = {
    "x-apikey": VIRUSTOTAL_API_KEY,
    "Accept": "application/json",
}
def check_url_virustotal(target_url):
    """Classify ``target_url`` from its VirusTotal analysis report.

    Args:
        target_url: The URL to look up.

    Returns:
        ``"Phishing"`` when at least five vendors report a result containing
        "phishing" or "malicious"; otherwise ``"Legitimate"``. A non-200
        response or any failure also yields ``"Legitimate"`` (the lookup is
        best-effort), but failures are logged rather than hidden.
    """
    try:
        # The report endpoint is addressed by the URL's URL-safe base64
        # encoding with the "=" padding removed.
        url_id = base64.urlsafe_b64encode(target_url.encode()).decode().strip("=")
        response = requests.get(
            f"{VIRUSTOTAL_BASE_URL}urls/{url_id}",
            headers=vt_headers,
            timeout=10,
        )
        if response.status_code != 200:
            return "Legitimate"

        analysis = response.json()['data']['attributes']['last_analysis_results']
        phishing_keywords = ("phishing", "malicious")
        # A vendor's 'result' may be empty/None; such entries are skipped.
        malicious_count = sum(
            1
            for verdict in analysis.values()
            if verdict['result']
            and any(keyword in verdict['result'].lower() for keyword in phishing_keywords)
        )
        return "Phishing" if malicious_count >= 5 else "Legitimate"
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate; record the cause instead of discarding it.
        logging.getLogger(__name__).exception("VirusTotal lookup failed")
        return "Legitimate"
def check_url_phishtank(url):
    """Classify ``url`` using the PhishTank check-URL endpoint.

    Args:
        url: The URL to look up.

    Returns:
        ``"Phishing"`` when the response marks the URL as both
        ``in_database`` and ``verified``; otherwise ``"Legitimate"``. Any
        request or parsing failure also yields ``"Legitimate"`` (the lookup
        is best-effort), but the failure is logged.
    """
    endpoint = "https://checkurl.phishtank.com/checkurl/"
    payload = {"url": url, "format": "json"}
    headers = {"User-Agent": PHISHTANK_USER_AGENT}
    try:
        response = requests.post(endpoint, data=payload, headers=headers, timeout=5)
        response.raise_for_status()
        results = response.json().get('results', {})
        if results.get('in_database', False) and results.get('verified', False):
            return "Phishing"
        return "Legitimate"
    except Exception:
        logging.getLogger(__name__).exception("PhishTank lookup failed")
        return "Legitimate"
def check_url_safebrowsing(url):
    """Classify ``url`` using the Google Safe Browsing ``threatMatches:find`` call.

    Args:
        url: The URL to look up.

    Returns:
        ``"Phishing"`` when the response contains a non-empty ``matches``
        entry; otherwise ``"Legitimate"``. Any request or parsing failure
        also yields ``"Legitimate"`` (the lookup is best-effort), but the
        failure is logged.
    """
    endpoint = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
    payload = {
        "client": {"clientId": "PhishShield", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}],
        },
    }
    try:
        response = requests.post(
            endpoint,
            params={"key": GOOGLE_API_KEY},
            json=payload,
            timeout=5,
        )
        response.raise_for_status()
        # An absent or empty "matches" entry means nothing was flagged.
        if response.json().get("matches"):
            return "Phishing"
        return "Legitimate"
    except Exception as exc:
        # Log only the exception type: the request URL carries the API key
        # and requests includes the URL in its error messages.
        logging.getLogger(__name__).warning(
            "Safe Browsing lookup failed: %s", type(exc).__name__
        )
        return "Legitimate"
def extract_features(url):
    """Build the lexical feature dictionary for ``url``.

    Every value is derived from the URL text alone (character counts, word
    lengths, keyword flags, Shannon entropy of the characters); no network
    access is performed.

    Args:
        url: The URL string to analyse. An empty string is accepted.

    Returns:
        A dict mapping feature name to a numeric value.

    NOTE(review): several features are coarse heuristics -- ``ip`` fires on
    any digit in the hostname, ``prefix_suffix`` checks whether the URL
    starts with "www", the brand features only know "apple", and the path
    word statistics include the empty segment before the leading "/".
    They are kept as-is; confirm they match how the training features were
    produced before changing them.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or ''
    path = parsed.path or ''

    # ``parsed.port`` raises ValueError for a non-numeric or out-of-range
    # port; treat that like a missing port instead of aborting extraction.
    try:
        port = parsed.port or 0
    except ValueError:
        port = 0

    url_length = len(url)
    url_lower = url.lower()

    # Shannon entropy of the character distribution. Characters are visited
    # in sorted order so the floating-point sum does not depend on set
    # iteration order.
    entropy = 0.0
    for char in sorted(set(url)):
        p = url.count(char) / url_length
        entropy -= p * np.log2(p)

    host_parts = hostname.split('.')
    url_word_lens = [len(word) for word in url.split()]
    host_word_lens = [len(word) for word in host_parts] if hostname else []
    path_word_lens = [len(word) for word in path.split('/')] if path else []

    common_tlds = ('.com', '.net', '.org', '.gov', '.edu')

    features = {
        "length_url": url_length,
        "length_hostname": len(hostname),
        "ip": 1 if any(char.isdigit() for char in hostname) else 0,
        "nb_dots": url.count('.'),
        "nb_hyphens": url.count('-'),
        "nb_at": url.count('@'),
        "nb_qm": url.count('?'),
        "nb_and": url.count('&'),
        "nb_or": url.count('|'),
        "nb_eq": url.count('='),
        "nb_underscore": url.count('_'),
        "nb_tilde": url.count('~'),
        "nb_percent": url.count('%'),
        "nb_slash": url.count('/'),
        "nb_star": url.count('*'),
        "nb_colon": url.count(':'),
        "nb_comma": url.count(','),
        "nb_semicolumn": url.count(';'),
        "nb_dollar": url.count('$'),
        "nb_space": url.count(' '),
        "nb_www": 1 if "www" in url else 0,
        "nb_com": 1 if ".com" in url else 0,
        "nb_dslash": url.count('//'),
        "http_in_path": 1 if "http" in path else 0,
        "https_token": 1 if "https" in url else 0,
        "ratio_digits_url": sum(c.isdigit() for c in url) / url_length if url_length > 0 else 0,
        "ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
        "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
        "port": port,
        "tld_in_path": 1 if any(tld in path for tld in common_tlds) else 0,
        "tld_in_subdomain": 1 if any(tld in hostname for tld in common_tlds) else 0,
        "abnormal_subdomain": 1 if len(host_parts) > 3 else 0,
        "nb_subdomains": len(host_parts) - 1,
        "prefix_suffix": 1 if url.startswith("www") else 0,
        "shortening_service": 1 if any(short in url for short in ['bit.ly', 'goo.gl', 'tinyurl.com']) else 0,
        "path_extension": 1 if any(ext in path for ext in ['.exe', '.zip', '.rar', '.tar', '.pdf']) else 0,
        "length_words_raw": len(url_word_lens),
        "char_repeat": len(set(url)),
        "shortest_words_raw": min(url_word_lens) if url_word_lens else 0,
        "longest_words_raw": max(url_word_lens) if url_word_lens else 0,
        "shortest_word_host": min(host_word_lens) if host_word_lens else 0,
        "longest_word_host": max(host_word_lens) if host_word_lens else 0,
        "shortest_word_path": min(path_word_lens) if path_word_lens else 0,
        "longest_word_path": max(path_word_lens) if path_word_lens else 0,
        "avg_words_raw": np.mean(url_word_lens) if url_word_lens else 0,
        "avg_word_host": np.mean(host_word_lens) if host_word_lens else 0,
        "avg_word_path": np.mean(path_word_lens) if path_word_lens else 0,
        "phish_hints": 1 if any(kw in url_lower for kw in ['login', 'secure', 'verify', 'account']) else 0,
        "domain_in_brand": 1 if 'apple' in hostname.lower() else 0,
        "brand_in_subdomain": 1 if 'apple' in (host_parts[0] if hostname else '') else 0,
        "brand_in_path": 1 if 'apple' in path.lower() else 0,
        "suspicious_tld": 1 if hostname.endswith(('.xyz', '.top', '.club', '.gq', '.cf', '.tk')) else 0,
        "entropy": entropy,
    }
    return features
def predict_with_model(url):
    """Classify ``url`` with the XGBoost model.

    The extracted features are placed in a one-row DataFrame whose columns
    are ``feature_names``, non-numeric columns are label-encoded, the row is
    scaled with ``scaler`` and then scored by ``model``.

    Args:
        url: The URL to classify.

    Returns:
        ``"Phishing"`` when the model output is at least 0.95, otherwise
        ``"Legitimate"``. Any failure also yields ``"Legitimate"`` (the
        caller's contract is fail-open), but the failure is logged so
        feature or model mismatches do not go unnoticed.
    """
    try:
        features = extract_features(url)
        input_data = pd.DataFrame([features], columns=feature_names)

        for col in non_numeric_columns:
            if col not in input_data.columns or col not in label_encoders:
                continue
            encoder = label_encoders[col]
            try:
                input_data[col] = encoder.transform(input_data[col].astype(str))
            except ValueError:
                # Value not handled by the encoder: fall back to the number
                # of known classes (presumably one past the last class
                # index -- confirm against how the encoder was fitted).
                input_data[col] = len(encoder.classes_)

        input_scaled = scaler.transform(input_data)
        phish_prob = model.predict(xgb.DMatrix(input_scaled))[0]
        return "Phishing" if phish_prob >= 0.95 else "Legitimate"
    except Exception:
        logging.getLogger(__name__).exception("Model prediction failed")
        return "Legitimate"
def predict_url(url):
    """Classify ``url`` as ``"Phishing"`` or ``"Legitimate"``.

    The URL is checked against VirusTotal, PhishTank and Google Safe
    Browsing, in that order; the first service that reports phishing
    decides the result and the remaining services are not queried.

    Args:
        url: The URL entered by the user. Surrounding whitespace is ignored
            and ``https://`` is prepended when no http(s) scheme is present.

    Returns:
        ``"Phishing"`` or ``"Legitimate"``. Empty input and unexpected
        failures yield ``"Legitimate"``; failures are logged.
    """
    try:
        url = (url or "").strip()
        if not url:
            # Nothing to look up; avoid three pointless network calls.
            return "Legitimate"
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Check VirusTotal
        vt_result = check_url_virustotal(url)
        if vt_result == "Phishing":
            return "Phishing"

        # Check PhishTank
        pt_result = check_url_phishtank(url)
        if pt_result == "Phishing":
            return "Phishing"

        # Check Google Safe Browsing
        gsb_result = check_url_safebrowsing(url)
        if gsb_result == "Phishing":
            return "Phishing"

        # If all three services return "Legitimate", return "Legitimate"
        # without running the model.
        if vt_result == "Legitimate" and pt_result == "Legitimate" and gsb_result == "Legitimate":
            return "Legitimate"

        # NOTE(review): the checkers in this file only ever return
        # "Phishing" or "Legitimate", so this fallback is currently
        # unreachable. It only runs if a checker gains a third,
        # inconclusive result -- confirm whether that was the intent.
        return predict_with_model(url)
    except Exception:
        logging.getLogger(__name__).exception("URL classification failed")
        return "Legitimate"
# Gradio Interface: a single text box in, a single text box out, wired to
# predict_url, served on all network interfaces at port 7860.
_url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
_result_output = gr.Textbox(label="Result")

# Sample URLs offered as clickable examples (one input value per row).
_example_urls = [
    ["https://www.apple.com"],
    ["https://login-facebook-secure.xyz/login.php"],
    ["https://bit.ly/suspicious-download"],
]

interface = gr.Interface(
    fn=predict_url,
    inputs=_url_input,
    outputs=_result_output,
    title="Phishing URL Detector",
    description="Check if a URL is phishing or legitimate",
    examples=_example_urls,
)

interface.launch(server_name="0.0.0.0", server_port=7860)