Spaces:
Runtime error
Runtime error
"""
CNN Phishing Detector - Interactive Demo

Test any URL with both character-level CNN models:
    1. CNN URL  — analyzes the URL string itself
    2. CNN HTML — fetches the page and analyzes its HTML source

Usage:
    python scripts/predict_url_cnn.py
"""
| import sys | |
| import json | |
| import logging | |
| import warnings | |
| from pathlib import Path | |
| import os | |
| os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' | |
| os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' | |
| import numpy as np | |
| from colorama import init, Fore, Style | |
# Console / diagnostics setup for the interactive demo.
# colorama is initialised first so that the logging stream handler created
# below is bound to the already-wrapped standard streams.
init(autoreset=True)

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings(action='ignore')

# Compact, time-stamped log lines at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# Module-wide logger used by the detector and the interactive loop.
logger = logging.getLogger('cnn_predictor')
# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
# Two directory levels above this file (the parent of the folder that holds
# this script); all saved artifacts are resolved relative to it.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
MODELS_DIR = PROJECT_ROOT.joinpath('saved_models')

# Artifacts of the URL CNN: trained model and its character vocabulary.
URL_MODEL_PATH = MODELS_DIR.joinpath('cnn_url_model.keras')
URL_VOCAB_PATH = MODELS_DIR.joinpath('cnn_url_vocab.json')

# Artifacts of the HTML CNN: trained model and its character vocabulary.
HTML_MODEL_PATH = MODELS_DIR.joinpath('cnn_html_model.keras')
HTML_VOCAB_PATH = MODELS_DIR.joinpath('cnn_html_vocab.json')
| class CNNPhishingDetector: | |
| """Detect phishing URLs using both character-level CNN models.""" | |
| def __init__(self): | |
| self.url_model = None | |
| self.html_model = None | |
| self.url_vocab = None | |
| self.html_vocab = None | |
| self._load_url_model() | |
| self._load_html_model() | |
| # ── Loading ──────────────────────────────────────────────────── | |
| def _load_url_model(self): | |
| """Load URL CNN model and vocabulary.""" | |
| if not URL_VOCAB_PATH.exists() or not URL_MODEL_PATH.exists(): | |
| logger.warning("URL CNN model not found — skipping") | |
| return | |
| with open(URL_VOCAB_PATH, 'r') as f: | |
| self.url_vocab = json.load(f) | |
| import tensorflow as tf | |
| self.url_model = tf.keras.models.load_model(str(URL_MODEL_PATH)) | |
| logger.info(f"✓ URL CNN loaded (vocab={self.url_vocab['vocab_size']}, " | |
| f"max_len={self.url_vocab['max_len']})") | |
| def _load_html_model(self): | |
| """Load HTML CNN model and vocabulary.""" | |
| if not HTML_VOCAB_PATH.exists() or not HTML_MODEL_PATH.exists(): | |
| logger.warning("HTML CNN model not found — skipping") | |
| return | |
| with open(HTML_VOCAB_PATH, 'r') as f: | |
| self.html_vocab = json.load(f) | |
| import tensorflow as tf | |
| self.html_model = tf.keras.models.load_model(str(HTML_MODEL_PATH)) | |
| logger.info(f"✓ HTML CNN loaded (vocab={self.html_vocab['vocab_size']}, " | |
| f"max_len={self.html_vocab['max_len']})") | |
| # ── Encoding ─────────────────────────────────────────────────── | |
| def _encode_url(self, url: str) -> np.ndarray: | |
| """Encode a URL string for the URL CNN.""" | |
| char_to_idx = self.url_vocab['char_to_idx'] | |
| max_len = self.url_vocab['max_len'] | |
| encoded = [char_to_idx.get(c, 1) for c in url[:max_len]] | |
| encoded += [0] * (max_len - len(encoded)) | |
| return np.array([encoded], dtype=np.int32) | |
| def _encode_html(self, html: str) -> np.ndarray: | |
| """Encode an HTML string for the HTML CNN.""" | |
| char_to_idx = self.html_vocab['char_to_idx'] | |
| max_len = self.html_vocab['max_len'] | |
| encoded = [char_to_idx.get(c, 1) for c in html[:max_len]] | |
| encoded += [0] * (max_len - len(encoded)) | |
| return np.array([encoded], dtype=np.int32) | |
| # ── HTML fetching ────────────────────────────────────────────── | |
| def _fetch_html(url: str, timeout: int = 10) -> str | None: | |
| """Fetch HTML content from a URL. Returns None on failure.""" | |
| try: | |
| import requests | |
| headers = { | |
| 'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' | |
| 'AppleWebKit/537.36 (KHTML, like Gecko) ' | |
| 'Chrome/120.0.0.0 Safari/537.36'), | |
| } | |
| resp = requests.get(url, headers=headers, timeout=timeout, | |
| verify=False, allow_redirects=True) | |
| resp.raise_for_status() | |
| return resp.text | |
| except Exception as e: | |
| logger.warning(f" Could not fetch HTML: {e}") | |
| return None | |
| # ── Prediction ───────────────────────────────────────────────── | |
| def predict_url(self, url: str, threshold: float = 0.5) -> dict | None: | |
| """Predict using the URL CNN model.""" | |
| if self.url_model is None: | |
| return None | |
| X = self._encode_url(url) | |
| phishing_prob = float(self.url_model.predict(X, verbose=0)[0][0]) | |
| legitimate_prob = 1.0 - phishing_prob | |
| is_phishing = phishing_prob >= threshold | |
| return { | |
| 'model_name': 'CNN URL (Char-level)', | |
| 'prediction': 'PHISHING' if is_phishing else 'LEGITIMATE', | |
| 'prediction_code': int(is_phishing), | |
| 'confidence': (phishing_prob if is_phishing else legitimate_prob) * 100, | |
| 'phishing_probability': phishing_prob * 100, | |
| 'legitimate_probability': legitimate_prob * 100, | |
| 'threshold': threshold, | |
| } | |
| def predict_html(self, html: str, threshold: float = 0.5) -> dict | None: | |
| """Predict using the HTML CNN model.""" | |
| if self.html_model is None: | |
| return None | |
| X = self._encode_html(html) | |
| phishing_prob = float(self.html_model.predict(X, verbose=0)[0][0]) | |
| legitimate_prob = 1.0 - phishing_prob | |
| is_phishing = phishing_prob >= threshold | |
| return { | |
| 'model_name': 'CNN HTML (Char-level)', | |
| 'prediction': 'PHISHING' if is_phishing else 'LEGITIMATE', | |
| 'prediction_code': int(is_phishing), | |
| 'confidence': (phishing_prob if is_phishing else legitimate_prob) * 100, | |
| 'phishing_probability': phishing_prob * 100, | |
| 'legitimate_probability': legitimate_prob * 100, | |
| 'threshold': threshold, | |
| 'html_length': len(html), | |
| } | |
| def predict_full(self, url: str, threshold: float = 0.5) -> dict: | |
| """ | |
| Run both CNN models on a URL. | |
| Returns dict with url_result, html_result, and combined verdict. | |
| """ | |
| # URL CNN | |
| url_result = self.predict_url(url, threshold) | |
| # HTML CNN — fetch page first | |
| html_result = None | |
| html_content = None | |
| if self.html_model is not None: | |
| html_content = self._fetch_html(url) | |
| if html_content and len(html_content) >= 100: | |
| html_result = self.predict_html(html_content, threshold) | |
| # Combined verdict | |
| results = [r for r in [url_result, html_result] if r is not None] | |
| if len(results) == 2: | |
| avg_phish = (url_result['phishing_probability'] + | |
| html_result['phishing_probability']) / 2 | |
| combined_is_phishing = avg_phish >= (threshold * 100) | |
| combined = { | |
| 'prediction': 'PHISHING' if combined_is_phishing else 'LEGITIMATE', | |
| 'phishing_probability': avg_phish, | |
| 'legitimate_probability': 100 - avg_phish, | |
| 'confidence': avg_phish if combined_is_phishing else 100 - avg_phish, | |
| 'agree': url_result['prediction'] == html_result['prediction'], | |
| } | |
| elif len(results) == 1: | |
| r = results[0] | |
| combined = { | |
| 'prediction': r['prediction'], | |
| 'phishing_probability': r['phishing_probability'], | |
| 'legitimate_probability': r['legitimate_probability'], | |
| 'confidence': r['confidence'], | |
| 'agree': True, | |
| } | |
| else: | |
| combined = None | |
| return { | |
| 'url_result': url_result, | |
| 'html_result': html_result, | |
| 'html_fetched': html_content is not None, | |
| 'html_length': len(html_content) if html_content else 0, | |
| 'combined': combined, | |
| } | |
| # ── Pretty print ─────────────────────────────────────────────── | |
| def print_results(self, url: str, full: dict): | |
| """Print formatted prediction results from both models.""" | |
| print("\n" + "=" * 80) | |
| print(f"{Fore.CYAN}{Style.BRIGHT}CNN PHISHING DETECTION RESULTS{Style.RESET_ALL}") | |
| print("=" * 80) | |
| print(f"\n{Fore.YELLOW}URL:{Style.RESET_ALL} {url}") | |
| # ── URL CNN ── | |
| url_r = full['url_result'] | |
| if url_r: | |
| pred = url_r['prediction'] | |
| color = Fore.RED if pred == 'PHISHING' else Fore.GREEN | |
| icon = "⚠️" if pred == 'PHISHING' else "✓" | |
| print(f"\n{Style.BRIGHT}1. CNN URL (Character-level):{Style.RESET_ALL}") | |
| print(f" {icon} Prediction: {color}{Style.BRIGHT}{pred}{Style.RESET_ALL}") | |
| print(f" Confidence: {url_r['confidence']:.2f}%") | |
| print(f" Phishing: {Fore.RED}{url_r['phishing_probability']:6.2f}%{Style.RESET_ALL}") | |
| print(f" Legitimate: {Fore.GREEN}{url_r['legitimate_probability']:6.2f}%{Style.RESET_ALL}") | |
| else: | |
| print(f"\n{Style.BRIGHT}1. CNN URL:{Style.RESET_ALL} {Fore.YELLOW}Not available{Style.RESET_ALL}") | |
| # ── HTML CNN ── | |
| html_r = full['html_result'] | |
| if html_r: | |
| pred = html_r['prediction'] | |
| color = Fore.RED if pred == 'PHISHING' else Fore.GREEN | |
| icon = "⚠️" if pred == 'PHISHING' else "✓" | |
| print(f"\n{Style.BRIGHT}2. CNN HTML (Character-level):{Style.RESET_ALL}") | |
| print(f" {icon} Prediction: {color}{Style.BRIGHT}{pred}{Style.RESET_ALL}") | |
| print(f" Confidence: {html_r['confidence']:.2f}%") | |
| print(f" Phishing: {Fore.RED}{html_r['phishing_probability']:6.2f}%{Style.RESET_ALL}") | |
| print(f" Legitimate: {Fore.GREEN}{html_r['legitimate_probability']:6.2f}%{Style.RESET_ALL}") | |
| print(f" HTML length: {html_r['html_length']:,} chars") | |
| elif full['html_fetched']: | |
| print(f"\n{Style.BRIGHT}2. CNN HTML:{Style.RESET_ALL} " | |
| f"{Fore.YELLOW}HTML too short for analysis{Style.RESET_ALL}") | |
| else: | |
| print(f"\n{Style.BRIGHT}2. CNN HTML:{Style.RESET_ALL} " | |
| f"{Fore.YELLOW}Could not fetch page HTML{Style.RESET_ALL}") | |
| # ── Combined verdict ── | |
| combined = full['combined'] | |
| if combined: | |
| pred = combined['prediction'] | |
| color = Fore.RED if pred == 'PHISHING' else Fore.GREEN | |
| icon = "⚠️" if pred == 'PHISHING' else "✓" | |
| agree_str = (f"{Fore.GREEN}YES{Style.RESET_ALL}" if combined['agree'] | |
| else f"{Fore.YELLOW}NO{Style.RESET_ALL}") | |
| print(f"\n{'─' * 80}") | |
| print(f"{Style.BRIGHT}COMBINED VERDICT:{Style.RESET_ALL}") | |
| print(f" {icon} {color}{Style.BRIGHT}{pred}{Style.RESET_ALL} " | |
| f"(confidence: {combined['confidence']:.2f}%)") | |
| print(f" Phishing: {Fore.RED}{combined['phishing_probability']:6.2f}%{Style.RESET_ALL}") | |
| print(f" Legitimate: {Fore.GREEN}{combined['legitimate_probability']:6.2f}%{Style.RESET_ALL}") | |
| if url_r and html_r: | |
| print(f" Models agree: {agree_str}") | |
| print("\n" + "=" * 80 + "\n") | |
def main():
    """Interactive prediction loop.

    Loads the detector, exits with status 1 when no model is available, then
    repeatedly prompts for a URL, runs both models on it and prints the
    results. The loop ends on ``quit`` / ``exit`` / ``q``, on end-of-input
    (closed or piped stdin) or on Ctrl+C at the prompt.
    """
    print(f"\n{Fore.CYAN}{Style.BRIGHT}╔══════════════════════════════════════════════════════════════╗")
    print("║ CNN PHISHING DETECTOR - INTERACTIVE DEMO ║")
    print("║ URL CNN + HTML CNN (Dual Analysis) ║")
    print(f"╚══════════════════════════════════════════════════════════════╝{Style.RESET_ALL}\n")

    print(f"{Fore.YELLOW}Loading CNN models...{Style.RESET_ALL}")
    detector = CNNPhishingDetector()

    available = []
    if detector.url_model is not None:
        available.append("URL CNN")
    if detector.html_model is not None:
        available.append("HTML CNN")

    if not available:
        print(f"{Fore.RED}No CNN models found! Train models first.{Style.RESET_ALL}")
        sys.exit(1)

    print(f"{Fore.GREEN}✓ Models loaded: {', '.join(available)}{Style.RESET_ALL}\n")

    while True:
        print(f"{Fore.CYAN}{'─' * 80}{Style.RESET_ALL}")
        try:
            url = input(f"{Fore.YELLOW}Enter URL to test (or 'quit' to exit):{Style.RESET_ALL} ").strip()
        except (EOFError, KeyboardInterrupt):
            # input() raises EOFError when stdin is exhausted and
            # KeyboardInterrupt on Ctrl+C; leave the loop cleanly instead of
            # crashing with a traceback.
            print(f"\n{Fore.GREEN}Goodbye!{Style.RESET_ALL}\n")
            break

        if url.lower() in ('quit', 'exit', 'q'):
            print(f"\n{Fore.GREEN}Goodbye!{Style.RESET_ALL}\n")
            break

        if not url:
            print(f"{Fore.RED}Please enter a valid URL.{Style.RESET_ALL}\n")
            continue

        # Bare hosts such as "example.com" are assumed to be plain HTTP.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url

        try:
            full = detector.predict_full(url)
            detector.print_results(url, full)
        except Exception as e:
            # Top-level boundary of the demo: report the failure and keep
            # the prompt alive for the next URL.
            print(f"\n{Fore.RED}Error: {e}{Style.RESET_ALL}\n")
            logger.error(str(e))


if __name__ == '__main__':
    main()