File size: 14,339 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
"""

CNN Phishing Detector - Interactive Demo

Test any URL with both character-level CNN models:
  1. CNN URL  — analyzes the URL string itself
  2. CNN HTML — fetches the page and analyzes its HTML source

Usage:
    python scripts/predict_url_cnn.py

"""

import sys
import json
import logging
import warnings
from pathlib import Path

import os
# Set before TensorFlow is imported (the model loaders import it lazily).
# NOTE(review): presumably these quiet TensorFlow's native log output and turn
# off its oneDNN optimizations — confirm against the TensorFlow documentation.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import numpy as np
from colorama import init, Fore, Style

# Initialise colorama with autoreset so a colour/style does not carry over
# into the next print call.
init(autoreset=True)
# Ignore every Python warning for the whole process.
# NOTE(review): this presumably also hides the warning emitted for the
# unverified HTTPS requests made when fetching pages — confirm intended.
warnings.filterwarnings('ignore')

# Root logging configuration: INFO and above, prefixed with an HH:MM:SS
# timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
)
# Logger used for the load / fetch / error messages of this script.
logger = logging.getLogger('cnn_predictor')

# ---------------------------------------------------------------------------
# Project paths
# ---------------------------------------------------------------------------
# parents[1] of the resolved file path, i.e. the directory that contains this
# script's own folder (labelled "src/" by the original note — TODO confirm).
PROJECT_ROOT = Path(__file__).resolve().parents[1]  # src/
# Directory holding the trained models and their vocabularies.
MODELS_DIR = PROJECT_ROOT / 'saved_models'

# URL CNN: Keras model file and the JSON vocabulary read for URL encoding
# (the code reads its 'char_to_idx', 'max_len' and 'vocab_size' entries).
URL_MODEL_PATH = MODELS_DIR / 'cnn_url_model.keras'
URL_VOCAB_PATH = MODELS_DIR / 'cnn_url_vocab.json'

# HTML CNN: Keras model file and the JSON vocabulary read for HTML encoding
# (same three entries as the URL vocabulary).
HTML_MODEL_PATH = MODELS_DIR / 'cnn_html_model.keras'
HTML_VOCAB_PATH = MODELS_DIR / 'cnn_html_vocab.json'


class CNNPhishingDetector:
    """Detect phishing URLs using both character-level CNN models."""

    def __init__(self):
        """Create the detector and try to load both CNN models.

        All four model/vocabulary attributes start out as ``None``; each
        loader then fills in its own pair when the files it needs exist.
        """
        self.url_model = self.html_model = None
        self.url_vocab = self.html_vocab = None

        # URL model first, HTML model second.
        for load in (self._load_url_model, self._load_html_model):
            load()

    # ── Loading ────────────────────────────────────────────────────

    def _load_url_model(self):
        """Load the URL CNN model and its vocabulary when both files exist.

        If either ``URL_VOCAB_PATH`` or ``URL_MODEL_PATH`` is missing, a
        warning is logged and ``url_model`` / ``url_vocab`` are left as they
        were, so callers can detect that this model is unavailable.
        """
        if not URL_VOCAB_PATH.exists() or not URL_MODEL_PATH.exists():
            logger.warning("URL CNN model not found — skipping")
            return

        # Decode explicitly as UTF-8 (the JSON interchange encoding) so the
        # character table is read the same way on every platform instead of
        # depending on the locale's default encoding.
        with open(URL_VOCAB_PATH, 'r', encoding='utf-8') as f:
            self.url_vocab = json.load(f)

        # Imported here so TensorFlow is only loaded when a model is present.
        import tensorflow as tf
        self.url_model = tf.keras.models.load_model(str(URL_MODEL_PATH))
        logger.info(f"✓ URL CNN loaded (vocab={self.url_vocab['vocab_size']}, "
                     f"max_len={self.url_vocab['max_len']})")

    def _load_html_model(self):
        """Load the HTML CNN model and its vocabulary when both files exist.

        If either ``HTML_VOCAB_PATH`` or ``HTML_MODEL_PATH`` is missing, a
        warning is logged and ``html_model`` / ``html_vocab`` are left as
        they were, so callers can detect that this model is unavailable.
        """
        if not HTML_VOCAB_PATH.exists() or not HTML_MODEL_PATH.exists():
            logger.warning("HTML CNN model not found — skipping")
            return

        # Decode explicitly as UTF-8 (the JSON interchange encoding) so the
        # character table is read the same way on every platform instead of
        # depending on the locale's default encoding.
        with open(HTML_VOCAB_PATH, 'r', encoding='utf-8') as f:
            self.html_vocab = json.load(f)

        # Imported here so TensorFlow is only loaded when a model is present.
        import tensorflow as tf
        self.html_model = tf.keras.models.load_model(str(HTML_MODEL_PATH))
        logger.info(f"✓ HTML CNN loaded (vocab={self.html_vocab['vocab_size']}, "
                     f"max_len={self.html_vocab['max_len']})")

    # ── Encoding ───────────────────────────────────────────────────

    def _encode_url(self, url: str) -> np.ndarray:
        """Encode a URL string for the URL CNN."""
        char_to_idx = self.url_vocab['char_to_idx']
        max_len = self.url_vocab['max_len']
        encoded = [char_to_idx.get(c, 1) for c in url[:max_len]]
        encoded += [0] * (max_len - len(encoded))
        return np.array([encoded], dtype=np.int32)

    def _encode_html(self, html: str) -> np.ndarray:
        """Encode an HTML string for the HTML CNN."""
        char_to_idx = self.html_vocab['char_to_idx']
        max_len = self.html_vocab['max_len']
        encoded = [char_to_idx.get(c, 1) for c in html[:max_len]]
        encoded += [0] * (max_len - len(encoded))
        return np.array([encoded], dtype=np.int32)

    # ── HTML fetching ──────────────────────────────────────────────

    @staticmethod
    def _fetch_html(url: str, timeout: int = 10) -> str | None:
        """Download the page behind ``url`` and return its body as text.

        Best-effort: any exception raised while importing ``requests``,
        performing the request or checking the status is logged as a
        warning and turned into ``None``.

        Args:
            url: Address requested with HTTP GET; redirects are followed.
            timeout: Value handed to ``requests.get`` as ``timeout``.

        Returns:
            The response text, or ``None`` when the fetch failed.
        """
        try:
            import requests

            # Browser-like User-Agent header sent with the request.
            user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/120.0.0.0 Safari/537.36')
            # NOTE(review): verify=False disables TLS certificate checks —
            # presumably so pages with invalid certificates can still be
            # analysed; confirm this is intended.
            response = requests.get(
                url,
                headers={'User-Agent': user_agent},
                timeout=timeout,
                verify=False,
                allow_redirects=True,
            )
            response.raise_for_status()
            return response.text
        except Exception as e:
            logger.warning(f"  Could not fetch HTML: {e}")
            return None

    # ── Prediction ─────────────────────────────────────────────────

    def predict_url(self, url: str, threshold: float = 0.5) -> dict | None:
        """Predict using the URL CNN model."""
        if self.url_model is None:
            return None

        X = self._encode_url(url)
        phishing_prob = float(self.url_model.predict(X, verbose=0)[0][0])
        legitimate_prob = 1.0 - phishing_prob
        is_phishing = phishing_prob >= threshold

        return {
            'model_name': 'CNN URL (Char-level)',
            'prediction': 'PHISHING' if is_phishing else 'LEGITIMATE',
            'prediction_code': int(is_phishing),
            'confidence': (phishing_prob if is_phishing else legitimate_prob) * 100,
            'phishing_probability': phishing_prob * 100,
            'legitimate_probability': legitimate_prob * 100,
            'threshold': threshold,
        }

    def predict_html(self, html: str, threshold: float = 0.5) -> dict | None:
        """Predict using the HTML CNN model."""
        if self.html_model is None:
            return None

        X = self._encode_html(html)
        phishing_prob = float(self.html_model.predict(X, verbose=0)[0][0])
        legitimate_prob = 1.0 - phishing_prob
        is_phishing = phishing_prob >= threshold

        return {
            'model_name': 'CNN HTML (Char-level)',
            'prediction': 'PHISHING' if is_phishing else 'LEGITIMATE',
            'prediction_code': int(is_phishing),
            'confidence': (phishing_prob if is_phishing else legitimate_prob) * 100,
            'phishing_probability': phishing_prob * 100,
            'legitimate_probability': legitimate_prob * 100,
            'threshold': threshold,
            'html_length': len(html),
        }

    def predict_full(self, url: str, threshold: float = 0.5) -> dict:
        """

        Run both CNN models on a URL.



        Returns dict with url_result, html_result, and combined verdict.

        """
        # URL CNN
        url_result = self.predict_url(url, threshold)

        # HTML CNN — fetch page first
        html_result = None
        html_content = None
        if self.html_model is not None:
            html_content = self._fetch_html(url)
            if html_content and len(html_content) >= 100:
                html_result = self.predict_html(html_content, threshold)

        # Combined verdict
        results = [r for r in [url_result, html_result] if r is not None]
        if len(results) == 2:
            avg_phish = (url_result['phishing_probability'] +
                         html_result['phishing_probability']) / 2
            combined_is_phishing = avg_phish >= (threshold * 100)
            combined = {
                'prediction': 'PHISHING' if combined_is_phishing else 'LEGITIMATE',
                'phishing_probability': avg_phish,
                'legitimate_probability': 100 - avg_phish,
                'confidence': avg_phish if combined_is_phishing else 100 - avg_phish,
                'agree': url_result['prediction'] == html_result['prediction'],
            }
        elif len(results) == 1:
            r = results[0]
            combined = {
                'prediction': r['prediction'],
                'phishing_probability': r['phishing_probability'],
                'legitimate_probability': r['legitimate_probability'],
                'confidence': r['confidence'],
                'agree': True,
            }
        else:
            combined = None

        return {
            'url_result': url_result,
            'html_result': html_result,
            'html_fetched': html_content is not None,
            'html_length': len(html_content) if html_content else 0,
            'combined': combined,
        }

    # ── Pretty print ───────────────────────────────────────────────

    def print_results(self, url: str, full: dict):
        """Print the report produced by ``predict_full`` to stdout.

        Prints a banner with the URL, one section per model (or a notice when
        that model produced no result) and, when present, the combined
        verdict.

        Args:
            url: The URL that was analysed; echoed under the banner.
            full: Mapping with the keys ``url_result``, ``html_result``,
                ``html_fetched`` and ``combined`` read by this method.
        """
        bold, reset = Style.BRIGHT, Style.RESET_ALL
        banner = "=" * 80

        def styling(label):
            # Red + warning sign for phishing, green + check mark otherwise.
            if label == 'PHISHING':
                return Fore.RED, "⚠️"
            return Fore.GREEN, "✓"

        def show_model(heading, result):
            # The five lines shared by the URL and HTML sections.
            label = result['prediction']
            tint, mark = styling(label)
            print(f"\n{bold}{heading}{reset}")
            print(f"   {mark} Prediction: {tint}{bold}{label}{reset}")
            print(f"   Confidence:  {result['confidence']:.2f}%")
            print(f"   Phishing:    {Fore.RED}{result['phishing_probability']:6.2f}%{reset}")
            print(f"   Legitimate:  {Fore.GREEN}{result['legitimate_probability']:6.2f}%{reset}")

        print("\n" + banner)
        print(f"{Fore.CYAN}{bold}CNN PHISHING DETECTION RESULTS{reset}")
        print(banner)
        print(f"\n{Fore.YELLOW}URL:{reset} {url}")

        # ── URL CNN ──
        url_part = full['url_result']
        if url_part:
            show_model("1. CNN URL (Character-level):", url_part)
        else:
            print(f"\n{bold}1. CNN URL:{reset} {Fore.YELLOW}Not available{reset}")

        # ── HTML CNN ──
        html_part = full['html_result']
        if html_part:
            show_model("2. CNN HTML (Character-level):", html_part)
            print(f"   HTML length: {html_part['html_length']:,} chars")
        else:
            # No HTML verdict: say whether the page was fetched at all.
            if full['html_fetched']:
                reason = "HTML too short for analysis"
            else:
                reason = "Could not fetch page HTML"
            print(f"\n{bold}2. CNN HTML:{reset} {Fore.YELLOW}{reason}{reset}")

        # ── Combined verdict ──
        verdict = full['combined']
        if verdict:
            label = verdict['prediction']
            tint, mark = styling(label)
            if verdict['agree']:
                agreement = f"{Fore.GREEN}YES{reset}"
            else:
                agreement = f"{Fore.YELLOW}NO{reset}"

            print(f"\n{'─' * 80}")
            print(f"{bold}COMBINED VERDICT:{reset}")
            print(f"   {mark} {tint}{bold}{label}{reset}  "
                  f"(confidence: {verdict['confidence']:.2f}%)")
            print(f"   Phishing:    {Fore.RED}{verdict['phishing_probability']:6.2f}%{reset}")
            print(f"   Legitimate:  {Fore.GREEN}{verdict['legitimate_probability']:6.2f}%{reset}")
            # Agreement is only meaningful when both models gave a result.
            if url_part and html_part:
                print(f"   Models agree: {agreement}")

        print("\n" + banner + "\n")


def main():
    """Run the interactive prediction loop.

    Loads the detector and exits with status 1 when no model is available.
    Otherwise it repeatedly prompts for a URL, scores it and prints the
    report, until the user types ``quit`` / ``exit`` / ``q`` or the prompt
    is ended with end-of-file or an interrupt.
    """
    print(f"\n{Fore.CYAN}{Style.BRIGHT}╔══════════════════════════════════════════════════════════════╗")
    print("║         CNN PHISHING DETECTOR - INTERACTIVE DEMO            ║")
    print("║            URL CNN + HTML CNN (Dual Analysis)               ║")
    print(f"╚══════════════════════════════════════════════════════════════╝{Style.RESET_ALL}\n")

    print(f"{Fore.YELLOW}Loading CNN models...{Style.RESET_ALL}")
    detector = CNNPhishingDetector()

    available = []
    if detector.url_model is not None:
        available.append("URL CNN")
    if detector.html_model is not None:
        available.append("HTML CNN")

    if not available:
        print(f"{Fore.RED}No CNN models found! Train models first.{Style.RESET_ALL}")
        sys.exit(1)

    print(f"{Fore.GREEN}✓ Models loaded: {', '.join(available)}{Style.RESET_ALL}\n")

    while True:
        print(f"{Fore.CYAN}{'─' * 80}{Style.RESET_ALL}")
        try:
            url = input(f"{Fore.YELLOW}Enter URL to test (or 'quit' to exit):{Style.RESET_ALL} ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave cleanly instead of
            # ending the demo with a traceback.
            print(f"\n{Fore.GREEN}Goodbye!{Style.RESET_ALL}\n")
            break

        if url.lower() in ('quit', 'exit', 'q'):
            print(f"\n{Fore.GREEN}Goodbye!{Style.RESET_ALL}\n")
            break

        if not url:
            print(f"{Fore.RED}Please enter a valid URL.{Style.RESET_ALL}\n")
            continue

        # Schemes are case-insensitive: do not prepend a second scheme to
        # input such as "HTTP://example.com".
        if not url.lower().startswith(('http://', 'https://')):
            url = 'http://' + url

        # Keep the loop alive on any per-URL failure; report it and continue.
        try:
            full = detector.predict_full(url)
            detector.print_results(url, full)
        except Exception as e:
            print(f"\n{Fore.RED}Error: {e}{Style.RESET_ALL}\n")
            logger.error(str(e))


# Start the interactive demo only when this file is executed as a script.
if __name__ == '__main__':
    main()