File size: 16,686 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
"""

URL Phishing Detector - Interactive Demo



Test any URL with all trained models and see predictions with confidence scores.

"""

import sys
import pandas as pd
import joblib
from pathlib import Path
from colorama import init, Fore, Style

# Initialize colorama for colored output
# (autoreset re-enables default styling after every print)
init(autoreset=True)
import logging

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger("url_predictor")

# Make the project root importable so the feature extractor package resolves.
sys.path.append(str(Path(__file__).parent.parent))
from scripts.feature_extraction.url.url_features_v2 import URLFeatureExtractorV2


class URLPhishingDetector:
    """Detect phishing URLs using trained models.

    Loads a URL feature extractor, an optional scaler and up to three
    classifiers (Logistic Regression, Random Forest, XGBoost) from the
    ``saved_models`` directory, and exposes :meth:`predict_url` to score a
    URL with every loaded model plus :meth:`print_results` to pretty-print
    the outcome for the interactive demo.
    """

    def __init__(self):
        """Initialize detector: whitelist, thresholds, extractor, scaler, models."""
        self.script_dir = Path(__file__).parent.parent
        self.models_dir = (self.script_dir / 'saved_models').resolve()

        # Whitelist of trusted domains. Predictions are short-circuited to
        # LEGITIMATE for these domains and their true subdomains.
        self.trusted_domains = {
            # Tech giants
            'youtube.com', 'facebook.com', 'twitter.com', 'x.com',
            'linkedin.com', 'microsoft.com', 'apple.com', 'amazon.com',
            # Development
            'github.com', 'gitlab.com', 'stackoverflow.com', 'npmjs.com',
            # AI Services
            'claude.ai', 'anthropic.com', 'openai.com', 'chatgpt.com',
            # Education & Info
            'wikipedia.org', 'reddit.com', 'quora.com', 'medium.com',
            # Cloud & Services
            'aws.amazon.com', 'azure.microsoft.com', 'cloud.google.com',
            'vercel.com', 'netlify.com', 'heroku.com',
            # Communication
            'slack.com', 'discord.com', 'zoom.us', 'teams.microsoft.com',
            # Finance (major)
            'paypal.com', 'stripe.com', 'visa.com', 'mastercard.com',
            # E-commerce
            'ebay.com', 'shopify.com', 'etsy.com', 'walmart.com',
        }

        # Per-model decision threshold on the phishing probability.
        # All at the standard 0.5 for now; raise to reduce false positives.
        self.thresholds = {
            'Logistic Regression': 0.5,
            'Random Forest': 0.5,
            'XGBoost': 0.5,
        }

        # Feature extractor applied to every incoming URL.
        self.extractor = URLFeatureExtractorV2()

        # Scaler is only required by Logistic Regression; tree models
        # consume raw feature values.
        scaler_path = self.models_dir / 'scaler.joblib'
        if scaler_path.exists():
            self.scaler = joblib.load(scaler_path)
            logger.info("✓ Loaded scaler")
        else:
            self.scaler = None
            logger.warning("✗ Scaler not found (only needed for Logistic Regression)")

        # model name -> fitted estimator, and the feature ordering it expects
        self.models = {}
        self.feature_names = {}
        self._load_models()

    def _load_models(self):
        """Load all trained models from ``self.models_dir``.

        Missing model files are logged (with their path) and skipped.
        Expected feature names are recorded per model when the estimator
        exposes ``feature_names_in_``, falling back to the scaler's columns
        for models trained on bare numpy arrays.
        """
        model_files = {
            'Logistic Regression': 'logistic_regression.joblib',
            'Random Forest': 'random_forest.joblib',
            'XGBoost': 'xgboost.joblib'
        }

        for name, filename in model_files.items():
            model_path = self.models_dir / filename
            if not model_path.exists():
                # Fix: original message logged a placeholder instead of the path.
                logger.warning(f"✗ Model not found: {model_path}")
                continue

            model = joblib.load(model_path)
            self.models[name] = model

            if hasattr(model, 'feature_names_in_'):
                self.feature_names[name] = list(model.feature_names_in_)
                logger.info(f"✓ Loaded {name} ({len(self.feature_names[name])} features)")
            elif self.scaler is not None and hasattr(self.scaler, 'feature_names_in_'):
                # Use scaler's feature names for models without them
                # (e.g. Logistic Regression trained on a numpy array).
                self.feature_names[name] = list(self.scaler.feature_names_in_)
                logger.info(f"✓ Loaded {name} (using scaler features: {len(self.feature_names[name])} features)")
            else:
                logger.info(f"✓ Loaded {name}")

    @staticmethod
    def _normalize_domain(url: str) -> str:
        """Return the host of *url*, lower-cased, without a leading ``www.``.

        Fix: the original used ``replace('www.', '')``, which also mangles
        hosts that merely *contain* ``www.`` (e.g. ``evilwww.example.com``);
        only a leading prefix is stripped here.
        """
        from urllib.parse import urlparse
        host = urlparse(url).netloc.lower()
        return host[4:] if host.startswith('www.') else host

    def _is_whitelisted(self, domain: str) -> bool:
        """True if *domain* is a trusted domain or a true subdomain of one.

        Fix: the original bare ``endswith(trusted)`` check was bypassable —
        ``notgithub.com`` ends with ``github.com``. Matching requires either
        equality or a dot-boundary suffix (``sub.github.com``).
        """
        return any(
            domain == trusted or domain.endswith('.' + trusted)
            for trusted in self.trusted_domains
        )

    def _align_features(self, model_name: str, features_df):
        """Return a numpy array of features ordered as *model_name* expects.

        Missing columns are filled with 0. Falls back to any other model's
        feature ordering when this model recorded none, then to the raw
        DataFrame. Returning numpy sidesteps sklearn's feature-name
        validation at predict time.
        """
        expected = self.feature_names.get(model_name)
        if expected is None and self.feature_names:
            # Fallback: borrow the ordering from any model that has one.
            expected = next(iter(self.feature_names.values()))
        if expected is None:
            return features_df.values

        aligned = pd.DataFrame(columns=expected)
        for feat in expected:
            if feat in features_df.columns:
                aligned[feat] = features_df[feat].values
            else:
                aligned[feat] = 0  # fill missing features with 0
        return aligned.values

    def predict_url(self, url: str) -> 'tuple[dict, dict]':
        """Predict if URL is phishing or legitimate.

        Args:
            url: URL string to analyze.

        Returns:
            ``(results, features)`` — a dict of per-model prediction dicts
            and the raw extracted feature dict.
            (Fix: the original annotation claimed ``dict`` while actually
            returning this 2-tuple.)
        """
        domain = self._normalize_domain(url)
        is_whitelisted = self._is_whitelisted(domain)

        # Extract features and drop the label column if the extractor adds one.
        features_dict = self.extractor.extract_features(url)
        features_df = pd.DataFrame([features_dict])
        if 'label' in features_df.columns:
            features_df = features_df.drop('label', axis=1)

        results = {}
        for model_name, model in self.models.items():
            # Trusted domains bypass the models entirely.
            if is_whitelisted:
                results[model_name] = {
                    'prediction': 'LEGITIMATE',
                    'prediction_code': 0,
                    'confidence': 99.99,
                    'phishing_probability': 0.01,
                    'legitimate_probability': 99.99,
                    'whitelisted': True
                }
                continue

            features_to_predict = self._align_features(model_name, features_df)

            # Scale features only for Logistic Regression.
            if model_name == 'Logistic Regression' and self.scaler is not None:
                features_to_use = self.scaler.transform(features_to_predict)
            else:
                features_to_use = features_to_predict

            if hasattr(model, 'predict_proba'):
                probabilities = model.predict_proba(features_to_use)[0]
                phishing_prob = probabilities[1] * 100
                legitimate_prob = probabilities[0] * 100

                # Apply the per-model decision threshold.
                threshold = self.thresholds.get(model_name, 0.5)
                prediction = 1 if probabilities[1] > threshold else 0
                confidence = probabilities[prediction] * 100
            else:
                # Fallback for models without probability estimates.
                prediction = model.predict(features_to_use)[0]
                confidence = 100.0
                phishing_prob = 100.0 if prediction == 1 else 0.0
                legitimate_prob = 0.0 if prediction == 1 else 100.0

            results[model_name] = {
                'prediction': 'PHISHING' if prediction == 1 else 'LEGITIMATE',
                'prediction_code': int(prediction),
                'confidence': confidence,
                'phishing_probability': phishing_prob,
                'legitimate_probability': legitimate_prob,
                'whitelisted': False,
                'threshold': self.thresholds.get(model_name, 0.5)
            }

        return results, features_dict

    def print_results(self, url: str, results: dict, features: dict):
        """Print formatted per-model predictions, consensus, and top features."""
        print("\n" + "=" * 80)
        print(f"{Fore.CYAN}{Style.BRIGHT}URL PHISHING DETECTION RESULTS{Style.RESET_ALL}")
        print("=" * 80)

        print(f"\n{Fore.YELLOW}URL:{Style.RESET_ALL} {url}")

        # Per-model predictions
        print(f"\n{Fore.CYAN}{Style.BRIGHT}MODEL PREDICTIONS:{Style.RESET_ALL}")
        print("-" * 80)

        for model_name, result in results.items():
            prediction = result['prediction']
            confidence = result['confidence']
            phishing_prob = result['phishing_probability']
            legitimate_prob = result['legitimate_probability']
            threshold = result.get('threshold', 0.5)

            # Red for phishing, green for legitimate.
            if prediction == 'PHISHING':
                color = Fore.RED
                icon = "⚠️"
            else:
                color = Fore.GREEN
                icon = "✓"

            print(f"\n{Style.BRIGHT}{model_name}:{Style.RESET_ALL}")
            print(f"  {icon} Prediction: {color}{Style.BRIGHT}{prediction}{Style.RESET_ALL}")

            if result.get('whitelisted', False):
                print(f"  {Fore.CYAN}ℹ️  Trusted domain (whitelisted){Style.RESET_ALL}")
            else:
                print(f"  Decision Threshold: {threshold*100:.0f}%")

            print(f"  Confidence: {confidence:.2f}%")
            print("  Probabilities:")
            print(f"    • Phishing:   {Fore.RED}{phishing_prob:6.2f}%{Style.RESET_ALL}")
            print(f"    • Legitimate: {Fore.GREEN}{legitimate_prob:6.2f}%{Style.RESET_ALL}")

        # Consensus across models
        print(f"\n{Fore.CYAN}{Style.BRIGHT}CONSENSUS:{Style.RESET_ALL}")
        print("-" * 80)

        phishing_votes = sum(1 for r in results.values() if r['prediction'] == 'PHISHING')
        total_models = len(results)

        if phishing_votes == total_models:
            consensus_color = Fore.RED
            consensus_icon = "🚨"
            consensus_text = "ALL MODELS AGREE: PHISHING"
        elif phishing_votes == 0:
            consensus_color = Fore.GREEN
            consensus_icon = "✅"
            consensus_text = "ALL MODELS AGREE: LEGITIMATE"
        else:
            consensus_color = Fore.YELLOW
            consensus_icon = "⚠️"
            consensus_text = f"MIXED RESULTS: {phishing_votes}/{total_models} models say PHISHING"

        print(f"{consensus_icon} {consensus_color}{Style.BRIGHT}{consensus_text}{Style.RESET_ALL}")

        # Key features (selection based on model feature-importance analysis)
        print(f"\n{Fore.CYAN}{Style.BRIGHT}TOP FEATURES (Model Importance):{Style.RESET_ALL}")
        print("-" * 80)

        # (name, display value, risk flag) — risk flag of 1 renders red, 0 green,
        # None uncolored.
        top_features = [
            ('Num Domain Parts', features.get('num_domain_parts', 0), None),
            ('Domain Dots', features.get('domain_dots', 0), None),
            ('URL Shortener', '✓ Yes' if features.get('is_shortened', 0) == 1 else '✗ No',
             features.get('is_shortened', 0)),
            ('Num Subdomains', features.get('num_subdomains', 0), None),
            ('Domain Hyphens', features.get('domain_hyphens', 0), None),
            ('Free Platform', '✓ Yes' if features.get('is_free_platform', 0) == 1 else '✗ No',
             features.get('is_free_platform', 0)),
            ('Free Hosting', '✓ Yes' if features.get('is_free_hosting', 0) == 1 else '✗ No',
             features.get('is_free_hosting', 0)),
            ('Platform Subdomain Len', features.get('platform_subdomain_length', 0), None),
            ('Avg Domain Part Len', f"{features.get('avg_domain_part_len', 0):.2f}", None),
            ('Domain Length Category', features.get('domain_length_category', 0), None),
            ('Path Digits', features.get('path_digits', 0), None),
            ('Is HTTP', '✓ Yes' if features.get('is_http', 0) == 1 else '✗ No',
             features.get('is_http', 0)),
            ('Multiple Brands in URL', '✓ Yes' if features.get('multiple_brands_in_url', 0) == 1 else '✗ No',
             features.get('multiple_brands_in_url', 0)),
            ('Brand in Path', '✓ Yes' if features.get('brand_in_path', 0) == 1 else '✗ No',
             features.get('brand_in_path', 0)),
            ('Path Slashes', features.get('path_slashes', 0), None),
            ('Encoding Diff', f"{features.get('encoding_diff', 0):.3f}", None),
            ('Symbol Ratio (Domain)', f"{features.get('symbol_ratio_domain', 0):.3f}", None),
            ('Domain Length', features.get('domain_length', 0), None),
            ('Has @ Symbol', '✓ Yes' if features.get('has_at_symbol', 0) == 1 else '✗ No',
             features.get('has_at_symbol', 0)),
            ('TLD Length', features.get('tld_length', 0), None),
        ]

        for feature_name, value, risk_flag in top_features:
            # Color-code binary risk features; leave numeric values plain.
            if risk_flag is not None:
                if risk_flag == 1:  # risky feature is present
                    value_display = f"{Fore.RED}{value}{Style.RESET_ALL}"
                else:
                    value_display = f"{Fore.GREEN}{value}{Style.RESET_ALL}"
            else:
                value_display = str(value)

            print(f"  • {feature_name:25s}: {value_display}")

        print("\n" + "=" * 80 + "\n")


def main():
    """Run the interactive loop: prompt for URLs, predict, and print results.

    Exits cleanly on 'quit'/'exit'/'q', Ctrl+C, or EOF (Ctrl+D). Per-URL
    analysis errors are reported and logged without ending the session.
    """
    print(f"\n{Fore.CYAN}{Style.BRIGHT}╔══════════════════════════════════════════════════════════════╗")
    print(f"║          URL PHISHING DETECTOR - INTERACTIVE DEMO            ║")
    print(f"╚══════════════════════════════════════════════════════════════╝{Style.RESET_ALL}\n")

    # Initialize detector (loads extractor, scaler and models from disk).
    print(f"{Fore.YELLOW}Loading models...{Style.RESET_ALL}")
    detector = URLPhishingDetector()
    print(f"{Fore.GREEN}✓ All models loaded successfully!{Style.RESET_ALL}\n")

    while True:
        print(f"{Fore.CYAN}{'─' * 80}{Style.RESET_ALL}")
        try:
            url = input(f"{Fore.YELLOW}Enter URL to test (or 'quit' to exit):{Style.RESET_ALL} ").strip()
        except (EOFError, KeyboardInterrupt):
            # Fix: Ctrl+D / Ctrl+C previously crashed with a traceback.
            print(f"\n{Fore.GREEN}Thank you for using URL Phishing Detector!{Style.RESET_ALL}\n")
            break

        if url.lower() in ['quit', 'exit', 'q']:
            print(f"\n{Fore.GREEN}Thank you for using URL Phishing Detector!{Style.RESET_ALL}\n")
            break

        if not url:
            print(f"{Fore.RED}Please enter a valid URL{Style.RESET_ALL}\n")
            continue

        # Default to http:// when no scheme was given so urlparse sees a netloc.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url

        try:
            results, features = detector.predict_url(url)
            detector.print_results(url, results, features)
        except Exception as e:
            # Top-level boundary: report, keep the session alive, and log
            # with traceback (fix: logger.error dropped the traceback).
            print(f"\n{Fore.RED}Error analyzing URL: {str(e)}{Style.RESET_ALL}\n")
            logger.exception(f"Error: {str(e)}")


if __name__ == "__main__":
    main()