File size: 3,491 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""

Test URL normalization - verify www/http variants produce same features

"""
import sys
from pathlib import Path
import pandas as pd

sys.path.append(str(Path(__file__).parent))
from scripts.feature_extraction.url_features_v3 import URLFeatureExtractorOptimized

def test_normalization():
    """Verify that www/http variants of a URL produce identical features.

    For each group of equivalent URLs, extracts features with
    ``URLFeatureExtractorOptimized`` and checks that the key domain/path
    features agree across all variants. The ``is_http`` flag is expected
    to vary with the input scheme and is reported separately.
    Results are printed to stdout; nothing is returned.
    """
    extractor = URLFeatureExtractorOptimized()

    print("=" * 80)
    print("URL NORMALIZATION TEST")
    print("=" * 80)
    print()

    # Each inner list holds URL variants that should normalize to the same
    # domain and therefore yield IDENTICAL features (except is_http).
    test_cases = [
        [
            "https://github.com/user/repo",
            "http://github.com/user/repo",
            "https://www.github.com/user/repo",
            "http://www.github.com/user/repo",
            "www.github.com/user/repo",
            "github.com/user/repo"
        ],
        [
            "https://example.com/login?user=test",
            "www.example.com/login?user=test",
            "http://www.example.com/login?user=test"
        ]
    ]

    for i, urls in enumerate(test_cases, 1):
        print(f"Test Case {i}: {urls[0].split('/')[2]}")
        print("-" * 80)

        features_list = []
        for url in urls:
            features = extractor.extract_features(url)
            features_list.append(features)

            # Show how each variant was normalized (only the normalized
            # domain and http flag are displayed).
            _norm_url, _orig, norm_domain, is_http = extractor.normalize_url(url)
            print(f"  {url:45s}{norm_domain:20s} http={is_http}")

        # Key features that must be identical across variants.
        key_features = [
            'domain_length', 'domain_dots', 'num_subdomains', 'domain_entropy',
            'path_length', 'url_entropy', 'is_shortened', 'is_free_platform',
            'has_suspicious_tld', 'num_phishing_keywords'
        ]

        print("\n  Key Features Comparison:")
        print("  " + "-" * 76)

        all_identical = True
        for feat in key_features:
            values = [f[feat] for f in features_list]
            if len(set(values)) == 1:
                # All variants agree: show the single shared value.
                print(f"  ✓ {feat:30s}: {values[0]}")
            else:
                # Mismatch: show every variant's value so the divergence
                # is visible (previously only values[0] was printed).
                all_identical = False
                print(f"  ✗ {feat:30s}: {values}")

        # is_http should reflect the original scheme, so it MAY vary here.
        print("\n  HTTP Flag (should vary based on input):")
        print("  " + "-" * 76)
        for j, url in enumerate(urls):
            http_flag = features_list[j]['is_http']
            print(f"  {url:45s} → http={http_flag}")

        print()
        if all_identical:
            print("  ✅ TEST PASSED: All key features identical!")
        else:
            print("  ❌ TEST FAILED: Features differ!")

        print("\n")

    print("=" * 80)
    print("FEATURE COUNT")
    print("=" * 80)

    feature_names = extractor.get_feature_names()  # pyright: ignore[reportAttributeAccessIssue]
    print(f"Total features: {len(feature_names)}")
    print()
    print("Top 30 features:")
    for rank, name in enumerate(feature_names[:30], 1):
        print(f"  {rank:2d}. {name}")


# Run the normalization check when executed as a script.
if __name__ == "__main__":
    test_normalization()