File size: 3,491 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""

Test URL normalization - verify www/http variants produce same features

"""
import sys
from pathlib import Path
import pandas as pd

sys.path.append(str(Path(__file__).parent))
from scripts.feature_extraction.url_features_v3 import URLFeatureExtractorOptimized

def test_normalization():
    """Verify that www/http variants of a URL produce identical features.

    For each group of equivalent URLs, extracts features with
    ``URLFeatureExtractorOptimized`` and checks that the key domain/path
    features agree across all variants. The ``is_http`` flag is expected
    to vary with the input scheme and is reported separately.
    Results are printed to stdout; nothing is returned.
    """
    extractor = URLFeatureExtractorOptimized()

    print("=" * 80)
    print("URL NORMALIZATION TEST")
    print("=" * 80)
    print()

    # Each inner list holds URL variants that should normalize to the same
    # domain and therefore yield IDENTICAL features (except is_http).
    test_cases = [
        [
            "https://github.com/user/repo",
            "http://github.com/user/repo",
            "https://www.github.com/user/repo",
            "http://www.github.com/user/repo",
            "www.github.com/user/repo",
            "github.com/user/repo"
        ],
        [
            "https://example.com/login?user=test",
            "www.example.com/login?user=test",
            "http://www.example.com/login?user=test"
        ]
    ]

    for i, urls in enumerate(test_cases, 1):
        print(f"Test Case {i}: {urls[0].split('/')[2]}")
        print("-" * 80)

        features_list = []
        for url in urls:
            features = extractor.extract_features(url)
            features_list.append(features)

            # Show how each variant was normalized (only the normalized
            # domain and http flag are displayed).
            _norm_url, _orig, norm_domain, is_http = extractor.normalize_url(url)
            print(f"  {url:45s}{norm_domain:20s} http={is_http}")

        # Key features that must be identical across variants.
        key_features = [
            'domain_length', 'domain_dots', 'num_subdomains', 'domain_entropy',
            'path_length', 'url_entropy', 'is_shortened', 'is_free_platform',
            'has_suspicious_tld', 'num_phishing_keywords'
        ]

        print("\n  Key Features Comparison:")
        print("  " + "-" * 76)

        all_identical = True
        for feat in key_features:
            values = [f[feat] for f in features_list]
            if len(set(values)) == 1:
                # All variants agree: show the single shared value.
                print(f"  ✓ {feat:30s}: {values[0]}")
            else:
                # Mismatch: show every variant's value so the divergence
                # is visible (previously only values[0] was printed).
                all_identical = False
                print(f"  ✗ {feat:30s}: {values}")

        # is_http should reflect the original scheme, so it MAY vary here.
        print("\n  HTTP Flag (should vary based on input):")
        print("  " + "-" * 76)
        for j, url in enumerate(urls):
            http_flag = features_list[j]['is_http']
            print(f"  {url:45s} → http={http_flag}")

        print()
        if all_identical:
            print("  ✅ TEST PASSED: All key features identical!")
        else:
            print("  ❌ TEST FAILED: Features differ!")

        print("\n")

    print("=" * 80)
    print("FEATURE COUNT")
    print("=" * 80)

    feature_names = extractor.get_feature_names()  # pyright: ignore[reportAttributeAccessIssue]
    print(f"Total features: {len(feature_names)}")
    print()
    print("Top 30 features:")
    for rank, name in enumerate(feature_names[:30], 1):
        print(f"  {rank:2d}. {name}")


# Run the normalization check when executed as a script.
if __name__ == "__main__":
    test_normalization()