"""
Test URL normalization - verify www/http variants produce same features
"""
import sys
from pathlib import Path

import pandas as pd

# Make the directory containing this file importable so `scripts.*` resolves.
sys.path.append(str(Path(__file__).parent))

from scripts.feature_extraction.url_features_v3 import URLFeatureExtractorOptimized


def test_normalization():
    """Test that www/http variants produce identical features.

    For each group of URL spellings (with/without scheme, with/without
    ``www.``), extract features for every spelling and compare a fixed set of
    key features across the group. A report is printed for every group, and
    the extractor's feature names are listed afterwards.

    Raises:
        AssertionError: if any key feature differs within a group. The check
            runs only after the full report has been printed, so the output
            is complete even on failure.
    """
    extractor = URLFeatureExtractorOptimized()

    print("=" * 80)
    print("URL NORMALIZATION TEST")
    print("=" * 80)
    print()

    # Test cases - should all have IDENTICAL features (except is_http)
    test_cases = [
        [
            "https://github.com/user/repo",
            "http://github.com/user/repo",
            "https://www.github.com/user/repo",
            "http://www.github.com/user/repo",
            "www.github.com/user/repo",
            "github.com/user/repo",
        ],
        [
            "https://example.com/login?user=test",
            "www.example.com/login?user=test",
            "http://www.example.com/login?user=test",
        ],
    ]

    # Compare key features (should be identical except is_http)
    key_features = [
        'domain_length',
        'domain_dots',
        'num_subdomains',
        'domain_entropy',
        'path_length',
        'url_entropy',
        'is_shortened',
        'is_free_platform',
        'has_suspicious_tld',
        'num_phishing_keywords',
    ]

    # Indices (1-based) of the test cases whose key features did not match.
    failed_cases = []

    for i, urls in enumerate(test_cases, 1):
        # The first URL of every case carries a scheme, so the host is the
        # third '/'-separated component.
        print(f"Test Case {i}: {urls[0].split('/')[2]}")
        print("-" * 80)

        features_list = []
        for url in urls:
            features = extractor.extract_features(url)
            features_list.append(features)

            # Show normalization
            _norm_url, _orig, norm_domain, is_http = extractor.normalize_url(url)
            print(f" {url:45s} → {norm_domain:20s} http={is_http}")

        print("\n Key Features Comparison:")
        print(" " + "-" * 76)

        # Check if all features are identical (except www/http)
        all_identical = True
        for feat in key_features:
            values = [f[feat] for f in features_list]
            if len(set(values)) == 1:
                status = "✓"
                shown = values[0]
            else:
                status = "✗"
                all_identical = False
                # Show every value so the mismatch is visible in the report.
                shown = values
            print(f" {status} {feat:30s}: {shown}")

        # Check is_http (should vary)
        print("\n HTTP Flag (should vary based on input):")
        print(" " + "-" * 76)
        for url, features in zip(urls, features_list):
            http_flag = features['is_http']
            print(f" {url:45s} → http={http_flag}")

        print()
        if all_identical:
            print(" ✅ TEST PASSED: All key features identical!")
        else:
            print(" ❌ TEST FAILED: Features differ!")
            failed_cases.append(i)
        print("\n")

    print("=" * 80)
    print("FEATURE COUNT")
    print("=" * 80)
    feature_names = extractor.get_feature_names()  # pyright: ignore[reportAttributeAccessIssue]
    print(f"Total features: {len(feature_names)}")
    print()
    print("Top 30 features:")
    for i, name in enumerate(feature_names[:30], 1):
        print(f" {i:2d}. {name}")

    # Fail for real (pytest failure / non-zero exit) instead of only printing.
    assert not failed_cases, (
        f"Key features differ between URL variants in test case(s): {failed_cases}"
    )


if __name__ == "__main__":
    test_normalization()