Spaces:
Runtime error
Runtime error
File size: 3,491 Bytes
"""
Test URL normalization: verify that www/http variants of the same URL produce the same features.
"""
import sys
from pathlib import Path
import pandas as pd
sys.path.append(str(Path(__file__).parent))
from scripts.feature_extraction.url_features_v3 import URLFeatureExtractorOptimized
def test_normalization():
    """Check that scheme/``www`` variants of a URL yield identical key features.

    For each group of equivalent URLs the function extracts features with
    ``URLFeatureExtractorOptimized``, prints the normalized domain and HTTP
    flag returned by ``normalize_url``, and compares a fixed list of key
    features across the group. After all groups are reported, the
    extractor's feature count and its first 30 feature names are printed.

    Raises:
        AssertionError: if any key feature differs within a group. Raised
            only after the full report has been printed, so the diagnostic
            output is never cut short.
    """
    extractor = URLFeatureExtractorOptimized()
    print("=" * 80)
    print("URL NORMALIZATION TEST")
    print("=" * 80)
    print()

    # Test cases - should all have IDENTICAL features (except is_http)
    test_cases = [
        [
            "https://github.com/user/repo",
            "http://github.com/user/repo",
            "https://www.github.com/user/repo",
            "http://www.github.com/user/repo",
            "www.github.com/user/repo",
            "github.com/user/repo",
        ],
        [
            "https://example.com/login?user=test",
            "www.example.com/login?user=test",
            "http://www.example.com/login?user=test",
        ],
    ]

    # Features compared across variants (is_http is reported separately).
    key_features = [
        'domain_length', 'domain_dots', 'num_subdomains', 'domain_entropy',
        'path_length', 'url_entropy', 'is_shortened', 'is_free_platform',
        'has_suspicious_tld', 'num_phishing_keywords',
    ]

    # Indices of the groups whose key features were not all identical.
    failed_cases = []

    for i, urls in enumerate(test_cases, 1):
        # The first URL of every group carries a scheme, so index 2 of the
        # '/'-split is its host part.
        print(f"Test Case {i}: {urls[0].split('/')[2]}")
        print("-" * 80)

        features_list = []
        for url in urls:
            features_list.append(extractor.extract_features(url))
            # Show normalization
            _, _, norm_domain, is_http = extractor.normalize_url(url)
            print(f" {url:45s} → {norm_domain:20s} http={is_http}")

        print("\n Key Features Comparison:")
        print(" " + "-" * 76)

        all_identical = True
        for feat in key_features:
            values = [f[feat] for f in features_list]
            if len(set(values)) == 1:
                print(f" ✓ {feat:30s}: {values[0]}")
            else:
                all_identical = False
                # Show every variant's value so the mismatch is visible.
                print(f" ✗ {feat:30s}: {values}")

        # Check is_http (should vary)
        print("\n HTTP Flag (should vary based on input):")
        print(" " + "-" * 76)
        for url, features in zip(urls, features_list):
            http_flag = features['is_http']
            print(f" {url:45s} → http={http_flag}")
        print()

        if all_identical:
            print(" ✅ TEST PASSED: All key features identical!")
        else:
            print(" ❌ TEST FAILED: Features differ!")
            failed_cases.append(i)
        print("\n")

    print("=" * 80)
    print("FEATURE COUNT")
    print("=" * 80)
    feature_names = extractor.get_feature_names()  # pyright: ignore[reportAttributeAccessIssue]
    print(f"Total features: {len(feature_names)}")
    print()
    print("Top 30 features:")
    for i, name in enumerate(feature_names[:30], 1):
        print(f" {i:2d}. {name}")

    # Printing alone lets a test runner report success on a failing check;
    # raise explicitly (not a bare assert, which -O would strip).
    if failed_cases:
        raise AssertionError(
            f"Key features differ across URL variants in test case(s): {failed_cases}"
        )
# Script entry point: run the normalization check when executed directly.
if __name__ == "__main__":
    test_normalization()
|