Spaces:
Runtime error
Runtime error
| """ | |
| Test URL normalization - verify www/http variants produce same features | |
| """ | |
| import sys | |
| from pathlib import Path | |
| import pandas as pd | |
| sys.path.append(str(Path(__file__).parent)) | |
| from scripts.feature_extraction.url_features_v3 import URLFeatureExtractorOptimized | |
def test_normalization():
    """Check that www/http variants of a URL produce identical key features.

    For every group of equivalent URLs this extracts features with
    ``URLFeatureExtractorOptimized.extract_features``, prints the result of
    ``normalize_url`` for each variant, and compares a fixed set of key
    features across the group. The ``is_http`` flag is printed separately
    because it is expected to vary with the input. Afterwards the total
    feature count and the first 30 feature names are printed.

    Raises:
        AssertionError: If any key feature differs within a group. The error
            is raised only after the complete report has been printed, so the
            diagnostic output is never cut short.
    """
    extractor = URLFeatureExtractorOptimized()

    print("=" * 80)
    print("URL NORMALIZATION TEST")
    print("=" * 80)
    print()

    # Test cases - should all have IDENTICAL features (except is_http)
    test_cases = [
        [
            "https://github.com/user/repo",
            "http://github.com/user/repo",
            "https://www.github.com/user/repo",
            "http://www.github.com/user/repo",
            "www.github.com/user/repo",
            "github.com/user/repo",
        ],
        [
            "https://example.com/login?user=test",
            "www.example.com/login?user=test",
            "http://www.example.com/login?user=test",
        ],
    ]

    # Key features that should be identical across variants (is_http excluded).
    # Loop-invariant, so defined once instead of once per test case.
    key_features = (
        'domain_length', 'domain_dots', 'num_subdomains', 'domain_entropy',
        'path_length', 'url_entropy', 'is_shortened', 'is_free_platform',
        'has_suspicious_tld', 'num_phishing_keywords',
    )

    # (case number, [names of differing features]) for every failing group.
    failures = []

    for i, urls in enumerate(test_cases, 1):
        # The first URL of each group carries a scheme, so index 2 is the host.
        print(f"Test Case {i}: {urls[0].split('/')[2]}")
        print("-" * 80)

        features_list = []
        for url in urls:
            features = extractor.extract_features(url)
            features_list.append(features)

            # Show normalization
            norm_url, orig, norm_domain, is_http = extractor.normalize_url(url)
            print(f" {url:45s} → {norm_domain:20s} http={is_http}")

        print("\n Key Features Comparison:")
        print(" " + "-" * 76)

        # Check if all features are identical (except www/http)
        mismatched = []
        for feat in key_features:
            values = [f[feat] for f in features_list]
            if len(set(values)) == 1:
                print(f" ✓ {feat:30s}: {values[0]}")
            else:
                mismatched.append(feat)
                # Show every variant's value so the divergence is visible.
                print(f" ✗ {feat:30s}: {values}")

        # Check is_http (should vary)
        print("\n HTTP Flag (should vary based on input):")
        print(" " + "-" * 76)
        for url, features in zip(urls, features_list):
            http_flag = features['is_http']
            print(f" {url:45s} → http={http_flag}")

        print()
        if mismatched:
            failures.append((i, mismatched))
            print(" ❌ TEST FAILED: Features differ!")
        else:
            print(" ✅ TEST PASSED: All key features identical!")
        print("\n")

    print("=" * 80)
    print("FEATURE COUNT")
    print("=" * 80)
    feature_names = extractor.get_feature_names()  # pyright: ignore[reportAttributeAccessIssue]
    print(f"Total features: {len(feature_names)}")
    print()
    print("Top 30 features:")
    for i, name in enumerate(feature_names[:30], 1):
        print(f" {i:2d}. {name}")

    # Fail for real (pytest or script) instead of only printing a verdict.
    # An explicit raise is used so the check also holds under ``python -O``.
    if failures:
        raise AssertionError(
            f"Key features differ across URL variants (case, features): {failures}"
        )
# Run the normalization check only when this file is executed directly as a
# script, not when it is imported.
if __name__ == "__main__":
    test_normalization()