Spaces:

rb1337
/

Phishing-Detection-System

Runtime error

App Files Files Community

Phishing-Detection-System / scripts /testing /test_normalization.py

rb1337

Upload 50 files

2cc7f91 verified 8 days ago

raw

history blame contribute delete

3.49 kB

	"""
	Test URL normalization: verify that www/http variants of a URL produce the same features.
	"""
	import sys
	from pathlib import Path
	import pandas as pd

	sys.path.append(str(Path(__file__).parent))
	from scripts.feature_extraction.url_features_v3 import URLFeatureExtractorOptimized

	def test_normalization():
	    """Check that www/http variants of a URL produce identical key features.

	    For each group of equivalent URLs, every variant is run through
	    ``extractor.extract_features`` and ``extractor.normalize_url``; a
	    human-readable report is printed showing the normalized domain, the
	    per-feature comparison and the ``is_http`` flag of every variant.
	    After all groups, the extractor's feature names are listed.

	    Raises:
	        AssertionError: if any key feature differs between the variants of
	            at least one group. The full report is printed before raising,
	            so the diagnostic output is never lost.
	    """

	    extractor = URLFeatureExtractorOptimized()

	    print("=" * 80)
	    print("URL NORMALIZATION TEST")
	    print("=" * 80)
	    print()

	    # Test cases - should all have IDENTICAL features (except is_http)
	    test_cases = [
	        [
	            "https://github.com/user/repo",
	            "http://github.com/user/repo",
	            "https://www.github.com/user/repo",
	            "http://www.github.com/user/repo",
	            "www.github.com/user/repo",
	            "github.com/user/repo",
	        ],
	        [
	            "https://example.com/login?user=test",
	            "www.example.com/login?user=test",
	            "http://www.example.com/login?user=test",
	        ],
	    ]

	    # Features that must not depend on the scheme or a leading "www.".
	    key_features = [
	        'domain_length', 'domain_dots', 'num_subdomains', 'domain_entropy',
	        'path_length', 'url_entropy', 'is_shortened', 'is_free_platform',
	        'has_suspicious_tld', 'num_phishing_keywords',
	    ]

	    # Indices of the groups whose variants disagreed; checked at the very end
	    # so that every group is still reported in full.
	    failed_cases = []

	    for i, urls in enumerate(test_cases, 1):
	        # The first URL of each group carries a scheme, so element 2 of the
	        # "/"-split is its host.
	        print(f"Test Case {i}: {urls[0].split('/')[2]}")
	        print("-" * 80)

	        features_list = []
	        for url in urls:
	            features_list.append(extractor.extract_features(url))

	            # Show normalization (only the domain and HTTP flag are reported)
	            _, _, norm_domain, is_http = extractor.normalize_url(url)
	            print(f" {url:45s} → {norm_domain:20s} http={is_http}")

	        print("\n Key Features Comparison:")
	        print(" " + "-" * 76)

	        all_identical = True
	        for feat in key_features:
	            values = [f[feat] for f in features_list]

	            if len(set(values)) == 1:
	                status = "✓"
	                shown = values[0]
	            else:
	                status = "✗"
	                all_identical = False
	                # Show every variant's value so the mismatch is visible.
	                shown = values

	            print(f" {status} {feat:30s}: {shown}")

	        # Check is_http (should vary)
	        print("\n HTTP Flag (should vary based on input):")
	        print(" " + "-" * 76)
	        for url, features in zip(urls, features_list):
	            http_flag = features['is_http']
	            print(f" {url:45s} → http={http_flag}")

	        print()
	        if all_identical:
	            print(" ✅ TEST PASSED: All key features identical!")
	        else:
	            print(" ❌ TEST FAILED: Features differ!")
	            failed_cases.append(i)

	        print("\n")

	    print("=" * 80)
	    print("FEATURE COUNT")
	    print("=" * 80)

	    feature_names = extractor.get_feature_names() # pyright: ignore[reportAttributeAccessIssue]
	    print(f"Total features: {len(feature_names)}")
	    print()
	    print("Top 30 features:")
	    for i, name in enumerate(feature_names[:30], 1):
	        print(f" {i:2d}. {name}")

	    # Fail loudly (pytest / non-zero exit) instead of only printing a message.
	    # An explicit raise is used so the check survives ``python -O``.
	    if failed_cases:
	        raise AssertionError(
	            f"Key features differ across URL variants in test case(s): {failed_cases}"
	        )


	# Script entry point: run the normalization report when executed directly.
	if __name__ == "__main__":
	    test_normalization()