Spaces:

rb1337
/

Phishing-Detection-System

Runtime error

App Files Files Community

Phishing-Detection-System / scripts / testing / test_feature_alignment.py

rb1337

Upload 50 files

2cc7f91 verified 8 days ago

raw

history blame contribute delete

4.52 kB

	"""
	Test feature alignment between extractor and models
	"""
	import sys
	from pathlib import Path
	import joblib
	import pandas as pd

	sys.path.append(str(Path(__file__).parent))
	from scripts.feature_extraction.url_features_v2 import URLFeatureExtractorV2

	def _resolve_expected_features(model, scaler, feature_names_store):
	    """Return ``(feature_names, source)`` describing what *model* expects.

	    Lookup order: the model's own ``feature_names_in_``, then the scaler's
	    ``feature_names_in_``, then the first list already recorded in
	    *feature_names_store* for a previously checked model. Returns
	    ``(None, None)`` when none of these is available.
	    """
	    if hasattr(model, 'feature_names_in_'):
	        return list(model.feature_names_in_), "model"
	    # hasattr(None, ...) is False, so a missing scaler falls through safely.
	    if hasattr(scaler, 'feature_names_in_'):
	        return list(scaler.feature_names_in_), "scaler"
	    if feature_names_store:
	        return next(iter(feature_names_store.values())), "fallback"
	    return None, None


	def _align_features(features_df, expected_features):
	    """Return *features_df* with exactly *expected_features* as columns, in order.

	    Columns the extractor did not produce are filled with 0; columns that
	    are not expected are dropped.
	    """
	    # A single reindex keeps the extracted row intact. Filling an empty
	    # frame column by column instead would turn scalar-filled columns that
	    # precede the first real column into NaN once the index is created.
	    return features_df.reindex(columns=expected_features, fill_value=0)


	def test_feature_alignment():
	    """Test that feature extraction produces features in the correct order for models.

	    Loads the optional scaler and each saved model from the ``saved_models``
	    directory next to this file, extracts features for one sample URL,
	    prints which expected features are missing or extra, and attempts a
	    prediction on the aligned feature row. Results are reported with
	    ``print``; nothing is returned.
	    """
	    # Load models
	    models_dir = Path(__file__).parent / 'saved_models'

	    model_files = {
	        'Logistic Regression': 'logistic_regression.joblib',
	        'Random Forest': 'random_forest.joblib',
	        'XGBoost': 'xgboost.joblib',
	    }

	    # Load scaler (optional: the check still runs without it)
	    scaler_path = models_dir / 'scaler.joblib'
	    scaler = None
	    if scaler_path.exists():
	        scaler = joblib.load(scaler_path)
	        print("✓ Loaded scaler")
	        if hasattr(scaler, 'feature_names_in_'):
	            print(f" Scaler has {len(scaler.feature_names_in_)} feature names\n")

	    # Initialize extractor
	    extractor = URLFeatureExtractorV2()

	    # Test URL
	    test_url = "https://github.com/user/repo"

	    print("Testing feature alignment...\n")
	    print(f"Test URL: {test_url}\n")

	    # Extract features; a 'label' column, if present, is not a model input.
	    features_dict = extractor.extract_features(test_url)
	    features_df = pd.DataFrame([features_dict])
	    if 'label' in features_df.columns:
	        features_df = features_df.drop('label', axis=1)

	    print(f"Extracted {len(features_df.columns)} features\n")

	    # Feature names seen so far, used as a fallback for models without names
	    feature_names_store = {}

	    # Check each model
	    for name, filename in model_files.items():
	        model_path = models_dir / filename
	        if not model_path.exists():
	            print(f"❌ {name}: Model file not found")
	            continue

	        model = joblib.load(model_path)

	        expected_features, source = _resolve_expected_features(
	            model, scaler, feature_names_store
	        )

	        if expected_features:
	            feature_names_store[name] = expected_features
	            print(f"✓ {name}:")
	            print(f" Expected features: {len(expected_features)} (from {source})")

	            # Compare the extractor output against what the model expects
	            extracted = set(features_df.columns)
	            missing = set(expected_features) - extracted
	            extra = extracted - set(expected_features)

	            if missing:
	                print(f" ⚠ Missing features: {len(missing)}")
	                # sorted() keeps the sample stable between runs
	                print(f" {sorted(missing)[:5]}...")

	            if extra:
	                print(f" ⚠ Extra features: {len(extra)}")
	                print(f" {sorted(extra)[:5]}...")

	            if not missing and not extra:
	                print(" ✓ Perfect match!")

	            # Try prediction with alignment
	            features_aligned = _align_features(features_df, expected_features)

	            # Scale for Logistic Regression only; the other models take raw features
	            if name == 'Logistic Regression' and scaler is not None:
	                features_to_use = scaler.transform(features_aligned)
	            else:
	                features_to_use = features_aligned

	            # Broad catch is deliberate: this is a diagnostic report and one
	            # failing model must not stop the remaining checks.
	            try:
	                pred = model.predict(features_to_use)[0]
	                proba = model.predict_proba(features_to_use)[0]
	                print(f" ✓ Prediction successful: {'PHISHING' if pred == 1 else 'LEGITIMATE'} ({proba[pred]*100:.1f}%)")
	            except Exception as e:
	                print(f" ❌ Prediction failed: {e}")
	        else:
	            print(f"⚠ {name}: No feature names available")

	        print()

	# Run the alignment report when this file is executed directly as a script.
	if __name__ == "__main__":
	    test_feature_alignment()