# Phishing-Detection-System / scripts / testing / test_feature_alignment.py
# rb1337's picture
# Upload 50 files
# 2cc7f91 verified
"""
Test feature alignment between extractor and models
"""
import sys
from pathlib import Path
import joblib
import pandas as pd
sys.path.append(str(Path(__file__).parent))
from scripts.feature_extraction.url_features_v2 import URLFeatureExtractorV2
def _align_features(features_df, expected_features):
    """Return *features_df* restricted to *expected_features*, in that order.

    Columns missing from *features_df* are created and filled with 0; columns
    not named in *expected_features* are dropped.
    """
    # reindex only fills the columns it creates. Building the frame column by
    # column from an empty DataFrame is fragile: a scalar 0 assigned before any
    # real column exists has no rows, and turns into NaN once the first array
    # assignment creates the index.
    return features_df.reindex(columns=expected_features, fill_value=0)


def _report_feature_diff(expected_features, actual_columns):
    """Print which expected features are missing and which extracted ones are extra."""
    missing = set(expected_features) - set(actual_columns)
    extra = set(actual_columns) - set(expected_features)

    if missing:
        print(f" ⚠ Missing features: {len(missing)}")
        # sorted() keeps the preview stable between runs (set order is arbitrary)
        print(f" {sorted(missing)[:5]}...")
    if extra:
        print(f" ⚠ Extra features: {len(extra)}")
        print(f" {sorted(extra)[:5]}...")
    if not missing and not extra:
        print(" ✓ Perfect match!")


def test_feature_alignment():
    """Check that extracted URL features line up with what each saved model expects.

    Loads the optional scaler and the three classifiers from ``saved_models``,
    extracts features for one sample URL, reports missing/extra feature names
    per model and then attempts a prediction on the aligned feature frame.

    Everything is reported via ``print``; the function returns ``None`` and
    makes no assertions, so it never fails on a mismatch by itself.
    """
    # NOTE(review): resolved relative to this file's own directory — confirm
    # that saved_models/ actually lives next to this script.
    models_dir = Path(__file__).parent / 'saved_models'
    model_files = {
        'Logistic Regression': 'logistic_regression.joblib',
        'Random Forest': 'random_forest.joblib',
        'XGBoost': 'xgboost.joblib',
    }

    # The scaler is optional: it is applied to Logistic Regression inputs and
    # doubles as a source of feature names for models that do not carry them.
    scaler_path = models_dir / 'scaler.joblib'
    scaler = None
    if scaler_path.exists():
        scaler = joblib.load(scaler_path)
        print("✓ Loaded scaler")
        if hasattr(scaler, 'feature_names_in_'):
            print(f" Scaler has {len(scaler.feature_names_in_)} feature names\n")

    extractor = URLFeatureExtractorV2()
    test_url = "https://github.com/user/repo"
    print("Testing feature alignment...\n")
    print(f"Test URL: {test_url}\n")

    # One-row frame of extracted features; the target column is not a feature.
    features_dict = extractor.extract_features(test_url)
    features_df = pd.DataFrame([features_dict])
    if 'label' in features_df.columns:
        features_df = features_df.drop('label', axis=1)
    print(f"Extracted {len(features_df.columns)} features\n")

    # Feature names resolved so far, reused when a later model has none.
    feature_names_store = {}

    for name, filename in model_files.items():
        model_path = models_dir / filename
        if not model_path.exists():
            print(f"❌ {name}: Model file not found")
            continue
        model = joblib.load(model_path)

        # Preference order: the model itself, then the scaler, then whatever
        # an earlier model in this run resolved.
        expected_features = None
        source = None
        if hasattr(model, 'feature_names_in_'):
            expected_features = list(model.feature_names_in_)
            source = "model"
        elif hasattr(scaler, 'feature_names_in_'):
            expected_features = list(scaler.feature_names_in_)
            source = "scaler"
        elif feature_names_store:
            expected_features = next(iter(feature_names_store.values()))
            source = "fallback"

        if not expected_features:
            print(f"⚠ {name}: No feature names available")
            print()
            continue

        feature_names_store[name] = expected_features
        print(f"✓ {name}:")
        print(f" Expected features: {len(expected_features)} (from {source})")
        _report_feature_diff(expected_features, features_df.columns)

        features_aligned = _align_features(features_df, expected_features)

        # Only Logistic Regression is fed scaled inputs.
        if name == 'Logistic Regression' and scaler is not None:
            features_to_use = scaler.transform(features_aligned)
        else:
            features_to_use = features_aligned

        # Broad catch is deliberate: this is a diagnostic run and one failing
        # model should not stop the remaining models from being checked.
        try:
            pred = model.predict(features_to_use)[0]
            proba = model.predict_proba(features_to_use)[0]
            # predict_proba columns follow model.classes_, so find the
            # predicted label there instead of assuming labels are 0..k-1.
            classes = getattr(model, 'classes_', None)
            pred_idx = list(classes).index(pred) if classes is not None else pred
            verdict = 'PHISHING' if pred == 1 else 'LEGITIMATE'
            print(f" ✓ Prediction successful: {verdict} ({proba[pred_idx]*100:.1f}%)")
        except Exception as e:
            print(f" ❌ Prediction failed: {e}")
        print()
# Run the alignment check only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    test_feature_alignment()