"""
Test feature alignment between extractor and models
"""

import sys
from pathlib import Path

import joblib
import pandas as pd

# Make the directory containing this file importable so the `scripts` package resolves.
sys.path.append(str(Path(__file__).parent))

from scripts.feature_extraction.url_features_v2 import URLFeatureExtractorV2


def _resolve_expected_features(model, scaler, feature_names_store):
    """Return ``(expected_features, source)`` for a model.

    Lookup order: the model's own ``feature_names_in_``, then the scaler's
    ``feature_names_in_``, then the first list already recorded in
    ``feature_names_store``. Returns ``(None, None)`` when none is available.
    """
    if hasattr(model, 'feature_names_in_'):
        return list(model.feature_names_in_), "model"
    # hasattr(None, ...) is False, so a missing scaler falls through safely.
    if hasattr(scaler, 'feature_names_in_'):
        return list(scaler.feature_names_in_), "scaler"
    if feature_names_store:
        return next(iter(feature_names_store.values())), "fallback"
    return None, None


def _align_features(features_df, expected_features):
    """Return a frame whose columns are exactly ``expected_features``, in order.

    Columns absent from ``features_df`` are filled with 0; columns not in
    ``expected_features`` are dropped. ``reindex`` keeps the row count of
    ``features_df`` even when every expected column is missing.
    """
    return features_df.reindex(columns=expected_features, fill_value=0)


def test_feature_alignment():
    """Test that feature extraction produces features in the correct order for models.

    Loads the optional scaler and each known model from ``saved_models`` next
    to this file, extracts features for one sample URL, reports missing/extra
    feature names per model, and attempts a prediction on the aligned
    features. Results are printed; prediction errors are reported per model
    rather than raised.
    """
    # Load models
    models_dir = Path(__file__).parent / 'saved_models'
    model_files = {
        'Logistic Regression': 'logistic_regression.joblib',
        'Random Forest': 'random_forest.joblib',
        'XGBoost': 'xgboost.joblib',
    }

    # Load scaler
    # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
    scaler_path = models_dir / 'scaler.joblib'
    scaler = None
    if scaler_path.exists():
        scaler = joblib.load(scaler_path)
        print("✓ Loaded scaler")
        if hasattr(scaler, 'feature_names_in_'):
            print(f" Scaler has {len(scaler.feature_names_in_)} feature names\n")

    # Initialize extractor
    extractor = URLFeatureExtractorV2()

    # Test URL
    test_url = "https://github.com/user/repo"
    print("Testing feature alignment...\n")
    print(f"Test URL: {test_url}\n")

    # Extract features
    features_dict = extractor.extract_features(test_url)
    features_df = pd.DataFrame([features_dict])
    if 'label' in features_df.columns:
        features_df = features_df.drop('label', axis=1)
    print(f"Extracted {len(features_df.columns)} features\n")

    # Store feature names for fallback
    feature_names_store = {}

    # Check each model
    for name, filename in model_files.items():
        model_path = models_dir / filename
        if not model_path.exists():
            print(f"❌ {name}: Model file not found")
            continue

        model = joblib.load(model_path)

        # Determine expected features
        expected_features, source = _resolve_expected_features(
            model, scaler, feature_names_store
        )
        if not expected_features:
            print(f"⚠ {name}: No feature names available")
            print()
            continue

        feature_names_store[name] = expected_features
        print(f"✓ {name}:")
        print(f" Expected features: {len(expected_features)} (from {source})")

        # Check missing / extra features
        extracted_names = set(features_df.columns)
        expected_names = set(expected_features)
        missing = expected_names - extracted_names
        extra = extracted_names - expected_names

        if missing:
            print(f" ⚠ Missing features: {len(missing)}")
            print(f" {list(missing)[:5]}...")
        if extra:
            print(f" ⚠ Extra features: {len(extra)}")
            print(f" {list(extra)[:5]}...")
        if not missing and not extra:
            print(" ✓ Perfect match!")

        # Try prediction with alignment
        features_aligned = _align_features(features_df, expected_features)

        try:
            # Scale for Logistic Regression. Kept inside the try so a
            # scaler/feature mismatch is reported for this model instead of
            # aborting the checks for the remaining models.
            if name == 'Logistic Regression' and scaler is not None:
                features_to_use = scaler.transform(features_aligned)
            else:
                features_to_use = features_aligned

            pred = model.predict(features_to_use)[0]
            proba = model.predict_proba(features_to_use)[0]
            print(f" ✓ Prediction successful: {'PHISHING' if pred == 1 else 'LEGITIMATE'} ({proba[pred]*100:.1f}%)")
        except Exception as e:
            # Diagnostic boundary: report the failure and move on to the next model.
            print(f" ❌ Prediction failed: {e}")

        print()


if __name__ == "__main__":
    test_feature_alignment()