Spaces:
Runtime error
Runtime error
| """ | |
| Test feature alignment between extractor and models | |
| """ | |
| import sys | |
| from pathlib import Path | |
| import joblib | |
| import pandas as pd | |
| sys.path.append(str(Path(__file__).parent)) | |
| from scripts.feature_extraction.url_features_v2 import URLFeatureExtractorV2 | |
def test_feature_alignment() -> None:
    """Check that extracted URL features line up with what each saved model expects.

    Loads the optional scaler and the classifiers listed below from the
    ``saved_models`` directory next to this file, extracts features for a
    single sample URL, and for every model file that exists prints:

    * how many feature names are expected and where they came from (the
      model itself, the scaler, or a previously inspected model as fallback);
    * which expected features are missing from the extraction and which
      extracted features are unexpected (compared as sets, so column order
      itself is not verified);
    * the outcome of a trial prediction on the extracted features reindexed
      to the expected columns, with absent features filled with 0.

    Everything is reported on stdout; nothing is returned and a mismatch does
    not raise.
    """
    # Load models
    models_dir = Path(__file__).parent / 'saved_models'
    model_files = {
        'Logistic Regression': 'logistic_regression.joblib',
        'Random Forest': 'random_forest.joblib',
        'XGBoost': 'xgboost.joblib',
    }

    # Load scaler (optional; it is only applied to Logistic Regression below)
    scaler_path = models_dir / 'scaler.joblib'
    scaler = None
    if scaler_path.exists():
        scaler = joblib.load(scaler_path)
        print("✓ Loaded scaler")
        if hasattr(scaler, 'feature_names_in_'):
            print(f" Scaler has {len(scaler.feature_names_in_)} feature names\n")

    # Initialize extractor
    extractor = URLFeatureExtractorV2()

    # Test URL
    test_url = "https://github.com/user/repo"
    print("Testing feature alignment...\n")
    print(f"Test URL: {test_url}\n")

    # Extract features into a single-row frame; 'label' is not a model input
    features_dict = extractor.extract_features(test_url)
    features_df = pd.DataFrame([features_dict])
    if 'label' in features_df.columns:
        features_df = features_df.drop('label', axis=1)
    print(f"Extracted {len(features_df.columns)} features\n")

    # Feature names seen so far, keyed by model name, used as a fallback for
    # models that carry no names of their own
    feature_names_store = {}

    # Check each model
    for name, filename in model_files.items():
        model_path = models_dir / filename
        if not model_path.exists():
            print(f"❌ {name}: Model file not found")
            continue

        model = joblib.load(model_path)

        # Determine expected features: model first, then scaler, then the
        # first list recorded for an earlier model
        expected_features = None
        source = None
        if hasattr(model, 'feature_names_in_'):
            expected_features = list(model.feature_names_in_)
            source = "model"
        elif hasattr(scaler, 'feature_names_in_'):
            expected_features = list(scaler.feature_names_in_)
            source = "scaler"
        elif feature_names_store:
            expected_features = next(iter(feature_names_store.values()))
            source = "fallback"

        if expected_features:
            feature_names_store[name] = expected_features
            print(f"✓ {name}:")
            print(f" Expected features: {len(expected_features)} (from {source})")

            # Compare the two name sets in both directions
            extracted_names = set(features_df.columns)
            expected_names = set(expected_features)
            missing = expected_names - extracted_names
            extra = extracted_names - expected_names
            if missing:
                print(f" ⚠ Missing features: {len(missing)}")
                # Sorted so the preview is stable from run to run
                print(f" {sorted(missing, key=str)[:5]}...")
            if extra:
                print(f" ⚠ Extra features: {len(extra)}")
                print(f" {sorted(extra, key=str)[:5]}...")
            if not missing and not extra:
                print(" ✓ Perfect match!")

            # Try prediction with alignment. reindex orders the columns as
            # expected, drops extras and fills absent features with 0 in one
            # step; filling column by column on an empty frame would turn
            # early scalar fills into NaN once the first real column arrives.
            features_aligned = features_df.reindex(
                columns=expected_features, fill_value=0
            )

            # Broad catch is deliberate: this is a diagnostic run, so any
            # failure (including in scaling) is reported for this model and
            # the loop moves on to the next one.
            try:
                # Scale for Logistic Regression
                if name == 'Logistic Regression' and scaler is not None:
                    features_to_use = scaler.transform(features_aligned)
                else:
                    features_to_use = features_aligned

                pred = model.predict(features_to_use)[0]
                proba = model.predict_proba(features_to_use)[0]
                # NOTE(review): proba[pred] assumes the class labels are 0/1
                # so the label doubles as the probability column index.
                print(f" ✓ Prediction successful: {'PHISHING' if pred == 1 else 'LEGITIMATE'} ({proba[pred]*100:.1f}%)")
            except Exception as e:
                print(f" ❌ Prediction failed: {e}")
        else:
            print(f"⚠ {name}: No feature names available")

        print()
# Run the alignment check only when this file is executed directly as a script.
if __name__ == "__main__":
    test_feature_alignment()