File size: 4,518 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""

Test feature alignment between extractor and models

"""
import sys
from pathlib import Path
import joblib
import pandas as pd

sys.path.append(str(Path(__file__).parent))
from scripts.feature_extraction.url_features_v2 import URLFeatureExtractorV2

def test_feature_alignment():
    """Report whether extracted URL features match what each saved model expects.

    Loads the optional scaler and every model listed below from the
    ``saved_models`` directory next to this file, extracts features for one
    sample URL, and for each model prints:

    * how many features it expects and where that list came from
      (the model itself, the scaler, or a previously inspected model),
    * which expected features are missing from the extraction and which
      extracted features the model does not know about,
    * the outcome of a trial prediction on the aligned feature row.

    Nothing is returned and nothing is asserted: the results are written to
    stdout. Missing model files are reported and skipped, and a failing
    prediction is reported rather than raised.
    """
    models_dir = Path(__file__).parent / 'saved_models'

    model_files = {
        'Logistic Regression': 'logistic_regression.joblib',
        'Random Forest': 'random_forest.joblib',
        'XGBoost': 'xgboost.joblib',
    }

    # The scaler is optional; when present it may also carry the feature names.
    scaler_path = models_dir / 'scaler.joblib'
    scaler = None
    if scaler_path.exists():
        scaler = joblib.load(scaler_path)
        print("✓ Loaded scaler")
        if hasattr(scaler, 'feature_names_in_'):
            print(f"  Scaler has {len(scaler.feature_names_in_)} feature names\n")

    extractor = URLFeatureExtractorV2()

    test_url = "https://github.com/user/repo"

    print("Testing feature alignment...\n")
    print(f"Test URL: {test_url}\n")

    # One-row frame of extracted features; a 'label' column is not a feature.
    features_dict = extractor.extract_features(test_url)
    features_df = pd.DataFrame([features_dict])
    if 'label' in features_df.columns:
        features_df = features_df.drop('label', axis=1)

    print(f"Extracted {len(features_df.columns)} features\n")

    # Feature lists resolved so far, used as a last-resort fallback for models
    # that expose no names of their own when the scaler has none either.
    feature_names_store = {}

    for name, filename in model_files.items():
        model_path = models_dir / filename
        if not model_path.exists():
            print(f"❌ {name}: Model file not found")
            continue

        model = joblib.load(model_path)

        # Resolve the expected feature order: model first, then scaler, then
        # whatever an earlier model reported.
        expected_features = None
        source = None

        if hasattr(model, 'feature_names_in_'):
            expected_features = list(model.feature_names_in_)
            source = "model"
        elif hasattr(scaler, 'feature_names_in_'):
            # hasattr(None, ...) is simply False, so a missing scaler is safe here.
            expected_features = list(scaler.feature_names_in_)
            source = "scaler"
        elif feature_names_store:
            expected_features = next(iter(feature_names_store.values()))
            source = "fallback"

        if not expected_features:
            print(f"⚠ {name}: No feature names available")
            print()
            continue

        feature_names_store[name] = expected_features
        print(f"✓ {name}:")
        print(f"  Expected features: {len(expected_features)} (from {source})")

        extracted_columns = set(features_df.columns)
        expected_columns = set(expected_features)
        missing = expected_columns - extracted_columns
        extra = extracted_columns - expected_columns

        if missing:
            print(f"  ⚠ Missing features: {len(missing)}")
            print(f"    {list(missing)[:5]}...")

        if extra:
            print(f"  ⚠ Extra features: {len(extra)}")
            print(f"    {list(extra)[:5]}...")

        if not missing and not extra:
            print("  ✓ Perfect match!")

        # Put the columns in the expected order, drop extras, and fill absent
        # features with 0. reindex is used instead of filling an empty frame
        # column by column: assigning a scalar to a zero-row frame creates an
        # empty column that turns into NaN once a later column adds the row.
        features_aligned = features_df.reindex(
            columns=expected_features, fill_value=0
        )

        # Only the logistic regression model is fed scaled inputs.
        if name == 'Logistic Regression' and scaler is not None:
            features_to_use = scaler.transform(features_aligned)
        else:
            features_to_use = features_aligned

        # Diagnostic script: report any prediction failure and keep checking
        # the remaining models instead of aborting.
        try:
            pred = model.predict(features_to_use)[0]
            proba = model.predict_proba(features_to_use)[0]
            # NOTE(review): proba[pred] assumes the predicted label is also the
            # column index of its probability (labels 0/1) - confirm.
            print(f"  ✓ Prediction successful: {'PHISHING' if pred == 1 else 'LEGITIMATE'} ({proba[pred]*100:.1f}%)")
        except Exception as e:
            print(f"  ❌ Prediction failed: {e}")

        print()

# Run the alignment check when this file is executed directly as a script.
if __name__ == "__main__":
    test_feature_alignment()