Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- feature_detector.py +156 -70
- model_wrapper.py +14 -7
feature_detector.py
CHANGED
|
@@ -15,9 +15,6 @@ def get_model_feature_names(preprocessor):
|
|
| 15 |
def create_complete_feature_set(transaction_data, expected_features):
|
| 16 |
"""Create a complete feature set matching exactly what the model expects"""
|
| 17 |
|
| 18 |
-
# Start with provided data
|
| 19 |
-
complete_data = transaction_data.copy()
|
| 20 |
-
|
| 21 |
# Define comprehensive defaults
|
| 22 |
feature_defaults = {
|
| 23 |
# Transaction basics
|
|
@@ -56,54 +53,51 @@ def create_complete_feature_set(transaction_data, expected_features):
|
|
| 56 |
**{f'id_{i:02d}': 0.0 for i in range(1, 39)},
|
| 57 |
}
|
| 58 |
|
| 59 |
-
#
|
|
|
|
|
|
|
|
|
|
| 60 |
for feature in expected_features:
|
| 61 |
-
if feature
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
else:
|
| 65 |
-
#
|
| 66 |
-
|
| 67 |
-
complete_data[feature] = 1.0
|
| 68 |
-
elif feature.startswith('id_'):
|
| 69 |
-
complete_data[feature] = 0.0
|
| 70 |
-
elif feature.startswith('C'):
|
| 71 |
-
complete_data[feature] = 0.0
|
| 72 |
-
elif feature.startswith('D'):
|
| 73 |
-
complete_data[feature] = 0.0
|
| 74 |
-
elif feature.startswith('M'):
|
| 75 |
-
complete_data[feature] = 'F'
|
| 76 |
-
else:
|
| 77 |
-
# Numeric default
|
| 78 |
-
complete_data[feature] = 0.0
|
| 79 |
|
| 80 |
return complete_data
|
| 81 |
|
| 82 |
-
def
|
| 83 |
-
"""
|
| 84 |
-
model_set = set(model_features) if model_features else set()
|
| 85 |
-
data_set = set(data_features) if data_features else set()
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
|
| 98 |
-
return
|
| 99 |
-
'missing': list(missing_in_data),
|
| 100 |
-
'extra': list(extra_in_data),
|
| 101 |
-
'model_count': len(model_set),
|
| 102 |
-
'data_count': len(data_set)
|
| 103 |
-
}
|
| 104 |
|
| 105 |
-
def
|
| 106 |
-
"""Safely make predictions
|
| 107 |
|
| 108 |
try:
|
| 109 |
# Get expected features from preprocessor
|
|
@@ -112,37 +106,137 @@ def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
|
|
| 112 |
if expected_features is None:
|
| 113 |
raise ValueError("Could not determine expected features from preprocessor")
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
complete_data = create_complete_feature_set(transaction_data, expected_features)
|
| 117 |
|
| 118 |
-
# Create DataFrame
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
-
#
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
#
|
| 126 |
-
|
|
|
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
# Make prediction
|
| 133 |
-
prediction_proba = model.predict_proba(
|
| 134 |
|
| 135 |
return {
|
| 136 |
'success': True,
|
| 137 |
-
'probability': float(prediction_proba)
|
| 138 |
-
'debug_info': debug_info
|
| 139 |
}
|
| 140 |
|
| 141 |
except Exception as e:
|
|
|
|
| 142 |
return {
|
| 143 |
'success': False,
|
| 144 |
-
'error': str(e)
|
| 145 |
-
'debug_info': None
|
| 146 |
}
|
| 147 |
|
| 148 |
# Function to inspect your saved model's expected features
|
|
@@ -163,19 +257,11 @@ def inspect_model_features(model_path, preprocessor_path):
|
|
| 163 |
print("First 20 features:", features[:20])
|
| 164 |
print("Last 20 features:", features[-20:])
|
| 165 |
|
| 166 |
-
#
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
m_features = [f for f in features if f.startswith('M')]
|
| 172 |
-
|
| 173 |
-
print(f"\nFeature breakdown:")
|
| 174 |
-
print(f"V features: {len(v_features)} (range: {min(v_features)} to {max(v_features)})")
|
| 175 |
-
print(f"ID features: {len(id_features)}")
|
| 176 |
-
print(f"C features: {len(c_features)}")
|
| 177 |
-
print(f"D features: {len(d_features)}")
|
| 178 |
-
print(f"M features: {len(m_features)}")
|
| 179 |
|
| 180 |
return features
|
| 181 |
else:
|
|
|
|
| 15 |
def create_complete_feature_set(transaction_data, expected_features):
|
| 16 |
"""Create a complete feature set matching exactly what the model expects"""
|
| 17 |
|
|
|
|
|
|
|
|
|
|
| 18 |
# Define comprehensive defaults
|
| 19 |
feature_defaults = {
|
| 20 |
# Transaction basics
|
|
|
|
| 53 |
**{f'id_{i:02d}': 0.0 for i in range(1, 39)},
|
| 54 |
}
|
| 55 |
|
| 56 |
+
# Create ordered dictionary with EXACT feature order
|
| 57 |
+
complete_data = {}
|
| 58 |
+
|
| 59 |
+
# Fill features in the EXACT order expected by the model
|
| 60 |
for feature in expected_features:
|
| 61 |
+
if feature in transaction_data:
|
| 62 |
+
complete_data[feature] = transaction_data[feature]
|
| 63 |
+
elif feature in feature_defaults:
|
| 64 |
+
complete_data[feature] = feature_defaults[feature]
|
| 65 |
+
else:
|
| 66 |
+
# Default based on feature name pattern
|
| 67 |
+
if feature.startswith('V'):
|
| 68 |
+
complete_data[feature] = 1.0
|
| 69 |
+
elif feature.startswith('id_'):
|
| 70 |
+
complete_data[feature] = 0.0
|
| 71 |
+
elif feature.startswith('C'):
|
| 72 |
+
complete_data[feature] = 0.0
|
| 73 |
+
elif feature.startswith('D'):
|
| 74 |
+
complete_data[feature] = 0.0
|
| 75 |
+
elif feature.startswith('M'):
|
| 76 |
+
complete_data[feature] = 'F'
|
| 77 |
else:
|
| 78 |
+
# Numeric default
|
| 79 |
+
complete_data[feature] = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
return complete_data
|
| 82 |
|
| 83 |
+
def create_ordered_dataframe(transaction_data, expected_features):
    """Build a single-row DataFrame whose columns follow ``expected_features``.

    Args:
        transaction_data: Caller-supplied feature values, passed through to
            ``create_complete_feature_set``.
        expected_features: Feature names in the order the model expects.

    Returns:
        A one-row ``pandas.DataFrame`` with ``expected_features`` as columns.

    Raises:
        KeyError: If the completed feature set lacks an expected feature.
    """
    # Fill in every feature the caller did not supply.
    filled = create_complete_feature_set(transaction_data, expected_features)

    # Pick the values out in model order; a missing name raises KeyError.
    row = {name: filled[name] for name in expected_features}

    # Passing ``columns`` pins the column order explicitly.
    return pd.DataFrame([row], columns=expected_features)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
def safe_predict_with_exact_features(model, preprocessor, transaction_data):
|
| 100 |
+
"""Safely make predictions with exact feature order matching"""
|
| 101 |
|
| 102 |
try:
|
| 103 |
# Get expected features from preprocessor
|
|
|
|
| 106 |
if expected_features is None:
|
| 107 |
raise ValueError("Could not determine expected features from preprocessor")
|
| 108 |
|
| 109 |
+
print(f"Model expects {len(expected_features)} features")
|
| 110 |
+
print(f"First 10 features: {expected_features[:10]}")
|
| 111 |
+
print(f"Last 10 features: {expected_features[-10:]}")
|
| 112 |
+
|
| 113 |
+
# Create DataFrame with exact feature order (BYPASS PREPROCESSING)
|
| 114 |
complete_data = create_complete_feature_set(transaction_data, expected_features)
|
| 115 |
|
| 116 |
+
# Create DataFrame with features in exact order expected by model
|
| 117 |
+
X_ordered = pd.DataFrame([complete_data], columns=expected_features)
|
| 118 |
+
|
| 119 |
+
print(f"Created DataFrame with shape: {X_ordered.shape}")
|
| 120 |
+
print(f"DataFrame columns match expected: {list(X_ordered.columns) == expected_features}")
|
| 121 |
+
|
| 122 |
+
# Make prediction directly (skip preprocessing to avoid feature reordering)
|
| 123 |
+
prediction_proba = model.predict_proba(X_ordered)[0, 1]
|
| 124 |
+
|
| 125 |
+
return {
|
| 126 |
+
'success': True,
|
| 127 |
+
'probability': float(prediction_proba)
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
except Exception as e:
|
| 131 |
+
print(f"Prediction error: {str(e)}")
|
| 132 |
+
return {
|
| 133 |
+
'success': False,
|
| 134 |
+
'error': str(e)
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
def _bypass_default_features():
    """Return the fallback value used for each feature the caller omits."""
    defaults = {
        # Core transaction features
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',

        # Card features
        'card1': 13553, 'card2': 150.0, 'card3': 150.0, 'card4': 'visa',
        'card5': 142.0, 'card6': 'credit',

        # Address features
        'addr1': 325.0, 'addr2': 87.0, 'dist1': 19.0, 'dist2': 19.0,

        # Email features
        'P_emaildomain': 'gmail.com', 'R_emaildomain': 'gmail.com',

        # Device features
        'DeviceType': 'desktop', 'DeviceInfo': 'Windows',

        # Engineered features, evaluated at the default amount/timestamp
        'TransactionAmt_log': np.log1p(100.0),
        'TransactionAmt_sqrt': np.sqrt(100.0),
        'TransactionDT_hour': (86400 / 3600) % 24,
        'TransactionDT_day': (86400 / (3600 * 24)) % 7,
        'card1_card2_ratio': 13553 / (150.0 + 1),
        'addr_match': 0,
    }

    # C1..C14: a fixed subset defaults to 1.0, the rest to 0.0
    for i in range(1, 15):
        defaults[f'C{i}'] = 1.0 if i in (1, 2, 6, 9, 11, 12, 13, 14) else 0.0

    # D1..D15: only D5 has a non-zero default
    for i in range(1, 16):
        defaults[f'D{i}'] = 20.0 if i == 5 else 0.0

    # M1..M9 as numeric codes: M1-M3 -> 1.0 (encoded T),
    # M4 -> 0.0 (encoded M0), M5-M9 -> 0.0 (encoded F)
    for i in range(1, 10):
        defaults[f'M{i}'] = 1.0 if i <= 3 else 0.0

    # V1..V339 default to 1.0
    for i in range(1, 340):
        defaults[f'V{i}'] = 1.0

    # id_01..id_38 default to 0.0
    for i in range(1, 39):
        defaults[f'id_{i:02d}'] = 0.0

    return defaults


def _bypass_engineered_features():
    """Return ``{feature name: function(transaction_data) -> value}``.

    Each function derives an engineered feature from the raw transaction
    fields, falling back to the default amount (100.0) or timestamp (86400)
    when the raw field is absent.
    NOTE(review): these formulas are assumed to mirror the training-time
    preprocessing -- confirm against the preprocessor implementation.
    """
    return {
        'TransactionAmt_log':
            lambda tx: np.log1p(tx.get('TransactionAmt', 100.0)),
        'TransactionAmt_sqrt':
            lambda tx: np.sqrt(tx.get('TransactionAmt', 100.0)),
        'TransactionDT_hour':
            lambda tx: (tx.get('TransactionDT', 86400) / 3600) % 24,
        'TransactionDT_day':
            lambda tx: (tx.get('TransactionDT', 86400) / (3600 * 24)) % 7,
    }


def bypass_preprocessing_predict(model, preprocessor, transaction_data):
    """Make prediction by bypassing preprocessing and using direct feature mapping.

    The feature vector is assembled in the exact order listed in
    ``preprocessor.feature_names`` and passed straight to
    ``model.predict_proba``; the preprocessor's transform is never called.

    Engineered features (``TransactionAmt_log``, ``TransactionAmt_sqrt``,
    ``TransactionDT_hour``, ``TransactionDT_day``) are always derived from
    the raw ``TransactionAmt`` / ``TransactionDT`` in ``transaction_data``.
    Previously they were derived only when the engineered key itself was
    supplied, so a caller passing just the raw amount got a log/sqrt computed
    from the placeholder amount 100.0.

    Args:
        model: Object exposing ``predict_proba(X)`` whose result can be
            indexed with ``[0, 1]``.
        preprocessor: Object exposing a non-empty ``feature_names`` sequence.
        transaction_data: Mapping of feature name to value; missing features
            are filled from built-in defaults (0.0 for unknown names).

    Returns:
        ``{'success': True, 'probability': float}`` on success, otherwise
        ``{'success': False, 'error': str}``. Exceptions are never propagated.
    """
    try:
        # Get the exact features and their order from the preprocessor
        if hasattr(preprocessor, 'feature_names') and preprocessor.feature_names:
            expected_features = preprocessor.feature_names
        else:
            raise ValueError("Cannot determine expected features from preprocessor")

        print(f"Expected features count: {len(expected_features)}")

        defaults = _bypass_default_features()
        engineered = _bypass_engineered_features()

        # Create feature vector in exact order
        feature_values = []
        for feature in expected_features:
            derive = engineered.get(feature)
            if derive is not None:
                # Always recompute so derived values agree with the raw inputs
                value = derive(transaction_data)
            elif feature in transaction_data:
                value = transaction_data[feature]
            else:
                value = defaults.get(feature, 0.0)
            feature_values.append(value)

        # Create properly ordered DataFrame
        X = pd.DataFrame([feature_values], columns=expected_features)

        print(f"Final feature matrix shape: {X.shape}")
        print(f"Sample values: {X.iloc[0, :5].values}")

        # Row 0, column 1 of predict_proba -- presumably the fraud class
        prediction_proba = model.predict_proba(X)[0, 1]

        return {
            'success': True,
            'probability': float(prediction_proba)
        }

    except Exception as e:
        # Boundary of the prediction API: report the failure, do not raise
        print(f"Bypass prediction error: {str(e)}")
        return {
            'success': False,
            'error': str(e)
        }
|
| 241 |
|
| 242 |
# Function to inspect your saved model's expected features
|
|
|
|
| 257 |
print("First 20 features:", features[:20])
|
| 258 |
print("Last 20 features:", features[-20:])
|
| 259 |
|
| 260 |
+
# Save feature order to file
|
| 261 |
+
with open('model_feature_order.txt', 'w') as f:
|
| 262 |
+
for i, feature in enumerate(features):
|
| 263 |
+
f.write(f"{i:4d}: {feature}\n")
|
| 264 |
+
print("Feature order saved to model_feature_order.txt")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
return features
|
| 267 |
else:
|
model_wrapper.py
CHANGED
|
@@ -143,11 +143,11 @@ class FraudDetectionModel:
|
|
| 143 |
raise ValueError("Model not loaded. Please load model first.")
|
| 144 |
|
| 145 |
try:
|
| 146 |
-
# Import the
|
| 147 |
-
from feature_detector import
|
| 148 |
|
| 149 |
-
# Use
|
| 150 |
-
result =
|
| 151 |
|
| 152 |
if not result['success']:
|
| 153 |
return {
|
|
@@ -177,12 +177,13 @@ class FraudDetectionModel:
|
|
| 177 |
"fraud_probability": float(fraud_probability),
|
| 178 |
"risk_level": risk_level,
|
| 179 |
"recommendation": recommendation,
|
| 180 |
-
"is_suspicious": fraud_probability >= 0.5
|
| 181 |
-
"debug_info": result.get('debug_info')
|
| 182 |
}
|
| 183 |
|
| 184 |
except ImportError:
|
| 185 |
-
# Fallback to
|
|
|
|
|
|
|
| 186 |
try:
|
| 187 |
from feature_utils import fill_missing_features
|
| 188 |
complete_data = fill_missing_features(transaction_data)
|
|
@@ -200,6 +201,12 @@ class FraudDetectionModel:
|
|
| 200 |
# Preprocess the data
|
| 201 |
X_processed, _ = self.preprocessor.preprocess(df, fit=False)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# Make prediction
|
| 204 |
fraud_probability = self.model.predict_proba(X_processed)[0, 1]
|
| 205 |
|
|
|
|
| 143 |
raise ValueError("Model not loaded. Please load model first.")
|
| 144 |
|
| 145 |
try:
|
| 146 |
+
# Import the bypass preprocessing approach
|
| 147 |
+
from feature_detector import bypass_preprocessing_predict
|
| 148 |
|
| 149 |
+
# Use bypass preprocessing to maintain exact feature order
|
| 150 |
+
result = bypass_preprocessing_predict(self.model, self.preprocessor, transaction_data)
|
| 151 |
|
| 152 |
if not result['success']:
|
| 153 |
return {
|
|
|
|
| 177 |
"fraud_probability": float(fraud_probability),
|
| 178 |
"risk_level": risk_level,
|
| 179 |
"recommendation": recommendation,
|
| 180 |
+
"is_suspicious": fraud_probability >= 0.5
|
|
|
|
| 181 |
}
|
| 182 |
|
| 183 |
except ImportError:
|
| 184 |
+
# Fallback to trying preprocessing approach
|
| 185 |
+
print("⚠️ Using fallback preprocessing approach")
|
| 186 |
+
|
| 187 |
try:
|
| 188 |
from feature_utils import fill_missing_features
|
| 189 |
complete_data = fill_missing_features(transaction_data)
|
|
|
|
| 201 |
# Preprocess the data
|
| 202 |
X_processed, _ = self.preprocessor.preprocess(df, fit=False)
|
| 203 |
|
| 204 |
+
# Get expected feature order
|
| 205 |
+
if hasattr(self.preprocessor, 'feature_names'):
|
| 206 |
+
expected_features = self.preprocessor.feature_names
|
| 207 |
+
# Reorder columns to match expected order
|
| 208 |
+
X_processed = X_processed.reindex(columns=expected_features, fill_value=0.0)
|
| 209 |
+
|
| 210 |
# Make prediction
|
| 211 |
fraud_probability = self.model.predict_proba(X_processed)[0, 1]
|
| 212 |
|