Spaces:

0xnu
/

fraud-detection

Sleeping

App Files Files Community

0xnu committed on Aug 9, 2025

Commit

22773a1

verified ·

1 Parent(s): 4de4bb8

Upload 5 files

Browse files

Files changed (2) hide show

feature_detector.py +156 -70
model_wrapper.py +14 -7

feature_detector.py CHANGED Viewed

@@ -15,9 +15,6 @@ def get_model_feature_names(preprocessor):
 def create_complete_feature_set(transaction_data, expected_features):
     """Create a complete feature set matching exactly what the model expects"""
-    # Start with provided data
-    complete_data = transaction_data.copy()
     # Define comprehensive defaults
     feature_defaults = {
         # Transaction basics
@@ -56,54 +53,51 @@ def create_complete_feature_set(transaction_data, expected_features):
         **{f'id_{i:02d}': 0.0 for i in range(1, 39)},
     }
-    # Fill missing features
     for feature in expected_features:
-        if feature not in complete_data:
-            if feature in feature_defaults:
-                complete_data[feature] = feature_defaults[feature]
             else:
-                # Default based on feature name pattern
-                if feature.startswith('V'):
-                    complete_data[feature] = 1.0
-                elif feature.startswith('id_'):
-                    complete_data[feature] = 0.0
-                elif feature.startswith('C'):
-                    complete_data[feature] = 0.0
-                elif feature.startswith('D'):
-                    complete_data[feature] = 0.0
-                elif feature.startswith('M'):
-                    complete_data[feature] = 'F'
-                else:
-                    # Numeric default
-                    complete_data[feature] = 0.0
     return complete_data
-def debug_feature_mismatch(model_features, data_features):
-    """Debug feature mismatches between model and data"""
-    model_set = set(model_features) if model_features else set()
-    data_set = set(data_features) if data_features else set()
-    missing_in_data = model_set - data_set
-    extra_in_data = data_set - model_set
-    print(f"Model expects {len(model_set)} features")
-    print(f"Data has {len(data_set)} features")
-    print(f"Missing in data: {len(missing_in_data)} features")
-    print(f"Extra in data: {len(extra_in_data)} features")
-    if missing_in_data:
-        print(f"First 10 missing features: {list(missing_in_data)[:10]}")
-    return {
-        'missing': list(missing_in_data),
-        'extra': list(extra_in_data),
-        'model_count': len(model_set),
-        'data_count': len(data_set)
-    }
-def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
-    """Safely make predictions by ensuring feature alignment"""
     try:
         # Get expected features from preprocessor
@@ -112,37 +106,137 @@ def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
         if expected_features is None:
             raise ValueError("Could not determine expected features from preprocessor")
-        # Create complete feature set
         complete_data = create_complete_feature_set(transaction_data, expected_features)
-        # Create DataFrame
-        df = pd.DataFrame([complete_data])
-        # Add TransactionID if not present
-        if 'TransactionID' not in df.columns:
-            df['TransactionID'] = 'temp_id'
-        # Preprocess
-        X_processed, _ = preprocessor.preprocess(df, fit=False)
-        # Debug feature alignment
-        debug_info = debug_feature_mismatch(expected_features, X_processed.columns.tolist())
-        print(f"Feature alignment: {debug_info}")
         # Make prediction
-        prediction_proba = model.predict_proba(X_processed)[0, 1]
         return {
             'success': True,
-            'probability': float(prediction_proba),
-            'debug_info': debug_info
         }
     except Exception as e:
         return {
             'success': False,
-            'error': str(e),
-            'debug_info': None
         }
 # Function to inspect your saved model's expected features
@@ -163,19 +257,11 @@ def inspect_model_features(model_path, preprocessor_path):
             print("First 20 features:", features[:20])
             print("Last 20 features:", features[-20:])
-            # Analyze feature patterns
-            v_features = [f for f in features if f.startswith('V')]
-            id_features = [f for f in features if f.startswith('id_')]
-            c_features = [f for f in features if f.startswith('C')]
-            d_features = [f for f in features if f.startswith('D')]
-            m_features = [f for f in features if f.startswith('M')]
-            print(f"\nFeature breakdown:")
-            print(f"V features: {len(v_features)} (range: {min(v_features)} to {max(v_features)})")
-            print(f"ID features: {len(id_features)}")
-            print(f"C features: {len(c_features)}")
-            print(f"D features: {len(d_features)}")
-            print(f"M features: {len(m_features)}")
             return features
         else:

 def create_complete_feature_set(transaction_data, expected_features):
     """Create a complete feature set matching exactly what the model expects"""
     # Define comprehensive defaults
     feature_defaults = {
         # Transaction basics
         **{f'id_{i:02d}': 0.0 for i in range(1, 39)},
     }
+    # Create ordered dictionary with EXACT feature order
+    complete_data = {}
+    # Fill features in the EXACT order expected by the model
     for feature in expected_features:
+        if feature in transaction_data:
+            complete_data[feature] = transaction_data[feature]
+        elif feature in feature_defaults:
+            complete_data[feature] = feature_defaults[feature]
+        else:
+            # Default based on feature name pattern
+            if feature.startswith('V'):
+                complete_data[feature] = 1.0
+            elif feature.startswith('id_'):
+                complete_data[feature] = 0.0
+            elif feature.startswith('C'):
+                complete_data[feature] = 0.0
+            elif feature.startswith('D'):
+                complete_data[feature] = 0.0
+            elif feature.startswith('M'):
+                complete_data[feature] = 'F'
             else:
+                # Numeric default
+                complete_data[feature] = 0.0
     return complete_data
def create_ordered_dataframe(transaction_data, expected_features):
    """Return a one-row DataFrame whose columns follow ``expected_features``.

    The transaction is first completed via ``create_complete_feature_set``
    so that every expected feature has a value; the row is then assembled
    by walking ``expected_features`` so the column order is exactly the
    order given by the caller.
    """
    filled = create_complete_feature_set(transaction_data, expected_features)
    # Re-key in the expected order; a missing key here raises KeyError.
    row = {name: filled[name] for name in expected_features}
    return pd.DataFrame([row], columns=expected_features)
+def safe_predict_with_exact_features(model, preprocessor, transaction_data):
+    """Safely make predictions with exact feature order matching"""
     try:
         # Get expected features from preprocessor
         if expected_features is None:
             raise ValueError("Could not determine expected features from preprocessor")
+        print(f"Model expects {len(expected_features)} features")
+        print(f"First 10 features: {expected_features[:10]}")
+        print(f"Last 10 features: {expected_features[-10:]}")
+        # Create DataFrame with exact feature order (BYPASS PREPROCESSING)
         complete_data = create_complete_feature_set(transaction_data, expected_features)
+        # Create DataFrame with features in exact order expected by model
+        X_ordered = pd.DataFrame([complete_data], columns=expected_features)
+        print(f"Created DataFrame with shape: {X_ordered.shape}")
+        print(f"DataFrame columns match expected: {list(X_ordered.columns) == expected_features}")
+        # Make prediction directly (skip preprocessing to avoid feature reordering)
+        prediction_proba = model.predict_proba(X_ordered)[0, 1]
+        return {
+            'success': True,
+            'probability': float(prediction_proba)
+        }
+    except Exception as e:
+        print(f"Prediction error: {str(e)}")
+        return {
+            'success': False,
+            'error': str(e)
+        }
def _bypass_feature_defaults():
    """Build the fallback value used for each feature absent from the input."""
    defaults = {
        # Core transaction features
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',
        # Card features
        'card1': 13553, 'card2': 150.0, 'card3': 150.0, 'card4': 'visa',
        'card5': 142.0, 'card6': 'credit',
        # Address features
        'addr1': 325.0, 'addr2': 87.0, 'dist1': 19.0, 'dist2': 19.0,
        # Email features
        'P_emaildomain': 'gmail.com', 'R_emaildomain': 'gmail.com',
        # Device features
        'DeviceType': 'desktop', 'DeviceInfo': 'Windows',
        # Engineered features that are not re-derived from the input below
        'card1_card2_ratio': 13553 / (150.0 + 1),
        'addr_match': 0,
    }
    # NOTE(review): the categorical defaults above are raw strings while the
    # M features below are numeric codes; confirm which form the model expects.
    nonzero_c = {1, 2, 6, 9, 11, 12, 13, 14}
    defaults.update(
        {f'C{i}': 1.0 if i in nonzero_c else 0.0 for i in range(1, 15)}
    )
    defaults.update({f'D{i}': 20.0 if i == 5 else 0.0 for i in range(1, 16)})
    # M1-M3 default to 1.0 and M4-M9 to 0.0 (the original comments describe
    # these as the encoded 'T', 'M0' and 'F' categories respectively).
    defaults.update({f'M{i}': 1.0 if i <= 3 else 0.0 for i in range(1, 10)})
    # V features (1-339) default to 1.0
    defaults.update({f'V{i}': 1.0 for i in range(1, 340)})
    # Identity features id_01 .. id_38 default to 0.0
    defaults.update({f'id_{i:02d}': 0.0 for i in range(1, 39)})
    return defaults


def bypass_preprocessing_predict(model, preprocessor, transaction_data):
    """Predict fraud probability from a feature row built without preprocessing.

    The row is assembled directly in the order given by
    ``preprocessor.feature_names`` and passed to ``model.predict_proba``;
    ``preprocessor.preprocess`` is never called.

    Args:
        model: Object exposing ``predict_proba(X)``; element ``[0, 1]`` of
            its result is reported as the probability.
        preprocessor: Object whose ``feature_names`` attribute lists the
            expected features in order (list, tuple, ``pandas.Index`` or
            NumPy array).
        transaction_data: Mapping of feature name to value for the
            transaction being scored.

    Returns:
        ``{'success': True, 'probability': float}`` on success, otherwise
        ``{'success': False, 'error': str}``. Exceptions are reported in the
        returned dict rather than raised.

    Value resolution per expected feature:
        1. ``TransactionAmt_log``, ``TransactionAmt_sqrt``,
           ``TransactionDT_hour`` and ``TransactionDT_day`` are always
           derived from the supplied ``TransactionAmt`` / ``TransactionDT``
           (or their defaults), so they track the actual transaction
           instead of a fixed default.
        2. Otherwise the value supplied in ``transaction_data`` is used.
        3. Otherwise a built-in default is used, or 0.0 for unknown names.
    """
    try:
        # Avoid truth-testing the container itself: a pandas Index or NumPy
        # array raises "truth value ... is ambiguous" when used in a boolean
        # context.
        feature_names = getattr(preprocessor, 'feature_names', None)
        if feature_names is None or len(feature_names) == 0:
            raise ValueError("Cannot determine expected features from preprocessor")
        expected_features = list(feature_names)
        print(f"Expected features count: {len(expected_features)}")

        defaults = _bypass_feature_defaults()
        amount = transaction_data.get('TransactionAmt', defaults['TransactionAmt'])
        timestamp = transaction_data.get('TransactionDT', defaults['TransactionDT'])

        # Evaluated lazily so an unusable amount/timestamp only matters when
        # the model actually expects the corresponding engineered feature.
        derivations = {
            'TransactionAmt_log': lambda: np.log1p(amount),
            'TransactionAmt_sqrt': lambda: np.sqrt(amount),
            'TransactionDT_hour': lambda: (timestamp / 3600) % 24,
            'TransactionDT_day': lambda: (timestamp / (3600 * 24)) % 7,
        }

        # Create feature vector in exact order
        feature_values = []
        for feature in expected_features:
            if feature in derivations:
                feature_values.append(derivations[feature]())
            elif feature in transaction_data:
                feature_values.append(transaction_data[feature])
            else:
                feature_values.append(defaults.get(feature, 0.0))

        # Create properly ordered DataFrame
        X = pd.DataFrame([feature_values], columns=expected_features)
        print(f"Final feature matrix shape: {X.shape}")
        print(f"Sample values: {X.iloc[0, :5].values}")

        # Make prediction
        prediction_proba = model.predict_proba(X)[0, 1]
        return {
            'success': True,
            'probability': float(prediction_proba)
        }
    except Exception as e:
        # Deliberate catch-all: callers inspect result['success'] instead of
        # handling exceptions.
        print(f"Bypass prediction error: {str(e)}")
        return {
            'success': False,
            'error': str(e)
        }
 # Function to inspect your saved model's expected features
             print("First 20 features:", features[:20])
             print("Last 20 features:", features[-20:])
+            # Save feature order to file
+            with open('model_feature_order.txt', 'w') as f:
+                for i, feature in enumerate(features):
+                    f.write(f"{i:4d}: {feature}\n")
+            print("Feature order saved to model_feature_order.txt")
             return features
         else:

model_wrapper.py CHANGED Viewed

@@ -143,11 +143,11 @@ class FraudDetectionModel:
             raise ValueError("Model not loaded. Please load model first.")
         try:
-            # Import the robust feature handling
-            from feature_detector import safe_predict_with_feature_matching
-            # Use safe prediction with automatic feature matching
-            result = safe_predict_with_feature_matching(self.model, self.preprocessor, transaction_data)
             if not result['success']:
                 return {
@@ -177,12 +177,13 @@ class FraudDetectionModel:
                 "fraud_probability": float(fraud_probability),
                 "risk_level": risk_level,
                 "recommendation": recommendation,
-                "is_suspicious": fraud_probability >= 0.5,
-                "debug_info": result.get('debug_info')
             }
         except ImportError:
-            # Fallback to original feature filling approach
             try:
                 from feature_utils import fill_missing_features
                 complete_data = fill_missing_features(transaction_data)
@@ -200,6 +201,12 @@ class FraudDetectionModel:
                 # Preprocess the data
                 X_processed, _ = self.preprocessor.preprocess(df, fit=False)
                 # Make prediction
                 fraud_probability = self.model.predict_proba(X_processed)[0, 1]

             raise ValueError("Model not loaded. Please load model first.")
         try:
+            # Import the bypass preprocessing approach
+            from feature_detector import bypass_preprocessing_predict
+            # Use bypass preprocessing to maintain exact feature order
+            result = bypass_preprocessing_predict(self.model, self.preprocessor, transaction_data)
             if not result['success']:
                 return {
                 "fraud_probability": float(fraud_probability),
                 "risk_level": risk_level,
                 "recommendation": recommendation,
+                "is_suspicious": fraud_probability >= 0.5
             }
         except ImportError:
+            # Fallback to trying preprocessing approach
+            print("⚠️ Using fallback preprocessing approach")
             try:
                 from feature_utils import fill_missing_features
                 complete_data = fill_missing_features(transaction_data)
                 # Preprocess the data
                 X_processed, _ = self.preprocessor.preprocess(df, fit=False)
+                # Get expected feature order
+                if hasattr(self.preprocessor, 'feature_names'):
+                    expected_features = self.preprocessor.feature_names
+                    # Reorder columns to match expected order
+                    X_processed = X_processed.reindex(columns=expected_features, fill_value=0.0)
                 # Make prediction
                 fraud_probability = self.model.predict_proba(X_processed)[0, 1]