Zayeemk committed on
Commit
aaf12e6
·
verified ·
1 Parent(s): 34a6484

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -1,2 +1,4 @@
1
  creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
2
  data/creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
2
  data/creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
3
+ isolation_forest_model.pkl filter=lfs diff=lfs merge=lfs -text
4
+ lof_model.pkl filter=lfs diff=lfs merge=lfs -text
explain.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from typing import Dict, List, Any
4
+
5
+
6
class AnomalyExplainer:
    """Generate rule-based, human-readable explanations for detected anomalies.

    The explainer does not look at the anomaly model itself. It inspects the
    engineered feature values carried by a transaction (for example
    ``AmountRatio_Mean`` or ``Hour_Distance``) and compares them with the
    fixed cut-offs stored in ``self.thresholds``.
    """

    def __init__(self):
        # Cut-offs read by explain_anomaly(). 'night_transaction' and
        # 'weekend_ratio' are stored but not read anywhere in this class.
        self.thresholds = {
            'amount_ratio': 3.0,        # 3x user average
            'z_score': 2.5,             # 2.5 standard deviations
            'hour_distance': 6,         # 6 hours from common hour
            'time_since_last': 48,      # 48 hours since last transaction
            'night_transaction': True,  # Transaction at night
            'weekend_ratio': 2.0        # Weekend transaction ratio
        }

    @staticmethod
    def _value(transaction, key):
        """Return ``transaction[key]``, or None if the key is absent or the value is NaN."""
        if key not in transaction:
            return None
        value = transaction[key]
        return None if pd.isna(value) else value

    def explain_anomaly(self, transaction: pd.Series, user_stats: Dict = None) -> List[str]:
        """Generate explanations for a single anomalous transaction.

        Args:
            transaction: Row holding the feature values to inspect. Features
                that are absent or NaN are skipped.
            user_stats: Accepted for interface compatibility; not used.

        Returns:
            A list of explanation sentences. A generic sentence is returned
            when no rule fires.
        """
        explanations = []

        # Amount deviation relative to the user's average.
        ratio = self._value(transaction, 'AmountRatio_Mean')
        if ratio is not None:
            limit = self.thresholds['amount_ratio']
            # Both the full threshold and the half threshold produce the same
            # sentence, so the two checks are combined.
            if ratio > limit or ratio > limit * 0.5:
                explanations.append(f"Transaction amount is {ratio:.1f}x higher than user's average")

        z_raw = self._value(transaction, 'Amount_ZScore')
        if z_raw is not None:
            z_score = abs(z_raw)
            if z_score > self.thresholds['z_score']:
                explanations.append(
                    f"Transaction amount deviates by {z_score:.1f} standard deviations from user's normal spending"
                )

        # Time-based rules.
        hour = self._value(transaction, 'Hour')
        if hour is not None and (hour >= 22 or hour <= 5):
            # int() keeps float hours (23.0) from rendering as "23.0:00".
            explanations.append(f"Unusual transaction time: {int(hour)}:00 (night hours)")

        hour_dist = self._value(transaction, 'Hour_Distance')
        if hour_dist is not None and hour_dist > self.thresholds['hour_distance']:
            explanations.append(
                f"Transaction time is {hour_dist:.0f} hours away from user's typical transaction hour"
            )

        if 'IsWeekend' in transaction and transaction['IsWeekend'] == 1:
            explanations.append("Transaction occurred on a weekend")

        # Frequency rules: very long gaps and very short gaps are both reported.
        time_since = self._value(transaction, 'TimeSinceLastTx')
        if time_since is not None:
            if time_since > self.thresholds['time_since_last']:
                explanations.append(
                    f"Unusual transaction pattern: {time_since:.0f} hours since last transaction"
                )
            elif time_since < 1:
                explanations.append("Rapid succession: multiple transactions within 1 hour")

        # Category information. The merchant category is always reported when
        # the field exists, so the generic fallback below only triggers for
        # transactions without that field.
        if 'Merchant Category' in transaction:
            category = transaction['Merchant Category']
            explanations.append(f"Merchant category: {category}")

        entropy = self._value(transaction, 'Category_Entropy')
        if entropy is not None and entropy < 1.0:
            explanations.append(
                "User typically has low category diversity - this transaction may be unusual"
            )

        if not explanations:
            explanations.append("Anomaly detected based on combined feature analysis")

        return explanations

    def explain_batch(self, df: pd.DataFrame, user_stats: Dict = None) -> pd.DataFrame:
        """Return a copy of ``df`` with an added ``Explanation`` column.

        Each row's explanations are joined with ``'; '``. The input frame is
        not modified.
        """
        df = df.copy()
        df['Explanation'] = [
            '; '.join(self.explain_anomaly(row, user_stats))
            for _, row in df.iterrows()
        ]
        return df

    def get_feature_importance(self, transaction: pd.Series) -> Dict[str, float]:
        """Estimate the relative contribution of each feature group.

        Each available feature is mapped to a score capped at 1.0, then the
        scores are divided by their sum. Features that are absent or NaN are
        left out of the result.
        """
        importance = {}

        z_score = self._value(transaction, 'Amount_ZScore')
        if z_score is not None:
            importance['Amount'] = min(abs(z_score) / 5.0, 1.0)

        hour_dist = self._value(transaction, 'Hour_Distance')
        if hour_dist is not None:
            importance['Time'] = min(hour_dist / 12.0, 1.0)

        time_since = self._value(transaction, 'TimeSinceLastTx')
        if time_since is not None:
            importance['Frequency'] = min(time_since / 72.0, 1.0)

        entropy = self._value(transaction, 'Category_Entropy')
        if entropy is not None:
            importance['Category'] = max(0, 1.0 - entropy / 3.0)

        # Normalise; when every score is zero the raw zeros are returned.
        total = sum(importance.values()) if importance else 1
        if total > 0:
            importance = {k: v / total for k, v in importance.items()}

        return importance

    def generate_radar_data(self, df: pd.DataFrame) -> Dict[str, List]:
        """Generate labels and 0-100 scaled values for a radar chart.

        Returns:
            ``{'labels': [...], 'values': [...]}``. For an empty frame both
            lists are empty; the legacy ``'datasets'`` key is also kept in
            that case for callers that relied on it.
        """
        if df.empty:
            # Same keys as the non-empty payload so callers can index
            # 'values' unconditionally.
            return {'labels': [], 'values': [], 'datasets': []}

        has_amount = 'Amount' in df.columns

        metrics = {
            'Amount': df['Amount'].mean() if has_amount else 0,
            'Frequency': df.get('TxCount_Window', pd.Series([1])).mean(),
            'Time Variance': df.get('Hour_Variance', pd.Series([0])).mean(),
            'Category Diversity': df.get('Category_Entropy', pd.Series([0])).mean()
        }

        # Reference maxima used to scale each metric to 0-100.
        max_vals = {
            'Amount': df['Amount'].max() if has_amount else 1,
            'Frequency': metrics['Frequency'] * 2,
            'Time Variance': 50,
            'Category Diversity': 3
        }

        normalized = {
            'Amount': (metrics['Amount'] / max_vals['Amount'] * 100) if max_vals['Amount'] > 0 else 50,
            'Frequency': (metrics['Frequency'] / max_vals['Frequency'] * 100) if max_vals['Frequency'] > 0 else 50,
            'Time Variance': (metrics['Time Variance'] / max_vals['Time Variance'] * 100),
            'Category Diversity': (metrics['Category Diversity'] / max_vals['Category Diversity'] * 100)
        }

        return {
            'labels': list(normalized.keys()),
            'values': list(normalized.values())
        }

    def compare_with_user_baseline(self, transaction: pd.Series, user_baseline: Dict) -> Dict[str, Any]:
        """Compare a transaction with the user's baseline behaviour.

        Returns:
            A dict that may contain ``'amount_vs_avg'`` (amount divided by the
            baseline mean), ``'hour'`` and ``'category'``. ``'amount_vs_avg'``
            is omitted when the baseline mean is missing, NaN or zero, since
            the ratio is undefined in those cases.
        """
        comparison = {}

        if 'Amount' in transaction and 'mean' in user_baseline:
            baseline_mean = user_baseline['mean']
            # Guard against division by zero / NaN baselines.
            if baseline_mean is not None and not pd.isna(baseline_mean) and baseline_mean != 0:
                comparison['amount_vs_avg'] = transaction['Amount'] / baseline_mean

        if 'Hour' in transaction:
            comparison['hour'] = transaction['Hour']

        if 'Merchant Category' in transaction:
            comparison['category'] = transaction['Merchant Category']

        return comparison
feature_columns.pkl ADDED
Binary file (171 Bytes). View file
 
isolation_forest_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e83f211f7ee754655cb43cf77877a801febbc72b3e36ea27728e848c5e922948
3
+ size 1263449
isolation_forest_scaler.pkl ADDED
Binary file (863 Bytes). View file
 
lof_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b0a408c1618194e4662b2d3d3e41335a70923d38906fac79e7ec5d0856e92e4
3
+ size 1854630
lof_scaler.pkl ADDED
Binary file (863 Bytes). View file
 
predict.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ import os
5
+ from typing import Dict, List, Tuple, Any
6
+ from model.train import AnomalyDetectorTrainer
7
+
8
+
9
class AnomalyPredictor:
    """Predict anomalies in credit card transactions.

    Wraps an ``AnomalyDetectorTrainer`` to load a persisted model, its scaler
    and the list of feature columns, and applies them to new transactions.
    """

    def __init__(self, model_dir: str = 'model'):
        self.model_dir = model_dir
        self.trainer = AnomalyDetectorTrainer(model_dir)
        self.current_model = None
        self.current_scaler = None
        self.feature_columns = []

    def load_model(self, model_type: str = 'isolation_forest') -> bool:
        """Load the trained model, scaler and feature list for ``model_type``.

        Returns:
            True when the trainer reported a successful load, otherwise False
            (in which case the predictor's state is left untouched).
        """
        success = self.trainer.load_model(model_type)
        if success:
            self.current_model = self.trainer.models[model_type]
            self.current_scaler = self.trainer.scalers[model_type]
            self.feature_columns = self.trainer.feature_columns
        return success

    @staticmethod
    def _confidence(scores: np.ndarray) -> np.ndarray:
        """Scale absolute scores by the largest absolute score in ``scores``."""
        magnitude = np.abs(scores)
        # The epsilon avoids division by zero when every score is 0.
        return magnitude / (np.max(magnitude) + 1e-8)

    def prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, List[str]]:
        """Build the (optionally scaled) feature matrix for prediction.

        Args:
            df: Transactions to featurise. Not modified.

        Returns:
            ``(X, used_features)`` where ``X`` has one column per entry of
            ``used_features``. ``X`` is scaled when a scaler is loaded.

        Raises:
            ValueError: If a scaler is loaded and some expected feature
                columns are absent, or if no usable feature column exists.
        """
        df = df.copy()

        available_features = [col for col in self.feature_columns if col in df.columns]
        missing_features = [col for col in self.feature_columns if col not in df.columns]

        # A fitted scaler expects exactly the columns it was trained on, so a
        # partial feature set cannot be transformed. Fail early and clearly.
        if missing_features and self.current_scaler is not None:
            raise ValueError(
                f"Missing feature columns required by the loaded model: {missing_features}"
            )

        if not available_features:
            # Fallback to Amount if available
            if 'Amount' in df.columns:
                available_features = ['Amount']
            else:
                raise ValueError("No valid features found in dataframe")

        # Impute with the column median. When the whole column is NaN (for
        # example a single-row frame) the median is NaN too, so fall back to 0
        # to keep the matrix finite.
        for col in available_features:
            median = df[col].median()
            df[col] = df[col].fillna(0.0 if pd.isna(median) else median)

        X = df[available_features].values

        if self.current_scaler is not None:
            X_scaled = self.current_scaler.transform(X)
        else:
            X_scaled = X

        return X_scaled, available_features

    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """Predict anomalies for transactions.

        Returns:
            A copy of ``df`` with four added columns: ``Prediction`` (the
            model output, -1 = anomaly, 1 = normal), ``Anomaly_Score``,
            ``Status`` (``'Anomalous'`` / ``'Normal'``) and ``Confidence``.
            ``Confidence`` is relative to the largest absolute score within
            the frame passed in, so it is not comparable across calls.

        Raises:
            ValueError: If no model has been loaded.
        """
        if self.current_model is None:
            raise ValueError("No model loaded. Call load_model() first.")

        df = df.copy()

        if df.empty:
            # Nothing to score; return the expected columns without calling
            # the model on zero samples.
            df['Prediction'] = pd.Series(dtype='int64')
            df['Anomaly_Score'] = pd.Series(dtype='float64')
            df['Status'] = pd.Series(dtype='object')
            df['Confidence'] = pd.Series(dtype='float64')
            return df

        X_scaled, used_features = self.prepare_features(df)

        predictions = self.current_model.predict(X_scaled)

        # Prefer decision_function, then score_samples, else neutral zeros.
        if hasattr(self.current_model, 'decision_function'):
            scores = self.current_model.decision_function(X_scaled)
        elif hasattr(self.current_model, 'score_samples'):
            scores = self.current_model.score_samples(X_scaled)
        else:
            scores = np.zeros(len(X_scaled))

        df['Prediction'] = predictions
        df['Anomaly_Score'] = scores
        df['Status'] = np.where(np.asarray(predictions) == -1, 'Anomalous', 'Normal')
        df['Confidence'] = self._confidence(np.asarray(scores))

        return df

    def predict_single(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
        """Predict anomaly for a single transaction given as a dict.

        Note: because confidence is relative to the frame being scored, a
        single transaction's confidence is close to 1 unless its score is 0.
        """
        result = self.predict(pd.DataFrame([transaction]))
        first = result.iloc[0]

        return {
            'transaction_id': transaction.get('Transaction ID', 'Unknown'),
            'prediction': first['Status'],
            'anomaly_score': float(first['Anomaly_Score']),
            'confidence': float(first['Confidence'])
        }

    def get_anomalies(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return only the rows that the model labels as anomalous."""
        predictions = self.predict(df)
        return predictions[predictions['Status'] == 'Anomalous']

    def get_statistics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Return summary counts and the mean score of anomalous rows."""
        predictions = self.predict(df)
        anomalous = predictions[predictions['Status'] == 'Anomalous']

        total = len(predictions)
        anomalies = len(anomalous)

        return {
            'total_transactions': total,
            'anomalies_detected': anomalies,
            'normal_transactions': total - anomalies,
            'detection_rate': (anomalies / total * 100) if total > 0 else 0,
            'average_anomaly_score': float(anomalous['Anomaly_Score'].mean()) if anomalies > 0 else 0
        }

    def batch_predict(self, df: pd.DataFrame, batch_size: int = 1000) -> pd.DataFrame:
        """Predict in chunks of ``batch_size`` rows for large datasets.

        The result has a fresh 0..n-1 index. ``Confidence`` is recomputed over
        the full result so that it matches what ``predict`` would report for
        the same frame instead of being relative to each chunk.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 or no model is loaded.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        results = [
            self.predict(df.iloc[start:start + batch_size])
            for start in range(0, len(df), batch_size)
        ]

        if not results:
            # Empty input: pd.concat([]) would raise, so delegate to predict(),
            # which returns an empty frame with the result columns.
            return self.predict(df)

        combined = pd.concat(results, ignore_index=True)
        combined['Confidence'] = self._confidence(combined['Anomaly_Score'].to_numpy())
        return combined
train.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.ensemble import IsolationForest
4
+ from sklearn.neighbors import LocalOutlierFactor
5
+ from sklearn.preprocessing import StandardScaler
6
+ import joblib
7
+ import os
8
+ from typing import Dict, Tuple, Any
9
+
10
+
11
class AnomalyDetectorTrainer:
    """Train, persist and reload anomaly detection models for credit card transactions."""

    # Feature columns considered for training, in the order they are fed to
    # the model. Only the ones present in the input frame are used.
    CANDIDATE_FEATURES = (
        # Core features
        'Amount',
        'Amount_ZScore',
        'AmountRatio_Mean',
        # Time features
        'Hour',
        'DayOfWeek',
        'IsWeekend',
        'IsNight',
        'Hour_Distance',
        # Frequency features
        'TimeSinceLastTx',
        'TxCount_Window',
        # Category features
        'Category_Entropy',
        'Merchant_Category_Encoded',
    )

    def __init__(self, model_dir: str = 'model'):
        self.model_dir = model_dir
        self.models = {}
        self.scalers = {}
        self.feature_columns = []

        # Create model directory if it doesn't exist
        os.makedirs(model_dir, exist_ok=True)

    def prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, list]:
        """Select and impute the training features.

        Side effect: stores the selected column names in
        ``self.feature_columns``.

        Returns:
            ``(X, feature_names)`` with one column of ``X`` per feature name.

        Raises:
            ValueError: If none of the candidate feature columns is present.
        """
        df = df.copy()

        available_features = [col for col in self.CANDIDATE_FEATURES if col in df.columns]

        if not available_features:
            # 'Amount' is itself a candidate, so there is nothing to fall back
            # on; report it explicitly instead of failing with a KeyError.
            raise ValueError("No valid features found in dataframe")

        # Impute with the column median; an all-NaN column has a NaN median,
        # in which case 0 is used so the matrix stays finite.
        for col in available_features:
            median = df[col].median()
            df[col] = df[col].fillna(0.0 if pd.isna(median) else median)

        self.feature_columns = available_features
        X = df[available_features].values

        return X, available_features

    def train_isolation_forest(
        self,
        X: np.ndarray,
        contamination: float = 0.1,
        n_estimators: int = 100,
        random_state: int = 42
    ) -> "IsolationForest":
        """Fit and return an Isolation Forest on ``X``."""
        model = IsolationForest(
            contamination=contamination,
            n_estimators=n_estimators,
            random_state=random_state,
            n_jobs=-1
        )
        model.fit(X)
        return model

    def train_lof(
        self,
        X: np.ndarray,
        contamination: float = 0.1,
        n_neighbors: int = 20
    ) -> "LocalOutlierFactor":
        """Fit and return a Local Outlier Factor model on ``X``.

        ``novelty=True`` is set so the fitted model can score unseen data.
        """
        model = LocalOutlierFactor(
            contamination=contamination,
            n_neighbors=n_neighbors,
            novelty=True
        )
        model.fit(X)
        return model

    def train_models(
        self,
        df: pd.DataFrame,
        model_type: str = 'isolation_forest',
        contamination: float = 0.1
    ) -> Dict[str, Any]:
        """Train the selected model on ``df``, store it and save it to disk.

        Args:
            df: Training transactions.
            model_type: ``'isolation_forest'`` or ``'lof'``.
            contamination: Expected share of anomalies passed to the model.

        Returns:
            A summary dict with the model type, contamination, feature
            columns, sample count and feature count.

        Raises:
            ValueError: If ``model_type`` is unknown or no features are found.
        """
        X, feature_cols = self.prepare_features(df)

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        if model_type == 'isolation_forest':
            model = self.train_isolation_forest(X_scaled, contamination=contamination)
        elif model_type == 'lof':
            model = self.train_lof(X_scaled, contamination=contamination)
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        self.models[model_type] = model
        self.scalers[model_type] = scaler

        self.save_model(model_type)

        return {
            'model_type': model_type,
            'contamination': contamination,
            'feature_columns': feature_cols,
            'n_samples': len(X),
            'n_features': X.shape[1]
        }

    def save_model(self, model_type: str):
        """Save the trained model, its scaler and the feature list to disk.

        NOTE(review): 'feature_columns.pkl' is shared by every model type, so
        saving a second model trained on different columns overwrites the
        feature list of the first one.
        """
        model_path = os.path.join(self.model_dir, f'{model_type}_model.pkl')
        scaler_path = os.path.join(self.model_dir, f'{model_type}_scaler.pkl')
        features_path = os.path.join(self.model_dir, 'feature_columns.pkl')

        joblib.dump(self.models[model_type], model_path)
        joblib.dump(self.scalers[model_type], scaler_path)
        joblib.dump(self.feature_columns, features_path)

    def load_model(self, model_type: str) -> bool:
        """Load a trained model, its scaler and the feature list from disk.

        Returns:
            True when all three files exist and were loaded; False otherwise,
            leaving the trainer's state unchanged.
        """
        model_path = os.path.join(self.model_dir, f'{model_type}_model.pkl')
        scaler_path = os.path.join(self.model_dir, f'{model_type}_scaler.pkl')
        features_path = os.path.join(self.model_dir, 'feature_columns.pkl')

        # All three artefacts are required; checking them up front avoids a
        # half-loaded state and a FileNotFoundError for the feature list.
        if not all(os.path.exists(path) for path in (model_path, scaler_path, features_path)):
            return False

        # NOTE: joblib uses pickle; only load artefacts from a trusted source.
        self.models[model_type] = joblib.load(model_path)
        self.scalers[model_type] = joblib.load(scaler_path)
        self.feature_columns = joblib.load(features_path)
        return True

    def get_model_info(self, model_type: str) -> Dict[str, Any]:
        """Return basic metadata about a loaded/trained model.

        Returns an empty dict when ``model_type`` has not been trained or
        loaded in this instance.
        """
        if model_type not in self.models:
            return {}

        model = self.models[model_type]

        info = {
            'model_type': model_type,
            'feature_columns': self.feature_columns,
            'is_trained': True
        }

        if model_type == 'isolation_forest':
            info.update({
                'n_estimators': model.n_estimators,
                'contamination': model.contamination,
                'max_samples': model.max_samples
            })
        elif model_type == 'lof':
            info.update({
                'n_neighbors': model.n_neighbors,
                'contamination': model.contamination
            })

        return info