Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- .gitattributes +2 -0
- explain.py +159 -0
- feature_columns.pkl +0 -0
- isolation_forest_model.pkl +3 -0
- isolation_forest_scaler.pkl +0 -0
- lof_model.pkl +3 -0
- lof_scaler.pkl +0 -0
- predict.py +129 -0
- train.py +193 -0
.gitattributes
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
|
| 2 |
data/creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 1 |
creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
|
| 2 |
data/creditcard_original.csv filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
isolation_forest_model.pkl filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
lof_model.pkl filter=lfs diff=lfs merge=lfs -text
|
explain.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from typing import Dict, List, Any
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AnomalyExplainer:
    """Generate human-readable explanations for detected anomalies.

    All checks are rule based: each one looks for an engineered feature on the
    transaction (for example ``Amount_ZScore`` or ``Hour_Distance``), skips it
    when the feature is absent or NaN, and compares it against a value in
    ``self.thresholds``.
    """

    def __init__(self):
        # NOTE: only 'amount_ratio', 'z_score', 'hour_distance' and
        # 'time_since_last' are read by the methods of this class; the two
        # remaining entries are kept for callers that may inspect the dict.
        self.thresholds = {
            'amount_ratio': 3.0,        # 3x user average
            'z_score': 2.5,             # 2.5 standard deviations
            'hour_distance': 6,         # 6 hours from common hour
            'time_since_last': 48,      # 48 hours since last transaction
            'night_transaction': True,  # Transaction at night
            'weekend_ratio': 2.0        # Weekend transaction ratio
        }

    @staticmethod
    def _present(transaction, key):
        """Return ``transaction[key]``, or ``None`` if the key is absent or the value is NaN."""
        if key not in transaction:
            return None
        value = transaction[key]
        return None if pd.isna(value) else value

    def explain_anomaly(self, transaction: pd.Series, user_stats: Dict = None) -> List[str]:
        """Generate explanations for a single anomalous transaction.

        Args:
            transaction: Row (or mapping) holding the raw and engineered
                features of one transaction.
            user_stats: Accepted for interface compatibility; not used.

        Returns:
            A non-empty list of explanation strings. When no rule fires, a
            single generic message is returned.
        """
        explanations = []

        # --- Amount deviation ---
        ratio = self._present(transaction, 'AmountRatio_Mean')
        if ratio is not None:
            limit = self.thresholds['amount_ratio']
            # The full threshold and the half-threshold "elevated" tier
            # report the same text, so they are evaluated together.
            if ratio > limit or ratio > limit * 0.5:
                explanations.append(
                    f"Transaction amount is {ratio:.1f}x higher than user's average"
                )

        raw_z = self._present(transaction, 'Amount_ZScore')
        if raw_z is not None:
            z_score = abs(raw_z)
            if z_score > self.thresholds['z_score']:
                explanations.append(
                    f"Transaction amount deviates by {z_score:.1f} standard deviations "
                    f"from user's normal spending"
                )

        # --- Time-based anomalies ---
        hour = self._present(transaction, 'Hour')
        if hour is not None and (hour >= 22 or hour <= 5):
            # Rows produced by DataFrame.iterrows() on an all-numeric frame
            # are upcast to float, so render the hour as an integer to avoid
            # messages such as "23.0:00".
            explanations.append(f"Unusual transaction time: {int(hour)}:00 (night hours)")

        hour_dist = self._present(transaction, 'Hour_Distance')
        if hour_dist is not None and hour_dist > self.thresholds['hour_distance']:
            explanations.append(
                f"Transaction time is {hour_dist:.0f} hours away from user's "
                f"typical transaction hour"
            )

        if self._present(transaction, 'IsWeekend') == 1:
            explanations.append("Transaction occurred on a weekend")

        # --- Frequency anomalies ---
        time_since = self._present(transaction, 'TimeSinceLastTx')
        if time_since is not None:
            if time_since > self.thresholds['time_since_last']:
                explanations.append(
                    f"Unusual transaction pattern: {time_since:.0f} hours since last transaction"
                )
            elif time_since < 1:
                explanations.append("Rapid succession: multiple transactions within 1 hour")

        # --- Category information ---
        # The category is reported whenever the column exists, so the generic
        # fallback below only triggers for transactions without this column.
        if 'Merchant Category' in transaction:
            category = transaction['Merchant Category']
            explanations.append(f"Merchant category: {category}")

        entropy = self._present(transaction, 'Category_Entropy')
        if entropy is not None and entropy < 1.0:
            explanations.append(
                "User typically has low category diversity - this transaction may be unusual"
            )

        # If no specific explanations were found, provide a general one.
        if not explanations:
            explanations.append("Anomaly detected based on combined feature analysis")

        return explanations

    def explain_batch(self, df: pd.DataFrame, user_stats: Dict = None) -> pd.DataFrame:
        """Return a copy of ``df`` with an ``Explanation`` column added.

        Each row's explanations are joined with ``'; '``. The input frame is
        not modified.
        """
        df = df.copy()
        df['Explanation'] = [
            '; '.join(self.explain_anomaly(row, user_stats))
            for _, row in df.iterrows()
        ]
        return df

    def get_feature_importance(self, transaction: pd.Series) -> Dict[str, float]:
        """Estimate how much each feature group contributes to the anomaly.

        Each available feature is mapped to a heuristic score clamped to
        ``[0, 1]``; the scores are then normalised to sum to 1 when their
        total is positive. Groups whose feature is absent or NaN are omitted.
        """
        def clamp(score):
            # Negative inputs (e.g. a negative time delta) must not yield
            # negative importance, which would distort the normalisation.
            return max(0.0, min(score, 1.0))

        importance = {}

        z_score = self._present(transaction, 'Amount_ZScore')
        if z_score is not None:
            importance['Amount'] = clamp(abs(z_score) / 5.0)

        hour_dist = self._present(transaction, 'Hour_Distance')
        if hour_dist is not None:
            importance['Time'] = clamp(hour_dist / 12.0)

        time_since = self._present(transaction, 'TimeSinceLastTx')
        if time_since is not None:
            importance['Frequency'] = clamp(time_since / 72.0)

        entropy = self._present(transaction, 'Category_Entropy')
        if entropy is not None:
            # Lower entropy -> higher importance.
            importance['Category'] = clamp(1.0 - entropy / 3.0)

        # Normalize importance scores
        total = sum(importance.values()) if importance else 1
        if total > 0:
            importance = {name: score / total for name, score in importance.items()}

        return importance

    def generate_radar_data(self, df: pd.DataFrame) -> Dict[str, List]:
        """Build label/value lists for a radar chart.

        Returns:
            ``{'labels': [...], 'values': [...]}`` with values scaled towards
            a 0-100 range. For an empty frame both lists are empty; the
            legacy ``'datasets'`` key is still included in that case so
            existing callers keep working.
        """
        if df.empty:
            # 'values' is included so the empty result has the same keys the
            # non-empty result exposes.
            return {'labels': [], 'values': [], 'datasets': []}

        has_amount = 'Amount' in df.columns

        # Calculate metrics for radar chart
        metrics = {
            'Amount': df['Amount'].mean() if has_amount else 0,
            'Frequency': df.get('TxCount_Window', pd.Series([1])).mean(),
            'Time Variance': df.get('Hour_Variance', pd.Series([0])).mean(),
            'Category Diversity': df.get('Category_Entropy', pd.Series([0])).mean()
        }

        # Reference maxima used to normalize to a 0-100 scale
        max_vals = {
            'Amount': df['Amount'].max() if has_amount else 1,
            'Frequency': metrics['Frequency'] * 2,
            'Time Variance': 50,
            'Category Diversity': 3
        }

        normalized = {}
        for label in ('Amount', 'Frequency'):
            # A non-positive (or NaN) reference falls back to the midpoint.
            if max_vals[label] > 0:
                normalized[label] = metrics[label] / max_vals[label] * 100
            else:
                normalized[label] = 50
        for label in ('Time Variance', 'Category Diversity'):
            normalized[label] = metrics[label] / max_vals[label] * 100

        return {
            'labels': list(normalized.keys()),
            'values': list(normalized.values())
        }

    def compare_with_user_baseline(self, transaction: pd.Series, user_baseline: Dict) -> Dict[str, Any]:
        """Compare a transaction with the user's baseline behavior.

        Returns:
            A dict that may contain ``'amount_vs_avg'`` (amount divided by the
            baseline mean), ``'hour'`` and ``'category'``. ``'amount_vs_avg'``
            is omitted when the baseline mean is missing, NaN or zero, since
            the ratio is undefined in those cases.
        """
        comparison = {}

        if 'Amount' in transaction and 'mean' in user_baseline:
            baseline_mean = user_baseline['mean']
            if baseline_mean is not None and not pd.isna(baseline_mean) and baseline_mean != 0:
                comparison['amount_vs_avg'] = transaction['Amount'] / baseline_mean

        if 'Hour' in transaction:
            comparison['hour'] = transaction['Hour']

        if 'Merchant Category' in transaction:
            comparison['category'] = transaction['Merchant Category']

        return comparison
|
feature_columns.pkl
ADDED
|
Binary file (171 Bytes). View file
|
|
|
isolation_forest_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e83f211f7ee754655cb43cf77877a801febbc72b3e36ea27728e848c5e922948
|
| 3 |
+
size 1263449
|
isolation_forest_scaler.pkl
ADDED
|
Binary file (863 Bytes). View file
|
|
|
lof_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b0a408c1618194e4662b2d3d3e41335a70923d38906fac79e7ec5d0856e92e4
|
| 3 |
+
size 1854630
|
lof_scaler.pkl
ADDED
|
Binary file (863 Bytes). View file
|
|
|
predict.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
import os
|
| 5 |
+
from typing import Dict, List, Tuple, Any
|
| 6 |
+
from model.train import AnomalyDetectorTrainer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AnomalyPredictor:
    """Predict anomalies in credit card transactions.

    Wraps an ``AnomalyDetectorTrainer`` to load a persisted model, its scaler
    and the feature-column list, and applies them to new data.
    """

    def __init__(self, model_dir: str = 'model'):
        self.model_dir = model_dir
        self.trainer = AnomalyDetectorTrainer(model_dir)
        self.current_model = None
        self.current_scaler = None
        self.feature_columns = []

    def load_model(self, model_type: str = 'isolation_forest') -> bool:
        """Load the trained model of ``model_type`` through the trainer.

        Returns:
            The trainer's success flag. On success the model, scaler and
            feature columns are cached on this instance.
        """
        success = self.trainer.load_model(model_type)
        if success:
            self.current_model = self.trainer.models[model_type]
            self.current_scaler = self.trainer.scalers[model_type]
            self.feature_columns = self.trainer.feature_columns
        return success

    def prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, List[str]]:
        """Build the (optionally scaled) feature matrix for prediction.

        Args:
            df: Transactions to score. The frame is not modified.

        Returns:
            ``(X, used_features)`` where ``X`` holds the selected columns,
            imputed and passed through the loaded scaler when one is set, and
            ``used_features`` lists the column names in matrix order.

        Raises:
            ValueError: If none of the expected feature columns, nor
                ``'Amount'``, is present in ``df``.
        """
        df = df.copy()

        # Select available feature columns, preserving the stored order.
        # NOTE(review): if only a subset of the stored columns is present,
        # the matrix width differs from what the scaler/model were fitted on
        # and they are expected to reject it - confirm against callers.
        available_features = [col for col in self.feature_columns if col in df.columns]

        if not available_features:
            # Fallback to Amount if available
            if 'Amount' not in df.columns:
                raise ValueError("No valid features found in dataframe")
            available_features = ['Amount']

        # Fill missing values with the column median. A column with no
        # observed values has a NaN median (typical for a single-row frame),
        # so fall back to 0.0 to keep NaN out of the model input.
        for col in available_features:
            median = df[col].median()
            df[col] = df[col].fillna(0.0 if pd.isna(median) else median)

        X = df[available_features].values

        # Scale features
        if self.current_scaler is not None:
            X = self.current_scaler.transform(X)

        return X, available_features

    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """Score transactions and return a copy of ``df`` with result columns.

        Added columns:
            ``Prediction``: raw model output (-1 = anomaly, 1 = normal).
            ``Anomaly_Score``: ``decision_function`` output when the model has
                one, else ``score_samples``, else zeros.
            ``Status``: ``'Anomalous'`` or ``'Normal'``.
            ``Confidence``: ``|score|`` divided by the largest ``|score|`` in
                this call, so it is relative to the batch being scored.

        An empty frame is returned with the four columns added and no rows.

        Raises:
            ValueError: If no model has been loaded, or no usable feature
                column exists.
        """
        if self.current_model is None:
            raise ValueError("No model loaded. Call load_model() first.")

        df = df.copy()

        if len(df) == 0:
            # Nothing to score: np.max() on an empty array would raise, so
            # return the expected schema directly.
            df['Prediction'] = np.array([], dtype='int64')
            df['Anomaly_Score'] = np.array([], dtype='float64')
            df['Status'] = np.array([], dtype=object)
            df['Confidence'] = np.array([], dtype='float64')
            return df

        # Prepare features
        X_scaled, _used_features = self.prepare_features(df)

        # Get predictions
        predictions = np.asarray(self.current_model.predict(X_scaled))

        # Get anomaly scores
        if hasattr(self.current_model, 'decision_function'):
            scores = self.current_model.decision_function(X_scaled)
        elif hasattr(self.current_model, 'score_samples'):
            scores = self.current_model.score_samples(X_scaled)
        else:
            scores = np.zeros(len(X_scaled))

        magnitude = np.abs(scores)

        # Convert predictions to labels (-1 = anomaly, 1 = normal)
        df['Prediction'] = predictions
        df['Anomaly_Score'] = scores
        df['Status'] = np.where(predictions == -1, 'Anomalous', 'Normal')
        # The epsilon avoids division by zero when every score is 0.
        df['Confidence'] = magnitude / (np.max(magnitude) + 1e-8)

        return df

    def predict_single(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
        """Predict anomaly for a single transaction given as a mapping.

        Returns:
            A dict with ``transaction_id`` (``'Unknown'`` when the
            ``'Transaction ID'`` key is absent), ``prediction``,
            ``anomaly_score`` and ``confidence``. Because confidence is
            batch-relative, a single transaction with a non-zero score gets a
            confidence of approximately 1.
        """
        result = self.predict(pd.DataFrame([transaction]))
        row = result.iloc[0]

        return {
            'transaction_id': transaction.get('Transaction ID', 'Unknown'),
            'prediction': row['Status'],
            'anomaly_score': float(row['Anomaly_Score']),
            'confidence': float(row['Confidence'])
        }

    def get_anomalies(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return only the rows of ``predict(df)`` flagged as anomalous."""
        predictions = self.predict(df)
        return predictions[predictions['Status'] == 'Anomalous']

    def get_statistics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Summarise prediction results for ``df``.

        Returns:
            Counts of total, anomalous and normal transactions, the detection
            rate as a percentage (0 for an empty frame) and the mean anomaly
            score of the anomalous rows (0 when there are none).
        """
        predictions = self.predict(df)

        is_anomalous = predictions['Status'] == 'Anomalous'
        total = len(predictions)
        anomalies = int(is_anomalous.sum())

        if anomalies > 0:
            average_score = float(predictions.loc[is_anomalous, 'Anomaly_Score'].mean())
        else:
            average_score = 0

        return {
            'total_transactions': total,
            'anomalies_detected': anomalies,
            'normal_transactions': total - anomalies,
            'detection_rate': (anomalies / total * 100) if total > 0 else 0,
            'average_anomaly_score': average_score
        }

    def batch_predict(self, df: pd.DataFrame, batch_size: int = 1000) -> pd.DataFrame:
        """Predict in consecutive slices of ``batch_size`` rows.

        Median imputation and ``Confidence`` are computed per slice, so they
        can differ from a single ``predict`` call on the whole frame. The
        result has a fresh 0..n-1 index.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        if len(df) == 0:
            # pd.concat() rejects an empty list; return the empty schema.
            return self.predict(df).reset_index(drop=True)

        results = [
            self.predict(df.iloc[start:start + batch_size])
            for start in range(0, len(df), batch_size)
        ]

        return pd.concat(results, ignore_index=True)
|
train.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.ensemble import IsolationForest
|
| 4 |
+
from sklearn.neighbors import LocalOutlierFactor
|
| 5 |
+
from sklearn.preprocessing import StandardScaler
|
| 6 |
+
import joblib
|
| 7 |
+
import os
|
| 8 |
+
from typing import Dict, Tuple, Any
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AnomalyDetectorTrainer:
    """Train anomaly detection models for credit card transactions.

    Trained models and their scalers are kept in ``self.models`` and
    ``self.scalers`` keyed by model type, and persisted under ``model_dir``.

    NOTE: a single ``feature_columns.pkl`` file is shared by every model type
    in ``model_dir``; saving a model overwrites it with the columns of the
    most recent ``prepare_features`` call.
    """

    # Candidate feature columns, in the order they appear in the matrix.
    # The order matters: persisted scalers/models depend on it.
    _CANDIDATE_FEATURES = (
        # Core features
        'Amount',
        'Amount_ZScore',
        'AmountRatio_Mean',
        # Time features
        'Hour',
        'DayOfWeek',
        'IsWeekend',
        'IsNight',
        'Hour_Distance',
        # Frequency features
        'TimeSinceLastTx',
        'TxCount_Window',
        # Category features
        'Category_Entropy',
        'Merchant_Category_Encoded',
    )

    _FEATURE_COLUMNS_FILE = 'feature_columns.pkl'

    def __init__(self, model_dir: str = 'model'):
        self.model_dir = model_dir
        self.models = {}
        self.scalers = {}
        self.feature_columns = []

        # Create model directory if it doesn't exist
        os.makedirs(model_dir, exist_ok=True)

    def prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, list]:
        """Select and impute the feature columns used for training.

        Args:
            df: Training data. The frame is not modified.

        Returns:
            ``(X, feature_names)``; ``X`` contains the known feature columns
            present in ``df`` with missing values replaced by the column
            median (or 0.0 when the column has no observed values).

        Side effects:
            Stores the selected column names in ``self.feature_columns``.

        Raises:
            ValueError: If ``df`` contains none of the known feature columns.
        """
        df = df.copy()

        available_features = [col for col in self._CANDIDATE_FEATURES if col in df.columns]

        if not available_features:
            # 'Amount' is itself a candidate, so there is nothing to fall
            # back to; fail with a clear message instead of a KeyError.
            raise ValueError("No valid features found in dataframe")

        # Fill missing values
        for col in available_features:
            median = df[col].median()
            # An all-NaN column has a NaN median; use 0.0 so no NaN reaches
            # the scaler/model.
            df[col] = df[col].fillna(0.0 if pd.isna(median) else median)

        self.feature_columns = available_features
        X = df[available_features].values

        return X, available_features

    def train_isolation_forest(
        self,
        X: np.ndarray,
        contamination: float = 0.1,
        n_estimators: int = 100,
        random_state: int = 42
    ) -> "IsolationForest":
        """Fit and return an Isolation Forest on ``X`` using all CPU cores."""
        model = IsolationForest(
            contamination=contamination,
            n_estimators=n_estimators,
            random_state=random_state,
            n_jobs=-1
        )
        model.fit(X)
        return model

    def train_lof(
        self,
        X: np.ndarray,
        contamination: float = 0.1,
        n_neighbors: int = 20
    ) -> "LocalOutlierFactor":
        """Fit and return a Local Outlier Factor model on ``X``.

        ``novelty=True`` is set so the fitted model can score unseen data.
        """
        model = LocalOutlierFactor(
            contamination=contamination,
            n_neighbors=n_neighbors,
            novelty=True
        )
        model.fit(X)
        return model

    def train_models(
        self,
        df: pd.DataFrame,
        model_type: str = 'isolation_forest',
        contamination: float = 0.1
    ) -> Dict[str, Any]:
        """Train, store and persist the selected anomaly detection model.

        Args:
            df: Training data.
            model_type: ``'isolation_forest'`` or ``'lof'``.
            contamination: Passed through to the model.

        Returns:
            A summary dict with the model type, contamination, feature
            columns, sample count and feature count.

        Raises:
            ValueError: For an unknown ``model_type`` or when ``df`` has no
                usable feature column.
        """
        # Prepare features
        X, feature_cols = self.prepare_features(df)

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Train model based on type
        if model_type == 'isolation_forest':
            model = self.train_isolation_forest(X_scaled, contamination=contamination)
        elif model_type == 'lof':
            model = self.train_lof(X_scaled, contamination=contamination)
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        # Store model and scaler
        self.models[model_type] = model
        self.scalers[model_type] = scaler

        # Save models
        self.save_model(model_type)

        return {
            'model_type': model_type,
            'contamination': contamination,
            'feature_columns': feature_cols,
            'n_samples': len(X),
            'n_features': X.shape[1]
        }

    def _artifact_paths(self, model_type: str) -> Tuple[str, str, str]:
        """Return the (model, scaler, feature-columns) file paths for ``model_type``."""
        return (
            os.path.join(self.model_dir, f'{model_type}_model.pkl'),
            os.path.join(self.model_dir, f'{model_type}_scaler.pkl'),
            os.path.join(self.model_dir, self._FEATURE_COLUMNS_FILE),
        )

    def save_model(self, model_type: str):
        """Save the trained model, its scaler and the feature columns to disk.

        Raises:
            KeyError: If ``model_type`` has not been trained or loaded.
        """
        model_path, scaler_path, columns_path = self._artifact_paths(model_type)

        joblib.dump(self.models[model_type], model_path)
        joblib.dump(self.scalers[model_type], scaler_path)
        joblib.dump(self.feature_columns, columns_path)

    def load_model(self, model_type: str) -> bool:
        """Load the model, scaler and feature columns for ``model_type``.

        Returns:
            ``True`` when all three files exist and were loaded, ``False``
            when any of them is missing (nothing is loaded in that case).
        """
        paths = self._artifact_paths(model_type)

        # All three artifacts are required; a missing feature-column file
        # must yield False rather than an exception from joblib.load().
        if not all(os.path.exists(path) for path in paths):
            return False

        model_path, scaler_path, columns_path = paths

        # SECURITY: joblib.load() unpickles arbitrary objects; only load
        # artifacts from a trusted model directory.
        self.models[model_type] = joblib.load(model_path)
        self.scalers[model_type] = joblib.load(scaler_path)
        self.feature_columns = joblib.load(columns_path)
        return True

    def get_model_info(self, model_type: str) -> Dict[str, Any]:
        """Return descriptive information about a trained/loaded model.

        Returns:
            An empty dict when ``model_type`` is not held in ``self.models``;
            otherwise the model type, feature columns, an ``is_trained`` flag
            and the hyper-parameters specific to the model type.
        """
        if model_type not in self.models:
            return {}

        model = self.models[model_type]

        info = {
            'model_type': model_type,
            'feature_columns': self.feature_columns,
            'is_trained': True
        }

        if model_type == 'isolation_forest':
            info.update({
                'n_estimators': model.n_estimators,
                'contamination': model.contamination,
                'max_samples': model.max_samples
            })
        elif model_type == 'lof':
            info.update({
                'n_neighbors': model.n_neighbors,
                'contamination': model.contamination
            })

        return info
|