File size: 2,630 Bytes
d1fb1ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import duckdb
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from gold import setup_gold_layer

def train_model():
    """
    Train a RandomForestClassifier on the gold layer data and persist it.

    Steps performed, in order:

    1. Call ``setup_gold_layer()`` (presumably builds/refreshes the gold
       tables -- see the ``gold`` module).
    2. Load every row of ``gold.gold_transactions`` from the DuckDB file at
       ``../data/fraud_detection.duckdb`` into a pandas DataFrame.
    3. Drop identifier / raw timestamp columns and the target, then one-hot
       encode ``merchant``, ``category``, ``gender`` and ``job``.
    4. Do a stratified 70/30 train/test split, fit the forest, and print a
       classification report and confusion matrix for the held-out set.
    5. Save the fitted model and the encoded column index under
       ``../models`` with joblib.

    NOTE: both ``../data`` and ``../models`` are resolved relative to the
    current working directory, not to this file's location.

    Returns:
        None. Results are printed and artifacts are written to disk.
    """
    # Ensure the full data pipeline has been run
    setup_gold_layer()

    db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
    con = duckdb.connect(database=db_path, read_only=False)
    try:
        print("Loading data from gold.gold_transactions...")
        # Load the entire table into a pandas DataFrame
        df = con.execute("SELECT * FROM gold.gold_transactions").fetchdf()
    finally:
        # Always release the database handle, even if the query fails,
        # so the DuckDB file is not left locked by this process.
        con.close()

    print("Preparing data for training...")

    # Define features (X) and target (y)
    # Exclude identifiers, raw timestamps, and the target variable itself
    excluded_columns = {
        'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'dob',
        'trans_num', 'trans_date_trans_time', 'trans_date_time', 'is_fraud',
    }
    features = [col for col in df.columns if col not in excluded_columns]

    X = df[features]
    y = df['is_fraud']

    # One-hot encode categorical features
    categorical_features = ['merchant', 'category', 'gender', 'job']
    X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

    # Remember the encoded column layout so inference-time frames can be
    # aligned to exactly what the model was fitted on.
    train_cols = X.columns

    # Split data into training and testing sets (stratified on the target so
    # both sides keep the same fraud ratio).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print("Training RandomForestClassifier model...")
    # Initialize and train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    print("Evaluating model performance...")
    # Make predictions and evaluate
    y_pred = model.predict(X_test)

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Save the trained model and the column list
    model_path = os.path.join('..', 'models')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(model_path, exist_ok=True)

    joblib.dump(model, os.path.join(model_path, 'fraud_detection_model.joblib'))
    joblib.dump(train_cols, os.path.join(model_path, 'model_columns.joblib'))

    print(f"Model saved to {model_path}")

if __name__ == "__main__":
    # Script entry point: train_model() calls setup_gold_layer() itself
    # before training, so no separate pipeline step is invoked here.
    train_model()