File size: 2,425 Bytes
4ba360f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
"""
Save the trained model and artifacts
"""

import joblib
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

def main(data_path='data/processed/telco_churn_processed.csv',
         artifacts_dir='artifacts',
         threshold=0.35) -> None:
    """Train the churn classifier, persist it, and print test-split metrics.

    Steps performed, in order:

    1. Read the CSV at ``data_path`` and map the ``Churn`` column from
       ``'No'``/``'Yes'`` to ``0``/``1``.
    2. Split into an 80/20 stratified train/test partition (``random_state=42``).
    3. Fit an ``XGBClassifier`` whose ``scale_pos_weight`` is the
       negative/positive ratio of the training labels.
    4. Write ``model.pkl`` (via ``joblib``) and ``feature_columns.json``
       into ``artifacts_dir`` (created if missing).
    5. Print accuracy, precision, recall, F1 (at ``threshold``) and ROC AUC
       (on raw probabilities) for the test partition.

    Args:
        data_path: CSV file to load. Every column other than ``Churn`` is
            used as a feature.
        artifacts_dir: Directory that receives the model and the feature list.
        threshold: Probability cut-off on the positive class used for the
            thresholded metrics.

    Raises:
        ValueError: If ``Churn`` contains a value other than ``'No'``/``'Yes'``
            (including missing values), or if the training partition has no
            positive rows.
    """
    # Kept function-local (as in the original) so this function does not
    # depend on any change to the file-level import block.
    import os
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # Load the processed data
    df = pd.read_csv(data_path)

    # Convert target to numeric. Series.map yields NaN for any label missing
    # from the dict, so fail loudly here instead of training on NaN labels.
    churn = df['Churn'].map({'No': 0, 'Yes': 1})
    unmapped = churn.isna()
    if unmapped.any():
        bad_labels = sorted(
            df.loc[unmapped, 'Churn'].astype(str).unique().tolist()
        )
        raise ValueError(
            f"Unexpected 'Churn' labels (expected 'No'/'Yes'): {bad_labels}"
        )
    df['Churn'] = churn.astype(int)

    # Separate features and target
    feature_columns = [col for col in df.columns if col != 'Churn']
    X = df[feature_columns]
    y = df['Churn']

    # Train model
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Weight the positive class by the negative/positive ratio; guard the
    # division so an all-negative training set is reported clearly.
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    if n_positive == 0:
        raise ValueError(
            "Training split contains no positive ('Yes') rows; "
            "cannot compute scale_pos_weight"
        )
    scale_pos_weight = n_negative / n_positive
    print(f"Class imbalance ratio: {scale_pos_weight:.2f}")

    model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        n_jobs=-1,
        eval_metric="logloss",
        scale_pos_weight=scale_pos_weight,
    )

    print("Training model...")
    model.fit(X_train, y_train)

    # Save model
    os.makedirs(artifacts_dir, exist_ok=True)

    model_path = os.path.join(artifacts_dir, 'model.pkl')
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")

    # Save feature columns
    feature_columns_path = os.path.join(artifacts_dir, 'feature_columns.json')
    with open(feature_columns_path, 'w', encoding='utf-8') as f:
        json.dump(feature_columns, f)
    print(f"Feature columns saved to {feature_columns_path}")

    # Test the model. Only the positive-class probabilities are needed: the
    # hard labels come from the custom threshold, not from model.predict().
    # NOTE(review): the threshold is used for reporting only; it is not
    # written to the artifacts directory.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred_thresholded = (y_pred_proba >= threshold).astype(int)

    metrics = {
        'accuracy': accuracy_score(y_test, y_pred_thresholded),
        'precision': precision_score(y_test, y_pred_thresholded),
        'recall': recall_score(y_test, y_pred_thresholded),
        'f1': f1_score(y_test, y_pred_thresholded),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
    }

    print("\nModel Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.3f}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()