File size: 2,809 Bytes
95409ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import json
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import joblib
import os
from config import DATA_PATH, MODELS_PATH, MODEL_FILENAME, FEATURES_PATH

# The ten feature columns intended for user input. run_pipeline() writes this
# list to the features file under the "top_10_features" key; the model itself
# is trained on every non-target column, not only on these.
SELECTED_FEATURES = [
    "Credit_Mix_Ordinal",
    "Outstanding_Debt",
    "Delay_from_due_date",
    "Payment_of_Min_Amount_Yes",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Delayed_Payment",
    "Installment_to_Income",
    "Num_Bank_Accounts",
    "Num_Credit_Inquiries",
]

def _build_stacking_classifier():
    """Return the untrained stacking ensemble (random forest + XGBoost -> logistic regression)."""
    rf_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        class_weight='balanced',
        criterion='entropy',
        random_state=1907,
        n_jobs=-1
    )

    xgb_model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        random_state=1907,
        verbosity=0
    )

    return StackingClassifier(
        estimators=[('rf', rf_model), ('xgb', xgb_model)],
        final_estimator=LogisticRegression(max_iter=1000, random_state=1907),
        cv=5
    )


def run_pipeline():
    """Train the stacking classifier on the processed training data and persist it.

    Steps:
      1. Load ``<DATA_PATH>/processed/train_processed.csv``.
      2. Fit the stacking ensemble on every column except ``Credit_Score``.
      3. Save the fitted model to ``<MODELS_PATH>/<MODEL_FILENAME>`` with joblib.
      4. Write a JSON file at ``FEATURES_PATH`` containing ``all_features``
         (the training columns) and ``top_10_features`` (``SELECTED_FEATURES``).

    Raises:
        FileNotFoundError: if the processed training CSV does not exist.
        ValueError: if the ``Credit_Score`` target column is missing.
    """
    print("Starting pipeline...")

    # Load processed training data
    train_processed_path = os.path.join(DATA_PATH, 'processed', 'train_processed.csv')
    if not os.path.exists(train_processed_path):
        raise FileNotFoundError(f"Processed training data not found at {train_processed_path}")

    train_processed = pd.read_csv(train_processed_path)

    # Train model with ALL FEATURES except the target
    target = 'Credit_Score'
    if target not in train_processed.columns:
        raise ValueError("Target column 'Credit_Score' is missing from processed training data.")

    X = train_processed.drop(columns=[target])
    y = train_processed[target]

    ALL_FEATURES = X.columns.tolist()

    # Surface a mismatch before the (long) training run: every selected
    # feature is expected to be one of the columns the model is trained on.
    # Only warn, so an existing setup that tolerates the mismatch keeps working.
    missing_selected = [name for name in SELECTED_FEATURES if name not in ALL_FEATURES]
    if missing_selected:
        print(f"Warning: selected features missing from training data: {missing_selected}")

    print(f"Training model using ALL {len(ALL_FEATURES)} features...")
    print(f"Training data loaded: {X.shape[0]} samples, {X.shape[1]} features")

    # Define models
    stacking_clf = _build_stacking_classifier()

    # Train model
    print("Training stacking classifier...")
    stacking_clf.fit(X, y)

    # Save model
    os.makedirs(MODELS_PATH, exist_ok=True)
    model_path = os.path.join(MODELS_PATH, MODEL_FILENAME)
    joblib.dump(stacking_clf, model_path)
    print(f"Model saved to {model_path}")

    # Save BOTH feature lists
    feature_data = {
        "all_features": ALL_FEATURES,
        "top_10_features": SELECTED_FEATURES
    }

    # Make sure the destination directory exists so a completed training run
    # is not lost to a missing folder. dirname is '' for a bare filename, and
    # os.makedirs('') would raise, hence the guard.
    features_dir = os.path.dirname(FEATURES_PATH)
    if features_dir:
        os.makedirs(features_dir, exist_ok=True)

    # Explicit encoding keeps the file identical across platforms.
    with open(FEATURES_PATH, 'w', encoding='utf-8') as f:
        json.dump(feature_data, f, indent=4)

    print("Feature lists saved.")
    print("Pipeline completed successfully.")


# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_pipeline()