import json
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import joblib
import os

from config import DATA_PATH, MODELS_PATH, MODEL_FILENAME, FEATURES_PATH

# Top 10 features for user input
SELECTED_FEATURES = [
    'Credit_Mix_Ordinal',
    'Outstanding_Debt',
    'Delay_from_due_date',
    'Payment_of_Min_Amount_Yes',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Delayed_Payment',
    'Installment_to_Income',
    'Num_Bank_Accounts',
    'Num_Credit_Inquiries'
]


def run_pipeline():
    print("Starting pipeline...")

    # Load processed training data
    train_processed_path = os.path.join(DATA_PATH, 'processed', 'train_processed.csv')
    if not os.path.exists(train_processed_path):
        raise FileNotFoundError(f"Processed training data not found at {train_processed_path}")
    train_processed = pd.read_csv(train_processed_path)

    # Train model with ALL features except the target
    target = 'Credit_Score'
    if target not in train_processed.columns:
        raise ValueError("Target column 'Credit_Score' is missing from processed training data.")

    X = train_processed.drop(target, axis=1)
    y = train_processed[target]
    ALL_FEATURES = X.columns.tolist()

    print(f"Training model using ALL {len(ALL_FEATURES)} features...")
    print(f"Training data loaded: {X.shape[0]} samples, {X.shape[1]} features")

    # Define base models and the stacking ensemble
    rf_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        class_weight='balanced',
        criterion='entropy',
        random_state=1907,
        n_jobs=-1
    )

    xgb_model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        random_state=1907,
        verbosity=0
    )

    stacking_clf = StackingClassifier(
        estimators=[('rf', rf_model), ('xgb', xgb_model)],
        final_estimator=LogisticRegression(max_iter=1000, random_state=1907),
        cv=5
    )

    # Train model
    print("Training stacking classifier...")
    stacking_clf.fit(X, y)

    # Save model
    os.makedirs(MODELS_PATH, exist_ok=True)
    model_path = os.path.join(MODELS_PATH, MODEL_FILENAME)
    joblib.dump(stacking_clf, model_path)
    print(f"Model saved to {model_path}")

    # Save BOTH feature lists
    feature_data = {
        "all_features": ALL_FEATURES,
        "top_10_features": SELECTED_FEATURES
    }
    with open(FEATURES_PATH, 'w') as f:
        json.dump(feature_data, f, indent=4)
    print("Feature lists saved.")

    print("Pipeline completed successfully.")


if __name__ == "__main__":
    run_pipeline()
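

# --- Usage sketch (illustrative only, not part of the pipeline) ---
# A minimal sketch of how the artifacts saved above could be loaded for
# inference. It assumes an input row already processed into the same
# columns recorded under "all_features"; the sample values are hypothetical.
#
#   model = joblib.load(os.path.join(MODELS_PATH, MODEL_FILENAME))
#   with open(FEATURES_PATH) as f:
#       feature_data = json.load(f)
#   sample = pd.DataFrame([row_values], columns=feature_data["all_features"])
#   prediction = model.predict(sample)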