# FinRisk-AI/src/tests/pipeline.py
import json
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from config import DATA_PATH, MODELS_PATH, MODEL_FILENAME, FEATURES_PATH

# Top 10 features collected as user input (a subset of the full training schema)
SELECTED_FEATURES = [
    'Credit_Mix_Ordinal',
    'Outstanding_Debt',
    'Delay_from_due_date',
    'Payment_of_Min_Amount_Yes',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Delayed_Payment',
    'Installment_to_Income',
    'Num_Bank_Accounts',
    'Num_Credit_Inquiries'
]


def run_pipeline():
    """Train the stacking ensemble on the processed data and persist the artifacts."""
    print("Starting pipeline...")

    # Load processed training data
    train_processed_path = os.path.join(DATA_PATH, 'processed', 'train_processed.csv')
    if not os.path.exists(train_processed_path):
        raise FileNotFoundError(f"Processed training data not found at {train_processed_path}")
    train_processed = pd.read_csv(train_processed_path)

    # Split features and target. The model is trained on ALL features;
    # the top-10 list above is only the subset exposed to users.
    # Credit_Score is assumed to be integer-encoded upstream, since
    # XGBClassifier rejects string class labels.
    target = 'Credit_Score'
    if target not in train_processed.columns:
        raise ValueError("Target column 'Credit_Score' is missing from processed training data.")
    X = train_processed.drop(target, axis=1)
    y = train_processed[target]
    ALL_FEATURES = X.columns.tolist()
    print(f"Training model using ALL {len(ALL_FEATURES)} features...")
    print(f"Training data loaded: {X.shape[0]} samples, {X.shape[1]} features")

    # Define the base learners
    rf_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        class_weight='balanced',
        criterion='entropy',
        random_state=1907,
        n_jobs=-1
    )
    xgb_model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        random_state=1907,
        verbosity=0
    )

    # Stack the base learners: with cv=5, the logistic-regression meta-learner is
    # fit on out-of-fold predictions, which limits leakage from the base models.
    stacking_clf = StackingClassifier(
        estimators=[('rf', rf_model), ('xgb', xgb_model)],
        final_estimator=LogisticRegression(max_iter=1000, random_state=1907),
        cv=5
    )

    # Fit the full ensemble (base learners are refit on the complete training set)
    print("Training stacking classifier...")
    stacking_clf.fit(X, y)
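
    # Quick smoke test, not a generalization estimate: this is in-sample accuracy
    # only, since the held-out test split is not loaded in this script.
    train_acc = stacking_clf.score(X, y)
    print(f"In-sample accuracy: {train_acc:.3f}")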

    # Persist the trained model
    os.makedirs(MODELS_PATH, exist_ok=True)
    model_path = os.path.join(MODELS_PATH, MODEL_FILENAME)
    joblib.dump(stacking_clf, model_path)
    print(f"Model saved to {model_path}")

    # Save BOTH feature lists: the full training schema and the top-10 user-input subset
    feature_data = {
        "all_features": ALL_FEATURES,
        "top_10_features": SELECTED_FEATURES
    }
    with open(FEATURES_PATH, 'w') as f:
        json.dump(feature_data, f, indent=4)
    print("Feature lists saved.")

    print("Pipeline completed successfully.")


if __name__ == "__main__":
    run_pipeline()