# credit-risk-api / src/train.py
# junaid17's picture
# Update src/train.py
# d59cad8 verified
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from src.utils import load_config, get_versioned_path
from src.ingestion import load_raw_data
from src.preprocessing import clean_and_engineer
def train_pipeline():
    """Train the credit model end to end and persist its artifacts.

    The pipeline loads the configuration and raw data, runs
    ``clean_and_engineer`` on the data, separates the configured target
    column from the features, one-hot encodes the features (dropping the
    first level of each encoded column), min-max scales them, rebalances
    the scaled data with ``SMOTETomek``, and fits a ``LogisticRegression``
    built from ``config["model"]["params"]``.

    The fitted model, the fitted scaler, and the list of encoded column
    names are each written with ``joblib.dump`` to a path obtained from
    ``get_versioned_path`` under ``config["artifacts"]["model_dir"]``.

    Returns:
        A tuple ``(model_path, scaler_path, columns_path)`` holding the
        paths the three artifacts were written to.
    """
    cfg = load_config()
    frame = clean_and_engineer(load_raw_data())

    # Separate the label column from the feature columns.
    label = cfg["data"]["target"]
    features = frame.drop(columns=[label])
    labels = frame[label]

    # Encode first, then scale; the scaler is fitted on the encoded frame.
    encoded = pd.get_dummies(features, drop_first=True)
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(encoded)

    # Rebalance the classes before fitting the classifier.
    resampler = SMOTETomek(random_state=cfg["training"]["random_state"])
    balanced_features, balanced_labels = resampler.fit_resample(scaled, labels)

    model = LogisticRegression(**cfg["model"]["params"])
    model.fit(balanced_features, balanced_labels)

    # Resolve the three output paths in the same order they are written.
    artifact_dir = cfg["artifacts"]["model_dir"]
    model_path = get_versioned_path(artifact_dir, "credit_model", "pkl")
    scaler_path = get_versioned_path(artifact_dir, "scaler", "pkl")
    columns_path = get_versioned_path(artifact_dir, "columns", "pkl")

    # The column list is stored alongside the model and scaler.
    payloads = (model, scaler, list(encoded.columns))
    destinations = (model_path, scaler_path, columns_path)
    for payload, destination in zip(payloads, destinations):
        joblib.dump(payload, destination)

    print(f"Model saved at: {model_path}")
    print(f"Scaler saved at: {scaler_path}")
    print(f"Columns saved at: {columns_path}")

    return model_path, scaler_path, columns_path
if __name__ == "__main__":
    # Run the full training pipeline only when executed as a script,
    # not when this module is imported.
    train_pipeline()