# src/model_utils.py
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Columns passed through StandardScaler by build_model_pipeline.
NUMERIC_FEATURES: List[str] = [
    "age",
    "dependents",
    "monthly_income",
    "employment_months",
    "requested_amount",
    "loan_term_months",
    "interest_rate",
    "installment",
    "debt_to_income",
    "num_open_loans",
    "num_credit_cards",
]

# Columns passed through OneHotEncoder by build_model_pipeline.
CATEGORICAL_FEATURES: List[str] = [
    "gender",
    "marital_status",
    "employment_type",
    "has_mortgage",
    "channel",
    "region",
]

# Solvers this module accepts for each penalty; anything else is replaced
# by a compatible default in _resolve_solver.
_L1_SOLVERS = ("liblinear", "saga")
_L2_SOLVERS = ("lbfgs", "liblinear", "newton-cg", "sag", "saga")
# liblinear cannot fit an unpenalized model, so it is excluded here.
_UNPENALIZED_SOLVERS = ("lbfgs", "newton-cg", "sag", "saga")


def split_features_target(
    df: pd.DataFrame, target_column: str
) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Split a DataFrame into the model's feature frame and target array.

    Args:
        df: Input data; must contain every column listed in
            NUMERIC_FEATURES and CATEGORICAL_FEATURES plus target_column
        target_column: Name of the column holding the target values

    Returns:
        Tuple (X, y): X is a copy of the feature columns (numeric first,
        then categorical), y is the target column as a NumPy array

    Raises:
        KeyError: If a feature column or target_column is missing from df
    """
    X = df[NUMERIC_FEATURES + CATEGORICAL_FEATURES].copy()
    # to_numpy() always yields an ndarray, matching the declared return type
    # (.values may hand back a pandas extension array for some dtypes).
    y = df[target_column].to_numpy()
    return X, y


def _resolve_solver(penalty: Optional[str], solver: str) -> str:
    """Return a solver compatible with penalty, keeping solver when it already is."""
    if penalty == "l1":
        return solver if solver in _L1_SOLVERS else "liblinear"
    if penalty == "elasticnet":
        # saga is the only solver that supports elasticnet.
        return "saga"
    if penalty == "l2":
        return solver if solver in _L2_SOLVERS else "lbfgs"
    if penalty is None:
        return solver if solver in _UNPENALIZED_SOLVERS else "lbfgs"
    # Unrecognised penalty: leave the solver alone and let scikit-learn
    # report the invalid value.
    return solver


def build_model_pipeline(
    random_state: int = 42,
    C: float = 1.0,
    penalty: Optional[str] = "l2",
    solver: str = "lbfgs",
    max_iter: int = 1000,
    class_weight: Optional[Union[str, Dict]] = None,
    l1_ratio: float = 0.5,
) -> Pipeline:
    """
    Build a model pipeline with configurable hyperparameters.

    The pipeline standard-scales NUMERIC_FEATURES, one-hot encodes
    CATEGORICAL_FEATURES (categories unseen during fit are ignored) and
    feeds the result to a LogisticRegression classifier. If the requested
    solver is not compatible with the penalty, a compatible solver is
    substituted silently.

    Args:
        random_state: Random state for reproducibility
        C: Inverse of regularization strength (smaller = stronger regularization)
        penalty: Regularization penalty ('l1', 'l2', 'elasticnet', None)
        solver: Algorithm to use ('lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga')
        max_iter: Maximum number of iterations
        class_weight: Class weight strategy ('balanced', None, or dict)
        l1_ratio: Elastic-net mixing parameter; only passed to the
            classifier when penalty is 'elasticnet'

    Returns:
        Scikit-learn Pipeline
    """
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, NUMERIC_FEATURES),
            ("cat", categorical_transformer, CATEGORICAL_FEATURES),
        ]
    )

    # Handle solver compatibility with penalty
    solver = _resolve_solver(penalty, solver)

    # Build LogisticRegression with appropriate parameters
    lr_params = {
        "C": C,
        "penalty": penalty,
        "solver": solver,
        "max_iter": max_iter,
        "random_state": random_state,
        "class_weight": class_weight,
    }

    # Add l1_ratio only for elasticnet penalty
    if penalty == "elasticnet":
        lr_params["l1_ratio"] = l1_ratio

    clf = LogisticRegression(**lr_params)

    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("clf", clf),
        ]
    )

    return model