Spaces:
Sleeping
Sleeping
| # src/model_utils.py | |
| from typing import List, Tuple | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler | |
| from sklearn.linear_model import LogisticRegression | |
# Numeric input columns. split_features_target selects them (in this order,
# ahead of the categorical ones) and build_model_pipeline routes them through
# a StandardScaler.
NUMERIC_FEATURES: List[str] = [
    "age", "dependents", "monthly_income", "employment_months",
    "requested_amount", "loan_term_months", "interest_rate", "installment",
    "debt_to_income", "num_open_loans", "num_credit_cards",
]
# Categorical input columns. split_features_target selects them after the
# numeric ones, and build_model_pipeline one-hot encodes them.
CATEGORICAL_FEATURES: List[str] = [
    "gender", "marital_status", "employment_type",
    "has_mortgage", "channel", "region",
]
def split_features_target(df: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, np.ndarray]:
    """Separate the model inputs from the label column.

    Args:
        df: Frame containing every column listed in NUMERIC_FEATURES and
            CATEGORICAL_FEATURES, plus the target column.
        target_column: Name of the column holding the label.
    Returns:
        A ``(X, y)`` tuple: a copy of the feature columns (numeric first,
        then categorical) and the values of the target column.
    """
    feature_columns = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
    features = df[feature_columns].copy()
    target = df[target_column].values
    return features, target
def _resolve_solver(penalty, solver):
    """Return a solver that is compatible with ``penalty``.

    A requested solver that supports the penalty is returned unchanged;
    otherwise a supported fallback is substituted. Penalties this helper does
    not recognise leave the solver untouched so that scikit-learn reports the
    problem itself.
    """
    if penalty == "elasticnet":
        # saga is the only solver listed here that implements elastic-net.
        return "saga"
    if penalty == "l1":
        supported, fallback = ("liblinear", "saga"), "liblinear"
    elif penalty == "l2":
        supported, fallback = ("lbfgs", "liblinear", "newton-cg", "sag", "saga"), "lbfgs"
    elif penalty is None:
        # liblinear cannot fit an unpenalised model, so it is not accepted
        # here even though it is fine for l2.
        supported, fallback = ("lbfgs", "newton-cg", "sag", "saga"), "lbfgs"
    else:
        return solver
    return solver if solver in supported else fallback


def build_model_pipeline(
    random_state: int = 42,
    C: float = 1.0,
    penalty: str = "l2",
    solver: str = "lbfgs",
    max_iter: int = 1000,
    class_weight: str = None,
    l1_ratio: float = 0.5,
) -> Pipeline:
    """
    Build a model pipeline with configurable hyperparameters.

    The pipeline standardises NUMERIC_FEATURES, one-hot encodes
    CATEGORICAL_FEATURES (categories unseen during fitting are ignored), and
    feeds the result to a LogisticRegression classifier.

    Args:
        random_state: Random state for reproducibility
        C: Inverse of regularization strength (smaller = stronger regularization)
        penalty: Regularization penalty ('l1', 'l2', 'elasticnet', None)
        solver: Algorithm to use ('lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga').
            If the requested solver does not support ``penalty`` it is silently
            replaced: 'l1' falls back to 'liblinear', 'elasticnet' always uses
            'saga', and 'l2' / None fall back to 'lbfgs'.
        max_iter: Maximum number of iterations
        class_weight: Class weight strategy ('balanced', None, or dict)
        l1_ratio: Elastic-net mixing parameter; only passed to the classifier
            when ``penalty == 'elasticnet'``.
    Returns:
        Scikit-learn Pipeline with steps ``"preprocessor"`` and ``"clf"``
        (not fitted).
    """
    numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
    categorical_transformer = Pipeline(
        steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, NUMERIC_FEATURES),
            ("cat", categorical_transformer, CATEGORICAL_FEATURES),
        ]
    )

    # Handle solver compatibility with penalty
    solver = _resolve_solver(penalty, solver)

    lr_params = {
        "C": C,
        "penalty": penalty,
        "solver": solver,
        "max_iter": max_iter,
        "random_state": random_state,
        "class_weight": class_weight,
    }
    # l1_ratio is only meaningful for elastic-net, so it is omitted otherwise.
    if penalty == "elasticnet":
        lr_params["l1_ratio"] = l1_ratio

    clf = LogisticRegression(**lr_params)
    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("clf", clf),
        ]
    )