MLOps-risk-model / src /model_utils.py
github-actions[bot]
deploy: sync from GitHub main
1e5b98a
# src/model_utils.py
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Columns routed through the "num" branch (StandardScaler) of the
# preprocessor assembled in build_model_pipeline. Order here is the column
# order of the feature frame returned by split_features_target.
NUMERIC_FEATURES: List[str] = [
    "age",
    "dependents",
    "monthly_income",
    "employment_months",
    "requested_amount",
    "loan_term_months",
    "interest_rate",
    "installment",
    "debt_to_income",
    "num_open_loans",
    "num_credit_cards",
]

# Columns routed through the "cat" branch (OneHotEncoder) of the
# preprocessor assembled in build_model_pipeline. They follow the numeric
# columns in the feature frame returned by split_features_target.
CATEGORICAL_FEATURES: List[str] = [
    "gender",
    "marital_status",
    "employment_type",
    "has_mortgage",
    "channel",
    "region",
]
def split_features_target(df: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, np.ndarray]:
    """Separate the model input columns from the target column.

    Args:
        df: Frame containing every column listed in NUMERIC_FEATURES and
            CATEGORICAL_FEATURES, plus ``target_column``.
        target_column: Name of the column to return as the target.

    Returns:
        A ``(X, y)`` pair. ``X`` is a copy of the feature columns (numeric
        columns first, then categorical), so later edits to it do not touch
        ``df``. ``y`` is ``df[target_column].values``.
    """
    feature_columns = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
    features = df[feature_columns].copy()
    target = df[target_column].values
    return features, target
def _resolve_solver(penalty: Optional[str], solver: str) -> str:
    """Return a solver that scikit-learn accepts for the given penalty."""
    if penalty == "l1":
        return solver if solver in ("liblinear", "saga") else "liblinear"
    if penalty == "elasticnet":
        # saga is the only solver that implements elastic-net.
        return "saga"
    if penalty == "l2":
        known = ("lbfgs", "liblinear", "newton-cg", "sag", "saga")
        return solver if solver in known else "lbfgs"
    if penalty is None:
        # liblinear cannot fit an unpenalized model, so it is excluded here
        # and replaced by the lbfgs fallback.
        unpenalized = ("lbfgs", "newton-cg", "sag", "saga")
        return solver if solver in unpenalized else "lbfgs"
    # Unrecognized penalty: leave the solver alone and let scikit-learn
    # report the invalid combination.
    return solver


def build_model_pipeline(
    random_state: int = 42,
    C: float = 1.0,
    penalty: Optional[str] = "l2",
    solver: str = "lbfgs",
    max_iter: int = 1000,
    class_weight: Optional[Union[str, dict]] = None,
    l1_ratio: float = 0.5,
) -> Pipeline:
    """
    Build a model pipeline with configurable hyperparameters.

    The pipeline has two steps: "preprocessor", a ColumnTransformer that
    standard-scales NUMERIC_FEATURES and one-hot encodes CATEGORICAL_FEATURES
    (categories unseen during fit are ignored at transform time), and "clf",
    a LogisticRegression.

    If the requested solver does not support the requested penalty, the
    solver is silently replaced with a compatible one rather than raising.

    Args:
        random_state: Random state for reproducibility
        C: Inverse of regularization strength (smaller = stronger regularization)
        penalty: Regularization penalty ('l1', 'l2', 'elasticnet', None)
        solver: Algorithm to use ('lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga')
        max_iter: Maximum number of iterations
        class_weight: Class weight strategy ('balanced', None, or dict)
        l1_ratio: Elastic-net mixing parameter; only forwarded to the
            classifier when penalty is 'elasticnet'

    Returns:
        Scikit-learn Pipeline (unfitted)
    """
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, NUMERIC_FEATURES),
            ("cat", categorical_transformer, CATEGORICAL_FEATURES),
        ]
    )

    # Handle solver compatibility with penalty
    solver = _resolve_solver(penalty, solver)

    lr_params = {
        "C": C,
        "penalty": penalty,
        "solver": solver,
        "max_iter": max_iter,
        "random_state": random_state,
        "class_weight": class_weight,
    }
    # l1_ratio is only meaningful for elastic-net; omit it otherwise so the
    # classifier keeps its own default.
    if penalty == "elasticnet":
        lr_params["l1_ratio"] = l1_ratio

    clf = LogisticRegression(**lr_params)

    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("clf", clf),
        ]
    )