File size: 3,240 Bytes
1e5b98a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# src/model_utils.py
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Continuous / count-valued input columns; routed to the "num" branch of the
# ColumnTransformer in build_model_pipeline (standard-scaled).
NUMERIC_FEATURES: List[str] = [
    "age",
    "dependents",
    "monthly_income",
    "employment_months",
    "requested_amount",
    "loan_term_months",
    "interest_rate",
    "installment",
    "debt_to_income",
    "num_open_loans",
    "num_credit_cards",
]

# Discrete input columns; routed to the "cat" branch of the ColumnTransformer
# in build_model_pipeline (one-hot encoded, unknown categories ignored).
CATEGORICAL_FEATURES: List[str] = [
    "gender",
    "marital_status",
    "employment_type",
    "has_mortgage",
    "channel",
    "region",
]


def split_features_target(df: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, np.ndarray]:
    """Split a raw frame into the model's feature matrix and target vector.

    Selects exactly the columns listed in NUMERIC_FEATURES and
    CATEGORICAL_FEATURES (copied, so the caller's frame is untouched) and
    extracts the target column as a NumPy array.
    """
    feature_columns = NUMERIC_FEATURES + CATEGORICAL_FEATURES
    features = df.loc[:, feature_columns].copy()
    target = df[target_column].values
    return features, target


def build_model_pipeline(
    random_state: int = 42,
    C: float = 1.0,
    penalty: Optional[str] = "l2",
    solver: str = "lbfgs",
    max_iter: int = 1000,
    class_weight: Union[str, Dict, None] = None,
    l1_ratio: float = 0.5,
) -> Pipeline:
    """Build a preprocessing + logistic-regression pipeline.

    Numeric features are standard-scaled and categorical features are
    one-hot encoded (categories unseen at fit time are ignored at
    predict time). If the requested solver does not support the
    requested penalty, it is silently replaced with a compatible one
    rather than letting LogisticRegression raise.

    Args:
        random_state: Random state for reproducibility.
        C: Inverse of regularization strength (smaller = stronger
            regularization).
        penalty: Regularization penalty ('l1', 'l2', 'elasticnet', None).
        solver: Algorithm to use ('lbfgs', 'liblinear', 'newton-cg',
            'sag', 'saga'). May be overridden for penalty compatibility.
        max_iter: Maximum number of iterations.
        class_weight: Class weight strategy ('balanced', None, or a
            {class_label: weight} dict).
        l1_ratio: Elastic-net mixing parameter in [0, 1]; only used when
            penalty == 'elasticnet'.

    Returns:
        Scikit-learn Pipeline with steps 'preprocessor' and 'clf'.
    """
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, NUMERIC_FEATURES),
            ("cat", categorical_transformer, CATEGORICAL_FEATURES),
        ]
    )

    # Coerce the solver to one that supports the requested penalty:
    # 'l1' needs liblinear/saga, 'elasticnet' needs saga; anything
    # unrecognized with l2/None falls back to the lbfgs default.
    if penalty == "l1":
        if solver not in ("liblinear", "saga"):
            solver = "liblinear"
    elif penalty == "elasticnet":
        if solver != "saga":
            solver = "saga"
    elif penalty == "l2" or penalty is None:
        if solver not in ("lbfgs", "liblinear", "newton-cg", "sag", "saga"):
            solver = "lbfgs"

    lr_params = {
        "C": C,
        "penalty": penalty,
        "solver": solver,
        "max_iter": max_iter,
        "random_state": random_state,
        "class_weight": class_weight,
    }

    # l1_ratio is only valid for the elastic-net penalty; passing it
    # otherwise makes LogisticRegression complain.
    if penalty == "elasticnet":
        lr_params["l1_ratio"] = l1_ratio

    clf = LogisticRegression(**lr_params)

    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("clf", clf),
        ]
    )
    return model