# src/hyperparameter_optimization.py
from typing import Dict, Any, Optional

import numpy as np
import pandas as pd
import optuna
from optuna import Trial
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold

from .model_utils import build_model_pipeline
from .config import RANDOM_STATE, CV_N_SPLITS, OPTUNA_METRIC


def objective(
    trial: Trial,
    X: pd.DataFrame,
    y: np.ndarray,
    n_splits: int = 5,
    metric: str = "auc",
) -> float:
    """
    Optuna objective function for hyperparameter optimization.
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target array
        n_splits: Number of folds for cross-validation
        metric: Metric to optimize ("auc" or "f1")
        
    Returns:
        Mean metric value across CV folds
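
    Example (illustrative; assumes X and y are already prepared; a FixedTrial
    is used only for a quick manual sanity check):
        >>> from optuna.trial import FixedTrial
        >>> params = {"C": 1.0, "penalty": "l2", "solver": "lbfgs",
        ...           "max_iter": 500, "class_weight": None}
        >>> objective(FixedTrial(params), X, y)  # doctest: +SKIP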
    """
    # Suggest hyperparameters
    C = trial.suggest_float("C", 1e-4, 100.0, log=True)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    solver = trial.suggest_categorical(
        "solver", ["lbfgs", "liblinear", "newton-cg", "sag", "saga"]
    )
    max_iter = trial.suggest_int("max_iter", 500, 2000, step=100)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    
    # Handle solver compatibility with penalty
    if penalty == "l1":
        if solver not in ["liblinear", "saga"]:
            solver = "liblinear"
    elif penalty == "elasticnet":
        if solver != "saga":
            solver = "saga"
    elif penalty == "l2":
        if solver not in ["lbfgs", "liblinear", "newton-cg", "sag", "saga"]:
            solver = "lbfgs"
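    # Note: these overrides affect only the fitted model; the trial still records
    # the originally suggested (possibly incompatible) penalty/solver pair, so
    # apply the same compatibility rules again when rebuilding a model from
    # study.best_params.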
    
    # Suggest l1_ratio for elasticnet
    l1_ratio = None
    if penalty == "elasticnet":
        l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)
    
    # Build model with suggested hyperparameters
    model_params = {
        "random_state": RANDOM_STATE,
        "C": C,
        "penalty": penalty,
        "solver": solver,
        "max_iter": max_iter,
        "class_weight": class_weight,
    }
    
    if l1_ratio is not None:
        model_params["l1_ratio"] = l1_ratio
    
    model = build_model_pipeline(**model_params)
    
    # Perform cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    
    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        
        # Train model on fold
        model.fit(X_train_fold, y_train_fold)
        
        # Predict on validation fold
        y_proba_fold = model.predict_proba(X_val_fold)[:, 1]
        
        # Calculate metric
        if metric == "auc":
            score = roc_auc_score(y_val_fold, y_proba_fold)
        elif metric == "f1":
            y_pred_fold = (y_proba_fold >= 0.5).astype(int)
            score = f1_score(y_val_fold, y_pred_fold)
        else:
            raise ValueError(f"Unknown metric: {metric}")
        
        scores.append(score)
    
    # Return mean score across folds
    return float(np.mean(scores))


def optimize_hyperparameters(
    X: pd.DataFrame,
    y: np.ndarray,
    n_trials: int = 50,
    n_splits: int = 5,
    metric: str = "auc",
    timeout: Optional[int] = None,
    study_name: str = "credit_risk_optimization",
    direction: str = "maximize",
    show_progress_bar: bool = True,
) -> Dict[str, Any]:
    """
    Optimize hyperparameters using Optuna.
    
    Args:
        X: Feature matrix
        y: Target array
        n_trials: Number of optimization trials
        n_splits: Number of folds for cross-validation
        metric: Metric to optimize ("auc" or "f1")
        timeout: Timeout in seconds (None = no timeout)
        study_name: Name of the Optuna study
        direction: Optimization direction ("maximize" or "minimize")
        show_progress_bar: Whether to show progress bar
        
    Returns:
        Dictionary with best parameters and best value
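
    Example (illustrative; X and y are assumed to be prepared upstream):
        >>> result = optimize_hyperparameters(X, y, n_trials=20, metric="auc")  # doctest: +SKIP
        >>> result["best_value"], result["best_params"]  # doctest: +SKIP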
    """
    print(f"\n{'='*60}")
    print("Optuna Hyperparameter Optimization")
    print(f"{'='*60}")
    print(f"Metric: {metric.upper()}")
    print(f"Trials: {n_trials}")
    print(f"CV Folds: {n_splits}")
    print(f"Direction: {direction}")
    print(f"{'='*60}\n")
    
    # Create study
    study = optuna.create_study(
        direction=direction,
        study_name=study_name,
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    )
    
    # Optimize
    study.optimize(
        lambda trial: objective(trial, X, y, n_splits=n_splits, metric=metric),
        n_trials=n_trials,
        timeout=timeout,
        show_progress_bar=show_progress_bar,
    )
    
    print(f"\n{'='*60}")
    print("Optimization Complete")
    print(f"{'='*60}")
    print(f"Best {metric.upper()}: {study.best_value:.4f}")
    print("\nBest Parameters:")
    for param, value in study.best_params.items():
        print(f"  {param}: {value}")
    print(f"{'='*60}\n")
    
    return {
        "best_params": study.best_params,
        "best_value": study.best_value,
        "n_trials": len(study.trials),
        "study": study,
    }
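

if __name__ == "__main__":
    # Illustrative smoke test only (a sketch, not part of the training pipeline).
    # It assumes build_model_pipeline accepts a generic numeric feature frame;
    # substitute the project's real features for anything beyond a quick check.
    # Run as a module so the relative imports resolve:
    #   python -m src.hyperparameter_optimization
    from sklearn.datasets import make_classification

    # Small, imbalanced synthetic dataset purely for demonstration
    X_demo, y_demo = make_classification(
        n_samples=500,
        n_features=10,
        weights=[0.8, 0.2],
        random_state=RANDOM_STATE,
    )
    X_demo = pd.DataFrame(X_demo, columns=[f"feature_{i}" for i in range(10)])

    demo_result = optimize_hyperparameters(
        X_demo, y_demo, n_trials=10, n_splits=3, metric="auc"
    )
    print(f"Demo best AUC: {demo_result['best_value']:.4f}")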