"""Optuna hyperparameter optimization for the salary prediction model.""" import argparse from pathlib import Path import numpy as np import optuna import pandas as pd import yaml from sklearn.model_selection import KFold from xgboost import XGBRegressor from src.preprocessing import prepare_features from src.train import ( apply_cardinality_reduction, drop_other_rows, filter_salaries, ) def sample_params(trial: optuna.Trial, search_space: dict) -> dict: """Sample hyperparameters from the search space using an Optuna trial. Args: trial: Optuna trial object. search_space: Dict mapping parameter names to their search config (type, low, high, optional log). Returns: Dict of sampled hyperparameter values. """ params = {} for name, spec in search_space.items(): param_type = spec["type"] if param_type == "int": params[name] = trial.suggest_int(name, spec["low"], spec["high"]) elif param_type == "float": log = spec.get("log", False) params[name] = trial.suggest_float(name, spec["low"], spec["high"], log=log) return params def build_objective(X: pd.DataFrame, y: pd.Series, optuna_config: dict) -> callable: """Build an Optuna objective function for XGBoost CV evaluation. Args: X: Feature matrix. y: Target vector. optuna_config: Full optuna config dict with search_space, fixed, study. Returns: Objective function that takes a trial and returns mean MAPE. """ search_space = optuna_config["search_space"] fixed = optuna_config["fixed"] cv_splits = optuna_config["study"]["cv_splits"] random_state = fixed.get("random_state", 42) def objective(trial: optuna.Trial) -> float: params = sample_params(trial, search_space) params.update(fixed) kf = KFold(n_splits=cv_splits, shuffle=True, random_state=random_state) mape_scores = [] for train_idx, test_idx in kf.split(X): X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] model = XGBRegressor(**params) model.fit( X_train, y_train, eval_set=[(X_test, y_test)], verbose=False, ) preds = model.predict(X_test) mape = np.mean(np.abs((y_test - preds) / y_test)) * 100 mape_scores.append(mape) return np.mean(mape_scores) return objective def save_best_params(best_params: dict, config_path: Path) -> None: """Save the best hyperparameters to model_parameters.yaml. Updates the model: section with tuned values, preserving all other config. Args: best_params: Dict of best hyperparameter values from Optuna. config_path: Path to model_parameters.yaml. """ with open(config_path, "r") as f: config = yaml.safe_load(f) config["model"].update(best_params) with open(config_path, "w") as f: yaml.dump(config, f, default_flow_style=False, sort_keys=False) def main(): """Run Optuna hyperparameter optimization.""" parser = argparse.ArgumentParser( description="Optuna hyperparameter optimization for salary prediction" ) parser.add_argument( "--n-trials", type=int, default=None, help="Number of optimization trials (overrides config default)", ) args = parser.parse_args() # Load configs optuna_config_path = Path("config/optuna_config.yaml") with open(optuna_config_path, "r") as f: optuna_config = yaml.safe_load(f) model_config_path = Path("config/model_parameters.yaml") with open(model_config_path, "r") as f: config = yaml.safe_load(f) n_trials = args.n_trials or optuna_config["study"]["n_trials"] # Load and preprocess data print("Loading data...") data_path = Path("data/survey_results_public.csv") if not data_path.exists(): print(f"Error: Data file not found at {data_path}") print( "Please download the Stack Overflow Developer Survey CSV " "and place it in the data/ directory." ) return df = pd.read_csv( data_path, usecols=[ "Country", "YearsCode", "WorkExp", "EdLevel", "DevType", "Industry", "Age", "ICorPM", "OrgSize", "Employment", "Currency", "CompTotal", "ConvertedCompYearly", ], ) print(f"Loaded {len(df):,} rows") df = filter_salaries(df, config) df = apply_cardinality_reduction(df) df = drop_other_rows(df, config) main_label = "ConvertedCompYearly" X = prepare_features(df) y = df[main_label] * config["data"]["salary_scale"] print(f"Feature matrix shape: {X.shape}") print(f"\nStarting Optuna optimization with {n_trials} trials...") # Run optimization objective = build_objective(X, y, optuna_config) study = optuna.create_study( direction=optuna_config["study"]["direction"], ) study.optimize(objective, n_trials=n_trials) # Report results print(f"\nBest trial: #{study.best_trial.number}") print(f"Best MAPE: {study.best_value:.2f}%") print("Best hyperparameters:") for name, value in study.best_params.items(): print(f" {name}: {value}") # Save best params to model_parameters.yaml save_best_params(study.best_params, model_config_path) print(f"\nBest parameters saved to {model_config_path}") if __name__ == "__main__": main()