"""Optuna hyperparameter optimization for the salary prediction model."""

import argparse
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import yaml
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

from src.preprocessing import prepare_features
from src.train import (
    apply_cardinality_reduction,
    drop_other_rows,
    filter_salaries,
)


def sample_params(trial: optuna.Trial, search_space: dict) -> dict:
    """Sample hyperparameters from the search space using an Optuna trial.

    Args:
        trial: Optuna trial object.
        search_space: Dict mapping parameter names to their search config.
            Every config needs a ``type`` key, which selects the other keys:

            * ``"int"``: ``low``, ``high``, optional ``log``.
            * ``"float"``: ``low``, ``high``, optional ``log``.
            * ``"categorical"``: ``choices`` (sequence of candidate values).

    Returns:
        Dict of sampled hyperparameter values, keyed by parameter name.

    Raises:
        KeyError: If a config lacks ``type`` or a key required by its type.
        ValueError: If a config declares an unsupported ``type``.
    """
    params = {}
    for name, spec in search_space.items():
        param_type = spec["type"]
        if param_type == "int":
            # ``log`` is honoured for ints as well, so the flag behaves the
            # same way for both numeric parameter types.
            params[name] = trial.suggest_int(
                name, spec["low"], spec["high"], log=spec.get("log", False)
            )
        elif param_type == "float":
            params[name] = trial.suggest_float(
                name, spec["low"], spec["high"], log=spec.get("log", False)
            )
        elif param_type == "categorical":
            params[name] = trial.suggest_categorical(name, spec["choices"])
        else:
            # Fail loudly: silently skipping the entry would leave the
            # parameter untuned without any hint that the config is wrong.
            raise ValueError(
                f"Unsupported search-space type {param_type!r} for parameter "
                f"{name!r}; expected 'int', 'float' or 'categorical'."
            )
    return params


def build_objective(X: pd.DataFrame, y: pd.Series, optuna_config: dict) -> callable:
    """Create the cross-validated XGBoost objective that Optuna optimizes.

    Args:
        X: Feature matrix.
        y: Target vector; it is sliced with the same positional row indices
           as ``X``.
        optuna_config: Optuna config dict providing ``search_space``,
                       ``fixed`` and ``study`` (``cv_splits`` is read here).

    Returns:
        Callable that takes a trial and returns the MAPE, in percent,
        averaged over the CV folds.
    """
    space = optuna_config["search_space"]
    fixed_params = optuna_config["fixed"]
    n_folds = optuna_config["study"]["cv_splits"]
    # The fold shuffling reuses the model's seed, falling back to 42.
    seed = fixed_params.get("random_state", 42)

    def objective(trial: optuna.Trial) -> float:
        # Fixed settings are merged last so they override sampled values.
        model_kwargs = {**sample_params(trial, space), **fixed_params}
        splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

        fold_mapes = []
        for fit_rows, holdout_rows in splitter.split(X):
            X_holdout = X.iloc[holdout_rows]
            y_holdout = y.iloc[holdout_rows]

            regressor = XGBRegressor(**model_kwargs)
            regressor.fit(
                X.iloc[fit_rows],
                y.iloc[fit_rows],
                eval_set=[(X_holdout, y_holdout)],
                verbose=False,
            )

            # Per-row relative error on the held-out fold, as a percentage.
            relative_error = (y_holdout - regressor.predict(X_holdout)) / y_holdout
            fold_mapes.append(np.mean(np.abs(relative_error)) * 100)

        return np.mean(fold_mapes)

    return objective


def save_best_params(best_params: dict, config_path: Path) -> None:
    """Save the best hyperparameters to model_parameters.yaml.

    Updates the ``model:`` section with tuned values, preserving all other
    config. A missing or empty ``model:`` section (or an empty file) is
    treated as an empty mapping instead of raising.

    Note that the file is round-tripped through the YAML parser, so any
    comments it contained are not preserved.

    Args:
        best_params: Dict of best hyperparameter values from Optuna.
        config_path: Path to model_parameters.yaml.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        yaml.YAMLError: If the existing file cannot be parsed, or if the
            merged config holds values that plain YAML cannot represent.
            In the latter case the file on disk is left untouched.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        # An empty YAML document loads as None.
        config = yaml.safe_load(f) or {}

    # Reuse the existing section when present so its position in the file
    # is kept; otherwise start a fresh one.
    model_section = config.get("model") or {}
    model_section.update(best_params)
    config["model"] = model_section

    # Serialize BEFORE opening for writing: mode "w" truncates the file, so
    # a serialization error after that point would wipe the config.
    # safe_dump guarantees the output can be read back with safe_load.
    serialized = yaml.safe_dump(config, default_flow_style=False, sort_keys=False)

    with open(config_path, "w", encoding="utf-8") as f:
        f.write(serialized)


def main():
    """Run Optuna hyperparameter optimization.

    Parses ``--n-trials``, loads the Optuna and model configs, builds the
    feature matrix and target from the survey CSV, runs the study, prints
    the best trial and writes its hyperparameters back to
    ``config/model_parameters.yaml``.

    Returns early (after printing a hint) when the survey CSV is missing.
    Exits through ``argparse`` with a usage error when the resolved number
    of trials is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Optuna hyperparameter optimization for salary prediction"
    )
    parser.add_argument(
        "--n-trials",
        type=int,
        default=None,
        help="Number of optimization trials (overrides config default)",
    )
    args = parser.parse_args()

    # Load configs
    optuna_config_path = Path("config/optuna_config.yaml")
    with open(optuna_config_path, "r", encoding="utf-8") as f:
        optuna_config = yaml.safe_load(f)

    model_config_path = Path("config/model_parameters.yaml")
    with open(model_config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # Test against None explicitly: with `or`, an explicit `--n-trials 0`
    # would be treated as "not given" and silently replaced by the default.
    if args.n_trials is not None:
        n_trials = args.n_trials
    else:
        n_trials = optuna_config["study"]["n_trials"]

    # Reject counts that cannot produce a best trial before doing any of the
    # expensive data loading below.
    if n_trials is not None and n_trials < 1:
        parser.error(f"number of trials must be a positive integer, got {n_trials}")

    # Load and preprocess data
    print("Loading data...")
    data_path = Path("data/survey_results_public.csv")
    if not data_path.exists():
        print(f"Error: Data file not found at {data_path}")
        print(
            "Please download the Stack Overflow Developer Survey CSV "
            "and place it in the data/ directory."
        )
        return

    # Only these columns are read from the CSV.
    survey_columns = [
        "Country",
        "YearsCode",
        "WorkExp",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
        "Currency",
        "CompTotal",
        "ConvertedCompYearly",
    ]
    df = pd.read_csv(data_path, usecols=survey_columns)
    print(f"Loaded {len(df):,} rows")

    df = filter_salaries(df, config)
    df = apply_cardinality_reduction(df)
    df = drop_other_rows(df, config)

    main_label = "ConvertedCompYearly"
    X = prepare_features(df)
    y = df[main_label] * config["data"]["salary_scale"]

    print(f"Feature matrix shape: {X.shape}")
    print(f"\nStarting Optuna optimization with {n_trials} trials...")

    # Run optimization
    objective = build_objective(X, y, optuna_config)
    study = optuna.create_study(
        direction=optuna_config["study"]["direction"],
    )
    study.optimize(objective, n_trials=n_trials)

    # Report results
    print(f"\nBest trial: #{study.best_trial.number}")
    print(f"Best MAPE: {study.best_value:.2f}%")
    print("Best hyperparameters:")
    for name, value in study.best_params.items():
        print(f"  {name}: {value}")

    # Save best params to model_parameters.yaml
    save_best_params(study.best_params, model_config_path)
    print(f"\nBest parameters saved to {model_config_path}")


# Run the optimization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()