import os
import sys
import warnings
import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, GradientBoostingClassifier,
)
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score,
    recall_score, accuracy_score,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# allow imports from project root
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from src.mlflow_setup import init_mlflow, log_model_run

warnings.filterwarnings('ignore')

PROCESSED_DIR = os.path.join('data', 'processed')
MODELS_DIR    = os.path.join('models')


# ---------------------------------------------------------------------------
# Model definitions — (name, estimator, params_to_log)
# ---------------------------------------------------------------------------
MODELS = [
    (
        "Logistic Regression",
        LogisticRegression(max_iter=1000, random_state=42),
        {"max_iter": 1000, "solver": "lbfgs"},
    ),
    (
        "Decision Tree",
        DecisionTreeClassifier(max_depth=10, random_state=42),
        {"max_depth": 10},
    ),
    (
        "Random Forest",
        RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
        {"n_estimators": 100},
    ),
    (
        "Extra Trees",
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=42),
        {"n_estimators": 100},
    ),
    (
        "AdaBoost",
        AdaBoostClassifier(n_estimators=100, random_state=42),
        {"n_estimators": 100},
    ),
    (
        "Gradient Boosting",
        GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42),
        {"n_estimators": 100, "max_depth": 4},
    ),
    (
        "XGBoost",
        XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            scale_pos_weight=1, eval_metric='logloss',
            random_state=42, n_jobs=-1,
        ),
        {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1},
    ),
    (
        "LightGBM",
        LGBMClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            random_state=42, n_jobs=-1, verbose=-1,
        ),
        {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1},
    ),
    (
        "CatBoost",
        CatBoostClassifier(
            iterations=200, depth=6, learning_rate=0.1,
            random_seed=42, verbose=0,
        ),
        {"iterations": 200, "depth": 6, "learning_rate": 0.1},
    ),
]


def load_data():
    X_train = joblib.load(os.path.join(PROCESSED_DIR, 'X_train_smote.pkl'))
    y_train = joblib.load(os.path.join(PROCESSED_DIR, 'y_train_smote.pkl'))
    X_test  = joblib.load(os.path.join(PROCESSED_DIR, 'X_test.pkl'))
    y_test  = joblib.load(os.path.join(PROCESSED_DIR, 'y_test.pkl'))
    return X_train, y_train, X_test, y_test


def evaluate(model, X_test, y_test):
    y_pred      = model.predict(X_test)
    y_proba     = model.predict_proba(X_test)[:, 1]
    return {
        "auc_roc"   : round(roc_auc_score(y_test, y_proba), 6),
        "f1"        : round(f1_score(y_test, y_pred), 6),
        "precision" : round(precision_score(y_test, y_pred), 6),
        "recall"    : round(recall_score(y_test, y_pred), 6),
        "accuracy"  : round(accuracy_score(y_test, y_pred), 6),
    }


def run():
    print("=" * 65)
    print("  Training & Benchmarking 9 Models")
    print("=" * 65)

    init_mlflow()
    X_train, y_train, X_test, y_test = load_data()

    os.makedirs(MODELS_DIR, exist_ok=True)
    results = []

    for name, model, params in MODELS:
        print(f"\n  Training : {name}")
        model.fit(X_train, y_train)
        metrics = evaluate(model, X_test, y_test)

        # log to MLflow
        log_model_run(
            model_name=name,
            model=model,
            params=params,
            metrics=metrics,
            X_sample=X_test.iloc[:5],
        )

        # save model locally
        safe_name = name.lower().replace(' ', '_').replace('(', '').replace(')', '')
        joblib.dump(model, os.path.join(MODELS_DIR, f'{safe_name}.pkl'))

        results.append({"Model": name, **metrics})

    # -----------------------------------------------------------------------
    # Print comparison table ranked by AUC-ROC
    # -----------------------------------------------------------------------
    df = pd.DataFrame(results).sort_values("auc_roc", ascending=False).reset_index(drop=True)
    df.index += 1

    print("\n")
    print("=" * 65)
    print("  Model Benchmark Results (ranked by AUC-ROC)")
    print("=" * 65)
    print(df.to_string())
    print("=" * 65)
    print(f"\n  Best model : {df.iloc[0]['Model']}  (AUC-ROC = {df.iloc[0]['auc_roc']})")
    print("=" * 65)

    return df


if __name__ == '__main__':
    run()