import csv
import json
import os
import random
import sys

import numpy as np
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
)

from data_preparation.prepare_dataset import get_numpy_splits, SELECTED_FEATURES
from models.xgboost.config import XGB_BASE_PARAMS, build_xgb_classifier

_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))


def _load_cfg():
    """Load run settings from the project config module, falling back to defaults."""
    cfg = {
        "model_name": "face_orientation",
        "seed": 42,
        "split_ratios": (0.7, 0.15, 0.15),
        "scale": False,
        "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
        "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
        "xgb_params": dict(XGB_BASE_PARAMS),
    }
    try:
        from config import get
        data = get("data") or {}
        # model_name and seed are shared with the MLP config section.
        cfg["model_name"] = get("mlp.model_name") or cfg["model_name"]
        cfg["seed"] = get("mlp.seed") or cfg["seed"]
        cfg["split_ratios"] = tuple(data.get("split_ratios", cfg["split_ratios"]))
    except Exception:
        pass
    return cfg


CFG = _load_cfg()

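# ClearML logging is opt-in: set USE_CLEARML=1 locally, or it is implied when
# running under an agent (CLEARML_TASK_ID set).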
USE_CLEARML = os.environ.get("USE_CLEARML", "0") == "1" or bool(os.environ.get("CLEARML_TASK_ID"))
CLEARML_QUEUE = os.environ.get("CLEARML_QUEUE", "")

task = None
if USE_CLEARML:
    try:
        from clearml import Task
        from config import CLEARML_PROJECT_NAME, flatten_for_clearml
        task = Task.init(
            project_name=CLEARML_PROJECT_NAME,
            task_name="XGBoost Model Training",
            tags=["training", "xgboost"],
        )
        from config.clearml_enrich import enrich_task, upload_repro_artifacts

        enrich_task(task, role="train_xgboost")
        flat = flatten_for_clearml()
        for k, v in CFG.get("xgb_params", {}).items():
            flat[f"xgb_params/{k}"] = v
        flat["model_name"] = CFG["model_name"]
        flat["seed"] = CFG["seed"]
        flat["split_ratios"] = str(CFG["split_ratios"])
        task.connect(flat)
        upload_repro_artifacts(task)
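        # execute_remotely() enqueues this task for an agent and normally exits
        # the local process; the explicit sys.exit is a safeguard.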
        if CLEARML_QUEUE:
            print(f"[ClearML] Enqueuing to queue '{CLEARML_QUEUE}'.")
            task.execute_remotely(queue_name=CLEARML_QUEUE)
            sys.exit(0)
    except ImportError:
        task = None
        USE_CLEARML = False

def set_seed(seed: int):
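    """Seed Python and NumPy RNGs for reproducible splits and training."""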
    random.seed(seed)
    np.random.seed(seed)


def main():
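    """Train an XGBoost classifier, evaluate on the test split, and write logs and artifacts."""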
    set_seed(CFG["seed"])

    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # ── Data ──────────────────────────────────────────────────────
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )

    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val,   y_val   = splits["X_val"],   splits["y_val"]
    X_test,  y_test  = splits["X_test"],  splits["y_test"]

    # ── Model ─────────────────────────────────────────────────────
    model = build_xgb_classifier(CFG["seed"], verbosity=1, early_stopping_rounds=30)

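    # Fit with early stopping; XGBoost monitors the last eval_set entry (the validation split).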
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    best_it = getattr(model, "best_iteration", None)
    print(f"[TRAIN] Best iteration: {best_it} / {CFG['xgb_params']['n_estimators']}")

    # ── Evaluation ────────────────────────────────────────────────
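    # evals_result() keys follow eval_set order: validation_0 = train, validation_1 = val.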
    evals = model.evals_result()
    eval_metric_name = CFG["xgb_params"]["eval_metric"]
    train_losses = evals["validation_0"][eval_metric_name]
    val_losses   = evals["validation_1"][eval_metric_name]

    # Test metrics
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc   = float(np.mean(test_preds == y_test))
    test_f1    = float(f1_score(y_test, test_preds, average="weighted"))

    if num_classes > 2:
        test_auc = float(roc_auc_score(y_test, test_probs, multi_class="ovr", average="weighted"))
    else:
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))

    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1:       {test_f1:.4f}")
    print(f"[TEST] ROC-AUC:  {test_auc:.4f}")

    # Dataset stats
    dataset_stats = {
        "train_size": len(y_train),
        "val_size": len(y_val),
        "test_size": len(y_test),
        "train_class_counts": np.bincount(y_train.astype(int), minlength=num_classes).tolist(),
        "val_class_counts": np.bincount(y_val.astype(int), minlength=num_classes).tolist(),
        "test_class_counts": np.bincount(y_test.astype(int), minlength=num_classes).tolist(),
    }

    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    cm = confusion_matrix(y_test, test_preds, labels=list(range(num_classes)))  # fixed shape even if a class is absent
    y_test_i = y_test.astype(int)
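    # Dump per-sample y_true / y_pred and class probabilities to CSV.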
    pred_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_predictions.csv")
    with open(pred_path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["y_true", "y_pred"] + [f"prob_{j}" for j in range(num_classes)])
        for i in range(len(y_test_i)):
            w.writerow(
                [int(y_test_i[i]), int(test_preds[i])]
                + [float(x) for x in test_probs[i]]
            )
    summary_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_metrics_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": "xgboost",
                "model_name": CFG["model_name"],
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
                "confusion_matrix": cm.tolist(),
                "classification_report": classification_report(
                    y_test, test_preds, digits=4
                ),
            },
            f,
            indent=2,
        )
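    # Pair feature importances with the feature names selected for this task.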
    feat_names = list(
        SELECTED_FEATURES.get(CFG["model_name"], SELECTED_FEATURES["face_orientation"])
    )
    imp_vals = model.feature_importances_
    imp_rows = [
        {"feature": feat_names[i], "importance": float(imp_vals[i])}
        for i in range(min(len(feat_names), len(imp_vals)))
    ]
    imp_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_feature_importance.json")
    with open(imp_path, "w", encoding="utf-8") as f:
        json.dump(imp_rows, f, indent=2)
    print(f"[LOG] Test predictions → {pred_path}")

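    # Report per-iteration losses and summary metrics to ClearML when a task is active.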
    if task is not None:
        for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
            task.logger.report_scalar("Loss", "Train", tl, iteration=i + 1)
            task.logger.report_scalar("Loss", "Val", vl, iteration=i + 1)
        task.logger.report_single_value("test/accuracy", test_acc)
        task.logger.report_single_value("test/f1_weighted", test_f1)
        task.logger.report_single_value("test/roc_auc", test_auc)
        for key, val in dataset_stats.items():
            if isinstance(val, list):
                for i, v in enumerate(val):
                    task.logger.report_single_value(f"dataset/{key}/{i}", float(v))
            else:
                task.logger.report_single_value(f"dataset/{key}", float(val))
        prec, rec, f1_per_class, _ = precision_recall_fscore_support(
            y_test, test_preds, labels=list(range(num_classes)), average=None, zero_division=0
        )
        for c in range(num_classes):
            task.logger.report_single_value(f"test/class_{c}_precision", float(prec[c]))
            task.logger.report_single_value(f"test/class_{c}_recall", float(rec[c]))
            task.logger.report_single_value(f"test/class_{c}_f1", float(f1_per_class[c]))
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.imshow(cm, cmap="Blues")
        ax.set_xticks(range(num_classes))
        ax.set_yticks(range(num_classes))
        ax.set_xticklabels([f"Class {i}" for i in range(num_classes)])
        ax.set_yticklabels([f"Class {i}" for i in range(num_classes)])
        for i in range(num_classes):
            for j in range(num_classes):
                ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="black")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title("Test set confusion matrix")
        fig.tight_layout()
        task.logger.report_matplotlib_figure(title="Confusion Matrix", series="test", figure=fig, iteration=0)
        plt.close(fig)
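        # A per-threshold ROC curve is only defined for binary tasks; multiclass AUC is logged above.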
        if num_classes == 2:
            fpr, tpr, _ = roc_curve(y_test, test_probs[:, 1])
            fig_r, ax_r = plt.subplots(figsize=(6, 5))
            ax_r.plot(fpr, tpr, label=f"ROC-AUC = {test_auc:.4f}")
            ax_r.plot([0, 1], [0, 1], "k--", lw=1)
            ax_r.set_xlabel("False positive rate")
            ax_r.set_ylabel("True positive rate")
            ax_r.set_title("Test ROC (XGBoost)")
            ax_r.legend(loc="lower right")
            fig_r.tight_layout()
            task.logger.report_matplotlib_figure(
                title="ROC", series="test", figure=fig_r, iteration=0
            )
            plt.close(fig_r)
        task.logger.flush()

    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # ── Write JSON log (same schema as MLP) ───────────────────────
    # pandas-free tree/node count (trees_to_dataframe() needs pandas)
    booster = model.get_booster()
    tree_count = int(booster.num_boosted_rounds())
    node_count = int(sum(len(tree.strip().splitlines()) for tree in booster.get_dump()))  # one dump line per node

    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        "param_count": node_count,
        "tree_count": tree_count,
        "xgb_params": CFG["xgb_params"],
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
        "dataset_stats": dataset_stats,
    }

    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")

    with open(log_path, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)

    print(f"[LOG] Training history saved to: {log_path}")

    if task is not None:
        from clearml import OutputModel
        from config.clearml_enrich import attach_output_metrics, task_done_summary

        task.upload_artifact(name="xgboost_model", artifact_object=model_path)
        task.upload_artifact(name="training_log", artifact_object=log_path)
        task.upload_artifact(name="test_predictions", artifact_object=pred_path)
        task.upload_artifact(name="test_metrics_summary", artifact_object=summary_path)
        task.upload_artifact(name="feature_importance", artifact_object=imp_path)
        out_model = OutputModel(
            task=task, name=f"XGBoost_{CFG['model_name']}", framework="XGBoost"
        )
        out_model.update_weights(weights_filename=model_path, auto_delete_file=False)
        attach_output_metrics(
            out_model,
            {
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
            },
        )
        task_done_summary(
            task,
            f"XGBoost {CFG['model_name']}: test acc={test_acc:.4f}, F1={test_f1:.4f}, ROC-AUC={test_auc:.4f}",
        )


if __name__ == "__main__":
    main()