"""Train an XGBoost classifier on the prepared feature splits, evaluate it on
the held-out test set, and log metrics and artifacts locally and (optionally)
to ClearML. The JSON log mirrors the MLP training script's schema."""

import csv
import json
import os
import random
import sys

import numpy as np
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
)

from data_preparation.prepare_dataset import get_numpy_splits, SELECTED_FEATURES
from models.xgboost.config import XGB_BASE_PARAMS, build_xgb_classifier

_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))


def _load_cfg():
    """Load training config from the project config package, falling back to
    hard-coded defaults if it is missing or incomplete."""
    try:
        from config import get

        xgb = get("xgboost") or {}
        data = get("data") or {}
        ratios = data.get("split_ratios", [0.7, 0.15, 0.15])
        return {
            "model_name": get("mlp.model_name") or "face_orientation",
            "seed": get("mlp.seed") or 42,
            "split_ratios": tuple(ratios),
            "scale": False,
            "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
            "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
            "xgb_params": dict(XGB_BASE_PARAMS),
        }
    except Exception:
        # Config package unavailable: use the same defaults, hard-coded.
        return {
            "model_name": "face_orientation",
            "seed": 42,
            "split_ratios": (0.7, 0.15, 0.15),
            "scale": False,
            "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
            "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
            "xgb_params": dict(XGB_BASE_PARAMS),
        }


CFG = _load_cfg()

USE_CLEARML = os.environ.get("USE_CLEARML", "0") == "1" or bool(os.environ.get("CLEARML_TASK_ID"))
CLEARML_QUEUE = os.environ.get("CLEARML_QUEUE", "")

task = None
if USE_CLEARML:
    try:
        from clearml import Task
        from config import CLEARML_PROJECT_NAME, flatten_for_clearml

        task = Task.init(
            project_name=CLEARML_PROJECT_NAME,
            task_name="XGBoost Model Training",
            tags=["training", "xgboost"],
        )
        from config.clearml_enrich import enrich_task, upload_repro_artifacts

        enrich_task(task, role="train_xgboost")
        flat = flatten_for_clearml()
        for k, v in CFG.get("xgb_params", {}).items():
            flat[f"xgb_params/{k}"] = v
        flat["model_name"] = CFG["model_name"]
        flat["seed"] = CFG["seed"]
        flat["split_ratios"] = str(CFG["split_ratios"])
        task.connect(flat)
        upload_repro_artifacts(task)
        if CLEARML_QUEUE:
            print(f"[ClearML] Enqueuing to queue '{CLEARML_QUEUE}'.")
            task.execute_remotely(queue_name=CLEARML_QUEUE)
            sys.exit(0)
    except ImportError:
        task = None
        USE_CLEARML = False


def set_seed(seed: int):
    # Seed the python and numpy RNGs; the XGBoost seed is passed to
    # build_xgb_classifier separately.
    random.seed(seed)
    np.random.seed(seed)


def main():
    set_seed(CFG["seed"])
    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # ── Data ──────────────────────────────────────────────────────
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]

    # ── Model ─────────────────────────────────────────────────────
    model = build_xgb_classifier(CFG["seed"], verbosity=1, early_stopping_rounds=30)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    best_it = getattr(model, "best_iteration", None)
    print(f"[TRAIN] Best iteration: {best_it} / {CFG['xgb_params']['n_estimators']}")

    # ── Evaluation ────────────────────────────────────────────────
    evals = model.evals_result()
    eval_metric_name = CFG["xgb_params"]["eval_metric"]
    train_losses = evals["validation_0"][eval_metric_name]
    val_losses = evals["validation_1"][eval_metric_name]

    # Test metrics
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc = float(np.mean(test_preds == y_test))
    test_f1 = float(f1_score(y_test, test_preds, average="weighted"))
    if num_classes > 2:
        test_auc = float(
            roc_auc_score(y_test, test_probs, multi_class="ovr", average="weighted")
        )
    else:
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))
    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1: {test_f1:.4f}")
    print(f"[TEST] ROC-AUC: {test_auc:.4f}")

    # Dataset stats
    dataset_stats = {
        "train_size": len(y_train),
        "val_size": len(y_val),
        "test_size": len(y_test),
        "train_class_counts": np.bincount(y_train.astype(int), minlength=num_classes).tolist(),
        "val_class_counts": np.bincount(y_val.astype(int), minlength=num_classes).tolist(),
        "test_class_counts": np.bincount(y_test.astype(int), minlength=num_classes).tolist(),
    }

    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    cm = confusion_matrix(y_test, test_preds)
    y_test_i = y_test.astype(int)

    # Per-sample predictions with class probabilities
    pred_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_predictions.csv")
    with open(pred_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["y_true", "y_pred"] + [f"prob_{j}" for j in range(num_classes)])
        for i in range(len(y_test_i)):
            w.writerow(
                [int(y_test_i[i]), int(test_preds[i])]
                + [float(x) for x in test_probs[i]]
            )

    summary_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_test_metrics_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": "xgboost",
                "model_name": CFG["model_name"],
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
                "confusion_matrix": cm.tolist(),
                "classification_report": classification_report(
                    y_test, test_preds, digits=4
                ),
            },
            f,
            indent=2,
        )

    feat_names = list(
        SELECTED_FEATURES.get(CFG["model_name"], SELECTED_FEATURES["face_orientation"])
    )
    imp_vals = model.feature_importances_
    imp_rows = [
        {"feature": feat_names[i], "importance": float(imp_vals[i])}
        for i in range(min(len(feat_names), len(imp_vals)))
    ]
    imp_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_feature_importance.json")
    with open(imp_path, "w", encoding="utf-8") as f:
        json.dump(imp_rows, f, indent=2)
    print(f"[LOG] Test predictions → {pred_path}")

    if task is not None:
        for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
            task.logger.report_scalar("Loss", "Train", tl, iteration=i + 1)
            task.logger.report_scalar("Loss", "Val", vl, iteration=i + 1)
        task.logger.report_single_value("test/accuracy", test_acc)
        task.logger.report_single_value("test/f1_weighted", test_f1)
        task.logger.report_single_value("test/roc_auc", test_auc)
        for key, val in dataset_stats.items():
            if isinstance(val, list):
                for i, v in enumerate(val):
                    task.logger.report_single_value(f"dataset/{key}/{i}", float(v))
            else:
                task.logger.report_single_value(f"dataset/{key}", float(val))
        prec, rec, f1_per_class, _ = precision_recall_fscore_support(
            y_test, test_preds, average=None, zero_division=0
        )
        for c in range(num_classes):
            task.logger.report_single_value(f"test/class_{c}_precision", float(prec[c]))
            task.logger.report_single_value(f"test/class_{c}_recall", float(rec[c]))
            task.logger.report_single_value(f"test/class_{c}_f1", float(f1_per_class[c]))

        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=(6, 5))
        ax.imshow(cm, cmap="Blues")
        ax.set_xticks(range(num_classes))
        ax.set_yticks(range(num_classes))
        ax.set_xticklabels([f"Class {i}" for i in range(num_classes)])
        ax.set_yticklabels([f"Class {i}" for i in range(num_classes)])
        for i in range(num_classes):
            for j in range(num_classes):
                ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="black")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title("Test set confusion matrix")
        fig.tight_layout()
        task.logger.report_matplotlib_figure(
            title="Confusion Matrix", series="test", figure=fig, iteration=0
        )
        plt.close(fig)

        if num_classes == 2:
            fpr, tpr, _ = roc_curve(y_test, test_probs[:, 1])
            fig_r, ax_r = plt.subplots(figsize=(6, 5))
            ax_r.plot(fpr, tpr, label=f"ROC-AUC = {test_auc:.4f}")
            ax_r.plot([0, 1], [0, 1], "k--", lw=1)
            ax_r.set_xlabel("False positive rate")
            ax_r.set_ylabel("True positive rate")
            ax_r.set_title("Test ROC (XGBoost)")
            ax_r.legend(loc="lower right")
            fig_r.tight_layout()
            task.logger.report_matplotlib_figure(
                title="ROC", series="test", figure=fig_r, iteration=0
            )
            plt.close(fig_r)
        task.logger.flush()

    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # ── Write JSON log (same schema as MLP) ───────────────────────
    # pandas-free tree/node count (trees_to_dataframe() needs pandas)
    booster = model.get_booster()
    tree_count = int(booster.num_boosted_rounds())
    node_count = int(sum(tree.count("\n") + 1 for tree in booster.get_dump()))
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        "param_count": node_count,
        "tree_count": tree_count,
        "xgb_params": CFG["xgb_params"],
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
        "dataset_stats": dataset_stats,
    }
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")
    with open(log_path, "w") as f:
        json.dump(history, f, indent=2)
    print(f"[LOG] Training history saved to: {log_path}")

    if task is not None:
        from clearml import OutputModel
        from config.clearml_enrich import attach_output_metrics, task_done_summary

        task.upload_artifact(name="xgboost_model", artifact_object=model_path)
        task.upload_artifact(name="training_log", artifact_object=log_path)
        task.upload_artifact(name="test_predictions", artifact_object=pred_path)
        task.upload_artifact(name="test_metrics_summary", artifact_object=summary_path)
        task.upload_artifact(name="feature_importance", artifact_object=imp_path)
        out_model = OutputModel(
            task=task, name=f"XGBoost_{CFG['model_name']}", framework="XGBoost"
        )
        out_model.update_weights(weights_filename=model_path, auto_delete_file=False)
        attach_output_metrics(
            out_model,
            {
                "test_accuracy": round(test_acc, 6),
                "test_f1_weighted": round(test_f1, 6),
                "test_roc_auc": round(test_auc, 6),
            },
        )
        task_done_summary(
            task,
            f"XGBoost {CFG['model_name']}: test acc={test_acc:.4f}, F1={test_f1:.4f}, ROC-AUC={test_auc:.4f}",
        )


if __name__ == "__main__":
    main()