"""
XGBoost Hyperparameter Sweep — ClearML HPO Controller
=====================================================

Run this script once to launch the full sweep. It clones the base
XGBoost training Task (identified by BASE_TASK_ID) and dispatches one
clone per trial, each with a different hyperparameter configuration.

Prerequisites
-------------
1. Run train.py at least once so a completed Task exists in ClearML.
   Copy its Task-ID into BASE_TASK_ID below.
2. Have at least one `clearml-agent` listening on the "default" queue,
   OR set EXECUTE_LOCALLY = True to run trials in-process (slower but
   needs no agent).

Usage
-----
    python models/xgboost/sweep.py
"""
import time
from clearml import Task
from clearml.automation import (
HyperParameterOptimizer,
UniformParameterRange,
DiscreteParameterRange,
)
try:
from clearml.automation.optuna import OptimizerOptuna # preferred
OPTIMIZER_CLASS = OptimizerOptuna
except ImportError:
from clearml.automation import OptimizerBOHB # fallback
OPTIMIZER_CLASS = OptimizerBOHB
# ββ Configuration βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Paste the Task-ID of a successfully completed XGBoost training run here.
# Find it in the ClearML UI: Projects β Focus Guard β XGBoost Model Training
# β right-click the task β Copy ID.
BASE_TASK_ID = "0f42afbb3396400babc7a1a0728e7326"
# Set True to run trials in the same process (no agent needed, but serial).
# Set False to dispatch to clearml-agents on the "default" queue (parallel).
EXECUTE_LOCALLY = True
# Total budget and concurrency
MAX_TRIALS = 40 # total number of hyperparameter configurations to try
MAX_CONCURRENT = 4 # how many trials to run in parallel (match agent count)
# ββ Search space ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# These names must match the keys in CFG inside train.py exactly, because
# task.connect(CFG) registers them under those names.
SEARCH_SPACE = [
DiscreteParameterRange("General/n_estimators", values=[100, 200, 400, 600]),
DiscreteParameterRange("General/max_depth", values=[3, 4, 5, 6, 8]),
UniformParameterRange( "General/learning_rate", min_value=0.01, max_value=0.30),
UniformParameterRange( "General/subsample", min_value=0.50, max_value=1.00),
UniformParameterRange( "General/colsample_bytree",min_value=0.50, max_value=1.00),
UniformParameterRange( "General/reg_alpha", min_value=0.00, max_value=2.00),
UniformParameterRange( "General/reg_lambda", min_value=0.50, max_value=5.00),
]
# Switch back to the per-epoch validation logloss scalar curve since Optuna
# integration struggles to use single-value metrics natively.
OBJECTIVE_METRIC_TITLE = "Loss"
OBJECTIVE_METRIC_SERIES = "Val"
OBJECTIVE_SIGN = "min" # minimize logloss
# ── Main ───────────────────────────────────────────────────────────────────────
def main():
# Register this controller as its own ClearML Task so it is tracked too.
controller_task = Task.init(
project_name="FocusGuards Large Group Project",
task_name="XGBoost HPO Sweep Controller",
task_type=Task.TaskTypes.optimizer,
tags=["sweep", "xgboost", "hpo"],
)
optimizer = HyperParameterOptimizer(
base_task_id=BASE_TASK_ID,
hyper_parameters=SEARCH_SPACE,
objective_metric_title=OBJECTIVE_METRIC_TITLE,
objective_metric_series=OBJECTIVE_METRIC_SERIES,
objective_metric_sign=OBJECTIVE_SIGN,
optimizer_class=OPTIMIZER_CLASS,
# Execution
execution_queue="default",
max_number_of_concurrent_tasks=MAX_CONCURRENT,
total_max_jobs=MAX_TRIALS,
# Early stop a trial if validation loss hasn't improved in 10 rounds
# (relies on the per-round "Loss/Val" scalars logged in train.py)
min_iteration_per_job=10,
max_iteration_per_job=600,
)
if EXECUTE_LOCALLY:
optimizer.start_locally(job_complete_callback=_on_trial_done)
else:
optimizer.start(job_complete_callback=_on_trial_done)
print("[SWEEP] Optimizer started. Waiting for trials to complete β¦")
print(f"[SWEEP] Budget: {MAX_TRIALS} trials, {MAX_CONCURRENT} concurrent")
# Poll until the budget is exhausted
optimizer.wait()
optimizer.stop()
# ββ Print best result βββββββββββββββββββββββββββββββββββββββββββββββββββββ
top_k = optimizer.get_top_experiments(top_k=5)
print("\n[SWEEP] ββ Top-5 trials by Validation Loss ββββββββββββββββββββββββββββββ")
for rank, task in enumerate(top_k, 1):
params = task.get_parameters()
cfg = {k.split("/")[-1]: v for k, v in params.items()
if k.startswith("General/")}
metrics = task.get_last_scalar_metrics()
val_loss = metrics.get(OBJECTIVE_METRIC_TITLE, {}).get(OBJECTIVE_METRIC_SERIES, {}).get("last", float('inf'))
val_acc = metrics.get("Summary", {}).get("val_accuracy", {}).get("last", 0.0)
val_f1 = metrics.get("Summary", {}).get("val_f1", {}).get("last", 0.0)
print(f" #{rank} Val_Loss={val_loss:.4f} Val_Acc={val_acc:.2%} Val_F1={val_f1:.4f} task_id={task.id}")
print(f" {cfg}")
controller_task.close()
def _on_trial_done(job_id: str, objective: float):
"""Callback fired each time a trial finishes."""
print(f"[SWEEP] Trial {job_id} finished β {OBJECTIVE_METRIC_TITLE}={objective:.4f}")
if __name__ == "__main__":
main()