File size: 6,100 Bytes
8bbb872
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
XGBoost Hyperparameter Sweep β€” ClearML HPO Controller
======================================================
Run this script once to launch the full sweep.  It clones the base
XGBoost training Task (identified by BASE_TASK_ID) and dispatches one
clone per trial, each with a different hyperparameter configuration.

Prerequisites
-------------
1.  Run train.py at least once so a completed Task exists in ClearML.
    Copy its Task-ID into BASE_TASK_ID below.
2.  Have at least one `clearml-agent` listening on the "default" queue,
    OR set EXECUTE_LOCALLY = True to run trials in-process (slower but
    needs no agent).

Usage
-----
    python models/xgboost/sweep.py
"""

import time
from clearml import Task
from clearml.automation import (
    HyperParameterOptimizer,
    UniformParameterRange,
    DiscreteParameterRange,
)

try:
    # Prefer the Optuna-backed search strategy when the optional `optuna`
    # extra is installed (TPE sampling generally outperforms random/grid).
    from clearml.automation.optuna import OptimizerOptuna  # preferred
    OPTIMIZER_CLASS = OptimizerOptuna
except ImportError:
    # Fall back to the BOHB optimizer bundled with clearml itself, so the
    # sweep still runs without the optuna dependency.
    from clearml.automation import OptimizerBOHB           # fallback
    OPTIMIZER_CLASS = OptimizerBOHB

# ── Configuration ─────────────────────────────────────────────────────────────

# Paste the Task-ID of a successfully completed XGBoost training run here.
# Find it in the ClearML UI: Projects → Focus Guard → XGBoost Model Training
# → right-click the task → Copy ID.
BASE_TASK_ID = "0f42afbb3396400babc7a1a0728e7326"

# Set True to run trials in the same process (no agent needed, but serial).
# Set False to dispatch to clearml-agents on the "default" queue (parallel).
EXECUTE_LOCALLY = True

# Total budget and concurrency
MAX_TRIALS     = 40   # total number of hyperparameter configurations to try
MAX_CONCURRENT = 4    # how many trials to run in parallel (match agent count)

# ── Search space ──────────────────────────────────────────────────────────────
# These names must match the keys in CFG inside train.py exactly, because
# task.connect(CFG) registers them under those names.
# NOTE(review): the "General/" prefix assumes train.py connects its config
# under the default section name — verify against the base task's UI params.

SEARCH_SPACE = [
    DiscreteParameterRange("General/n_estimators",    values=[100, 200, 400, 600]),
    DiscreteParameterRange("General/max_depth",       values=[3, 4, 5, 6, 8]),
    UniformParameterRange( "General/learning_rate",   min_value=0.01, max_value=0.30),
    UniformParameterRange( "General/subsample",       min_value=0.50, max_value=1.00),
    UniformParameterRange( "General/colsample_bytree",min_value=0.50, max_value=1.00),
    UniformParameterRange( "General/reg_alpha",       min_value=0.00, max_value=2.00),
    UniformParameterRange( "General/reg_lambda",      min_value=0.50, max_value=5.00),
]

# Objective: the per-epoch validation logloss scalar curve ("Loss"/"Val"),
# chosen because the Optuna integration struggles to use single-value
# metrics natively.
OBJECTIVE_METRIC_TITLE  = "Loss"
OBJECTIVE_METRIC_SERIES = "Val"
OBJECTIVE_SIGN          = "min"   # minimize logloss

# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Launch the full HPO sweep: register the controller task, configure the
    optimizer over SEARCH_SPACE, run until the trial budget is exhausted, then
    print the top-5 trials by validation loss.
    """
    # Register this controller as its own ClearML Task so it is tracked too.
    controller_task = Task.init(
        project_name="FocusGuards Large Group Project",
        task_name="XGBoost HPO Sweep Controller",
        task_type=Task.TaskTypes.optimizer,
        tags=["sweep", "xgboost", "hpo"],
    )

    optimizer = HyperParameterOptimizer(
        # Each trial is a clone of this completed training task with the
        # hyperparameters below overridden.
        base_task_id=BASE_TASK_ID,
        hyper_parameters=SEARCH_SPACE,
        objective_metric_title=OBJECTIVE_METRIC_TITLE,
        objective_metric_series=OBJECTIVE_METRIC_SERIES,
        objective_metric_sign=OBJECTIVE_SIGN,
        optimizer_class=OPTIMIZER_CLASS,
        # Execution
        execution_queue="default",
        max_number_of_concurrent_tasks=MAX_CONCURRENT,
        total_max_jobs=MAX_TRIALS,
        # Early stop a trial if validation loss hasn't improved in 10 rounds
        # (relies on the per-round "Loss/Val" scalars logged in train.py)
        min_iteration_per_job=10,
        max_iteration_per_job=600,
    )

    # start_locally() runs trials serially in this process; start() enqueues
    # clones for clearml-agents to pick up in parallel.
    if EXECUTE_LOCALLY:
        optimizer.start_locally(job_complete_callback=_on_trial_done)
    else:
        optimizer.start(job_complete_callback=_on_trial_done)

    print("[SWEEP] Optimizer started.  Waiting for trials to complete …")
    print(f"[SWEEP] Budget: {MAX_TRIALS} trials, {MAX_CONCURRENT} concurrent")

    # Poll until the budget is exhausted
    optimizer.wait()
    optimizer.stop()

    # ── Print best result ─────────────────────────────────────────────────────
    top_k = optimizer.get_top_experiments(top_k=5)
    print("\n[SWEEP] ── Top-5 trials by Validation Loss ──────────────────────────────")
    for rank, task in enumerate(top_k, 1):
        # Recover the trial's hyperparameter config: keep only the
        # "General/..." keys and strip the section prefix.
        params = task.get_parameters()
        cfg    = {k.split("/")[-1]: v for k, v in params.items()
                  if k.startswith("General/")}
        # Pull the last reported value of each metric; missing metrics
        # degrade gracefully to inf / 0.0 rather than raising KeyError.
        metrics = task.get_last_scalar_metrics()
        val_loss = metrics.get(OBJECTIVE_METRIC_TITLE, {}).get(OBJECTIVE_METRIC_SERIES, {}).get("last", float('inf'))
        val_acc = metrics.get("Summary", {}).get("val_accuracy", {}).get("last", 0.0)
        val_f1 = metrics.get("Summary", {}).get("val_f1", {}).get("last", 0.0)
        
        print(f"  #{rank}  Val_Loss={val_loss:.4f}  Val_Acc={val_acc:.2%}  Val_F1={val_f1:.4f}  task_id={task.id}")
        print(f"       {cfg}")

    controller_task.close()


def _on_trial_done(job_id, objective, objective_iteration=None,
                   job_parameters=None, top_performance_job_id=None):
    """Callback fired each time a trial finishes.

    ClearML's ``HyperParameterOptimizer`` invokes ``job_complete_callback``
    with five positional arguments: ``(job_id, objective_value,
    objective_iteration, job_parameters, top_performance_job_id)``.  The
    previous two-parameter signature therefore raised ``TypeError`` on every
    trial completion.  The extra parameters default to ``None`` so any
    existing two-argument call sites keep working.

    Parameters
    ----------
    job_id : str
        ClearML Task-ID of the finished trial.
    objective : float or None
        Last reported objective value; ``None`` when the trial never
        reported the objective metric.
    objective_iteration, job_parameters, top_performance_job_id
        Extra context supplied by ClearML; unused here.
    """
    # Guard the format spec: objective can legitimately be None.
    score = "n/a" if objective is None else f"{objective:.4f}"
    print(f"[SWEEP] Trial {job_id} finished  ->  {OBJECTIVE_METRIC_TITLE}={score}")


# Script entry point: launch the sweep when run directly.
if __name__ == "__main__":
    main()