"""
XGBoost Hyperparameter Sweep — ClearML HPO Controller
=====================================================

Run this script once to launch the full sweep. It clones the base
XGBoost training Task (identified by BASE_TASK_ID) and dispatches one
clone per trial, each with a different hyperparameter configuration.

Prerequisites
-------------
1. Run train.py at least once so a completed Task exists in ClearML.
   Copy its Task-ID into BASE_TASK_ID below.
2. Have at least one `clearml-agent` listening on the "default" queue,
   OR set EXECUTE_LOCALLY = True to run trials in-process (slower but
   needs no agent).

Usage
-----
    python models/xgboost/sweep.py
"""
import time
from clearml import Task
from clearml.automation import (
HyperParameterOptimizer,
UniformParameterRange,
DiscreteParameterRange,
)
try:
from clearml.automation.optuna import OptimizerOptuna # preferred
OPTIMIZER_CLASS = OptimizerOptuna
except ImportError:
from clearml.automation import OptimizerBOHB # fallback
OPTIMIZER_CLASS = OptimizerBOHB
# ββ Configuration βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Paste the Task-ID of a successfully completed XGBoost training run here.
# Find it in the ClearML UI: Projects β Focus Guard β XGBoost Model Training
# β right-click the task β Copy ID.
BASE_TASK_ID = "0f42afbb3396400babc7a1a0728e7326"
# Set True to run trials in the same process (no agent needed, but serial).
# Set False to dispatch to clearml-agents on the "default" queue (parallel).
EXECUTE_LOCALLY = True
# Total budget and concurrency
MAX_TRIALS = 40 # total number of hyperparameter configurations to try
MAX_CONCURRENT = 4 # how many trials to run in parallel (match agent count)
# ββ Search space ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# These names must match the keys in CFG inside train.py exactly, because
# task.connect(CFG) registers them under those names.
SEARCH_SPACE = [
DiscreteParameterRange("General/n_estimators", values=[100, 200, 400, 600]),
DiscreteParameterRange("General/max_depth", values=[3, 4, 5, 6, 8]),
UniformParameterRange( "General/learning_rate", min_value=0.01, max_value=0.30),
UniformParameterRange( "General/subsample", min_value=0.50, max_value=1.00),
UniformParameterRange( "General/colsample_bytree",min_value=0.50, max_value=1.00),
UniformParameterRange( "General/reg_alpha", min_value=0.00, max_value=2.00),
UniformParameterRange( "General/reg_lambda", min_value=0.50, max_value=5.00),
]
# Switch back to the per-epoch validation logloss scalar curve since Optuna
# integration struggles to use single-value metrics natively.
OBJECTIVE_METRIC_TITLE = "Loss"
OBJECTIVE_METRIC_SERIES = "Val"
OBJECTIVE_SIGN = "min" # minimize logloss
# ── Main ───────────────────────────────────────────────────────────────────────
def main():
# Register this controller as its own ClearML Task so it is tracked too.
controller_task = Task.init(
project_name="FocusGuards Large Group Project",
task_name="XGBoost HPO Sweep Controller",
task_type=Task.TaskTypes.optimizer,
tags=["sweep", "xgboost", "hpo"],
)
optimizer = HyperParameterOptimizer(
base_task_id=BASE_TASK_ID,
hyper_parameters=SEARCH_SPACE,
objective_metric_title=OBJECTIVE_METRIC_TITLE,
objective_metric_series=OBJECTIVE_METRIC_SERIES,
objective_metric_sign=OBJECTIVE_SIGN,
optimizer_class=OPTIMIZER_CLASS,
# Execution
execution_queue="default",
max_number_of_concurrent_tasks=MAX_CONCURRENT,
total_max_jobs=MAX_TRIALS,
# Early stop a trial if validation loss hasn't improved in 10 rounds
# (relies on the per-round "Loss/Val" scalars logged in train.py)
min_iteration_per_job=10,
max_iteration_per_job=600,
)
if EXECUTE_LOCALLY:
optimizer.start_locally(job_complete_callback=_on_trial_done)
else:
optimizer.start(job_complete_callback=_on_trial_done)
print("[SWEEP] Optimizer started. Waiting for trials to complete β¦")
print(f"[SWEEP] Budget: {MAX_TRIALS} trials, {MAX_CONCURRENT} concurrent")
# Poll until the budget is exhausted
optimizer.wait()
optimizer.stop()
# ββ Print best result βββββββββββββββββββββββββββββββββββββββββββββββββββββ
top_k = optimizer.get_top_experiments(top_k=5)
print("\n[SWEEP] ββ Top-5 trials by Validation Loss ββββββββββββββββββββββββββββββ")
for rank, task in enumerate(top_k, 1):
params = task.get_parameters()
cfg = {k.split("/")[-1]: v for k, v in params.items()
if k.startswith("General/")}
metrics = task.get_last_scalar_metrics()
val_loss = metrics.get(OBJECTIVE_METRIC_TITLE, {}).get(OBJECTIVE_METRIC_SERIES, {}).get("last", float('inf'))
val_acc = metrics.get("Summary", {}).get("val_accuracy", {}).get("last", 0.0)
val_f1 = metrics.get("Summary", {}).get("val_f1", {}).get("last", 0.0)
print(f" #{rank} Val_Loss={val_loss:.4f} Val_Acc={val_acc:.2%} Val_F1={val_f1:.4f} task_id={task.id}")
print(f" {cfg}")
controller_task.close()
def _on_trial_done(job_id: str, objective: float):
"""Callback fired each time a trial finishes."""
print(f"[SWEEP] Trial {job_id} finished β {OBJECTIVE_METRIC_TITLE}={objective:.4f}")
if __name__ == "__main__":
main()