Commit ·
0fe8a5f
1
Parent(s): 2660c6c
Architecture sweep: GPU affinity, arch search space, train.py overrides
Browse files- train.py: add --d-model, --n-layers, --n-heads, --d-ff, --lr,
--weight-decay, --warmup-steps flags to override named variants
- sweep.py: add --n-gpus (round-robin GPU pinning via CUDA_VISIBLE_DEVICES)
and --total-steps for architecture/pretrain sweeps
- pawn/sweep.py: new suggest_architecture() search space (150M-500M range),
AdapterObjective pins trials to GPUs, handles pretrain/architecture modes
- pawn/sweep.py +47 -5
- scripts/sweep.py +19 -6
- scripts/train.py +25 -0
pawn/sweep.py
CHANGED
|
@@ -20,6 +20,7 @@ from __future__ import annotations
|
|
| 20 |
|
| 21 |
import argparse
|
| 22 |
import json
|
|
|
|
| 23 |
import subprocess
|
| 24 |
import sys
|
| 25 |
from pathlib import Path
|
|
@@ -110,7 +111,7 @@ def suggest_tiny(trial: "optuna.Trial") -> dict:
|
|
| 110 |
|
| 111 |
|
| 112 |
def suggest_pretrain(trial: "optuna.Trial") -> dict:
|
| 113 |
-
"""Pretraining hyperparameters."""
|
| 114 |
return {
|
| 115 |
"lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
|
| 116 |
"batch_size": trial.suggest_categorical("batch_size", [128, 256, 512]),
|
|
@@ -120,6 +121,29 @@ def suggest_pretrain(trial: "optuna.Trial") -> dict:
|
|
| 120 |
}
|
| 121 |
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
SUGGEST_FNS = {
|
| 124 |
"lora": suggest_lora,
|
| 125 |
"bottleneck": suggest_bottleneck,
|
|
@@ -127,6 +151,7 @@ SUGGEST_FNS = {
|
|
| 127 |
"sparse": suggest_sparse,
|
| 128 |
"hybrid": suggest_hybrid,
|
| 129 |
"tiny": suggest_tiny,
|
|
|
|
| 130 |
"pretrain": suggest_pretrain,
|
| 131 |
}
|
| 132 |
|
|
@@ -138,6 +163,7 @@ ADAPTER_SCRIPTS = {
|
|
| 138 |
"hybrid": "scripts/train_hybrid.py",
|
| 139 |
"tiny": "scripts/train_tiny.py",
|
| 140 |
"pretrain": "scripts/train.py",
|
|
|
|
| 141 |
}
|
| 142 |
|
| 143 |
|
|
@@ -208,6 +234,7 @@ class AdapterObjective:
|
|
| 208 |
device: str = "cuda",
|
| 209 |
output_base: str = "sweeps",
|
| 210 |
epochs: int = 50,
|
|
|
|
| 211 |
extra_args: list[str] | None = None,
|
| 212 |
):
|
| 213 |
self.adapter_type = adapter_type
|
|
@@ -217,6 +244,7 @@ class AdapterObjective:
|
|
| 217 |
self.output_base = Path(output_base) / adapter_type
|
| 218 |
self.output_base.mkdir(parents=True, exist_ok=True)
|
| 219 |
self.epochs = epochs
|
|
|
|
| 220 |
self.extra_args = extra_args or []
|
| 221 |
self.script = ADAPTER_SCRIPTS[adapter_type]
|
| 222 |
|
|
@@ -229,13 +257,20 @@ class AdapterObjective:
|
|
| 229 |
# Build command
|
| 230 |
cmd = [sys.executable, self.script]
|
| 231 |
|
| 232 |
-
# Fixed args
|
| 233 |
-
if self.adapter_type != "pretrain":
|
| 234 |
cmd.extend(["--checkpoint", self.checkpoint])
|
| 235 |
cmd.extend(["--pgn", self.pgn])
|
| 236 |
cmd.extend(["--device", self.device])
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
cmd.extend(["--epochs", str(self.epochs)])
|
| 240 |
|
| 241 |
# Suggested hyperparameters
|
|
@@ -244,11 +279,18 @@ class AdapterObjective:
|
|
| 244 |
# Extra user-provided args
|
| 245 |
cmd.extend(self.extra_args)
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
# Run training
|
| 248 |
result = subprocess.run(
|
| 249 |
cmd,
|
| 250 |
capture_output=True,
|
| 251 |
text=True,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
if result.returncode != 0:
|
|
|
|
| 20 |
|
| 21 |
import argparse
|
| 22 |
import json
|
| 23 |
+
import os
|
| 24 |
import subprocess
|
| 25 |
import sys
|
| 26 |
from pathlib import Path
|
|
|
|
| 111 |
|
| 112 |
|
| 113 |
def suggest_pretrain(trial: "optuna.Trial") -> dict:
|
| 114 |
+
"""Pretraining hyperparameters (fixed architecture, tune training)."""
|
| 115 |
return {
|
| 116 |
"lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
|
| 117 |
"batch_size": trial.suggest_categorical("batch_size", [128, 256, 512]),
|
|
|
|
| 121 |
}
|
| 122 |
|
| 123 |
|
| 124 |
+
def suggest_architecture(trial: "optuna.Trial") -> dict:
|
| 125 |
+
"""Architecture search space for pretraining.
|
| 126 |
+
|
| 127 |
+
Explores model size, depth/width tradeoff, and training hyperparameters.
|
| 128 |
+
Target budget: 150M-500M parameters on 80GB GPUs.
|
| 129 |
+
"""
|
| 130 |
+
d_model = trial.suggest_categorical("d_model", [512, 640, 768, 896, 1024, 1280])
|
| 131 |
+
n_layers = trial.suggest_int("n_layers", 8, 24, step=2)
|
| 132 |
+
n_heads = trial.suggest_categorical("n_heads", [8, 16])
|
| 133 |
+
d_ff_mult = trial.suggest_categorical("d_ff_mult", [3, 4, 5])
|
| 134 |
+
|
| 135 |
+
return {
|
| 136 |
+
"d_model": d_model,
|
| 137 |
+
"n_layers": n_layers,
|
| 138 |
+
"n_heads": n_heads,
|
| 139 |
+
"d_ff": d_model * d_ff_mult,
|
| 140 |
+
"lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
|
| 141 |
+
"batch_size": trial.suggest_categorical("batch_size", [128, 256]),
|
| 142 |
+
"weight_decay": trial.suggest_float("weight_decay", 0.0, 0.1),
|
| 143 |
+
"warmup_steps": trial.suggest_int("warmup_steps", 500, 3000, step=500),
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
|
| 147 |
SUGGEST_FNS = {
|
| 148 |
"lora": suggest_lora,
|
| 149 |
"bottleneck": suggest_bottleneck,
|
|
|
|
| 151 |
"sparse": suggest_sparse,
|
| 152 |
"hybrid": suggest_hybrid,
|
| 153 |
"tiny": suggest_tiny,
|
| 154 |
+
"architecture": suggest_architecture,
|
| 155 |
"pretrain": suggest_pretrain,
|
| 156 |
}
|
| 157 |
|
|
|
|
| 163 |
"hybrid": "scripts/train_hybrid.py",
|
| 164 |
"tiny": "scripts/train_tiny.py",
|
| 165 |
"pretrain": "scripts/train.py",
|
| 166 |
+
"architecture": "scripts/train.py",
|
| 167 |
}
|
| 168 |
|
| 169 |
|
|
|
|
| 234 |
device: str = "cuda",
|
| 235 |
output_base: str = "sweeps",
|
| 236 |
epochs: int = 50,
|
| 237 |
+
n_gpus: int = 1,
|
| 238 |
extra_args: list[str] | None = None,
|
| 239 |
):
|
| 240 |
self.adapter_type = adapter_type
|
|
|
|
| 244 |
self.output_base = Path(output_base) / adapter_type
|
| 245 |
self.output_base.mkdir(parents=True, exist_ok=True)
|
| 246 |
self.epochs = epochs
|
| 247 |
+
self.n_gpus = n_gpus
|
| 248 |
self.extra_args = extra_args or []
|
| 249 |
self.script = ADAPTER_SCRIPTS[adapter_type]
|
| 250 |
|
|
|
|
| 257 |
# Build command
|
| 258 |
cmd = [sys.executable, self.script]
|
| 259 |
|
| 260 |
+
# Fixed args — architecture and pretrain sweeps use train.py directly
|
| 261 |
+
if self.adapter_type not in ("pretrain", "architecture"):
|
| 262 |
cmd.extend(["--checkpoint", self.checkpoint])
|
| 263 |
cmd.extend(["--pgn", self.pgn])
|
| 264 |
cmd.extend(["--device", self.device])
|
| 265 |
+
|
| 266 |
+
# output-dir for adapters, log-dir for pretraining
|
| 267 |
+
if self.adapter_type in ("pretrain", "architecture"):
|
| 268 |
+
cmd.extend(["--log-dir", str(trial_dir)])
|
| 269 |
+
cmd.extend(["--local-checkpoints"])
|
| 270 |
+
else:
|
| 271 |
+
cmd.extend(["--output-dir", str(trial_dir)])
|
| 272 |
+
|
| 273 |
+
if "epochs" not in params and "total_steps" not in params:
|
| 274 |
cmd.extend(["--epochs", str(self.epochs)])
|
| 275 |
|
| 276 |
# Suggested hyperparameters
|
|
|
|
| 279 |
# Extra user-provided args
|
| 280 |
cmd.extend(self.extra_args)
|
| 281 |
|
| 282 |
+
# GPU affinity: pin trial to GPU (trial.number % n_gpus)
|
| 283 |
+
env = os.environ.copy()
|
| 284 |
+
if self.n_gpus > 1:
|
| 285 |
+
gpu_id = trial.number % self.n_gpus
|
| 286 |
+
env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
|
| 287 |
+
|
| 288 |
# Run training
|
| 289 |
result = subprocess.run(
|
| 290 |
cmd,
|
| 291 |
capture_output=True,
|
| 292 |
text=True,
|
| 293 |
+
env=env,
|
| 294 |
)
|
| 295 |
|
| 296 |
if result.returncode != 0:
|
scripts/sweep.py
CHANGED
|
@@ -56,9 +56,13 @@ def main():
|
|
| 56 |
p.add_argument("--n-trials", type=int, default=30,
|
| 57 |
help="Number of trials to run")
|
| 58 |
p.add_argument("--n-jobs", type=int, default=1,
|
| 59 |
-
help="Parallel trials")
|
|
|
|
|
|
|
| 60 |
p.add_argument("--epochs", type=int, default=30,
|
| 61 |
-
help="Max epochs per trial")
|
|
|
|
|
|
|
| 62 |
p.add_argument("--device", type=str, default="cuda")
|
| 63 |
p.add_argument("--pruner", type=str, default="hyperband",
|
| 64 |
choices=["hyperband", "median", "none"])
|
|
@@ -69,7 +73,7 @@ def main():
|
|
| 69 |
|
| 70 |
args = p.parse_args()
|
| 71 |
|
| 72 |
-
if args.adapter != "pretrain" and not args.pgn:
|
| 73 |
p.error("--pgn is required for adapter sweeps")
|
| 74 |
|
| 75 |
study_name = args.study_name or args.adapter
|
|
@@ -77,8 +81,11 @@ def main():
|
|
| 77 |
|
| 78 |
print(f"=== PAWN Hyperparameter Sweep ===")
|
| 79 |
print(f"Adapter: {args.adapter}")
|
| 80 |
-
print(f"Trials: {args.n_trials} (parallel: {args.n_jobs})")
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
| 82 |
print(f"Pruner: {args.pruner}")
|
| 83 |
print(f"Storage: {db_path}")
|
| 84 |
print(f"Dashboard: uv run optuna-dashboard {db_path}")
|
|
@@ -90,6 +97,11 @@ def main():
|
|
| 90 |
pruner=args.pruner,
|
| 91 |
)
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
objective = AdapterObjective(
|
| 94 |
adapter_type=args.adapter,
|
| 95 |
checkpoint=args.checkpoint,
|
|
@@ -97,7 +109,8 @@ def main():
|
|
| 97 |
device=args.device,
|
| 98 |
output_base=args.output_dir,
|
| 99 |
epochs=args.epochs,
|
| 100 |
-
|
|
|
|
| 101 |
)
|
| 102 |
|
| 103 |
study.optimize(
|
|
|
|
| 56 |
p.add_argument("--n-trials", type=int, default=30,
|
| 57 |
help="Number of trials to run")
|
| 58 |
p.add_argument("--n-jobs", type=int, default=1,
|
| 59 |
+
help="Parallel trials (match --n-gpus for multi-GPU)")
|
| 60 |
+
p.add_argument("--n-gpus", type=int, default=1,
|
| 61 |
+
help="Number of GPUs. Trials are pinned to GPUs round-robin.")
|
| 62 |
p.add_argument("--epochs", type=int, default=30,
|
| 63 |
+
help="Max epochs per trial (adapter sweeps)")
|
| 64 |
+
p.add_argument("--total-steps", type=int, default=20000,
|
| 65 |
+
help="Total steps per trial (architecture/pretrain sweeps)")
|
| 66 |
p.add_argument("--device", type=str, default="cuda")
|
| 67 |
p.add_argument("--pruner", type=str, default="hyperband",
|
| 68 |
choices=["hyperband", "median", "none"])
|
|
|
|
| 73 |
|
| 74 |
args = p.parse_args()
|
| 75 |
|
| 76 |
+
if args.adapter not in ("pretrain", "architecture") and not args.pgn:
|
| 77 |
p.error("--pgn is required for adapter sweeps")
|
| 78 |
|
| 79 |
study_name = args.study_name or args.adapter
|
|
|
|
| 81 |
|
| 82 |
print(f"=== PAWN Hyperparameter Sweep ===")
|
| 83 |
print(f"Adapter: {args.adapter}")
|
| 84 |
+
print(f"Trials: {args.n_trials} (parallel: {args.n_jobs}, GPUs: {args.n_gpus})")
|
| 85 |
+
if args.adapter in ("pretrain", "architecture"):
|
| 86 |
+
print(f"Steps/trial: {args.total_steps}")
|
| 87 |
+
else:
|
| 88 |
+
print(f"Epochs/trial: {args.epochs}")
|
| 89 |
print(f"Pruner: {args.pruner}")
|
| 90 |
print(f"Storage: {db_path}")
|
| 91 |
print(f"Dashboard: uv run optuna-dashboard {db_path}")
|
|
|
|
| 97 |
pruner=args.pruner,
|
| 98 |
)
|
| 99 |
|
| 100 |
+
# For architecture/pretrain sweeps, pass --total-steps via extra args
|
| 101 |
+
extra = list(args.extra_args)
|
| 102 |
+
if args.adapter in ("pretrain", "architecture"):
|
| 103 |
+
extra.extend(["--total-steps", str(args.total_steps)])
|
| 104 |
+
|
| 105 |
objective = AdapterObjective(
|
| 106 |
adapter_type=args.adapter,
|
| 107 |
checkpoint=args.checkpoint,
|
|
|
|
| 109 |
device=args.device,
|
| 110 |
output_base=args.output_dir,
|
| 111 |
epochs=args.epochs,
|
| 112 |
+
n_gpus=args.n_gpus,
|
| 113 |
+
extra_args=extra,
|
| 114 |
)
|
| 115 |
|
| 116 |
study.optimize(
|
scripts/train.py
CHANGED
|
@@ -31,6 +31,15 @@ def parse_args():
|
|
| 31 |
parser.add_argument("--discard-ply-limit", action="store_true",
|
| 32 |
help="Only train on games that ended naturally (no ply limit truncation)")
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
ckpt_group = parser.add_mutually_exclusive_group(required=True)
|
| 35 |
ckpt_group.add_argument("--hf-repo", type=str, default=None,
|
| 36 |
help="Push checkpoints to this HuggingFace repo (requires HF_TOKEN)")
|
|
@@ -76,6 +85,22 @@ def main():
|
|
| 76 |
if args.discard_ply_limit:
|
| 77 |
train_cfg.discard_ply_limit = True
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
print(f"Model config: {model_cfg}")
|
| 80 |
print(f"Training config: {train_cfg}")
|
| 81 |
|
|
|
|
| 31 |
parser.add_argument("--discard-ply-limit", action="store_true",
|
| 32 |
help="Only train on games that ended naturally (no ply limit truncation)")
|
| 33 |
|
| 34 |
+
# Architecture overrides (for sweeps — override the named variant)
|
| 35 |
+
parser.add_argument("--d-model", type=int, default=None, help="Override d_model")
|
| 36 |
+
parser.add_argument("--n-layers", type=int, default=None, help="Override n_layers")
|
| 37 |
+
parser.add_argument("--n-heads", type=int, default=None, help="Override n_heads")
|
| 38 |
+
parser.add_argument("--d-ff", type=int, default=None, help="Override d_ff")
|
| 39 |
+
parser.add_argument("--lr", type=float, default=None, help="Override learning rate")
|
| 40 |
+
parser.add_argument("--weight-decay", type=float, default=None, help="Override weight decay")
|
| 41 |
+
parser.add_argument("--warmup-steps", type=int, default=None, help="Override warmup steps")
|
| 42 |
+
|
| 43 |
ckpt_group = parser.add_mutually_exclusive_group(required=True)
|
| 44 |
ckpt_group.add_argument("--hf-repo", type=str, default=None,
|
| 45 |
help="Push checkpoints to this HuggingFace repo (requires HF_TOKEN)")
|
|
|
|
| 85 |
if args.discard_ply_limit:
|
| 86 |
train_cfg.discard_ply_limit = True
|
| 87 |
|
| 88 |
+
# Architecture overrides
|
| 89 |
+
if args.d_model is not None:
|
| 90 |
+
model_cfg.d_model = args.d_model
|
| 91 |
+
if args.n_layers is not None:
|
| 92 |
+
model_cfg.n_layers = args.n_layers
|
| 93 |
+
if args.n_heads is not None:
|
| 94 |
+
model_cfg.n_heads = args.n_heads
|
| 95 |
+
if args.d_ff is not None:
|
| 96 |
+
model_cfg.d_ff = args.d_ff
|
| 97 |
+
if args.lr is not None:
|
| 98 |
+
train_cfg.lr = args.lr
|
| 99 |
+
if args.weight_decay is not None:
|
| 100 |
+
train_cfg.weight_decay = args.weight_decay
|
| 101 |
+
if args.warmup_steps is not None:
|
| 102 |
+
train_cfg.warmup_steps = args.warmup_steps
|
| 103 |
+
|
| 104 |
print(f"Model config: {model_cfg}")
|
| 105 |
print(f"Training config: {train_cfg}")
|
| 106 |
|