thomas-schweich committed on
Commit
0fe8a5f
·
1 Parent(s): 2660c6c

Architecture sweep: GPU affinity, arch search space, train.py overrides

Browse files

- train.py: add --d-model, --n-layers, --n-heads, --d-ff, --lr,
--weight-decay, --warmup-steps flags to override named variants
- sweep.py: add --n-gpus (round-robin GPU pinning via CUDA_VISIBLE_DEVICES)
and --total-steps for architecture/pretrain sweeps
- pawn/sweep.py: new suggest_architecture() search space (150M-500M range),
AdapterObjective pins trials to GPUs, handles pretrain/architecture modes

Files changed (3) hide show
  1. pawn/sweep.py +47 -5
  2. scripts/sweep.py +19 -6
  3. scripts/train.py +25 -0
pawn/sweep.py CHANGED
@@ -20,6 +20,7 @@ from __future__ import annotations
20
 
21
  import argparse
22
  import json
 
23
  import subprocess
24
  import sys
25
  from pathlib import Path
@@ -110,7 +111,7 @@ def suggest_tiny(trial: "optuna.Trial") -> dict:
110
 
111
 
112
  def suggest_pretrain(trial: "optuna.Trial") -> dict:
113
- """Pretraining hyperparameters."""
114
  return {
115
  "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
116
  "batch_size": trial.suggest_categorical("batch_size", [128, 256, 512]),
@@ -120,6 +121,29 @@ def suggest_pretrain(trial: "optuna.Trial") -> dict:
120
  }
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  SUGGEST_FNS = {
124
  "lora": suggest_lora,
125
  "bottleneck": suggest_bottleneck,
@@ -127,6 +151,7 @@ SUGGEST_FNS = {
127
  "sparse": suggest_sparse,
128
  "hybrid": suggest_hybrid,
129
  "tiny": suggest_tiny,
 
130
  "pretrain": suggest_pretrain,
131
  }
132
 
@@ -138,6 +163,7 @@ ADAPTER_SCRIPTS = {
138
  "hybrid": "scripts/train_hybrid.py",
139
  "tiny": "scripts/train_tiny.py",
140
  "pretrain": "scripts/train.py",
 
141
  }
142
 
143
 
@@ -208,6 +234,7 @@ class AdapterObjective:
208
  device: str = "cuda",
209
  output_base: str = "sweeps",
210
  epochs: int = 50,
 
211
  extra_args: list[str] | None = None,
212
  ):
213
  self.adapter_type = adapter_type
@@ -217,6 +244,7 @@ class AdapterObjective:
217
  self.output_base = Path(output_base) / adapter_type
218
  self.output_base.mkdir(parents=True, exist_ok=True)
219
  self.epochs = epochs
 
220
  self.extra_args = extra_args or []
221
  self.script = ADAPTER_SCRIPTS[adapter_type]
222
 
@@ -229,13 +257,20 @@ class AdapterObjective:
229
  # Build command
230
  cmd = [sys.executable, self.script]
231
 
232
- # Fixed args
233
- if self.adapter_type != "pretrain":
234
  cmd.extend(["--checkpoint", self.checkpoint])
235
  cmd.extend(["--pgn", self.pgn])
236
  cmd.extend(["--device", self.device])
237
- cmd.extend(["--output-dir", str(trial_dir)])
238
- if "epochs" not in params:
 
 
 
 
 
 
 
239
  cmd.extend(["--epochs", str(self.epochs)])
240
 
241
  # Suggested hyperparameters
@@ -244,11 +279,18 @@ class AdapterObjective:
244
  # Extra user-provided args
245
  cmd.extend(self.extra_args)
246
 
 
 
 
 
 
 
247
  # Run training
248
  result = subprocess.run(
249
  cmd,
250
  capture_output=True,
251
  text=True,
 
252
  )
253
 
254
  if result.returncode != 0:
 
20
 
21
  import argparse
22
  import json
23
+ import os
24
  import subprocess
25
  import sys
26
  from pathlib import Path
 
111
 
112
 
113
  def suggest_pretrain(trial: "optuna.Trial") -> dict:
114
+ """Pretraining hyperparameters (fixed architecture, tune training)."""
115
  return {
116
  "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
117
  "batch_size": trial.suggest_categorical("batch_size", [128, 256, 512]),
 
121
  }
122
 
123
 
124
+ def suggest_architecture(trial: "optuna.Trial") -> dict:
125
+ """Architecture search space for pretraining.
126
+
127
+ Explores model size, depth/width tradeoff, and training hyperparameters.
128
+ Target budget: 150M-500M parameters on 80GB GPUs.
129
+ """
130
+ d_model = trial.suggest_categorical("d_model", [512, 640, 768, 896, 1024, 1280])
131
+ n_layers = trial.suggest_int("n_layers", 8, 24, step=2)
132
+ n_heads = trial.suggest_categorical("n_heads", [8, 16])
133
+ d_ff_mult = trial.suggest_categorical("d_ff_mult", [3, 4, 5])
134
+
135
+ return {
136
+ "d_model": d_model,
137
+ "n_layers": n_layers,
138
+ "n_heads": n_heads,
139
+ "d_ff": d_model * d_ff_mult,
140
+ "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
141
+ "batch_size": trial.suggest_categorical("batch_size", [128, 256]),
142
+ "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.1),
143
+ "warmup_steps": trial.suggest_int("warmup_steps", 500, 3000, step=500),
144
+ }
145
+
146
+
147
  SUGGEST_FNS = {
148
  "lora": suggest_lora,
149
  "bottleneck": suggest_bottleneck,
 
151
  "sparse": suggest_sparse,
152
  "hybrid": suggest_hybrid,
153
  "tiny": suggest_tiny,
154
+ "architecture": suggest_architecture,
155
  "pretrain": suggest_pretrain,
156
  }
157
 
 
163
  "hybrid": "scripts/train_hybrid.py",
164
  "tiny": "scripts/train_tiny.py",
165
  "pretrain": "scripts/train.py",
166
+ "architecture": "scripts/train.py",
167
  }
168
 
169
 
 
234
  device: str = "cuda",
235
  output_base: str = "sweeps",
236
  epochs: int = 50,
237
+ n_gpus: int = 1,
238
  extra_args: list[str] | None = None,
239
  ):
240
  self.adapter_type = adapter_type
 
244
  self.output_base = Path(output_base) / adapter_type
245
  self.output_base.mkdir(parents=True, exist_ok=True)
246
  self.epochs = epochs
247
+ self.n_gpus = n_gpus
248
  self.extra_args = extra_args or []
249
  self.script = ADAPTER_SCRIPTS[adapter_type]
250
 
 
257
  # Build command
258
  cmd = [sys.executable, self.script]
259
 
260
+ # Fixed args — architecture and pretrain sweeps use train.py directly
261
+ if self.adapter_type not in ("pretrain", "architecture"):
262
  cmd.extend(["--checkpoint", self.checkpoint])
263
  cmd.extend(["--pgn", self.pgn])
264
  cmd.extend(["--device", self.device])
265
+
266
+ # output-dir for adapters, log-dir for pretraining
267
+ if self.adapter_type in ("pretrain", "architecture"):
268
+ cmd.extend(["--log-dir", str(trial_dir)])
269
+ cmd.extend(["--local-checkpoints"])
270
+ else:
271
+ cmd.extend(["--output-dir", str(trial_dir)])
272
+
273
+ if "epochs" not in params and "total_steps" not in params:
274
  cmd.extend(["--epochs", str(self.epochs)])
275
 
276
  # Suggested hyperparameters
 
279
  # Extra user-provided args
280
  cmd.extend(self.extra_args)
281
 
282
+ # GPU affinity: pin trial to GPU (trial.number % n_gpus)
283
+ env = os.environ.copy()
284
+ if self.n_gpus > 1:
285
+ gpu_id = trial.number % self.n_gpus
286
+ env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
287
+
288
  # Run training
289
  result = subprocess.run(
290
  cmd,
291
  capture_output=True,
292
  text=True,
293
+ env=env,
294
  )
295
 
296
  if result.returncode != 0:
scripts/sweep.py CHANGED
@@ -56,9 +56,13 @@ def main():
56
  p.add_argument("--n-trials", type=int, default=30,
57
  help="Number of trials to run")
58
  p.add_argument("--n-jobs", type=int, default=1,
59
- help="Parallel trials (careful with GPU memory)")
 
 
60
  p.add_argument("--epochs", type=int, default=30,
61
- help="Max epochs per trial")
 
 
62
  p.add_argument("--device", type=str, default="cuda")
63
  p.add_argument("--pruner", type=str, default="hyperband",
64
  choices=["hyperband", "median", "none"])
@@ -69,7 +73,7 @@ def main():
69
 
70
  args = p.parse_args()
71
 
72
- if args.adapter != "pretrain" and not args.pgn:
73
  p.error("--pgn is required for adapter sweeps")
74
 
75
  study_name = args.study_name or args.adapter
@@ -77,8 +81,11 @@ def main():
77
 
78
  print(f"=== PAWN Hyperparameter Sweep ===")
79
  print(f"Adapter: {args.adapter}")
80
- print(f"Trials: {args.n_trials} (parallel: {args.n_jobs})")
81
- print(f"Epochs/trial: {args.epochs}")
 
 
 
82
  print(f"Pruner: {args.pruner}")
83
  print(f"Storage: {db_path}")
84
  print(f"Dashboard: uv run optuna-dashboard {db_path}")
@@ -90,6 +97,11 @@ def main():
90
  pruner=args.pruner,
91
  )
92
 
 
 
 
 
 
93
  objective = AdapterObjective(
94
  adapter_type=args.adapter,
95
  checkpoint=args.checkpoint,
@@ -97,7 +109,8 @@ def main():
97
  device=args.device,
98
  output_base=args.output_dir,
99
  epochs=args.epochs,
100
- extra_args=args.extra_args,
 
101
  )
102
 
103
  study.optimize(
 
56
  p.add_argument("--n-trials", type=int, default=30,
57
  help="Number of trials to run")
58
  p.add_argument("--n-jobs", type=int, default=1,
59
+ help="Parallel trials (match --n-gpus for multi-GPU)")
60
+ p.add_argument("--n-gpus", type=int, default=1,
61
+ help="Number of GPUs. Trials are pinned to GPUs round-robin.")
62
  p.add_argument("--epochs", type=int, default=30,
63
+ help="Max epochs per trial (adapter sweeps)")
64
+ p.add_argument("--total-steps", type=int, default=20000,
65
+ help="Total steps per trial (architecture/pretrain sweeps)")
66
  p.add_argument("--device", type=str, default="cuda")
67
  p.add_argument("--pruner", type=str, default="hyperband",
68
  choices=["hyperband", "median", "none"])
 
73
 
74
  args = p.parse_args()
75
 
76
+ if args.adapter not in ("pretrain", "architecture") and not args.pgn:
77
  p.error("--pgn is required for adapter sweeps")
78
 
79
  study_name = args.study_name or args.adapter
 
81
 
82
  print(f"=== PAWN Hyperparameter Sweep ===")
83
  print(f"Adapter: {args.adapter}")
84
+ print(f"Trials: {args.n_trials} (parallel: {args.n_jobs}, GPUs: {args.n_gpus})")
85
+ if args.adapter in ("pretrain", "architecture"):
86
+ print(f"Steps/trial: {args.total_steps}")
87
+ else:
88
+ print(f"Epochs/trial: {args.epochs}")
89
  print(f"Pruner: {args.pruner}")
90
  print(f"Storage: {db_path}")
91
  print(f"Dashboard: uv run optuna-dashboard {db_path}")
 
97
  pruner=args.pruner,
98
  )
99
 
100
+ # For architecture/pretrain sweeps, pass --total-steps via extra args
101
+ extra = list(args.extra_args)
102
+ if args.adapter in ("pretrain", "architecture"):
103
+ extra.extend(["--total-steps", str(args.total_steps)])
104
+
105
  objective = AdapterObjective(
106
  adapter_type=args.adapter,
107
  checkpoint=args.checkpoint,
 
109
  device=args.device,
110
  output_base=args.output_dir,
111
  epochs=args.epochs,
112
+ n_gpus=args.n_gpus,
113
+ extra_args=extra,
114
  )
115
 
116
  study.optimize(
scripts/train.py CHANGED
@@ -31,6 +31,15 @@ def parse_args():
31
  parser.add_argument("--discard-ply-limit", action="store_true",
32
  help="Only train on games that ended naturally (no ply limit truncation)")
33
 
 
 
 
 
 
 
 
 
 
34
  ckpt_group = parser.add_mutually_exclusive_group(required=True)
35
  ckpt_group.add_argument("--hf-repo", type=str, default=None,
36
  help="Push checkpoints to this HuggingFace repo (requires HF_TOKEN)")
@@ -76,6 +85,22 @@ def main():
76
  if args.discard_ply_limit:
77
  train_cfg.discard_ply_limit = True
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  print(f"Model config: {model_cfg}")
80
  print(f"Training config: {train_cfg}")
81
 
 
31
  parser.add_argument("--discard-ply-limit", action="store_true",
32
  help="Only train on games that ended naturally (no ply limit truncation)")
33
 
34
+ # Architecture overrides (for sweeps — override the named variant)
35
+ parser.add_argument("--d-model", type=int, default=None, help="Override d_model")
36
+ parser.add_argument("--n-layers", type=int, default=None, help="Override n_layers")
37
+ parser.add_argument("--n-heads", type=int, default=None, help="Override n_heads")
38
+ parser.add_argument("--d-ff", type=int, default=None, help="Override d_ff")
39
+ parser.add_argument("--lr", type=float, default=None, help="Override learning rate")
40
+ parser.add_argument("--weight-decay", type=float, default=None, help="Override weight decay")
41
+ parser.add_argument("--warmup-steps", type=int, default=None, help="Override warmup steps")
42
+
43
  ckpt_group = parser.add_mutually_exclusive_group(required=True)
44
  ckpt_group.add_argument("--hf-repo", type=str, default=None,
45
  help="Push checkpoints to this HuggingFace repo (requires HF_TOKEN)")
 
85
  if args.discard_ply_limit:
86
  train_cfg.discard_ply_limit = True
87
 
88
+ # Architecture overrides
89
+ if args.d_model is not None:
90
+ model_cfg.d_model = args.d_model
91
+ if args.n_layers is not None:
92
+ model_cfg.n_layers = args.n_layers
93
+ if args.n_heads is not None:
94
+ model_cfg.n_heads = args.n_heads
95
+ if args.d_ff is not None:
96
+ model_cfg.d_ff = args.d_ff
97
+ if args.lr is not None:
98
+ train_cfg.lr = args.lr
99
+ if args.weight_decay is not None:
100
+ train_cfg.weight_decay = args.weight_decay
101
+ if args.warmup_steps is not None:
102
+ train_cfg.warmup_steps = args.warmup_steps
103
+
104
  print(f"Model config: {model_cfg}")
105
  print(f"Training config: {train_cfg}")
106