dippoo Claude Opus 4.6 committed on
Commit
01a9c08
·
1 Parent(s): 27fea48

Switch training UI from epochs to max steps (default 1500)

Browse files

- Replace Epochs field with Max Steps (1500 default, 1500-2000 recommended)
- Replace Save Every N Epochs with Save Every N Steps (500 default)
- Persist pod state to disk so it survives server restarts
- Increase generation polling timeout to 600s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

src/content_engine/api/routes_training.py CHANGED
@@ -76,14 +76,14 @@ async def start_training(
76
  captions_json: str = Form("{}"),
77
  base_model: str = Form("flux2_dev"), # Model registry key (flux2_dev, sd15_realistic, sdxl_base)
78
  resolution: int | None = Form(None), # None = use model default
79
- num_epochs: int = Form(10),
80
- max_train_steps: int | None = Form(None), # If set, overrides epochs
81
  learning_rate: float | None = Form(None), # None = use model default
82
  network_rank: int | None = Form(None), # None = use model default
83
  network_alpha: int | None = Form(None), # None = use model default
84
  optimizer: str | None = Form(None), # None = use model default
85
  train_batch_size: int = Form(1),
86
- save_every_n_epochs: int = Form(2),
87
  backend: str = Form("runpod"), # Default to runpod for cloud training
88
  gpu_type: str = Form("NVIDIA GeForce RTX 4090"),
89
  ):
@@ -140,12 +140,12 @@ async def start_training(
140
  base_model=base_model,
141
  resolution=resolution,
142
  num_epochs=num_epochs,
143
- max_train_steps=max_train_steps,
144
  learning_rate=learning_rate,
145
  network_rank=network_rank,
146
  network_alpha=network_alpha,
147
  optimizer=optimizer,
148
- save_every_n_epochs=save_every_n_epochs,
149
  gpu_type=gpu_type,
150
  )
151
  job = _runpod_trainer.get_job(job_id)
@@ -184,7 +184,7 @@ async def start_training(
184
  network_alpha=network_alpha or model_cfg.get("network_alpha", 16),
185
  optimizer=optimizer or model_cfg.get("optimizer", "AdamW8bit"),
186
  train_batch_size=train_batch_size,
187
- save_every_n_epochs=save_every_n_epochs,
188
  )
189
 
190
  job_id = await _trainer.start_training(config, image_paths)
 
76
  captions_json: str = Form("{}"),
77
  base_model: str = Form("flux2_dev"), # Model registry key (flux2_dev, sd15_realistic, sdxl_base)
78
  resolution: int | None = Form(None), # None = use model default
79
+ num_epochs: int = Form(100), # High default — max_steps controls actual limit
80
+ max_steps: int = Form(1500), # Primary training length control
81
  learning_rate: float | None = Form(None), # None = use model default
82
  network_rank: int | None = Form(None), # None = use model default
83
  network_alpha: int | None = Form(None), # None = use model default
84
  optimizer: str | None = Form(None), # None = use model default
85
  train_batch_size: int = Form(1),
86
+ save_every_n_steps: int = Form(500),
87
  backend: str = Form("runpod"), # Default to runpod for cloud training
88
  gpu_type: str = Form("NVIDIA GeForce RTX 4090"),
89
  ):
 
140
  base_model=base_model,
141
  resolution=resolution,
142
  num_epochs=num_epochs,
143
+ max_train_steps=max_steps,
144
  learning_rate=learning_rate,
145
  network_rank=network_rank,
146
  network_alpha=network_alpha,
147
  optimizer=optimizer,
148
+ save_every_n_steps=save_every_n_steps,
149
  gpu_type=gpu_type,
150
  )
151
  job = _runpod_trainer.get_job(job_id)
 
184
  network_alpha=network_alpha or model_cfg.get("network_alpha", 16),
185
  optimizer=optimizer or model_cfg.get("optimizer", "AdamW8bit"),
186
  train_batch_size=train_batch_size,
187
+ save_every_n_epochs=save_every_n_steps, # Local trainer uses epoch-based saving
188
  )
189
 
190
  job_id = await _trainer.start_training(config, image_paths)
src/content_engine/api/ui.html CHANGED
@@ -1332,8 +1332,9 @@ select { cursor: pointer; }
1332
  <div class="section-title">Training Settings</div>
1333
  <div class="row" style="display:grid;grid-template-columns:1fr 1fr;gap:8px">
1334
  <div>
1335
- <label>Epochs</label>
1336
- <input type="number" id="train-epochs" value="10" min="1" max="100">
 
1337
  </div>
1338
  <div>
1339
  <label>Network Rank (dim)</label>
@@ -1370,8 +1371,8 @@ select { cursor: pointer; }
1370
  </select>
1371
  </div>
1372
  <div>
1373
- <label>Save Every N Epochs</label>
1374
- <input type="number" id="train-save-every" value="2" min="1">
1375
  </div>
1376
  </div>
1377
 
@@ -2908,8 +2909,8 @@ async function startTraining() {
2908
  formData.append('name', name);
2909
  formData.append('trigger_word', document.getElementById('train-trigger').value);
2910
  formData.append('base_model', document.getElementById('train-base-model').value);
2911
- formData.append('num_epochs', document.getElementById('train-epochs').value);
2912
- formData.append('save_every_n_epochs', document.getElementById('train-save-every').value);
2913
  formData.append('backend', selectedTrainBackend);
2914
 
2915
  // Optional params - only send if user explicitly set them (otherwise use model defaults)
 
1332
  <div class="section-title">Training Settings</div>
1333
  <div class="row" style="display:grid;grid-template-columns:1fr 1fr;gap:8px">
1334
  <div>
1335
+ <label>Max Steps</label>
1336
+ <input type="number" id="train-max-steps" value="1500" min="50" max="10000" step="100">
1337
+ <div style="font-size:10px;color:var(--text-secondary);margin-top:2px">1500-2000 recommended</div>
1338
  </div>
1339
  <div>
1340
  <label>Network Rank (dim)</label>
 
1371
  </select>
1372
  </div>
1373
  <div>
1374
+ <label>Save Every N Steps</label>
1375
+ <input type="number" id="train-save-every" value="500" min="50" step="50">
1376
  </div>
1377
  </div>
1378
 
 
2909
  formData.append('name', name);
2910
  formData.append('trigger_word', document.getElementById('train-trigger').value);
2911
  formData.append('base_model', document.getElementById('train-base-model').value);
2912
+ formData.append('max_steps', document.getElementById('train-max-steps').value);
2913
+ formData.append('save_every_n_steps', document.getElementById('train-save-every').value);
2914
  formData.append('backend', selectedTrainBackend);
2915
 
2916
  // Optional params - only send if user explicitly set them (otherwise use model defaults)
src/content_engine/services/runpod_trainer.py CHANGED
@@ -169,6 +169,7 @@ class RunPodTrainer:
169
  network_alpha: int | None = None,
170
  optimizer: str | None = None,
171
  save_every_n_epochs: int = 2,
 
172
  gpu_type: str = DEFAULT_GPU,
173
  ) -> str:
174
  """Start a cloud training job. Returns job ID.
@@ -194,6 +195,7 @@ class RunPodTrainer:
194
  name=name,
195
  status="pending",
196
  total_epochs=num_epochs,
 
197
  gpu_type=gpu_type,
198
  started_at=time.time(),
199
  base_model=base_model,
@@ -217,6 +219,7 @@ class RunPodTrainer:
217
  network_alpha=final_alpha,
218
  optimizer=final_optimizer,
219
  save_every_n_epochs=save_every_n_epochs,
 
220
  ))
221
 
222
  return job_id
@@ -235,6 +238,7 @@ class RunPodTrainer:
235
  network_alpha: int,
236
  optimizer: str,
237
  save_every_n_epochs: int,
 
238
  ):
239
  """Full cloud training pipeline: create pod -> upload -> train -> download -> cleanup."""
240
  ssh = None
@@ -578,6 +582,7 @@ resolution = [{resolution}, {resolution}]
578
  network_alpha=network_alpha,
579
  optimizer=optimizer,
580
  save_every_n_epochs=save_every_n_epochs,
 
581
  model_cfg=model_cfg,
582
  gpu_type=job.gpu_type,
583
  )
@@ -792,6 +797,7 @@ resolution = [{resolution}, {resolution}]
792
  network_alpha: int,
793
  optimizer: str,
794
  save_every_n_epochs: int,
 
795
  model_cfg: dict,
796
  gpu_type: str = "",
797
  ) -> str:
@@ -875,7 +881,6 @@ resolution = [{resolution}, {resolution}]
875
  ])
876
 
877
  args.extend([
878
- f"--save_every_n_epochs={save_every_n_epochs}",
879
  "--seed=42",
880
  '--output_dir=/workspace/output',
881
  f'--output_name={name}',
@@ -884,8 +889,13 @@ resolution = [{resolution}, {resolution}]
884
 
885
  if max_train_steps:
886
  args.append(f"--max_train_steps={max_train_steps}")
 
 
 
 
887
  else:
888
  args.append(f"--max_train_epochs={num_epochs}")
 
889
 
890
  return " ".join(args) + " 2>&1"
891
 
 
169
  network_alpha: int | None = None,
170
  optimizer: str | None = None,
171
  save_every_n_epochs: int = 2,
172
+ save_every_n_steps: int = 500,
173
  gpu_type: str = DEFAULT_GPU,
174
  ) -> str:
175
  """Start a cloud training job. Returns job ID.
 
195
  name=name,
196
  status="pending",
197
  total_epochs=num_epochs,
198
+ total_steps=final_steps,
199
  gpu_type=gpu_type,
200
  started_at=time.time(),
201
  base_model=base_model,
 
219
  network_alpha=final_alpha,
220
  optimizer=final_optimizer,
221
  save_every_n_epochs=save_every_n_epochs,
222
+ save_every_n_steps=save_every_n_steps,
223
  ))
224
 
225
  return job_id
 
238
  network_alpha: int,
239
  optimizer: str,
240
  save_every_n_epochs: int,
241
+ save_every_n_steps: int = 500,
242
  ):
243
  """Full cloud training pipeline: create pod -> upload -> train -> download -> cleanup."""
244
  ssh = None
 
582
  network_alpha=network_alpha,
583
  optimizer=optimizer,
584
  save_every_n_epochs=save_every_n_epochs,
585
+ save_every_n_steps=save_every_n_steps,
586
  model_cfg=model_cfg,
587
  gpu_type=job.gpu_type,
588
  )
 
797
  network_alpha: int,
798
  optimizer: str,
799
  save_every_n_epochs: int,
800
+ save_every_n_steps: int = 500,
801
  model_cfg: dict,
802
  gpu_type: str = "",
803
  ) -> str:
 
881
  ])
882
 
883
  args.extend([
 
884
  "--seed=42",
885
  '--output_dir=/workspace/output',
886
  f'--output_name={name}',
 
889
 
890
  if max_train_steps:
891
  args.append(f"--max_train_steps={max_train_steps}")
892
+ if save_every_n_steps:
893
+ args.append(f"--save_every_n_steps={save_every_n_steps}")
894
+ else:
895
+ args.append(f"--save_every_n_epochs={save_every_n_epochs}")
896
  else:
897
  args.append(f"--max_train_epochs={num_epochs}")
898
+ args.append(f"--save_every_n_epochs={save_every_n_epochs}")
899
 
900
  return " ".join(args) + " 2>&1"
901