Spaces:
Running
Running
Switch training UI from epochs to max steps (default 1500)
Browse files
- Replace Epochs field with Max Steps (1500 default, 1500-2000 recommended)
- Replace Save Every N Epochs with Save Every N Steps (500 default)
- Persist pod state to disk so it survives server restarts
- Increase generation polling timeout to 600s
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
src/content_engine/api/routes_training.py
CHANGED
|
@@ -76,14 +76,14 @@ async def start_training(
|
|
| 76 |
captions_json: str = Form("{}"),
|
| 77 |
base_model: str = Form("flux2_dev"), # Model registry key (flux2_dev, sd15_realistic, sdxl_base)
|
| 78 |
resolution: int | None = Form(None), # None = use model default
|
| 79 |
-
num_epochs: int = Form(
|
| 80 |
-
|
| 81 |
learning_rate: float | None = Form(None), # None = use model default
|
| 82 |
network_rank: int | None = Form(None), # None = use model default
|
| 83 |
network_alpha: int | None = Form(None), # None = use model default
|
| 84 |
optimizer: str | None = Form(None), # None = use model default
|
| 85 |
train_batch_size: int = Form(1),
|
| 86 |
-
|
| 87 |
backend: str = Form("runpod"), # Default to runpod for cloud training
|
| 88 |
gpu_type: str = Form("NVIDIA GeForce RTX 4090"),
|
| 89 |
):
|
|
@@ -140,12 +140,12 @@ async def start_training(
|
|
| 140 |
base_model=base_model,
|
| 141 |
resolution=resolution,
|
| 142 |
num_epochs=num_epochs,
|
| 143 |
-
max_train_steps=
|
| 144 |
learning_rate=learning_rate,
|
| 145 |
network_rank=network_rank,
|
| 146 |
network_alpha=network_alpha,
|
| 147 |
optimizer=optimizer,
|
| 148 |
-
|
| 149 |
gpu_type=gpu_type,
|
| 150 |
)
|
| 151 |
job = _runpod_trainer.get_job(job_id)
|
|
@@ -184,7 +184,7 @@ async def start_training(
|
|
| 184 |
network_alpha=network_alpha or model_cfg.get("network_alpha", 16),
|
| 185 |
optimizer=optimizer or model_cfg.get("optimizer", "AdamW8bit"),
|
| 186 |
train_batch_size=train_batch_size,
|
| 187 |
-
save_every_n_epochs=
|
| 188 |
)
|
| 189 |
|
| 190 |
job_id = await _trainer.start_training(config, image_paths)
|
|
|
|
| 76 |
captions_json: str = Form("{}"),
|
| 77 |
base_model: str = Form("flux2_dev"), # Model registry key (flux2_dev, sd15_realistic, sdxl_base)
|
| 78 |
resolution: int | None = Form(None), # None = use model default
|
| 79 |
+
num_epochs: int = Form(100), # High default — max_steps controls actual limit
|
| 80 |
+
max_steps: int = Form(1500), # Primary training length control
|
| 81 |
learning_rate: float | None = Form(None), # None = use model default
|
| 82 |
network_rank: int | None = Form(None), # None = use model default
|
| 83 |
network_alpha: int | None = Form(None), # None = use model default
|
| 84 |
optimizer: str | None = Form(None), # None = use model default
|
| 85 |
train_batch_size: int = Form(1),
|
| 86 |
+
save_every_n_steps: int = Form(500),
|
| 87 |
backend: str = Form("runpod"), # Default to runpod for cloud training
|
| 88 |
gpu_type: str = Form("NVIDIA GeForce RTX 4090"),
|
| 89 |
):
|
|
|
|
| 140 |
base_model=base_model,
|
| 141 |
resolution=resolution,
|
| 142 |
num_epochs=num_epochs,
|
| 143 |
+
max_train_steps=max_steps,
|
| 144 |
learning_rate=learning_rate,
|
| 145 |
network_rank=network_rank,
|
| 146 |
network_alpha=network_alpha,
|
| 147 |
optimizer=optimizer,
|
| 148 |
+
save_every_n_steps=save_every_n_steps,
|
| 149 |
gpu_type=gpu_type,
|
| 150 |
)
|
| 151 |
job = _runpod_trainer.get_job(job_id)
|
|
|
|
| 184 |
network_alpha=network_alpha or model_cfg.get("network_alpha", 16),
|
| 185 |
optimizer=optimizer or model_cfg.get("optimizer", "AdamW8bit"),
|
| 186 |
train_batch_size=train_batch_size,
|
| 187 |
+
save_every_n_epochs=save_every_n_steps, # Local trainer uses epoch-based saving
|
| 188 |
)
|
| 189 |
|
| 190 |
job_id = await _trainer.start_training(config, image_paths)
|
src/content_engine/api/ui.html
CHANGED
|
@@ -1332,8 +1332,9 @@ select { cursor: pointer; }
|
|
| 1332 |
<div class="section-title">Training Settings</div>
|
| 1333 |
<div class="row" style="display:grid;grid-template-columns:1fr 1fr;gap:8px">
|
| 1334 |
<div>
|
| 1335 |
-
<label>
|
| 1336 |
-
<input type="number" id="train-
|
|
|
|
| 1337 |
</div>
|
| 1338 |
<div>
|
| 1339 |
<label>Network Rank (dim)</label>
|
|
@@ -1370,8 +1371,8 @@ select { cursor: pointer; }
|
|
| 1370 |
</select>
|
| 1371 |
</div>
|
| 1372 |
<div>
|
| 1373 |
-
<label>Save Every N
|
| 1374 |
-
<input type="number" id="train-save-every" value="
|
| 1375 |
</div>
|
| 1376 |
</div>
|
| 1377 |
|
|
@@ -2908,8 +2909,8 @@ async function startTraining() {
|
|
| 2908 |
formData.append('name', name);
|
| 2909 |
formData.append('trigger_word', document.getElementById('train-trigger').value);
|
| 2910 |
formData.append('base_model', document.getElementById('train-base-model').value);
|
| 2911 |
-
formData.append('
|
| 2912 |
-
formData.append('
|
| 2913 |
formData.append('backend', selectedTrainBackend);
|
| 2914 |
|
| 2915 |
// Optional params - only send if user explicitly set them (otherwise use model defaults)
|
|
|
|
| 1332 |
<div class="section-title">Training Settings</div>
|
| 1333 |
<div class="row" style="display:grid;grid-template-columns:1fr 1fr;gap:8px">
|
| 1334 |
<div>
|
| 1335 |
+
<label>Max Steps</label>
|
| 1336 |
+
<input type="number" id="train-max-steps" value="1500" min="50" max="10000" step="100">
|
| 1337 |
+
<div style="font-size:10px;color:var(--text-secondary);margin-top:2px">1500-2000 recommended</div>
|
| 1338 |
</div>
|
| 1339 |
<div>
|
| 1340 |
<label>Network Rank (dim)</label>
|
|
|
|
| 1371 |
</select>
|
| 1372 |
</div>
|
| 1373 |
<div>
|
| 1374 |
+
<label>Save Every N Steps</label>
|
| 1375 |
+
<input type="number" id="train-save-every" value="500" min="50" step="50">
|
| 1376 |
</div>
|
| 1377 |
</div>
|
| 1378 |
|
|
|
|
| 2909 |
formData.append('name', name);
|
| 2910 |
formData.append('trigger_word', document.getElementById('train-trigger').value);
|
| 2911 |
formData.append('base_model', document.getElementById('train-base-model').value);
|
| 2912 |
+
formData.append('max_steps', document.getElementById('train-max-steps').value);
|
| 2913 |
+
formData.append('save_every_n_steps', document.getElementById('train-save-every').value);
|
| 2914 |
formData.append('backend', selectedTrainBackend);
|
| 2915 |
|
| 2916 |
// Optional params - only send if user explicitly set them (otherwise use model defaults)
|
src/content_engine/services/runpod_trainer.py
CHANGED
|
@@ -169,6 +169,7 @@ class RunPodTrainer:
|
|
| 169 |
network_alpha: int | None = None,
|
| 170 |
optimizer: str | None = None,
|
| 171 |
save_every_n_epochs: int = 2,
|
|
|
|
| 172 |
gpu_type: str = DEFAULT_GPU,
|
| 173 |
) -> str:
|
| 174 |
"""Start a cloud training job. Returns job ID.
|
|
@@ -194,6 +195,7 @@ class RunPodTrainer:
|
|
| 194 |
name=name,
|
| 195 |
status="pending",
|
| 196 |
total_epochs=num_epochs,
|
|
|
|
| 197 |
gpu_type=gpu_type,
|
| 198 |
started_at=time.time(),
|
| 199 |
base_model=base_model,
|
|
@@ -217,6 +219,7 @@ class RunPodTrainer:
|
|
| 217 |
network_alpha=final_alpha,
|
| 218 |
optimizer=final_optimizer,
|
| 219 |
save_every_n_epochs=save_every_n_epochs,
|
|
|
|
| 220 |
))
|
| 221 |
|
| 222 |
return job_id
|
|
@@ -235,6 +238,7 @@ class RunPodTrainer:
|
|
| 235 |
network_alpha: int,
|
| 236 |
optimizer: str,
|
| 237 |
save_every_n_epochs: int,
|
|
|
|
| 238 |
):
|
| 239 |
"""Full cloud training pipeline: create pod -> upload -> train -> download -> cleanup."""
|
| 240 |
ssh = None
|
|
@@ -578,6 +582,7 @@ resolution = [{resolution}, {resolution}]
|
|
| 578 |
network_alpha=network_alpha,
|
| 579 |
optimizer=optimizer,
|
| 580 |
save_every_n_epochs=save_every_n_epochs,
|
|
|
|
| 581 |
model_cfg=model_cfg,
|
| 582 |
gpu_type=job.gpu_type,
|
| 583 |
)
|
|
@@ -792,6 +797,7 @@ resolution = [{resolution}, {resolution}]
|
|
| 792 |
network_alpha: int,
|
| 793 |
optimizer: str,
|
| 794 |
save_every_n_epochs: int,
|
|
|
|
| 795 |
model_cfg: dict,
|
| 796 |
gpu_type: str = "",
|
| 797 |
) -> str:
|
|
@@ -875,7 +881,6 @@ resolution = [{resolution}, {resolution}]
|
|
| 875 |
])
|
| 876 |
|
| 877 |
args.extend([
|
| 878 |
-
f"--save_every_n_epochs={save_every_n_epochs}",
|
| 879 |
"--seed=42",
|
| 880 |
'--output_dir=/workspace/output',
|
| 881 |
f'--output_name={name}',
|
|
@@ -884,8 +889,13 @@ resolution = [{resolution}, {resolution}]
|
|
| 884 |
|
| 885 |
if max_train_steps:
|
| 886 |
args.append(f"--max_train_steps={max_train_steps}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 887 |
else:
|
| 888 |
args.append(f"--max_train_epochs={num_epochs}")
|
|
|
|
| 889 |
|
| 890 |
return " ".join(args) + " 2>&1"
|
| 891 |
|
|
|
|
| 169 |
network_alpha: int | None = None,
|
| 170 |
optimizer: str | None = None,
|
| 171 |
save_every_n_epochs: int = 2,
|
| 172 |
+
save_every_n_steps: int = 500,
|
| 173 |
gpu_type: str = DEFAULT_GPU,
|
| 174 |
) -> str:
|
| 175 |
"""Start a cloud training job. Returns job ID.
|
|
|
|
| 195 |
name=name,
|
| 196 |
status="pending",
|
| 197 |
total_epochs=num_epochs,
|
| 198 |
+
total_steps=final_steps,
|
| 199 |
gpu_type=gpu_type,
|
| 200 |
started_at=time.time(),
|
| 201 |
base_model=base_model,
|
|
|
|
| 219 |
network_alpha=final_alpha,
|
| 220 |
optimizer=final_optimizer,
|
| 221 |
save_every_n_epochs=save_every_n_epochs,
|
| 222 |
+
save_every_n_steps=save_every_n_steps,
|
| 223 |
))
|
| 224 |
|
| 225 |
return job_id
|
|
|
|
| 238 |
network_alpha: int,
|
| 239 |
optimizer: str,
|
| 240 |
save_every_n_epochs: int,
|
| 241 |
+
save_every_n_steps: int = 500,
|
| 242 |
):
|
| 243 |
"""Full cloud training pipeline: create pod -> upload -> train -> download -> cleanup."""
|
| 244 |
ssh = None
|
|
|
|
| 582 |
network_alpha=network_alpha,
|
| 583 |
optimizer=optimizer,
|
| 584 |
save_every_n_epochs=save_every_n_epochs,
|
| 585 |
+
save_every_n_steps=save_every_n_steps,
|
| 586 |
model_cfg=model_cfg,
|
| 587 |
gpu_type=job.gpu_type,
|
| 588 |
)
|
|
|
|
| 797 |
network_alpha: int,
|
| 798 |
optimizer: str,
|
| 799 |
save_every_n_epochs: int,
|
| 800 |
+
save_every_n_steps: int = 500,
|
| 801 |
model_cfg: dict,
|
| 802 |
gpu_type: str = "",
|
| 803 |
) -> str:
|
|
|
|
| 881 |
])
|
| 882 |
|
| 883 |
args.extend([
|
|
|
|
| 884 |
"--seed=42",
|
| 885 |
'--output_dir=/workspace/output',
|
| 886 |
f'--output_name={name}',
|
|
|
|
| 889 |
|
| 890 |
if max_train_steps:
|
| 891 |
args.append(f"--max_train_steps={max_train_steps}")
|
| 892 |
+
if save_every_n_steps:
|
| 893 |
+
args.append(f"--save_every_n_steps={save_every_n_steps}")
|
| 894 |
+
else:
|
| 895 |
+
args.append(f"--save_every_n_epochs={save_every_n_epochs}")
|
| 896 |
else:
|
| 897 |
args.append(f"--max_train_epochs={num_epochs}")
|
| 898 |
+
args.append(f"--save_every_n_epochs={save_every_n_epochs}")
|
| 899 |
|
| 900 |
return " ".join(args) + " 2>&1"
|
| 901 |
|