Nekochu committed on
Commit
c2cb0b9
·
1 Parent(s): c37e80e

Stop ace-server during training to free RAM, restart it afterwards, and mirror training log lines to stdout for visibility

Browse files
Files changed (1) hide show
  1. app.py +29 -11
app.py CHANGED
@@ -316,10 +316,20 @@ def gradio_main():
316
  "",
317
  ]
318
 
 
 
 
 
319
  try:
 
 
 
 
 
 
320
  ckpt_files = os.listdir(ACE_CHECKPOINT_DIR) if os.path.isdir(ACE_CHECKPOINT_DIR) else []
321
  if len(ckpt_files) < 3:
322
- log_lines.append("[Step 0] Downloading model checkpoints...")
323
  progress(0.02, desc="Downloading checkpoints...")
324
  from huggingface_hub import snapshot_download
325
  snapshot_download(
@@ -327,7 +337,7 @@ def gradio_main():
327
  local_dir=ACE_CHECKPOINT_DIR,
328
  ignore_patterns=["*.md", "*.txt", ".gitattributes"],
329
  )
330
- log_lines.append(" Checkpoints downloaded.")
331
 
332
  if ACE_SOURCE_DIR not in sys.path:
333
  sys.path.insert(0, ACE_SOURCE_DIR)
@@ -339,7 +349,7 @@ def gradio_main():
339
  return _orig_load(filepath, *args, **kwargs)
340
  torchaudio.load = _load_soundfile
341
 
342
- log_lines.append("[Step 1/2] Preprocessing audio files...")
343
  progress(0.10, desc="Preprocessing audio...")
344
 
345
  tensor_dir = os.path.join(output_dir, "preprocessed_tensors")
@@ -359,13 +369,13 @@ def gradio_main():
359
  processed = result.get("processed", 0)
360
  total_files = result.get("total", 0)
361
  failed = result.get("failed", 0)
362
- log_lines.append(f" Preprocessed: {processed}/{total_files} (failed: {failed})")
363
 
364
  if processed == 0:
365
- log_lines.append("ERROR: No files preprocessed successfully.")
366
  return "\n".join(log_lines)
367
 
368
- log_lines.append("[Step 2/2] Training LoRA adapter (CPU, this will be slow)...")
369
  progress(0.30, desc="Loading model for training...")
370
 
371
  from acestep.training_v2.model_loader import load_decoder_for_training
@@ -412,21 +422,29 @@ def gradio_main():
412
  pct = 0.30 + 0.65 * min(step_count / max(epochs * processed, 1), 1.0)
413
  progress(pct, desc=f"Step {step_count}, loss={last_loss:.4f}")
414
 
415
- log_lines.append(f"Training complete! Final: step {step_count}, loss={last_loss:.4f}")
416
- log_lines.append(f"LoRA saved to: {output_dir}")
417
 
418
  del model, trainer
419
  gc.collect()
420
 
421
  except ImportError as e:
422
- log_lines.append(f"Import error: {e}")
423
- log_lines.append(f"Check ACE-Step source at {ACE_SOURCE_DIR}")
424
  import traceback
425
  log_lines.append(traceback.format_exc())
426
  except Exception as e:
427
  import traceback
428
- log_lines.append(f"ERROR: {e}")
429
  log_lines.append(traceback.format_exc())
 
 
 
 
 
 
 
 
430
 
431
  return "\n".join(log_lines)
432
 
 
316
  "",
317
  ]
318
 
319
+ def _log(msg):
320
+ log_lines.append(msg)
321
+ print(f"[train] {msg}", flush=True)
322
+
323
  try:
324
+ import subprocess, signal
325
+ _log("Stopping ace-server to free RAM for training...")
326
+ subprocess.run(["pkill", "-f", "ace-server"], stderr=subprocess.DEVNULL)
327
+ time.sleep(2)
328
+ gc.collect()
329
+
330
  ckpt_files = os.listdir(ACE_CHECKPOINT_DIR) if os.path.isdir(ACE_CHECKPOINT_DIR) else []
331
  if len(ckpt_files) < 3:
332
+ _log("[Step 0] Downloading model checkpoints...")
333
  progress(0.02, desc="Downloading checkpoints...")
334
  from huggingface_hub import snapshot_download
335
  snapshot_download(
 
337
  local_dir=ACE_CHECKPOINT_DIR,
338
  ignore_patterns=["*.md", "*.txt", ".gitattributes"],
339
  )
340
+ _log(" Checkpoints downloaded.")
341
 
342
  if ACE_SOURCE_DIR not in sys.path:
343
  sys.path.insert(0, ACE_SOURCE_DIR)
 
349
  return _orig_load(filepath, *args, **kwargs)
350
  torchaudio.load = _load_soundfile
351
 
352
+ _log("[Step 1/2] Preprocessing audio files...")
353
  progress(0.10, desc="Preprocessing audio...")
354
 
355
  tensor_dir = os.path.join(output_dir, "preprocessed_tensors")
 
369
  processed = result.get("processed", 0)
370
  total_files = result.get("total", 0)
371
  failed = result.get("failed", 0)
372
+ _log(f" Preprocessed: {processed}/{total_files} (failed: {failed})")
373
 
374
  if processed == 0:
375
+ _log("ERROR: No files preprocessed successfully.")
376
  return "\n".join(log_lines)
377
 
378
+ _log("[Step 2/2] Training LoRA adapter (CPU, this will be slow)...")
379
  progress(0.30, desc="Loading model for training...")
380
 
381
  from acestep.training_v2.model_loader import load_decoder_for_training
 
422
  pct = 0.30 + 0.65 * min(step_count / max(epochs * processed, 1), 1.0)
423
  progress(pct, desc=f"Step {step_count}, loss={last_loss:.4f}")
424
 
425
+ _log(f"Training complete! Final: step {step_count}, loss={last_loss:.4f}")
426
+ _log(f"LoRA saved to: {output_dir}")
427
 
428
  del model, trainer
429
  gc.collect()
430
 
431
  except ImportError as e:
432
+ _log(f"Import error: {e}")
433
+ _log(f"Check ACE-Step source at {ACE_SOURCE_DIR}")
434
  import traceback
435
  log_lines.append(traceback.format_exc())
436
  except Exception as e:
437
  import traceback
438
+ _log(f"ERROR: {e}")
439
  log_lines.append(traceback.format_exc())
440
+ finally:
441
+ _log("Restarting ace-server...")
442
+ import subprocess
443
+ subprocess.Popen([
444
+ "/app/ace-server", "--host", "127.0.0.1", "--port", "8085",
445
+ "--models", "/app/models", "--adapters", "/app/adapters",
446
+ "--max-batch", "1",
447
+ ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
448
 
449
  return "\n".join(log_lines)
450