Spaces:
Running
Running
stop ace-server during training to free RAM, restart after, add log visibility
Browse files
app.py
CHANGED
|
@@ -316,10 +316,20 @@ def gradio_main():
|
|
| 316 |
"",
|
| 317 |
]
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
ckpt_files = os.listdir(ACE_CHECKPOINT_DIR) if os.path.isdir(ACE_CHECKPOINT_DIR) else []
|
| 321 |
if len(ckpt_files) < 3:
|
| 322 |
-
|
| 323 |
progress(0.02, desc="Downloading checkpoints...")
|
| 324 |
from huggingface_hub import snapshot_download
|
| 325 |
snapshot_download(
|
|
@@ -327,7 +337,7 @@ def gradio_main():
|
|
| 327 |
local_dir=ACE_CHECKPOINT_DIR,
|
| 328 |
ignore_patterns=["*.md", "*.txt", ".gitattributes"],
|
| 329 |
)
|
| 330 |
-
|
| 331 |
|
| 332 |
if ACE_SOURCE_DIR not in sys.path:
|
| 333 |
sys.path.insert(0, ACE_SOURCE_DIR)
|
|
@@ -339,7 +349,7 @@ def gradio_main():
|
|
| 339 |
return _orig_load(filepath, *args, **kwargs)
|
| 340 |
torchaudio.load = _load_soundfile
|
| 341 |
|
| 342 |
-
|
| 343 |
progress(0.10, desc="Preprocessing audio...")
|
| 344 |
|
| 345 |
tensor_dir = os.path.join(output_dir, "preprocessed_tensors")
|
|
@@ -359,13 +369,13 @@ def gradio_main():
|
|
| 359 |
processed = result.get("processed", 0)
|
| 360 |
total_files = result.get("total", 0)
|
| 361 |
failed = result.get("failed", 0)
|
| 362 |
-
|
| 363 |
|
| 364 |
if processed == 0:
|
| 365 |
-
|
| 366 |
return "\n".join(log_lines)
|
| 367 |
|
| 368 |
-
|
| 369 |
progress(0.30, desc="Loading model for training...")
|
| 370 |
|
| 371 |
from acestep.training_v2.model_loader import load_decoder_for_training
|
|
@@ -412,21 +422,29 @@ def gradio_main():
|
|
| 412 |
pct = 0.30 + 0.65 * min(step_count / max(epochs * processed, 1), 1.0)
|
| 413 |
progress(pct, desc=f"Step {step_count}, loss={last_loss:.4f}")
|
| 414 |
|
| 415 |
-
|
| 416 |
-
|
| 417 |
|
| 418 |
del model, trainer
|
| 419 |
gc.collect()
|
| 420 |
|
| 421 |
except ImportError as e:
|
| 422 |
-
|
| 423 |
-
|
| 424 |
import traceback
|
| 425 |
log_lines.append(traceback.format_exc())
|
| 426 |
except Exception as e:
|
| 427 |
import traceback
|
| 428 |
-
|
| 429 |
log_lines.append(traceback.format_exc())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
|
| 431 |
return "\n".join(log_lines)
|
| 432 |
|
|
|
|
| 316 |
"",
|
| 317 |
]
|
| 318 |
|
| 319 |
+
def _log(msg):
|
| 320 |
+
log_lines.append(msg)
|
| 321 |
+
print(f"[train] {msg}", flush=True)
|
| 322 |
+
|
| 323 |
try:
|
| 324 |
+
import subprocess, signal
|
| 325 |
+
_log("Stopping ace-server to free RAM for training...")
|
| 326 |
+
subprocess.run(["pkill", "-f", "ace-server"], stderr=subprocess.DEVNULL)
|
| 327 |
+
time.sleep(2)
|
| 328 |
+
gc.collect()
|
| 329 |
+
|
| 330 |
ckpt_files = os.listdir(ACE_CHECKPOINT_DIR) if os.path.isdir(ACE_CHECKPOINT_DIR) else []
|
| 331 |
if len(ckpt_files) < 3:
|
| 332 |
+
_log("[Step 0] Downloading model checkpoints...")
|
| 333 |
progress(0.02, desc="Downloading checkpoints...")
|
| 334 |
from huggingface_hub import snapshot_download
|
| 335 |
snapshot_download(
|
|
|
|
| 337 |
local_dir=ACE_CHECKPOINT_DIR,
|
| 338 |
ignore_patterns=["*.md", "*.txt", ".gitattributes"],
|
| 339 |
)
|
| 340 |
+
_log(" Checkpoints downloaded.")
|
| 341 |
|
| 342 |
if ACE_SOURCE_DIR not in sys.path:
|
| 343 |
sys.path.insert(0, ACE_SOURCE_DIR)
|
|
|
|
| 349 |
return _orig_load(filepath, *args, **kwargs)
|
| 350 |
torchaudio.load = _load_soundfile
|
| 351 |
|
| 352 |
+
_log("[Step 1/2] Preprocessing audio files...")
|
| 353 |
progress(0.10, desc="Preprocessing audio...")
|
| 354 |
|
| 355 |
tensor_dir = os.path.join(output_dir, "preprocessed_tensors")
|
|
|
|
| 369 |
processed = result.get("processed", 0)
|
| 370 |
total_files = result.get("total", 0)
|
| 371 |
failed = result.get("failed", 0)
|
| 372 |
+
_log(f" Preprocessed: {processed}/{total_files} (failed: {failed})")
|
| 373 |
|
| 374 |
if processed == 0:
|
| 375 |
+
_log("ERROR: No files preprocessed successfully.")
|
| 376 |
return "\n".join(log_lines)
|
| 377 |
|
| 378 |
+
_log("[Step 2/2] Training LoRA adapter (CPU, this will be slow)...")
|
| 379 |
progress(0.30, desc="Loading model for training...")
|
| 380 |
|
| 381 |
from acestep.training_v2.model_loader import load_decoder_for_training
|
|
|
|
| 422 |
pct = 0.30 + 0.65 * min(step_count / max(epochs * processed, 1), 1.0)
|
| 423 |
progress(pct, desc=f"Step {step_count}, loss={last_loss:.4f}")
|
| 424 |
|
| 425 |
+
_log(f"Training complete! Final: step {step_count}, loss={last_loss:.4f}")
|
| 426 |
+
_log(f"LoRA saved to: {output_dir}")
|
| 427 |
|
| 428 |
del model, trainer
|
| 429 |
gc.collect()
|
| 430 |
|
| 431 |
except ImportError as e:
|
| 432 |
+
_log(f"Import error: {e}")
|
| 433 |
+
_log(f"Check ACE-Step source at {ACE_SOURCE_DIR}")
|
| 434 |
import traceback
|
| 435 |
log_lines.append(traceback.format_exc())
|
| 436 |
except Exception as e:
|
| 437 |
import traceback
|
| 438 |
+
_log(f"ERROR: {e}")
|
| 439 |
log_lines.append(traceback.format_exc())
|
| 440 |
+
finally:
|
| 441 |
+
_log("Restarting ace-server...")
|
| 442 |
+
import subprocess
|
| 443 |
+
subprocess.Popen([
|
| 444 |
+
"/app/ace-server", "--host", "127.0.0.1", "--port", "8085",
|
| 445 |
+
"--models", "/app/models", "--adapters", "/app/adapters",
|
| 446 |
+
"--max-batch", "1",
|
| 447 |
+
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
| 448 |
|
| 449 |
return "\n".join(log_lines)
|
| 450 |
|