Spaces:
Running on Zero
Running on Zero
Harden MelBand model loading on Spaces
#1
by YanTianlong - opened
- app.py +15 -1
- docs/work-log.md +640 -620
- scripts/bootstrap_comfy.py +80 -1
app.py
CHANGED
|
@@ -25,6 +25,7 @@ import spaces
|
|
| 25 |
import torch
|
| 26 |
import websocket
|
| 27 |
|
|
|
|
| 28 |
from scripts.workflow_client import load_workflow, patch_voicegate_workflow
|
| 29 |
|
| 30 |
|
|
@@ -39,6 +40,7 @@ COMFY_PORT = "8188"
|
|
| 39 |
COMFY_PROCESS: subprocess.Popen | None = None
|
| 40 |
PREPARE_PROCESS: subprocess.Popen | None = None
|
| 41 |
BOOTSTRAPPED = False
|
|
|
|
| 42 |
BOOTSTRAP_LOG = Path("/tmp/voicegate_bootstrap.log")
|
| 43 |
USER_OUTPUT_DIR = ROOT / "user_outputs"
|
| 44 |
REQUIRED_MODEL_PATHS = [
|
|
@@ -487,7 +489,18 @@ def run_bootstrap(lines: list[str], *, allow_heavy: bool = True) -> None:
|
|
| 487 |
|
| 488 |
|
| 489 |
def missing_required_models() -> list[Path]:
|
| 490 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
|
| 492 |
|
| 493 |
def ensure_runtime_assets(lines: list[str]) -> None:
|
|
@@ -533,6 +546,7 @@ def ensure_comfy(lines: list[str], *, timeout: float = 240) -> dict[str, Any]:
|
|
| 533 |
raise RuntimeError(f"Runtime preparation failed with return code {returncode}.")
|
| 534 |
|
| 535 |
run_bootstrap(lines, allow_heavy=False)
|
|
|
|
| 536 |
|
| 537 |
try:
|
| 538 |
stats = wait_for_comfy(timeout=5)
|
|
|
|
| 25 |
import torch
|
| 26 |
import websocket
|
| 27 |
|
| 28 |
+
from scripts.bootstrap_comfy import patch_melband_loader, validate_melband_model
|
| 29 |
from scripts.workflow_client import load_workflow, patch_voicegate_workflow
|
| 30 |
|
| 31 |
|
|
|
|
| 40 |
COMFY_PROCESS: subprocess.Popen | None = None
|
| 41 |
PREPARE_PROCESS: subprocess.Popen | None = None
|
| 42 |
BOOTSTRAPPED = False
|
| 43 |
+
MODELS_VALIDATED = False
|
| 44 |
BOOTSTRAP_LOG = Path("/tmp/voicegate_bootstrap.log")
|
| 45 |
USER_OUTPUT_DIR = ROOT / "user_outputs"
|
| 46 |
REQUIRED_MODEL_PATHS = [
|
|
|
|
| 489 |
|
| 490 |
|
| 491 |
def missing_required_models() -> list[Path]:
|
| 492 |
+
global MODELS_VALIDATED
|
| 493 |
+
|
| 494 |
+
missing = [path for path in REQUIRED_MODEL_PATHS if not path.exists()]
|
| 495 |
+
if missing:
|
| 496 |
+
MODELS_VALIDATED = False
|
| 497 |
+
return missing
|
| 498 |
+
if not MODELS_VALIDATED:
|
| 499 |
+
melband_valid, _reason = validate_melband_model(verify_hash=True)
|
| 500 |
+
if not melband_valid:
|
| 501 |
+
return [REQUIRED_MODEL_PATHS[0]]
|
| 502 |
+
MODELS_VALIDATED = True
|
| 503 |
+
return []
|
| 504 |
|
| 505 |
|
| 506 |
def ensure_runtime_assets(lines: list[str]) -> None:
|
|
|
|
| 546 |
raise RuntimeError(f"Runtime preparation failed with return code {returncode}.")
|
| 547 |
|
| 548 |
run_bootstrap(lines, allow_heavy=False)
|
| 549 |
+
patch_melband_loader()
|
| 550 |
|
| 551 |
try:
|
| 552 |
stats = wait_for_comfy(timeout=5)
|
docs/work-log.md
CHANGED
|
@@ -1,38 +1,38 @@
|
|
| 1 |
-
# VoiceGate HF Space Work Log
|
| 2 |
-
|
| 3 |
-
This document records the effective work completed while preparing the
|
| 4 |
-
`build-small-hackathon/VoiceGate` Hugging Face Space, plus the pitfalls found
|
| 5 |
-
and how they were resolved.
|
| 6 |
-
|
| 7 |
-
## Current Snapshot
|
| 8 |
-
|
| 9 |
-
- Space: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
|
| 10 |
-
- Space git remote: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
|
| 11 |
-
- Runtime hardware: ZeroGPU / `zero-a10g`
|
| 12 |
-
- Space SDK: Gradio
|
| 13 |
-
- Local Space wrapper repo: `VoiceGate-hf`
|
| 14 |
-
- Local upstream reference checkout: `VoiceGate/`
|
| 15 |
- Latest confirmed normal runtime commit: `316b35db739d74d05543d6c8c9dd9c16e0580b17`
|
| 16 |
-
- Current expected Space secret: `DEEPSEEK_API_KEY`
|
| 17 |
-
- Default persistent model root: `/data/voicegate_models`
|
| 18 |
-
|
| 19 |
-
Do not commit API keys, model weights, uploaded media, generated outputs, or the
|
| 20 |
-
local `VoiceGate/` upstream checkout.
|
| 21 |
-
|
| 22 |
-
## Executive Summary
|
| 23 |
-
|
| 24 |
-
The Space is no longer just a blank scaffold. It can now run Gradio, invoke
|
| 25 |
-
ZeroGPU, prepare a ComfyUI runtime, start ComfyUI from a GPU-backed Gradio
|
| 26 |
-
function, and submit several segmented ComfyUI workflows.
|
| 27 |
-
|
| 28 |
-
Confirmed working:
|
| 29 |
-
|
| 30 |
-
- Hugging Face Space git push and normal rebuild flow.
|
| 31 |
-
- Dev Mode SSH for CPU/container diagnostics.
|
| 32 |
-
- ZeroGPU invocation from Gradio through `@spaces.GPU`.
|
| 33 |
-
- ComfyUI startup from inside a `@spaces.GPU` function.
|
| 34 |
-
- ComfyUI API calls from the Gradio process.
|
| 35 |
-
- DeepSeek-compatible LLM node with the Space secret.
|
| 36 |
- MelBand RoFormer smoke tests in CPU mode and ZeroGPU mode.
|
| 37 |
- VoxCPM2 TTS-only smoke test in ZeroGPU mode.
|
| 38 |
- VoiceBridge ASR-only smoke test in ZeroGPU mode.
|
|
@@ -44,223 +44,223 @@ Not yet confirmed at the start of 2026-06-06:
|
|
| 44 |
- SRT split -> VoxCPM -> SRT merge.
|
| 45 |
- Full short-audio VoiceGate workflow.
|
| 46 |
- Final user-facing Gradio upload/download UI.
|
| 47 |
-
|
| 48 |
-
## Repository Setup Completed
|
| 49 |
-
|
| 50 |
-
- Created and pushed the Space wrapper repository.
|
| 51 |
-
- Kept `VoiceGate/` as a local-only upstream reference and ignored it in git.
|
| 52 |
-
- Preserved Hugging Face LFS rules.
|
| 53 |
-
- Copied deployment workflows:
|
| 54 |
-
- `workflows/voicegate_api.json`
|
| 55 |
-
- `workflows/voicegate_ui.json`
|
| 56 |
-
- Confirmed the API workflow JSON is valid.
|
| 57 |
-
- Confirmed workflow files contain no committed API key.
|
| 58 |
-
|
| 59 |
-
## Dependency Inventory Completed
|
| 60 |
-
|
| 61 |
-
Required workflow node providers were identified and pinned:
|
| 62 |
-
|
| 63 |
-
- ComfyUI core:
|
| 64 |
-
`comfyanonymous/ComfyUI`
|
| 65 |
-
- VoiceBridge:
|
| 66 |
-
`YanTianlong-01/comfyui_voicebridge`
|
| 67 |
-
- RunningHub VoxCPM:
|
| 68 |
-
`RH-RunningHub/ComfyUI_RH_VoxCPM`
|
| 69 |
-
- MelBand RoFormer:
|
| 70 |
-
`kijai/ComfyUI-MelBandRoFormer`
|
| 71 |
-
- RunningHub LLM API:
|
| 72 |
-
`HM-RunningHub/ComfyUI_RH_LLM_API`
|
| 73 |
-
- rgthree:
|
| 74 |
-
`rgthree/rgthree-comfy`
|
| 75 |
-
- Easy Use:
|
| 76 |
-
`yolain/ComfyUI-Easy-Use`
|
| 77 |
-
- Comfyroll:
|
| 78 |
-
`Suzie1/ComfyUI_Comfyroll_CustomNodes`
|
| 79 |
-
- MW AudioTools:
|
| 80 |
-
`billwuhao/ComfyUI_AudioTools`
|
| 81 |
-
|
| 82 |
-
Important node source confirmations:
|
| 83 |
-
|
| 84 |
-
- `ReplaceText` is provided by ComfyUI core extra nodes.
|
| 85 |
-
- `MergeAudioMW` is provided by `ComfyUI_AudioTools`.
|
| 86 |
-
- `RH_LLMAPI_NODE` is provided by `ComfyUI_RH_LLM_API`.
|
| 87 |
-
|
| 88 |
-
## Runtime Bootstrap Added
|
| 89 |
-
|
| 90 |
-
The following scripts were added:
|
| 91 |
-
|
| 92 |
-
- `scripts/bootstrap_comfy.py`
|
| 93 |
-
- Clones ComfyUI.
|
| 94 |
-
- Checks out pinned commits.
|
| 95 |
-
- Clones required custom node repositories.
|
| 96 |
-
- Installs ComfyUI and custom node Python requirements.
|
| 97 |
-
- Prepares expected model directories.
|
| 98 |
-
- Optionally downloads large model assets with `--with-models`.
|
| 99 |
-
- `scripts/run_comfy.py`
|
| 100 |
-
- Starts ComfyUI.
|
| 101 |
-
- Waits for `/system_stats`.
|
| 102 |
-
- Supports `--cpu` for SSH diagnostics.
|
| 103 |
-
- `scripts/workflow_client.py`
|
| 104 |
-
- Loads `workflows/voicegate_api.json`.
|
| 105 |
-
- Uploads audio through the ComfyUI API.
|
| 106 |
-
- Patches workflow inputs.
|
| 107 |
-
- Submits `/prompt`.
|
| 108 |
-
- Waits for `/history/{prompt_id}`.
|
| 109 |
-
|
| 110 |
-
Workflow patching currently covers:
|
| 111 |
-
|
| 112 |
-
- Node `16`: uploaded audio filename.
|
| 113 |
-
- Node `105`: `DEEPSEEK_API_KEY`.
|
| 114 |
-
- Node `105`: API base URL.
|
| 115 |
-
- Node `105`: LLM model name.
|
| 116 |
-
- Node `110`: target language.
|
| 117 |
-
- Node `180`: job-specific audio output prefix.
|
| 118 |
-
- Node `214`: job-specific SRT output prefix.
|
| 119 |
-
|
| 120 |
-
## Hugging Face Space Runtime Findings
|
| 121 |
-
|
| 122 |
-
### Dev Mode and SSH
|
| 123 |
-
|
| 124 |
-
SSH target:
|
| 125 |
-
|
| 126 |
-
```text
|
| 127 |
-
build-small-hackathon-voicegate@ssh.hf.space
|
| 128 |
-
```
|
| 129 |
-
|
| 130 |
-
Local private key:
|
| 131 |
-
|
| 132 |
-
```text
|
| 133 |
-
C:\Users\yantianlong\.ssh\codex_space_voicegate
|
| 134 |
-
```
|
| 135 |
-
|
| 136 |
-
SSH is only available while the Space is in Dev Mode. Normally running Spaces do
|
| 137 |
-
not accept SSH and return:
|
| 138 |
-
|
| 139 |
-
```text
|
| 140 |
-
Bad request: SSH in only allowed in Dev mode
|
| 141 |
-
```
|
| 142 |
-
|
| 143 |
-
Dev Mode can be toggled through the Hugging Face API endpoint:
|
| 144 |
-
|
| 145 |
-
```text
|
| 146 |
-
POST /api/spaces/build-small-hackathon/VoiceGate/dev-mode
|
| 147 |
-
```
|
| 148 |
-
|
| 149 |
-
Use Dev Mode for diagnostics only. Persistent fixes must be committed locally
|
| 150 |
-
and pushed.
|
| 151 |
-
|
| 152 |
-
### Dev Mode Stale Commit Pitfall
|
| 153 |
-
|
| 154 |
-
The running container initially stayed on the original template commit:
|
| 155 |
-
|
| 156 |
-
```text
|
| 157 |
-
a94117f35a42cb17f654ae70cbe619c15345d057
|
| 158 |
-
```
|
| 159 |
-
|
| 160 |
-
even after newer commits were pushed. `restart_space` alone did not move it to
|
| 161 |
-
the latest repository state while Dev Mode was enabled.
|
| 162 |
-
|
| 163 |
-
Fix:
|
| 164 |
-
|
| 165 |
-
- Disable Dev Mode.
|
| 166 |
-
- Use `factory_reboot=True` or push a new commit to trigger a normal rebuild.
|
| 167 |
-
- Confirm runtime metadata reports the latest commit.
|
| 168 |
-
|
| 169 |
-
### ZeroGPU Startup Requirement
|
| 170 |
-
|
| 171 |
-
When Dev Mode was disabled, the Space entered `RUNTIME_ERROR` with:
|
| 172 |
-
|
| 173 |
-
```text
|
| 174 |
-
No @spaces.GPU function detected during startup
|
| 175 |
-
```
|
| 176 |
-
|
| 177 |
-
Fix:
|
| 178 |
-
|
| 179 |
-
- Import `spaces`.
|
| 180 |
-
- Add at least one `@spaces.GPU(duration=...)` function in `app.py`.
|
| 181 |
-
|
| 182 |
-
Initial placeholder fix:
|
| 183 |
-
|
| 184 |
-
```python
|
| 185 |
-
@spaces.GPU(duration=30)
|
| 186 |
-
def placeholder():
|
| 187 |
-
...
|
| 188 |
-
```
|
| 189 |
-
|
| 190 |
-
Later this placeholder was replaced by real diagnostic functions:
|
| 191 |
-
|
| 192 |
-
```python
|
| 193 |
-
@spaces.GPU(duration=60)
|
| 194 |
-
def gpu_smoke_test():
|
| 195 |
-
...
|
| 196 |
-
|
| 197 |
-
@spaces.GPU(duration=900)
|
| 198 |
-
def comfy_runtime_test():
|
| 199 |
-
...
|
| 200 |
-
```
|
| 201 |
-
|
| 202 |
-
### SSH Does Not Expose ZeroGPU CUDA
|
| 203 |
-
|
| 204 |
-
Starting ComfyUI normally through SSH failed with:
|
| 205 |
-
|
| 206 |
-
```text
|
| 207 |
-
RuntimeError: No CUDA GPUs are available
|
| 208 |
-
```
|
| 209 |
-
|
| 210 |
-
Conclusion:
|
| 211 |
-
|
| 212 |
-
- SSH is useful for CPU-mode diagnostics.
|
| 213 |
-
- Real GPU work must run from the Gradio process inside a `@spaces.GPU`
|
| 214 |
-
function.
|
| 215 |
-
|
| 216 |
-
CPU diagnostic command:
|
| 217 |
-
|
| 218 |
-
```bash
|
| 219 |
-
python scripts/run_comfy.py --cpu
|
| 220 |
-
```
|
| 221 |
-
|
| 222 |
-
### Gradio Request Timeout During Bootstrap
|
| 223 |
-
|
| 224 |
-
Long bootstrap work should not run synchronously inside a Gradio request. The
|
| 225 |
-
first attempt did this:
|
| 226 |
-
|
| 227 |
-
```text
|
| 228 |
-
Gradio click -> bootstrap_comfy.py -> clone repos -> pip install -> start ComfyUI
|
| 229 |
-
```
|
| 230 |
-
|
| 231 |
-
The request was interrupted by Gradio/ZeroGPU's outer queue after roughly 2.5
|
| 232 |
-
minutes and returned:
|
| 233 |
-
|
| 234 |
-
```text
|
| 235 |
-
event: error
|
| 236 |
-
data: {"error": null}
|
| 237 |
-
```
|
| 238 |
-
|
| 239 |
-
Fix:
|
| 240 |
-
|
| 241 |
-
- Add a non-GPU `Prepare` action that starts `scripts/bootstrap_comfy.py` as a
|
| 242 |
-
background process.
|
| 243 |
-
- Add `Prepare Status` to poll `/tmp/voicegate_bootstrap.log`.
|
| 244 |
-
- Keep GPU actions focused on starting ComfyUI and running actual CUDA work.
|
| 245 |
-
|
| 246 |
-
This avoids wasting ZeroGPU time on clone/install steps and prevents the request
|
| 247 |
-
from being killed before diagnostics can return useful logs.
|
| 248 |
-
|
| 249 |
### Runtime Pip Install Pitfall
|
| 250 |
-
|
| 251 |
-
The background bootstrap installed a large dependency set and upgraded the
|
| 252 |
-
on-disk Torch package. The already-running Gradio process continued to report:
|
| 253 |
-
|
| 254 |
-
```text
|
| 255 |
-
torch=2.11.0+cu130
|
| 256 |
-
```
|
| 257 |
-
|
| 258 |
-
while the ComfyUI subprocess started afterwards reported:
|
| 259 |
-
|
| 260 |
-
```text
|
| 261 |
-
pytorch_version=2.12.0+cu130
|
| 262 |
-
```
|
| 263 |
-
|
| 264 |
This is workable for diagnostics, but final production should avoid heavy
|
| 265 |
runtime `pip install` where possible. Prefer moving stable dependencies into
|
| 266 |
Space build-time requirements or explicitly controlling pins.
|
|
@@ -295,295 +295,295 @@ The working diagnostic used:
|
|
| 295 |
|
| 296 |
For future tests, keep diagnostic durations conservative and increase only when
|
| 297 |
the workflow has already proven it needs more time.
|
| 298 |
-
|
| 299 |
-
## Dependency Pitfalls and Fixes
|
| 300 |
-
|
| 301 |
-
`ComfyUI_AudioTools` initially failed to import.
|
| 302 |
-
|
| 303 |
-
First failure:
|
| 304 |
-
|
| 305 |
-
```text
|
| 306 |
-
SoX could not be found
|
| 307 |
-
ModuleNotFoundError: No module named 'sounddevice'
|
| 308 |
-
```
|
| 309 |
-
|
| 310 |
-
Second failure after adding `sounddevice`:
|
| 311 |
-
|
| 312 |
-
```text
|
| 313 |
-
OSError: PortAudio library not found
|
| 314 |
-
```
|
| 315 |
-
|
| 316 |
-
Third failure:
|
| 317 |
-
|
| 318 |
-
```text
|
| 319 |
-
ModuleNotFoundError: No module named 'easydict'
|
| 320 |
-
```
|
| 321 |
-
|
| 322 |
-
Fourth failure:
|
| 323 |
-
|
| 324 |
-
```text
|
| 325 |
-
ModuleNotFoundError: No module named 'pytorch_lightning'
|
| 326 |
-
```
|
| 327 |
-
|
| 328 |
-
Fixes added:
|
| 329 |
-
|
| 330 |
-
- `packages.txt`
|
| 331 |
-
- `sox`
|
| 332 |
-
- `libportaudio2`
|
| 333 |
-
- `portaudio19-dev`
|
| 334 |
-
- `requirements.txt`
|
| 335 |
-
- `sounddevice`
|
| 336 |
-
- `easydict`
|
| 337 |
-
- `pytorch-lightning`
|
| 338 |
-
|
| 339 |
-
Final verification:
|
| 340 |
-
|
| 341 |
-
```text
|
| 342 |
-
0.4 seconds: /home/user/app/ComfyUI/custom_nodes/ComfyUI_AudioTools
|
| 343 |
-
```
|
| 344 |
-
|
| 345 |
-
with no `IMPORT FAILED` entry.
|
| 346 |
-
|
| 347 |
-
## ComfyUI API Smoke Test
|
| 348 |
-
|
| 349 |
-
Test audio source:
|
| 350 |
-
|
| 351 |
-
```text
|
| 352 |
-
D:\voicebridge-test-audio\test_audio\2-坤哥.MP3
|
| 353 |
-
```
|
| 354 |
-
|
| 355 |
-
The first upload attempt used a plain PowerShell byte pipeline and corrupted the
|
| 356 |
-
binary file. The remote file was identified as text instead of MP3, and
|
| 357 |
-
`LoadAudio` failed with:
|
| 358 |
-
|
| 359 |
-
```text
|
| 360 |
-
Invalid data found when processing input: 'avcodec_send_packet()'
|
| 361 |
-
```
|
| 362 |
-
|
| 363 |
-
Fix:
|
| 364 |
-
|
| 365 |
-
- Upload binary test media through a binary-safe method.
|
| 366 |
-
- Verify remote `sha256sum` before using the file.
|
| 367 |
-
|
| 368 |
-
Successful upload result:
|
| 369 |
-
|
| 370 |
-
```text
|
| 371 |
-
/tmp/voicegate_test_audio.mp3: Audio file with ID3 version 2.3.0
|
| 372 |
-
```
|
| 373 |
-
|
| 374 |
-
ComfyUI API endpoints verified in Dev Mode:
|
| 375 |
-
|
| 376 |
-
- `/system_stats`
|
| 377 |
-
- `/upload/image`
|
| 378 |
-
- `/prompt`
|
| 379 |
-
- `/history/{prompt_id}`
|
| 380 |
-
|
| 381 |
-
Minimal test workflow:
|
| 382 |
-
|
| 383 |
-
```text
|
| 384 |
-
LoadAudio -> SaveAudioMP3
|
| 385 |
-
```
|
| 386 |
-
|
| 387 |
-
Successful `/history/{prompt_id}` result:
|
| 388 |
-
|
| 389 |
-
```text
|
| 390 |
-
status_str: success
|
| 391 |
-
completed: true
|
| 392 |
-
```
|
| 393 |
-
|
| 394 |
-
Output reported by ComfyUI:
|
| 395 |
-
|
| 396 |
-
```text
|
| 397 |
-
audio/api_smoke_voicegate_00001.mp3
|
| 398 |
-
```
|
| 399 |
-
|
| 400 |
-
## Segmented Workflow Smoke Tests
|
| 401 |
-
|
| 402 |
-
### ComfyUI From Gradio ZeroGPU
|
| 403 |
-
|
| 404 |
-
On 2026-06-05, `app.py` was expanded with diagnostic Gradio actions:
|
| 405 |
-
|
| 406 |
-
- `prepare_runtime`: starts `scripts/bootstrap_comfy.py` in the background and
|
| 407 |
-
writes progress to `/tmp/voicegate_bootstrap.log`.
|
| 408 |
-
- `prepare_status`: reports the background bootstrap status and log tail.
|
| 409 |
-
- `comfy_runtime_test`: runs inside `@spaces.GPU`, starts ComfyUI, and calls
|
| 410 |
-
`/system_stats`.
|
| 411 |
-
- `melband_gpu_test`: runs a tiny MelBand workflow inside `@spaces.GPU`.
|
| 412 |
-
- `voxcpm_tts_gpu_test`: runs a tiny VoxCPM2 TTS-only workflow inside
|
| 413 |
-
`@spaces.GPU`.
|
| 414 |
-
|
| 415 |
-
The first attempt ran the full bootstrap synchronously inside a Gradio request
|
| 416 |
-
and the request was interrupted by the outer queue with `event: error` and no
|
| 417 |
-
function payload after roughly 2.5 minutes. The fix was to start bootstrap as a
|
| 418 |
-
background process and poll a status endpoint.
|
| 419 |
-
|
| 420 |
-
The background prepare completed successfully. It installed a large dependency
|
| 421 |
-
set and upgraded the on-disk Torch package from `2.11.0` to `2.12.0`. The
|
| 422 |
-
already-running Gradio process still reported its originally imported
|
| 423 |
-
`torch=2.11.0+cu130`, while the newly started ComfyUI subprocess reported:
|
| 424 |
-
|
| 425 |
-
```text
|
| 426 |
-
pytorch_version=2.12.0+cu130
|
| 427 |
-
```
|
| 428 |
-
|
| 429 |
-
This is acceptable for the smoke test, but runtime pip installs are not ideal
|
| 430 |
-
for the final app. A later pass should move heavy Python dependencies into the
|
| 431 |
-
Space build/install phase or pin the root requirements more deliberately.
|
| 432 |
-
|
| 433 |
-
`comfy_runtime_test` result:
|
| 434 |
-
|
| 435 |
-
```text
|
| 436 |
-
cuda_available=True
|
| 437 |
-
comfy_ready=true
|
| 438 |
-
comfy_elapsed_sec=16.0
|
| 439 |
-
ComfyUI version=0.24.0
|
| 440 |
-
device=cuda:0 NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
|
| 441 |
-
vram_total=50868518912
|
| 442 |
-
```
|
| 443 |
-
|
| 444 |
-
Observed behavior: separate `@spaces.GPU` calls may run in separate worker
|
| 445 |
-
processes, so the ComfyUI subprocess should not be assumed to persist across
|
| 446 |
-
different button/API calls.
|
| 447 |
-
|
| 448 |
-
### ZeroGPU Gradio Invocation
|
| 449 |
-
|
| 450 |
-
On 2026-06-05, the Space was tested in normal runtime, with Dev Mode off, using
|
| 451 |
-
a Gradio button backed by:
|
| 452 |
-
|
| 453 |
-
```python
|
| 454 |
-
@spaces.GPU(duration=60)
|
| 455 |
-
def gpu_smoke_test():
|
| 456 |
-
...
|
| 457 |
-
```
|
| 458 |
-
|
| 459 |
-
The private Space API was called with the local Hugging Face token through:
|
| 460 |
-
|
| 461 |
-
```text
|
| 462 |
-
POST /gradio_api/call/gpu_smoke_test
|
| 463 |
-
GET /gradio_api/call/gpu_smoke_test/{event_id}
|
| 464 |
-
```
|
| 465 |
-
|
| 466 |
-
Result:
|
| 467 |
-
|
| 468 |
-
```text
|
| 469 |
-
torch=2.11.0+cu130
|
| 470 |
-
cuda_available=True
|
| 471 |
-
cuda_device_count=1
|
| 472 |
-
device_name=NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
|
| 473 |
-
total_memory_gb=47.38
|
| 474 |
-
tensor_result=240.0
|
| 475 |
-
memory_reserved_mb=2.00
|
| 476 |
-
```
|
| 477 |
-
|
| 478 |
-
This confirms ZeroGPU CUDA is available from the normal Gradio runtime when the
|
| 479 |
-
work is executed inside a `@spaces.GPU` function. SSH still should be treated as
|
| 480 |
-
CPU-only diagnostic access.
|
| 481 |
-
|
| 482 |
-
### DeepSeek LLM Node
|
| 483 |
-
|
| 484 |
-
On 2026-06-05, `RH_LLMAPI_NODE` was tested through ComfyUI in Dev Mode using
|
| 485 |
-
the Space `DEEPSEEK_API_KEY` secret. The key was not printed.
|
| 486 |
-
|
| 487 |
-
Minimal workflow:
|
| 488 |
-
|
| 489 |
-
```text
|
| 490 |
-
RH_LLMAPI_NODE -> easy showAnything
|
| 491 |
-
```
|
| 492 |
-
|
| 493 |
-
Prompt:
|
| 494 |
-
|
| 495 |
-
```text
|
| 496 |
-
Translate to Simplified Chinese: VoiceGate smoke test.
|
| 497 |
-
```
|
| 498 |
-
|
| 499 |
-
Result:
|
| 500 |
-
|
| 501 |
-
```text
|
| 502 |
-
status_str: success
|
| 503 |
-
output: VoiceGate 冒烟测试。
|
| 504 |
-
```
|
| 505 |
-
|
| 506 |
-
This confirms the RunningHub LLM node can read the Space secret and call the
|
| 507 |
-
DeepSeek-compatible API endpoint.
|
| 508 |
-
|
| 509 |
-
### MelBand RoFormer
|
| 510 |
-
|
| 511 |
-
On 2026-06-05, `MelBandRoFormerModelLoader` and `MelBandRoFormerSampler` were
|
| 512 |
-
tested through ComfyUI in CPU mode.
|
| 513 |
-
|
| 514 |
-
Input:
|
| 515 |
-
|
| 516 |
-
```text
|
| 517 |
-
1 second synthetic 440 Hz WAV generated with ffmpeg
|
| 518 |
-
```
|
| 519 |
-
|
| 520 |
-
Minimal workflow:
|
| 521 |
-
|
| 522 |
-
```text
|
| 523 |
-
LoadAudio -> MelBandRoFormerModelLoader -> MelBandRoFormerSampler
|
| 524 |
-
-> SaveAudioMP3(vocals)
|
| 525 |
-
-> SaveAudioMP3(instruments)
|
| 526 |
-
```
|
| 527 |
-
|
| 528 |
-
Result:
|
| 529 |
-
|
| 530 |
-
```text
|
| 531 |
-
status_str: success
|
| 532 |
-
audio/melband_smoke_vocals_00001.mp3
|
| 533 |
-
audio/melband_smoke_instruments_00001.mp3
|
| 534 |
-
```
|
| 535 |
-
|
| 536 |
-
CPU-mode runtime for the 1 second smoke input was about 51 seconds. Real runs
|
| 537 |
-
should execute inside a `@spaces.GPU` function.
|
| 538 |
-
|
| 539 |
-
Later on 2026-06-05, the same kind of tiny MelBand smoke test was run from the
|
| 540 |
-
normal Gradio runtime inside `@spaces.GPU`.
|
| 541 |
-
|
| 542 |
-
Input:
|
| 543 |
-
|
| 544 |
-
```text
|
| 545 |
-
1 second synthetic 440 Hz WAV written to ComfyUI/input
|
| 546 |
-
```
|
| 547 |
-
|
| 548 |
-
Result:
|
| 549 |
-
|
| 550 |
-
```text
|
| 551 |
-
status_str=success
|
| 552 |
-
completed=True
|
| 553 |
-
audio/melband_gpu_32459bea_instruments_00001.mp3
|
| 554 |
-
audio/melband_gpu_32459bea_vocals_00001.mp3
|
| 555 |
-
elapsed_sec=78.3
|
| 556 |
-
```
|
| 557 |
-
|
| 558 |
-
This confirms the MelBand custom node and model can execute from the Space
|
| 559 |
-
ZeroGPU path.
|
| 560 |
-
|
| 561 |
### VoxCPM2 TTS-only
|
| 562 |
-
|
| 563 |
-
On 2026-06-05, a minimal VoxCPM2 TTS-only workflow was run from the normal
|
| 564 |
-
Gradio runtime inside `@spaces.GPU`.
|
| 565 |
-
|
| 566 |
-
Minimal workflow:
|
| 567 |
-
|
| 568 |
-
```text
|
| 569 |
-
RunningHub_VoxCPM_LoadModel -> RunningHub_VoxCPM_Generate -> SaveAudioMP3
|
| 570 |
-
```
|
| 571 |
-
|
| 572 |
-
Prompt text:
|
| 573 |
-
|
| 574 |
-
```text
|
| 575 |
-
你好,VoiceGate GPU 语音合成测试。
|
| 576 |
-
```
|
| 577 |
-
|
| 578 |
-
Result:
|
| 579 |
-
|
| 580 |
-
```text
|
| 581 |
-
status_str=success
|
| 582 |
-
completed=True
|
| 583 |
-
audio/voxcpm_tts_gpu_cda209ec_00001.mp3
|
| 584 |
-
elapsed_sec=766.2
|
| 585 |
-
```
|
| 586 |
-
|
| 587 |
This confirms VoxCPM2 fits and executes in ZeroGPU, but the first cold TTS-only
|
| 588 |
run was very slow. The final app should minimize cold starts, avoid repeated
|
| 589 |
ComfyUI/model reloads where possible, and use shorter diagnostic prompts while
|
|
@@ -653,103 +653,103 @@ This confirms the Qwen3-ASR model, forced aligner, VoiceBridge ASR nodes, and
|
|
| 653 |
SRT generation can run in the Space ZeroGPU path. The smoke test intentionally
|
| 654 |
used `attention=sdpa` instead of `flash_attention_2`; `flash_attention_2`
|
| 655 |
availability remains unverified.
|
| 656 |
-
|
| 657 |
-
## Secrets and API Keys
|
| 658 |
-
|
| 659 |
-
`DEEPSEEK_API_KEY` should be stored only as a Hugging Face Space Secret.
|
| 660 |
-
|
| 661 |
-
Current expected secret:
|
| 662 |
-
|
| 663 |
-
```text
|
| 664 |
-
DEEPSEEK_API_KEY
|
| 665 |
-
```
|
| 666 |
-
|
| 667 |
-
Optional variables:
|
| 668 |
-
|
| 669 |
-
```text
|
| 670 |
-
DEEPSEEK_BASE_URL=https://api.deepseek.com
|
| 671 |
-
DEEPSEEK_MODEL=deepseek-v4-flash
|
| 672 |
-
```
|
| 673 |
-
|
| 674 |
-
Never store these values in:
|
| 675 |
-
|
| 676 |
-
- `app.py`
|
| 677 |
-
- workflow JSON files
|
| 678 |
-
- README files
|
| 679 |
-
- docs
|
| 680 |
-
- `.env` files committed to git
|
| 681 |
-
|
| 682 |
-
`scripts/workflow_client.py` reads these from environment variables.
|
| 683 |
-
|
| 684 |
-
`scripts/check_space_env.py` verifies whether these environment variables are
|
| 685 |
-
present without printing their values.
|
| 686 |
-
|
| 687 |
-
## Model Storage
|
| 688 |
-
|
| 689 |
-
Large model files should live on the Space persistent storage volume instead of
|
| 690 |
-
inside `/home/user/app`, because `/home/user/app` can be replaced during Space
|
| 691 |
-
rebuilds.
|
| 692 |
-
|
| 693 |
-
Default model root:
|
| 694 |
-
|
| 695 |
-
```text
|
| 696 |
-
/data/voicegate_models
|
| 697 |
-
```
|
| 698 |
-
|
| 699 |
-
`scripts/bootstrap_comfy.py` creates symlinks from ComfyUI's expected paths to
|
| 700 |
-
that persistent root:
|
| 701 |
-
|
| 702 |
-
```text
|
| 703 |
-
ComfyUI/models/voxcpm/VoxCPM2
|
| 704 |
-
-> /data/voicegate_models/voxcpm/VoxCPM2
|
| 705 |
-
|
| 706 |
ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
|
| 707 |
-> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
|
| 708 |
|
| 709 |
ComfyUI/models/Qwen3-ASR
|
| 710 |
-> /data/voicegate_models/Qwen3-ASR
|
| 711 |
-
```
|
| 712 |
-
|
| 713 |
-
Override the root with:
|
| 714 |
-
|
| 715 |
-
```text
|
| 716 |
-
VOICEGATE_MODEL_ROOT
|
| 717 |
-
```
|
| 718 |
-
|
| 719 |
-
On 2026-06-05, the first two explicit ComfyUI-path models were downloaded to
|
| 720 |
-
persistent storage:
|
| 721 |
-
|
| 722 |
-
```text
|
| 723 |
/data/voicegate_models/voxcpm/VoxCPM2/model.safetensors
|
| 724 |
/data/voicegate_models/voxcpm/VoxCPM2/audiovae.pth
|
| 725 |
/data/voicegate_models/diffusion_models/MelBandRoFormer_comfy/MelBandRoformer_fp32.safetensors
|
| 726 |
/data/voicegate_models/Qwen3-ASR/Qwen3-ASR-1.7B
|
| 727 |
/data/voicegate_models/Qwen3-ASR/Qwen3-ForcedAligner-0.6B
|
| 728 |
-
```
|
| 729 |
-
|
| 730 |
-
Verified symlinks:
|
| 731 |
-
|
| 732 |
-
```text
|
| 733 |
-
/home/user/app/ComfyUI/models/voxcpm/VoxCPM2
|
| 734 |
-
-> /data/voicegate_models/voxcpm/VoxCPM2
|
| 735 |
-
|
| 736 |
/home/user/app/ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
|
| 737 |
-> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
|
| 738 |
|
| 739 |
/home/user/app/ComfyUI/models/Qwen3-ASR
|
| 740 |
-> /data/voicegate_models/Qwen3-ASR
|
| 741 |
-
```
|
| 742 |
-
|
| 743 |
-
`DEEPSEEK_API_KEY` was also verified as present in the Space environment without
|
| 744 |
-
printing its value.
|
| 745 |
-
|
| 746 |
-
Model download pitfall:
|
| 747 |
-
|
| 748 |
-
- `huggingface-cli download` is deprecated and failed in the Space.
|
| 749 |
-
- `hf download` also failed because of a CLI dependency compatibility issue.
|
| 750 |
-
- `scripts/bootstrap_comfy.py` now uses the `huggingface_hub` Python API
|
| 751 |
-
directly for model downloads.
|
| 752 |
-
|
| 753 |
## Current Known Good Commits
|
| 754 |
|
| 755 |
- `683b147` Add ComfyUI runtime bootstrap scripts
|
|
@@ -905,3 +905,23 @@ Next recommended steps:
|
|
| 905 |
2. Polish the first Gradio user interface and validate the automatic model
|
| 906 |
preparation path after Space rebuilds/hardware changes.
|
| 907 |
3. Reduce runtime dependency installation and model reload overhead.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VoiceGate HF Space Work Log
|
| 2 |
+
|
| 3 |
+
This document records the effective work completed while preparing the
|
| 4 |
+
`build-small-hackathon/VoiceGate` Hugging Face Space, plus the pitfalls found
|
| 5 |
+
and how they were resolved.
|
| 6 |
+
|
| 7 |
+
## Current Snapshot
|
| 8 |
+
|
| 9 |
+
- Space: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
|
| 10 |
+
- Space git remote: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
|
| 11 |
+
- Runtime hardware: ZeroGPU / `zero-a10g`
|
| 12 |
+
- Space SDK: Gradio
|
| 13 |
+
- Local Space wrapper repo: `VoiceGate-hf`
|
| 14 |
+
- Local upstream reference checkout: `VoiceGate/`
|
| 15 |
- Latest confirmed normal runtime commit: `316b35db739d74d05543d6c8c9dd9c16e0580b17`
|
| 16 |
+
- Current expected Space secret: `DEEPSEEK_API_KEY`
|
| 17 |
+
- Default persistent model root: `/data/voicegate_models`
|
| 18 |
+
|
| 19 |
+
Do not commit API keys, model weights, uploaded media, generated outputs, or the
|
| 20 |
+
local `VoiceGate/` upstream checkout.
|
| 21 |
+
|
| 22 |
+
## Executive Summary
|
| 23 |
+
|
| 24 |
+
The Space is no longer just a blank scaffold. It can now run Gradio, invoke
|
| 25 |
+
ZeroGPU, prepare a ComfyUI runtime, start ComfyUI from a GPU-backed Gradio
|
| 26 |
+
function, and submit several segmented ComfyUI workflows.
|
| 27 |
+
|
| 28 |
+
Confirmed working:
|
| 29 |
+
|
| 30 |
+
- Hugging Face Space git push and normal rebuild flow.
|
| 31 |
+
- Dev Mode SSH for CPU/container diagnostics.
|
| 32 |
+
- ZeroGPU invocation from Gradio through `@spaces.GPU`.
|
| 33 |
+
- ComfyUI startup from inside a `@spaces.GPU` function.
|
| 34 |
+
- ComfyUI API calls from the Gradio process.
|
| 35 |
+
- DeepSeek-compatible LLM node with the Space secret.
|
| 36 |
- MelBand RoFormer smoke tests in CPU mode and ZeroGPU mode.
|
| 37 |
- VoxCPM2 TTS-only smoke test in ZeroGPU mode.
|
| 38 |
- VoiceBridge ASR-only smoke test in ZeroGPU mode.
|
|
|
|
| 44 |
- SRT split -> VoxCPM -> SRT merge.
|
| 45 |
- Full short-audio VoiceGate workflow.
|
| 46 |
- Final user-facing Gradio upload/download UI.
|
| 47 |
+
|
| 48 |
+
## Repository Setup Completed
|
| 49 |
+
|
| 50 |
+
- Created and pushed the Space wrapper repository.
|
| 51 |
+
- Kept `VoiceGate/` as a local-only upstream reference and ignored it in git.
|
| 52 |
+
- Preserved Hugging Face LFS rules.
|
| 53 |
+
- Copied deployment workflows:
|
| 54 |
+
- `workflows/voicegate_api.json`
|
| 55 |
+
- `workflows/voicegate_ui.json`
|
| 56 |
+
- Confirmed the API workflow JSON is valid.
|
| 57 |
+
- Confirmed workflow files contain no committed API key.
|
| 58 |
+
|
| 59 |
+
## Dependency Inventory Completed
|
| 60 |
+
|
| 61 |
+
Required workflow node providers were identified and pinned:
|
| 62 |
+
|
| 63 |
+
- ComfyUI core:
|
| 64 |
+
`comfyanonymous/ComfyUI`
|
| 65 |
+
- VoiceBridge:
|
| 66 |
+
`YanTianlong-01/comfyui_voicebridge`
|
| 67 |
+
- RunningHub VoxCPM:
|
| 68 |
+
`RH-RunningHub/ComfyUI_RH_VoxCPM`
|
| 69 |
+
- MelBand RoFormer:
|
| 70 |
+
`kijai/ComfyUI-MelBandRoFormer`
|
| 71 |
+
- RunningHub LLM API:
|
| 72 |
+
`HM-RunningHub/ComfyUI_RH_LLM_API`
|
| 73 |
+
- rgthree:
|
| 74 |
+
`rgthree/rgthree-comfy`
|
| 75 |
+
- Easy Use:
|
| 76 |
+
`yolain/ComfyUI-Easy-Use`
|
| 77 |
+
- Comfyroll:
|
| 78 |
+
`Suzie1/ComfyUI_Comfyroll_CustomNodes`
|
| 79 |
+
- MW AudioTools:
|
| 80 |
+
`billwuhao/ComfyUI_AudioTools`
|
| 81 |
+
|
| 82 |
+
Important node source confirmations:
|
| 83 |
+
|
| 84 |
+
- `ReplaceText` is provided by ComfyUI core extra nodes.
|
| 85 |
+
- `MergeAudioMW` is provided by `ComfyUI_AudioTools`.
|
| 86 |
+
- `RH_LLMAPI_NODE` is provided by `ComfyUI_RH_LLM_API`.
|
| 87 |
+
|
| 88 |
+
## Runtime Bootstrap Added
|
| 89 |
+
|
| 90 |
+
The following scripts were added:
|
| 91 |
+
|
| 92 |
+
- `scripts/bootstrap_comfy.py`
|
| 93 |
+
- Clones ComfyUI.
|
| 94 |
+
- Checks out pinned commits.
|
| 95 |
+
- Clones required custom node repositories.
|
| 96 |
+
- Installs ComfyUI and custom node Python requirements.
|
| 97 |
+
- Prepares expected model directories.
|
| 98 |
+
- Optionally downloads large model assets with `--with-models`.
|
| 99 |
+
- `scripts/run_comfy.py`
|
| 100 |
+
- Starts ComfyUI.
|
| 101 |
+
- Waits for `/system_stats`.
|
| 102 |
+
- Supports `--cpu` for SSH diagnostics.
|
| 103 |
+
- `scripts/workflow_client.py`
|
| 104 |
+
- Loads `workflows/voicegate_api.json`.
|
| 105 |
+
- Uploads audio through the ComfyUI API.
|
| 106 |
+
- Patches workflow inputs.
|
| 107 |
+
- Submits `/prompt`.
|
| 108 |
+
- Waits for `/history/{prompt_id}`.
|
| 109 |
+
|
| 110 |
+
Workflow patching currently covers:
|
| 111 |
+
|
| 112 |
+
- Node `16`: uploaded audio filename.
|
| 113 |
+
- Node `105`: `DEEPSEEK_API_KEY`.
|
| 114 |
+
- Node `105`: API base URL.
|
| 115 |
+
- Node `105`: LLM model name.
|
| 116 |
+
- Node `110`: target language.
|
| 117 |
+
- Node `180`: job-specific audio output prefix.
|
| 118 |
+
- Node `214`: job-specific SRT output prefix.
|
| 119 |
+
|
| 120 |
+
## Hugging Face Space Runtime Findings
|
| 121 |
+
|
| 122 |
+
### Dev Mode and SSH
|
| 123 |
+
|
| 124 |
+
SSH target:
|
| 125 |
+
|
| 126 |
+
```text
|
| 127 |
+
build-small-hackathon-voicegate@ssh.hf.space
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
Local private key:
|
| 131 |
+
|
| 132 |
+
```text
|
| 133 |
+
C:\Users\yantianlong\.ssh\codex_space_voicegate
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
SSH is only available while the Space is in Dev Mode. Normally running Spaces do
|
| 137 |
+
not accept SSH and return:
|
| 138 |
+
|
| 139 |
+
```text
|
| 140 |
+
Bad request: SSH in only allowed in Dev mode
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
Dev Mode can be toggled through the Hugging Face API endpoint:
|
| 144 |
+
|
| 145 |
+
```text
|
| 146 |
+
POST /api/spaces/build-small-hackathon/VoiceGate/dev-mode
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
Use Dev Mode for diagnostics only. Persistent fixes must be committed locally
|
| 150 |
+
and pushed.
|
| 151 |
+
|
| 152 |
+
### Dev Mode Stale Commit Pitfall
|
| 153 |
+
|
| 154 |
+
The running container initially stayed on the original template commit:
|
| 155 |
+
|
| 156 |
+
```text
|
| 157 |
+
a94117f35a42cb17f654ae70cbe619c15345d057
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
even after newer commits were pushed. `restart_space` alone did not move it to
|
| 161 |
+
the latest repository state while Dev Mode was enabled.
|
| 162 |
+
|
| 163 |
+
Fix:
|
| 164 |
+
|
| 165 |
+
- Disable Dev Mode.
|
| 166 |
+
- Use `factory_reboot=True` or push a new commit to trigger a normal rebuild.
|
| 167 |
+
- Confirm runtime metadata reports the latest commit.
|
| 168 |
+
|
| 169 |
+
### ZeroGPU Startup Requirement
|
| 170 |
+
|
| 171 |
+
When Dev Mode was disabled, the Space entered `RUNTIME_ERROR` with:
|
| 172 |
+
|
| 173 |
+
```text
|
| 174 |
+
No @spaces.GPU function detected during startup
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
Fix:
|
| 178 |
+
|
| 179 |
+
- Import `spaces`.
|
| 180 |
+
- Add at least one `@spaces.GPU(duration=...)` function in `app.py`.
|
| 181 |
+
|
| 182 |
+
Initial placeholder fix:
|
| 183 |
+
|
| 184 |
+
```python
|
| 185 |
+
@spaces.GPU(duration=30)
|
| 186 |
+
def placeholder():
|
| 187 |
+
...
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
Later this placeholder was replaced by real diagnostic functions:
|
| 191 |
+
|
| 192 |
+
```python
|
| 193 |
+
@spaces.GPU(duration=60)
|
| 194 |
+
def gpu_smoke_test():
|
| 195 |
+
...
|
| 196 |
+
|
| 197 |
+
@spaces.GPU(duration=900)
|
| 198 |
+
def comfy_runtime_test():
|
| 199 |
+
...
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
### SSH Does Not Expose ZeroGPU CUDA
|
| 203 |
+
|
| 204 |
+
Starting ComfyUI normally through SSH failed with:
|
| 205 |
+
|
| 206 |
+
```text
|
| 207 |
+
RuntimeError: No CUDA GPUs are available
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
Conclusion:
|
| 211 |
+
|
| 212 |
+
- SSH is useful for CPU-mode diagnostics.
|
| 213 |
+
- Real GPU work must run from the Gradio process inside a `@spaces.GPU`
|
| 214 |
+
function.
|
| 215 |
+
|
| 216 |
+
CPU diagnostic command:
|
| 217 |
+
|
| 218 |
+
```bash
|
| 219 |
+
python scripts/run_comfy.py --cpu
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
### Gradio Request Timeout During Bootstrap
|
| 223 |
+
|
| 224 |
+
Long bootstrap work should not run synchronously inside a Gradio request. The
|
| 225 |
+
first attempt did this:
|
| 226 |
+
|
| 227 |
+
```text
|
| 228 |
+
Gradio click -> bootstrap_comfy.py -> clone repos -> pip install -> start ComfyUI
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
The request was interrupted by Gradio/ZeroGPU's outer queue after roughly 2.5
|
| 232 |
+
minutes and returned:
|
| 233 |
+
|
| 234 |
+
```text
|
| 235 |
+
event: error
|
| 236 |
+
data: {"error": null}
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
Fix:
|
| 240 |
+
|
| 241 |
+
- Add a non-GPU `Prepare` action that starts `scripts/bootstrap_comfy.py` as a
|
| 242 |
+
background process.
|
| 243 |
+
- Add `Prepare Status` to poll `/tmp/voicegate_bootstrap.log`.
|
| 244 |
+
- Keep GPU actions focused on starting ComfyUI and running actual CUDA work.
|
| 245 |
+
|
| 246 |
+
This avoids wasting ZeroGPU time on clone/install steps and prevents the request
|
| 247 |
+
from being killed before diagnostics can return useful logs.
|
| 248 |
+
|
| 249 |
### Runtime Pip Install Pitfall
|
| 250 |
+
|
| 251 |
+
The background bootstrap installed a large dependency set and upgraded the
|
| 252 |
+
on-disk Torch package. The already-running Gradio process continued to report:
|
| 253 |
+
|
| 254 |
+
```text
|
| 255 |
+
torch=2.11.0+cu130
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
while the ComfyUI subprocess started afterwards reported:
|
| 259 |
+
|
| 260 |
+
```text
|
| 261 |
+
pytorch_version=2.12.0+cu130
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
This is workable for diagnostics, but final production should avoid heavy
|
| 265 |
runtime `pip install` where possible. Prefer moving stable dependencies into
|
| 266 |
Space build-time requirements or explicitly controlling pins.
|
|
|
|
| 295 |
|
| 296 |
For future tests, keep diagnostic durations conservative and increase only when
|
| 297 |
the workflow has already proven it needs more time.
|
| 298 |
+
|
| 299 |
+
## Dependency Pitfalls and Fixes
|
| 300 |
+
|
| 301 |
+
`ComfyUI_AudioTools` initially failed to import.
|
| 302 |
+
|
| 303 |
+
First failure:
|
| 304 |
+
|
| 305 |
+
```text
|
| 306 |
+
SoX could not be found
|
| 307 |
+
ModuleNotFoundError: No module named 'sounddevice'
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
Second failure after adding `sounddevice`:
|
| 311 |
+
|
| 312 |
+
```text
|
| 313 |
+
OSError: PortAudio library not found
|
| 314 |
+
```
|
| 315 |
+
|
| 316 |
+
Third failure:
|
| 317 |
+
|
| 318 |
+
```text
|
| 319 |
+
ModuleNotFoundError: No module named 'easydict'
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
Fourth failure:
|
| 323 |
+
|
| 324 |
+
```text
|
| 325 |
+
ModuleNotFoundError: No module named 'pytorch_lightning'
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
Fixes added:
|
| 329 |
+
|
| 330 |
+
- `packages.txt`
|
| 331 |
+
- `sox`
|
| 332 |
+
- `libportaudio2`
|
| 333 |
+
- `portaudio19-dev`
|
| 334 |
+
- `requirements.txt`
|
| 335 |
+
- `sounddevice`
|
| 336 |
+
- `easydict`
|
| 337 |
+
- `pytorch-lightning`
|
| 338 |
+
|
| 339 |
+
Final verification:
|
| 340 |
+
|
| 341 |
+
```text
|
| 342 |
+
0.4 seconds: /home/user/app/ComfyUI/custom_nodes/ComfyUI_AudioTools
|
| 343 |
+
```
|
| 344 |
+
|
| 345 |
+
with no `IMPORT FAILED` entry.
|
| 346 |
+
|
| 347 |
+
## ComfyUI API Smoke Test
|
| 348 |
+
|
| 349 |
+
Test audio source:
|
| 350 |
+
|
| 351 |
+
```text
|
| 352 |
+
D:\voicebridge-test-audio\test_audio\2-坤哥.MP3
|
| 353 |
+
```
|
| 354 |
+
|
| 355 |
+
The first upload attempt used a plain PowerShell byte pipeline and corrupted the
|
| 356 |
+
binary file. The remote file was identified as text instead of MP3, and
|
| 357 |
+
`LoadAudio` failed with:
|
| 358 |
+
|
| 359 |
+
```text
|
| 360 |
+
Invalid data found when processing input: 'avcodec_send_packet()'
|
| 361 |
+
```
|
| 362 |
+
|
| 363 |
+
Fix:
|
| 364 |
+
|
| 365 |
+
- Upload binary test media through a binary-safe method.
|
| 366 |
+
- Verify remote `sha256sum` before using the file.
|
| 367 |
+
|
| 368 |
+
Successful upload result:
|
| 369 |
+
|
| 370 |
+
```text
|
| 371 |
+
/tmp/voicegate_test_audio.mp3: Audio file with ID3 version 2.3.0
|
| 372 |
+
```
|
| 373 |
+
|
| 374 |
+
ComfyUI API endpoints verified in Dev Mode:
|
| 375 |
+
|
| 376 |
+
- `/system_stats`
|
| 377 |
+
- `/upload/image`
|
| 378 |
+
- `/prompt`
|
| 379 |
+
- `/history/{prompt_id}`
|
| 380 |
+
|
| 381 |
+
Minimal test workflow:
|
| 382 |
+
|
| 383 |
+
```text
|
| 384 |
+
LoadAudio -> SaveAudioMP3
|
| 385 |
+
```
|
| 386 |
+
|
| 387 |
+
Successful `/history/{prompt_id}` result:
|
| 388 |
+
|
| 389 |
+
```text
|
| 390 |
+
status_str: success
|
| 391 |
+
completed: true
|
| 392 |
+
```
|
| 393 |
+
|
| 394 |
+
Output reported by ComfyUI:
|
| 395 |
+
|
| 396 |
+
```text
|
| 397 |
+
audio/api_smoke_voicegate_00001.mp3
|
| 398 |
+
```
|
| 399 |
+
|
| 400 |
+
## Segmented Workflow Smoke Tests
|
| 401 |
+
|
| 402 |
+
### ComfyUI From Gradio ZeroGPU
|
| 403 |
+
|
| 404 |
+
On 2026-06-05, `app.py` was expanded with diagnostic Gradio actions:
|
| 405 |
+
|
| 406 |
+
- `prepare_runtime`: starts `scripts/bootstrap_comfy.py` in the background and
|
| 407 |
+
writes progress to `/tmp/voicegate_bootstrap.log`.
|
| 408 |
+
- `prepare_status`: reports the background bootstrap status and log tail.
|
| 409 |
+
- `comfy_runtime_test`: runs inside `@spaces.GPU`, starts ComfyUI, and calls
|
| 410 |
+
`/system_stats`.
|
| 411 |
+
- `melband_gpu_test`: runs a tiny MelBand workflow inside `@spaces.GPU`.
|
| 412 |
+
- `voxcpm_tts_gpu_test`: runs a tiny VoxCPM2 TTS-only workflow inside
|
| 413 |
+
`@spaces.GPU`.
|
| 414 |
+
|
| 415 |
+
The first attempt ran the full bootstrap synchronously inside a Gradio request
|
| 416 |
+
and the request was interrupted by the outer queue with `event: error` and no
|
| 417 |
+
function payload after roughly 2.5 minutes. The fix was to start bootstrap as a
|
| 418 |
+
background process and poll a status endpoint.
|
| 419 |
+
|
| 420 |
+
The background prepare completed successfully. It installed a large dependency
|
| 421 |
+
set and upgraded the on-disk Torch package from `2.11.0` to `2.12.0`. The
|
| 422 |
+
already-running Gradio process still reported its originally imported
|
| 423 |
+
`torch=2.11.0+cu130`, while the newly started ComfyUI subprocess reported:
|
| 424 |
+
|
| 425 |
+
```text
|
| 426 |
+
pytorch_version=2.12.0+cu130
|
| 427 |
+
```
|
| 428 |
+
|
| 429 |
+
This is acceptable for the smoke test, but runtime pip installs are not ideal
|
| 430 |
+
for the final app. A later pass should move heavy Python dependencies into the
|
| 431 |
+
Space build/install phase or pin the root requirements more deliberately.
|
| 432 |
+
|
| 433 |
+
`comfy_runtime_test` result:
|
| 434 |
+
|
| 435 |
+
```text
|
| 436 |
+
cuda_available=True
|
| 437 |
+
comfy_ready=true
|
| 438 |
+
comfy_elapsed_sec=16.0
|
| 439 |
+
ComfyUI version=0.24.0
|
| 440 |
+
device=cuda:0 NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
|
| 441 |
+
vram_total=50868518912
|
| 442 |
+
```
|
| 443 |
+
|
| 444 |
+
Observed behavior: separate `@spaces.GPU` calls may run in separate worker
|
| 445 |
+
processes, so the ComfyUI subprocess should not be assumed to persist across
|
| 446 |
+
different button/API calls.
|
| 447 |
+
|
| 448 |
+
### ZeroGPU Gradio Invocation
|
| 449 |
+
|
| 450 |
+
On 2026-06-05, the Space was tested in normal runtime, with Dev Mode off, using
|
| 451 |
+
a Gradio button backed by:
|
| 452 |
+
|
| 453 |
+
```python
|
| 454 |
+
@spaces.GPU(duration=60)
|
| 455 |
+
def gpu_smoke_test():
|
| 456 |
+
...
|
| 457 |
+
```
|
| 458 |
+
|
| 459 |
+
The private Space API was called with the local Hugging Face token through:
|
| 460 |
+
|
| 461 |
+
```text
|
| 462 |
+
POST /gradio_api/call/gpu_smoke_test
|
| 463 |
+
GET /gradio_api/call/gpu_smoke_test/{event_id}
|
| 464 |
+
```
|
| 465 |
+
|
| 466 |
+
Result:
|
| 467 |
+
|
| 468 |
+
```text
|
| 469 |
+
torch=2.11.0+cu130
|
| 470 |
+
cuda_available=True
|
| 471 |
+
cuda_device_count=1
|
| 472 |
+
device_name=NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
|
| 473 |
+
total_memory_gb=47.38
|
| 474 |
+
tensor_result=240.0
|
| 475 |
+
memory_reserved_mb=2.00
|
| 476 |
+
```
|
| 477 |
+
|
| 478 |
+
This confirms ZeroGPU CUDA is available from the normal Gradio runtime when the
|
| 479 |
+
work is executed inside a `@spaces.GPU` function. SSH still should be treated as
|
| 480 |
+
CPU-only diagnostic access.
|
| 481 |
+
|
| 482 |
+
### DeepSeek LLM Node
|
| 483 |
+
|
| 484 |
+
On 2026-06-05, `RH_LLMAPI_NODE` was tested through ComfyUI in Dev Mode using
|
| 485 |
+
the Space `DEEPSEEK_API_KEY` secret. The key was not printed.
|
| 486 |
+
|
| 487 |
+
Minimal workflow:
|
| 488 |
+
|
| 489 |
+
```text
|
| 490 |
+
RH_LLMAPI_NODE -> easy showAnything
|
| 491 |
+
```
|
| 492 |
+
|
| 493 |
+
Prompt:
|
| 494 |
+
|
| 495 |
+
```text
|
| 496 |
+
Translate to Simplified Chinese: VoiceGate smoke test.
|
| 497 |
+
```
|
| 498 |
+
|
| 499 |
+
Result:
|
| 500 |
+
|
| 501 |
+
```text
|
| 502 |
+
status_str: success
|
| 503 |
+
output: VoiceGate 冒烟测试。
|
| 504 |
+
```
|
| 505 |
+
|
| 506 |
+
This confirms the RunningHub LLM node can read the Space secret and call the
|
| 507 |
+
DeepSeek-compatible API endpoint.
|
| 508 |
+
|
| 509 |
+
### MelBand RoFormer
|
| 510 |
+
|
| 511 |
+
On 2026-06-05, `MelBandRoFormerModelLoader` and `MelBandRoFormerSampler` were
|
| 512 |
+
tested through ComfyUI in CPU mode.
|
| 513 |
+
|
| 514 |
+
Input:
|
| 515 |
+
|
| 516 |
+
```text
|
| 517 |
+
1 second synthetic 440 Hz WAV generated with ffmpeg
|
| 518 |
+
```
|
| 519 |
+
|
| 520 |
+
Minimal workflow:
|
| 521 |
+
|
| 522 |
+
```text
|
| 523 |
+
LoadAudio -> MelBandRoFormerModelLoader -> MelBandRoFormerSampler
|
| 524 |
+
-> SaveAudioMP3(vocals)
|
| 525 |
+
-> SaveAudioMP3(instruments)
|
| 526 |
+
```
|
| 527 |
+
|
| 528 |
+
Result:
|
| 529 |
+
|
| 530 |
+
```text
|
| 531 |
+
status_str: success
|
| 532 |
+
audio/melband_smoke_vocals_00001.mp3
|
| 533 |
+
audio/melband_smoke_instruments_00001.mp3
|
| 534 |
+
```
|
| 535 |
+
|
| 536 |
+
CPU-mode runtime for the 1-second smoke input was about 51 seconds. Real runs
|
| 537 |
+
should execute inside a `@spaces.GPU` function.
|
| 538 |
+
|
| 539 |
+
Later on 2026-06-05, the same kind of tiny MelBand smoke test was run from the
|
| 540 |
+
normal Gradio runtime inside `@spaces.GPU`.
|
| 541 |
+
|
| 542 |
+
Input:
|
| 543 |
+
|
| 544 |
+
```text
|
| 545 |
+
1 second synthetic 440 Hz WAV written to ComfyUI/input
|
| 546 |
+
```
|
| 547 |
+
|
| 548 |
+
Result:
|
| 549 |
+
|
| 550 |
+
```text
|
| 551 |
+
status_str=success
|
| 552 |
+
completed=True
|
| 553 |
+
audio/melband_gpu_32459bea_instruments_00001.mp3
|
| 554 |
+
audio/melband_gpu_32459bea_vocals_00001.mp3
|
| 555 |
+
elapsed_sec=78.3
|
| 556 |
+
```
|
| 557 |
+
|
| 558 |
+
This confirms the MelBand custom node and model can execute from the Space
|
| 559 |
+
ZeroGPU path.
|
| 560 |
+
|
| 561 |
### VoxCPM2 TTS-only
|
| 562 |
+
|
| 563 |
+
On 2026-06-05, a minimal VoxCPM2 TTS-only workflow was run from the normal
|
| 564 |
+
Gradio runtime inside `@spaces.GPU`.
|
| 565 |
+
|
| 566 |
+
Minimal workflow:
|
| 567 |
+
|
| 568 |
+
```text
|
| 569 |
+
RunningHub_VoxCPM_LoadModel -> RunningHub_VoxCPM_Generate -> SaveAudioMP3
|
| 570 |
+
```
|
| 571 |
+
|
| 572 |
+
Prompt text:
|
| 573 |
+
|
| 574 |
+
```text
|
| 575 |
+
你好,VoiceGate GPU 语音合成测试。
|
| 576 |
+
```
|
| 577 |
+
|
| 578 |
+
Result:
|
| 579 |
+
|
| 580 |
+
```text
|
| 581 |
+
status_str=success
|
| 582 |
+
completed=True
|
| 583 |
+
audio/voxcpm_tts_gpu_cda209ec_00001.mp3
|
| 584 |
+
elapsed_sec=766.2
|
| 585 |
+
```
|
| 586 |
+
|
| 587 |
This confirms VoxCPM2 fits and executes in ZeroGPU, but the first cold TTS-only
|
| 588 |
run was very slow. The final app should minimize cold starts, avoid repeated
|
| 589 |
ComfyUI/model reloads where possible, and use shorter diagnostic prompts while
|
|
|
|
| 653 |
SRT generation can run in the Space ZeroGPU path. The smoke test intentionally
|
| 654 |
used `attention=sdpa` instead of `flash_attention_2`; `flash_attention_2`
|
| 655 |
availability remains unverified.
|
| 656 |
+
|
| 657 |
+
## Secrets and API Keys
|
| 658 |
+
|
| 659 |
+
`DEEPSEEK_API_KEY` should be stored only as a Hugging Face Space Secret.
|
| 660 |
+
|
| 661 |
+
Current expected secret:
|
| 662 |
+
|
| 663 |
+
```text
|
| 664 |
+
DEEPSEEK_API_KEY
|
| 665 |
+
```
|
| 666 |
+
|
| 667 |
+
Optional variables:
|
| 668 |
+
|
| 669 |
+
```text
|
| 670 |
+
DEEPSEEK_BASE_URL=https://api.deepseek.com
|
| 671 |
+
DEEPSEEK_MODEL=deepseek-v4-flash
|
| 672 |
+
```
|
| 673 |
+
|
| 674 |
+
Never store the API key or any other secret value in:
|
| 675 |
+
|
| 676 |
+
- `app.py`
|
| 677 |
+
- workflow JSON files
|
| 678 |
+
- README files
|
| 679 |
+
- docs
|
| 680 |
+
- `.env` files committed to git
|
| 681 |
+
|
| 682 |
+
`scripts/workflow_client.py` reads these from environment variables.
|
| 683 |
+
|
| 684 |
+
`scripts/check_space_env.py` verifies whether these environment variables are
|
| 685 |
+
present without printing their values.
|
| 686 |
+
|
| 687 |
+
## Model Storage
|
| 688 |
+
|
| 689 |
+
Large model files should live on the Space persistent storage volume instead of
|
| 690 |
+
inside `/home/user/app`, because `/home/user/app` can be replaced during Space
|
| 691 |
+
rebuilds.
|
| 692 |
+
|
| 693 |
+
Default model root:
|
| 694 |
+
|
| 695 |
+
```text
|
| 696 |
+
/data/voicegate_models
|
| 697 |
+
```
|
| 698 |
+
|
| 699 |
+
`scripts/bootstrap_comfy.py` creates symlinks from ComfyUI's expected paths to
|
| 700 |
+
that persistent root:
|
| 701 |
+
|
| 702 |
+
```text
|
| 703 |
+
ComfyUI/models/voxcpm/VoxCPM2
|
| 704 |
+
-> /data/voicegate_models/voxcpm/VoxCPM2
|
| 705 |
+
|
| 706 |
ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
|
| 707 |
-> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
|
| 708 |
|
| 709 |
ComfyUI/models/Qwen3-ASR
|
| 710 |
-> /data/voicegate_models/Qwen3-ASR
|
| 711 |
+
```
|
| 712 |
+
|
| 713 |
+
Override the root with:
|
| 714 |
+
|
| 715 |
+
```text
|
| 716 |
+
VOICEGATE_MODEL_ROOT
|
| 717 |
+
```
|
| 718 |
+
|
| 719 |
+
On 2026-06-05, the explicit ComfyUI-path models listed below were downloaded to
|
| 720 |
+
persistent storage:
|
| 721 |
+
|
| 722 |
+
```text
|
| 723 |
/data/voicegate_models/voxcpm/VoxCPM2/model.safetensors
|
| 724 |
/data/voicegate_models/voxcpm/VoxCPM2/audiovae.pth
|
| 725 |
/data/voicegate_models/diffusion_models/MelBandRoFormer_comfy/MelBandRoformer_fp32.safetensors
|
| 726 |
/data/voicegate_models/Qwen3-ASR/Qwen3-ASR-1.7B
|
| 727 |
/data/voicegate_models/Qwen3-ASR/Qwen3-ForcedAligner-0.6B
|
| 728 |
+
```
|
| 729 |
+
|
| 730 |
+
Verified symlinks:
|
| 731 |
+
|
| 732 |
+
```text
|
| 733 |
+
/home/user/app/ComfyUI/models/voxcpm/VoxCPM2
|
| 734 |
+
-> /data/voicegate_models/voxcpm/VoxCPM2
|
| 735 |
+
|
| 736 |
/home/user/app/ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
|
| 737 |
-> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
|
| 738 |
|
| 739 |
/home/user/app/ComfyUI/models/Qwen3-ASR
|
| 740 |
-> /data/voicegate_models/Qwen3-ASR
|
| 741 |
+
```
|
| 742 |
+
|
| 743 |
+
`DEEPSEEK_API_KEY` was also verified as present in the Space environment without
|
| 744 |
+
printing its value.
|
| 745 |
+
|
| 746 |
+
Model download pitfall:
|
| 747 |
+
|
| 748 |
+
- `huggingface-cli download` is deprecated and failed in the Space.
|
| 749 |
+
- `hf download` also failed because of a CLI dependency compatibility issue.
|
| 750 |
+
- `scripts/bootstrap_comfy.py` now uses the `huggingface_hub` Python API
|
| 751 |
+
directly for model downloads.
|
| 752 |
+
|
| 753 |
## Current Known Good Commits
|
| 754 |
|
| 755 |
- `683b147` Add ComfyUI runtime bootstrap scripts
|
|
|
|
| 905 |
2. Polish the first Gradio user interface and validate the automatic model
|
| 906 |
preparation path after Space rebuilds/hardware changes.
|
| 907 |
3. Reduce runtime dependency installation and model reload overhead.
|
| 908 |
+
## 2026-06-22: ZeroGPU MelBand SIGBUS recovery
|
| 909 |
+
|
| 910 |
+
- Symptom: the user workflow returned
|
| 911 |
+
`WebSocketConnectionClosedException: Connection to remote host was lost`.
|
| 912 |
+
- Root cause: the ComfyUI child process terminated with `Fatal Python error:
|
| 913 |
+
Bus error` while `comfy.utils.load_safetensors` memory-mapped
|
| 914 |
+
`MelBandRoformer_fp32.safetensors` from persistent `/data` storage.
|
| 915 |
+
- The WebSocket error was secondary; it happened because the ComfyUI process
|
| 916 |
+
had already crashed.
|
| 917 |
+
- Added strict validation for the MelBand model:
|
| 918 |
+
- expected size: `912885656` bytes
|
| 919 |
+
- expected SHA-256:
|
| 920 |
+
`450caec8e8e261ff79426f17ccf16d43490ba4b790ff84d573083cf94e111258`
|
| 921 |
+
- Invalid files are removed and force-downloaded again from
|
| 922 |
+
`Kijai/MelBandRoFormer_comfy`.
|
| 923 |
+
- The bootstrap now patches the pinned MelBand custom node to load safetensors
|
| 924 |
+
from regular file bytes instead of mmap. This prevents a persistent-storage
|
| 925 |
+
mmap failure from terminating the Python interpreter with SIGBUS.
|
| 926 |
+
- The Space runtime validates the model once per container before accepting a
|
| 927 |
+
full workflow request.
|
scripts/bootstrap_comfy.py
CHANGED
|
@@ -8,6 +8,7 @@ explicitly requested.
|
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import argparse
|
|
|
|
| 11 |
import os
|
| 12 |
import shutil
|
| 13 |
import subprocess
|
|
@@ -20,6 +21,9 @@ ROOT = Path(__file__).resolve().parents[1]
|
|
| 20 |
COMFY_DIR = ROOT / "ComfyUI"
|
| 21 |
CUSTOM_NODES_DIR = COMFY_DIR / "custom_nodes"
|
| 22 |
DEFAULT_PERSISTENT_MODEL_ROOT = Path("/data/voicegate_models")
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
@dataclass(frozen=True)
|
|
@@ -184,6 +188,70 @@ def prepare_model_dirs(dry_run: bool = False) -> None:
|
|
| 184 |
ensure_model_link(name, dry_run=dry_run)
|
| 185 |
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
def download_models(dry_run: bool = False) -> None:
|
| 188 |
"""Download large model assets.
|
| 189 |
|
|
@@ -213,12 +281,22 @@ def download_models(dry_run: bool = False) -> None:
|
|
| 213 |
local_dir=model_target("voxcpm2"),
|
| 214 |
token=token,
|
| 215 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
hf_hub_download(
|
| 217 |
repo_id="Kijai/MelBandRoFormer_comfy",
|
| 218 |
-
filename=
|
| 219 |
local_dir=model_target("melband"),
|
| 220 |
token=token,
|
|
|
|
| 221 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
snapshot_download(
|
| 223 |
repo_id="Qwen/Qwen3-ASR-1.7B",
|
| 224 |
local_dir=model_target("qwen3_asr") / "Qwen3-ASR-1.7B",
|
|
@@ -262,6 +340,7 @@ def main() -> None:
|
|
| 262 |
CUSTOM_NODES_DIR.mkdir(parents=True, exist_ok=True)
|
| 263 |
for repo in CUSTOM_NODE_REPOS:
|
| 264 |
ensure_git_repo(repo, dry_run=args.dry_run)
|
|
|
|
| 265 |
|
| 266 |
if not args.skip_pip:
|
| 267 |
install_requirements(COMFYUI, dry_run=args.dry_run)
|
|
|
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import argparse
|
| 11 |
+
import hashlib
|
| 12 |
import os
|
| 13 |
import shutil
|
| 14 |
import subprocess
|
|
|
|
| 21 |
COMFY_DIR = ROOT / "ComfyUI"
|
| 22 |
CUSTOM_NODES_DIR = COMFY_DIR / "custom_nodes"
|
| 23 |
DEFAULT_PERSISTENT_MODEL_ROOT = Path("/data/voicegate_models")
|
| 24 |
+
MELBAND_FILENAME = "MelBandRoformer_fp32.safetensors"
|
| 25 |
+
MELBAND_SIZE = 912_885_656
|
| 26 |
+
MELBAND_SHA256 = "450caec8e8e261ff79426f17ccf16d43490ba4b790ff84d573083cf94e111258"
|
| 27 |
|
| 28 |
|
| 29 |
@dataclass(frozen=True)
|
|
|
|
| 188 |
ensure_model_link(name, dry_run=dry_run)
|
| 189 |
|
| 190 |
|
| 191 |
+
def file_sha256(path: Path) -> str:
|
| 192 |
+
digest = hashlib.sha256()
|
| 193 |
+
with path.open("rb") as file:
|
| 194 |
+
for chunk in iter(lambda: file.read(8 * 1024 * 1024), b""):
|
| 195 |
+
digest.update(chunk)
|
| 196 |
+
return digest.hexdigest()
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def melband_model_path() -> Path:
|
| 200 |
+
return model_target("melband") / MELBAND_FILENAME
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def validate_melband_model(*, verify_hash: bool = True) -> tuple[bool, str]:
|
| 204 |
+
path = melband_model_path()
|
| 205 |
+
if not path.is_file():
|
| 206 |
+
return False, "missing"
|
| 207 |
+
size = path.stat().st_size
|
| 208 |
+
if size != MELBAND_SIZE:
|
| 209 |
+
return False, f"size_mismatch expected={MELBAND_SIZE} actual={size}"
|
| 210 |
+
if verify_hash:
|
| 211 |
+
try:
|
| 212 |
+
digest = file_sha256(path)
|
| 213 |
+
except OSError as exc:
|
| 214 |
+
return False, f"read_error {type(exc).__name__}: {exc}"
|
| 215 |
+
if digest != MELBAND_SHA256:
|
| 216 |
+
return False, f"sha256_mismatch expected={MELBAND_SHA256} actual={digest}"
|
| 217 |
+
return True, "ok"
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def patch_melband_loader(dry_run: bool = False) -> None:
|
| 221 |
+
"""Avoid safetensors mmap on persistent Space storage.
|
| 222 |
+
|
| 223 |
+
ComfyUI's generic loader uses safetensors.safe_open(), which memory maps the
|
| 224 |
+
model file. A damaged file or an unstable mmap on /data can terminate the
|
| 225 |
+
interpreter with SIGBUS before Python can report a normal exception.
|
| 226 |
+
Loading from bytes uses regular reads and turns corruption into a catchable
|
| 227 |
+
safetensors error instead.
|
| 228 |
+
"""
|
| 229 |
+
|
| 230 |
+
nodes_path = CUSTOM_NODES_DIR / "ComfyUI-MelBandRoFormer" / "nodes.py"
|
| 231 |
+
print(f"+ patch non-mmap MelBand loader: {nodes_path}", flush=True)
|
| 232 |
+
if dry_run:
|
| 233 |
+
return
|
| 234 |
+
if not nodes_path.is_file():
|
| 235 |
+
raise RuntimeError(f"MelBand node file is missing: {nodes_path}")
|
| 236 |
+
|
| 237 |
+
text = nodes_path.read_text(encoding="utf-8")
|
| 238 |
+
if "load_safetensors_bytes" not in text:
|
| 239 |
+
text = text.replace(
|
| 240 |
+
"import torchaudio.functional as TAF\n",
|
| 241 |
+
"import torchaudio.functional as TAF\n"
|
| 242 |
+
"from safetensors.torch import load as load_safetensors_bytes\n",
|
| 243 |
+
)
|
| 244 |
+
text = text.replace(
|
| 245 |
+
"model.load_state_dict(load_torch_file(model_path), strict=True)",
|
| 246 |
+
"with open(model_path, \"rb\") as model_file:\n"
|
| 247 |
+
" state_dict = load_safetensors_bytes(model_file.read())\n"
|
| 248 |
+
" model.load_state_dict(state_dict, strict=True)",
|
| 249 |
+
)
|
| 250 |
+
if "load_safetensors_bytes" not in text or "state_dict = load_safetensors_bytes" not in text:
|
| 251 |
+
raise RuntimeError("Could not apply the non-mmap MelBand loader patch")
|
| 252 |
+
nodes_path.write_text(text, encoding="utf-8")
|
| 253 |
+
|
| 254 |
+
|
| 255 |
def download_models(dry_run: bool = False) -> None:
|
| 256 |
"""Download large model assets.
|
| 257 |
|
|
|
|
| 281 |
local_dir=model_target("voxcpm2"),
|
| 282 |
token=token,
|
| 283 |
)
|
| 284 |
+
melband_valid, melband_reason = validate_melband_model(verify_hash=True)
|
| 285 |
+
print(f"+ validate MelBand model: {melband_reason}", flush=True)
|
| 286 |
+
if not melband_valid and melband_model_path().exists():
|
| 287 |
+
print(f"+ remove invalid MelBand model: {melband_model_path()}", flush=True)
|
| 288 |
+
melband_model_path().unlink()
|
| 289 |
hf_hub_download(
|
| 290 |
repo_id="Kijai/MelBandRoFormer_comfy",
|
| 291 |
+
filename=MELBAND_FILENAME,
|
| 292 |
local_dir=model_target("melband"),
|
| 293 |
token=token,
|
| 294 |
+
force_download=not melband_valid,
|
| 295 |
)
|
| 296 |
+
melband_valid, melband_reason = validate_melband_model(verify_hash=True)
|
| 297 |
+
print(f"+ verify downloaded MelBand model: {melband_reason}", flush=True)
|
| 298 |
+
if not melband_valid:
|
| 299 |
+
raise RuntimeError(f"MelBand model validation failed: {melband_reason}")
|
| 300 |
snapshot_download(
|
| 301 |
repo_id="Qwen/Qwen3-ASR-1.7B",
|
| 302 |
local_dir=model_target("qwen3_asr") / "Qwen3-ASR-1.7B",
|
|
|
|
| 340 |
CUSTOM_NODES_DIR.mkdir(parents=True, exist_ok=True)
|
| 341 |
for repo in CUSTOM_NODE_REPOS:
|
| 342 |
ensure_git_repo(repo, dry_run=args.dry_run)
|
| 343 |
+
patch_melband_loader(dry_run=args.dry_run)
|
| 344 |
|
| 345 |
if not args.skip_pip:
|
| 346 |
install_requirements(COMFYUI, dry_run=args.dry_run)
|