Harden MelBand model loading on Spaces

#1
Files changed (3) hide show
  1. app.py +15 -1
  2. docs/work-log.md +640 -620
  3. scripts/bootstrap_comfy.py +80 -1
app.py CHANGED
@@ -25,6 +25,7 @@ import spaces
25
  import torch
26
  import websocket
27
 
 
28
  from scripts.workflow_client import load_workflow, patch_voicegate_workflow
29
 
30
 
@@ -39,6 +40,7 @@ COMFY_PORT = "8188"
39
  COMFY_PROCESS: subprocess.Popen | None = None
40
  PREPARE_PROCESS: subprocess.Popen | None = None
41
  BOOTSTRAPPED = False
 
42
  BOOTSTRAP_LOG = Path("/tmp/voicegate_bootstrap.log")
43
  USER_OUTPUT_DIR = ROOT / "user_outputs"
44
  REQUIRED_MODEL_PATHS = [
@@ -487,7 +489,18 @@ def run_bootstrap(lines: list[str], *, allow_heavy: bool = True) -> None:
487
 
488
 
489
  def missing_required_models() -> list[Path]:
490
- return [path for path in REQUIRED_MODEL_PATHS if not path.exists()]
 
 
 
 
 
 
 
 
 
 
 
491
 
492
 
493
  def ensure_runtime_assets(lines: list[str]) -> None:
@@ -533,6 +546,7 @@ def ensure_comfy(lines: list[str], *, timeout: float = 240) -> dict[str, Any]:
533
  raise RuntimeError(f"Runtime preparation failed with return code {returncode}.")
534
 
535
  run_bootstrap(lines, allow_heavy=False)
 
536
 
537
  try:
538
  stats = wait_for_comfy(timeout=5)
 
25
  import torch
26
  import websocket
27
 
28
+ from scripts.bootstrap_comfy import patch_melband_loader, validate_melband_model
29
  from scripts.workflow_client import load_workflow, patch_voicegate_workflow
30
 
31
 
 
40
  COMFY_PROCESS: subprocess.Popen | None = None
41
  PREPARE_PROCESS: subprocess.Popen | None = None
42
  BOOTSTRAPPED = False
43
+ MODELS_VALIDATED = False
44
  BOOTSTRAP_LOG = Path("/tmp/voicegate_bootstrap.log")
45
  USER_OUTPUT_DIR = ROOT / "user_outputs"
46
  REQUIRED_MODEL_PATHS = [
 
489
 
490
 
491
  def missing_required_models() -> list[Path]:
492
+ global MODELS_VALIDATED
493
+
494
+ missing = [path for path in REQUIRED_MODEL_PATHS if not path.exists()]
495
+ if missing:
496
+ MODELS_VALIDATED = False
497
+ return missing
498
+ if not MODELS_VALIDATED:
499
+ melband_valid, _reason = validate_melband_model(verify_hash=True)
500
+ if not melband_valid:
501
+ return [REQUIRED_MODEL_PATHS[0]]
502
+ MODELS_VALIDATED = True
503
+ return []
504
 
505
 
506
  def ensure_runtime_assets(lines: list[str]) -> None:
 
546
  raise RuntimeError(f"Runtime preparation failed with return code {returncode}.")
547
 
548
  run_bootstrap(lines, allow_heavy=False)
549
+ patch_melband_loader()
550
 
551
  try:
552
  stats = wait_for_comfy(timeout=5)
docs/work-log.md CHANGED
@@ -1,38 +1,38 @@
1
- # VoiceGate HF Space Work Log
2
-
3
- This document records the effective work completed while preparing the
4
- `build-small-hackathon/VoiceGate` Hugging Face Space, plus the pitfalls found
5
- and how they were resolved.
6
-
7
- ## Current Snapshot
8
-
9
- - Space: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
10
- - Space git remote: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
11
- - Runtime hardware: ZeroGPU / `zero-a10g`
12
- - Space SDK: Gradio
13
- - Local Space wrapper repo: `VoiceGate-hf`
14
- - Local upstream reference checkout: `VoiceGate/`
15
  - Latest confirmed normal runtime commit: `316b35db739d74d05543d6c8c9dd9c16e0580b17`
16
- - Current expected Space secret: `DEEPSEEK_API_KEY`
17
- - Default persistent model root: `/data/voicegate_models`
18
-
19
- Do not commit API keys, model weights, uploaded media, generated outputs, or the
20
- local `VoiceGate/` upstream checkout.
21
-
22
- ## Executive Summary
23
-
24
- The Space is no longer just a blank scaffold. It can now run Gradio, invoke
25
- ZeroGPU, prepare a ComfyUI runtime, start ComfyUI from a GPU-backed Gradio
26
- function, and submit several segmented ComfyUI workflows.
27
-
28
- Confirmed working:
29
-
30
- - Hugging Face Space git push and normal rebuild flow.
31
- - Dev Mode SSH for CPU/container diagnostics.
32
- - ZeroGPU invocation from Gradio through `@spaces.GPU`.
33
- - ComfyUI startup from inside a `@spaces.GPU` function.
34
- - ComfyUI API calls from the Gradio process.
35
- - DeepSeek-compatible LLM node with the Space secret.
36
  - MelBand RoFormer smoke tests in CPU mode and ZeroGPU mode.
37
  - VoxCPM2 TTS-only smoke test in ZeroGPU mode.
38
  - VoiceBridge ASR-only smoke test in ZeroGPU mode.
@@ -44,223 +44,223 @@ Not yet confirmed at the start of 2026-06-06:
44
  - SRT split -> VoxCPM -> SRT merge.
45
  - Full short-audio VoiceGate workflow.
46
  - Final user-facing Gradio upload/download UI.
47
-
48
- ## Repository Setup Completed
49
-
50
- - Created and pushed the Space wrapper repository.
51
- - Kept `VoiceGate/` as a local-only upstream reference and ignored it in git.
52
- - Preserved Hugging Face LFS rules.
53
- - Copied deployment workflows:
54
- - `workflows/voicegate_api.json`
55
- - `workflows/voicegate_ui.json`
56
- - Confirmed the API workflow JSON is valid.
57
- - Confirmed workflow files contain no committed API key.
58
-
59
- ## Dependency Inventory Completed
60
-
61
- Required workflow node providers were identified and pinned:
62
-
63
- - ComfyUI core:
64
- `comfyanonymous/ComfyUI`
65
- - VoiceBridge:
66
- `YanTianlong-01/comfyui_voicebridge`
67
- - RunningHub VoxCPM:
68
- `RH-RunningHub/ComfyUI_RH_VoxCPM`
69
- - MelBand RoFormer:
70
- `kijai/ComfyUI-MelBandRoFormer`
71
- - RunningHub LLM API:
72
- `HM-RunningHub/ComfyUI_RH_LLM_API`
73
- - rgthree:
74
- `rgthree/rgthree-comfy`
75
- - Easy Use:
76
- `yolain/ComfyUI-Easy-Use`
77
- - Comfyroll:
78
- `Suzie1/ComfyUI_Comfyroll_CustomNodes`
79
- - MW AudioTools:
80
- `billwuhao/ComfyUI_AudioTools`
81
-
82
- Important node source confirmations:
83
-
84
- - `ReplaceText` is provided by ComfyUI core extra nodes.
85
- - `MergeAudioMW` is provided by `ComfyUI_AudioTools`.
86
- - `RH_LLMAPI_NODE` is provided by `ComfyUI_RH_LLM_API`.
87
-
88
- ## Runtime Bootstrap Added
89
-
90
- The following scripts were added:
91
-
92
- - `scripts/bootstrap_comfy.py`
93
- - Clones ComfyUI.
94
- - Checks out pinned commits.
95
- - Clones required custom node repositories.
96
- - Installs ComfyUI and custom node Python requirements.
97
- - Prepares expected model directories.
98
- - Optionally downloads large model assets with `--with-models`.
99
- - `scripts/run_comfy.py`
100
- - Starts ComfyUI.
101
- - Waits for `/system_stats`.
102
- - Supports `--cpu` for SSH diagnostics.
103
- - `scripts/workflow_client.py`
104
- - Loads `workflows/voicegate_api.json`.
105
- - Uploads audio through the ComfyUI API.
106
- - Patches workflow inputs.
107
- - Submits `/prompt`.
108
- - Waits for `/history/{prompt_id}`.
109
-
110
- Workflow patching currently covers:
111
-
112
- - Node `16`: uploaded audio filename.
113
- - Node `105`: `DEEPSEEK_API_KEY`.
114
- - Node `105`: API base URL.
115
- - Node `105`: LLM model name.
116
- - Node `110`: target language.
117
- - Node `180`: job-specific audio output prefix.
118
- - Node `214`: job-specific SRT output prefix.
119
-
120
- ## Hugging Face Space Runtime Findings
121
-
122
- ### Dev Mode and SSH
123
-
124
- SSH target:
125
-
126
- ```text
127
- build-small-hackathon-voicegate@ssh.hf.space
128
- ```
129
-
130
- Local private key:
131
-
132
- ```text
133
- C:\Users\yantianlong\.ssh\codex_space_voicegate
134
- ```
135
-
136
- SSH is only available while the Space is in Dev Mode. Normal running Spaces do
137
- not accept SSH and return:
138
-
139
- ```text
140
- Bad request: SSH in only allowed in Dev mode
141
- ```
142
-
143
- Dev Mode can be toggled through the Hugging Face API endpoint:
144
-
145
- ```text
146
- POST /api/spaces/build-small-hackathon/VoiceGate/dev-mode
147
- ```
148
-
149
- Use Dev Mode for diagnostics only. Persistent fixes must be committed locally
150
- and pushed.
151
-
152
- ### Dev Mode Stale Commit Pitfall
153
-
154
- The running container initially stayed on the original template commit:
155
-
156
- ```text
157
- a94117f35a42cb17f654ae70cbe619c15345d057
158
- ```
159
-
160
- even after newer commits were pushed. `restart_space` alone did not move it to
161
- the latest repository state while Dev Mode was enabled.
162
-
163
- Fix:
164
-
165
- - Disable Dev Mode.
166
- - Use `factory_reboot=True` or push a new commit to trigger a normal rebuild.
167
- - Confirm runtime metadata reports the latest commit.
168
-
169
- ### ZeroGPU Startup Requirement
170
-
171
- When Dev Mode was disabled, the Space entered `RUNTIME_ERROR` with:
172
-
173
- ```text
174
- No @spaces.GPU function detected during startup
175
- ```
176
-
177
- Fix:
178
-
179
- - Import `spaces`.
180
- - Add at least one `@spaces.GPU(duration=...)` function in `app.py`.
181
-
182
- Current placeholder fix:
183
-
184
- ```python
185
- @spaces.GPU(duration=30)
186
- def placeholder():
187
- ...
188
- ```
189
-
190
- Later this placeholder was replaced by real diagnostic functions:
191
-
192
- ```python
193
- @spaces.GPU(duration=60)
194
- def gpu_smoke_test():
195
- ...
196
-
197
- @spaces.GPU(duration=900)
198
- def comfy_runtime_test():
199
- ...
200
- ```
201
-
202
- ### SSH Does Not Expose ZeroGPU CUDA
203
-
204
- Starting ComfyUI normally through SSH failed with:
205
-
206
- ```text
207
- RuntimeError: No CUDA GPUs are available
208
- ```
209
-
210
- Conclusion:
211
-
212
- - SSH is useful for CPU-mode diagnostics.
213
- - Real GPU work must run from the Gradio process inside a `@spaces.GPU`
214
- function.
215
-
216
- CPU diagnostic command:
217
-
218
- ```bash
219
- python scripts/run_comfy.py --cpu
220
- ```
221
-
222
- ### Gradio Request Timeout During Bootstrap
223
-
224
- Long bootstrap work should not run synchronously inside a Gradio request. The
225
- first attempt did this:
226
-
227
- ```text
228
- Gradio click -> bootstrap_comfy.py -> clone repos -> pip install -> start ComfyUI
229
- ```
230
-
231
- The request was interrupted by Gradio/ZeroGPU's outer queue after roughly 2.5
232
- minutes and returned:
233
-
234
- ```text
235
- event: error
236
- data: {"error": null}
237
- ```
238
-
239
- Fix:
240
-
241
- - Add a non-GPU `Prepare` action that starts `scripts/bootstrap_comfy.py` as a
242
- background process.
243
- - Add `Prepare Status` to poll `/tmp/voicegate_bootstrap.log`.
244
- - Keep GPU actions focused on starting ComfyUI and running actual CUDA work.
245
-
246
- This avoids wasting ZeroGPU time on clone/install steps and prevents the request
247
- from being killed before diagnostics can return useful logs.
248
-
249
  ### Runtime Pip Install Pitfall
250
-
251
- The background bootstrap installed a large dependency set and upgraded the
252
- on-disk Torch package. The already-running Gradio process continued to report:
253
-
254
- ```text
255
- torch=2.11.0+cu130
256
- ```
257
-
258
- while the ComfyUI subprocess started afterwards reported:
259
-
260
- ```text
261
- pytorch_version=2.12.0+cu130
262
- ```
263
-
264
  This is workable for diagnostics, but final production should avoid heavy
265
  runtime `pip install` where possible. Prefer moving stable dependencies into
266
  Space build-time requirements or explicitly controlling pins.
@@ -295,295 +295,295 @@ The working diagnostic used:
295
 
296
  For future tests, keep diagnostic durations conservative and increase only when
297
  the workflow has already proven it needs more time.
298
-
299
- ## Dependency Pitfalls and Fixes
300
-
301
- `ComfyUI_AudioTools` initially failed to import.
302
-
303
- First failure:
304
-
305
- ```text
306
- SoX could not be found
307
- ModuleNotFoundError: No module named 'sounddevice'
308
- ```
309
-
310
- Second failure after adding `sounddevice`:
311
-
312
- ```text
313
- OSError: PortAudio library not found
314
- ```
315
-
316
- Third failure:
317
-
318
- ```text
319
- ModuleNotFoundError: No module named 'easydict'
320
- ```
321
-
322
- Fourth failure:
323
-
324
- ```text
325
- ModuleNotFoundError: No module named 'pytorch_lightning'
326
- ```
327
-
328
- Fixes added:
329
-
330
- - `packages.txt`
331
- - `sox`
332
- - `libportaudio2`
333
- - `portaudio19-dev`
334
- - `requirements.txt`
335
- - `sounddevice`
336
- - `easydict`
337
- - `pytorch-lightning`
338
-
339
- Final verification:
340
-
341
- ```text
342
- 0.4 seconds: /home/user/app/ComfyUI/custom_nodes/ComfyUI_AudioTools
343
- ```
344
-
345
- with no `IMPORT FAILED` entry.
346
-
347
- ## ComfyUI API Smoke Test
348
-
349
- Test audio source:
350
-
351
- ```text
352
- D:\voicebridge-test-audio\test_audio\2-坤哥.MP3
353
- ```
354
-
355
- The first upload attempt used a plain PowerShell byte pipeline and corrupted the
356
- binary file. The remote file was identified as text instead of MP3, and
357
- `LoadAudio` failed with:
358
-
359
- ```text
360
- Invalid data found when processing input: 'avcodec_send_packet()'
361
- ```
362
-
363
- Fix:
364
-
365
- - Upload binary test media through a binary-safe method.
366
- - Verify remote `sha256sum` before using the file.
367
-
368
- Successful upload result:
369
-
370
- ```text
371
- /tmp/voicegate_test_audio.mp3: Audio file with ID3 version 2.3.0
372
- ```
373
-
374
- ComfyUI API endpoints verified in Dev Mode:
375
-
376
- - `/system_stats`
377
- - `/upload/image`
378
- - `/prompt`
379
- - `/history/{prompt_id}`
380
-
381
- Minimal test workflow:
382
-
383
- ```text
384
- LoadAudio -> SaveAudioMP3
385
- ```
386
-
387
- Successful `/history/{prompt_id}` result:
388
-
389
- ```text
390
- status_str: success
391
- completed: true
392
- ```
393
-
394
- Output reported by ComfyUI:
395
-
396
- ```text
397
- audio/api_smoke_voicegate_00001.mp3
398
- ```
399
-
400
- ## Segmented Workflow Smoke Tests
401
-
402
- ### ComfyUI From Gradio ZeroGPU
403
-
404
- On 2026-06-05, `app.py` was expanded with diagnostic Gradio actions:
405
-
406
- - `prepare_runtime`: starts `scripts/bootstrap_comfy.py` in the background and
407
- writes progress to `/tmp/voicegate_bootstrap.log`.
408
- - `prepare_status`: reports the background bootstrap status and log tail.
409
- - `comfy_runtime_test`: runs inside `@spaces.GPU`, starts ComfyUI, and calls
410
- `/system_stats`.
411
- - `melband_gpu_test`: runs a tiny MelBand workflow inside `@spaces.GPU`.
412
- - `voxcpm_tts_gpu_test`: runs a tiny VoxCPM2 TTS-only workflow inside
413
- `@spaces.GPU`.
414
-
415
- The first attempt ran the full bootstrap synchronously inside a Gradio request
416
- and the request was interrupted by the outer queue with `event: error` and no
417
- function payload after roughly 2.5 minutes. The fix was to start bootstrap as a
418
- background process and poll a status endpoint.
419
-
420
- The background prepare completed successfully. It installed a large dependency
421
- set and upgraded the on-disk Torch package from `2.11.0` to `2.12.0`. The
422
- already-running Gradio process still reported its originally imported
423
- `torch=2.11.0+cu130`, while the newly started ComfyUI subprocess reported:
424
-
425
- ```text
426
- pytorch_version=2.12.0+cu130
427
- ```
428
-
429
- This is acceptable for the smoke test, but runtime pip installs are not ideal
430
- for the final app. A later pass should move heavy Python dependencies into the
431
- Space build/install phase or pin the root requirements more deliberately.
432
-
433
- `comfy_runtime_test` result:
434
-
435
- ```text
436
- cuda_available=True
437
- comfy_ready=true
438
- comfy_elapsed_sec=16.0
439
- ComfyUI version=0.24.0
440
- device=cuda:0 NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
441
- vram_total=50868518912
442
- ```
443
-
444
- Observed behavior: separate `@spaces.GPU` calls may run in separate worker
445
- processes, so the ComfyUI subprocess should not be assumed to persist across
446
- different button/API calls.
447
-
448
- ### ZeroGPU Gradio Invocation
449
-
450
- On 2026-06-05, the Space was tested in normal runtime, with Dev Mode off, using
451
- a Gradio button backed by:
452
-
453
- ```python
454
- @spaces.GPU(duration=60)
455
- def gpu_smoke_test():
456
- ...
457
- ```
458
-
459
- The private Space API was called with the local Hugging Face token through:
460
-
461
- ```text
462
- POST /gradio_api/call/gpu_smoke_test
463
- GET /gradio_api/call/gpu_smoke_test/{event_id}
464
- ```
465
-
466
- Result:
467
-
468
- ```text
469
- torch=2.11.0+cu130
470
- cuda_available=True
471
- cuda_device_count=1
472
- device_name=NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
473
- total_memory_gb=47.38
474
- tensor_result=240.0
475
- memory_reserved_mb=2.00
476
- ```
477
-
478
- This confirms ZeroGPU CUDA is available from the normal Gradio runtime when the
479
- work is executed inside a `@spaces.GPU` function. SSH still should be treated as
480
- CPU-only diagnostic access.
481
-
482
- ### DeepSeek LLM Node
483
-
484
- On 2026-06-05, `RH_LLMAPI_NODE` was tested through ComfyUI in Dev Mode using
485
- the Space `DEEPSEEK_API_KEY` secret. The key was not printed.
486
-
487
- Minimal workflow:
488
-
489
- ```text
490
- RH_LLMAPI_NODE -> easy showAnything
491
- ```
492
-
493
- Prompt:
494
-
495
- ```text
496
- Translate to Simplified Chinese: VoiceGate smoke test.
497
- ```
498
-
499
- Result:
500
-
501
- ```text
502
- status_str: success
503
- output: VoiceGate 冒烟测试。
504
- ```
505
-
506
- This confirms the RunningHub LLM node can read the Space secret and call the
507
- DeepSeek-compatible API endpoint.
508
-
509
- ### MelBand RoFormer
510
-
511
- On 2026-06-05, `MelBandRoFormerModelLoader` and `MelBandRoFormerSampler` were
512
- tested through ComfyUI in CPU mode.
513
-
514
- Input:
515
-
516
- ```text
517
- 1 second synthetic 440 Hz WAV generated with ffmpeg
518
- ```
519
-
520
- Minimal workflow:
521
-
522
- ```text
523
- LoadAudio -> MelBandRoFormerModelLoader -> MelBandRoFormerSampler
524
- -> SaveAudioMP3(vocals)
525
- -> SaveAudioMP3(instruments)
526
- ```
527
-
528
- Result:
529
-
530
- ```text
531
- status_str: success
532
- audio/melband_smoke_vocals_00001.mp3
533
- audio/melband_smoke_instruments_00001.mp3
534
- ```
535
-
536
- CPU-mode runtime for the 1 second smoke input was about 51 seconds. Real runs
537
- should execute inside a `@spaces.GPU` function.
538
-
539
- Later on 2026-06-05, the same kind of tiny MelBand smoke test was run from the
540
- normal Gradio runtime inside `@spaces.GPU`.
541
-
542
- Input:
543
-
544
- ```text
545
- 1 second synthetic 440 Hz WAV written to ComfyUI/input
546
- ```
547
-
548
- Result:
549
-
550
- ```text
551
- status_str=success
552
- completed=True
553
- audio/melband_gpu_32459bea_instruments_00001.mp3
554
- audio/melband_gpu_32459bea_vocals_00001.mp3
555
- elapsed_sec=78.3
556
- ```
557
-
558
- This confirms the MelBand custom node and model can execute from the Space
559
- ZeroGPU path.
560
-
561
  ### VoxCPM2 TTS-only
562
-
563
- On 2026-06-05, a minimal VoxCPM2 TTS-only workflow was run from the normal
564
- Gradio runtime inside `@spaces.GPU`.
565
-
566
- Minimal workflow:
567
-
568
- ```text
569
- RunningHub_VoxCPM_LoadModel -> RunningHub_VoxCPM_Generate -> SaveAudioMP3
570
- ```
571
-
572
- Prompt text:
573
-
574
- ```text
575
- 你好,VoiceGate GPU 语音合成测试。
576
- ```
577
-
578
- Result:
579
-
580
- ```text
581
- status_str=success
582
- completed=True
583
- audio/voxcpm_tts_gpu_cda209ec_00001.mp3
584
- elapsed_sec=766.2
585
- ```
586
-
587
  This confirms VoxCPM2 fits and executes in ZeroGPU, but the first cold TTS-only
588
  run was very slow. The final app should minimize cold starts, avoid repeated
589
  ComfyUI/model reloads where possible, and use shorter diagnostic prompts while
@@ -653,103 +653,103 @@ This confirms the Qwen3-ASR model, forced aligner, VoiceBridge ASR nodes, and
653
  SRT generation can run in the Space ZeroGPU path. The smoke test intentionally
654
  used `attention=sdpa` instead of `flash_attention_2`; `flash_attention_2`
655
  availability remains unverified.
656
-
657
- ## Secrets and API Keys
658
-
659
- `DEEPSEEK_API_KEY` should be stored only as a Hugging Face Space Secret.
660
-
661
- Current expected secret:
662
-
663
- ```text
664
- DEEPSEEK_API_KEY
665
- ```
666
-
667
- Optional variables:
668
-
669
- ```text
670
- DEEPSEEK_BASE_URL=https://api.deepseek.com
671
- DEEPSEEK_MODEL=deepseek-v4-flash
672
- ```
673
-
674
- Never store these values in:
675
-
676
- - `app.py`
677
- - workflow JSON files
678
- - README files
679
- - docs
680
- - `.env` files committed to git
681
-
682
- `scripts/workflow_client.py` reads these from environment variables.
683
-
684
- `scripts/check_space_env.py` verifies whether these environment variables are
685
- present without printing their values.
686
-
687
- ## Model Storage
688
-
689
- Large model files should live on the Space persistent storage volume instead of
690
- inside `/home/user/app`, because `/home/user/app` can be replaced during Space
691
- rebuilds.
692
-
693
- Default model root:
694
-
695
- ```text
696
- /data/voicegate_models
697
- ```
698
-
699
- `scripts/bootstrap_comfy.py` creates symlinks from ComfyUI's expected paths to
700
- that persistent root:
701
-
702
- ```text
703
- ComfyUI/models/voxcpm/VoxCPM2
704
- -> /data/voicegate_models/voxcpm/VoxCPM2
705
-
706
  ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
707
  -> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
708
 
709
  ComfyUI/models/Qwen3-ASR
710
  -> /data/voicegate_models/Qwen3-ASR
711
- ```
712
-
713
- Override the root with:
714
-
715
- ```text
716
- VOICEGATE_MODEL_ROOT
717
- ```
718
-
719
- On 2026-06-05, the first two explicit ComfyUI-path models were downloaded to
720
- persistent storage:
721
-
722
- ```text
723
  /data/voicegate_models/voxcpm/VoxCPM2/model.safetensors
724
  /data/voicegate_models/voxcpm/VoxCPM2/audiovae.pth
725
  /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy/MelBandRoformer_fp32.safetensors
726
  /data/voicegate_models/Qwen3-ASR/Qwen3-ASR-1.7B
727
  /data/voicegate_models/Qwen3-ASR/Qwen3-ForcedAligner-0.6B
728
- ```
729
-
730
- Verified symlinks:
731
-
732
- ```text
733
- /home/user/app/ComfyUI/models/voxcpm/VoxCPM2
734
- -> /data/voicegate_models/voxcpm/VoxCPM2
735
-
736
  /home/user/app/ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
737
  -> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
738
 
739
  /home/user/app/ComfyUI/models/Qwen3-ASR
740
  -> /data/voicegate_models/Qwen3-ASR
741
- ```
742
-
743
- `DEEPSEEK_API_KEY` was also verified as present in the Space environment without
744
- printing its value.
745
-
746
- Model download pitfall:
747
-
748
- - `huggingface-cli download` is deprecated and failed in the Space.
749
- - `hf download` also failed because of a CLI dependency compatibility issue.
750
- - `scripts/bootstrap_comfy.py` now uses the `huggingface_hub` Python API
751
- directly for model downloads.
752
-
753
  ## Current Known Good Commits
754
 
755
  - `683b147` Add ComfyUI runtime bootstrap scripts
@@ -905,3 +905,23 @@ Next recommended steps:
905
  2. Polish the first Gradio user interface and validate the automatic model
906
  preparation path after Space rebuilds/hardware changes.
907
  3. Reduce runtime dependency installation and model reload overhead.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VoiceGate HF Space Work Log
2
+
3
+ This document records the effective work completed while preparing the
4
+ `build-small-hackathon/VoiceGate` Hugging Face Space, plus the pitfalls found
5
+ and how they were resolved.
6
+
7
+ ## Current Snapshot
8
+
9
+ - Space: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
10
+ - Space git remote: `https://huggingface.co/spaces/build-small-hackathon/VoiceGate`
11
+ - Runtime hardware: ZeroGPU / `zero-a10g`
12
+ - Space SDK: Gradio
13
+ - Local Space wrapper repo: `VoiceGate-hf`
14
+ - Local upstream reference checkout: `VoiceGate/`
15
  - Latest confirmed normal runtime commit: `316b35db739d74d05543d6c8c9dd9c16e0580b17`
16
+ - Current expected Space secret: `DEEPSEEK_API_KEY`
17
+ - Default persistent model root: `/data/voicegate_models`
18
+
19
+ Do not commit API keys, model weights, uploaded media, generated outputs, or the
20
+ local `VoiceGate/` upstream checkout.
21
+
22
+ ## Executive Summary
23
+
24
+ The Space is no longer just a blank scaffold. It can now run Gradio, invoke
25
+ ZeroGPU, prepare a ComfyUI runtime, start ComfyUI from a GPU-backed Gradio
26
+ function, and submit several segmented ComfyUI workflows.
27
+
28
+ Confirmed working:
29
+
30
+ - Hugging Face Space git push and normal rebuild flow.
31
+ - Dev Mode SSH for CPU/container diagnostics.
32
+ - ZeroGPU invocation from Gradio through `@spaces.GPU`.
33
+ - ComfyUI startup from inside a `@spaces.GPU` function.
34
+ - ComfyUI API calls from the Gradio process.
35
+ - DeepSeek-compatible LLM node with the Space secret.
36
  - MelBand RoFormer smoke tests in CPU mode and ZeroGPU mode.
37
  - VoxCPM2 TTS-only smoke test in ZeroGPU mode.
38
  - VoiceBridge ASR-only smoke test in ZeroGPU mode.
 
44
  - SRT split -> VoxCPM -> SRT merge.
45
  - Full short-audio VoiceGate workflow.
46
  - Final user-facing Gradio upload/download UI.
47
+
48
+ ## Repository Setup Completed
49
+
50
+ - Created and pushed the Space wrapper repository.
51
+ - Kept `VoiceGate/` as a local-only upstream reference and ignored it in git.
52
+ - Preserved Hugging Face LFS rules.
53
+ - Copied deployment workflows:
54
+ - `workflows/voicegate_api.json`
55
+ - `workflows/voicegate_ui.json`
56
+ - Confirmed the API workflow JSON is valid.
57
+ - Confirmed workflow files contain no committed API key.
58
+
59
+ ## Dependency Inventory Completed
60
+
61
+ Required workflow node providers were identified and pinned:
62
+
63
+ - ComfyUI core:
64
+ `comfyanonymous/ComfyUI`
65
+ - VoiceBridge:
66
+ `YanTianlong-01/comfyui_voicebridge`
67
+ - RunningHub VoxCPM:
68
+ `RH-RunningHub/ComfyUI_RH_VoxCPM`
69
+ - MelBand RoFormer:
70
+ `kijai/ComfyUI-MelBandRoFormer`
71
+ - RunningHub LLM API:
72
+ `HM-RunningHub/ComfyUI_RH_LLM_API`
73
+ - rgthree:
74
+ `rgthree/rgthree-comfy`
75
+ - Easy Use:
76
+ `yolain/ComfyUI-Easy-Use`
77
+ - Comfyroll:
78
+ `Suzie1/ComfyUI_Comfyroll_CustomNodes`
79
+ - MW AudioTools:
80
+ `billwuhao/ComfyUI_AudioTools`
81
+
82
+ Important node source confirmations:
83
+
84
+ - `ReplaceText` is provided by ComfyUI core extra nodes.
85
+ - `MergeAudioMW` is provided by `ComfyUI_AudioTools`.
86
+ - `RH_LLMAPI_NODE` is provided by `ComfyUI_RH_LLM_API`.
87
+
88
+ ## Runtime Bootstrap Added
89
+
90
+ The following scripts were added:
91
+
92
+ - `scripts/bootstrap_comfy.py`
93
+ - Clones ComfyUI.
94
+ - Checks out pinned commits.
95
+ - Clones required custom node repositories.
96
+ - Installs ComfyUI and custom node Python requirements.
97
+ - Prepares expected model directories.
98
+ - Optionally downloads large model assets with `--with-models`.
99
+ - `scripts/run_comfy.py`
100
+ - Starts ComfyUI.
101
+ - Waits for `/system_stats`.
102
+ - Supports `--cpu` for SSH diagnostics.
103
+ - `scripts/workflow_client.py`
104
+ - Loads `workflows/voicegate_api.json`.
105
+ - Uploads audio through the ComfyUI API.
106
+ - Patches workflow inputs.
107
+ - Submits `/prompt`.
108
+ - Waits for `/history/{prompt_id}`.
109
+
110
+ Workflow patching currently covers:
111
+
112
+ - Node `16`: uploaded audio filename.
113
+ - Node `105`: `DEEPSEEK_API_KEY`.
114
+ - Node `105`: API base URL.
115
+ - Node `105`: LLM model name.
116
+ - Node `110`: target language.
117
+ - Node `180`: job-specific audio output prefix.
118
+ - Node `214`: job-specific SRT output prefix.
119
+
120
+ ## Hugging Face Space Runtime Findings
121
+
122
+ ### Dev Mode and SSH
123
+
124
+ SSH target:
125
+
126
+ ```text
127
+ build-small-hackathon-voicegate@ssh.hf.space
128
+ ```
129
+
130
+ Local private key:
131
+
132
+ ```text
133
+ C:\Users\yantianlong\.ssh\codex_space_voicegate
134
+ ```
135
+
136
+ SSH is only available while the Space is in Dev Mode. Normally running Spaces do
137
+ not accept SSH and return:
138
+
139
+ ```text
140
+ Bad request: SSH in only allowed in Dev mode
141
+ ```
142
+
143
+ Dev Mode can be toggled through the Hugging Face API endpoint:
144
+
145
+ ```text
146
+ POST /api/spaces/build-small-hackathon/VoiceGate/dev-mode
147
+ ```
148
+
149
+ Use Dev Mode for diagnostics only. Persistent fixes must be committed locally
150
+ and pushed.
151
+
152
+ ### Dev Mode Stale Commit Pitfall
153
+
154
+ The running container initially stayed on the original template commit:
155
+
156
+ ```text
157
+ a94117f35a42cb17f654ae70cbe619c15345d057
158
+ ```
159
+
160
+ even after newer commits were pushed. `restart_space` alone did not move it to
161
+ the latest repository state while Dev Mode was enabled.
162
+
163
+ Fix:
164
+
165
+ - Disable Dev Mode.
166
+ - Use `factory_reboot=True` or push a new commit to trigger a normal rebuild.
167
+ - Confirm runtime metadata reports the latest commit.
168
+
169
+ ### ZeroGPU Startup Requirement
170
+
171
+ When Dev Mode was disabled, the Space entered `RUNTIME_ERROR` with:
172
+
173
+ ```text
174
+ No @spaces.GPU function detected during startup
175
+ ```
176
+
177
+ Fix:
178
+
179
+ - Import `spaces`.
180
+ - Add at least one `@spaces.GPU(duration=...)` function in `app.py`.
181
+
182
+ Initial placeholder fix:
183
+
184
+ ```python
185
+ @spaces.GPU(duration=30)
186
+ def placeholder():
187
+ ...
188
+ ```
189
+
190
+ Later this placeholder was replaced by real diagnostic functions:
191
+
192
+ ```python
193
+ @spaces.GPU(duration=60)
194
+ def gpu_smoke_test():
195
+ ...
196
+
197
+ @spaces.GPU(duration=900)
198
+ def comfy_runtime_test():
199
+ ...
200
+ ```
201
+
202
+ ### SSH Does Not Expose ZeroGPU CUDA
203
+
204
+ Starting ComfyUI normally through SSH failed with:
205
+
206
+ ```text
207
+ RuntimeError: No CUDA GPUs are available
208
+ ```
209
+
210
+ Conclusion:
211
+
212
+ - SSH is useful for CPU-mode diagnostics.
213
+ - Real GPU work must run from the Gradio process inside a `@spaces.GPU`
214
+ function.
215
+
216
+ CPU diagnostic command:
217
+
218
+ ```bash
219
+ python scripts/run_comfy.py --cpu
220
+ ```
221
+
222
+ ### Gradio Request Timeout During Bootstrap
223
+
224
+ Long bootstrap work should not run synchronously inside a Gradio request. The
225
+ first attempt did this:
226
+
227
+ ```text
228
+ Gradio click -> bootstrap_comfy.py -> clone repos -> pip install -> start ComfyUI
229
+ ```
230
+
231
+ The request was interrupted by Gradio/ZeroGPU's outer queue after roughly 2.5
232
+ minutes and returned:
233
+
234
+ ```text
235
+ event: error
236
+ data: {"error": null}
237
+ ```
238
+
239
+ Fix:
240
+
241
+ - Add a non-GPU `Prepare` action that starts `scripts/bootstrap_comfy.py` as a
242
+ background process.
243
+ - Add `Prepare Status` to poll `/tmp/voicegate_bootstrap.log`.
244
+ - Keep GPU actions focused on starting ComfyUI and running actual CUDA work.
245
+
246
+ This avoids wasting ZeroGPU time on clone/install steps and prevents the request
247
+ from being killed before diagnostics can return useful logs.
248
+
249
  ### Runtime Pip Install Pitfall
250
+
251
+ The background bootstrap installed a large dependency set and upgraded the
252
+ on-disk Torch package. The already-running Gradio process continued to report:
253
+
254
+ ```text
255
+ torch=2.11.0+cu130
256
+ ```
257
+
258
+ while the ComfyUI subprocess started afterwards reported:
259
+
260
+ ```text
261
+ pytorch_version=2.12.0+cu130
262
+ ```
263
+
264
  This is workable for diagnostics, but final production should avoid heavy
265
  runtime `pip install` where possible. Prefer moving stable dependencies into
266
  Space build-time requirements or explicitly controlling pins.
 
295
 
296
  For future tests, keep diagnostic durations conservative and increase only when
297
  the workflow has already proven it needs more time.
298
+
299
+ ## Dependency Pitfalls and Fixes
300
+
301
+ `ComfyUI_AudioTools` initially failed to import.
302
+
303
+ First failure:
304
+
305
+ ```text
306
+ SoX could not be found
307
+ ModuleNotFoundError: No module named 'sounddevice'
308
+ ```
309
+
310
+ Second failure after adding `sounddevice`:
311
+
312
+ ```text
313
+ OSError: PortAudio library not found
314
+ ```
315
+
316
+ Third failure:
317
+
318
+ ```text
319
+ ModuleNotFoundError: No module named 'easydict'
320
+ ```
321
+
322
+ Fourth failure:
323
+
324
+ ```text
325
+ ModuleNotFoundError: No module named 'pytorch_lightning'
326
+ ```
327
+
328
+ Fixes added:
329
+
330
+ - `packages.txt`
331
+ - `sox`
332
+ - `libportaudio2`
333
+ - `portaudio19-dev`
334
+ - `requirements.txt`
335
+ - `sounddevice`
336
+ - `easydict`
337
+ - `pytorch-lightning`
338
+
339
+ Final verification:
340
+
341
+ ```text
342
+ 0.4 seconds: /home/user/app/ComfyUI/custom_nodes/ComfyUI_AudioTools
343
+ ```
344
+
345
+ with no `IMPORT FAILED` entry.
346
+
347
+ ## ComfyUI API Smoke Test
348
+
349
+ Test audio source:
350
+
351
+ ```text
352
+ D:\voicebridge-test-audio\test_audio\2-坤哥.MP3
353
+ ```
354
+
355
+ The first upload attempt used a plain PowerShell byte pipeline and corrupted the
356
+ binary file. The remote file was identified as text instead of MP3, and
357
+ `LoadAudio` failed with:
358
+
359
+ ```text
360
+ Invalid data found when processing input: 'avcodec_send_packet()'
361
+ ```
362
+
363
+ Fix:
364
+
365
+ - Upload binary test media through a binary-safe method.
366
+ - Verify remote `sha256sum` before using the file.
367
+
368
+ Successful upload result:
369
+
370
+ ```text
371
+ /tmp/voicegate_test_audio.mp3: Audio file with ID3 version 2.3.0
372
+ ```
373
+
374
+ ComfyUI API endpoints verified in Dev Mode:
375
+
376
+ - `/system_stats`
377
+ - `/upload/image`
378
+ - `/prompt`
379
+ - `/history/{prompt_id}`
380
+
381
+ Minimal test workflow:
382
+
383
+ ```text
384
+ LoadAudio -> SaveAudioMP3
385
+ ```
386
+
387
+ Successful `/history/{prompt_id}` result:
388
+
389
+ ```text
390
+ status_str: success
391
+ completed: true
392
+ ```
393
+
394
+ Output reported by ComfyUI:
395
+
396
+ ```text
397
+ audio/api_smoke_voicegate_00001.mp3
398
+ ```
399
+
400
+ ## Segmented Workflow Smoke Tests
401
+
402
+ ### ComfyUI From Gradio ZeroGPU
403
+
404
+ On 2026-06-05, `app.py` was expanded with diagnostic Gradio actions:
405
+
406
+ - `prepare_runtime`: starts `scripts/bootstrap_comfy.py` in the background and
407
+ writes progress to `/tmp/voicegate_bootstrap.log`.
408
+ - `prepare_status`: reports the background bootstrap status and log tail.
409
+ - `comfy_runtime_test`: runs inside `@spaces.GPU`, starts ComfyUI, and calls
410
+ `/system_stats`.
411
+ - `melband_gpu_test`: runs a tiny MelBand workflow inside `@spaces.GPU`.
412
+ - `voxcpm_tts_gpu_test`: runs a tiny VoxCPM2 TTS-only workflow inside
413
+ `@spaces.GPU`.
414
+
415
+ The first attempt ran the full bootstrap synchronously inside a Gradio request
416
+ and the request was interrupted by the outer queue with `event: error` and no
417
+ function payload after roughly 2.5 minutes. The fix was to start bootstrap as a
418
+ background process and poll a status endpoint.
419
+
420
+ The background prepare completed successfully. It installed a large dependency
421
+ set and upgraded the on-disk Torch package from `2.11.0` to `2.12.0`. The
422
+ already-running Gradio process still reported its originally imported
423
+ `torch=2.11.0+cu130`, while the newly started ComfyUI subprocess reported:
424
+
425
+ ```text
426
+ pytorch_version=2.12.0+cu130
427
+ ```
428
+
429
+ This is acceptable for the smoke test, but runtime pip installs are not ideal
430
+ for the final app. A later pass should move heavy Python dependencies into the
431
+ Space build/install phase or pin the root requirements more deliberately.
432
+
433
+ `comfy_runtime_test` result:
434
+
435
+ ```text
436
+ cuda_available=True
437
+ comfy_ready=true
438
+ comfy_elapsed_sec=16.0
439
+ ComfyUI version=0.24.0
440
+ device=cuda:0 NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
441
+ vram_total=50868518912
442
+ ```
443
+
444
+ Observed behavior: separate `@spaces.GPU` calls may run in separate worker
445
+ processes, so the ComfyUI subprocess should not be assumed to persist across
446
+ different button/API calls.
447
+
448
+ ### ZeroGPU Gradio Invocation
449
+
450
+ On 2026-06-05, the Space was tested in normal runtime, with Dev Mode off, using
451
+ a Gradio button backed by:
452
+
453
+ ```python
454
+ @spaces.GPU(duration=60)
455
+ def gpu_smoke_test():
456
+ ...
457
+ ```
458
+
459
+ The private Space API was called with the local Hugging Face token through:
460
+
461
+ ```text
462
+ POST /gradio_api/call/gpu_smoke_test
463
+ GET /gradio_api/call/gpu_smoke_test/{event_id}
464
+ ```
465
+
466
+ Result:
467
+
468
+ ```text
469
+ torch=2.11.0+cu130
470
+ cuda_available=True
471
+ cuda_device_count=1
472
+ device_name=NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 2g.48gb
473
+ total_memory_gb=47.38
474
+ tensor_result=240.0
475
+ memory_reserved_mb=2.00
476
+ ```
477
+
478
+ This confirms ZeroGPU CUDA is available from the normal Gradio runtime when the
479
+ work is executed inside a `@spaces.GPU` function. SSH should still be treated as
480
+ CPU-only diagnostic access.
481
+
482
+ ### DeepSeek LLM Node
483
+
484
+ On 2026-06-05, `RH_LLMAPI_NODE` was tested through ComfyUI in Dev Mode using
485
+ the Space `DEEPSEEK_API_KEY` secret. The key was not printed.
486
+
487
+ Minimal workflow:
488
+
489
+ ```text
490
+ RH_LLMAPI_NODE -> easy showAnything
491
+ ```
492
+
493
+ Prompt:
494
+
495
+ ```text
496
+ Translate to Simplified Chinese: VoiceGate smoke test.
497
+ ```
498
+
499
+ Result:
500
+
501
+ ```text
502
+ status_str: success
503
+ output: VoiceGate 冒烟测试。
504
+ ```
505
+
506
+ This confirms the RunningHub LLM node can read the Space secret and call the
507
+ DeepSeek-compatible API endpoint.
508
+
509
+ ### MelBand RoFormer
510
+
511
+ On 2026-06-05, `MelBandRoFormerModelLoader` and `MelBandRoFormerSampler` were
512
+ tested through ComfyUI in CPU mode.
513
+
514
+ Input:
515
+
516
+ ```text
517
+ 1 second synthetic 440 Hz WAV generated with ffmpeg
518
+ ```
519
+
520
+ Minimal workflow:
521
+
522
+ ```text
523
+ LoadAudio -> MelBandRoFormerModelLoader -> MelBandRoFormerSampler
524
+ -> SaveAudioMP3(vocals)
525
+ -> SaveAudioMP3(instruments)
526
+ ```
527
+
528
+ Result:
529
+
530
+ ```text
531
+ status_str: success
532
+ audio/melband_smoke_vocals_00001.mp3
533
+ audio/melband_smoke_instruments_00001.mp3
534
+ ```
535
+
536
+ CPU-mode runtime for the 1 second smoke input was about 51 seconds. Real runs
537
+ should execute inside a `@spaces.GPU` function.
538
+
539
+ Later on 2026-06-05, the same kind of tiny MelBand smoke test was run from the
540
+ normal Gradio runtime inside `@spaces.GPU`.
541
+
542
+ Input:
543
+
544
+ ```text
545
+ 1 second synthetic 440 Hz WAV written to ComfyUI/input
546
+ ```
547
+
548
+ Result:
549
+
550
+ ```text
551
+ status_str=success
552
+ completed=True
553
+ audio/melband_gpu_32459bea_instruments_00001.mp3
554
+ audio/melband_gpu_32459bea_vocals_00001.mp3
555
+ elapsed_sec=78.3
556
+ ```
557
+
558
+ This confirms the MelBand custom node and model can execute from the Space
559
+ ZeroGPU path.
560
+
561
  ### VoxCPM2 TTS-only
562
+
563
+ On 2026-06-05, a minimal VoxCPM2 TTS-only workflow was run from the normal
564
+ Gradio runtime inside `@spaces.GPU`.
565
+
566
+ Minimal workflow:
567
+
568
+ ```text
569
+ RunningHub_VoxCPM_LoadModel -> RunningHub_VoxCPM_Generate -> SaveAudioMP3
570
+ ```
571
+
572
+ Prompt text:
573
+
574
+ ```text
575
+ 你好,VoiceGate GPU 语音合成测试。
576
+ ```
577
+
578
+ Result:
579
+
580
+ ```text
581
+ status_str=success
582
+ completed=True
583
+ audio/voxcpm_tts_gpu_cda209ec_00001.mp3
584
+ elapsed_sec=766.2
585
+ ```
586
+
587
  This confirms VoxCPM2 fits and executes in ZeroGPU, but the first cold TTS-only
588
  run was very slow. The final app should minimize cold starts, avoid repeated
589
  ComfyUI/model reloads where possible, and use shorter diagnostic prompts while
 
653
  SRT generation can run in the Space ZeroGPU path. The smoke test intentionally
654
  used `attention=sdpa` instead of `flash_attention_2`; `flash_attention_2`
655
  availability remains unverified.
656
+
657
+ ## Secrets and API Keys
658
+
659
+ `DEEPSEEK_API_KEY` should be stored only as a Hugging Face Space Secret.
660
+
661
+ Current expected secret:
662
+
663
+ ```text
664
+ DEEPSEEK_API_KEY
665
+ ```
666
+
667
+ Optional variables:
668
+
669
+ ```text
670
+ DEEPSEEK_BASE_URL=https://api.deepseek.com
671
+ DEEPSEEK_MODEL=deepseek-v4-flash
672
+ ```
673
+
674
+ Never store these values in:
675
+
676
+ - `app.py`
677
+ - workflow JSON files
678
+ - README files
679
+ - docs
680
+ - `.env` files committed to git
681
+
682
+ `scripts/workflow_client.py` reads these from environment variables.
683
+
684
+ `scripts/check_space_env.py` verifies whether these environment variables are
685
+ present without printing their values.
686
+
687
+ ## Model Storage
688
+
689
+ Large model files should live on the Space persistent storage volume instead of
690
+ inside `/home/user/app`, because `/home/user/app` can be replaced during Space
691
+ rebuilds.
692
+
693
+ Default model root:
694
+
695
+ ```text
696
+ /data/voicegate_models
697
+ ```
698
+
699
+ `scripts/bootstrap_comfy.py` creates symlinks from ComfyUI's expected paths to
700
+ that persistent root:
701
+
702
+ ```text
703
+ ComfyUI/models/voxcpm/VoxCPM2
704
+ -> /data/voicegate_models/voxcpm/VoxCPM2
705
+
706
  ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
707
  -> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
708
 
709
  ComfyUI/models/Qwen3-ASR
710
  -> /data/voicegate_models/Qwen3-ASR
711
+ ```
712
+
713
+ Override the root with:
714
+
715
+ ```text
716
+ VOICEGATE_MODEL_ROOT
717
+ ```
718
+
719
+ On 2026-06-05, the following explicit ComfyUI-path models were downloaded to
720
+ persistent storage:
721
+
722
+ ```text
723
  /data/voicegate_models/voxcpm/VoxCPM2/model.safetensors
724
  /data/voicegate_models/voxcpm/VoxCPM2/audiovae.pth
725
  /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy/MelBandRoformer_fp32.safetensors
726
  /data/voicegate_models/Qwen3-ASR/Qwen3-ASR-1.7B
727
  /data/voicegate_models/Qwen3-ASR/Qwen3-ForcedAligner-0.6B
728
+ ```
729
+
730
+ Verified symlinks:
731
+
732
+ ```text
733
+ /home/user/app/ComfyUI/models/voxcpm/VoxCPM2
734
+ -> /data/voicegate_models/voxcpm/VoxCPM2
735
+
736
  /home/user/app/ComfyUI/models/diffusion_models/MelBandRoFormer_comfy
737
  -> /data/voicegate_models/diffusion_models/MelBandRoFormer_comfy
738
 
739
  /home/user/app/ComfyUI/models/Qwen3-ASR
740
  -> /data/voicegate_models/Qwen3-ASR
741
+ ```
742
+
743
+ `DEEPSEEK_API_KEY` was also verified as present in the Space environment without
744
+ printing its value.
745
+
746
+ Model download pitfall:
747
+
748
+ - `huggingface-cli download` is deprecated and failed in the Space.
749
+ - `hf download` also failed because of a CLI dependency compatibility issue.
750
+ - `scripts/bootstrap_comfy.py` now uses the `huggingface_hub` Python API
751
+ directly for model downloads.
752
+
753
  ## Current Known Good Commits
754
 
755
  - `683b147` Add ComfyUI runtime bootstrap scripts
 
905
  2. Polish the first Gradio user interface and validate the automatic model
906
  preparation path after Space rebuilds/hardware changes.
907
  3. Reduce runtime dependency installation and model reload overhead.
908
+ ## 2026-06-22: ZeroGPU MelBand SIGBUS recovery
909
+
910
+ - Symptom: the user workflow returned
911
+ `WebSocketConnectionClosedException: Connection to remote host was lost`.
912
+ - Root cause: the ComfyUI child process terminated with `Fatal Python error:
913
+ Bus error` while `comfy.utils.load_safetensors` memory-mapped
914
+ `MelBandRoformer_fp32.safetensors` from persistent `/data` storage.
915
+ - The WebSocket error was secondary; it happened because the ComfyUI process
916
+ had already crashed.
917
+ - Added strict validation for the MelBand model:
918
+ - expected size: `912885656` bytes
919
+ - expected SHA-256:
920
+ `450caec8e8e261ff79426f17ccf16d43490ba4b790ff84d573083cf94e111258`
921
+ - Invalid files are removed and force-downloaded again from
922
+ `Kijai/MelBandRoFormer_comfy`.
923
+ - The bootstrap now patches the pinned MelBand custom node to load safetensors
924
+ from regular file bytes instead of mmap. This prevents a persistent-storage
925
+ mmap failure from terminating the Python interpreter with SIGBUS.
926
+ - The Space runtime validates the model once per container before accepting a
927
+ full workflow request.
scripts/bootstrap_comfy.py CHANGED
@@ -8,6 +8,7 @@ explicitly requested.
8
  from __future__ import annotations
9
 
10
  import argparse
 
11
  import os
12
  import shutil
13
  import subprocess
@@ -20,6 +21,9 @@ ROOT = Path(__file__).resolve().parents[1]
20
  COMFY_DIR = ROOT / "ComfyUI"
21
  CUSTOM_NODES_DIR = COMFY_DIR / "custom_nodes"
22
  DEFAULT_PERSISTENT_MODEL_ROOT = Path("/data/voicegate_models")
 
 
 
23
 
24
 
25
  @dataclass(frozen=True)
@@ -184,6 +188,70 @@ def prepare_model_dirs(dry_run: bool = False) -> None:
184
  ensure_model_link(name, dry_run=dry_run)
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  def download_models(dry_run: bool = False) -> None:
188
  """Download large model assets.
189
 
@@ -213,12 +281,22 @@ def download_models(dry_run: bool = False) -> None:
213
  local_dir=model_target("voxcpm2"),
214
  token=token,
215
  )
 
 
 
 
 
216
  hf_hub_download(
217
  repo_id="Kijai/MelBandRoFormer_comfy",
218
- filename="MelBandRoformer_fp32.safetensors",
219
  local_dir=model_target("melband"),
220
  token=token,
 
221
  )
 
 
 
 
222
  snapshot_download(
223
  repo_id="Qwen/Qwen3-ASR-1.7B",
224
  local_dir=model_target("qwen3_asr") / "Qwen3-ASR-1.7B",
@@ -262,6 +340,7 @@ def main() -> None:
262
  CUSTOM_NODES_DIR.mkdir(parents=True, exist_ok=True)
263
  for repo in CUSTOM_NODE_REPOS:
264
  ensure_git_repo(repo, dry_run=args.dry_run)
 
265
 
266
  if not args.skip_pip:
267
  install_requirements(COMFYUI, dry_run=args.dry_run)
 
8
  from __future__ import annotations
9
 
10
  import argparse
11
+ import hashlib
12
  import os
13
  import shutil
14
  import subprocess
 
21
  COMFY_DIR = ROOT / "ComfyUI"
22
  CUSTOM_NODES_DIR = COMFY_DIR / "custom_nodes"
23
  DEFAULT_PERSISTENT_MODEL_ROOT = Path("/data/voicegate_models")
24
+ MELBAND_FILENAME = "MelBandRoformer_fp32.safetensors"
25
+ MELBAND_SIZE = 912_885_656
26
+ MELBAND_SHA256 = "450caec8e8e261ff79426f17ccf16d43490ba4b790ff84d573083cf94e111258"
27
 
28
 
29
  @dataclass(frozen=True)
 
188
  ensure_model_link(name, dry_run=dry_run)
189
 
190
 
191
def file_sha256(path: Path, *, chunk_size: int = 8 * 1024 * 1024) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is streamed in fixed-size chunks, so a large model file is never
    held in memory in full.

    Args:
        path: File to hash; it is opened in binary mode.
        chunk_size: Number of bytes read per iteration. Defaults to 8 MiB.

    Returns:
        The hexadecimal SHA-256 digest of the file contents.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately, which would silently produce the
        # digest of an empty file; a negative size would read the whole file
        # in one call and defeat the streaming.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    digest = hashlib.sha256()
    with path.open("rb") as file:
        while chunk := file.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
197
+
198
+
199
def melband_model_path() -> Path:
    """Return the expected location of the MelBand checkpoint file.

    The path is the ``melband`` model target directory joined with
    ``MELBAND_FILENAME``; the file is not checked for existence here.
    """
    melband_dir = model_target("melband")
    return melband_dir / MELBAND_FILENAME
201
+
202
+
203
def validate_melband_model(*, verify_hash: bool = True) -> tuple[bool, str]:
    """Check the on-disk MelBand checkpoint against its expected size and hash.

    Args:
        verify_hash: When true, also compare the file's SHA-256 digest with
            ``MELBAND_SHA256``. When false, only existence and size are checked.

    Returns:
        ``(True, "ok")`` when every requested check passes; otherwise
        ``(False, reason)`` where ``reason`` is one of ``"missing"``,
        ``"size_mismatch ..."``, ``"read_error ..."`` or ``"sha256_mismatch ..."``.
    """
    path = melband_model_path()
    if not path.is_file():
        return False, "missing"

    try:
        size = path.stat().st_size
    except OSError as exc:
        # The file can disappear or the storage can fail between is_file()
        # and stat(); report that as a validation failure like a failed hash
        # read instead of letting the exception escape to the caller.
        return False, f"read_error {type(exc).__name__}: {exc}"
    if size != MELBAND_SIZE:
        return False, f"size_mismatch expected={MELBAND_SIZE} actual={size}"

    if verify_hash:
        try:
            digest = file_sha256(path)
        except OSError as exc:
            return False, f"read_error {type(exc).__name__}: {exc}"
        if digest != MELBAND_SHA256:
            return False, f"sha256_mismatch expected={MELBAND_SHA256} actual={digest}"
    return True, "ok"
218
+
219
+
220
def patch_melband_loader(dry_run: bool = False, *, nodes_path: Path | None = None) -> None:
    """Avoid safetensors mmap on persistent Space storage.

    ComfyUI's generic loader uses safetensors.safe_open(), which memory maps the
    model file. A damaged file or an unstable mmap on /data can terminate the
    interpreter with SIGBUS before Python can report a normal exception.
    Loading from bytes uses regular reads and turns corruption into a catchable
    safetensors error instead.

    The patch is idempotent: an already patched file passes verification and is
    left untouched.

    Args:
        dry_run: When true, only print the intended action and return.
        nodes_path: File to patch. Defaults to ``nodes.py`` of the
            ``ComfyUI-MelBandRoFormer`` checkout under ``CUSTOM_NODES_DIR``.

    Raises:
        RuntimeError: If the node file is missing, or if the import or the
            bytes-based load call is absent after patching. Nothing is written
            in either case.
    """
    import re  # local import keeps this block independent of the file header

    if nodes_path is None:
        nodes_path = CUSTOM_NODES_DIR / "ComfyUI-MelBandRoFormer" / "nodes.py"
    print(f"+ patch non-mmap MelBand loader: {nodes_path}", flush=True)
    if dry_run:
        return
    if not nodes_path.is_file():
        raise RuntimeError(f"MelBand node file is missing: {nodes_path}")

    import_anchor = "import torchaudio.functional as TAF\n"
    import_line = "from safetensors.torch import load as load_safetensors_bytes\n"
    # Capture the statement's own indentation so the multi-line replacement is
    # valid at whatever nesting depth the node file uses.
    load_call = re.compile(
        r"^(?P<indent>[ \t]*)"
        r"model\.load_state_dict\(load_torch_file\(model_path\), strict=True\)",
        re.MULTILINE,
    )

    def _bytes_loader(match: re.Match) -> str:
        indent = match.group("indent")
        return (
            f'{indent}with open(model_path, "rb") as model_file:\n'
            f"{indent}    state_dict = load_safetensors_bytes(model_file.read())\n"
            f"{indent}model.load_state_dict(state_dict, strict=True)"
        )

    original = nodes_path.read_text(encoding="utf-8")
    text = original
    if import_line not in text:
        text = text.replace(import_anchor, import_anchor + import_line, 1)
    text = load_call.sub(_bytes_loader, text)

    # Check the import and the call separately: the rewritten call contains the
    # name "load_safetensors_bytes" too, so a single substring test would accept
    # a file whose import was never inserted and that fails with NameError later.
    if import_line not in text or "state_dict = load_safetensors_bytes(" not in text:
        raise RuntimeError("Could not apply the non-mmap MelBand loader patch")
    if text != original:
        nodes_path.write_text(text, encoding="utf-8")
253
+
254
+
255
  def download_models(dry_run: bool = False) -> None:
256
  """Download large model assets.
257
 
 
281
  local_dir=model_target("voxcpm2"),
282
  token=token,
283
  )
284
+ melband_valid, melband_reason = validate_melband_model(verify_hash=True)
285
+ print(f"+ validate MelBand model: {melband_reason}", flush=True)
286
+ if not melband_valid and melband_model_path().exists():
287
+ print(f"+ remove invalid MelBand model: {melband_model_path()}", flush=True)
288
+ melband_model_path().unlink()
289
  hf_hub_download(
290
  repo_id="Kijai/MelBandRoFormer_comfy",
291
+ filename=MELBAND_FILENAME,
292
  local_dir=model_target("melband"),
293
  token=token,
294
+ force_download=not melband_valid,
295
  )
296
+ melband_valid, melband_reason = validate_melband_model(verify_hash=True)
297
+ print(f"+ verify downloaded MelBand model: {melband_reason}", flush=True)
298
+ if not melband_valid:
299
+ raise RuntimeError(f"MelBand model validation failed: {melband_reason}")
300
  snapshot_download(
301
  repo_id="Qwen/Qwen3-ASR-1.7B",
302
  local_dir=model_target("qwen3_asr") / "Qwen3-ASR-1.7B",
 
340
  CUSTOM_NODES_DIR.mkdir(parents=True, exist_ok=True)
341
  for repo in CUSTOM_NODE_REPOS:
342
  ensure_git_repo(repo, dry_run=args.dry_run)
343
+ patch_melband_loader(dry_run=args.dry_run)
344
 
345
  if not args.skip_pip:
346
  install_requirements(COMFYUI, dry_run=args.dry_run)