Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors Claude Sonnet 4.6 committed 6 days ago

Commit

63065b2

1 Parent(s): 3272260

Fix: pre-download MMAudio CLIP model at startup to avoid downloading it inside the GPU window

apple/DFN5B-CLIP-ViT-H-14-384 (3.95GB) was being downloaded by open_clip
inside the ZeroGPU GPU window on cold workers, consuming ~5-10s of the
allocated budget before inference started. Pre-download via snapshot_download
at startup so it reads from cache inside the GPU window, same pattern as
the existing CLAP pre-download. Reverts MMAUDIO_LOAD_OVERHEAD back to 30s.

Also adds a per-slot _regenInFlight guard to prevent queuing multiple regen
jobs from rapid re-clicks on the same slot.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +26 -24

app.py CHANGED Viewed

@@ -72,6 +72,14 @@ print("Pre-downloading CLAP model (laion/larger_clap_general)…")
 snapshot_download(repo_id="laion/larger_clap_general")
 print("CLAP model pre-downloaded.")
 # ================================================================== #
 #                     SHARED CONSTANTS / HELPERS                      #
 # ================================================================== #
@@ -355,7 +363,7 @@ TARO_SECS_PER_STEP = 0.025  # measured 0.023s/step on H200; was 0.05, tightened
 TARO_LOAD_OVERHEAD     = 15    # seconds: model load + CAVP feature extraction
 MMAUDIO_WINDOW         = 8.0   # seconds — MMAudio's fixed generation window
 MMAUDIO_SECS_PER_STEP  = 0.25  # measured 0.230s/step on H200 (8.3s video, 2 segs × 25 steps = 11.5s wall)
-MMAUDIO_LOAD_OVERHEAD  = 30    # 15s warm + up to 30s cold-start model download
 HUNYUAN_MAX_DUR        = 15.0  # seconds — HunyuanFoley max video duration
 HUNYUAN_SECS_PER_STEP  = 0.35  # measured 0.328s/step on H200 (8.3s video, 1 seg × 50 steps = 16.4s wall)
 HUNYUAN_LOAD_OVERHEAD  = 55    # ~55s to load the 10GB XXL model weights into GPU
@@ -2210,6 +2218,13 @@ _GLOBAL_JS = """
   // If targetModel matches the slot's own prefix, uses the per-slot regen_* endpoint.
   // Otherwise uses the shared xregen_* cross-model endpoint.
   function fireRegen(slot_id, seg_idx, targetModel) {
     const prefix  = slot_id.split('_')[0];   // owning tab: 'taro'|'mma'|'hf'
     const slotNum = parseInt(slot_id.split('_')[1], 10);
@@ -2401,6 +2416,7 @@ _GLOBAL_JS = """
         }
         if (msg.msg === 'process_completed') {
           es.close();
           var errMsg = msg.output && msg.output.error;
           var hadError = !!errMsg;
           console.log('[fireRegen] completed for', slot_id, 'error:', hadError, errMsg || '');
@@ -2416,35 +2432,18 @@ _GLOBAL_JS = """
               var vidElR = document.getElementById('slot_vid_' + slot_id);
               if (vidElR) { var vR = vidElR.querySelector('video'); if (vR) { vR.setAttribute('src', preRegenVideoSrc); vR.src = preRegenVideoSrc; vR.load(); } }
             }
-            // Flash the waveform iframe border red so it's obvious the segment didn't change
-            var iframeEl = document.getElementById('wf_iframe_' + slot_id);
-            if (!iframeEl) {
-              // waveform may have been restored into preRegenWaveHtml — find via slot_wave wrapper
-              var waveWrap = document.getElementById('slot_wave_' + slot_id);
-              if (waveWrap) iframeEl = waveWrap.querySelector('iframe[id^="wf_iframe_"]');
-            }
-            if (iframeEl) {
-              iframeEl.style.transition = 'box-shadow 0.15s';
-              iframeEl.style.boxShadow = '0 0 0 2px #e05252';
-              setTimeout(function() { iframeEl.style.boxShadow = 'none'; }, 3000);
-            }
-            // Pick a human-readable message based on the error text
             var isAbort   = toastMsg.toLowerCase().indexOf('aborted') !== -1;
             var isTimeout = toastMsg.toLowerCase().indexOf('timeout') !== -1;
-            var userMsg = isAbort || isTimeout
-              ? '\u26a0\ufe0f GPU cold-start — segment unchanged, try again'
-              : '\u26a0\ufe0f Regen failed — segment unchanged';
             var statusBar = document.getElementById('wf_statusbar_' + slot_id);
             if (statusBar) {
               statusBar.style.color = '#e05252';
-              statusBar.textContent = userMsg;
               setTimeout(function() { statusBar.style.color = '#888'; statusBar.textContent = 'Click a segment to regenerate \u00a0|\u00a0 Playhead syncs to video'; }, 8000);
             }
-            if (lbl) {
-              lbl.style.color = '#e05252';
-              lbl.textContent = isAbort || isTimeout ? 'Cold-start abort — segment unchanged, try again' : 'Regen failed — segment unchanged';
-              setTimeout(function() { lbl.style.color = '#aaa'; lbl.textContent = ''; }, 8000);
-            }
           } else {
             if (lbl) lbl.textContent = 'Done';
             var src = _pendingVideoSrc;
@@ -2465,9 +2464,12 @@ _GLOBAL_JS = """
       }
       if (msg.msg === 'close_stream') { es.close(); }
     };
-    es.onerror = function() { es.close(); };
   }
   // Shared popup element created once and reused across all slots
   let _popup = null;
   let _pendingSlot = null, _pendingIdx = null;

 snapshot_download(repo_id="laion/larger_clap_general")
 print("CLAP model pre-downloaded.")
+# Pre-download MMAudio's CLIP model (apple/DFN5B-CLIP-ViT-H-14-384, ~3.95 GB).
+# open_clip.create_model_from_pretrained('hf-hub:apple/DFN5B-CLIP-ViT-H-14-384')
+# fetches this at first use — inside the GPU window on cold workers — which
+# burns ~5-10s of the allocated ZeroGPU budget before inference even starts.
+print("Pre-downloading MMAudio CLIP model (apple/DFN5B-CLIP-ViT-H-14-384)…")
+snapshot_download(repo_id="apple/DFN5B-CLIP-ViT-H-14-384")
+print("MMAudio CLIP model pre-downloaded.")
 # ================================================================== #
 #                     SHARED CONSTANTS / HELPERS                      #
 # ================================================================== #
 TARO_LOAD_OVERHEAD     = 15    # seconds: model load + CAVP feature extraction
 MMAUDIO_WINDOW         = 8.0   # seconds — MMAudio's fixed generation window
 MMAUDIO_SECS_PER_STEP  = 0.25  # measured 0.230s/step on H200 (8.3s video, 2 segs × 25 steps = 11.5s wall)
+MMAUDIO_LOAD_OVERHEAD  = 30    # 15s warm + 15s model init; open_clip pre-downloaded at startup
 HUNYUAN_MAX_DUR        = 15.0  # seconds — HunyuanFoley max video duration
 HUNYUAN_SECS_PER_STEP  = 0.35  # measured 0.328s/step on H200 (8.3s video, 1 seg × 50 steps = 16.4s wall)
 HUNYUAN_LOAD_OVERHEAD  = 55    # ~55s to load the 10GB XXL model weights into GPU
   // If targetModel matches the slot's own prefix, uses the per-slot regen_* endpoint.
   // Otherwise uses the shared xregen_* cross-model endpoint.
   function fireRegen(slot_id, seg_idx, targetModel) {
+    // Block if a regen is already in-flight for this slot
+    if (_regenInFlight[slot_id]) {
+      console.log('[fireRegen] blocked — regen already in-flight for', slot_id);
+      return;
+    }
+    _regenInFlight[slot_id] = true;
     const prefix  = slot_id.split('_')[0];   // owning tab: 'taro'|'mma'|'hf'
     const slotNum = parseInt(slot_id.split('_')[1], 10);
         }
         if (msg.msg === 'process_completed') {
           es.close();
+          _regenInFlight[slot_id] = false;
           var errMsg = msg.output && msg.output.error;
           var hadError = !!errMsg;
           console.log('[fireRegen] completed for', slot_id, 'error:', hadError, errMsg || '');
               var vidElR = document.getElementById('slot_vid_' + slot_id);
               if (vidElR) { var vR = vidElR.querySelector('video'); if (vR) { vR.setAttribute('src', preRegenVideoSrc); vR.src = preRegenVideoSrc; vR.load(); } }
             }
+            // Update the statusbar (query after restore so we get the freshly-restored element)
             var isAbort   = toastMsg.toLowerCase().indexOf('aborted') !== -1;
             var isTimeout = toastMsg.toLowerCase().indexOf('timeout') !== -1;
+            var failMsg = isAbort || isTimeout
+              ? '\u26a0 GPU cold-start — segment unchanged, try again'
+              : '\u26a0 Regen failed — segment unchanged';
             var statusBar = document.getElementById('wf_statusbar_' + slot_id);
             if (statusBar) {
               statusBar.style.color = '#e05252';
+              statusBar.textContent = failMsg;
               setTimeout(function() { statusBar.style.color = '#888'; statusBar.textContent = 'Click a segment to regenerate \u00a0|\u00a0 Playhead syncs to video'; }, 8000);
             }
           } else {
             if (lbl) lbl.textContent = 'Done';
             var src = _pendingVideoSrc;
       }
       if (msg.msg === 'close_stream') { es.close(); }
     };
+    es.onerror = function() { es.close(); _regenInFlight[slot_id] = false; };
   }
+  // Track in-flight regen per slot — prevents queuing multiple jobs from rapid clicks
+  var _regenInFlight = {};
   // Shared popup element created once and reused across all slots
   let _popup = null;
   let _pendingSlot = null, _pendingIdx = null;