Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors Claude Opus 4.6 committed on 3 days ago

Commit

ef4f0ff

1 Parent(s): 0bc4a35

Optimize ZeroGPU: move FlashSR to CPU, fix regen 16kHz bug

- Move FlashSR upsampling (16kHz→48kHz) from inside @spaces.GPU
to CPU wrappers — saves ~1-2s GPU quota per segment since
FlashSR is CPU-only and does not need the GPU allocation
- Fix bug: regen_taro_segment and xregen_taro were returning
raw 16kHz wav without applying FlashSR (initial gen applied
it inside the GPU loop but regen never did)
- Reduce TARO regen duration estimate when CAVP/onset feature
cache exists — requests 5s overhead instead of 15s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +19 -9

app.py CHANGED Viewed

@@ -686,12 +686,6 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
                     latents_scale,
                     euler_sampler, euler_maruyama_sampler,
                 )
-                # FlashSR: upsample 16kHz → 48kHz inside GPU window to avoid
-                # ZeroGPU CUDA-init-in-main-process violation
-                print(f"[FlashSR] Upsampling seg {len(wavs)+1} "
-                      f"{seg_end_s-seg_start_s:.2f}s @ 16kHz → 48kHz …")
-                wav = _apply_flashsr(wav)
-                print(f"[FlashSR] Done — {len(wav)/FLASHSR_SR_OUT:.2f}s @ {FLASHSR_SR_OUT}Hz")
                 wavs.append(wav)
             _log_inference_timing("TARO", time.perf_counter() - _t_infer_start,
                                   len(segments), int(num_steps), TARO_SECS_PER_STEP)
@@ -741,7 +735,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
     first_cavp_saved = False
     outputs = []
     for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
-        # wavs are already at 48kHz — FlashSR ran inside _taro_gpu_infer
         final_wav  = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, FLASHSR_SR_OUT)
         audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
         _save_wav(audio_path, final_wav, FLASHSR_SR_OUT)
@@ -1145,6 +1140,19 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
 def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
                          seed_val, cfg_scale, num_steps, mode,
                          crossfade_s, crossfade_db, slot_id=None):
     return _estimate_regen_duration("taro", int(num_steps))
@@ -1206,7 +1214,8 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
                               seed_val, cfg_scale, num_steps, mode,
                               crossfade_s, crossfade_db, slot_id)
-    # new_wav is already at 48kHz — FlashSR ran inside _regen_taro_gpu → _taro_infer_segment
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
@@ -1477,7 +1486,8 @@ def xregen_taro(seg_idx, state_json, slot_id,
     new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
                                   seed_val, cfg_scale, num_steps, mode,
                                   crossfade_s, crossfade_db, slot_id)
-    # new_wav_raw already at 48kHz — FlashSR ran inside _regen_taro_gpu → _taro_infer_segment
     video_path, waveform_html = _xregen_splice(new_wav_raw, FLASHSR_SR_OUT, meta, seg_idx, slot_id)
     yield gr.update(value=video_path), gr.update(value=waveform_html)

                     latents_scale,
                     euler_sampler, euler_maruyama_sampler,
                 )
                 wavs.append(wav)
             _log_inference_timing("TARO", time.perf_counter() - _t_infer_start,
                                   len(segments), int(num_steps), TARO_SECS_PER_STEP)
     first_cavp_saved = False
     outputs = []
     for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
+        # FlashSR: upsample each segment 16kHz → 48kHz (CPU-only, no GPU needed)
+        wavs = [_apply_flashsr(w) for w in wavs]
         final_wav  = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, FLASHSR_SR_OUT)
         audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
         _save_wav(audio_path, final_wav, FLASHSR_SR_OUT)
 def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
                          seed_val, cfg_scale, num_steps, mode,
                          crossfade_s, crossfade_db, slot_id=None):
+    # If cached CAVP/onset features exist, skip ~10s feature-extractor overhead
+    try:
+        meta = json.loads(seg_meta_json)
+        cavp_ok  = os.path.exists(meta.get("cavp_path", ""))
+        onset_ok = os.path.exists(meta.get("onset_path", ""))
+        if cavp_ok and onset_ok:
+            cfg  = MODEL_CONFIGS["taro"]
+            secs = int(num_steps) * cfg["secs_per_step"] + 5  # 5s model-load only
+            result = min(GPU_DURATION_CAP, max(30, int(secs)))
+            print(f"[duration] TARO regen (cache hit): 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
+            return result
+    except Exception:
+        pass
     return _estimate_regen_duration("taro", int(num_steps))
                               seed_val, cfg_scale, num_steps, mode,
                               crossfade_s, crossfade_db, slot_id)
+    # FlashSR: upsample 16kHz → 48kHz on CPU (no GPU needed)
+    new_wav = _apply_flashsr(new_wav)
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
     new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
                                   seed_val, cfg_scale, num_steps, mode,
                                   crossfade_s, crossfade_db, slot_id)
+    # FlashSR: upsample 16kHz → 48kHz on CPU (no GPU needed)
+    new_wav_raw = _apply_flashsr(new_wav_raw)
     video_path, waveform_html = _xregen_splice(new_wav_raw, FLASHSR_SR_OUT, meta, seg_idx, slot_id)
     yield gr.update(value=video_path), gr.update(value=waveform_html)