BoxOfColors Claude Sonnet 4.6 committed on
Commit
121a071
·
1 Parent(s): b60f330

fix: cross-model regen — fix stereo/mono mismatch and add pending spinner

Browse files

Two bugs fixed:

1. ValueError 'arrays must have same number of dimensions':
TARO outputs mono (T,) while MMAudio/Hunyuan output stereo (C, T).
_resample_to_slot_sr now takes a slot_wav_ref and matches channel
layout after resampling — stereo→mono averages channels, mono→stereo
duplicates the channel — so _cf_join always receives matching shapes.

2. No loading indicator on cross-model regen buttons:
xregen_* functions were plain return functions; the pending waveform
spinner only appeared via the Python yield in same-model regen. All
three xregen_* are now generators that yield the pending HTML
immediately before calling the GPU, matching the same-model behaviour.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +56 -22
app.py CHANGED
@@ -1324,35 +1324,59 @@ MODEL_CONFIGS["hunyuan"]["regen_fn"] = regen_hunyuan_segment
1324
  # (44.1 kHz) / Hunyuan (48 kHz) outputs can all be mixed freely. #
1325
  # ================================================================== #
1326
 
1327
- def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
1328
- """Resample *wav* from src_sr to dst_sr using torchaudio.
1329
- Works for mono (T,) and stereo (C, T) numpy arrays."""
1330
- if src_sr == dst_sr:
1331
- return wav
1332
- stereo = wav.ndim == 2
1333
- t = torch.from_numpy(np.ascontiguousarray(wav))
1334
- if not stereo:
1335
- t = t.unsqueeze(0) # (1, T)
1336
- t = torchaudio.functional.resample(t.float(), src_sr, dst_sr)
1337
- if not stereo:
1338
- t = t.squeeze(0) # (T,)
1339
- return t.numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1340
 
1341
 
1342
  def xregen_taro(seg_idx, state_json, slot_id,
1343
  seed_val, cfg_scale, num_steps, mode,
1344
  crossfade_s, crossfade_db):
1345
  """Cross-model regen: run TARO inference and splice into *slot_id*."""
1346
- meta = json.loads(state_json)
1347
- slot_sr = int(meta["sr"])
 
 
 
 
 
 
1348
  new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
1349
  seed_val, cfg_scale, num_steps, mode,
1350
  crossfade_s, crossfade_db, slot_id)
1351
- new_wav = _resample_to_slot_sr(new_wav_raw, TARO_SR, slot_sr)
 
1352
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1353
- new_wav, int(seg_idx), meta, slot_id
1354
  )
1355
- return gr.update(value=video_path), gr.update(value=waveform_html)
1356
 
1357
 
1358
  def xregen_mmaudio(seg_idx, state_json, slot_id,
@@ -1365,6 +1389,10 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1365
  seg_dur = seg_end - seg_start
1366
  slot_sr = int(meta["sr"])
1367
 
 
 
 
 
1368
  silent_video = meta["silent_video"]
1369
  tmp_dir = tempfile.mkdtemp()
1370
  seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
@@ -1377,11 +1405,12 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1377
  prompt, negative_prompt, seed_val,
1378
  cfg_strength, num_steps,
1379
  crossfade_s, crossfade_db, slot_id)
1380
- new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr)
 
1381
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1382
  new_wav, seg_idx, meta, slot_id
1383
  )
1384
- return gr.update(value=video_path), gr.update(value=waveform_html)
1385
 
1386
 
1387
  def xregen_hunyuan(seg_idx, state_json, slot_id,
@@ -1395,6 +1424,10 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1395
  seg_dur = seg_end - seg_start
1396
  slot_sr = int(meta["sr"])
1397
 
 
 
 
 
1398
  silent_video = meta["silent_video"]
1399
  tmp_dir = tempfile.mkdtemp()
1400
  seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
@@ -1407,11 +1440,12 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1407
  prompt, negative_prompt, seed_val,
1408
  guidance_scale, num_steps, model_size,
1409
  crossfade_s, crossfade_db, slot_id)
1410
- new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr)
 
1411
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1412
  new_wav, seg_idx, meta, slot_id
1413
  )
1414
- return gr.update(value=video_path), gr.update(value=waveform_html)
1415
 
1416
 
1417
  # ================================================================== #
 
1324
  # (44.1 kHz) / Hunyuan (48 kHz) outputs can all be mixed freely. #
1325
  # ================================================================== #
1326
 
1327
+ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
1328
+ slot_wav_ref: np.ndarray = None) -> np.ndarray:
1329
+ """Resample *wav* from src_sr to dst_sr using torchaudio, then match
1330
+ channel layout to *slot_wav_ref* (the first existing segment in the slot).
1331
+
1332
+ TARO is mono (T,), MMAudio/Hunyuan are stereo (C, T). Mixing them
1333
+ without normalisation causes a shape mismatch in _cf_join. Rules:
1334
+ stereo → mono : average channels
1335
+ mono → stereo: duplicate the single channel
1336
+ """
1337
+ # 1. Resample
1338
+ if src_sr != dst_sr:
1339
+ stereo_in = wav.ndim == 2
1340
+ t = torch.from_numpy(np.ascontiguousarray(wav))
1341
+ if not stereo_in:
1342
+ t = t.unsqueeze(0)
1343
+ t = torchaudio.functional.resample(t.float(), src_sr, dst_sr)
1344
+ if not stereo_in:
1345
+ t = t.squeeze(0)
1346
+ wav = t.numpy()
1347
+
1348
+ # 2. Match channel layout to the slot's existing segments
1349
+ if slot_wav_ref is not None:
1350
+ slot_stereo = slot_wav_ref.ndim == 2
1351
+ wav_stereo = wav.ndim == 2
1352
+ if slot_stereo and not wav_stereo:
1353
+ wav = np.stack([wav, wav], axis=0) # mono → stereo (C, T)
1354
+ elif not slot_stereo and wav_stereo:
1355
+ wav = wav.mean(axis=0) # stereo → mono (T,)
1356
+ return wav
1357
 
1358
 
1359
def xregen_taro(seg_idx, state_json, slot_id,
                seed_val, cfg_scale, num_steps, mode,
                crossfade_s, crossfade_db):
    """Cross-model regen: run TARO inference and splice into *slot_id*.

    Generator: the first yield pushes the pending-spinner waveform HTML
    to the UI before the GPU call starts; the second yields the spliced
    video and the final waveform HTML.
    """
    meta = json.loads(state_json)
    slot_sr = int(meta["sr"])
    seg_idx = int(seg_idx)

    # Surface the loading indicator immediately, ahead of any GPU work.
    spinner_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
    yield gr.update(), gr.update(value=spinner_html)

    raw_wav = _regen_taro_gpu(None, seg_idx, state_json,
                              seed_val, cfg_scale, num_steps, mode,
                              crossfade_s, crossfade_db, slot_id)
    # Normalise sample rate and channel layout against the slot's
    # existing audio so _cf_join receives matching shapes.
    existing_wavs = _load_seg_wavs(meta["wav_paths"])
    matched_wav = _resample_to_slot_sr(raw_wav, TARO_SR, slot_sr, existing_wavs[0])
    video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
        matched_wav, seg_idx, meta, slot_id
    )
    yield gr.update(value=video_path), gr.update(value=waveform_html)
1380
 
1381
 
1382
  def xregen_mmaudio(seg_idx, state_json, slot_id,
 
1389
  seg_dur = seg_end - seg_start
1390
  slot_sr = int(meta["sr"])
1391
 
1392
+ # Show pending waveform immediately
1393
+ pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1394
+ yield gr.update(), gr.update(value=pending_html)
1395
+
1396
  silent_video = meta["silent_video"]
1397
  tmp_dir = tempfile.mkdtemp()
1398
  seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
 
1405
  prompt, negative_prompt, seed_val,
1406
  cfg_strength, num_steps,
1407
  crossfade_s, crossfade_db, slot_id)
1408
+ slot_wavs = _load_seg_wavs(meta["wav_paths"])
1409
+ new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1410
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1411
  new_wav, seg_idx, meta, slot_id
1412
  )
1413
+ yield gr.update(value=video_path), gr.update(value=waveform_html)
1414
 
1415
 
1416
  def xregen_hunyuan(seg_idx, state_json, slot_id,
 
1424
  seg_dur = seg_end - seg_start
1425
  slot_sr = int(meta["sr"])
1426
 
1427
+ # Show pending waveform immediately
1428
+ pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1429
+ yield gr.update(), gr.update(value=pending_html)
1430
+
1431
  silent_video = meta["silent_video"]
1432
  tmp_dir = tempfile.mkdtemp()
1433
  seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
 
1440
  prompt, negative_prompt, seed_val,
1441
  guidance_scale, num_steps, model_size,
1442
  crossfade_s, crossfade_db, slot_id)
1443
+ slot_wavs = _load_seg_wavs(meta["wav_paths"])
1444
+ new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1445
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1446
  new_wav, seg_idx, meta, slot_id
1447
  )
1448
+ yield gr.update(value=video_path), gr.update(value=waveform_html)
1449
 
1450
 
1451
  # ================================================================== #