Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors committed 8 days ago

Commit

1c5fa8d

1 Parent(s): a5f92a7

Fix crossfade: use equal-power fade envelopes in all 3 models

All three models summed the two segments at full volume (no fade
envelopes) across the overlap region, causing loud bumps at the
segment joins. Replace this with an equal-power crossfade using
cos/sin fade-out/fade-in envelopes: because cos² + sin² = 1, the
summed power of uncorrelated segments stays constant, which keeps
perceived loudness steady through the transition. Applies to TARO,
MMAudio, and HunyuanFoley.

Files changed (1) hide show

app.py +16 -6

app.py CHANGED Viewed

@@ -217,8 +217,12 @@ def _crossfade_join(wav_a: np.ndarray, wav_b: np.ndarray,
     cf = min(cf, len(wav_a), len(wav_b))
     if cf <= 0:
         return np.concatenate([wav_a, wav_b])
-    gain    = 10 ** (db_boost / 20.0)
-    overlap = wav_a[-cf:] * gain + wav_b[:cf] * gain
     return np.concatenate([wav_a[:-cf], overlap, wav_b[cf:]])
@@ -475,14 +479,17 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
             wav = wav[:, :seg_samples]
             seg_audios.append(wav)
-        # Crossfade-stitch all segments
         def _cf_join(a, b, cf_s):
             cf = int(round(cf_s * sr))
             cf = min(cf, a.shape[1], b.shape[1])
             if cf <= 0:
                 return np.concatenate([a, b], axis=1)
             gain = 10 ** (MMA_CF_DB / 20.0)
-            overlap = a[:, -cf:] * gain + b[:, :cf] * gain
             return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
         full_wav = seg_audios[0]
@@ -619,14 +626,17 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
             wav = wav[:, :seg_samples]
             seg_wavs.append(wav)
-        # Stitch segments with crossfade (operates on (channels, samples) arrays)
         def _cf_join_stereo(a, b, cf_s, db):
             cf = int(round(cf_s * sr))
             cf = min(cf, a.shape[1], b.shape[1])
             if cf <= 0:
                 return np.concatenate([a, b], axis=1)
             gain = 10 ** (db / 20.0)
-            overlap = a[:, -cf:] * gain + b[:, :cf] * gain
             return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
         full_wav = seg_wavs[0]

     cf = min(cf, len(wav_a), len(wav_b))
     if cf <= 0:
         return np.concatenate([wav_a, wav_b])
+    gain = 10 ** (db_boost / 20.0)
+    # Equal-power fade: fade-out a, fade-in b over the overlap region
+    t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
+    fade_out = np.cos(t * np.pi / 2)   # 1 → 0
+    fade_in  = np.sin(t * np.pi / 2)   # 0 → 1
+    overlap  = wav_a[-cf:] * fade_out * gain + wav_b[:cf] * fade_in * gain
     return np.concatenate([wav_a[:-cf], overlap, wav_b[cf:]])
             wav = wav[:, :seg_samples]
             seg_audios.append(wav)
+        # Crossfade-stitch all segments (equal-power fade)
         def _cf_join(a, b, cf_s):
             cf = int(round(cf_s * sr))
             cf = min(cf, a.shape[1], b.shape[1])
             if cf <= 0:
                 return np.concatenate([a, b], axis=1)
             gain = 10 ** (MMA_CF_DB / 20.0)
+            t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
+            fade_out = np.cos(t * np.pi / 2)
+            fade_in  = np.sin(t * np.pi / 2)
+            overlap = a[:, -cf:] * fade_out * gain + b[:, :cf] * fade_in * gain
             return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
         full_wav = seg_audios[0]
             wav = wav[:, :seg_samples]
             seg_wavs.append(wav)
+        # Stitch segments with equal-power crossfade (operates on (channels, samples) arrays)
         def _cf_join_stereo(a, b, cf_s, db):
             cf = int(round(cf_s * sr))
             cf = min(cf, a.shape[1], b.shape[1])
             if cf <= 0:
                 return np.concatenate([a, b], axis=1)
             gain = 10 ** (db / 20.0)
+            t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
+            fade_out = np.cos(t * np.pi / 2)
+            fade_in  = np.sin(t * np.pi / 2)
+            overlap = a[:, -cf:] * fade_out * gain + b[:, :cf] * fade_in * gain
             return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
         full_wav = seg_wavs[0]