Spaces:
Running on Zero
Running on Zero
Commit ·
166fb8e
1
Parent(s): 9f07d3f
Remove pre-truncation from all inference paths — stitch_wavs owns trimming
Browse files
All generate_* and regen GPU functions now return full window audio.
_stitch_wavs is the single place that trims to contact-edge windows
before crossfade-join, ensuring regen respects the same segment timing
as the original generation.
app.py
CHANGED
|
@@ -647,8 +647,7 @@ def _taro_infer_segment(
|
|
| 647 |
# Decode: AudioLDM2 VAE → mel → vocoder → waveform
|
| 648 |
samples = vae.decode(samples / latents_scale).sample
|
| 649 |
wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
|
| 650 |
-
|
| 651 |
-
return wav[:seg_samples]
|
| 652 |
|
| 653 |
|
| 654 |
# ================================================================== #
|
|
@@ -1059,9 +1058,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1059 |
rng=rng,
|
| 1060 |
cfg_strength=float(cfg_strength),
|
| 1061 |
)
|
| 1062 |
-
wav = audios.float().cpu()[0].numpy() # (C, T)
|
| 1063 |
-
seg_samples = int(round(seg_dur * sr))
|
| 1064 |
-
wav = wav[:, :seg_samples]
|
| 1065 |
seg_audios.append(wav)
|
| 1066 |
|
| 1067 |
_log_inference_timing("MMAudio", time.perf_counter() - _t_mma_start,
|
|
@@ -1207,9 +1204,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1207 |
num_inference_steps=int(num_steps),
|
| 1208 |
batch_size=1,
|
| 1209 |
)
|
| 1210 |
-
wav = audio_batch[0].float().cpu().numpy()
|
| 1211 |
-
seg_samples = int(round(seg_dur * sr))
|
| 1212 |
-
wav = wav[:, :seg_samples]
|
| 1213 |
seg_wavs.append(wav)
|
| 1214 |
|
| 1215 |
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
|
|
@@ -1472,9 +1467,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1472 |
feature_utils=feature_utils, net=net, fm=fm, rng=rng,
|
| 1473 |
cfg_strength=float(cfg_strength),
|
| 1474 |
)
|
| 1475 |
-
new_wav
|
| 1476 |
-
seg_samples = int(round(seg_dur * sr))
|
| 1477 |
-
new_wav = new_wav[:, :seg_samples]
|
| 1478 |
return new_wav, sr
|
| 1479 |
|
| 1480 |
|
|
@@ -1557,9 +1550,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1557 |
num_inference_steps=int(num_steps),
|
| 1558 |
batch_size=1,
|
| 1559 |
)
|
| 1560 |
-
new_wav
|
| 1561 |
-
seg_samples = int(round(seg_dur * sr))
|
| 1562 |
-
new_wav = new_wav[:, :seg_samples]
|
| 1563 |
return new_wav, sr
|
| 1564 |
|
| 1565 |
|
|
|
|
| 647 |
# Decode: AudioLDM2 VAE → mel → vocoder → waveform
|
| 648 |
samples = vae.decode(samples / latents_scale).sample
|
| 649 |
wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
|
| 650 |
+
return wav # full window — _stitch_wavs handles contact-edge trimming
|
|
|
|
| 651 |
|
| 652 |
|
| 653 |
# ================================================================== #
|
|
|
|
| 1058 |
rng=rng,
|
| 1059 |
cfg_strength=float(cfg_strength),
|
| 1060 |
)
|
| 1061 |
+
wav = audios.float().cpu()[0].numpy() # (C, T) — full window
|
|
|
|
|
|
|
| 1062 |
seg_audios.append(wav)
|
| 1063 |
|
| 1064 |
_log_inference_timing("MMAudio", time.perf_counter() - _t_mma_start,
|
|
|
|
| 1204 |
num_inference_steps=int(num_steps),
|
| 1205 |
batch_size=1,
|
| 1206 |
)
|
| 1207 |
+
wav = audio_batch[0].float().cpu().numpy() # full window
|
|
|
|
|
|
|
| 1208 |
seg_wavs.append(wav)
|
| 1209 |
|
| 1210 |
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
|
|
|
|
| 1467 |
feature_utils=feature_utils, net=net, fm=fm, rng=rng,
|
| 1468 |
cfg_strength=float(cfg_strength),
|
| 1469 |
)
|
| 1470 |
+
new_wav = audios.float().cpu()[0].numpy() # full window — _stitch_wavs trims
|
|
|
|
|
|
|
| 1471 |
return new_wav, sr
|
| 1472 |
|
| 1473 |
|
|
|
|
| 1550 |
num_inference_steps=int(num_steps),
|
| 1551 |
batch_size=1,
|
| 1552 |
)
|
| 1553 |
+
new_wav = audio_batch[0].float().cpu().numpy() # full window — _stitch_wavs trims
|
|
|
|
|
|
|
| 1554 |
return new_wav, sr
|
| 1555 |
|
| 1556 |
|