BoxOfColors committed on
Commit
166fb8e
·
1 Parent(s): 9f07d3f

Remove pre-truncation from all inference paths — stitch_wavs owns trimming

Browse files

All generate_* and regen GPU functions now return full window audio.
_stitch_wavs is the single place that trims to contact-edge windows
before crossfade-join, ensuring regen respects the same segment timing
as the original generation.

Files changed (1) hide show
  1. app.py +5 -14
app.py CHANGED
@@ -647,8 +647,7 @@ def _taro_infer_segment(
647
  # Decode: AudioLDM2 VAE → mel → vocoder → waveform
648
  samples = vae.decode(samples / latents_scale).sample
649
  wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
650
- seg_samples = int(round((seg_end_s - seg_start_s) * TARO_SR))
651
- return wav[:seg_samples]
652
 
653
 
654
  # ================================================================== #
@@ -1059,9 +1058,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1059
  rng=rng,
1060
  cfg_strength=float(cfg_strength),
1061
  )
1062
- wav = audios.float().cpu()[0].numpy() # (C, T)
1063
- seg_samples = int(round(seg_dur * sr))
1064
- wav = wav[:, :seg_samples]
1065
  seg_audios.append(wav)
1066
 
1067
  _log_inference_timing("MMAudio", time.perf_counter() - _t_mma_start,
@@ -1207,9 +1204,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1207
  num_inference_steps=int(num_steps),
1208
  batch_size=1,
1209
  )
1210
- wav = audio_batch[0].float().cpu().numpy()
1211
- seg_samples = int(round(seg_dur * sr))
1212
- wav = wav[:, :seg_samples]
1213
  seg_wavs.append(wav)
1214
 
1215
  _log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
@@ -1472,9 +1467,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1472
  feature_utils=feature_utils, net=net, fm=fm, rng=rng,
1473
  cfg_strength=float(cfg_strength),
1474
  )
1475
- new_wav = audios.float().cpu()[0].numpy()
1476
- seg_samples = int(round(seg_dur * sr))
1477
- new_wav = new_wav[:, :seg_samples]
1478
  return new_wav, sr
1479
 
1480
 
@@ -1557,9 +1550,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1557
  num_inference_steps=int(num_steps),
1558
  batch_size=1,
1559
  )
1560
- new_wav = audio_batch[0].float().cpu().numpy()
1561
- seg_samples = int(round(seg_dur * sr))
1562
- new_wav = new_wav[:, :seg_samples]
1563
  return new_wav, sr
1564
 
1565
 
 
647
  # Decode: AudioLDM2 VAE → mel → vocoder → waveform
648
  samples = vae.decode(samples / latents_scale).sample
649
  wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
650
+ return wav # full window _stitch_wavs handles contact-edge trimming
 
651
 
652
 
653
  # ================================================================== #
 
1058
  rng=rng,
1059
  cfg_strength=float(cfg_strength),
1060
  )
1061
+ wav = audios.float().cpu()[0].numpy() # (C, T) — full window
 
 
1062
  seg_audios.append(wav)
1063
 
1064
  _log_inference_timing("MMAudio", time.perf_counter() - _t_mma_start,
 
1204
  num_inference_steps=int(num_steps),
1205
  batch_size=1,
1206
  )
1207
+ wav = audio_batch[0].float().cpu().numpy() # full window
 
 
1208
  seg_wavs.append(wav)
1209
 
1210
  _log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
 
1467
  feature_utils=feature_utils, net=net, fm=fm, rng=rng,
1468
  cfg_strength=float(cfg_strength),
1469
  )
1470
+ new_wav = audios.float().cpu()[0].numpy() # full window — _stitch_wavs trims
 
 
1471
  return new_wav, sr
1472
 
1473
 
 
1550
  num_inference_steps=int(num_steps),
1551
  batch_size=1,
1552
  )
1553
+ new_wav = audio_batch[0].float().cpu().numpy() # full window — _stitch_wavs trims
 
 
1554
  return new_wav, sr
1555
 
1556