futurespyhi committed on
Commit
3dfa434
·
1 Parent(s): 28df979

Enhance audio generation debugging and error handling

Browse files

- Add comprehensive debugging output for audio decoding pipeline
- Implement silent audio detection with amplitude checking
- Improve error handling with try-catch blocks for robustness
- Expand critical file validation in model download process
- Optimize audio file search priority for better quality output
- Update gitignore to exclude test files and logs

Files changed (4) hide show
  1. .gitignore +5 -1
  2. YuEGP/inference/infer.py +45 -15
  3. app.py +6 -0
  4. download_models.py +10 -2
.gitignore CHANGED
@@ -109,6 +109,10 @@ flash_attn/
109
  packages.txt.backup
110
  requirements.txt.backup
111
  download_model.py.backup
 
112
 
113
  # test files
114
- tests/
 
 
 
 
109
  packages.txt.backup
110
  requirements.txt.backup
111
  download_model.py.backup
112
+ decode_vocode.txt
113
 
114
  # test files
115
+ tests/
116
+
117
+ # YuE
118
+ YuE/
YuEGP/inference/infer.py CHANGED
@@ -192,7 +192,7 @@ def load_audio_mono(filepath, sampling_rate=16000):
192
  audio = resampler(audio)
193
  return audio
194
 
195
-
196
  def encode_audio(codec_model, audio_prompt, device, target_bw=0.5):
197
  if len(audio_prompt.shape) < 3:
198
  audio_prompt.unsqueeze_(0)
@@ -511,17 +511,32 @@ recons_mix_dir = os.path.join(recons_output_dir, 'mix')
511
  os.makedirs(recons_mix_dir, exist_ok=True)
512
  tracks = []
513
  for npy in stage2_result:
514
- codec_result = np.load(npy)
515
- decodec_rlt = []
516
- with torch.no_grad():
517
- decoded_waveform = codec_model.decode(
518
- torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
519
- decoded_waveform = decoded_waveform.cpu().squeeze(0)
520
- decodec_rlt.append(torch.as_tensor(decoded_waveform, device="cpu"))
521
- decodec_rlt = torch.cat(decodec_rlt, dim=-1)
522
- save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
523
- tracks.append(save_path)
524
- save_audio(decodec_rlt, save_path, 16000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  # mix tracks
526
  for inst_path in tracks:
527
  try:
@@ -533,12 +548,27 @@ for inst_path in tracks:
533
  continue
534
  # mix
535
  recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('_itrack', '_mixed'))
536
- vocal_stem, sr = sf.read(inst_path)
537
- instrumental_stem, _ = sf.read(vocal_path)
 
 
 
 
 
 
 
 
 
 
 
 
538
  mix_stem = (vocal_stem + instrumental_stem) / 1
 
 
539
  sf.write(recons_mix, mix_stem, sr)
 
540
  except Exception as e:
541
- print(e)
542
 
543
  # vocoder to upsample audios
544
  vocal_decoder, inst_decoder = build_codec_model(args.config_path, args.vocal_decoder_path, args.inst_decoder_path)
 
192
  audio = resampler(audio)
193
  return audio
194
 
195
+ # Audio encoding
196
  def encode_audio(codec_model, audio_prompt, device, target_bw=0.5):
197
  if len(audio_prompt.shape) < 3:
198
  audio_prompt.unsqueeze_(0)
 
511
  os.makedirs(recons_mix_dir, exist_ok=True)
512
  tracks = []
513
  for npy in stage2_result:
514
+ try:
515
+ codec_result = np.load(npy)
516
+ print(f"Decoding:{npy}, shape: {codec_result.shape}, dtype: {codec_result.dtype}")
517
+ print(f"πŸ› Codec values range: [{codec_result.min()}, {codec_result.max()}]")
518
+ # decodec_rlt = []
519
+ with torch.no_grad():
520
+ codec_tensor = torch.as_tensor(codec_result, dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device)
521
+ print(f"πŸ› Codec tensor shape: {codec_tensor.shape}")
522
+
523
+ decoded_waveform = codec_model.decode(codec_tensor)
524
+ print(f"πŸ› Decoded waveform shape: {decoded_waveform.shape}")
525
+ decoded_waveform = decoded_waveform.cpu().squeeze(0)
526
+ print(f"πŸ› Decoded audio range: [{decoded_waveform.min():.6f}, {decoded_waveform.max():.6f}]")
527
+ # decodec_rlt.append(torch.as_tensor(decoded_waveform, device="cpu"))
528
+ # decodec_rlt = torch.cat(decodec_rlt, dim=-1)
529
+ save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
530
+ tracks.append(save_path)
531
+ if decoded_waveform.abs().max() < 1e-6:
532
+ print(f"⚠️ WARNING: {npy} decoded to silent audio!")
533
+ else:
534
+ print(f"✅ Audio has sound, max amplitude: {decoded_waveform.abs().max():.6f}")
535
+ save_audio(decoded_waveform, save_path, 16000)
536
+ print(f"Saved Path:{save_path}")
537
+ except Exception as e:
538
+ print(f"Error decoding {npy}:{e}")
539
+ continue
540
  # mix tracks
541
  for inst_path in tracks:
542
  try:
 
548
  continue
549
  # mix
550
  recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('_itrack', '_mixed'))
551
+ print(f"🎡 Mixing: {os.path.basename(vocal_path)} + {os.path.basename(inst_path)}")
552
+
553
+ vocal_stem, sr = sf.read(vocal_path)
554
+ instrumental_stem, _ = sf.read(inst_path)
555
+
556
+ print(f"πŸ› Vocal stem - shape: {vocal_stem.shape}, max: {np.abs(vocal_stem).max():.6f}, sr: {sr}")
557
+ print(f"πŸ› Instrumental stem - shape: {instrumental_stem.shape}, max: {np.abs(instrumental_stem).max():.6f}")
558
+
559
+ # Check for silent tracks
560
+ if np.abs(vocal_stem).max() < 1e-6:
561
+ print(f"⚠️ WARNING: Vocal track is silent!")
562
+ if np.abs(instrumental_stem).max() < 1e-6:
563
+ print(f"⚠️ WARNING: Instrumental track is silent!")
564
+
565
  mix_stem = (vocal_stem + instrumental_stem) / 1
566
+ print(f"πŸ› Mixed stem - max amplitude: {np.abs(mix_stem).max():.6f}")
567
+
568
  sf.write(recons_mix, mix_stem, sr)
569
+ print(f"✅ Mixed audio saved: {recons_mix}")
570
  except Exception as e:
571
+ print(f"❌ Error mixing tracks {inst_path}: {e}")
572
 
573
  # vocoder to upsample audios
574
  vocal_decoder, inst_decoder = build_codec_model(args.config_path, args.vocal_decoder_path, args.inst_decoder_path)
app.py CHANGED
@@ -509,6 +509,12 @@ def generate_music_spaces(lyrics: str, genre: str, mood: str, progress=gr.Progre
509
  # Find generated audio file - prioritize mixed audio from vocoder/mix directory
510
  import glob
511
 
 
 
 
 
 
 
512
  # First look for the final mixed audio in vocoder/mix
513
  mixed_files = glob.glob(os.path.join(output_dir, "vocoder/mix/*_mixed.mp3"))
514
  if mixed_files:
 
509
  # Find generated audio file - prioritize mixed audio from vocoder/mix directory
510
  import glob
511
 
512
+ final_files = glob.glob(os.path.join(output_dir, "*_mixed.mp3"))
513
+ if final_files:
514
+ progress(1.0, desc="Finish music generation")
515
+ print(f"✅ Found audio file at root: {final_files[0]}")
516
+ return final_files[0]
517
+
518
  # First look for the final mixed audio in vocoder/mix
519
  mixed_files = glob.glob(os.path.join(output_dir, "vocoder/mix/*_mixed.mp3"))
520
  if mixed_files:
download_models.py CHANGED
@@ -185,11 +185,19 @@ def ensure_model_availability():
185
 
186
  xcodec_base = Path("YuEGP/inference/xcodec_mini_infer")
187
 
188
- # Check if critical decoder files exist (these are the most important for vocoder)
189
  critical_files = [
 
190
  xcodec_base / "decoders" / "decoder_131000.pth",
191
  xcodec_base / "decoders" / "decoder_151000.pth",
192
- xcodec_base / "final_ckpt" / "ckpt_00360000.pth"
 
 
 
 
 
 
 
193
  ]
194
 
195
  missing_files = [f for f in critical_files if not f.exists()]
 
185
 
186
  xcodec_base = Path("YuEGP/inference/xcodec_mini_infer")
187
 
188
+ # Check if critical files exist (both for recons and vocoder stages)
189
  critical_files = [
190
+ # Vocoder stage files
191
  xcodec_base / "decoders" / "decoder_131000.pth",
192
  xcodec_base / "decoders" / "decoder_151000.pth",
193
+ xcodec_base / "decoders" / "config.yaml",
194
+
195
+ # Recons stage files (critical for audio decoding)
196
+ xcodec_base / "final_ckpt" / "ckpt_00360000.pth",
197
+ xcodec_base / "final_ckpt" / "config.yaml",
198
+
199
+ # Python modules
200
+ xcodec_base / "models" / "soundstream_hubert_new.py"
201
  ]
202
 
203
  missing_files = [f for f in critical_files if not f.exists()]