futurespyhi committed on
Commit
3dfa434
·
1 Parent(s): 28df979

Enhance audio generation debugging and error handling

Browse files

- Add comprehensive debugging output for audio decoding pipeline
- Implement silent audio detection with amplitude checking
- Improve error handling with try-catch blocks for robustness
- Expand critical file validation in model download process
- Optimize audio file search priority for better quality output
- Update gitignore to exclude test files and logs

Files changed (4) hide show
  1. .gitignore +5 -1
  2. YuEGP/inference/infer.py +45 -15
  3. app.py +6 -0
  4. download_models.py +10 -2
.gitignore CHANGED
@@ -109,6 +109,10 @@ flash_attn/
109
  packages.txt.backup
110
  requirements.txt.backup
111
  download_model.py.backup
 
112
 
113
  # test files
114
- tests/
 
 
 
 
109
  packages.txt.backup
110
  requirements.txt.backup
111
  download_model.py.backup
112
+ decode_vocode.txt
113
 
114
  # test files
115
+ tests/
116
+
117
+ # YuE
118
+ YuE/
YuEGP/inference/infer.py CHANGED
@@ -192,7 +192,7 @@ def load_audio_mono(filepath, sampling_rate=16000):
192
  audio = resampler(audio)
193
  return audio
194
 
195
-
196
  def encode_audio(codec_model, audio_prompt, device, target_bw=0.5):
197
  if len(audio_prompt.shape) < 3:
198
  audio_prompt.unsqueeze_(0)
@@ -511,17 +511,32 @@ recons_mix_dir = os.path.join(recons_output_dir, 'mix')
511
  os.makedirs(recons_mix_dir, exist_ok=True)
512
  tracks = []
513
  for npy in stage2_result:
514
- codec_result = np.load(npy)
515
- decodec_rlt = []
516
- with torch.no_grad():
517
- decoded_waveform = codec_model.decode(
518
- torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
519
- decoded_waveform = decoded_waveform.cpu().squeeze(0)
520
- decodec_rlt.append(torch.as_tensor(decoded_waveform, device="cpu"))
521
- decodec_rlt = torch.cat(decodec_rlt, dim=-1)
522
- save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
523
- tracks.append(save_path)
524
- save_audio(decodec_rlt, save_path, 16000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  # mix tracks
526
  for inst_path in tracks:
527
  try:
@@ -533,12 +548,27 @@ for inst_path in tracks:
533
  continue
534
  # mix
535
  recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('_itrack', '_mixed'))
536
- vocal_stem, sr = sf.read(inst_path)
537
- instrumental_stem, _ = sf.read(vocal_path)
 
 
 
 
 
 
 
 
 
 
 
 
538
  mix_stem = (vocal_stem + instrumental_stem) / 1
 
 
539
  sf.write(recons_mix, mix_stem, sr)
 
540
  except Exception as e:
541
- print(e)
542
 
543
  # vocoder to upsample audios
544
  vocal_decoder, inst_decoder = build_codec_model(args.config_path, args.vocal_decoder_path, args.inst_decoder_path)
 
192
  audio = resampler(audio)
193
  return audio
194
 
195
+ # Audio encoding
196
  def encode_audio(codec_model, audio_prompt, device, target_bw=0.5):
197
  if len(audio_prompt.shape) < 3:
198
  audio_prompt.unsqueeze_(0)
 
511
  os.makedirs(recons_mix_dir, exist_ok=True)
512
  tracks = []
513
  for npy in stage2_result:
514
+ try:
515
+ codec_result = np.load(npy)
516
+ print(f"Decoding:{npy}, shape: {codec_result.shape}, dtype: {codec_result.dtype}")
517
+ print(f"πŸ› Codec values range: [{codec_result.min()}, {codec_result.max()}]")
518
+ # decodec_rlt = []
519
+ with torch.no_grad():
520
+ codec_tensor = torch.as_tensor(codec_result, dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device)
521
+ print(f"πŸ› Codec tensor shape: {codec_tensor.shape}")
522
+
523
+ decoded_waveform = codec_model.decode(codec_tensor)
524
+ print(f"πŸ› Decoded waveform shape: {decoded_waveform.shape}")
525
+ decoded_waveform = decoded_waveform.cpu().squeeze(0)
526
+ print(f"πŸ› Decoded audio range: [{decoded_waveform.min():.6f}, {decoded_waveform.max():.6f}]")
527
+ # decodec_rlt.append(torch.as_tensor(decoded_waveform, device="cpu"))
528
+ # decodec_rlt = torch.cat(decodec_rlt, dim=-1)
529
+ save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
530
+ tracks.append(save_path)
531
+ if decoded_waveform.abs().max() < 1e-6:
532
+ print(f"⚠️ WARNING: {npy} decoded to silent audio!")
533
+ else:
534
+ print(f"✅ Audio has sound, max amplitude: {decoded_waveform.abs().max():.6f}")
535
+ save_audio(decoded_waveform, save_path, 16000)
536
+ print(f"Saved Path:{save_path}")
537
+ except Exception as e:
538
+ print(f"Error decoding {npy}:{e}")
539
+ continue
540
  # mix tracks
541
  for inst_path in tracks:
542
  try:
 
548
  continue
549
  # mix
550
  recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('_itrack', '_mixed'))
551
+ print(f"🎡 Mixing: {os.path.basename(vocal_path)} + {os.path.basename(inst_path)}")
552
+
553
+ vocal_stem, sr = sf.read(vocal_path)
554
+ instrumental_stem, _ = sf.read(inst_path)
555
+
556
+ print(f"πŸ› Vocal stem - shape: {vocal_stem.shape}, max: {np.abs(vocal_stem).max():.6f}, sr: {sr}")
557
+ print(f"πŸ› Instrumental stem - shape: {instrumental_stem.shape}, max: {np.abs(instrumental_stem).max():.6f}")
558
+
559
+ # Check for silent tracks
560
+ if np.abs(vocal_stem).max() < 1e-6:
561
+ print(f"⚠️ WARNING: Vocal track is silent!")
562
+ if np.abs(instrumental_stem).max() < 1e-6:
563
+ print(f"⚠️ WARNING: Instrumental track is silent!")
564
+
565
  mix_stem = (vocal_stem + instrumental_stem) / 1
566
+ print(f"πŸ› Mixed stem - max amplitude: {np.abs(mix_stem).max():.6f}")
567
+
568
  sf.write(recons_mix, mix_stem, sr)
569
+ print(f"✅ Mixed audio saved: {recons_mix}")
570
  except Exception as e:
571
+ print(f"❌ Error mixing tracks {inst_path}: {e}")
572
 
573
  # vocoder to upsample audios
574
  vocal_decoder, inst_decoder = build_codec_model(args.config_path, args.vocal_decoder_path, args.inst_decoder_path)
app.py CHANGED
@@ -509,6 +509,12 @@ def generate_music_spaces(lyrics: str, genre: str, mood: str, progress=gr.Progre
509
  # Find generated audio file - prioritize mixed audio from vocoder/mix directory
510
  import glob
511
 
 
 
 
 
 
 
512
  # First look for the final mixed audio in vocoder/mix
513
  mixed_files = glob.glob(os.path.join(output_dir, "vocoder/mix/*_mixed.mp3"))
514
  if mixed_files:
 
509
  # Find generated audio file - prioritize mixed audio from vocoder/mix directory
510
  import glob
511
 
512
+ final_files = glob.glob(os.path.join(output_dir, "*_mixed.mp3"))
513
+ if final_files:
514
+ progress(1.0, desc="Finish music generation")
515
+ print(f"✅ Found audio file at root: {final_files[0]}")
516
+ return final_files[0]
517
+
518
  # First look for the final mixed audio in vocoder/mix
519
  mixed_files = glob.glob(os.path.join(output_dir, "vocoder/mix/*_mixed.mp3"))
520
  if mixed_files:
download_models.py CHANGED
@@ -185,11 +185,19 @@ def ensure_model_availability():
185
 
186
  xcodec_base = Path("YuEGP/inference/xcodec_mini_infer")
187
 
188
- # Check if critical decoder files exist (these are the most important for vocoder)
189
  critical_files = [
 
190
  xcodec_base / "decoders" / "decoder_131000.pth",
191
  xcodec_base / "decoders" / "decoder_151000.pth",
192
- xcodec_base / "final_ckpt" / "ckpt_00360000.pth"
 
 
 
 
 
 
 
193
  ]
194
 
195
  missing_files = [f for f in critical_files if not f.exists()]
 
185
 
186
  xcodec_base = Path("YuEGP/inference/xcodec_mini_infer")
187
 
188
+ # Check if critical files exist (both for recons and vocoder stages)
189
  critical_files = [
190
+ # Vocoder stage files
191
  xcodec_base / "decoders" / "decoder_131000.pth",
192
  xcodec_base / "decoders" / "decoder_151000.pth",
193
+ xcodec_base / "decoders" / "config.yaml",
194
+
195
+ # Recons stage files (critical for audio decoding)
196
+ xcodec_base / "final_ckpt" / "ckpt_00360000.pth",
197
+ xcodec_base / "final_ckpt" / "config.yaml",
198
+
199
+ # Python modules
200
+ xcodec_base / "models" / "soundstream_hubert_new.py"
201
  ]
202
 
203
  missing_files = [f for f in critical_files if not f.exists()]