ACE-Step

Paused

App Files Files Community

javier233455 committed on May 29

Commit

865175e

verified ·

1 Parent(s): 62f44c9

Update music_dcae/music_dcae_pipeline.py

Browse files

Files changed (1) hide show

music_dcae/music_dcae_pipeline.py +133 -20

music_dcae/music_dcae_pipeline.py CHANGED Viewed

@@ -21,7 +21,12 @@ VOCODER_PRETRAINED_PATH = os.path.join(root_dir, "checkpoints", "music_vocoder")
 class MusicDCAE(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     @register_to_config
-    def __init__(self, source_sample_rate=None, dcae_checkpoint_path=DEFAULT_PRETRAINED_PATH, vocoder_checkpoint_path=VOCODER_PRETRAINED_PATH):
         super(MusicDCAE, self).__init__()
         self.dcae = AutoencoderDC.from_pretrained(dcae_checkpoint_path)
@@ -35,6 +40,7 @@ class MusicDCAE(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         self.transform = transforms.Compose([
             transforms.Normalize(0.5, 0.5),
         ])
         self.min_mel_value = -11.0
         self.max_mel_value = 3.0
         self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
@@ -46,48 +52,128 @@ class MusicDCAE(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     def load_audio(self, audio_path):
         # Load the file at `audio_path` with torchaudio and return the
         # (waveform, sample_rate) pair exactly as torchaudio.load produced it.
         audio, sr = torchaudio.load(audio_path)
         return audio, sr
     def forward_mel(self, audios):
         # Run self.vocoder.mel_transform on each item of `audios` and stack
         # the per-item results into one tensor along a new leading dimension.
         mels = []
         for i in range(len(audios)):
-            image = self.vocoder.mel_transform(audios[i])
             mels.append(image)
         # torch.stack requires every per-item result to have the same shape.
         mels = torch.stack(mels)
         return mels
     @torch.no_grad()
     def encode(self, audios, audio_lengths=None, sr=None):
         # Encode a batch of waveforms into scaled latents.
         # Returns (latents, latent_lengths).
         # When no lengths are given, every item is treated as full length
         # (audios.shape[2] samples).
         if audio_lengths is None:
             audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
             audio_lengths = audio_lengths.to(audios.device)
-        # audios: N x 2 x T, 48kHz
         device = audios.device
         dtype = audios.dtype
         # With no explicit sample rate the input is taken to be 48000 Hz and
         # the resampler stored on the instance is used; otherwise a new
         # sr -> 44100 resampler is built on the input's device and dtype.
         if sr is None:
             sr = 48000
-            resampler = self.resampler
         else:
             resampler = torchaudio.transforms.Resample(sr, 44100).to(device).to(dtype)
         audio = resampler(audios)
         # Right-pad the last dimension up to the next multiple of 8 * 512.
         max_audio_len = audio.shape[-1]
         if max_audio_len % (8 * 512) != 0:
-            audio = torch.nn.functional.pad(audio, (0, 8 * 512 - max_audio_len % (8 * 512)))
         mels = self.forward_mel(audio)
         # Rescale so that min_mel_value maps to 0 and max_mel_value maps to 1,
         # then apply self.transform.
-        mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value)
         mels = self.transform(mels)
         # Encode one item at a time (each with a batch dimension of 1) and
         # concatenate the results along dim 0.
         latents = []
         for mel in mels:
             latent = self.dcae.encoder(mel.unsqueeze(0))
             latents.append(latent)
         latents = torch.cat(latents, dim=0)
         # Sample counts at `sr` -> counts at 44100 Hz -> divided by 512 and by
         # time_dimention_multiple, truncated by .long().
         # NOTE(review): 512 is presumably the mel hop size -- confirm.
-        latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
         latents = (latents - self.shift_factor) * self.scale_factor
         return latents, latent_lengths
     @torch.no_grad()
@@ -99,43 +185,70 @@ class MusicDCAE(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         for latent in latents:
             mels = self.dcae.decoder(latent.unsqueeze(0))
             mels = mels * 0.5 + 0.5
-            mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
             wav = self.vocoder.decode(mels[0]).squeeze(1)
             if sr is not None:
-                resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
                 wav = resampler(wav)
             else:
                 sr = 44100
             pred_wavs.append(wav)
         if audio_lengths is not None:
-            pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
         return sr, pred_wavs
     def forward(self, audios, audio_lengths=None, sr=None):
         # Round trip: encode `audios` to latents, then decode the latents
         # back to waveforms. Returns (sr, pred_wavs, latents, latent_lengths),
         # where `sr` is the value returned by decode.
-        latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)
-        sr, pred_wavs = self.decode(latents=latents, audio_lengths=audio_lengths, sr=sr)
         return sr, pred_wavs, latents, latent_lengths
 if __name__ == "__main__":
     # Manual smoke test: load test.wav, run the encode/decode round trip,
     # print the resulting shapes and write the reconstruction to disk.
     audio, sr = torchaudio.load("test.wav")
     audio_lengths = torch.tensor([audio.shape[1]])
     # Add a leading batch dimension of size 1.
     audios = audio.unsqueeze(0)
-    # test encode only
     model = MusicDCAE()
-    # latents, latent_lengths = model.encode(audios, audio_lengths)
-    # print("latents shape: ", latents.shape)
-    # print("latent_lengths: ", latent_lengths)
-    # test encode and decode
     sr, pred_wavs, latents, latent_lengths = model(audios, audio_lengths, sr)
     print("reconstructed wavs: ", pred_wavs[0].shape)
     print("latents shape: ", latents.shape)
     print("latent_lengths: ", latent_lengths)
     print("sr: ", sr)
     torchaudio.save("test_reconstructed.flac", pred_wavs[0], sr)
-    print("test_reconstructed.flac")

 class MusicDCAE(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     @register_to_config
+    def __init__(
+        self,
+        source_sample_rate=None,
+        dcae_checkpoint_path=DEFAULT_PRETRAINED_PATH,
+        vocoder_checkpoint_path=VOCODER_PRETRAINED_PATH
+    ):
         super(MusicDCAE, self).__init__()
         self.dcae = AutoencoderDC.from_pretrained(dcae_checkpoint_path)
         self.transform = transforms.Compose([
             transforms.Normalize(0.5, 0.5),
         ])
         self.min_mel_value = -11.0
         self.max_mel_value = 3.0
         self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
     def load_audio(self, audio_path):
         # Load the file at `audio_path` with torchaudio and return
         # (audio, sr) with the channel dimension forced to exactly 2.
         audio, sr = torchaudio.load(audio_path)
+        # FIX: if the audio is mono, duplicate it to stereo
+        if audio.dim() == 1:
+            audio = audio.unsqueeze(0)
+        if audio.shape[0] == 1:
+            audio = audio.repeat(2, 1)
+        elif audio.shape[0] > 2:
             # More than two channels: keep only the first two.
+            audio = audio[:2]
         return audio, sr
     def forward_mel(self, audios):
         # Run self.vocoder.mel_transform on each item of `audios` (after
         # forcing the item to two channels) and stack the results into one
         # tensor along a new leading dimension.
         mels = []
         for i in range(len(audios)):
+            audio_item = audios[i]
+            # FIX: ensure stereo audio before converting to mel
+            if audio_item.dim() == 1:
+                audio_item = audio_item.unsqueeze(0)
+            if audio_item.shape[0] == 1:
+                audio_item = audio_item.repeat(2, 1)
+            elif audio_item.shape[0] > 2:
                 # More than two channels: keep only the first two.
+                audio_item = audio_item[:2]
+            image = self.vocoder.mel_transform(audio_item)
             mels.append(image)
         # torch.stack requires every per-item result to have the same shape.
         mels = torch.stack(mels)
         return mels
     @torch.no_grad()
     def encode(self, audios, audio_lengths=None, sr=None):
         # Encode a batch of waveforms into scaled latents.
         # Returns (latents, latent_lengths).
+        # ============================================================
+        # MAIN FIX:
+        # ACE-Step / MusicDCAE expects audio shaped N x 2 x T.
+        # If mono N x 1 x T arrives, the channel is duplicated.
+        # ============================================================
+        if audios.dim() == 1:
+            # T -> 1 x 1 x T
+            audios = audios.unsqueeze(0).unsqueeze(0)
+        elif audios.dim() == 2:
+            # May arrive as C x T
+            audios = audios.unsqueeze(0)
+        if audios.shape[1] == 1:
+            # N x 1 x T -> N x 2 x T
+            audios = audios.repeat(1, 2, 1)
+        elif audios.shape[1] > 2:
+            # If there are more than 2 channels, use only the first two
+            audios = audios[:, :2, :]
         # When no lengths are given, every item is treated as full length
         # (audios.shape[2] samples).
         if audio_lengths is None:
             audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
             audio_lengths = audio_lengths.to(audios.device)
+        # audios: N x 2 x T
         device = audios.device
         dtype = audios.dtype
         # With no explicit sample rate the input is taken to be 48000 Hz and
         # the resampler stored on the instance is used; otherwise a new
         # sr -> 44100 resampler is built on the input's device and dtype.
         # NOTE(review): if self.resampler is an nn.Module, .to() changes the
         # shared instance in place rather than returning a copy -- confirm
         # that this is intended.
         if sr is None:
             sr = 48000
+            resampler = self.resampler.to(device).to(dtype)
         else:
             resampler = torchaudio.transforms.Resample(sr, 44100).to(device).to(dtype)
         audio = resampler(audios)
+        # Extra FIX after the resample
         # NOTE(review): the channel dimension was already forced to 2 above;
         # resampling presumably leaves it unchanged, so this check looks
         # redundant -- confirm.
+        if audio.shape[1] == 1:
+            audio = audio.repeat(1, 2, 1)
+        elif audio.shape[1] > 2:
+            audio = audio[:, :2, :]
         # Right-pad the last dimension up to the next multiple of 8 * 512.
         max_audio_len = audio.shape[-1]
         if max_audio_len % (8 * 512) != 0:
+            audio = torch.nn.functional.pad(
+                audio,
+                (0, 8 * 512 - max_audio_len % (8 * 512))
+            )
         mels = self.forward_mel(audio)
         # Rescale so that min_mel_value maps to 0 and max_mel_value maps to 1,
         # then apply self.transform.
+        mels = (mels - self.min_mel_value) / (
+            self.max_mel_value - self.min_mel_value
+        )
         mels = self.transform(mels)
         # Encode one item at a time (each with a batch dimension of 1) and
         # concatenate the results along dim 0.
         latents = []
         for mel in mels:
+            # ========================================================
+            # FINAL FIX:
+            # The encoder expects a mel with 2 channels.
+            # If mel arrives as 1 x 128 x T, convert it to 2 x 128 x T.
+            # ========================================================
+            if mel.dim() == 2:
+                mel = mel.unsqueeze(0)
+            if mel.shape[0] == 1:
+                mel = mel.repeat(2, 1, 1)
+            elif mel.shape[0] > 2:
+                mel = mel[:2]
             latent = self.dcae.encoder(mel.unsqueeze(0))
             latents.append(latent)
         latents = torch.cat(latents, dim=0)
         # Sample counts at `sr` -> counts at 44100 Hz -> divided by 512 and by
         # time_dimention_multiple, truncated by .long().
         # NOTE(review): 512 is presumably the mel hop size -- confirm.
+        latent_lengths = (
+            audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple
+        ).long()
         latents = (latents - self.shift_factor) * self.scale_factor
         return latents, latent_lengths
     @torch.no_grad()
         for latent in latents:
             mels = self.dcae.decoder(latent.unsqueeze(0))
             mels = mels * 0.5 + 0.5
+            mels = mels * (
+                self.max_mel_value - self.min_mel_value
+            ) + self.min_mel_value
             wav = self.vocoder.decode(mels[0]).squeeze(1)
             if sr is not None:
+                resampler = torchaudio.transforms.Resample(
+                    44100,
+                    sr
+                ).to(latents.device).to(latents.dtype)
                 wav = resampler(wav)
             else:
                 sr = 44100
             pred_wavs.append(wav)
         if audio_lengths is not None:
+            pred_wavs = [
+                wav[:, :length].cpu()
+                for wav, length in zip(pred_wavs, audio_lengths)
+            ]
         return sr, pred_wavs
     def forward(self, audios, audio_lengths=None, sr=None):
         # Round trip: encode `audios` to latents, then decode the latents
         # back to waveforms. Returns (sr, pred_wavs, latents, latent_lengths),
         # where `sr` is the value returned by decode.
+        latents, latent_lengths = self.encode(
+            audios=audios,
+            audio_lengths=audio_lengths,
+            sr=sr
+        )
+        sr, pred_wavs = self.decode(
+            latents=latents,
+            audio_lengths=audio_lengths,
+            sr=sr
+        )
         return sr, pred_wavs, latents, latent_lengths
 if __name__ == "__main__":
     # Manual smoke test: load test.wav, force it to two channels, run the
     # encode/decode round trip, print the resulting shapes and write the
     # reconstruction to disk.
     audio, sr = torchaudio.load("test.wav")
+    # FIX for local testing with mono audio
+    if audio.dim() == 1:
+        audio = audio.unsqueeze(0)
+    if audio.shape[0] == 1:
+        audio = audio.repeat(2, 1)
+    elif audio.shape[0] > 2:
+        audio = audio[:2]
     audio_lengths = torch.tensor([audio.shape[1]])
     # Add a leading batch dimension of size 1.
     audios = audio.unsqueeze(0)
     model = MusicDCAE()
     sr, pred_wavs, latents, latent_lengths = model(audios, audio_lengths, sr)
     print("reconstructed wavs: ", pred_wavs[0].shape)
     print("latents shape: ", latents.shape)
     print("latent_lengths: ", latent_lengths)
     print("sr: ", sr)
     torchaudio.save("test_reconstructed.flac", pred_wavs[0], sr)
+    print("test_reconstructed.flac")