Spaces:
Runtime error
Runtime error
Commit
·
2338c72
1
Parent(s):
d53a9dd
fix
Browse files
- app.py +9 -1
- diffrhythm/infer/infer.py +3 -1
- diffrhythm/infer/infer_utils.py +38 -0
- diffrhythm/model/cfm.py +6 -0
app.py
CHANGED
|
@@ -42,8 +42,15 @@ def infer_music(lrc, ref_audio_path, steps, file_type, cfg_strength, odeint_meth
|
|
| 42 |
|
| 43 |
max_frames = math.floor(duration * 21.56)
|
| 44 |
sway_sampling_coef = -1 if steps < 32 else None
|
|
|
|
| 45 |
lrc_prompt, start_time = get_lrc_token(max_frames, lrc, tokenizer, device)
|
| 46 |
-
style_prompt = get_style_prompt(muq, ref_audio_path, prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
negative_style_prompt = get_negative_style_prompt(device)
|
| 48 |
latent_prompt = get_reference_latent(device, max_frames)
|
| 49 |
print(">0")
|
|
@@ -59,6 +66,7 @@ def infer_music(lrc, ref_audio_path, steps, file_type, cfg_strength, odeint_meth
|
|
| 59 |
sway_sampling_coef=sway_sampling_coef,
|
| 60 |
start_time=start_time,
|
| 61 |
file_type=file_type,
|
|
|
|
| 62 |
odeint_method=odeint_method,
|
| 63 |
)
|
| 64 |
devicetorch.empty_cache(torch)
|
|
|
|
| 42 |
|
| 43 |
max_frames = math.floor(duration * 21.56)
|
| 44 |
sway_sampling_coef = -1 if steps < 32 else None
|
| 45 |
+
vocal_flag = False
|
| 46 |
lrc_prompt, start_time = get_lrc_token(max_frames, lrc, tokenizer, device)
|
| 47 |
+
# style_prompt = get_style_prompt(muq, ref_audio_path, prompt)
|
| 48 |
+
|
| 49 |
+
if prompt is not None:
|
| 50 |
+
style_prompt = get_text_style_prompt(muq, text_prompt)
|
| 51 |
+
else:
|
| 52 |
+
style_prompt, vocal_flag = get_audio_style_prompt(muq, ref_audio_path)
|
| 53 |
+
|
| 54 |
negative_style_prompt = get_negative_style_prompt(device)
|
| 55 |
latent_prompt = get_reference_latent(device, max_frames)
|
| 56 |
print(">0")
|
|
|
|
| 66 |
sway_sampling_coef=sway_sampling_coef,
|
| 67 |
start_time=start_time,
|
| 68 |
file_type=file_type,
|
| 69 |
+
vocal_flag=vocal_flag,
|
| 70 |
odeint_method=odeint_method,
|
| 71 |
)
|
| 72 |
devicetorch.empty_cache(torch)
|
diffrhythm/infer/infer.py
CHANGED
|
@@ -16,6 +16,7 @@ from diffrhythm.infer.infer_utils import (
|
|
| 16 |
get_reference_latent,
|
| 17 |
get_lrc_token,
|
| 18 |
get_style_prompt,
|
|
|
|
| 19 |
prepare_model,
|
| 20 |
get_negative_style_prompt
|
| 21 |
)
|
|
@@ -75,7 +76,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
|
|
| 75 |
y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
|
| 76 |
return y_final
|
| 77 |
|
| 78 |
-
def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, cfg_strength, sway_sampling_coef, start_time, file_type, odeint_method):
|
| 79 |
|
| 80 |
with torch.inference_mode():
|
| 81 |
print(">1")
|
|
@@ -89,6 +90,7 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
|
|
| 89 |
cfg_strength=cfg_strength,
|
| 90 |
sway_sampling_coef=sway_sampling_coef,
|
| 91 |
start_time=start_time,
|
|
|
|
| 92 |
odeint_method=odeint_method,
|
| 93 |
)
|
| 94 |
if torch.cuda.is_available():
|
|
|
|
| 16 |
get_reference_latent,
|
| 17 |
get_lrc_token,
|
| 18 |
get_style_prompt,
|
| 19 |
+
get_audio_style_prompt,
|
| 20 |
prepare_model,
|
| 21 |
get_negative_style_prompt
|
| 22 |
)
|
|
|
|
| 76 |
y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
|
| 77 |
return y_final
|
| 78 |
|
| 79 |
+
def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, cfg_strength, sway_sampling_coef, start_time, file_type, vocal_flag, odeint_method):
|
| 80 |
|
| 81 |
with torch.inference_mode():
|
| 82 |
print(">1")
|
|
|
|
| 90 |
cfg_strength=cfg_strength,
|
| 91 |
sway_sampling_coef=sway_sampling_coef,
|
| 92 |
start_time=start_time,
|
| 93 |
+
vocal_flag=vocal_flag,
|
| 94 |
odeint_method=odeint_method,
|
| 95 |
)
|
| 96 |
if torch.cuda.is_available():
|
diffrhythm/infer/infer_utils.py
CHANGED
|
@@ -52,6 +52,41 @@ def get_negative_style_prompt(device):
|
|
| 52 |
|
| 53 |
return vocal_stlye
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
@torch.no_grad()
|
| 56 |
def get_style_prompt(model, wav_path, prompt):
|
| 57 |
mulan = model
|
|
@@ -129,6 +164,9 @@ def get_lrc_token(max_frames, text, tokenizer, device):
|
|
| 129 |
comma_token_id = 1
|
| 130 |
period_token_id = 2
|
| 131 |
|
|
|
|
|
|
|
|
|
|
| 132 |
lrc_with_time = parse_lyrics(text)
|
| 133 |
|
| 134 |
modified_lrc_with_time = []
|
|
|
|
| 52 |
|
| 53 |
return vocal_stlye
|
| 54 |
|
| 55 |
+
|
| 56 |
+
def get_audio_style_prompt(model, wav_path):
    """Embed a reference audio clip into a style vector with the MuLan model.

    Loads the file at 24 kHz, excerpts at most a 10-second window centred in
    the clip, and encodes it with ``model(wavs=...)``.

    Args:
        model: MuLan-style encoder; must be callable as ``model(wavs=tensor)``
            and expose a ``.device`` attribute.
        wav_path: path to the reference audio file.

    Returns:
        Tuple ``(audio_emb, vocal_flag)``: the half-precision embedding
        ([1, 512] per the upstream comment) and a flag that is True when the
        clip lasts one second or less (too short to carry style information).
    """
    mulan = model
    audio, _ = librosa.load(wav_path, sr=24000)
    duration_s = librosa.get_duration(y=audio, sr=24000)

    # Very short clips are flagged so the caller can fall back to a
    # different style prompt.
    vocal_flag = duration_s <= 1

    if duration_s > 10:
        # Take a 10-second window centred in the clip.
        excerpt_start = int(duration_s // 2 - 5)
        excerpt = audio[excerpt_start * 24000:(excerpt_start + 10) * 24000]
    else:
        excerpt = audio

    wav = torch.tensor(excerpt).unsqueeze(0).to(model.device)
    with torch.no_grad():
        audio_emb = mulan(wavs=wav)  # [1, 512]

    return audio_emb.half(), vocal_flag
|
| 79 |
+
|
| 80 |
+
def get_text_style_prompt(model, text_prompt):
    """Embed a free-text style description with the MuLan model.

    Args:
        model: MuLan-style encoder; must be callable as ``model(texts=...)``.
        text_prompt: textual style description.

    Returns:
        Half-precision text embedding ([1, 512] per the upstream comment).
    """
    mulan = model
    with torch.no_grad():
        embedding = mulan(texts=text_prompt)  # [1, 512]
    return embedding.half()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
@torch.no_grad()
|
| 91 |
def get_style_prompt(model, wav_path, prompt):
|
| 92 |
mulan = model
|
|
|
|
| 164 |
comma_token_id = 1
|
| 165 |
period_token_id = 2
|
| 166 |
|
| 167 |
+
if text == "":
|
| 168 |
+
return torch.zeros((max_frames,), dtype=torch.long).unsqueeze(0).to(device), torch.tensor(0.).unsqueeze(0).to(device).half()
|
| 169 |
+
|
| 170 |
lrc_with_time = parse_lyrics(text)
|
| 171 |
|
| 172 |
modified_lrc_with_time = []
|
diffrhythm/model/cfm.py
CHANGED
|
@@ -121,6 +121,7 @@ class CFM(nn.Module):
|
|
| 121 |
start_time=None,
|
| 122 |
latent_pred_start_frame=0,
|
| 123 |
latent_pred_end_frame=2048,
|
|
|
|
| 124 |
odeint_method="euler"
|
| 125 |
):
|
| 126 |
self.eval()
|
|
@@ -199,6 +200,11 @@ class CFM(nn.Module):
|
|
| 199 |
start_time_embed, positive_text_embed, positive_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=False, start_time=start_time)
|
| 200 |
_, negative_text_embed, negative_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=True, start_time=start_time)
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
text_embed = torch.cat([positive_text_embed, negative_text_embed], 0)
|
| 203 |
text_residuals = [torch.cat([a, b], 0) for a, b in zip(positive_text_residuals, negative_text_residuals)]
|
| 204 |
step_cond = torch.cat([step_cond, step_cond], 0)
|
|
|
|
| 121 |
start_time=None,
|
| 122 |
latent_pred_start_frame=0,
|
| 123 |
latent_pred_end_frame=2048,
|
| 124 |
+
vocal_flag=False,
|
| 125 |
odeint_method="euler"
|
| 126 |
):
|
| 127 |
self.eval()
|
|
|
|
| 200 |
start_time_embed, positive_text_embed, positive_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=False, start_time=start_time)
|
| 201 |
_, negative_text_embed, negative_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=True, start_time=start_time)
|
| 202 |
|
| 203 |
+
if vocal_flag:
|
| 204 |
+
style_prompt = negative_style_prompt
|
| 205 |
+
negative_style_prompt = torch.zeros_like(style_prompt)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
text_embed = torch.cat([positive_text_embed, negative_text_embed], 0)
|
| 209 |
text_residuals = [torch.cat([a, b], 0) for a, b in zip(positive_text_residuals, negative_text_residuals)]
|
| 210 |
step_cond = torch.cat([step_cond, step_cond], 0)
|