Spaces:

overwrite69
/

tts69429585

Sleeping

Z User committed on Apr 13

Commit

a8f6633

1 Parent(s): 3777ed3

Replace torchaudio.load with soundfile.read (bypasses torchcodec requirement)

Files changed (2) hide show

app.py CHANGED Viewed

@@ -307,7 +307,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
                 non_silent_wave += non_silent_seg
             aseg = non_silent_wave
             aseg.export(f.name, format="wav")
-            final_wave, _ = torchaudio.load(f.name, backend="soundfile")
         final_wave = final_wave.squeeze().cpu().numpy()
     # Create a combined spectrogram
@@ -363,7 +368,12 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
         else:
             ref_text += ". "
-    audio, sr = torchaudio.load(ref_audio, backend="soundfile")
     max_chars = int((len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (20 - audio.shape[-1] / sr )))
     print(f"text: {max_chars} ")

                 non_silent_wave += non_silent_seg
             aseg = non_silent_wave
             aseg.export(f.name, format="wav")
+            final_wave_np, _ = sf.read(f.name)
+            final_wave = torch.from_numpy(final_wave_np).float()
+            if final_wave.dim() == 1:
+                final_wave = final_wave.unsqueeze(0)
+            else:
+                final_wave = final_wave.T
         final_wave = final_wave.squeeze().cpu().numpy()
     # Create a combined spectrogram
         else:
             ref_text += ". "
+    audio_np, sr = sf.read(ref_audio)
+    audio = torch.from_numpy(audio_np).float()
+    if audio.dim() == 1:
+        audio = audio.unsqueeze(0)
+    else:
+        audio = audio.T  # soundfile: (frames, channels) -> torchaudio: (channels, frames)
     max_chars = int((len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (20 - audio.shape[-1] / sr )))
     print(f"text: {max_chars} ")

infer/utils_infer.py CHANGED Viewed

@@ -16,6 +16,7 @@ import matplotlib.pylab as plt
 import numpy as np
 import torch
 import torchaudio
 import tqdm
 from huggingface_hub import hf_hub_download
 from pydub import AudioSegment, silence
@@ -330,7 +331,12 @@ def infer_process(
     fix_duration=fix_duration,
     device=device,
 ):
-    audio, sr = torchaudio.load(ref_audio, backend="soundfile")
     max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
     gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
     for i, gen_text in enumerate(gen_text_batches):

 import numpy as np
 import torch
 import torchaudio
+import soundfile as sf
 import tqdm
 from huggingface_hub import hf_hub_download
 from pydub import AudioSegment, silence
     fix_duration=fix_duration,
     device=device,
 ):
+    audio_np, sr = sf.read(ref_audio)
+    audio = torch.from_numpy(audio_np).float()
+    if audio.dim() == 1:
+        audio = audio.unsqueeze(0)
+    else:
+        audio = audio.T  # soundfile: (frames, channels) -> torchaudio: (channels, frames)
     max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
     gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
     for i, gen_text in enumerate(gen_text_batches):