Z User committed on
Commit
a8f6633
·
1 Parent(s): 3777ed3

Replace torchaudio.load with soundfile.read (bypasses torchcodec requirement)

Browse files
Files changed (2) hide show
  1. app.py +12 -2
  2. infer/utils_infer.py +7 -1
app.py CHANGED
@@ -307,7 +307,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
307
  non_silent_wave += non_silent_seg
308
  aseg = non_silent_wave
309
  aseg.export(f.name, format="wav")
310
- final_wave, _ = torchaudio.load(f.name, backend="soundfile")
 
 
 
 
 
311
  final_wave = final_wave.squeeze().cpu().numpy()
312
 
313
  # Create a combined spectrogram
@@ -363,7 +368,12 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
363
  else:
364
  ref_text += ". "
365
 
366
- audio, sr = torchaudio.load(ref_audio, backend="soundfile")
 
 
 
 
 
367
 
368
  max_chars = int((len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (20 - audio.shape[-1] / sr )))
369
  print(f"text: {max_chars} ")
 
307
  non_silent_wave += non_silent_seg
308
  aseg = non_silent_wave
309
  aseg.export(f.name, format="wav")
310
+ final_wave_np, _ = sf.read(f.name)
311
+ final_wave = torch.from_numpy(final_wave_np).float()
312
+ if final_wave.dim() == 1:
313
+ final_wave = final_wave.unsqueeze(0)
314
+ else:
315
+ final_wave = final_wave.T
316
  final_wave = final_wave.squeeze().cpu().numpy()
317
 
318
  # Create a combined spectrogram
 
368
  else:
369
  ref_text += ". "
370
 
371
+ audio_np, sr = sf.read(ref_audio)
372
+ audio = torch.from_numpy(audio_np).float()
373
+ if audio.dim() == 1:
374
+ audio = audio.unsqueeze(0)
375
+ else:
376
+ audio = audio.T # soundfile: (frames, channels) -> torchaudio: (channels, frames)
377
 
378
  max_chars = int((len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (20 - audio.shape[-1] / sr )))
379
  print(f"text: {max_chars} ")
infer/utils_infer.py CHANGED
@@ -16,6 +16,7 @@ import matplotlib.pylab as plt
16
  import numpy as np
17
  import torch
18
  import torchaudio
 
19
  import tqdm
20
  from huggingface_hub import hf_hub_download
21
  from pydub import AudioSegment, silence
@@ -330,7 +331,12 @@ def infer_process(
330
  fix_duration=fix_duration,
331
  device=device,
332
  ):
333
- audio, sr = torchaudio.load(ref_audio, backend="soundfile")
 
 
 
 
 
334
  max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
335
  gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
336
  for i, gen_text in enumerate(gen_text_batches):
 
16
  import numpy as np
17
  import torch
18
  import torchaudio
19
+ import soundfile as sf
20
  import tqdm
21
  from huggingface_hub import hf_hub_download
22
  from pydub import AudioSegment, silence
 
331
  fix_duration=fix_duration,
332
  device=device,
333
  ):
334
+ audio_np, sr = sf.read(ref_audio)
335
+ audio = torch.from_numpy(audio_np).float()
336
+ if audio.dim() == 1:
337
+ audio = audio.unsqueeze(0)
338
+ else:
339
+ audio = audio.T # soundfile: (frames, channels) -> torchaudio: (channels, frames)
340
  max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
341
  gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
342
  for i, gen_text in enumerate(gen_text_batches):