Spaces:
Sleeping
Sleeping
Z User committed on
Commit ·
a8f6633
1
Parent(s): 3777ed3
Replace torchaudio.load with soundfile.read (bypasses torchcodec requirement)
Browse files
- app.py +12 -2
- infer/utils_infer.py +7 -1
app.py
CHANGED
|
@@ -307,7 +307,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
|
|
| 307 |
non_silent_wave += non_silent_seg
|
| 308 |
aseg = non_silent_wave
|
| 309 |
aseg.export(f.name, format="wav")
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
final_wave = final_wave.squeeze().cpu().numpy()
|
| 312 |
|
| 313 |
# Create a combined spectrogram
|
|
@@ -363,7 +368,12 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
|
|
| 363 |
else:
|
| 364 |
ref_text += ". "
|
| 365 |
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
max_chars = int((len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (20 - audio.shape[-1] / sr )))
|
| 369 |
print(f"text: {max_chars} ")
|
|
|
|
| 307 |
non_silent_wave += non_silent_seg
|
| 308 |
aseg = non_silent_wave
|
| 309 |
aseg.export(f.name, format="wav")
|
| 310 |
+
final_wave_np, _ = sf.read(f.name)
|
| 311 |
+
final_wave = torch.from_numpy(final_wave_np).float()
|
| 312 |
+
if final_wave.dim() == 1:
|
| 313 |
+
final_wave = final_wave.unsqueeze(0)
|
| 314 |
+
else:
|
| 315 |
+
final_wave = final_wave.T
|
| 316 |
final_wave = final_wave.squeeze().cpu().numpy()
|
| 317 |
|
| 318 |
# Create a combined spectrogram
|
|
|
|
| 368 |
else:
|
| 369 |
ref_text += ". "
|
| 370 |
|
| 371 |
+
audio_np, sr = sf.read(ref_audio)
|
| 372 |
+
audio = torch.from_numpy(audio_np).float()
|
| 373 |
+
if audio.dim() == 1:
|
| 374 |
+
audio = audio.unsqueeze(0)
|
| 375 |
+
else:
|
| 376 |
+
audio = audio.T # soundfile: (frames, channels) -> torchaudio: (channels, frames)
|
| 377 |
|
| 378 |
max_chars = int((len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (20 - audio.shape[-1] / sr )))
|
| 379 |
print(f"text: {max_chars} ")
|
infer/utils_infer.py
CHANGED
|
@@ -16,6 +16,7 @@ import matplotlib.pylab as plt
|
|
| 16 |
import numpy as np
|
| 17 |
import torch
|
| 18 |
import torchaudio
|
|
|
|
| 19 |
import tqdm
|
| 20 |
from huggingface_hub import hf_hub_download
|
| 21 |
from pydub import AudioSegment, silence
|
|
@@ -330,7 +331,12 @@ def infer_process(
|
|
| 330 |
fix_duration=fix_duration,
|
| 331 |
device=device,
|
| 332 |
):
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
|
| 335 |
gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
|
| 336 |
for i, gen_text in enumerate(gen_text_batches):
|
|
|
|
| 16 |
import numpy as np
|
| 17 |
import torch
|
| 18 |
import torchaudio
|
| 19 |
+
import soundfile as sf
|
| 20 |
import tqdm
|
| 21 |
from huggingface_hub import hf_hub_download
|
| 22 |
from pydub import AudioSegment, silence
|
|
|
|
| 331 |
fix_duration=fix_duration,
|
| 332 |
device=device,
|
| 333 |
):
|
| 334 |
+
audio_np, sr = sf.read(ref_audio)
|
| 335 |
+
audio = torch.from_numpy(audio_np).float()
|
| 336 |
+
if audio.dim() == 1:
|
| 337 |
+
audio = audio.unsqueeze(0)
|
| 338 |
+
else:
|
| 339 |
+
audio = audio.T # soundfile: (frames, channels) -> torchaudio: (channels, frames)
|
| 340 |
max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
|
| 341 |
gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
|
| 342 |
for i, gen_text in enumerate(gen_text_batches):
|