Spaces: Running on CPU Upgrade
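Fix vc (voice conversion) bugs: normalize integer input audio to float32, downmix multi-channel audio to mono, and resample to the model's sampling rate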
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
| 2 |
|
| 3 |
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
|
| 4 |
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import torch
|
| 7 |
from torch import no_grad, LongTensor
|
|
@@ -34,9 +35,13 @@ def tts_fn(text, speaker_id):
|
|
| 34 |
|
| 35 |
def vc_fn(original_speaker_id, target_speaker_id, input_audio):
|
| 36 |
sampling_rate, audio = input_audio
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
y = y.unsqueeze(0)
|
| 39 |
-
|
| 40 |
spec = spectrogram_torch(y, hps.data.filter_length,
|
| 41 |
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
| 42 |
center=False)
|
|
|
|
| 2 |
|
| 3 |
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
|
| 4 |
|
| 5 |
+
import librosa
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
| 8 |
from torch import no_grad, LongTensor
|
|
|
|
| 35 |
|
| 36 |
def vc_fn(original_speaker_id, target_speaker_id, input_audio):
|
| 37 |
sampling_rate, audio = input_audio
|
| 38 |
+
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
| 39 |
+
if len(audio.shape) > 1:
|
| 40 |
+
audio = librosa.to_mono(audio.transpose(1, 0))
|
| 41 |
+
if sampling_rate != hps.data.sampling_rate:
|
| 42 |
+
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
|
| 43 |
+
y = torch.FloatTensor(audio)
|
| 44 |
y = y.unsqueeze(0)
|
|
|
|
| 45 |
spec = spectrogram_torch(y, hps.data.filter_length,
|
| 46 |
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
| 47 |
center=False)
|