upgrade

- README.md +1 -1
- app.py +5 -6
- openvoice/api.py +45 -23
- openvoice/se_extractor.py +101 -88
- requirements.txt +4 -4
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🤗
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version:
+sdk_version: 5.33.0
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED

@@ -35,23 +35,22 @@ tone_color_converter = ToneColorConverter(
 )
 tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
 
+
 def predict(speaker_wav, transform_wav):
     # initialize a empty info
     text_hint = ""
 
     # extract source_se
     source_se = se_extractor.get_se(
         transform_wav,
         tone_color_converter,
-        vad=True,
     )
 
     # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
     try:
         target_se = se_extractor.get_se(
             speaker_wav,
             tone_color_converter,
-            vad=True,
         )
     except Exception as e:
         text_hint += f"[ERROR] Get target tone color error {str(e)} \n"

@@ -86,13 +85,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             ref_gr = gr.Audio(
                 label="Reference Audio",
-                info="Click on the ✎ button to upload your own target speaker audio",
+                # info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
                 value="examples/speaker0.mp3",
             )
             tra_gr = gr.Audio(
                 label="Transform Audio",
-                info="Click on the ✎ button to upload your own target transform audio",
+                # info="Click on the ✎ button to upload your own target transform audio",
                 type="filepath",
                 value=None,
             )
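Note on the app.py change: se_extractor.get_se is now called without vad=True (VAD became the only split path) and, per the se_extractor.py rewrite below, returns the embedding directly rather than an (se, audio_name) tuple. A minimal sketch of the updated call chain, assuming the checkpoint layout app.py already uses and a CPU device:

    from openvoice import se_extractor
    from openvoice.api import ToneColorConverter

    ckpt_converter = "checkpoints/converter"  # assumed layout, as in app.py
    tone_color_converter = ToneColorConverter(f"{ckpt_converter}/config.json", device="cpu")
    tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")

    # no vad= keyword anymore; silero VAD is applied unconditionally
    source_se = se_extractor.get_se("examples/speaker0.mp3", tone_color_converter)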
openvoice/api.py CHANGED

@@ -9,16 +9,14 @@ from openvoice.models import SynthesizerTrn
 
 
 class OpenVoiceBaseClass(object):
-    def __init__(self,
-                 config_path,
-                 device='cuda:0'):
-        if 'cuda' in device:
+    def __init__(self, config_path, device="cuda:0"):
+        if "cuda" in device:
             assert torch.cuda.is_available()
 
         hps = utils.get_hparams_from_file(config_path)
 
         model = SynthesizerTrn(
-            len(getattr(hps, 'symbols', [])),
+            len(getattr(hps, "symbols", [])),
             hps.data.filter_length // 2 + 1,
             n_speakers=hps.data.n_speakers,
             **hps.model,

@@ -31,32 +29,37 @@ class OpenVoiceBaseClass(object):
 
     def load_ckpt(self, ckpt_path):
         checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
-        a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
+        a, b = self.model.load_state_dict(checkpoint_dict["model"], strict=False)
         print("Loaded checkpoint '{}'".format(ckpt_path))
-        print('missing/unexpected keys:', a, b)
+        print("missing/unexpected keys:", a, b)
 
 
 class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.version = getattr(self.hps, '_version_', "v1")
+        self.version = getattr(self.hps, "_version_", "v1")
 
     def extract_se(self, ref_wav_list, se_save_path=None):
-        if isinstance(ref_wav_list, str):
-            ref_wav_list = [ref_wav_list]
+        # if isinstance(ref_wav_list, str):
+        #     ref_wav_list = [ref_wav_list]
 
         device = self.device
         hps = self.hps
        gs = []
 
         for fname in ref_wav_list:
             audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
             y = torch.FloatTensor(audio_ref)
             y = y.to(device)
             y = y.unsqueeze(0)
-            y = spectrogram_torch(y, hps.data.filter_length,
-                                  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                                  center=False).to(device)
+            y = spectrogram_torch(
+                y,
+                hps.data.filter_length,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                center=False,
+            ).to(device)
             with torch.no_grad():
                 g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
                 gs.append(g.detach())

@@ -68,23 +71,42 @@ class ToneColorConverter(OpenVoiceBaseClass):
 
         return gs
 
-    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3):
+    def convert(
+        self,
+        audio_src_path,
+        src_se,
+        tgt_se,
+        output_path=None,
+        tau=0.3,
+    ):
         hps = self.hps
         # load audio
         audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
         audio = torch.tensor(audio).float()
 
         with torch.no_grad():
             y = torch.FloatTensor(audio).to(self.device)
             y = y.unsqueeze(0)
-            spec = spectrogram_torch(y, hps.data.filter_length,
-                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                                     center=False).to(self.device)
+            spec = spectrogram_torch(
+                y,
+                hps.data.filter_length,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                center=False,
+            ).to(self.device)
             spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
-            audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se,
-                                                tau=tau)[0][0, 0].data.cpu().float().numpy()
+
+            audio = (
+                self.model.voice_conversion(
+                    spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau
+                )[0][0, 0]
+                .data.cpu()
+                .float()
+                .numpy()
+            )
+
             if output_path is None:
                 return audio
             else:
                 soundfile.write(output_path, audio, hps.data.sampling_rate)
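For orientation, extract_se and the reformatted convert fit together as below; this is a sketch assuming the converter checkpoint lives at the path shown, with hypothetical reference file names. One behavioral note: the isinstance() auto-wrap of a single string was commented out of extract_se, so callers should now pass a list of references.

    from openvoice.api import ToneColorConverter

    converter = ToneColorConverter("checkpoints/converter/config.json", device="cpu")
    converter.load_ckpt("checkpoints/converter/checkpoint.pth")

    # pass lists, not bare strings: the string auto-wrap is now commented out
    src_se = converter.extract_se(["source_ref.wav"])  # hypothetical file
    tgt_se = converter.extract_se(["target_ref.wav"])  # hypothetical file

    # tau=0.3 matches the default in the new signature
    converter.convert(
        "source_ref.wav", src_se, tgt_se, output_path="converted.wav", tau=0.3
    )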
openvoice/se_extractor.py CHANGED

@@ -7,7 +7,9 @@ import base64
 from glob import glob
 import numpy as np
 from pydub import AudioSegment
-from faster_whisper import WhisperModel
+from io import BytesIO
+
+# from faster_whisper import WhisperModel
 import hashlib
 import base64
 import librosa

@@ -16,65 +18,72 @@ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
-    global model
-    if model is None:
-        model = WhisperModel(model_size, device="cuda", compute_type="float16")
-    audio = AudioSegment.from_file(audio_path)
-    max_len = len(audio)
-
-    target_folder = os.path.join(target_dir, audio_name)
-
-    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
-    segments = list(segments)
-
-    # create directory
-    os.makedirs(target_folder, exist_ok=True)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-
-    # segments
-    s_ind = 0
-    start_time = None
-
-    for k, w in enumerate(segments):
-        # process with the time
-        if k == 0:
-            start_time = max(0, w.start)
-
-        end_time = w.end
-
-        # calculate confidence
-        if len(w.words) > 0:
-            confidence = sum([s.probability for s in w.words]) / len(w.words)
-        else:
-            confidence = 0.
-        # clean text
-        text = w.text.replace('...', '')
-
-        # left 0.08s for each audios
-        audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
-
-        # segment file name
-        fname = f"{audio_name}_seg{s_ind}.wav"
-
-        # filter out the segment shorter than 1.5s and longer than 20s
-        save = audio_seg.duration_seconds > 1.5 and \
-               audio_seg.duration_seconds < 20. and \
-               len(text) >= 2 and len(text) < 200
-
-        if save:
-            output_file = os.path.join(wavs_folder, fname)
-            audio_seg.export(output_file, format='wav')
-
-        if k < len(segments) - 1:
-            start_time = max(0, segments[k+1].start - 0.08)
-
-        s_ind = s_ind + 1
-    return wavs_folder
-
-def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
+
+
+# def split_audio_whisper(audio_path, audio_name, target_dir="processed"):
+#     global model
+#     if model is None:
+#         model = WhisperModel(model_size, device="cuda", compute_type="float16")
+#     audio = AudioSegment.from_file(audio_path)
+#     max_len = len(audio)
+
+#     target_folder = os.path.join(target_dir, audio_name)
+
+#     segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+#     segments = list(segments)
+
+#     # create directory
+#     os.makedirs(target_folder, exist_ok=True)
+#     wavs_folder = os.path.join(target_folder, "wavs")
+#     os.makedirs(wavs_folder, exist_ok=True)
+
+#     # segments
+#     s_ind = 0
+#     start_time = None
+
+#     for k, w in enumerate(segments):
+#         # process with the time
+#         if k == 0:
+#             start_time = max(0, w.start)
+
+#         end_time = w.end
+
+#         # calculate confidence
+#         if len(w.words) > 0:
+#             confidence = sum([s.probability for s in w.words]) / len(w.words)
+#         else:
+#             confidence = 0.0
+#         # clean text
+#         text = w.text.replace("...", "")
+
+#         # left 0.08s for each audios
+#         audio_seg = audio[
+#             int(start_time * 1000) : min(max_len, int(end_time * 1000) + 80)
+#         ]
+
+#         # segment file name
+#         fname = f"{audio_name}_seg{s_ind}.wav"
+
+#         # filter out the segment shorter than 1.5s and longer than 20s
+#         save = (
+#             audio_seg.duration_seconds > 1.5
+#             and audio_seg.duration_seconds < 20.0
+#             and len(text) >= 2
+#             and len(text) < 200
+#         )
+
+#         if save:
+#             output_file = os.path.join(wavs_folder, fname)
+#             audio_seg.export(output_file, format="wav")
+
+#         if k < len(segments) - 1:
+#             start_time = max(0, segments[k + 1].start - 0.08)
+
+#         s_ind = s_ind + 1
+#     return wavs_folder
+
+
+def split_audio_vad(audio_path, split_seconds=10.0):
     SAMPLE_RATE = 16000
     audio_vad = get_audio_tensor(audio_path)
     segments = get_vad_segments(

@@ -85,69 +94,73 @@ def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
         method="silero",
     )
     segments = [(seg["start"], seg["end"]) for seg in segments]
-    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
-    print(segments)
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
+    # print(segments)
     audio_active = AudioSegment.silent(duration=0)
     audio = AudioSegment.from_file(audio_path)
 
     for start_time, end_time in segments:
-        audio_active += audio[int(start_time * 1000):int(end_time * 1000)]
+        audio_active += audio[int(start_time * 1000) : int(end_time * 1000)]
+
     audio_dur = audio_active.duration_seconds
-    print(f'after vad: dur = {audio_dur}')
-
-    target_folder = os.path.join(target_dir, audio_name)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-    start_time = 0.
+    # print(f"after vad: dur = {audio_dur}")
+    start_time = 0.0
     count = 0
     num_splits = int(np.round(audio_dur / split_seconds))
-    assert num_splits > 0, 'input audio is too short'
+    assert num_splits > 0, "input audio is too short"
     interval = audio_dur / num_splits
 
+    wavs = []
     for i in range(num_splits):
         end_time = min(start_time + interval, audio_dur)
         if i == num_splits - 1:
             end_time = audio_dur
-        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
-        audio_seg = audio_active[int(start_time * 1000):int(end_time * 1000)]
-        audio_seg.export(output_file, format='wav')
+        # output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+
+        io = BytesIO()
+        audio_seg = audio_active[int(start_time * 1000) : int(end_time * 1000)]
+        audio_seg.export(io, format="wav")
+        io.seek(0)
+        wavs.append(io)
+
         start_time = end_time
         count += 1
-    return wavs_folder
-
-
-def hash_numpy_array(audio_path):
-    array, _ = librosa.load(audio_path, sr=None, mono=True)
-    # Convert the array to bytes
-    array_bytes = array.tobytes()
-    # Calculate the hash of the array bytes
-    hash_object = hashlib.sha256(array_bytes)
-    hash_value = hash_object.digest()
-    # Convert the hash value to base64
-    base64_value = base64.b64encode(hash_value)
-    return base64_value.decode("utf-8")[:16].replace("/", "_^")
-
-
-def get_se(audio_path, vc_model, target_dir='processed', vad=True):
-    device = vc_model.device
+
+    return wavs
+
+
+# def hash_numpy_array(audio_path):
+#     array, _ = librosa.load(audio_path, sr=None, mono=True)
+#     # Convert the array to bytes
+#     array_bytes = array.tobytes()
+#     # Calculate the hash of the array bytes
+#     hash_object = hashlib.sha256(array_bytes)
+#     hash_value = hash_object.digest()
+#     # Convert the hash value to base64
+#     base64_value = base64.b64encode(hash_value)
+#     return base64_value.decode("utf-8")[:16].replace("/", "_^")
+
+
+def get_se(audio_path, vc_model, target_dir="processed"):
+    # device = vc_model.device
     version = vc_model.version
     print("OpenVoice version:", version)
 
-    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
-    se_path = os.path.join(target_dir, audio_name, 'se.pth')
+    # audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
+    # se_path = os.path.join(target_dir, audio_name, "se.pth")
 
     # if os.path.isfile(se_path):
     #     se = torch.load(se_path).to(device)
     #     return se, audio_name
     # if os.path.isdir(audio_path):
    #     wavs_folder = audio_path
 
-    if vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
-    else:
-        wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
-
-    audio_segs = glob(f'{wavs_folder}/*.wav')
-    if len(audio_segs) == 0:
-        raise NotImplementedError('No audio segments found!')
-
-    return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
+    # if vad:
+    wavs = split_audio_vad(audio_path)
+
+    # else:
+    #     wavs_folder = split_audio_whisper(
+    #         audio_path, target_dir=target_dir, audio_name=audio_name
+    #     )
+
+    return vc_model.extract_se(wavs)
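The substantive change here is that segmentation now stays in memory: split_audio_vad returns a list of BytesIO wav handles instead of writing a wavs/ folder, and extract_se's librosa.load(fname, ...) consumes those handles directly, since librosa accepts file-like objects as well as paths. A self-contained sketch of that pattern, with a silent segment standing in for a real VAD slice:

    from io import BytesIO

    import librosa
    from pydub import AudioSegment

    seg = AudioSegment.silent(duration=2000)  # stand-in for a VAD-selected slice

    buf = BytesIO()
    seg.export(buf, format="wav")  # in-memory wav, as split_audio_vad now produces
    buf.seek(0)  # rewind so the reader starts at the RIFF header

    audio, sr = librosa.load(buf, sr=22050)  # file-like objects load like paths
    print(audio.shape, sr)  # (44100,) 22050 for a 2 s clip

This also drops the target_dir/audio_name bookkeeping and the glob over exported files, which is why the caching logic in get_se is commented out.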
requirements.txt CHANGED

@@ -1,8 +1,8 @@
-librosa==0.
-faster-whisper==0.9.0
+librosa==0.11.0
+#faster-whisper==0.9.0
 pydub==0.25.1
-numpy
+numpy
 whisper-timestamped==1.14.2
-openai
+#openai
 torch
 torchaudio