v2
Browse files
- OpenVoice/resources/framework.jpg +0 -0
- OpenVoice/resources/lepton.jpg +0 -0
- OpenVoice/resources/myshell.jpg +0 -0
- OpenVoice/resources/openvoicelogo.jpg +0 -0
- app_locally.py +2 -6
- openvoice/__init__.py +0 -0
- {OpenVoice → openvoice}/api.py +58 -8
- {OpenVoice → openvoice}/attentions.py +2 -3
- {OpenVoice → openvoice}/commons.py +0 -0
- {OpenVoice → openvoice}/mel_processing.py +0 -0
- {OpenVoice → openvoice}/models.py +8 -6
- {OpenVoice → openvoice}/modules.py +4 -4
- openvoice/openvoice_app.py +275 -0
- {OpenVoice → openvoice}/se_extractor.py +36 -21
- {OpenVoice → openvoice}/text/__init__.py +4 -3
- {OpenVoice → openvoice}/text/cleaners.py +2 -2
- {OpenVoice → openvoice}/text/english.py +0 -1
- {OpenVoice → openvoice}/text/mandarin.py +0 -0
- {OpenVoice → openvoice}/text/symbols.py +0 -0
- {OpenVoice → openvoice}/transforms.py +0 -0
- {OpenVoice → openvoice}/utils.py +1 -1
OpenVoice/resources/framework.jpg
DELETED
Binary file (151 kB)

OpenVoice/resources/lepton.jpg
DELETED
Binary file (1.9 kB)

OpenVoice/resources/myshell.jpg
DELETED
Binary file (1.7 kB)

OpenVoice/resources/openvoicelogo.jpg
DELETED
Binary file (155 kB)
app_locally.py
CHANGED

@@ -26,8 +26,8 @@ args = parser.parse_args()
 print("Starting OpenVoice")
 
 # Init EN/ZH baseTTS and ToneConvertor
-from OpenVoice import se_extractor
-from OpenVoice.api import BaseSpeakerTTS, ToneColorConverter
+from openvoice import se_extractor
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
 
 en_ckpt_base = "checkpoints/base_speakers/EN"
 zh_ckpt_base = "checkpoints/base_speakers/ZH"
@@ -62,8 +62,6 @@ def predict(prompt, speaker_wav, transform_wav):
         source_se, _ = se_extractor.get_se(
             speaker_wav,
             tone_color_converter,
-            target_dir="processed",
-            max_length=60.0,
             vad=True,
         )
     else:
@@ -102,8 +100,6 @@ def predict(prompt, speaker_wav, transform_wav):
         target_se, wavs_folder = se_extractor.get_se(
             speaker_wav,
             tone_color_converter,
-            target_dir="processed",
-            max_length=60.0,
            vad=True,
         )
         # os.system(f'rm -rf {wavs_folder}')
openvoice/__init__.py
ADDED
File without changes
{OpenVoice → openvoice}/api.py
RENAMED

@@ -2,14 +2,13 @@ import torch
 import numpy as np
 import re
 import soundfile
-
+from openvoice import utils
+from openvoice import commons
 import os
 import librosa
-from . import utils
-from . import commons
-from .text import text_to_sequence
-from .models import SynthesizerTrn
-from .mel_processing import spectrogram_torch
+from openvoice.text import text_to_sequence
+from openvoice.mel_processing import spectrogram_torch
+from openvoice.models import SynthesizerTrn
 
 
 class OpenVoiceBaseClass(object):
@@ -34,7 +33,7 @@ class OpenVoiceBaseClass(object):
         self.device = device
 
     def load_ckpt(self, ckpt_path):
-        checkpoint_dict = torch.load(ckpt_path, map_location=
+        checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
         a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
         print("Loaded checkpoint '{}'".format(ckpt_path))
         print('missing/unexpected keys:', a, b)
@@ -103,6 +102,15 @@ class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+        if kwargs.get('enable_watermark', True):
+            import wavmark
+            self.watermark_model = wavmark.load_model().to(self.device)
+        else:
+            self.watermark_model = None
+        self.version = getattr(self.hps, '_version_', "v1")
+
+
+
     def extract_se(self, ref_wav_list, se_save_path=None):
         if isinstance(ref_wav_list, str):
             ref_wav_list = [ref_wav_list]
@@ -130,7 +138,7 @@ class ToneColorConverter(OpenVoiceBaseClass):
 
         return gs
 
-    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3):
+    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
         hps = self.hps
         # load audio
         audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
@@ -145,8 +153,50 @@ class ToneColorConverter(OpenVoiceBaseClass):
         spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
         audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
             0, 0].data.cpu().float().numpy()
+        audio = self.add_watermark(audio, message)
         if output_path is None:
             return audio
         else:
             soundfile.write(output_path, audio, hps.data.sampling_rate)
 
+    def add_watermark(self, audio, message):
+        if self.watermark_model is None:
+            return audio
+        device = self.device
+        bits = utils.string_to_bits(message).reshape(-1)
+        n_repeat = len(bits) // 32
+
+        K = 16000
+        coeff = 2
+        for n in range(n_repeat):
+            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
+            if len(trunck) != K:
+                print('Audio too short, fail to add watermark')
+                break
+            message_npy = bits[n * 32: (n + 1) * 32]
+
+            with torch.no_grad():
+                signal = torch.FloatTensor(trunck).to(device)[None]
+                message_tensor = torch.FloatTensor(message_npy).to(device)[None]
+                signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
+                signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze()
+            audio[(coeff * n) * K: (coeff * n + 1) * K] = signal_wmd_npy
+        return audio
+
+    def detect_watermark(self, audio, n_repeat):
+        bits = []
+        K = 16000
+        coeff = 2
+        for n in range(n_repeat):
+            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
+            if len(trunck) != K:
+                print('Audio too short, fail to detect watermark')
+                return 'Fail'
+            with torch.no_grad():
+                signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
+                message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
+            bits.append(message_decoded_npy)
+        bits = np.stack(bits).reshape(-1, 8)
+        message = utils.bits_to_string(bits)
+        return message
+
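For orientation, here is a minimal sketch (not part of this commit) of how the new watermark hooks round-trip a message. It assumes the checkpoints/converter folder used elsewhere in this commit exists and that the wavmark package is installed, and it applies the watermark to a dummy array rather than real converted speech; in normal use convert() calls add_watermark internally.

    import numpy as np
    from openvoice import utils
    from openvoice.api import ToneColorConverter

    # enable_watermark defaults to True, so __init__ loads the wavmark model
    converter = ToneColorConverter('checkpoints/converter/config.json', device='cpu')

    message = "@MyShell"
    audio = np.random.randn(16000 * 4).astype(np.float32)  # 4 s of dummy audio, chunked in 16 k blocks
    marked = converter.add_watermark(audio.copy(), message)

    # detect_watermark needs the number of 32-bit chunks that add_watermark embedded
    n_repeat = len(utils.string_to_bits(message).reshape(-1)) // 32
    decoded = converter.detect_watermark(marked, n_repeat)
    print(decoded)  # should, in principle, recover "@MyShell" when the clip is long enough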
{OpenVoice → openvoice}/attentions.py
RENAMED

@@ -1,11 +1,10 @@
 import math
 import torch
-import logging
-
 from torch import nn
 from torch.nn import functional as F
-from . import commons
 
+from openvoice import commons
+import logging
 
 logger = logging.getLogger(__name__)
 
{OpenVoice → openvoice}/commons.py
RENAMED
File without changes

{OpenVoice → openvoice}/mel_processing.py
RENAMED
File without changes
{OpenVoice → openvoice}/models.py
RENAMED

@@ -3,14 +3,14 @@ import torch
 from torch import nn
 from torch.nn import functional as F
 
-from . import commons
-from . import modules
-from . import attentions
+from openvoice import commons
+from openvoice import modules
+from openvoice import attentions
 
 from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 
-from .commons import init_weights
+from openvoice.commons import init_weights, get_padding
 
 
 class TextEncoder(nn.Module):
@@ -420,6 +420,7 @@ class SynthesizerTrn(nn.Module):
         upsample_kernel_sizes,
         n_speakers=256,
         gin_channels=256,
+        zero_g=False,
         **kwargs
     ):
         super().__init__()
@@ -461,6 +462,7 @@ class SynthesizerTrn(nn.Module):
         self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
         self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
         self.emb_g = nn.Embedding(n_speakers, gin_channels)
+        self.zero_g = zero_g
 
     def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):
         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
@@ -490,8 +492,8 @@ class SynthesizerTrn(nn.Module):
     def voice_conversion(self, y, y_lengths, sid_src, sid_tgt, tau=1.0):
         g_src = sid_src
         g_tgt = sid_tgt
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src, tau=tau)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=tau)
         z_p = self.flow(z, y_mask, g=g_src)
         z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
-        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
+        o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt))
         return o_hat, y_mask, (z, z_p, z_hat)
{OpenVoice → openvoice}/modules.py
RENAMED

@@ -6,10 +6,10 @@ from torch.nn import functional as F
 from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm
 
-from . import commons
-from .commons import init_weights, get_padding
-from .transforms import piecewise_rational_quadratic_transform
-from .attentions import Encoder
+from openvoice import commons
+from openvoice.commons import init_weights, get_padding
+from openvoice.transforms import piecewise_rational_quadratic_transform
+from openvoice.attentions import Encoder
 
 LRELU_SLOPE = 0.1
 
openvoice/openvoice_app.py
ADDED

@@ -0,0 +1,275 @@
+import os
+import torch
+import argparse
+import gradio as gr
+from zipfile import ZipFile
+import langid
+from openvoice import se_extractor
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--share", action='store_true', default=False, help="make link public")
+args = parser.parse_args()
+
+en_ckpt_base = 'checkpoints/base_speakers/EN'
+zh_ckpt_base = 'checkpoints/base_speakers/ZH'
+ckpt_converter = 'checkpoints/converter'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
+
+# load models
+en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
+en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
+zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
+zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
+tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
+# load speaker embeddings
+en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
+en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
+zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
+
+# This online demo mainly supports English and Chinese
+supported_languages = ['zh', 'en']
+
+def predict(prompt, style, audio_file_pth, agree):
+    # initialize an empty info
+    text_hint = ''
+    # agree with the terms
+    if agree == False:
+        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
+        gr.Warning("Please accept the Terms & Condition!")
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    # first detect the input language
+    language_predicted = langid.classify(prompt)[0].strip()
+    print(f"Detected language:{language_predicted}")
+
+    if language_predicted not in supported_languages:
+        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
+        gr.Warning(
+            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
+        )
+
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    if language_predicted == "zh":
+        tts_model = zh_base_speaker_tts
+        source_se = zh_source_se
+        language = 'Chinese'
+        if style not in ['default']:
+            text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
+            gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
+            return (
+                text_hint,
+                None,
+                None,
+            )
+
+    else:
+        tts_model = en_base_speaker_tts
+        if style == 'default':
+            source_se = en_source_default_se
+        else:
+            source_se = en_source_style_se
+        language = 'English'
+        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
+            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
+            gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
+            return (
+                text_hint,
+                None,
+                None,
+            )
+
+    speaker_wav = audio_file_pth
+
+    if len(prompt) < 2:
+        text_hint += f"[ERROR] Please give a longer prompt text \n"
+        gr.Warning("Please give a longer prompt text")
+        return (
+            text_hint,
+            None,
+            None,
+        )
+    if len(prompt) > 200:
+        text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
+        gr.Warning(
+            "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
+    try:
+        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
+    except Exception as e:
+        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
+        gr.Warning(
+            "[ERROR] Get target tone color error {str(e)} \n"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    src_path = f'{output_dir}/tmp.wav'
+    tts_model.tts(prompt, src_path, speaker=style, language=language)
+
+    save_path = f'{output_dir}/output.wav'
+    # Run the tone color converter
+    encode_message = "@MyShell"
+    tone_color_converter.convert(
+        audio_src_path=src_path,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=save_path,
+        message=encode_message)
+
+    text_hint += f'''Get response successfully \n'''
+
+    return (
+        text_hint,
+        save_path,
+        speaker_wav,
+    )
+
+
+title = "MyShell OpenVoice"
+
+description = """
+We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
+"""
+
+markdown_table = """
+<div align="center" style="margin-bottom: 10px;">
+
+| | | |
+| :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | **Project Page** | **Join the Community** |
+| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [](https://discord.gg/myshell) |
+
+</div>
+"""
+
+markdown_table_v2 = """
+<div align="center" style="margin-bottom: 2px;">
+
+| | | | |
+| :-----------: | :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
+
+| | |
+| :-----------: | :-----------: |
+**Join the Community** | [](https://discord.gg/myshell) |
+
+</div>
+"""
+content = """
+<div>
+<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
+This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
+</div>
+"""
+wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
+
+
+examples = [
+    [
+        "今天天气真好，我们一起出去吃饭吧。",
+        'default',
+        "resources/demo_speaker1.mp3",
+        True,
+    ],[
+        "This audio is generated by open voice with a half-performance model.",
+        'whispering',
+        "resources/demo_speaker2.mp3",
+        True,
+    ],
+    [
+        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+        'sad',
+        "resources/demo_speaker0.mp3",
+        True,
+    ],
+]
+
+with gr.Blocks(analytics_enabled=False) as demo:
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Row():
+                gr.Markdown(
+                    """
+                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
+                    """
+                )
+            with gr.Row():
+                gr.Markdown(markdown_table_v2)
+            with gr.Row():
+                gr.Markdown(description)
+        with gr.Column():
+            gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
+
+    with gr.Row():
+        gr.HTML(wrapped_markdown_content)
+
+    with gr.Row():
+        with gr.Column():
+            input_text_gr = gr.Textbox(
+                label="Text Prompt",
+                info="One or two sentences at a time is better. Up to 200 text characters.",
+                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+            )
+            style_gr = gr.Dropdown(
+                label="Style",
+                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
+                choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
+                max_choices=1,
+                value="default",
+            )
+            ref_gr = gr.Audio(
+                label="Reference Audio",
+                info="Click on the ✎ button to upload your own target speaker audio",
+                type="filepath",
+                value="resources/demo_speaker2.mp3",
+            )
+            tos_gr = gr.Checkbox(
+                label="Agree",
+                value=False,
+                info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
+            )
+
+            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+
+
+        with gr.Column():
+            out_text_gr = gr.Text(label="Info")
+            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+            ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+            gr.Examples(examples,
+                        label="Examples",
+                        inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
+                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
+                        fn=predict,
+                        cache_examples=False,)
+            tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
+
+demo.queue()
+demo.launch(debug=True, show_api=True, share=args.share)
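The Gradio UI above simply wires predict() to the model objects created at import time. A condensed sketch of the same TTS-then-convert pipeline without the UI (the prompt and paths are illustrative, assuming the same checkpoints/ layout and a local reference clip such as resources/demo_speaker2.mp3):

    import os
    import torch
    from openvoice import se_extractor
    from openvoice.api import BaseSpeakerTTS, ToneColorConverter

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs('outputs', exist_ok=True)

    tts = BaseSpeakerTTS('checkpoints/base_speakers/EN/config.json', device=device)
    tts.load_ckpt('checkpoints/base_speakers/EN/checkpoint.pth')
    converter = ToneColorConverter('checkpoints/converter/config.json', device=device)
    converter.load_ckpt('checkpoints/converter/checkpoint.pth')

    source_se = torch.load('checkpoints/base_speakers/EN/en_default_se.pth').to(device)
    target_se, _ = se_extractor.get_se('resources/demo_speaker2.mp3', converter,
                                       target_dir='processed', vad=True)

    # base speech in the source voice, then tone-color conversion to the reference speaker
    tts.tts("This is a quick local test.", 'outputs/tmp.wav', speaker='default', language='English')
    converter.convert(audio_src_path='outputs/tmp.wav', src_se=source_se, tgt_se=target_se,
                      output_path='outputs/output.wav', message="@MyShell")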
{OpenVoice → openvoice}/se_extractor.py
RENAMED

@@ -1,26 +1,28 @@
 import os
 import glob
 import torch
+import hashlib
+import librosa
+import base64
 from glob import glob
 import numpy as np
 from pydub import AudioSegment
 from faster_whisper import WhisperModel
+import hashlib
+import base64
+import librosa
 from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, device='cuda', target_dir='processed'):
+def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
     global model
     if model is None:
-
-        model = WhisperModel(model_size, device=device)
-    else:
-        model = WhisperModel(model_size, device=device, compute_type="float16")
+        model = WhisperModel(model_size, device="cuda", compute_type="float16")
     audio = AudioSegment.from_file(audio_path)
     max_len = len(audio)
 
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
 
     segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
@@ -72,9 +74,9 @@ def split_audio_whisper(audio_path, device='cuda', target_dir='processed'):
     return wavs_folder
 
 
-def split_audio_vad(audio_path, target_dir, split_seconds=10.0, max_length=60.):
+def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
     SAMPLE_RATE = 16000
-    audio_vad = get_audio_tensor(audio_path)
+    audio_vad = get_audio_tensor(audio_path)
     segments = get_vad_segments(
         audio_vad,
         output_sample=True,
@@ -93,7 +95,6 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0, max_length=60.):
 
     audio_dur = audio_active.duration_seconds
     print(f'after vad: dur = {audio_dur}')
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
     wavs_folder = os.path.join(target_folder, 'wavs')
     os.makedirs(wavs_folder, exist_ok=True)
@@ -114,25 +115,39 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0, max_length=60.):
         count += 1
     return wavs_folder
 
-def get_se(audio_path, vc_model, target_dir='processed', max_length=60., vad=True):
+def hash_numpy_array(audio_path):
+    array, _ = librosa.load(audio_path, sr=None, mono=True)
+    # Convert the array to bytes
+    array_bytes = array.tobytes()
+    # Calculate the hash of the array bytes
+    hash_object = hashlib.sha256(array_bytes)
+    hash_value = hash_object.digest()
+    # Convert the hash value to base64
+    base64_value = base64.b64encode(hash_value)
+    return base64_value.decode('utf-8')[:16].replace('/', '_^')
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     device = vc_model.device
+    version = vc_model.version
+    print("OpenVoice version:", version)
 
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
+    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
    se_path = os.path.join(target_dir, audio_name, 'se.pth')
 
-    if os.path.isfile(se_path):
-        se = torch.load(se_path).to(device)
-        return se, audio_name
-    if os.path.isdir(audio_path):
-        wavs_folder = audio_path
-    elif vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir, max_length=max_length)
+    # if os.path.isfile(se_path):
+    #     se = torch.load(se_path).to(device)
+    #     return se, audio_name
+    # if os.path.isdir(audio_path):
+    #     wavs_folder = audio_path
+
+    if vad:
+        wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
     else:
-        wavs_folder = split_audio_whisper(audio_path,
+        wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
 
     audio_segs = glob(f'{wavs_folder}/*.wav')
-
     if len(audio_segs) == 0:
         raise NotImplementedError('No audio segments found!')
 
-    return vc_model.extract_se(audio_segs, se_save_path=se_path),
+    return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
+
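The practical effect of the new naming scheme is that cached segments and embeddings are keyed by the converter version and the audio content, not just the filename. A small sketch of the cache path get_se now uses (the mp3 path is illustrative):

    import os
    from openvoice.se_extractor import hash_numpy_array

    audio_path = 'resources/demo_speaker2.mp3'
    version = 'v1'  # in get_se this comes from vc_model.version
    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
    # extract_se writes the tone-color embedding here via se_save_path
    print(os.path.join('processed', audio_name, 'se.pth'))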
{OpenVoice → openvoice}/text/__init__.py
RENAMED

@@ -1,6 +1,7 @@
 """ from https://github.com/keithito/tacotron """
-from . import cleaners
-from .symbols import symbols
+from openvoice.text import cleaners
+from openvoice.text.symbols import symbols
+
 
 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
@@ -42,7 +43,7 @@ def cleaned_text_to_sequence(cleaned_text, symbols):
 
 
 
-from .symbols import language_tone_start_map
+from openvoice.text.symbols import language_tone_start_map
 def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
     Args:
{OpenVoice → openvoice}/text/cleaners.py
RENAMED

@@ -1,6 +1,6 @@
 import re
-from .english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
-from .mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
+from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
+from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
 
 def cjke_cleaners2(text):
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
{OpenVoice → openvoice}/text/english.py
RENAMED

@@ -20,7 +20,6 @@ import re
 import inflect
 from unidecode import unidecode
 import eng_to_ipa as ipa
-
 _inflect = inflect.engine()
 _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
 _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
{OpenVoice → openvoice}/text/mandarin.py
RENAMED
File without changes

{OpenVoice → openvoice}/text/symbols.py
RENAMED
File without changes

{OpenVoice → openvoice}/transforms.py
RENAMED
File without changes
{OpenVoice → openvoice}/utils.py
RENAMED

@@ -128,7 +128,7 @@ def merge_short_sentences_latin(sens):
     """
     sens_out = []
     for s in sens:
-        # If the previous
+        # If the previous sentence is too short, merge them with
         # the current sentence.
         if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
             sens_out[-1] = sens_out[-1] + " " + s