Chuatury committed
Commit d05dcd7 · unverified · 1 Parent(s): 2ef4c49
OpenVoice/resources/framework.jpg DELETED
Binary file (151 kB)
 
OpenVoice/resources/lepton.jpg DELETED
Binary file (1.9 kB)
 
OpenVoice/resources/myshell.jpg DELETED
Binary file (1.7 kB)
 
OpenVoice/resources/openvoicelogo.jpg DELETED
Binary file (155 kB)
 
app_locally.py CHANGED
@@ -26,8 +26,8 @@ args = parser.parse_args()
 print("Starting OpenVoice")
 
 # Init EN/ZH baseTTS and ToneConvertor
-from OpenVoice import se_extractor
-from OpenVoice.api import BaseSpeakerTTS, ToneColorConverter
+from openvoice import se_extractor
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
 
 en_ckpt_base = "checkpoints/base_speakers/EN"
 zh_ckpt_base = "checkpoints/base_speakers/ZH"
@@ -62,8 +62,6 @@ def predict(prompt, speaker_wav, transform_wav):
         source_se, _ = se_extractor.get_se(
             speaker_wav,
             tone_color_converter,
-            target_dir="processed",
-            max_length=60.0,
             vad=True,
         )
     else:
@@ -102,8 +100,6 @@ def predict(prompt, speaker_wav, transform_wav):
         target_se, wavs_folder = se_extractor.get_se(
            speaker_wav,
            tone_color_converter,
-           target_dir="processed",
-           max_length=60.0,
            vad=True,
        )
        # os.system(f'rm -rf {wavs_folder}')
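Note: the hunks above only swap the old `OpenVoice` package name for the new lowercase `openvoice` package and drop the `target_dir`/`max_length` arguments that no longer exist on `se_extractor.get_se`. For reference, a minimal sketch of the updated call after this commit (the checkpoint paths and reference clip are illustrative, not part of the diff):

    # Sketch only: extract a speaker embedding with the renamed package.
    # Assumes the converter checkpoints are available under checkpoints/converter.
    import torch
    from openvoice import se_extractor
    from openvoice.api import ToneColorConverter

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tone_color_converter = ToneColorConverter('checkpoints/converter/config.json', device=device)
    tone_color_converter.load_ckpt('checkpoints/converter/checkpoint.pth')

    # target_dir still defaults to 'processed'; max_length has been removed.
    target_se, audio_name = se_extractor.get_se('reference.mp3', tone_color_converter, vad=True)
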
openvoice/__init__.py ADDED
File without changes
{OpenVoice → openvoice}/api.py RENAMED
@@ -2,14 +2,13 @@ import torch
 import numpy as np
 import re
 import soundfile
-
+from openvoice import utils
+from openvoice import commons
 import os
 import librosa
-from . import utils
-from . import commons
-from .text import text_to_sequence
-from .models import SynthesizerTrn
-from .mel_processing import spectrogram_torch
+from openvoice.text import text_to_sequence
+from openvoice.mel_processing import spectrogram_torch
+from openvoice.models import SynthesizerTrn
 
 
 class OpenVoiceBaseClass(object):
@@ -34,7 +33,7 @@ class OpenVoiceBaseClass(object):
         self.device = device
 
     def load_ckpt(self, ckpt_path):
-        checkpoint_dict = torch.load(ckpt_path, map_location='cpu')
+        checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
         a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
         print("Loaded checkpoint '{}'".format(ckpt_path))
         print('missing/unexpected keys:', a, b)
@@ -103,6 +102,15 @@ class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+        if kwargs.get('enable_watermark', True):
+            import wavmark
+            self.watermark_model = wavmark.load_model().to(self.device)
+        else:
+            self.watermark_model = None
+        self.version = getattr(self.hps, '_version_', "v1")
+
+
+
     def extract_se(self, ref_wav_list, se_save_path=None):
         if isinstance(ref_wav_list, str):
             ref_wav_list = [ref_wav_list]
@@ -130,7 +138,7 @@ class ToneColorConverter(OpenVoiceBaseClass):
 
         return gs
 
-    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3):
+    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
         hps = self.hps
         # load audio
         audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
@@ -145,8 +153,50 @@ class ToneColorConverter(OpenVoiceBaseClass):
         spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
         audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
             0, 0].data.cpu().float().numpy()
+        audio = self.add_watermark(audio, message)
         if output_path is None:
             return audio
         else:
             soundfile.write(output_path, audio, hps.data.sampling_rate)
 
+    def add_watermark(self, audio, message):
+        if self.watermark_model is None:
+            return audio
+        device = self.device
+        bits = utils.string_to_bits(message).reshape(-1)
+        n_repeat = len(bits) // 32
+
+        K = 16000
+        coeff = 2
+        for n in range(n_repeat):
+            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
+            if len(trunck) != K:
+                print('Audio too short, fail to add watermark')
+                break
+            message_npy = bits[n * 32: (n + 1) * 32]
+
+            with torch.no_grad():
+                signal = torch.FloatTensor(trunck).to(device)[None]
+                message_tensor = torch.FloatTensor(message_npy).to(device)[None]
+                signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
+                signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze()
+            audio[(coeff * n) * K: (coeff * n + 1) * K] = signal_wmd_npy
+        return audio
+
+    def detect_watermark(self, audio, n_repeat):
+        bits = []
+        K = 16000
+        coeff = 2
+        for n in range(n_repeat):
+            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
+            if len(trunck) != K:
+                print('Audio too short, fail to detect watermark')
+                return 'Fail'
+            with torch.no_grad():
+                signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
+                message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
+            bits.append(message_decoded_npy)
+        bits = np.stack(bits).reshape(-1, 8)
+        message = utils.bits_to_string(bits)
+        return message
+
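The new `add_watermark`/`detect_watermark` methods embed the `message` passed to `convert` into the output audio in 32-bit chunks via wavmark. A minimal round-trip sketch, assuming wavmark is installed, the converter is loaded as in the sketch above, and `source_se`/`target_se` are speaker embeddings obtained as in `app_locally.py` (file paths illustrative):

    import librosa
    from openvoice import utils

    message = "@MyShell"
    tone_color_converter.convert(audio_src_path='tmp.wav', src_se=source_se, tgt_se=target_se,
                                 output_path='output.wav', message=message)

    # detect_watermark needs the number of 32-bit chunks that were embedded.
    n_repeat = len(utils.string_to_bits(message).reshape(-1)) // 32
    audio, _ = librosa.load('output.wav', sr=tone_color_converter.hps.data.sampling_rate)
    print(tone_color_converter.detect_watermark(audio, n_repeat))  # "@MyShell" if the clip is long enough
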
{OpenVoice → openvoice}/attentions.py RENAMED
@@ -1,11 +1,10 @@
 import math
 import torch
-import logging
-
 from torch import nn
 from torch.nn import functional as F
-from . import commons
 
+from openvoice import commons
+import logging
 
 logger = logging.getLogger(__name__)
 
{OpenVoice → openvoice}/commons.py RENAMED
File without changes
{OpenVoice → openvoice}/mel_processing.py RENAMED
File without changes
{OpenVoice → openvoice}/models.py RENAMED
@@ -3,14 +3,14 @@ import torch
 from torch import nn
 from torch.nn import functional as F
 
-from . import commons
-from . import modules
-from . import attentions
+from openvoice import commons
+from openvoice import modules
+from openvoice import attentions
 
 from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 
-from .commons import init_weights
+from openvoice.commons import init_weights, get_padding
 
 
 class TextEncoder(nn.Module):
@@ -420,6 +420,7 @@ class SynthesizerTrn(nn.Module):
         upsample_kernel_sizes,
         n_speakers=256,
         gin_channels=256,
+        zero_g=False,
         **kwargs
     ):
         super().__init__()
@@ -461,6 +462,7 @@ class SynthesizerTrn(nn.Module):
         self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
         self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
         self.emb_g = nn.Embedding(n_speakers, gin_channels)
+        self.zero_g = zero_g
 
     def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):
         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
@@ -490,8 +492,8 @@ class SynthesizerTrn(nn.Module):
     def voice_conversion(self, y, y_lengths, sid_src, sid_tgt, tau=1.0):
         g_src = sid_src
         g_tgt = sid_tgt
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src, tau=tau)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=tau)
         z_p = self.flow(z, y_mask, g=g_src)
         z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
-        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
+        o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt))
         return o_hat, y_mask, (z, z_p, z_hat)
{OpenVoice → openvoice}/modules.py RENAMED
@@ -6,10 +6,10 @@ from torch.nn import functional as F
 from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm
 
-from . import commons
-from .commons import init_weights, get_padding
-from .transforms import piecewise_rational_quadratic_transform
-from .attentions import Encoder
+from openvoice import commons
+from openvoice.commons import init_weights, get_padding
+from openvoice.transforms import piecewise_rational_quadratic_transform
+from openvoice.attentions import Encoder
 
 LRELU_SLOPE = 0.1
 
openvoice/openvoice_app.py ADDED
@@ -0,0 +1,275 @@
+import os
+import torch
+import argparse
+import gradio as gr
+from zipfile import ZipFile
+import langid
+from openvoice import se_extractor
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--share", action='store_true', default=False, help="make link public")
+args = parser.parse_args()
+
+en_ckpt_base = 'checkpoints/base_speakers/EN'
+zh_ckpt_base = 'checkpoints/base_speakers/ZH'
+ckpt_converter = 'checkpoints/converter'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
+
+# load models
+en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
+en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
+zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
+zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
+tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
+# load speaker embeddings
+en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
+en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
+zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
+
+# This online demo mainly supports English and Chinese
+supported_languages = ['zh', 'en']
+
+def predict(prompt, style, audio_file_pth, agree):
+    # initialize a empty info
+    text_hint = ''
+    # agree with the terms
+    if agree == False:
+        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
+        gr.Warning("Please accept the Terms & Condition!")
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    # first detect the input language
+    language_predicted = langid.classify(prompt)[0].strip()
+    print(f"Detected language:{language_predicted}")
+
+    if language_predicted not in supported_languages:
+        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
+        gr.Warning(
+            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
+        )
+
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    if language_predicted == "zh":
+        tts_model = zh_base_speaker_tts
+        source_se = zh_source_se
+        language = 'Chinese'
+        if style not in ['default']:
+            text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
+            gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
+            return (
+                text_hint,
+                None,
+                None,
+            )
+
+    else:
+        tts_model = en_base_speaker_tts
+        if style == 'default':
+            source_se = en_source_default_se
+        else:
+            source_se = en_source_style_se
+        language = 'English'
+        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
+            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
+            gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
+            return (
+                text_hint,
+                None,
+                None,
+            )
+
+    speaker_wav = audio_file_pth
+
+    if len(prompt) < 2:
+        text_hint += f"[ERROR] Please give a longer prompt text \n"
+        gr.Warning("Please give a longer prompt text")
+        return (
+            text_hint,
+            None,
+            None,
+        )
+    if len(prompt) > 200:
+        text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
+        gr.Warning(
+            "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
+    try:
+        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
+    except Exception as e:
+        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
+        gr.Warning(
+            "[ERROR] Get target tone color error {str(e)} \n"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    src_path = f'{output_dir}/tmp.wav'
+    tts_model.tts(prompt, src_path, speaker=style, language=language)
+
+    save_path = f'{output_dir}/output.wav'
+    # Run the tone color converter
+    encode_message = "@MyShell"
+    tone_color_converter.convert(
+        audio_src_path=src_path,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=save_path,
+        message=encode_message)
+
+    text_hint += f'''Get response successfully \n'''
+
+    return (
+        text_hint,
+        save_path,
+        speaker_wav,
+    )
+
+
+
+title = "MyShell OpenVoice"
+
+description = """
+We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
+"""
+
+markdown_table = """
+<div align="center" style="margin-bottom: 10px;">
+
+| | | |
+| :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | **Project Page** | **Join the Community** |
+| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
+
+</div>
+"""
+
+markdown_table_v2 = """
+<div align="center" style="margin-bottom: 2px;">
+
+| | | | |
+| :-----------: | :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
+
+| | |
+| :-----------: | :-----------: |
+**Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
+
+</div>
+"""
+content = """
+<div>
+<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
+This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
+</div>
+"""
+wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
+
+
+examples = [
+    [
+        "今天天气真好，我们一起出去吃饭吧。",
+        'default',
+        "resources/demo_speaker1.mp3",
+        True,
+    ],[
+        "This audio is generated by open voice with a half-performance model.",
+        'whispering',
+        "resources/demo_speaker2.mp3",
+        True,
+    ],
+    [
+        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+        'sad',
+        "resources/demo_speaker0.mp3",
+        True,
+    ],
+]
+
+with gr.Blocks(analytics_enabled=False) as demo:
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Row():
+                gr.Markdown(
+                    """
+                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
+                    """
+                )
+            with gr.Row():
+                gr.Markdown(markdown_table_v2)
+            with gr.Row():
+                gr.Markdown(description)
+        with gr.Column():
+            gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
+
+    with gr.Row():
+        gr.HTML(wrapped_markdown_content)
+
+    with gr.Row():
+        with gr.Column():
+            input_text_gr = gr.Textbox(
+                label="Text Prompt",
+                info="One or two sentences at a time is better. Up to 200 text characters.",
+                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+            )
+            style_gr = gr.Dropdown(
+                label="Style",
+                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
+                choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
+                max_choices=1,
+                value="default",
+            )
+            ref_gr = gr.Audio(
+                label="Reference Audio",
+                info="Click on the ✎ button to upload your own target speaker audio",
+                type="filepath",
+                value="resources/demo_speaker2.mp3",
+            )
+            tos_gr = gr.Checkbox(
+                label="Agree",
+                value=False,
+                info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
+            )
+
+            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+
+
+        with gr.Column():
+            out_text_gr = gr.Text(label="Info")
+            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+            ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+            gr.Examples(examples,
+                        label="Examples",
+                        inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
+                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
+                        fn=predict,
+                        cache_examples=False,)
+            tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
+
+demo.queue()
+demo.launch(debug=True, show_api=True, share=args.share)
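With the package renamed and `openvoice/__init__.py` in place, this Gradio demo should be launchable as a module, e.g. `python -m openvoice.openvoice_app` (add `--share` for a public link). This assumes the base-speaker and converter checkpoints have been downloaded into `checkpoints/` and the demo resources are present in the working directory.
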
{OpenVoice → openvoice}/se_extractor.py RENAMED
@@ -1,26 +1,28 @@
 import os
 import glob
 import torch
+import hashlib
+import librosa
+import base64
 from glob import glob
 import numpy as np
 from pydub import AudioSegment
 from faster_whisper import WhisperModel
+import hashlib
+import base64
+import librosa
 from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, device='cuda', target_dir='processed'):
+def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
     global model
     if model is None:
-        if device == 'cpu':
-            model = WhisperModel(model_size, device=device)
-        else:
-            model = WhisperModel(model_size, device=device, compute_type="float16")
+        model = WhisperModel(model_size, device="cuda", compute_type="float16")
     audio = AudioSegment.from_file(audio_path)
     max_len = len(audio)
 
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
 
     segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
@@ -72,9 +74,9 @@ def split_audio_whisper(audio_path, device='cuda', target_dir='processed'):
     return wavs_folder
 
 
-def split_audio_vad(audio_path, target_dir, split_seconds=10.0, max_length=60.):
+def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
     SAMPLE_RATE = 16000
-    audio_vad = get_audio_tensor(audio_path)[:int(max_length * SAMPLE_RATE)]
+    audio_vad = get_audio_tensor(audio_path)
     segments = get_vad_segments(
         audio_vad,
         output_sample=True,
@@ -93,7 +95,6 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0, max_length=60.):
 
     audio_dur = audio_active.duration_seconds
     print(f'after vad: dur = {audio_dur}')
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
     wavs_folder = os.path.join(target_folder, 'wavs')
     os.makedirs(wavs_folder, exist_ok=True)
@@ -114,25 +115,39 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0, max_length=60.):
         count += 1
     return wavs_folder
 
-def get_se(audio_path, vc_model, target_dir='processed', max_length=60., vad=True):
+def hash_numpy_array(audio_path):
+    array, _ = librosa.load(audio_path, sr=None, mono=True)
+    # Convert the array to bytes
+    array_bytes = array.tobytes()
+    # Calculate the hash of the array bytes
+    hash_object = hashlib.sha256(array_bytes)
+    hash_value = hash_object.digest()
+    # Convert the hash value to base64
+    base64_value = base64.b64encode(hash_value)
+    return base64_value.decode('utf-8')[:16].replace('/', '_^')
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     device = vc_model.device
+    version = vc_model.version
+    print("OpenVoice version:", version)
 
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
+    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
    se_path = os.path.join(target_dir, audio_name, 'se.pth')
 
-    if os.path.isfile(se_path):
-        se = torch.load(se_path).to(device)
-        return se, audio_name
-    if os.path.isdir(audio_path):
-        wavs_folder = audio_path
-    elif vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir, max_length=max_length)
+    # if os.path.isfile(se_path):
+    #     se = torch.load(se_path).to(device)
+    #     return se, audio_name
+    # if os.path.isdir(audio_path):
+    #     wavs_folder = audio_path
+
+    if vad:
+        wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
    else:
-        wavs_folder = split_audio_whisper(audio_path, device=device, target_dir=target_dir)
+        wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
 
    audio_segs = glob(f'{wavs_folder}/*.wav')
-
    if len(audio_segs) == 0:
        raise NotImplementedError('No audio segments found!')
 
-    return vc_model.extract_se(audio_segs, se_save_path=se_path), wavs_folder
+    return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
+
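With this change, `get_se` keys its working directory on the model version and a content hash of the reference audio instead of the bare file stem, and it returns `audio_name` rather than the wavs folder. A small sketch of how the new cache key is formed (the file path is illustrative; "v1" is the default version string set in `ToneColorConverter`):

    import os
    from openvoice.se_extractor import hash_numpy_array

    audio_path = 'reference.mp3'
    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_v1_{hash_numpy_array(audio_path)}"
    # e.g. 'reference_v1_<16-character digest>', used as processed/<audio_name>/se.pth
    print(audio_name)
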
{OpenVoice → openvoice}/text/__init__.py RENAMED
@@ -1,6 +1,7 @@
 """ from https://github.com/keithito/tacotron """
-from . import cleaners
-from .symbols import *
+from openvoice.text import cleaners
+from openvoice.text.symbols import symbols
+
 
 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
@@ -42,7 +43,7 @@ def cleaned_text_to_sequence(cleaned_text, symbols):
 
 
 
-from .symbols import language_tone_start_map
+from openvoice.text.symbols import language_tone_start_map
 def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
     Args:
{OpenVoice → openvoice}/text/cleaners.py RENAMED
@@ -1,6 +1,6 @@
 import re
-from .english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
-from .mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
+from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
+from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
 
 def cjke_cleaners2(text):
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
{OpenVoice → openvoice}/text/english.py RENAMED
@@ -20,7 +20,6 @@ import re
 import inflect
 from unidecode import unidecode
 import eng_to_ipa as ipa
-
 _inflect = inflect.engine()
 _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
 _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
{OpenVoice → openvoice}/text/mandarin.py RENAMED
File without changes
{OpenVoice → openvoice}/text/symbols.py RENAMED
File without changes
{OpenVoice → openvoice}/transforms.py RENAMED
File without changes
{OpenVoice → openvoice}/utils.py RENAMED
@@ -128,7 +128,7 @@ def merge_short_sentences_latin(sens):
     """
     sens_out = []
     for s in sens:
-        # If the previous sentense is too short, merge them with
+        # If the previous sentence is too short, merge them with
         # the current sentence.
         if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
             sens_out[-1] = sens_out[-1] + " " + s