prasanacodes committed
Commit 3134aa2 · verified · 1 Parent(s): 6a7145c

Upload 23 files

openvoice_cli/__init__.py ADDED
File without changes
openvoice_cli/__main__.py ADDED
@@ -0,0 +1,125 @@
import os
import argparse
from tqdm import tqdm
from openvoice_cli.downloader import download_checkpoint
from openvoice_cli.api import ToneColorConverter
import openvoice_cli.se_extractor as se_extractor
import glob

def tune_one(input_file, ref_file, output_file, device):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    checkpoints_dir = os.path.join(current_dir, 'checkpoints')
    ckpt_converter = os.path.join(checkpoints_dir, 'converter')

    if not os.path.exists(ckpt_converter):
        os.makedirs(ckpt_converter, exist_ok=True)
        download_checkpoint(ckpt_converter)

    tone_color_converter = ToneColorConverter(os.path.join(ckpt_converter, 'config.json'), device=device)
    tone_color_converter.load_ckpt(os.path.join(ckpt_converter, 'checkpoint.pth'))

    source_se, _ = se_extractor.get_se(input_file, tone_color_converter, vad=True)
    target_se, _ = se_extractor.get_se(ref_file, tone_color_converter, vad=True)

    # Ensure the output directory exists and is writable
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Run the tone color converter
    tone_color_converter.convert(
        audio_src_path=input_file,
        src_se=source_se,
        tgt_se=target_se,
        output_path=output_file,
    )

def tune_batch(input_dir, ref_file, output_dir=None, device='cpu', output_format='.wav'):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    checkpoints_dir = os.path.join(current_dir, 'checkpoints')
    ckpt_converter = os.path.join(checkpoints_dir, 'converter')

    if not os.path.exists(ckpt_converter):
        os.makedirs(ckpt_converter, exist_ok=True)
        download_checkpoint(ckpt_converter)

    tone_color_converter = ToneColorConverter(os.path.join(ckpt_converter, 'config.json'), device=device)
    tone_color_converter.load_ckpt(os.path.join(ckpt_converter, 'checkpoint.pth'))

    target_se, _ = se_extractor.get_se(ref_file, tone_color_converter, vad=True)

    # Use the default output directory 'out' if none is provided
    if output_dir is None:
        output_dir = os.path.join(current_dir, 'out')
    os.makedirs(output_dir, exist_ok=True)

    # Collect any audio files in the input directory (wav, mp3, flac) using glob
    audio_extensions = ('*.wav', '*.mp3', '*.flac')
    audio_files = []
    for extension in audio_extensions:
        audio_files.extend(glob.glob(os.path.join(input_dir, extension)))

    for audio_file in tqdm(audio_files, desc="Tune file", total=len(audio_files)):
        # Extract the source speaker embedding from the audio file
        source_se, _ = se_extractor.get_se(audio_file, tone_color_converter, vad=True)

        # Run the tone color converter
        filename_without_extension = os.path.splitext(os.path.basename(audio_file))[0]
        output_filename = f"{filename_without_extension}_tuned{output_format}"
        output_file = os.path.join(output_dir, output_filename)

        tone_color_converter.convert(
            audio_src_path=audio_file,
            src_se=source_se,
            tgt_se=target_se,
            output_path=output_file,
        )
        print(f"Converted {audio_file} to {output_file}")

    return output_dir

def main_single(args):
    tune_one(input_file=args.input, ref_file=args.ref, output_file=args.output, device=args.device)

def main_batch(args):
    output_dir = tune_batch(
        input_dir=args.input_dir,
        ref_file=args.ref_file,
        output_dir=args.output_dir,
        device=args.device,
        output_format=args.output_format
    )
    print(f"Batch processing complete. Converted files are saved in {output_dir}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert the tone color of audio files using a reference audio.')

    # Create subparsers for single and batch processing
    subparsers = parser.add_subparsers(help='commands', dest='command')

    # Single file conversion arguments
    single_parser = subparsers.add_parser('single', help='Process a single file')
    single_parser.add_argument('-i', '--input', help='Input audio file path', required=True)
    single_parser.add_argument('-r', '--ref', help='Reference audio file path', required=True)
    single_parser.add_argument('-o', '--output', default="out.wav", help='Output path for converted audio file')
    single_parser.add_argument('-d', '--device', default="cpu", help='Device to use (e.g., "cuda:0" or "cpu")')
    single_parser.set_defaults(func=main_single)

    # Batch processing arguments
    batch_parser = subparsers.add_parser('batch', help='Process a batch of files in a directory')
    batch_parser.add_argument('-id', '--input_dir', help='Input directory containing audio files to process', required=True)
    batch_parser.add_argument('-rf', '--ref_file', help='Reference audio file path', required=True)
    batch_parser.add_argument('-od', '--output_dir', help='Output directory for converted audio files', default="outputs")
    batch_parser.add_argument('-d', '--device', default="cuda", help='Device to use')
    batch_parser.add_argument('-of', '--output_format', default=".wav", help='Output file format (e.g., ".wav")')
    batch_parser.set_defaults(func=main_batch)

    args = parser.parse_args()
    if hasattr(args, 'func'):
        args.func(args)
    else:
        parser.print_help()
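
For orientation, the two entry points above can also be driven from Python rather than the CLI. A minimal sketch (file paths here are illustrative, not part of the repo):

# Usage sketch, assuming the package is installed:
from openvoice_cli.__main__ import tune_one, tune_batch

# Single file: re-tint input.wav with the tone color of reference.wav
tune_one(input_file="input.wav", ref_file="reference.wav",
         output_file="out/tuned.wav", device="cpu")

# Batch: process every .wav/.mp3/.flac in a directory
out_dir = tune_batch(input_dir="clips", ref_file="reference.wav",
                     output_dir="outputs", device="cpu")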
openvoice_cli/api.py ADDED
@@ -0,0 +1,139 @@
import torch
import numpy as np
import re
import soundfile
import openvoice_cli.utils as utils
import os
import librosa
from openvoice_cli.mel_processing import spectrogram_torch
from openvoice_cli.models import SynthesizerTrn


class OpenVoiceBaseClass(object):
    def __init__(self,
                 config_path,
                 device='cuda:0'):
        if 'cuda' in device:
            assert torch.cuda.is_available()

        hps = utils.get_hparams_from_file(config_path)

        model = SynthesizerTrn(
            len(getattr(hps, 'symbols', [])),
            hps.data.filter_length // 2 + 1,
            n_speakers=hps.data.n_speakers,
            **hps.model,
        ).to(device)

        model.eval()
        self.model = model
        self.hps = hps
        self.device = device

    def load_ckpt(self, ckpt_path):
        checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
        a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
        print("Loaded checkpoint '{}'".format(ckpt_path))
        print('missing/unexpected keys:', a, b)


class ToneColorConverter(OpenVoiceBaseClass):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if kwargs.get('enable_watermark', True):
            import wavmark
            self.watermark_model = wavmark.load_model().to(self.device)
        else:
            self.watermark_model = None
        self.version = getattr(self.hps, '_version_', "v1")

    def extract_se(self, ref_wav_list, se_save_path=None):
        if isinstance(ref_wav_list, str):
            ref_wav_list = [ref_wav_list]

        device = self.device
        hps = self.hps
        gs = []

        for fname in ref_wav_list:
            audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
            y = torch.FloatTensor(audio_ref)
            y = y.to(device)
            y = y.unsqueeze(0)
            y = spectrogram_torch(y, hps.data.filter_length,
                                  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                  center=False).to(device)
            with torch.no_grad():
                g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
                gs.append(g.detach())
        gs = torch.stack(gs).mean(0)

        if se_save_path is not None:
            os.makedirs(os.path.dirname(se_save_path), exist_ok=True)
            torch.save(gs.cpu(), se_save_path)

        return gs

    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
        hps = self.hps
        # load audio
        audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
        audio = torch.tensor(audio).float()

        with torch.no_grad():
            y = torch.FloatTensor(audio).to(self.device)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(self.device)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
            audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
                0, 0].data.cpu().float().numpy()
            audio = self.add_watermark(audio, message)
            if output_path is None:
                return audio
            else:
                soundfile.write(output_path, audio, hps.data.sampling_rate)

    def add_watermark(self, audio, message):
        if self.watermark_model is None:
            return audio
        device = self.device
        bits = utils.string_to_bits(message).reshape(-1)
        n_repeat = len(bits) // 32

        K = 16000
        coeff = 2
        for n in range(n_repeat):
            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
            if len(trunck) != K:
                print('Audio too short, failed to add watermark')
                break
            message_npy = bits[n * 32: (n + 1) * 32]

            with torch.no_grad():
                signal = torch.FloatTensor(trunck).to(device)[None]
                message_tensor = torch.FloatTensor(message_npy).to(device)[None]
                signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
                signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze()
            audio[(coeff * n) * K: (coeff * n + 1) * K] = signal_wmd_npy
        return audio

    def detect_watermark(self, audio, n_repeat):
        bits = []
        K = 16000
        coeff = 2
        for n in range(n_repeat):
            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
            if len(trunck) != K:
                print('Audio too short, failed to detect watermark')
                return 'Fail'
            with torch.no_grad():
                signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
                message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
            bits.append(message_decoded_npy)
        bits = np.stack(bits).reshape(-1, 8)
        message = utils.bits_to_string(bits)
        return message
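
The converter can also be used directly, without the CLI wrapper. A minimal sketch, assuming the checkpoint has already been downloaded to checkpoints/converter (paths are illustrative):

import torch
from openvoice_cli.api import ToneColorConverter

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
converter = ToneColorConverter('checkpoints/converter/config.json', device=device)
converter.load_ckpt('checkpoints/converter/checkpoint.pth')

src_se = converter.extract_se('source.wav')     # speaker embedding of the input voice
tgt_se = converter.extract_se('reference.wav')  # speaker embedding of the target voice
converter.convert(audio_src_path='source.wav', src_se=src_se,
                  tgt_se=tgt_se, output_path='converted.wav')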
openvoice_cli/attentions.py ADDED
@@ -0,0 +1,465 @@
import math
import torch
from torch import nn
from torch.nn import functional as F

import openvoice_cli.commons as commons
import logging

logger = logging.getLogger(__name__)


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


class Encoder(nn.Module):
    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=4,
        isflow=True,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size
        # if isflow:
        #     cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
        #     self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
        #     self.cond_layer = weight_norm(cond_layer, name='weight')
        #     self.gin_channels = 256
        self.cond_layer_idx = self.n_layers
        if "gin_channels" in kwargs:
            self.gin_channels = kwargs["gin_channels"]
            if self.gin_channels != 0:
                self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
                # vits2 says 3rd block, so idx is 2 by default
                self.cond_layer_idx = (
                    kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
                )
                # logging.debug(self.gin_channels, self.cond_layer_idx)
                assert (
                    self.cond_layer_idx < self.n_layers
                ), "cond_layer_idx should be less than n_layers"
        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        for i in range(self.n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    window_size=window_size,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, g=None):
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            if i == self.cond_layer_idx and g is not None:
                g = self.spk_emb_linear(g.transpose(1, 2))
                g = g.transpose(1, 2)
                x = x + g
                x = x * x_mask
            y = self.attn_layers[i](x, x, attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class Decoder(nn.Module):
    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for i in range(self.n_layers):
            self.self_attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    proximal_bias=proximal_bias,
                    proximal_init=proximal_init,
                )
            )
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            self.encdec_attn_layers.append(
                MultiHeadAttention(
                    hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                    causal=True,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
            device=x.device, dtype=x.dtype
        )
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            y = self.self_attn_layers[i](x, x, self_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_0[i](x + y)

            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class MultiHeadAttention(nn.Module):
    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(
            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
            length: an integer scalar.
        Returns:
            a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)


class FFN(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        if causal:
            self.padding = self._causal_padding
        else:
            self.padding = self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            x = x * torch.sigmoid(1.702 * x)
        else:
            x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(self.padding(x * x_mask))
        return x * x_mask

    def _causal_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = self.kernel_size - 1
        pad_r = 0
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x

    def _same_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = (self.kernel_size - 1) // 2
        pad_r = self.kernel_size // 2
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x
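
As a quick shape sanity check, the relative-attention Encoder can be exercised on dummy tensors; the dimensions below are arbitrary, not taken from this repo's config:

import torch
from openvoice_cli.attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768,
              n_heads=2, n_layers=6, kernel_size=3)
x = torch.randn(1, 192, 50)    # [batch, hidden_channels, time]
x_mask = torch.ones(1, 1, 50)  # all frames valid
y = enc(x, x_mask)             # -> [1, 192, 50], same shape as the input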
openvoice_cli/checkpoints/base_speakers/ses/en-au.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
size 1701
openvoice_cli/checkpoints/base_speakers/ses/en-br.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
size 1701
openvoice_cli/checkpoints/base_speakers/ses/en-default.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
size 1783
openvoice_cli/checkpoints/base_speakers/ses/en-india.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
size 1701
openvoice_cli/checkpoints/base_speakers/ses/en-newest.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
size 1692
openvoice_cli/checkpoints/base_speakers/ses/en-us.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
size 1701
openvoice_cli/checkpoints/base_speakers/ses/es.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
size 1692
openvoice_cli/checkpoints/base_speakers/ses/fr.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
size 1692
openvoice_cli/checkpoints/base_speakers/ses/jp.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
size 1692
openvoice_cli/checkpoints/base_speakers/ses/kr.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
size 1692
openvoice_cli/checkpoints/base_speakers/ses/zh.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
size 1692
openvoice_cli/commons.py ADDED
@@ -0,0 +1,160 @@
import math
import torch
from torch.nn import functional as F


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    layer = pad_shape[::-1]
    pad_shape = [item for sublist in layer for item in sublist]
    return pad_shape


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    kl = (logs_q - logs_p) - 0.5
    kl += (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
    return g


def slice_segments(x, ids_str, segment_size=4):
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
        num_timescales - 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def shift_1d(x):
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path


def clip_grad_value_(parameters, clip_value, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1.0 / norm_type)
    return total_norm
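
To make sequence_mask and generate_path concrete, here is a toy example; the shapes follow the docstrings above, and the numbers are arbitrary:

import torch
from openvoice_cli.commons import sequence_mask, generate_path

lengths = torch.tensor([3, 5])
mask = sequence_mask(lengths)              # [2, 5] bool, True for valid positions

duration = torch.tensor([[[1., 2., 2.]]])  # [b=1, 1, t_x=3], durations sum to t_y
attn_mask = torch.ones(1, 1, 5, 3)         # [b, 1, t_y=5, t_x=3]
path = generate_path(duration, attn_mask)  # [1, 1, 5, 3] hard monotonic alignment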
openvoice_cli/downloader.py ADDED
@@ -0,0 +1,41 @@
from pathlib import Path
from tqdm import tqdm
import requests

def download_file(url, destination):
    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte

    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)

    with open(destination, 'wb') as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)

    progress_bar.close()

def create_directory_if_not_exists(directory):
    if not directory.exists():
        directory.mkdir(parents=True)

def download_checkpoint(dest_dir):
    # Define paths
    model_path = Path(dest_dir)

    # Define files and their corresponding URLs
    files_to_download = {
        "checkpoint.pth": "https://huggingface.co/daswer123/openvoice-tunner-v2/resolve/main/checkpoint.pth?download=true",
        "config.json": "https://huggingface.co/daswer123/openvoice-tunner-v2/raw/main/config.json",
    }

    # Check and create directories
    create_directory_if_not_exists(model_path)

    # Download files if they don't exist
    for filename, url in files_to_download.items():
        destination = model_path / filename
        if not destination.exists():
            print(f"[OpenVoice Converter] Downloading {filename}...")
            download_file(url, destination)
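
A usage sketch for the downloader; the destination directory below is illustrative:

from openvoice_cli.downloader import download_checkpoint

# Fetches checkpoint.pth and config.json from the Hugging Face repo
# into the given folder, skipping files that already exist.
download_checkpoint("checkpoints/converter")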
openvoice_cli/mel_processing.py ADDED
@@ -0,0 +1,184 @@
import librosa
import torch
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn

MAX_WAV_VALUE = 32768.0


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    if torch.min(y) < -1.1:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.1:
        print("max value is ", torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=False,
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec


def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    # if torch.min(y) < -1.:
    #     print('min value is ', torch.min(y))
    # if torch.max(y) > 1.:
    #     print('max value is ', torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')

    # ******************** original ************************#
    # y = y.squeeze(1)
    # spec1 = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
    #                    center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    # ******************** ConvSTFT ************************#
    freq_cutoff = n_fft // 2 + 1
    fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft)))
    forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1])
    forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float()

    import torch.nn.functional as F

    # if center:
    #     signal = F.pad(y[:, None, None, :], (n_fft // 2, n_fft // 2, 0, 0), mode='reflect').squeeze(1)
    assert center is False

    forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride=hop_size)
    spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim=-1)

    # ******************** Verification ************************#
    spec1 = torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                       center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
    assert torch.allclose(spec1, spec2, atol=1e-4)

    spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6)
    return spec


def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    global mel_basis
    dtype_device = str(spec.dtype) + "_" + str(spec.device)
    fmax_dtype_device = str(fmax) + "_" + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
            dtype=spec.dtype, device=spec.device
        )
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)
    return spec


def mel_spectrogram_torch(
    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    fmax_dtype_device = str(fmax) + "_" + dtype_device
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
            dtype=y.dtype, device=y.device
        )
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=False,
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec
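
A small sketch of calling spectrogram_torch; the STFT parameters below are typical VITS-style values, not read from this repo's config:

import torch
from openvoice_cli.mel_processing import spectrogram_torch

y = torch.rand(1, 22050) * 2 - 1  # [batch, samples] in [-1, 1], ~1 s at 22.05 kHz
spec = spectrogram_torch(y, n_fft=1024, sampling_rate=22050,
                         hop_size=256, win_size=1024, center=False)
print(spec.shape)  # [1, 513, n_frames], 513 = n_fft // 2 + 1 frequency bins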
openvoice_cli/models.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import openvoice_cli.commons as commons
7
+ import openvoice_cli.attentions as attentions
8
+ import openvoice_cli.modules as modules
9
+
10
+
11
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
+
14
+ from openvoice_cli.commons import init_weights, get_padding
15
+
16
+
17
+ class TextEncoder(nn.Module):
18
+ def __init__(self,
19
+ n_vocab,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout):
27
+ super().__init__()
28
+ self.n_vocab = n_vocab
29
+ self.out_channels = out_channels
30
+ self.hidden_channels = hidden_channels
31
+ self.filter_channels = filter_channels
32
+ self.n_heads = n_heads
33
+ self.n_layers = n_layers
34
+ self.kernel_size = kernel_size
35
+ self.p_dropout = p_dropout
36
+
37
+ self.emb = nn.Embedding(n_vocab, hidden_channels)
38
+ nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
39
+
40
+ self.encoder = attentions.Encoder(
41
+ hidden_channels,
42
+ filter_channels,
43
+ n_heads,
44
+ n_layers,
45
+ kernel_size,
46
+ p_dropout)
47
+ self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
48
+
49
+ def forward(self, x, x_lengths):
50
+ x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
51
+ x = torch.transpose(x, 1, -1) # [b, h, t]
52
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
53
+
54
+ x = self.encoder(x * x_mask, x_mask)
55
+ stats = self.proj(x) * x_mask
56
+
57
+ m, logs = torch.split(stats, self.out_channels, dim=1)
58
+ return x, m, logs, x_mask
59
+
60
+
61
+ class DurationPredictor(nn.Module):
62
+ def __init__(
63
+ self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
64
+ ):
65
+ super().__init__()
66
+
67
+ self.in_channels = in_channels
68
+ self.filter_channels = filter_channels
69
+ self.kernel_size = kernel_size
70
+ self.p_dropout = p_dropout
71
+ self.gin_channels = gin_channels
72
+
73
+ self.drop = nn.Dropout(p_dropout)
74
+ self.conv_1 = nn.Conv1d(
75
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
76
+ )
77
+ self.norm_1 = modules.LayerNorm(filter_channels)
78
+ self.conv_2 = nn.Conv1d(
79
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
80
+ )
81
+ self.norm_2 = modules.LayerNorm(filter_channels)
82
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
83
+
84
+ if gin_channels != 0:
85
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
86
+
87
+ def forward(self, x, x_mask, g=None):
88
+ x = torch.detach(x)
89
+ if g is not None:
90
+ g = torch.detach(g)
91
+ x = x + self.cond(g)
92
+ x = self.conv_1(x * x_mask)
93
+ x = torch.relu(x)
94
+ x = self.norm_1(x)
95
+ x = self.drop(x)
96
+ x = self.conv_2(x * x_mask)
97
+ x = torch.relu(x)
98
+ x = self.norm_2(x)
99
+ x = self.drop(x)
100
+ x = self.proj(x * x_mask)
101
+ return x * x_mask
102
+
103
+ class StochasticDurationPredictor(nn.Module):
104
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
105
+ super().__init__()
106
+ filter_channels = in_channels # it needs to be removed from future version.
107
+ self.in_channels = in_channels
108
+ self.filter_channels = filter_channels
109
+ self.kernel_size = kernel_size
110
+ self.p_dropout = p_dropout
111
+ self.n_flows = n_flows
112
+ self.gin_channels = gin_channels
113
+
114
+ self.log_flow = modules.Log()
115
+ self.flows = nn.ModuleList()
116
+ self.flows.append(modules.ElementwiseAffine(2))
117
+ for i in range(n_flows):
118
+ self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
119
+ self.flows.append(modules.Flip())
120
+
121
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
122
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
123
+ self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
124
+ self.post_flows = nn.ModuleList()
125
+ self.post_flows.append(modules.ElementwiseAffine(2))
126
+ for i in range(4):
127
+ self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
128
+ self.post_flows.append(modules.Flip())
129
+
130
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
131
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
132
+ self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
133
+ if gin_channels != 0:
134
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
135
+
136
+ def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
137
+ x = torch.detach(x)
138
+ x = self.pre(x)
139
+ if g is not None:
140
+ g = torch.detach(g)
141
+ x = x + self.cond(g)
142
+ x = self.convs(x, x_mask)
143
+ x = self.proj(x) * x_mask
144
+
145
+ if not reverse:
146
+ flows = self.flows
147
+ assert w is not None
148
+
149
+ logdet_tot_q = 0
150
+ h_w = self.post_pre(w)
151
+ h_w = self.post_convs(h_w, x_mask)
152
+ h_w = self.post_proj(h_w) * x_mask
153
+ e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
154
+ z_q = e_q
155
+ for flow in self.post_flows:
156
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
157
+ logdet_tot_q += logdet_q
158
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
159
+ u = torch.sigmoid(z_u) * x_mask
160
+ z0 = (w - u) * x_mask
161
+ logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
162
+ logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
163
+
164
+ logdet_tot = 0
165
+ z0, logdet = self.log_flow(z0, x_mask)
166
+ logdet_tot += logdet
167
+ z = torch.cat([z0, z1], 1)
168
+ for flow in flows:
169
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
170
+ logdet_tot = logdet_tot + logdet
171
+ nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
172
+ return nll + logq # [b]
173
+ else:
174
+ flows = list(reversed(self.flows))
175
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
176
+ z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
177
+ for flow in flows:
178
+ z = flow(z, x_mask, g=x, reverse=reverse)
179
+ z0, z1 = torch.split(z, [1, 1], 1)
180
+ logw = z0
181
+ return logw
182
+
183
+ class PosteriorEncoder(nn.Module):
184
+ def __init__(
185
+ self,
186
+ in_channels,
187
+ out_channels,
188
+ hidden_channels,
189
+ kernel_size,
190
+ dilation_rate,
191
+ n_layers,
192
+ gin_channels=0,
193
+ ):
194
+ super().__init__()
195
+ self.in_channels = in_channels
196
+ self.out_channels = out_channels
197
+ self.hidden_channels = hidden_channels
198
+ self.kernel_size = kernel_size
199
+ self.dilation_rate = dilation_rate
200
+ self.n_layers = n_layers
201
+ self.gin_channels = gin_channels
202
+
203
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
204
+ self.enc = modules.WN(
205
+ hidden_channels,
206
+ kernel_size,
207
+ dilation_rate,
208
+ n_layers,
209
+ gin_channels=gin_channels,
210
+ )
211
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
212
+
213
+ def forward(self, x, x_lengths, g=None, tau=1.0):
214
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
215
+ x.dtype
216
+ )
217
+ x = self.pre(x) * x_mask
218
+ x = self.enc(x, x_mask, g=g)
219
+ stats = self.proj(x) * x_mask
220
+ m, logs = torch.split(stats, self.out_channels, dim=1)
221
+ z = (m + torch.randn_like(m) * tau * torch.exp(logs)) * x_mask
222
+ return z, m, logs, x_mask
223
+
224
+
225
+ class Generator(torch.nn.Module):
226
+ def __init__(
227
+ self,
228
+ initial_channel,
229
+ resblock,
230
+ resblock_kernel_sizes,
231
+ resblock_dilation_sizes,
232
+ upsample_rates,
233
+ upsample_initial_channel,
234
+ upsample_kernel_sizes,
235
+ gin_channels=0,
236
+ ):
237
+ super(Generator, self).__init__()
238
+ self.num_kernels = len(resblock_kernel_sizes)
239
+ self.num_upsamples = len(upsample_rates)
240
+ self.conv_pre = Conv1d(
241
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
242
+ )
243
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
244
+
245
+ self.ups = nn.ModuleList()
246
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
247
+ self.ups.append(
248
+ weight_norm(
249
+ ConvTranspose1d(
250
+ upsample_initial_channel // (2**i),
251
+ upsample_initial_channel // (2 ** (i + 1)),
252
+ k,
253
+ u,
254
+ padding=(k - u) // 2,
255
+ )
256
+ )
257
+ )
258
+
259
+ self.resblocks = nn.ModuleList()
260
+ for i in range(len(self.ups)):
261
+ ch = upsample_initial_channel // (2 ** (i + 1))
262
+ for j, (k, d) in enumerate(
263
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
264
+ ):
265
+ self.resblocks.append(resblock(ch, k, d))
266
+
267
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
268
+ self.ups.apply(init_weights)
269
+
270
+ if gin_channels != 0:
271
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
272
+
273
+ def forward(self, x, g=None):
274
+ x = self.conv_pre(x)
275
+ if g is not None:
276
+ x = x + self.cond(g)
277
+
278
+ for i in range(self.num_upsamples):
279
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
280
+ x = self.ups[i](x)
281
+ xs = None
282
+ for j in range(self.num_kernels):
283
+ if xs is None:
284
+ xs = self.resblocks[i * self.num_kernels + j](x)
285
+ else:
286
+ xs += self.resblocks[i * self.num_kernels + j](x)
287
+ x = xs / self.num_kernels
288
+ x = F.leaky_relu(x)
289
+ x = self.conv_post(x)
290
+ x = torch.tanh(x)
291
+
292
+ return x
293
+
294
+ def remove_weight_norm(self):
295
+ print("Removing weight norm...")
296
+ for layer in self.ups:
297
+ remove_weight_norm(layer)
298
+ for layer in self.resblocks:
299
+ layer.remove_weight_norm()
300
+
301
+
302
+ class ReferenceEncoder(nn.Module):
303
+ """
304
+ inputs --- [N, Ty/r, n_mels*r] mels
305
+ outputs --- [N, ref_enc_gru_size]
306
+ """
307
+
308
+ def __init__(self, spec_channels, gin_channels=0, layernorm=True):
309
+ super().__init__()
310
+ self.spec_channels = spec_channels
311
+ ref_enc_filters = [32, 32, 64, 64, 128, 128]
312
+ K = len(ref_enc_filters)
313
+ filters = [1] + ref_enc_filters
314
+ convs = [
315
+ weight_norm(
316
+ nn.Conv2d(
317
+ in_channels=filters[i],
318
+ out_channels=filters[i + 1],
319
+ kernel_size=(3, 3),
320
+ stride=(2, 2),
321
+ padding=(1, 1),
322
+ )
323
+ )
324
+ for i in range(K)
325
+ ]
326
+ self.convs = nn.ModuleList(convs)
327
+
328
+ out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
329
+ self.gru = nn.GRU(
330
+ input_size=ref_enc_filters[-1] * out_channels,
331
+ hidden_size=256 // 2,
332
+ batch_first=True,
333
+ )
334
+ self.proj = nn.Linear(128, gin_channels)
335
+ if layernorm:
336
+ self.layernorm = nn.LayerNorm(self.spec_channels)
337
+ else:
338
+ self.layernorm = None
339
+
340
+ def forward(self, inputs, mask=None):
341
+ N = inputs.size(0)
342
+
343
+ out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
344
+ if self.layernorm is not None:
345
+ out = self.layernorm(out)
346
+
347
+ for conv in self.convs:
348
+ out = conv(out)
349
+ # out = wn(out)
350
+ out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
351
+
352
+ out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
353
+ T = out.size(1)
354
+ N = out.size(0)
355
+ out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
356
+
357
+ self.gru.flatten_parameters()
358
+ memory, out = self.gru(out) # out --- [1, N, 128]
359
+
360
+ return self.proj(out.squeeze(0))
361
+
362
+ def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
363
+ for i in range(n_convs):
364
+ L = (L - kernel_size + 2 * pad) // stride + 1
365
+ return L
366
+
367
+
368
+ class ResidualCouplingBlock(nn.Module):
369
+ def __init__(self,
370
+ channels,
371
+ hidden_channels,
372
+ kernel_size,
373
+ dilation_rate,
374
+ n_layers,
375
+ n_flows=4,
376
+ gin_channels=0):
377
+ super().__init__()
378
+ self.channels = channels
379
+ self.hidden_channels = hidden_channels
380
+ self.kernel_size = kernel_size
381
+ self.dilation_rate = dilation_rate
382
+ self.n_layers = n_layers
383
+ self.n_flows = n_flows
384
+ self.gin_channels = gin_channels
385
+
386
+ self.flows = nn.ModuleList()
387
+ for i in range(n_flows):
388
+ self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
389
+ self.flows.append(modules.Flip())
390
+
391
+ def forward(self, x, x_mask, g=None, reverse=False):
392
+ if not reverse:
393
+ for flow in self.flows:
394
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
395
+ else:
396
+ for flow in reversed(self.flows):
397
+ x = flow(x, x_mask, g=g, reverse=reverse)
398
+ return x
399
+
400
+ class SynthesizerTrn(nn.Module):
401
+ """
402
+ Synthesizer for Training
403
+ """
404
+
405
+ def __init__(
406
+ self,
407
+ n_vocab,
408
+ spec_channels,
409
+ inter_channels,
410
+ hidden_channels,
411
+ filter_channels,
412
+ n_heads,
413
+ n_layers,
414
+ kernel_size,
415
+ p_dropout,
416
+ resblock,
417
+ resblock_kernel_sizes,
418
+ resblock_dilation_sizes,
419
+ upsample_rates,
420
+ upsample_initial_channel,
421
+ upsample_kernel_sizes,
422
+ n_speakers=256,
423
+ gin_channels=256,
424
+ zero_g=False,
425
+ **kwargs
426
+ ):
427
+ super().__init__()
428
+
429
+ self.dec = Generator(
430
+ inter_channels,
431
+ resblock,
432
+ resblock_kernel_sizes,
433
+ resblock_dilation_sizes,
434
+ upsample_rates,
435
+ upsample_initial_channel,
436
+ upsample_kernel_sizes,
437
+ gin_channels=gin_channels,
438
+ )
439
+ self.enc_q = PosteriorEncoder(
440
+ spec_channels,
441
+ inter_channels,
442
+ hidden_channels,
443
+ 5,
444
+ 1,
445
+ 16,
446
+ gin_channels=gin_channels,
447
+ )
448
+
449
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
450
+
451
+ self.n_speakers = n_speakers
452
+ if n_speakers == 0:
453
+ self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
454
+ else:
455
+ self.enc_p = TextEncoder(n_vocab,
456
+ inter_channels,
457
+ hidden_channels,
458
+ filter_channels,
459
+ n_heads,
460
+ n_layers,
461
+ kernel_size,
462
+ p_dropout)
463
+ self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
464
+ self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
465
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
466
+ self.zero_g = zero_g
467
+
468
+ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):
469
+ x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
470
+ if self.n_speakers > 0:
471
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
472
+ else:
473
+ g = None
474
+
475
+ logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * sdp_ratio \
476
+ + self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
477
+
478
+ w = torch.exp(logw) * x_mask * length_scale
479
+ w_ceil = torch.ceil(w)
480
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
481
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
482
+ attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
483
+ attn = commons.generate_path(w_ceil, attn_mask)
484
+
485
+ m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
486
+ logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
487
+
488
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
489
+ z = self.flow(z_p, y_mask, g=g, reverse=True)
490
+ o = self.dec((z * y_mask)[:,:,:max_len], g=g)
491
+ return o, attn, y_mask, (z, z_p, m_p, logs_p)
492
+
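+ # Voice conversion path: encode with the source speaker embedding, map into the speaker-independent prior with the flow, invert the flow under the target embedding, then decode.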
493
+ def voice_conversion(self, y, y_lengths, sid_src, sid_tgt, tau=1.0):
494
+ g_src = sid_src
495
+ g_tgt = sid_tgt
496
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=tau)
497
+ z_p = self.flow(z, y_mask, g=g_src)
498
+ z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
499
+ o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt))
500
+ return o_hat, y_mask, (z, z_p, z_hat)
openvoice_cli/modules.py ADDED
@@ -0,0 +1,597 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from torch.nn import Conv1d
7
+ from torch.nn.utils import weight_norm, remove_weight_norm
8
+
9
+ import openvoice_cli.commons as commons
10
+ from openvoice_cli.commons import init_weights, get_padding
11
+ from openvoice_cli.transforms import piecewise_rational_quadratic_transform
12
+ from openvoice_cli.attentions import Encoder
13
+
14
+ LRELU_SLOPE = 0.1
15
+
16
+ class LayerNorm(nn.Module):
17
+ def __init__(self, channels, eps=1e-5):
18
+ super().__init__()
19
+ self.channels = channels
20
+ self.eps = eps
21
+
22
+ self.gamma = nn.Parameter(torch.ones(channels))
23
+ self.beta = nn.Parameter(torch.zeros(channels))
24
+
25
+ def forward(self, x):
26
+ x = x.transpose(1, -1)
27
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
28
+ return x.transpose(1, -1)
29
+
30
+
31
+ class ConvReluNorm(nn.Module):
32
+ def __init__(
33
+ self,
34
+ in_channels,
35
+ hidden_channels,
36
+ out_channels,
37
+ kernel_size,
38
+ n_layers,
39
+ p_dropout,
40
+ ):
41
+ super().__init__()
42
+ self.in_channels = in_channels
43
+ self.hidden_channels = hidden_channels
44
+ self.out_channels = out_channels
45
+ self.kernel_size = kernel_size
46
+ self.n_layers = n_layers
47
+ self.p_dropout = p_dropout
48
+ assert n_layers > 1, "Number of layers should be larger than 1."
49
+
50
+ self.conv_layers = nn.ModuleList()
51
+ self.norm_layers = nn.ModuleList()
52
+ self.conv_layers.append(
53
+ nn.Conv1d(
54
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
55
+ )
56
+ )
57
+ self.norm_layers.append(LayerNorm(hidden_channels))
58
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
59
+ for _ in range(n_layers - 1):
60
+ self.conv_layers.append(
61
+ nn.Conv1d(
62
+ hidden_channels,
63
+ hidden_channels,
64
+ kernel_size,
65
+ padding=kernel_size // 2,
66
+ )
67
+ )
68
+ self.norm_layers.append(LayerNorm(hidden_channels))
69
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
70
+ self.proj.weight.data.zero_()
71
+ self.proj.bias.data.zero_()
72
+
73
+ def forward(self, x, x_mask):
74
+ x_org = x
75
+ for i in range(self.n_layers):
76
+ x = self.conv_layers[i](x * x_mask)
77
+ x = self.norm_layers[i](x)
78
+ x = self.relu_drop(x)
79
+ x = x_org + self.proj(x)
80
+ return x * x_mask
81
+
82
+
83
+ class DDSConv(nn.Module):
84
+ """
85
+ Dilated and Depth-Separable Convolution
86
+ """
87
+
88
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
89
+ super().__init__()
90
+ self.channels = channels
91
+ self.kernel_size = kernel_size
92
+ self.n_layers = n_layers
93
+ self.p_dropout = p_dropout
94
+
95
+ self.drop = nn.Dropout(p_dropout)
96
+ self.convs_sep = nn.ModuleList()
97
+ self.convs_1x1 = nn.ModuleList()
98
+ self.norms_1 = nn.ModuleList()
99
+ self.norms_2 = nn.ModuleList()
100
+ for i in range(n_layers):
101
+ dilation = kernel_size**i
102
+ padding = (kernel_size * dilation - dilation) // 2
103
+ self.convs_sep.append(
104
+ nn.Conv1d(
105
+ channels,
106
+ channels,
107
+ kernel_size,
108
+ groups=channels,
109
+ dilation=dilation,
110
+ padding=padding,
111
+ )
112
+ )
113
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
114
+ self.norms_1.append(LayerNorm(channels))
115
+ self.norms_2.append(LayerNorm(channels))
116
+
117
+ def forward(self, x, x_mask, g=None):
118
+ if g is not None:
119
+ x = x + g
120
+ for i in range(self.n_layers):
121
+ y = self.convs_sep[i](x * x_mask)
122
+ y = self.norms_1[i](y)
123
+ y = F.gelu(y)
124
+ y = self.convs_1x1[i](y)
125
+ y = self.norms_2[i](y)
126
+ y = F.gelu(y)
127
+ y = self.drop(y)
128
+ x = x + y
129
+ return x * x_mask
130
+
131
+
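+ # WaveNet-style stack: dilated convs with gated tanh/sigmoid activations and optional global conditioning, used inside the coupling layers below.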
132
+ class WN(torch.nn.Module):
133
+ def __init__(
134
+ self,
135
+ hidden_channels,
136
+ kernel_size,
137
+ dilation_rate,
138
+ n_layers,
139
+ gin_channels=0,
140
+ p_dropout=0,
141
+ ):
142
+ super(WN, self).__init__()
143
+ assert kernel_size % 2 == 1
144
+ self.hidden_channels = hidden_channels
145
+ self.kernel_size = kernel_size
146
+ self.dilation_rate = dilation_rate
147
+ self.n_layers = n_layers
148
+ self.gin_channels = gin_channels
149
+ self.p_dropout = p_dropout
150
+
151
+ self.in_layers = torch.nn.ModuleList()
152
+ self.res_skip_layers = torch.nn.ModuleList()
153
+ self.drop = nn.Dropout(p_dropout)
154
+
155
+ if gin_channels != 0:
156
+ cond_layer = torch.nn.Conv1d(
157
+ gin_channels, 2 * hidden_channels * n_layers, 1
158
+ )
159
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
160
+
161
+ for i in range(n_layers):
162
+ dilation = dilation_rate**i
163
+ padding = int((kernel_size * dilation - dilation) / 2)
164
+ in_layer = torch.nn.Conv1d(
165
+ hidden_channels,
166
+ 2 * hidden_channels,
167
+ kernel_size,
168
+ dilation=dilation,
169
+ padding=padding,
170
+ )
171
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
172
+ self.in_layers.append(in_layer)
173
+
174
+ # last one is not necessary
175
+ if i < n_layers - 1:
176
+ res_skip_channels = 2 * hidden_channels
177
+ else:
178
+ res_skip_channels = hidden_channels
179
+
180
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
181
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
182
+ self.res_skip_layers.append(res_skip_layer)
183
+
184
+ def forward(self, x, x_mask, g=None, **kwargs):
185
+ output = torch.zeros_like(x)
186
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
187
+
188
+ if g is not None:
189
+ g = self.cond_layer(g)
190
+
191
+ for i in range(self.n_layers):
192
+ x_in = self.in_layers[i](x)
193
+ if g is not None:
194
+ cond_offset = i * 2 * self.hidden_channels
195
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
196
+ else:
197
+ g_l = torch.zeros_like(x_in)
198
+
199
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
200
+ acts = self.drop(acts)
201
+
202
+ res_skip_acts = self.res_skip_layers[i](acts)
203
+ if i < self.n_layers - 1:
204
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
205
+ x = (x + res_acts) * x_mask
206
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
207
+ else:
208
+ output = output + res_skip_acts
209
+ return output * x_mask
210
+
211
+ def remove_weight_norm(self):
212
+ if self.gin_channels != 0:
213
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
214
+ for l in self.in_layers:
215
+ torch.nn.utils.remove_weight_norm(l)
216
+ for l in self.res_skip_layers:
217
+ torch.nn.utils.remove_weight_norm(l)
218
+
219
+
220
+ class ResBlock1(torch.nn.Module):
221
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
222
+ super(ResBlock1, self).__init__()
223
+ self.convs1 = nn.ModuleList(
224
+ [
225
+ weight_norm(
226
+ Conv1d(
227
+ channels,
228
+ channels,
229
+ kernel_size,
230
+ 1,
231
+ dilation=dilation[0],
232
+ padding=get_padding(kernel_size, dilation[0]),
233
+ )
234
+ ),
235
+ weight_norm(
236
+ Conv1d(
237
+ channels,
238
+ channels,
239
+ kernel_size,
240
+ 1,
241
+ dilation=dilation[1],
242
+ padding=get_padding(kernel_size, dilation[1]),
243
+ )
244
+ ),
245
+ weight_norm(
246
+ Conv1d(
247
+ channels,
248
+ channels,
249
+ kernel_size,
250
+ 1,
251
+ dilation=dilation[2],
252
+ padding=get_padding(kernel_size, dilation[2]),
253
+ )
254
+ ),
255
+ ]
256
+ )
257
+ self.convs1.apply(init_weights)
258
+
259
+ self.convs2 = nn.ModuleList(
260
+ [
261
+ weight_norm(
262
+ Conv1d(
263
+ channels,
264
+ channels,
265
+ kernel_size,
266
+ 1,
267
+ dilation=1,
268
+ padding=get_padding(kernel_size, 1),
269
+ )
270
+ ),
271
+ weight_norm(
272
+ Conv1d(
273
+ channels,
274
+ channels,
275
+ kernel_size,
276
+ 1,
277
+ dilation=1,
278
+ padding=get_padding(kernel_size, 1),
279
+ )
280
+ ),
281
+ weight_norm(
282
+ Conv1d(
283
+ channels,
284
+ channels,
285
+ kernel_size,
286
+ 1,
287
+ dilation=1,
288
+ padding=get_padding(kernel_size, 1),
289
+ )
290
+ ),
291
+ ]
292
+ )
293
+ self.convs2.apply(init_weights)
294
+
295
+ def forward(self, x, x_mask=None):
296
+ for c1, c2 in zip(self.convs1, self.convs2):
297
+ xt = F.leaky_relu(x, LRELU_SLOPE)
298
+ if x_mask is not None:
299
+ xt = xt * x_mask
300
+ xt = c1(xt)
301
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
302
+ if x_mask is not None:
303
+ xt = xt * x_mask
304
+ xt = c2(xt)
305
+ x = xt + x
306
+ if x_mask is not None:
307
+ x = x * x_mask
308
+ return x
309
+
310
+ def remove_weight_norm(self):
311
+ for l in self.convs1:
312
+ remove_weight_norm(l)
313
+ for l in self.convs2:
314
+ remove_weight_norm(l)
315
+
316
+
317
+ class ResBlock2(torch.nn.Module):
318
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
319
+ super(ResBlock2, self).__init__()
320
+ self.convs = nn.ModuleList(
321
+ [
322
+ weight_norm(
323
+ Conv1d(
324
+ channels,
325
+ channels,
326
+ kernel_size,
327
+ 1,
328
+ dilation=dilation[0],
329
+ padding=get_padding(kernel_size, dilation[0]),
330
+ )
331
+ ),
332
+ weight_norm(
333
+ Conv1d(
334
+ channels,
335
+ channels,
336
+ kernel_size,
337
+ 1,
338
+ dilation=dilation[1],
339
+ padding=get_padding(kernel_size, dilation[1]),
340
+ )
341
+ ),
342
+ ]
343
+ )
344
+ self.convs.apply(init_weights)
345
+
346
+ def forward(self, x, x_mask=None):
347
+ for c in self.convs:
348
+ xt = F.leaky_relu(x, LRELU_SLOPE)
349
+ if x_mask is not None:
350
+ xt = xt * x_mask
351
+ xt = c(xt)
352
+ x = xt + x
353
+ if x_mask is not None:
354
+ x = x * x_mask
355
+ return x
356
+
357
+ def remove_weight_norm(self):
358
+ for l in self.convs:
359
+ remove_weight_norm(l)
360
+
361
+
362
+ class Log(nn.Module):
363
+ def forward(self, x, x_mask, reverse=False, **kwargs):
364
+ if not reverse:
365
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
366
+ logdet = torch.sum(-y, [1, 2])
367
+ return y, logdet
368
+ else:
369
+ x = torch.exp(x) * x_mask
370
+ return x
371
+
372
+
373
+ class Flip(nn.Module):
374
+ def forward(self, x, *args, reverse=False, **kwargs):
375
+ x = torch.flip(x, [1])
376
+ if not reverse:
377
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
378
+ return x, logdet
379
+ else:
380
+ return x
381
+
382
+
383
+ class ElementwiseAffine(nn.Module):
384
+ def __init__(self, channels):
385
+ super().__init__()
386
+ self.channels = channels
387
+ self.m = nn.Parameter(torch.zeros(channels, 1))
388
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
389
+
390
+ def forward(self, x, x_mask, reverse=False, **kwargs):
391
+ if not reverse:
392
+ y = self.m + torch.exp(self.logs) * x
393
+ y = y * x_mask
394
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
395
+ return y, logdet
396
+ else:
397
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
398
+ return x
399
+
400
+
401
+ class ResidualCouplingLayer(nn.Module):
402
+ def __init__(
403
+ self,
404
+ channels,
405
+ hidden_channels,
406
+ kernel_size,
407
+ dilation_rate,
408
+ n_layers,
409
+ p_dropout=0,
410
+ gin_channels=0,
411
+ mean_only=False,
412
+ ):
413
+ assert channels % 2 == 0, "channels should be divisible by 2"
414
+ super().__init__()
415
+ self.channels = channels
416
+ self.hidden_channels = hidden_channels
417
+ self.kernel_size = kernel_size
418
+ self.dilation_rate = dilation_rate
419
+ self.n_layers = n_layers
420
+ self.half_channels = channels // 2
421
+ self.mean_only = mean_only
422
+
423
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
424
+ self.enc = WN(
425
+ hidden_channels,
426
+ kernel_size,
427
+ dilation_rate,
428
+ n_layers,
429
+ p_dropout=p_dropout,
430
+ gin_channels=gin_channels,
431
+ )
432
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
433
+ self.post.weight.data.zero_()
434
+ self.post.bias.data.zero_()
435
+
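+ # Affine coupling: split the channels in half and predict a shift (plus a log-scale unless mean_only) for one half from the other, so both directions stay cheap and invertible.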
436
+ def forward(self, x, x_mask, g=None, reverse=False):
437
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
438
+ h = self.pre(x0) * x_mask
439
+ h = self.enc(h, x_mask, g=g)
440
+ stats = self.post(h) * x_mask
441
+ if not self.mean_only:
442
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
443
+ else:
444
+ m = stats
445
+ logs = torch.zeros_like(m)
446
+
447
+ if not reverse:
448
+ x1 = m + x1 * torch.exp(logs) * x_mask
449
+ x = torch.cat([x0, x1], 1)
450
+ logdet = torch.sum(logs, [1, 2])
451
+ return x, logdet
452
+ else:
453
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
454
+ x = torch.cat([x0, x1], 1)
455
+ return x
456
+
457
+
458
+ class ConvFlow(nn.Module):
459
+ def __init__(
460
+ self,
461
+ in_channels,
462
+ filter_channels,
463
+ kernel_size,
464
+ n_layers,
465
+ num_bins=10,
466
+ tail_bound=5.0,
467
+ ):
468
+ super().__init__()
469
+ self.in_channels = in_channels
470
+ self.filter_channels = filter_channels
471
+ self.kernel_size = kernel_size
472
+ self.n_layers = n_layers
473
+ self.num_bins = num_bins
474
+ self.tail_bound = tail_bound
475
+ self.half_channels = in_channels // 2
476
+
477
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
478
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
479
+ self.proj = nn.Conv1d(
480
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
481
+ )
482
+ self.proj.weight.data.zero_()
483
+ self.proj.bias.data.zero_()
484
+
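+ # Coupling flow whose per-element transform is a monotonic rational-quadratic spline instead of an affine map.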
485
+ def forward(self, x, x_mask, g=None, reverse=False):
486
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
487
+ h = self.pre(x0)
488
+ h = self.convs(h, x_mask, g=g)
489
+ h = self.proj(h) * x_mask
490
+
491
+ b, c, t = x0.shape
492
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
493
+
494
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
495
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
496
+ self.filter_channels
497
+ )
498
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
499
+
500
+ x1, logabsdet = piecewise_rational_quadratic_transform(
501
+ x1,
502
+ unnormalized_widths,
503
+ unnormalized_heights,
504
+ unnormalized_derivatives,
505
+ inverse=reverse,
506
+ tails="linear",
507
+ tail_bound=self.tail_bound,
508
+ )
509
+
510
+ x = torch.cat([x0, x1], 1) * x_mask
511
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
512
+ if not reverse:
513
+ return x, logdet
514
+ else:
515
+ return x
516
+
517
+
518
+ class TransformerCouplingLayer(nn.Module):
519
+ def __init__(
520
+ self,
521
+ channels,
522
+ hidden_channels,
523
+ kernel_size,
524
+ n_layers,
525
+ n_heads,
526
+ p_dropout=0,
527
+ filter_channels=0,
528
+ mean_only=False,
529
+ wn_sharing_parameter=None,
530
+ gin_channels=0,
531
+ ):
532
+ assert n_layers == 3, n_layers
533
+ assert channels % 2 == 0, "channels should be divisible by 2"
534
+ super().__init__()
535
+ self.channels = channels
536
+ self.hidden_channels = hidden_channels
537
+ self.kernel_size = kernel_size
538
+ self.n_layers = n_layers
539
+ self.half_channels = channels // 2
540
+ self.mean_only = mean_only
541
+
542
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
543
+ self.enc = (
544
+ Encoder(
545
+ hidden_channels,
546
+ filter_channels,
547
+ n_heads,
548
+ n_layers,
549
+ kernel_size,
550
+ p_dropout,
551
+ isflow=True,
552
+ gin_channels=gin_channels,
553
+ )
554
+ if wn_sharing_parameter is None
555
+ else wn_sharing_parameter
556
+ )
557
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
558
+ self.post.weight.data.zero_()
559
+ self.post.bias.data.zero_()
560
+
561
+ def forward(self, x, x_mask, g=None, reverse=False):
562
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
563
+ h = self.pre(x0) * x_mask
564
+ h = self.enc(h, x_mask, g=g)
565
+ stats = self.post(h) * x_mask
566
+ if not self.mean_only:
567
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
568
+ else:
569
+ m = stats
570
+ logs = torch.zeros_like(m)
571
+
572
+ if not reverse:
573
+ x1 = m + x1 * torch.exp(logs) * x_mask
574
+ x = torch.cat([x0, x1], 1)
575
+ logdet = torch.sum(logs, [1, 2])
576
+ return x, logdet
577
+ else:
578
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
579
+ x = torch.cat([x0, x1], 1)
580
+ return x
581
+
582
+ x1, logabsdet = piecewise_rational_quadratic_transform(
583
+ x1,
584
+ unnormalized_widths,
585
+ unnormalized_heights,
586
+ unnormalized_derivatives,
587
+ inverse=reverse,
588
+ tails="linear",
589
+ tail_bound=self.tail_bound,
590
+ )
591
+
592
+ x = torch.cat([x0, x1], 1) * x_mask
593
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
594
+ if not reverse:
595
+ return x, logdet
596
+ else:
597
+ return x
openvoice_cli/se_extractor.py ADDED
@@ -0,0 +1,152 @@
1
+ import os
3
+ import torch
4
+ import hashlib
5
+ import librosa
6
+ import base64
7
+ from glob import glob
8
+ import numpy as np
9
+ from pydub import AudioSegment
10
+ from faster_whisper import WhisperModel
14
+ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
15
+
16
+ model_size = "medium"
17
+ # Run on GPU with FP16
18
+ model = None
19
+ def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
20
+ global model
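+ # Load the Whisper model lazily on first call and keep it as a module-level singleton.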
21
+ if model is None:
22
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
23
+ audio = AudioSegment.from_file(audio_path)
24
+ max_len = len(audio)
25
+
26
+ target_folder = os.path.join(target_dir, audio_name)
27
+
28
+ segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
29
+ segments = list(segments)
30
+
31
+ # create directory
32
+ os.makedirs(target_folder, exist_ok=True)
33
+ wavs_folder = os.path.join(target_folder, 'wavs')
34
+ os.makedirs(wavs_folder, exist_ok=True)
35
+
36
+ # segments
37
+ s_ind = 0
38
+ start_time = None
39
+
40
+ for k, w in enumerate(segments):
41
+ # take the start time from the first segment
42
+ if k == 0:
43
+ start_time = max(0, w.start)
44
+
45
+ end_time = w.end
46
+
47
+ # calculate confidence
48
+ if len(w.words) > 0:
49
+ confidence = sum([s.probability for s in w.words]) / len(w.words)
50
+ else:
51
+ confidence = 0.
52
+ # clean text
53
+ text = w.text.replace('...', '')
54
+
55
+ # leave 0.08 s of trailing padding for each segment
56
+ audio_seg = audio[int(start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
57
+
58
+ # segment file name
59
+ fname = f"{audio_name}_seg{s_ind}.wav"
60
+
61
+ # keep only segments between 1.5 s and 20 s long whose text has 2-199 characters
62
+ save = audio_seg.duration_seconds > 1.5 and \
63
+ audio_seg.duration_seconds < 20. and \
64
+ len(text) >= 2 and len(text) < 200
65
+
66
+ if save:
67
+ output_file = os.path.join(wavs_folder, fname)
68
+ audio_seg.export(output_file, format='wav')
69
+
70
+ if k < len(segments) - 1:
71
+ start_time = max(0, segments[k+1].start - 0.08)
72
+
73
+ s_ind = s_ind + 1
74
+ return wavs_folder
75
+
76
+
77
+ def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
78
+ SAMPLE_RATE = 16000
79
+ audio_vad = get_audio_tensor(audio_path)
80
+ segments = get_vad_segments(
81
+ audio_vad,
82
+ output_sample=True,
83
+ min_speech_duration=0.1,
84
+ min_silence_duration=1,
85
+ method="silero",
86
+ )
87
+ segments = [(seg["start"], seg["end"]) for seg in segments]
88
+ segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
89
+ print(segments)
90
+ audio_active = AudioSegment.silent(duration=0)
91
+ audio = AudioSegment.from_file(audio_path)
92
+
93
+ for start_time, end_time in segments:
94
+ audio_active += audio[int(start_time * 1000) : int(end_time * 1000)]
95
+
96
+ audio_dur = audio_active.duration_seconds
97
+ print(f'after vad: dur = {audio_dur}')
98
+ target_folder = os.path.join(target_dir, audio_name)
99
+ wavs_folder = os.path.join(target_folder, 'wavs')
100
+ os.makedirs(wavs_folder, exist_ok=True)
101
+ start_time = 0.
102
+ count = 0
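+ # Cut the concatenated speech into num_splits chunks of roughly split_seconds each.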
103
+ num_splits = int(np.round(audio_dur / split_seconds))
104
+ assert num_splits > 0, 'input audio is too short'
105
+ interval = audio_dur / num_splits
106
+
107
+ for i in range(num_splits):
108
+ end_time = min(start_time + interval, audio_dur)
109
+ if i == num_splits - 1:
110
+ end_time = audio_dur
111
+ output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
112
+ audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
113
+ audio_seg.export(output_file, format='wav')
114
+ start_time = end_time
115
+ count += 1
116
+ return wavs_folder
117
+
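+ # Content hash of the decoded waveform; gives each input audio a stable cache-directory name.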
118
+ def hash_numpy_array(audio_path):
119
+ array, _ = librosa.load(audio_path, sr=None, mono=True)
120
+ # Convert the array to bytes
121
+ array_bytes = array.tobytes()
122
+ # Calculate the hash of the array bytes
123
+ hash_object = hashlib.sha256(array_bytes)
124
+ hash_value = hash_object.digest()
125
+ # Convert the hash value to base64
126
+ base64_value = base64.b64encode(hash_value)
127
+ return base64_value.decode('utf-8')[:16].replace('/', '_^')
128
+
129
+ def get_se(audio_path, vc_model, target_dir='processed', vad=True):
130
+ device = vc_model.device
131
+ version = vc_model.version
132
+ print("OpenVoice version:", version)
133
+
134
+ audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
135
+ se_path = os.path.join(target_dir, audio_name, 'se.pth')
136
+
137
+ # if os.path.isfile(se_path):
138
+ # se = torch.load(se_path).to(device)
139
+ # return se, audio_name
140
+ # if os.path.isdir(audio_path):
141
+ # wavs_folder = audio_path
142
+
143
+ if vad:
144
+ wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
145
+ else:
146
+ wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
147
+
148
+ audio_segs = glob(f'{wavs_folder}/*.wav')
149
+ if len(audio_segs) == 0:
150
+ raise RuntimeError('No audio segments found!')
151
+
152
+ return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
openvoice_cli/transforms.py ADDED
@@ -0,0 +1,209 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import numpy as np
5
+
6
+
7
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
8
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
9
+ DEFAULT_MIN_DERIVATIVE = 1e-3
10
+
11
+
12
+ def piecewise_rational_quadratic_transform(
13
+ inputs,
14
+ unnormalized_widths,
15
+ unnormalized_heights,
16
+ unnormalized_derivatives,
17
+ inverse=False,
18
+ tails=None,
19
+ tail_bound=1.0,
20
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
23
+ ):
24
+ if tails is None:
25
+ spline_fn = rational_quadratic_spline
26
+ spline_kwargs = {}
27
+ else:
28
+ spline_fn = unconstrained_rational_quadratic_spline
29
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30
+
31
+ outputs, logabsdet = spline_fn(
32
+ inputs=inputs,
33
+ unnormalized_widths=unnormalized_widths,
34
+ unnormalized_heights=unnormalized_heights,
35
+ unnormalized_derivatives=unnormalized_derivatives,
36
+ inverse=inverse,
37
+ min_bin_width=min_bin_width,
38
+ min_bin_height=min_bin_height,
39
+ min_derivative=min_derivative,
40
+ **spline_kwargs
41
+ )
42
+ return outputs, logabsdet
43
+
44
+
45
+ def searchsorted(bin_locations, inputs, eps=1e-6):
46
+ bin_locations[..., -1] += eps
47
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48
+
49
+
50
+ def unconstrained_rational_quadratic_spline(
51
+ inputs,
52
+ unnormalized_widths,
53
+ unnormalized_heights,
54
+ unnormalized_derivatives,
55
+ inverse=False,
56
+ tails="linear",
57
+ tail_bound=1.0,
58
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
61
+ ):
62
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63
+ outside_interval_mask = ~inside_interval_mask
64
+
65
+ outputs = torch.zeros_like(inputs)
66
+ logabsdet = torch.zeros_like(inputs)
67
+
68
+ if tails == "linear":
69
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70
+ constant = np.log(np.exp(1 - min_derivative) - 1)
71
+ unnormalized_derivatives[..., 0] = constant
72
+ unnormalized_derivatives[..., -1] = constant
73
+
74
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
75
+ logabsdet[outside_interval_mask] = 0
76
+ else:
77
+ raise RuntimeError("{} tails are not implemented.".format(tails))
78
+
79
+ (
80
+ outputs[inside_interval_mask],
81
+ logabsdet[inside_interval_mask],
82
+ ) = rational_quadratic_spline(
83
+ inputs=inputs[inside_interval_mask],
84
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87
+ inverse=inverse,
88
+ left=-tail_bound,
89
+ right=tail_bound,
90
+ bottom=-tail_bound,
91
+ top=tail_bound,
92
+ min_bin_width=min_bin_width,
93
+ min_bin_height=min_bin_height,
94
+ min_derivative=min_derivative,
95
+ )
96
+
97
+ return outputs, logabsdet
98
+
99
+
100
+ def rational_quadratic_spline(
101
+ inputs,
102
+ unnormalized_widths,
103
+ unnormalized_heights,
104
+ unnormalized_derivatives,
105
+ inverse=False,
106
+ left=0.0,
107
+ right=1.0,
108
+ bottom=0.0,
109
+ top=1.0,
110
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
111
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
112
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
113
+ ):
114
+ if torch.min(inputs) < left or torch.max(inputs) > right:
115
+ raise ValueError("Input to a transform is not within its domain")
116
+
117
+ num_bins = unnormalized_widths.shape[-1]
118
+
119
+ if min_bin_width * num_bins > 1.0:
120
+ raise ValueError("Minimal bin width too large for the number of bins")
121
+ if min_bin_height * num_bins > 1.0:
122
+ raise ValueError("Minimal bin height too large for the number of bins")
123
+
124
+ widths = F.softmax(unnormalized_widths, dim=-1)
125
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
126
+ cumwidths = torch.cumsum(widths, dim=-1)
127
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
128
+ cumwidths = (right - left) * cumwidths + left
129
+ cumwidths[..., 0] = left
130
+ cumwidths[..., -1] = right
131
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
132
+
133
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
134
+
135
+ heights = F.softmax(unnormalized_heights, dim=-1)
136
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
137
+ cumheights = torch.cumsum(heights, dim=-1)
138
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
139
+ cumheights = (top - bottom) * cumheights + bottom
140
+ cumheights[..., 0] = bottom
141
+ cumheights[..., -1] = top
142
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
143
+
144
+ if inverse:
145
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
146
+ else:
147
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
148
+
149
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
150
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
151
+
152
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
153
+ delta = heights / widths
154
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
155
+
156
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
157
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
158
+
159
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
160
+
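+ # Inverting a rational-quadratic segment reduces to a quadratic a*t^2 + b*t + c = 0 in the bin-relative position t; the numerically stable root 2c / (-b - sqrt(disc)) is taken.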
161
+ if inverse:
162
+ a = (inputs - input_cumheights) * (
163
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
164
+ ) + input_heights * (input_delta - input_derivatives)
165
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
166
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
167
+ )
168
+ c = -input_delta * (inputs - input_cumheights)
169
+
170
+ discriminant = b.pow(2) - 4 * a * c
171
+ assert (discriminant >= 0).all()
172
+
173
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
174
+ outputs = root * input_bin_widths + input_cumwidths
175
+
176
+ theta_one_minus_theta = root * (1 - root)
177
+ denominator = input_delta + (
178
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
179
+ * theta_one_minus_theta
180
+ )
181
+ derivative_numerator = input_delta.pow(2) * (
182
+ input_derivatives_plus_one * root.pow(2)
183
+ + 2 * input_delta * theta_one_minus_theta
184
+ + input_derivatives * (1 - root).pow(2)
185
+ )
186
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
187
+
188
+ return outputs, -logabsdet
189
+ else:
190
+ theta = (inputs - input_cumwidths) / input_bin_widths
191
+ theta_one_minus_theta = theta * (1 - theta)
192
+
193
+ numerator = input_heights * (
194
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
195
+ )
196
+ denominator = input_delta + (
197
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
198
+ * theta_one_minus_theta
199
+ )
200
+ outputs = input_cumheights + numerator / denominator
201
+
202
+ derivative_numerator = input_delta.pow(2) * (
203
+ input_derivatives_plus_one * theta.pow(2)
204
+ + 2 * input_delta * theta_one_minus_theta
205
+ + input_derivatives * (1 - theta).pow(2)
206
+ )
207
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
208
+
209
+ return outputs, logabsdet
openvoice_cli/utils.py ADDED
@@ -0,0 +1,194 @@
1
+ import re
2
+ import json
3
+ import numpy as np
4
+
5
+
6
+ def get_hparams_from_file(config_path):
7
+ with open(config_path, "r", encoding="utf-8") as f:
8
+ data = f.read()
9
+ config = json.loads(data)
10
+
11
+ hparams = HParams(**config)
12
+ return hparams
13
+
14
+ class HParams:
15
+ def __init__(self, **kwargs):
16
+ for k, v in kwargs.items():
17
+ if isinstance(v, dict):
18
+ v = HParams(**v)
19
+ self[k] = v
20
+
21
+ def keys(self):
22
+ return self.__dict__.keys()
23
+
24
+ def items(self):
25
+ return self.__dict__.items()
26
+
27
+ def values(self):
28
+ return self.__dict__.values()
29
+
30
+ def __len__(self):
31
+ return len(self.__dict__)
32
+
33
+ def __getitem__(self, key):
34
+ return getattr(self, key)
35
+
36
+ def __setitem__(self, key, value):
37
+ return setattr(self, key, value)
38
+
39
+ def __contains__(self, key):
40
+ return key in self.__dict__
41
+
42
+ def __repr__(self):
43
+ return self.__dict__.__repr__()
44
+
45
+
46
+ def string_to_bits(string, pad_len=8):
47
+ # Convert each character to its ASCII value
48
+ ascii_values = [ord(char) for char in string]
49
+
50
+ # Convert ASCII values to binary representation
51
+ binary_values = [bin(value)[2:].zfill(8) for value in ascii_values]
52
+
53
+ # Convert binary strings to integer arrays
54
+ bit_arrays = [[int(bit) for bit in binary] for binary in binary_values]
55
+
56
+ # Convert list of arrays to NumPy array
57
+ numpy_array = np.array(bit_arrays)
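+ # Pad (or truncate) to pad_len rows; unused rows default to 00100000, i.e. an ASCII space.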
58
+ numpy_array_full = np.zeros((pad_len, 8), dtype=numpy_array.dtype)
59
+ numpy_array_full[:, 2] = 1
60
+ max_len = min(pad_len, len(numpy_array))
61
+ numpy_array_full[:max_len] = numpy_array[:max_len]
62
+ return numpy_array_full
63
+
64
+
65
+ def bits_to_string(bits_array):
66
+ # Convert each row of the array to a binary string
67
+ binary_values = [''.join(str(bit) for bit in row) for row in bits_array]
68
+
69
+ # Convert binary strings to ASCII values
70
+ ascii_values = [int(binary, 2) for binary in binary_values]
71
+
72
+ # Convert ASCII values to characters
73
+ output_string = ''.join(chr(value) for value in ascii_values)
74
+
75
+ return output_string
76
+
77
+
78
+ def split_sentence(text, min_len=10, language_str='[EN]'):
79
+ if language_str in ['EN']:
80
+ sentences = split_sentences_latin(text, min_len=min_len)
81
+ else:
82
+ sentences = split_sentences_zh(text, min_len=min_len)
83
+ return sentences
84
+
85
+ def split_sentences_latin(text, min_len=10):
86
+ """Split Long sentences into list of short ones
87
+
88
+ Args:
89
+ text (str): input text.
90
+
91
+ Returns:
92
+ List[str]: list of output sentences.
93
+ """
94
+ # deal with dirty sentences
95
+ text = re.sub('[。!?;]', '.', text)
96
+ text = re.sub('[,]', ',', text)
97
+ text = re.sub('[“”]', '"', text)
98
+ text = re.sub('[‘’]', "'", text)
99
+ text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
100
+ text = re.sub('[\n\t ]+', ' ', text)
101
+ text = re.sub('([,.!?;])', r'\1 $#!', text)
102
+ # split
103
+ sentences = [s.strip() for s in text.split('$#!')]
104
+ if len(sentences[-1]) == 0: del sentences[-1]
105
+
106
+ new_sentences = []
107
+ new_sent = []
108
+ count_len = 0
109
+ for ind, sent in enumerate(sentences):
110
+ # print(sent)
111
+ new_sent.append(sent)
112
+ count_len += len(sent.split(" "))
113
+ if count_len > min_len or ind == len(sentences) - 1:
114
+ count_len = 0
115
+ new_sentences.append(' '.join(new_sent))
116
+ new_sent = []
117
+ return merge_short_sentences_latin(new_sentences)
118
+
119
+
120
+ def merge_short_sentences_latin(sens):
121
+ """Avoid short sentences by merging them with the following sentence.
122
+
123
+ Args:
124
+ List[str]: list of input sentences.
125
+
126
+ Returns:
127
+ List[str]: list of output sentences.
128
+ """
129
+ sens_out = []
130
+ for s in sens:
131
+ # If the previous sentence is too short, merge them with
132
+ # the current sentence.
133
+ if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
134
+ sens_out[-1] = sens_out[-1] + " " + s
135
+ else:
136
+ sens_out.append(s)
137
+ try:
138
+ if len(sens_out[-1].split(" ")) <= 2:
139
+ sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
140
+ sens_out.pop(-1)
141
+ except:
142
+ pass
143
+ return sens_out
144
+
145
+ def split_sentences_zh(text, min_len=10):
146
+ text = re.sub('[。!?;]', '.', text)
147
+ text = re.sub('[,]', ',', text)
148
+ # replace newlines, tabs, and runs of spaces with a single space
149
+ text = re.sub('[\n\t ]+', ' ', text)
150
+ # insert a split marker after each punctuation mark
151
+ text = re.sub('([,.!?;])', r'\1 $#!', text)
152
+ # split into sentences and strip surrounding whitespace
153
+ # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)]
154
+ sentences = [s.strip() for s in text.split('$#!')]
155
+ if len(sentences[-1]) == 0: del sentences[-1]
156
+
157
+ new_sentences = []
158
+ new_sent = []
159
+ count_len = 0
160
+ for ind, sent in enumerate(sentences):
161
+ new_sent.append(sent)
162
+ count_len += len(sent)
163
+ if count_len > min_len or ind == len(sentences) - 1:
164
+ count_len = 0
165
+ new_sentences.append(' '.join(new_sent))
166
+ new_sent = []
167
+ return merge_short_sentences_zh(new_sentences)
168
+
169
+
170
+ def merge_short_sentences_zh(sens):
171
+ # return sens
172
+ """Avoid short sentences by merging them with the following sentence.
173
+
174
+ Args:
175
+ List[str]: list of input sentences.
176
+
177
+ Returns:
178
+ List[str]: list of output sentences.
179
+ """
180
+ sens_out = []
181
+ for s in sens:
182
+ # If the previous sentense is too short, merge them with
183
+ # the current sentence.
184
+ if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
185
+ sens_out[-1] = sens_out[-1] + " " + s
186
+ else:
187
+ sens_out.append(s)
188
+ try:
189
+ if len(sens_out[-1]) <= 2:
190
+ sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
191
+ sens_out.pop(-1)
192
+ except:
193
+ pass
194
+ return sens_out