Adam3 committed on
Commit 8d13132 · verified · 1 Parent(s): 3395174

Upload 20 files

Files changed (20)
  1. F0Extractor.py +100 -0
  2. FCPE.py +920 -0
  3. RMVPE.py +560 -0
  4. __init__.py +1 -0
  5. attentions.py +243 -0
  6. commons.py +207 -0
  7. config.py +159 -0
  8. core.py +1519 -0
  9. discriminators.py +160 -0
  10. encoders.py +218 -0
  11. generators.py +231 -0
  12. infer.py +495 -0
  13. modules.py +124 -0
  14. normalization.py +31 -0
  15. nsf.py +196 -0
  16. pipeline.py +708 -0
  17. residuals.py +250 -0
  18. synthesizers.py +237 -0
  19. utils.py +137 -0
  20. zluda.py +43 -0
F0Extractor.py ADDED
@@ -0,0 +1,100 @@
+ import dataclasses
+ import pathlib
+ import libf0
+ import librosa
+ import numpy as np
+ import resampy
+ import torch
+ import torchcrepe
+ import torchfcpe
+ import os
+
+ # from tools.anyf0.rmvpe import RMVPE
+ from rvc_cli.rvc.lib.predictors.RMVPE import RMVPE0Predictor
+ from rvc_cli.rvc.configs.config import Config
+
+ config = Config()
+
+
+ @dataclasses.dataclass
+ class F0Extractor:
+     wav_path: pathlib.Path
+     sample_rate: int = 44100
+     hop_length: int = 512
+     f0_min: int = 50
+     f0_max: int = 1600
+     method: str = "rmvpe"
+     x: np.ndarray = dataclasses.field(init=False)
+
+     def __post_init__(self):
+         self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)
+
+     @property
+     def hop_size(self) -> float:
+         return self.hop_length / self.sample_rate
+
+     @property
+     def wav16k(self) -> np.ndarray:
+         return resampy.resample(self.x, self.sample_rate, 16000)
+
+     def extract_f0(self) -> np.ndarray:
+         f0 = None
+         method = self.method
+         if method == "crepe":
+             wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device)
+             f0 = torchcrepe.predict(
+                 wav16k_torch,
+                 sample_rate=16000,
+                 hop_length=160,
+                 batch_size=512,
+                 fmin=self.f0_min,
+                 fmax=self.f0_max,
+                 device=config.device,
+             )
+             f0 = f0[0].cpu().numpy()
+         elif method == "fcpe":
+             audio = librosa.to_mono(self.x)
+             audio_length = len(audio)
+             f0_target_length = (audio_length // self.hop_length) + 1
+             audio = (
+                 torch.from_numpy(audio)
+                 .float()
+                 .unsqueeze(0)
+                 .unsqueeze(-1)
+                 .to(config.device)
+             )
+             model = torchfcpe.spawn_bundled_infer_model(device=config.device)
+
+             f0 = model.infer(
+                 audio,
+                 sr=self.sample_rate,
+                 decoder_mode="local_argmax",
+                 threshold=0.006,
+                 f0_min=self.f0_min,
+                 f0_max=self.f0_max,
+                 interp_uv=False,
+                 output_interp_target_length=f0_target_length,
+             )
+             f0 = f0.squeeze().cpu().numpy()
+         elif method == "rmvpe":
+             model_rmvpe = RMVPE0Predictor(
+                 os.path.join("rvc_cli", "rvc", "models", "predictors", "rmvpe.pt"),
+                 is_half=config.is_half,
+                 device=config.device,
+                 # hop_length=80
+             )
+             f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)
+
+         else:
+             raise ValueError(f"Unknown method: {self.method}")
+         return libf0.hz_to_cents(f0, librosa.midi_to_hz(0))
+
+     def plot_f0(self, f0):
+         from matplotlib import pyplot as plt
+
+         plt.figure(figsize=(10, 4))
+         plt.plot(f0)
+         plt.title(self.method)
+         plt.xlabel("Time (frames)")
+         plt.ylabel("F0 (cents)")
+         plt.show()
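As a usage sketch (editorial addition, not part of the commit: the wav filename is hypothetical, and the "rmvpe" method expects the checkpoint at rvc_cli/rvc/models/predictors/rmvpe.pt plus a working Config), the class could be driven like this:

import pathlib
from F0Extractor import F0Extractor  # assumes the file above is importable under this name

extractor = F0Extractor(wav_path=pathlib.Path("example.wav"), method="rmvpe")  # hypothetical input file
f0_cents = extractor.extract_f0()  # F0 contour in cents relative to MIDI note 0 (~8.18 Hz)
extractor.plot_f0(f0_cents)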
FCPE.py ADDED
@@ -0,0 +1,920 @@
1
+ from typing import Union
2
+
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ from torchaudio.transforms import Resample
9
+ import os
10
+ import librosa
11
+ import soundfile as sf
12
+ import torch.utils.data
13
+ from librosa.filters import mel as librosa_mel_fn
14
+ import math
15
+ from functools import partial
16
+
17
+ from einops import rearrange, repeat
18
+ from local_attention import LocalAttention
19
+ from torch import nn
20
+
21
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
22
+
23
+
24
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
25
+ """Loads wav file to torch tensor."""
26
+ try:
27
+ data, sample_rate = sf.read(full_path, always_2d=True)
28
+ except Exception as error:
29
+ print(f"An error occurred loading {full_path}: {error}")
30
+ if return_empty_on_exception:
31
+ return [], sample_rate or target_sr or 48000
32
+ else:
33
+ raise
34
+
35
+ data = data[:, 0] if len(data.shape) > 1 else data
36
+ assert len(data) > 2
37
+
38
+ # Normalize data
39
+ max_mag = (
40
+ -np.iinfo(data.dtype).min
41
+ if np.issubdtype(data.dtype, np.integer)
42
+ else max(np.amax(data), -np.amin(data))
43
+ )
44
+ max_mag = (
45
+ (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
46
+ )
47
+ data = torch.FloatTensor(data.astype(np.float32)) / max_mag
48
+
49
+ # Handle exceptions and resample
50
+ if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
51
+ return [], sample_rate or target_sr or 48000
52
+ if target_sr is not None and sample_rate != target_sr:
53
+ data = torch.from_numpy(
54
+ librosa.core.resample(
55
+ data.numpy(), orig_sr=sample_rate, target_sr=target_sr
56
+ )
57
+ )
58
+ sample_rate = target_sr
59
+
60
+ return data, sample_rate
61
+
62
+
63
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
64
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
65
+
66
+
67
+ def dynamic_range_decompression(x, C=1):
68
+ return np.exp(x) / C
69
+
70
+
71
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
72
+ return torch.log(torch.clamp(x, min=clip_val) * C)
73
+
74
+
75
+ def dynamic_range_decompression_torch(x, C=1):
76
+ return torch.exp(x) / C
77
+
78
+
79
+ class STFT:
80
+ def __init__(
81
+ self,
82
+ sr=22050,
83
+ n_mels=80,
84
+ n_fft=1024,
85
+ win_size=1024,
86
+ hop_length=256,
87
+ fmin=20,
88
+ fmax=11025,
89
+ clip_val=1e-5,
90
+ ):
91
+ self.target_sr = sr
92
+ self.n_mels = n_mels
93
+ self.n_fft = n_fft
94
+ self.win_size = win_size
95
+ self.hop_length = hop_length
96
+ self.fmin = fmin
97
+ self.fmax = fmax
98
+ self.clip_val = clip_val
99
+ self.mel_basis = {}
100
+ self.hann_window = {}
101
+
102
+ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
103
+ sample_rate = self.target_sr
104
+ n_mels = self.n_mels
105
+ n_fft = self.n_fft
106
+ win_size = self.win_size
107
+ hop_length = self.hop_length
108
+ fmin = self.fmin
109
+ fmax = self.fmax
110
+ clip_val = self.clip_val
111
+
112
+ factor = 2 ** (keyshift / 12)
113
+ n_fft_new = int(np.round(n_fft * factor))
114
+ win_size_new = int(np.round(win_size * factor))
115
+ hop_length_new = int(np.round(hop_length * speed))
116
+
117
+ # Optimize mel_basis and hann_window caching
118
+ mel_basis = self.mel_basis if not train else {}
119
+ hann_window = self.hann_window if not train else {}
120
+
121
+ mel_basis_key = str(fmax) + "_" + str(y.device)
122
+ if mel_basis_key not in mel_basis:
123
+ mel = librosa_mel_fn(
124
+ sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
125
+ )
126
+ mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
127
+
128
+ keyshift_key = str(keyshift) + "_" + str(y.device)
129
+ if keyshift_key not in hann_window:
130
+ hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
131
+
132
+ # Padding and STFT
133
+ pad_left = (win_size_new - hop_length_new) // 2
134
+ pad_right = max(
135
+ (win_size_new - hop_length_new + 1) // 2,
136
+ win_size_new - y.size(-1) - pad_left,
137
+ )
138
+ mode = "reflect" if pad_right < y.size(-1) else "constant"
139
+ y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
140
+ y = y.squeeze(1)
141
+
142
+ spec = torch.stft(
143
+ y,
144
+ n_fft_new,
145
+ hop_length=hop_length_new,
146
+ win_length=win_size_new,
147
+ window=hann_window[keyshift_key],
148
+ center=center,
149
+ pad_mode="reflect",
150
+ normalized=False,
151
+ onesided=True,
152
+ return_complex=True,
153
+ )
154
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
155
+
156
+ # Handle keyshift and mel conversion
157
+ if keyshift != 0:
158
+ size = n_fft // 2 + 1
159
+ resize = spec.size(1)
160
+ spec = (
161
+ F.pad(spec, (0, 0, 0, size - resize))
162
+ if resize < size
163
+ else spec[:, :size, :]
164
+ )
165
+ spec = spec * win_size / win_size_new
166
+ spec = torch.matmul(mel_basis[mel_basis_key], spec)
167
+ spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
168
+ return spec
169
+
170
+ def __call__(self, audiopath):
171
+ audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
172
+ spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
173
+ return spect
174
+
175
+
176
+ stft = STFT()
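+ # Usage sketch (editorial assumption, not part of the upload): `stft("voice.wav")` loads the
+ # file, resamples it to the 22050 Hz target rate, and returns a log-mel spectrogram of shape
+ # (n_mels, n_frames) = (80, T) with the default constructor arguments above.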
177
+
178
+
179
+ def softmax_kernel(
180
+ data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
181
+ ):
182
+ b, h, *_ = data.shape
183
+
184
+ # Normalize data
185
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
186
+
187
+ # Project data
188
+ ratio = projection_matrix.shape[0] ** -0.5
189
+ projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
190
+ projection = projection.type_as(data)
191
+ data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)
192
+
193
+ # Calculate diagonal data
194
+ diag_data = data**2
195
+ diag_data = torch.sum(diag_data, dim=-1)
196
+ diag_data = (diag_data / 2.0) * (data_normalizer**2)
197
+ diag_data = diag_data.unsqueeze(dim=-1)
198
+
199
+ # Apply softmax
200
+ if is_query:
201
+ data_dash = ratio * (
202
+ torch.exp(
203
+ data_dash
204
+ - diag_data
205
+ - torch.max(data_dash, dim=-1, keepdim=True).values
206
+ )
207
+ + eps
208
+ )
209
+ else:
210
+ data_dash = ratio * (torch.exp(data_dash - diag_data + eps))
211
+
212
+ return data_dash.type_as(data)
213
+
214
+
215
+ def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
216
+ unstructured_block = torch.randn((cols, cols), device=device)
217
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
218
+ q, r = map(lambda t: t.to(device), (q, r))
219
+
220
+ if qr_uniform_q:
221
+ d = torch.diag(r, 0)
222
+ q *= d.sign()
223
+ return q.t()
224
+
225
+
226
+ def exists(val):
227
+ return val is not None
228
+
229
+
230
+ def empty(tensor):
231
+ return tensor.numel() == 0
232
+
233
+
234
+ def default(val, d):
235
+ return val if exists(val) else d
236
+
237
+
238
+ def cast_tuple(val):
239
+ return (val,) if not isinstance(val, tuple) else val
240
+
241
+
242
+ class PCmer(nn.Module):
243
+ def __init__(
244
+ self,
245
+ num_layers,
246
+ num_heads,
247
+ dim_model,
248
+ dim_keys,
249
+ dim_values,
250
+ residual_dropout,
251
+ attention_dropout,
252
+ ):
253
+ super().__init__()
254
+ self.num_layers = num_layers
255
+ self.num_heads = num_heads
256
+ self.dim_model = dim_model
257
+ self.dim_values = dim_values
258
+ self.dim_keys = dim_keys
259
+ self.residual_dropout = residual_dropout
260
+ self.attention_dropout = attention_dropout
261
+
262
+ self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
263
+
264
+ def forward(self, phone, mask=None):
265
+ for layer in self._layers:
266
+ phone = layer(phone, mask)
267
+ return phone
268
+
269
+
270
+ class _EncoderLayer(nn.Module):
271
+ def __init__(self, parent: PCmer):
272
+ super().__init__()
273
+ self.conformer = ConformerConvModule(parent.dim_model)
274
+ self.norm = nn.LayerNorm(parent.dim_model)
275
+ self.dropout = nn.Dropout(parent.residual_dropout)
276
+ self.attn = SelfAttention(
277
+ dim=parent.dim_model, heads=parent.num_heads, causal=False
278
+ )
279
+
280
+ def forward(self, phone, mask=None):
281
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
282
+ phone = phone + (self.conformer(phone))
283
+ return phone
284
+
285
+
286
+ def calc_same_padding(kernel_size):
287
+ pad = kernel_size // 2
288
+ return (pad, pad - (kernel_size + 1) % 2)
289
+
290
+
291
+ class Swish(nn.Module):
292
+ def forward(self, x):
293
+ return x * x.sigmoid()
294
+
295
+
296
+ class Transpose(nn.Module):
297
+ def __init__(self, dims):
298
+ super().__init__()
299
+ assert len(dims) == 2, "dims must be a tuple of two dimensions"
300
+ self.dims = dims
301
+
302
+ def forward(self, x):
303
+ return x.transpose(*self.dims)
304
+
305
+
306
+ class GLU(nn.Module):
307
+ def __init__(self, dim):
308
+ super().__init__()
309
+ self.dim = dim
310
+
311
+ def forward(self, x):
312
+ out, gate = x.chunk(2, dim=self.dim)
313
+ return out * gate.sigmoid()
314
+
315
+
316
+ class DepthWiseConv1d(nn.Module):
317
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
318
+ super().__init__()
319
+ self.padding = padding
320
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
321
+
322
+ def forward(self, x):
323
+ x = F.pad(x, self.padding)
324
+ return self.conv(x)
325
+
326
+
327
+ class ConformerConvModule(nn.Module):
328
+ def __init__(
329
+ self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
330
+ ):
331
+ super().__init__()
332
+
333
+ inner_dim = dim * expansion_factor
334
+ padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
335
+
336
+ self.net = nn.Sequential(
337
+ nn.LayerNorm(dim),
338
+ Transpose((1, 2)),
339
+ nn.Conv1d(dim, inner_dim * 2, 1),
340
+ GLU(dim=1),
341
+ DepthWiseConv1d(
342
+ inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
343
+ ),
344
+ Swish(),
345
+ nn.Conv1d(inner_dim, dim, 1),
346
+ Transpose((1, 2)),
347
+ nn.Dropout(dropout),
348
+ )
349
+
350
+ def forward(self, x):
351
+ return self.net(x)
352
+
353
+
354
+ def linear_attention(q, k, v):
355
+ if v is None:
356
+ out = torch.einsum("...ed,...nd->...ne", k, q)
357
+ return out
358
+ else:
359
+ k_cumsum = k.sum(dim=-2)
360
+ D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
361
+ context = torch.einsum("...nd,...ne->...de", k, v)
362
+ out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
363
+ return out
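+ # Editorial note: this is the FAVOR+/Performer linear-attention form. With the softmax-kernel
+ # features applied to q and k beforehand, attention is computed in O(N) time through the small
+ # (d x e) context matrix rather than the full (N x N) attention matrix.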
364
+
365
+
366
+ def gaussian_orthogonal_random_matrix(
367
+ nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
368
+ ):
369
+ nb_full_blocks = int(nb_rows / nb_columns)
370
+ block_list = []
371
+
372
+ for _ in range(nb_full_blocks):
373
+ q = orthogonal_matrix_chunk(
374
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
375
+ )
376
+ block_list.append(q)
377
+
378
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
379
+ if remaining_rows > 0:
380
+ q = orthogonal_matrix_chunk(
381
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
382
+ )
383
+ block_list.append(q[:remaining_rows])
384
+
385
+ final_matrix = torch.cat(block_list)
386
+
387
+ if scaling == 0:
388
+ multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
389
+ elif scaling == 1:
390
+ multiplier = math.sqrt((float(nb_columns))) * torch.ones(
391
+ (nb_rows,), device=device
392
+ )
393
+ else:
394
+ raise ValueError(f"Invalid scaling {scaling}")
395
+
396
+ return torch.diag(multiplier) @ final_matrix
397
+
398
+
399
+ class FastAttention(nn.Module):
400
+ def __init__(
401
+ self,
402
+ dim_heads,
403
+ nb_features=None,
404
+ ortho_scaling=0,
405
+ causal=False,
406
+ generalized_attention=False,
407
+ kernel_fn=nn.ReLU(),
408
+ qr_uniform_q=False,
409
+ no_projection=False,
410
+ ):
411
+ super().__init__()
412
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
413
+
414
+ self.dim_heads = dim_heads
415
+ self.nb_features = nb_features
416
+ self.ortho_scaling = ortho_scaling
417
+
418
+ self.create_projection = partial(
419
+ gaussian_orthogonal_random_matrix,
420
+ nb_rows=self.nb_features,
421
+ nb_columns=dim_heads,
422
+ scaling=ortho_scaling,
423
+ qr_uniform_q=qr_uniform_q,
424
+ )
425
+ projection_matrix = self.create_projection()
426
+ self.register_buffer("projection_matrix", projection_matrix)
427
+
428
+ self.generalized_attention = generalized_attention
429
+ self.kernel_fn = kernel_fn
430
+ self.no_projection = no_projection
431
+ self.causal = causal
432
+
433
+ @torch.no_grad()
434
+ def redraw_projection_matrix(self):
435
+ projections = self.create_projection()
436
+ self.projection_matrix.copy_(projections)
437
+ del projections
438
+
439
+ def forward(self, q, k, v):
440
+ device = q.device
441
+
442
+ if self.no_projection:
443
+ q = q.softmax(dim=-1)
444
+ k = torch.exp(k) if self.causal else k.softmax(dim=-2)
445
+ else:
446
+ create_kernel = partial(
447
+ softmax_kernel, projection_matrix=self.projection_matrix, device=device
448
+ )
449
+ q = create_kernel(q, is_query=True)
450
+ k = create_kernel(k, is_query=False)
451
+
452
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn
453
+
454
+ if v is None:
455
+ out = attn_fn(q, k, None)
456
+ return out
457
+ else:
458
+ out = attn_fn(q, k, v)
459
+ return out
460
+
461
+
462
+ class SelfAttention(nn.Module):
463
+ def __init__(
464
+ self,
465
+ dim,
466
+ causal=False,
467
+ heads=8,
468
+ dim_head=64,
469
+ local_heads=0,
470
+ local_window_size=256,
471
+ nb_features=None,
472
+ feature_redraw_interval=1000,
473
+ generalized_attention=False,
474
+ kernel_fn=nn.ReLU(),
475
+ qr_uniform_q=False,
476
+ dropout=0.0,
477
+ no_projection=False,
478
+ ):
479
+ super().__init__()
480
+ assert dim % heads == 0, "dimension must be divisible by number of heads"
481
+ dim_head = default(dim_head, dim // heads)
482
+ inner_dim = dim_head * heads
483
+ self.fast_attention = FastAttention(
484
+ dim_head,
485
+ nb_features,
486
+ causal=causal,
487
+ generalized_attention=generalized_attention,
488
+ kernel_fn=kernel_fn,
489
+ qr_uniform_q=qr_uniform_q,
490
+ no_projection=no_projection,
491
+ )
492
+
493
+ self.heads = heads
494
+ self.global_heads = heads - local_heads
495
+ self.local_attn = (
496
+ LocalAttention(
497
+ window_size=local_window_size,
498
+ causal=causal,
499
+ autopad=True,
500
+ dropout=dropout,
501
+ look_forward=int(not causal),
502
+ rel_pos_emb_config=(dim_head, local_heads),
503
+ )
504
+ if local_heads > 0
505
+ else None
506
+ )
507
+
508
+ self.to_q = nn.Linear(dim, inner_dim)
509
+ self.to_k = nn.Linear(dim, inner_dim)
510
+ self.to_v = nn.Linear(dim, inner_dim)
511
+ self.to_out = nn.Linear(inner_dim, dim)
512
+ self.dropout = nn.Dropout(dropout)
513
+
514
+ @torch.no_grad()
515
+ def redraw_projection_matrix(self):
516
+ self.fast_attention.redraw_projection_matrix()
517
+
518
+ def forward(
519
+ self,
520
+ x,
521
+ context=None,
522
+ mask=None,
523
+ context_mask=None,
524
+ name=None,
525
+ inference=False,
526
+ **kwargs,
527
+ ):
528
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
529
+
530
+ cross_attend = exists(context)
531
+ context = default(context, x)
532
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
533
+ q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
534
+
535
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
536
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
537
+
538
+ attn_outs = []
539
+ if not empty(q):
540
+ if exists(context_mask):
541
+ global_mask = context_mask[:, None, :, None]
542
+ v.masked_fill_(~global_mask, 0.0)
543
+ if cross_attend:
544
+ pass # TODO: Implement cross-attention
545
+ else:
546
+ out = self.fast_attention(q, k, v)
547
+ attn_outs.append(out)
548
+
549
+ if not empty(lq):
550
+ assert (
551
+ not cross_attend
552
+ ), "local attention is not compatible with cross attention"
553
+ out = self.local_attn(lq, lk, lv, input_mask=mask)
554
+ attn_outs.append(out)
555
+
556
+ out = torch.cat(attn_outs, dim=1)
557
+ out = rearrange(out, "b h n d -> b n (h d)")
558
+ out = self.to_out(out)
559
+ return self.dropout(out)
560
+
561
+
562
+ def l2_regularization(model, l2_alpha):
563
+ l2_loss = []
564
+ for module in model.modules():
565
+ if type(module) is nn.Conv2d:
566
+ l2_loss.append((module.weight**2).sum() / 2.0)
567
+ return l2_alpha * sum(l2_loss)
568
+
569
+
570
+ class FCPE(nn.Module):
571
+ def __init__(
572
+ self,
573
+ input_channel=128,
574
+ out_dims=360,
575
+ n_layers=12,
576
+ n_chans=512,
577
+ use_siren=False,
578
+ use_full=False,
579
+ loss_mse_scale=10,
580
+ loss_l2_regularization=False,
581
+ loss_l2_regularization_scale=1,
582
+ loss_grad1_mse=False,
583
+ loss_grad1_mse_scale=1,
584
+ f0_max=1975.5,
585
+ f0_min=32.70,
586
+ confidence=False,
587
+ threshold=0.05,
588
+ use_input_conv=True,
589
+ ):
590
+ super().__init__()
591
+ if use_siren is True:
592
+ raise ValueError("Siren is not supported yet.")
593
+ if use_full is True:
594
+ raise ValueError("Full model is not supported yet.")
595
+
596
+ self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
597
+ self.loss_l2_regularization = (
598
+ loss_l2_regularization if (loss_l2_regularization is not None) else False
599
+ )
600
+ self.loss_l2_regularization_scale = (
601
+ loss_l2_regularization_scale
602
+ if (loss_l2_regularization_scale is not None)
603
+ else 1
604
+ )
605
+ self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
606
+ self.loss_grad1_mse_scale = (
607
+ loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
608
+ )
609
+ self.f0_max = f0_max if (f0_max is not None) else 1975.5
610
+ self.f0_min = f0_min if (f0_min is not None) else 32.70
611
+ self.confidence = confidence if (confidence is not None) else False
612
+ self.threshold = threshold if (threshold is not None) else 0.05
613
+ self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
614
+
615
+ self.cent_table_b = torch.Tensor(
616
+ np.linspace(
617
+ self.f0_to_cent(torch.Tensor([f0_min]))[0],
618
+ self.f0_to_cent(torch.Tensor([f0_max]))[0],
619
+ out_dims,
620
+ )
621
+ )
622
+ self.register_buffer("cent_table", self.cent_table_b)
623
+
624
+ # conv in stack
625
+ _leaky = nn.LeakyReLU()
626
+ self.stack = nn.Sequential(
627
+ nn.Conv1d(input_channel, n_chans, 3, 1, 1),
628
+ nn.GroupNorm(4, n_chans),
629
+ _leaky,
630
+ nn.Conv1d(n_chans, n_chans, 3, 1, 1),
631
+ )
632
+
633
+ # transformer
634
+ self.decoder = PCmer(
635
+ num_layers=n_layers,
636
+ num_heads=8,
637
+ dim_model=n_chans,
638
+ dim_keys=n_chans,
639
+ dim_values=n_chans,
640
+ residual_dropout=0.1,
641
+ attention_dropout=0.1,
642
+ )
643
+ self.norm = nn.LayerNorm(n_chans)
644
+
645
+ # out
646
+ self.n_out = out_dims
647
+ self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
648
+
649
+ def forward(
650
+ self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
651
+ ):
652
+ if cdecoder == "argmax":
653
+ self.cdecoder = self.cents_decoder
654
+ elif cdecoder == "local_argmax":
655
+ self.cdecoder = self.cents_local_decoder
656
+
657
+ x = (
658
+ self.stack(mel.transpose(1, 2)).transpose(1, 2)
659
+ if self.use_input_conv
660
+ else mel
661
+ )
662
+ x = self.decoder(x)
663
+ x = self.norm(x)
664
+ x = self.dense_out(x)
665
+ x = torch.sigmoid(x)
666
+
667
+ if not infer:
668
+ gt_cent_f0 = self.f0_to_cent(gt_f0)
669
+ gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
670
+ loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
671
+ if self.loss_l2_regularization:
672
+ loss_all = loss_all + l2_regularization(
673
+ model=self, l2_alpha=self.loss_l2_regularization_scale
674
+ )
675
+ x = loss_all
676
+ if infer:
677
+ x = self.cdecoder(x)
678
+ x = self.cent_to_f0(x)
679
+ x = (1 + x / 700).log() if not return_hz_f0 else x
680
+
681
+ return x
682
+
683
+ def cents_decoder(self, y, mask=True):
684
+ B, N, _ = y.size()
685
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
686
+ rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
687
+ y, dim=-1, keepdim=True
688
+ )
689
+ if mask:
690
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
691
+ confident_mask = torch.ones_like(confident)
692
+ confident_mask[confident <= self.threshold] = float("-INF")
693
+ rtn = rtn * confident_mask
694
+ return (rtn, confident) if self.confidence else rtn
695
+
696
+ def cents_local_decoder(self, y, mask=True):
697
+ B, N, _ = y.size()
698
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
699
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
700
+ local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
701
+ local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
702
+ ci_l = torch.gather(ci, -1, local_argmax_index)
703
+ y_l = torch.gather(y, -1, local_argmax_index)
704
+ rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
705
+ y_l, dim=-1, keepdim=True
706
+ )
707
+ if mask:
708
+ confident_mask = torch.ones_like(confident)
709
+ confident_mask[confident <= self.threshold] = float("-INF")
710
+ rtn = rtn * confident_mask
711
+ return (rtn, confident) if self.confidence else rtn
712
+
713
+ def cent_to_f0(self, cent):
714
+ return 10.0 * 2 ** (cent / 1200.0)
715
+
716
+ def f0_to_cent(self, f0):
717
+ return 1200.0 * torch.log2(f0 / 10.0)
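+ # Worked example (editorial note, not part of the upload): f0_to_cent(torch.tensor([440.0]))
+ # gives 1200 * log2(44) ≈ 6551.3 cents, and cent_to_f0(6551.3) ≈ 440 Hz again, since both
+ # map through the same 10 Hz reference pitch.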
718
+
719
+ def gaussian_blurred_cent(self, cents):
720
+ mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
721
+ B, N, _ = cents.size()
722
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
723
+ return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
724
+
725
+
726
+ class FCPEInfer:
727
+ def __init__(self, model_path, device=None, dtype=torch.float32):
728
+ if device is None:
729
+ device = "cuda" if torch.cuda.is_available() else "cpu"
730
+ self.device = device
731
+ ckpt = torch.load(model_path, map_location=torch.device(self.device))
732
+ self.args = DotDict(ckpt["config"])
733
+ self.dtype = dtype
734
+ model = FCPE(
735
+ input_channel=self.args.model.input_channel,
736
+ out_dims=self.args.model.out_dims,
737
+ n_layers=self.args.model.n_layers,
738
+ n_chans=self.args.model.n_chans,
739
+ use_siren=self.args.model.use_siren,
740
+ use_full=self.args.model.use_full,
741
+ loss_mse_scale=self.args.loss.loss_mse_scale,
742
+ loss_l2_regularization=self.args.loss.loss_l2_regularization,
743
+ loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
744
+ loss_grad1_mse=self.args.loss.loss_grad1_mse,
745
+ loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
746
+ f0_max=self.args.model.f0_max,
747
+ f0_min=self.args.model.f0_min,
748
+ confidence=self.args.model.confidence,
749
+ )
750
+ model.to(self.device).to(self.dtype)
751
+ model.load_state_dict(ckpt["model"])
752
+ model.eval()
753
+ self.model = model
754
+ self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
755
+
756
+ @torch.no_grad()
757
+ def __call__(self, audio, sr, threshold=0.05):
758
+ self.model.threshold = threshold
759
+ audio = audio[None, :]
760
+ mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
761
+ f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
762
+ return f0
763
+
764
+
765
+ class Wav2Mel:
766
+ def __init__(self, args, device=None, dtype=torch.float32):
767
+ self.sample_rate = args.mel.sampling_rate
768
+ self.hop_size = args.mel.hop_size
769
+ if device is None:
770
+ device = "cuda" if torch.cuda.is_available() else "cpu"
771
+ self.device = device
772
+ self.dtype = dtype
773
+ self.stft = STFT(
774
+ args.mel.sampling_rate,
775
+ args.mel.num_mels,
776
+ args.mel.n_fft,
777
+ args.mel.win_size,
778
+ args.mel.hop_size,
779
+ args.mel.fmin,
780
+ args.mel.fmax,
781
+ )
782
+ self.resample_kernel = {}
783
+
784
+ def extract_nvstft(self, audio, keyshift=0, train=False):
785
+ mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)
786
+ return mel
787
+
788
+ def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
789
+ audio = audio.to(self.dtype).to(self.device)
790
+ if sample_rate == self.sample_rate:
791
+ audio_res = audio
792
+ else:
793
+ key_str = str(sample_rate)
794
+ if key_str not in self.resample_kernel:
795
+ self.resample_kernel[key_str] = Resample(
796
+ sample_rate, self.sample_rate, lowpass_filter_width=128
797
+ )
798
+ self.resample_kernel[key_str] = (
799
+ self.resample_kernel[key_str].to(self.dtype).to(self.device)
800
+ )
801
+ audio_res = self.resample_kernel[key_str](audio)
802
+
803
+ mel = self.extract_nvstft(
804
+ audio_res, keyshift=keyshift, train=train
805
+ ) # B, n_frames, bins
806
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
807
+ mel = (
808
+ torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel
809
+ )
810
+ mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel
811
+ return mel
812
+
813
+ def __call__(self, audio, sample_rate, keyshift=0, train=False):
814
+ return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
815
+
816
+
817
+ class DotDict(dict):
818
+ def __getattr__(*args):
819
+ val = dict.get(*args)
820
+ return DotDict(val) if type(val) is dict else val
821
+
822
+ __setattr__ = dict.__setitem__
823
+ __delattr__ = dict.__delitem__
824
+
825
+
826
+ class F0Predictor(object):
827
+ def compute_f0(self, wav, p_len):
828
+ pass
829
+
830
+ def compute_f0_uv(self, wav, p_len):
831
+ pass
832
+
833
+
834
+ class FCPEF0Predictor(F0Predictor):
835
+ def __init__(
836
+ self,
837
+ model_path,
838
+ hop_length=512,
839
+ f0_min=50,
840
+ f0_max=1100,
841
+ dtype=torch.float32,
842
+ device=None,
843
+ sample_rate=44100,
844
+ threshold=0.05,
845
+ ):
846
+ self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
847
+ self.hop_length = hop_length
848
+ self.f0_min = f0_min
849
+ self.f0_max = f0_max
850
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
851
+ self.threshold = threshold
852
+ self.sample_rate = sample_rate
853
+ self.dtype = dtype
854
+ self.name = "fcpe"
855
+
856
+ def repeat_expand(
857
+ self,
858
+ content: Union[torch.Tensor, np.ndarray],
859
+ target_len: int,
860
+ mode: str = "nearest",
861
+ ):
862
+ ndim = content.ndim
863
+ content = (
864
+ content[None, None]
865
+ if ndim == 1
866
+ else content[None] if ndim == 2 else content
867
+ )
868
+ assert content.ndim == 3
869
+ is_np = isinstance(content, np.ndarray)
870
+ content = torch.from_numpy(content) if is_np else content
871
+ results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
872
+ results = results.numpy() if is_np else results
873
+ return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results
874
+
875
+ def post_process(self, x, sample_rate, f0, pad_to):
876
+ f0 = (
877
+ torch.from_numpy(f0).float().to(x.device)
878
+ if isinstance(f0, np.ndarray)
879
+ else f0
880
+ )
881
+ f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0
882
+
883
+ vuv_vector = torch.zeros_like(f0)
884
+ vuv_vector[f0 > 0.0] = 1.0
885
+ vuv_vector[f0 <= 0.0] = 0.0
886
+
887
+ nzindex = torch.nonzero(f0).squeeze()
888
+ f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
889
+ time_org = self.hop_length / sample_rate * nzindex.cpu().numpy()
890
+ time_frame = np.arange(pad_to) * self.hop_length / sample_rate
891
+
892
+ vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
893
+
894
+ if f0.shape[0] <= 0:
895
+ return np.zeros(pad_to), vuv_vector.cpu().numpy()
896
+ if f0.shape[0] == 1:
897
+ return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy()
898
+
899
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
900
+ return f0, vuv_vector.cpu().numpy()
901
+
902
+ def compute_f0(self, wav, p_len=None):
903
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
904
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
905
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
906
+ if torch.all(f0 == 0):
907
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
908
+ f0.cpu().numpy() if p_len is None else np.zeros(p_len)
909
+ )
910
+ return self.post_process(x, self.sample_rate, f0, p_len)[0]
911
+
912
+ def compute_f0_uv(self, wav, p_len=None):
913
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
914
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
915
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
916
+ if torch.all(f0 == 0):
917
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
918
+ f0.cpu().numpy() if p_len is None else np.zeros(p_len)
919
+ )
920
+ return self.post_process(x, self.sample_rate, f0, p_len)
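A minimal sketch of how FCPEF0Predictor might be called (editorial addition; the checkpoint path and audio file are assumptions, and the checkpoint must carry the "config"/"model" keys that FCPEInfer expects):

import librosa
from FCPE import FCPEF0Predictor  # assumes the file above is importable under this name

predictor = FCPEF0Predictor("fcpe.pt", sample_rate=44100, device="cpu")  # hypothetical checkpoint
wav, _ = librosa.load("example.wav", sr=44100)  # hypothetical input file
f0, vuv = predictor.compute_f0_uv(wav)  # per-frame F0 in Hz plus a voiced/unvoiced vector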
RMVPE.py ADDED
@@ -0,0 +1,560 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+
6
+ from librosa.filters import mel
7
+ from typing import List
8
+
9
+ # Constants for readability
10
+ N_MELS = 128
11
+ N_CLASS = 360
12
+
13
+
14
+ # Define a helper function for creating convolutional blocks
15
+ class ConvBlockRes(nn.Module):
16
+ """
17
+ A convolutional block with residual connection.
18
+
19
+ Args:
20
+ in_channels (int): Number of input channels.
21
+ out_channels (int): Number of output channels.
22
+ momentum (float): Momentum for batch normalization.
23
+ """
24
+
25
+ def __init__(self, in_channels, out_channels, momentum=0.01):
26
+ super(ConvBlockRes, self).__init__()
27
+ self.conv = nn.Sequential(
28
+ nn.Conv2d(
29
+ in_channels=in_channels,
30
+ out_channels=out_channels,
31
+ kernel_size=(3, 3),
32
+ stride=(1, 1),
33
+ padding=(1, 1),
34
+ bias=False,
35
+ ),
36
+ nn.BatchNorm2d(out_channels, momentum=momentum),
37
+ nn.ReLU(),
38
+ nn.Conv2d(
39
+ in_channels=out_channels,
40
+ out_channels=out_channels,
41
+ kernel_size=(3, 3),
42
+ stride=(1, 1),
43
+ padding=(1, 1),
44
+ bias=False,
45
+ ),
46
+ nn.BatchNorm2d(out_channels, momentum=momentum),
47
+ nn.ReLU(),
48
+ )
49
+ if in_channels != out_channels:
50
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
51
+ self.is_shortcut = True
52
+ else:
53
+ self.is_shortcut = False
54
+
55
+ def forward(self, x):
56
+ if self.is_shortcut:
57
+ return self.conv(x) + self.shortcut(x)
58
+ else:
59
+ return self.conv(x) + x
60
+
61
+
62
+ # Define a class for residual encoder blocks
63
+ class ResEncoderBlock(nn.Module):
64
+ """
65
+ A residual encoder block.
66
+
67
+ Args:
68
+ in_channels (int): Number of input channels.
69
+ out_channels (int): Number of output channels.
70
+ kernel_size (tuple): Size of the average pooling kernel.
71
+ n_blocks (int): Number of convolutional blocks in the block.
72
+ momentum (float): Momentum for batch normalization.
73
+ """
74
+
75
+ def __init__(
76
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
77
+ ):
78
+ super(ResEncoderBlock, self).__init__()
79
+ self.n_blocks = n_blocks
80
+ self.conv = nn.ModuleList()
81
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
82
+ for _ in range(n_blocks - 1):
83
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
84
+ self.kernel_size = kernel_size
85
+ if self.kernel_size is not None:
86
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
87
+
88
+ def forward(self, x):
89
+ for i in range(self.n_blocks):
90
+ x = self.conv[i](x)
91
+ if self.kernel_size is not None:
92
+ return x, self.pool(x)
93
+ else:
94
+ return x
95
+
96
+
97
+ # Define a class for the encoder
98
+ class Encoder(nn.Module):
99
+ """
100
+ The encoder part of the DeepUnet.
101
+
102
+ Args:
103
+ in_channels (int): Number of input channels.
104
+ in_size (int): Size of the input tensor.
105
+ n_encoders (int): Number of encoder blocks.
106
+ kernel_size (tuple): Size of the average pooling kernel.
107
+ n_blocks (int): Number of convolutional blocks in each encoder block.
108
+ out_channels (int): Number of output channels for the first encoder block.
109
+ momentum (float): Momentum for batch normalization.
110
+ """
111
+
112
+ def __init__(
113
+ self,
114
+ in_channels,
115
+ in_size,
116
+ n_encoders,
117
+ kernel_size,
118
+ n_blocks,
119
+ out_channels=16,
120
+ momentum=0.01,
121
+ ):
122
+ super(Encoder, self).__init__()
123
+ self.n_encoders = n_encoders
124
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
125
+ self.layers = nn.ModuleList()
126
+ self.latent_channels = []
127
+ for i in range(self.n_encoders):
128
+ self.layers.append(
129
+ ResEncoderBlock(
130
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
131
+ )
132
+ )
133
+ self.latent_channels.append([out_channels, in_size])
134
+ in_channels = out_channels
135
+ out_channels *= 2
136
+ in_size //= 2
137
+ self.out_size = in_size
138
+ self.out_channel = out_channels
139
+
140
+ def forward(self, x: torch.Tensor):
141
+ concat_tensors: List[torch.Tensor] = []
142
+ x = self.bn(x)
143
+ for i in range(self.n_encoders):
144
+ t, x = self.layers[i](x)
145
+ concat_tensors.append(t)
146
+ return x, concat_tensors
147
+
148
+
149
+ # Define a class for the intermediate layer
150
+ class Intermediate(nn.Module):
151
+ """
152
+ The intermediate layer of the DeepUnet.
153
+
154
+ Args:
155
+ in_channels (int): Number of input channels.
156
+ out_channels (int): Number of output channels.
157
+ n_inters (int): Number of convolutional blocks in the intermediate layer.
158
+ n_blocks (int): Number of convolutional blocks in each intermediate block.
159
+ momentum (float): Momentum for batch normalization.
160
+ """
161
+
162
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
163
+ super(Intermediate, self).__init__()
164
+ self.n_inters = n_inters
165
+ self.layers = nn.ModuleList()
166
+ self.layers.append(
167
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
168
+ )
169
+ for _ in range(self.n_inters - 1):
170
+ self.layers.append(
171
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
172
+ )
173
+
174
+ def forward(self, x):
175
+ for i in range(self.n_inters):
176
+ x = self.layers[i](x)
177
+ return x
178
+
179
+
180
+ # Define a class for residual decoder blocks
181
+ class ResDecoderBlock(nn.Module):
182
+ """
183
+ A residual decoder block.
184
+
185
+ Args:
186
+ in_channels (int): Number of input channels.
187
+ out_channels (int): Number of output channels.
188
+ stride (tuple): Stride for transposed convolution.
189
+ n_blocks (int): Number of convolutional blocks in the block.
190
+ momentum (float): Momentum for batch normalization.
191
+ """
192
+
193
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
194
+ super(ResDecoderBlock, self).__init__()
195
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
196
+ self.n_blocks = n_blocks
197
+ self.conv1 = nn.Sequential(
198
+ nn.ConvTranspose2d(
199
+ in_channels=in_channels,
200
+ out_channels=out_channels,
201
+ kernel_size=(3, 3),
202
+ stride=stride,
203
+ padding=(1, 1),
204
+ output_padding=out_padding,
205
+ bias=False,
206
+ ),
207
+ nn.BatchNorm2d(out_channels, momentum=momentum),
208
+ nn.ReLU(),
209
+ )
210
+ self.conv2 = nn.ModuleList()
211
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
212
+ for _ in range(n_blocks - 1):
213
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
214
+
215
+ def forward(self, x, concat_tensor):
216
+ x = self.conv1(x)
217
+ x = torch.cat((x, concat_tensor), dim=1)
218
+ for i in range(self.n_blocks):
219
+ x = self.conv2[i](x)
220
+ return x
221
+
222
+
223
+ # Define a class for the decoder
224
+ class Decoder(nn.Module):
225
+ """
226
+ The decoder part of the DeepUnet.
227
+
228
+ Args:
229
+ in_channels (int): Number of input channels.
230
+ n_decoders (int): Number of decoder blocks.
231
+ stride (tuple): Stride for transposed convolution.
232
+ n_blocks (int): Number of convolutional blocks in each decoder block.
233
+ momentum (float): Momentum for batch normalization.
234
+ """
235
+
236
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
237
+ super(Decoder, self).__init__()
238
+ self.layers = nn.ModuleList()
239
+ self.n_decoders = n_decoders
240
+ for _ in range(self.n_decoders):
241
+ out_channels = in_channels // 2
242
+ self.layers.append(
243
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
244
+ )
245
+ in_channels = out_channels
246
+
247
+ def forward(self, x, concat_tensors):
248
+ for i in range(self.n_decoders):
249
+ x = self.layers[i](x, concat_tensors[-1 - i])
250
+ return x
251
+
252
+
253
+ # Define a class for the DeepUnet architecture
254
+ class DeepUnet(nn.Module):
255
+ """
256
+ The DeepUnet architecture.
257
+
258
+ Args:
259
+ kernel_size (tuple): Size of the average pooling kernel.
260
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
261
+ en_de_layers (int): Number of encoder/decoder layers.
262
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
263
+ in_channels (int): Number of input channels.
264
+ en_out_channels (int): Number of output channels for the first encoder block.
265
+ """
266
+
267
+ def __init__(
268
+ self,
269
+ kernel_size,
270
+ n_blocks,
271
+ en_de_layers=5,
272
+ inter_layers=4,
273
+ in_channels=1,
274
+ en_out_channels=16,
275
+ ):
276
+ super(DeepUnet, self).__init__()
277
+ self.encoder = Encoder(
278
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
279
+ )
280
+ self.intermediate = Intermediate(
281
+ self.encoder.out_channel // 2,
282
+ self.encoder.out_channel,
283
+ inter_layers,
284
+ n_blocks,
285
+ )
286
+ self.decoder = Decoder(
287
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
288
+ )
289
+
290
+ def forward(self, x):
291
+ x, concat_tensors = self.encoder(x)
292
+ x = self.intermediate(x)
293
+ x = self.decoder(x, concat_tensors)
294
+ return x
295
+
296
+
297
+ # Define a class for the end-to-end model
298
+ class E2E(nn.Module):
299
+ """
300
+ The end-to-end model.
301
+
302
+ Args:
303
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
304
+ n_gru (int): Number of GRU layers.
305
+ kernel_size (tuple): Size of the average pooling kernel.
306
+ en_de_layers (int): Number of encoder/decoder layers.
307
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
308
+ in_channels (int): Number of input channels.
309
+ en_out_channels (int): Number of output channels for the first encoder block.
310
+ """
311
+
312
+ def __init__(
313
+ self,
314
+ n_blocks,
315
+ n_gru,
316
+ kernel_size,
317
+ en_de_layers=5,
318
+ inter_layers=4,
319
+ in_channels=1,
320
+ en_out_channels=16,
321
+ ):
322
+ super(E2E, self).__init__()
323
+ self.unet = DeepUnet(
324
+ kernel_size,
325
+ n_blocks,
326
+ en_de_layers,
327
+ inter_layers,
328
+ in_channels,
329
+ en_out_channels,
330
+ )
331
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
332
+ if n_gru:
333
+ self.fc = nn.Sequential(
334
+ BiGRU(3 * 128, 256, n_gru),
335
+ nn.Linear(512, N_CLASS),
336
+ nn.Dropout(0.25),
337
+ nn.Sigmoid(),
338
+ )
339
+ else:
340
+ self.fc = nn.Sequential(
341
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
342
+ )
343
+
344
+ def forward(self, mel):
345
+ mel = mel.transpose(-1, -2).unsqueeze(1)
346
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
347
+ x = self.fc(x)
348
+ return x
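+ # Editorial note: `mel` enters as (batch, 128 mel bins, n_frames) and the output is a
+ # (batch, n_frames, 360) salience map over 20-cent pitch bins, matching N_CLASS above.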
349
+
350
+
351
+ # Define a class for the MelSpectrogram extractor
352
+ class MelSpectrogram(torch.nn.Module):
353
+ """
354
+ Extracts Mel-spectrogram features from audio.
355
+
356
+ Args:
357
+ is_half (bool): Whether to use half-precision floating-point numbers.
358
+ n_mel_channels (int): Number of Mel-frequency bands.
359
+ sample_rate (int): Sampling rate of the audio.
360
+ win_length (int): Length of the window function in samples.
361
+ hop_length (int): Hop size between frames in samples.
362
+ n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length.
363
+ mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0.
364
+ mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None.
365
+ clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5.
366
+ """
367
+
368
+ def __init__(
369
+ self,
370
+ is_half,
371
+ n_mel_channels,
372
+ sample_rate,
373
+ win_length,
374
+ hop_length,
375
+ n_fft=None,
376
+ mel_fmin=0,
377
+ mel_fmax=None,
378
+ clamp=1e-5,
379
+ ):
380
+ super().__init__()
381
+ n_fft = win_length if n_fft is None else n_fft
382
+ self.hann_window = {}
383
+ mel_basis = mel(
384
+ sr=sample_rate,
385
+ n_fft=n_fft,
386
+ n_mels=n_mel_channels,
387
+ fmin=mel_fmin,
388
+ fmax=mel_fmax,
389
+ htk=True,
390
+ )
391
+ mel_basis = torch.from_numpy(mel_basis).float()
392
+ self.register_buffer("mel_basis", mel_basis)
393
+ self.n_fft = win_length if n_fft is None else n_fft
394
+ self.hop_length = hop_length
395
+ self.win_length = win_length
396
+ self.sample_rate = sample_rate
397
+ self.n_mel_channels = n_mel_channels
398
+ self.clamp = clamp
399
+ self.is_half = is_half
400
+
401
+ def forward(self, audio, keyshift=0, speed=1, center=True):
402
+ factor = 2 ** (keyshift / 12)
403
+ n_fft_new = int(np.round(self.n_fft * factor))
404
+ win_length_new = int(np.round(self.win_length * factor))
405
+ hop_length_new = int(np.round(self.hop_length * speed))
406
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
407
+ if keyshift_key not in self.hann_window:
408
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
409
+ audio.device
410
+ )
411
+ fft = torch.stft(
412
+ audio,
413
+ n_fft=n_fft_new,
414
+ hop_length=hop_length_new,
415
+ win_length=win_length_new,
416
+ window=self.hann_window[keyshift_key],
417
+ center=center,
418
+ return_complex=True,
419
+ )
420
+
421
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
422
+ if keyshift != 0:
423
+ size = self.n_fft // 2 + 1
424
+ resize = magnitude.size(1)
425
+ if resize < size:
426
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
427
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
428
+ mel_output = torch.matmul(self.mel_basis, magnitude)
429
+ if self.is_half:
430
+ mel_output = mel_output.half()
431
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
432
+ return log_mel_spec
433
+
434
+
435
+ # Define a class for the RMVPE0 predictor
436
+ class RMVPE0Predictor:
437
+ """
438
+ A predictor for fundamental frequency (F0) based on the RMVPE0 model.
439
+
440
+ Args:
441
+ model_path (str): Path to the RMVPE0 model file.
442
+ is_half (bool): Whether to use half-precision floating-point numbers.
443
+ device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
444
+ """
445
+
446
+ def __init__(self, model_path, is_half, device=None):
447
+ self.resample_kernel = {}
448
+ model = E2E(4, 1, (2, 2))
449
+ ckpt = torch.load(model_path, map_location="cpu")
450
+ model.load_state_dict(ckpt)
451
+ model.eval()
452
+ if is_half:
453
+ model = model.half()
454
+ self.model = model
455
+ self.resample_kernel = {}
456
+ self.is_half = is_half
457
+ self.device = device
458
+ self.mel_extractor = MelSpectrogram(
459
+ is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
460
+ ).to(device)
461
+ self.model = self.model.to(device)
462
+ cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
463
+ self.cents_mapping = np.pad(cents_mapping, (4, 4))
464
+
465
+ def mel2hidden(self, mel):
466
+ """
467
+ Converts Mel-spectrogram features to hidden representation.
468
+
469
+ Args:
470
+ mel (torch.Tensor): Mel-spectrogram features.
471
+ """
472
+ with torch.no_grad():
473
+ n_frames = mel.shape[-1]
474
+ mel = F.pad(
475
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
476
+ )
477
+ hidden = self.model(mel)
478
+ return hidden[:, :n_frames]
479
+
480
+ def decode(self, hidden, thred=0.03):
481
+ """
482
+ Decodes hidden representation to F0.
483
+
484
+ Args:
485
+ hidden (np.ndarray): Hidden representation.
486
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
487
+ """
488
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
489
+ f0 = 10 * (2 ** (cents_pred / 1200))
490
+ f0[f0 == 10] = 0
491
+ return f0
492
+
493
+ def infer_from_audio(self, audio, thred=0.03):
494
+ """
495
+ Infers F0 from audio.
496
+
497
+ Args:
498
+ audio (np.ndarray): Audio signal.
499
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
500
+ """
501
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
502
+ mel = self.mel_extractor(audio, center=True)
503
+ hidden = self.mel2hidden(mel)
504
+ hidden = hidden.squeeze(0).cpu().numpy()
505
+ if self.is_half == True:
506
+ hidden = hidden.astype("float32")
507
+ f0 = self.decode(hidden, thred=thred)
508
+ return f0
509
+
510
+ def to_local_average_cents(self, salience, thred=0.05):
511
+ """
512
+ Converts salience to local average cents.
513
+
514
+ Args:
515
+ salience (np.ndarray): Salience values.
516
+ thred (float, optional): Threshold for salience. Defaults to 0.05.
517
+ """
518
+ center = np.argmax(salience, axis=1)
519
+ salience = np.pad(salience, ((0, 0), (4, 4)))
520
+ center += 4
521
+ todo_salience = []
522
+ todo_cents_mapping = []
523
+ starts = center - 4
524
+ ends = center + 5
525
+ for idx in range(salience.shape[0]):
526
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
527
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
528
+ todo_salience = np.array(todo_salience)
529
+ todo_cents_mapping = np.array(todo_cents_mapping)
530
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
531
+ weight_sum = np.sum(todo_salience, 1)
532
+ devided = product_sum / weight_sum
533
+ maxx = np.max(salience, axis=1)
534
+ devided[maxx <= thred] = 0
535
+ return devided
536
+
537
+
538
+ # Define a class for BiGRU (bidirectional GRU)
539
+ class BiGRU(nn.Module):
540
+ """
541
+ A bidirectional GRU layer.
542
+
543
+ Args:
544
+ input_features (int): Number of input features.
545
+ hidden_features (int): Number of hidden features.
546
+ num_layers (int): Number of GRU layers.
547
+ """
548
+
549
+ def __init__(self, input_features, hidden_features, num_layers):
550
+ super(BiGRU, self).__init__()
551
+ self.gru = nn.GRU(
552
+ input_features,
553
+ hidden_features,
554
+ num_layers=num_layers,
555
+ batch_first=True,
556
+ bidirectional=True,
557
+ )
558
+
559
+ def forward(self, x):
560
+ return self.gru(x)[0]
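A hedged usage sketch for RMVPE0Predictor (editorial addition; model path and audio file are assumptions, and the mel extractor above is configured for 16 kHz input):

import librosa
from RMVPE import RMVPE0Predictor  # assumes the file above is importable under this name

predictor = RMVPE0Predictor("rmvpe.pt", is_half=False, device="cpu")  # hypothetical checkpoint
audio, _ = librosa.load("example.wav", sr=16000)  # hypothetical input file
f0 = predictor.infer_from_audio(audio, thred=0.03)  # per-frame F0 in Hz; 0 marks unvoiced frames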
__init__.py ADDED
@@ -0,0 +1 @@
+
attentions.py ADDED
@@ -0,0 +1,243 @@
1
+ import math
2
+ import torch
3
+ from rvc_cli.rvc.lib.algorithm.commons import convert_pad_shape
4
+
5
+
6
+ class MultiHeadAttention(torch.nn.Module):
7
+ """
8
+ Multi-head attention module with optional relative positional encoding and proximal bias.
9
+
10
+ Args:
11
+ channels (int): Number of input channels.
12
+ out_channels (int): Number of output channels.
13
+ n_heads (int): Number of attention heads.
14
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
15
+ window_size (int, optional): Window size for relative positional encoding. Defaults to None.
16
+ heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
17
+ block_length (int, optional): Block length for local attention. Defaults to None.
18
+ proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
19
+ proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ channels,
25
+ out_channels,
26
+ n_heads,
27
+ p_dropout=0.0,
28
+ window_size=None,
29
+ heads_share=True,
30
+ block_length=None,
31
+ proximal_bias=False,
32
+ proximal_init=False,
33
+ ):
34
+ super().__init__()
35
+ assert (
36
+ channels % n_heads == 0
37
+ ), "Channels must be divisible by the number of heads."
38
+
39
+ self.channels = channels
40
+ self.out_channels = out_channels
41
+ self.n_heads = n_heads
42
+ self.k_channels = channels // n_heads
43
+ self.window_size = window_size
44
+ self.block_length = block_length
45
+ self.proximal_bias = proximal_bias
46
+
47
+ # Define projections
48
+ self.conv_q = torch.nn.Conv1d(channels, channels, 1)
49
+ self.conv_k = torch.nn.Conv1d(channels, channels, 1)
50
+ self.conv_v = torch.nn.Conv1d(channels, channels, 1)
51
+ self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
52
+
53
+ self.drop = torch.nn.Dropout(p_dropout)
54
+
55
+ # Relative positional encodings
56
+ if window_size:
57
+ n_heads_rel = 1 if heads_share else n_heads
58
+ rel_stddev = self.k_channels**-0.5
59
+ self.emb_rel_k = torch.nn.Parameter(
60
+ torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
61
+ * rel_stddev
62
+ )
63
+ self.emb_rel_v = torch.nn.Parameter(
64
+ torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
65
+ * rel_stddev
66
+ )
67
+
68
+ # Initialize weights
69
+ torch.nn.init.xavier_uniform_(self.conv_q.weight)
70
+ torch.nn.init.xavier_uniform_(self.conv_k.weight)
71
+ torch.nn.init.xavier_uniform_(self.conv_v.weight)
72
+ torch.nn.init.xavier_uniform_(self.conv_o.weight)
73
+
74
+ if proximal_init:
75
+ with torch.no_grad():
76
+ self.conv_k.weight.copy_(self.conv_q.weight)
77
+ self.conv_k.bias.copy_(self.conv_q.bias)
78
+
79
+ def forward(self, x, c, attn_mask=None):
80
+ # Compute query, key, value projections
81
+ q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
82
+
83
+ # Compute attention
84
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
85
+
86
+ # Final output projection
87
+ return self.conv_o(x)
88
+
89
+ def attention(self, query, key, value, mask=None):
90
+ # Reshape and compute scaled dot-product attention
91
+ b, d, t_s, t_t = (*key.size(), query.size(2))
92
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
93
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
94
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
95
+
96
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
97
+
98
+ if self.window_size:
99
+ assert t_s == t_t, "Relative attention only supports self-attention."
100
+ scores += self._compute_relative_scores(query, t_s)
101
+
102
+ if self.proximal_bias:
103
+ assert t_s == t_t, "Proximal bias only supports self-attention."
104
+ scores += self._attention_bias_proximal(t_s).to(scores.device, scores.dtype)
105
+
106
+ if mask is not None:
107
+ scores = scores.masked_fill(mask == 0, -1e4)
108
+ if self.block_length:
109
+ block_mask = (
110
+ torch.ones_like(scores)
111
+ .triu(-self.block_length)
112
+ .tril(self.block_length)
113
+ )
114
+ scores = scores.masked_fill(block_mask == 0, -1e4)
115
+
116
+ # Apply softmax and dropout
117
+ p_attn = self.drop(torch.nn.functional.softmax(scores, dim=-1))
118
+
119
+ # Compute attention output
120
+ output = torch.matmul(p_attn, value)
121
+
122
+ if self.window_size:
123
+ output += self._apply_relative_values(p_attn, t_s)
124
+
125
+ return output.transpose(2, 3).contiguous().view(b, d, t_t), p_attn
126
+
127
+ def _compute_relative_scores(self, query, length):
128
+ rel_emb = self._get_relative_embeddings(self.emb_rel_k, length)
129
+ rel_logits = self._matmul_with_relative_keys(
130
+ query / math.sqrt(self.k_channels), rel_emb
131
+ )
132
+ return self._relative_position_to_absolute_position(rel_logits)
133
+
134
+ def _apply_relative_values(self, p_attn, length):
135
+ rel_weights = self._absolute_position_to_relative_position(p_attn)
136
+ rel_emb = self._get_relative_embeddings(self.emb_rel_v, length)
137
+ return self._matmul_with_relative_values(rel_weights, rel_emb)
138
+
139
+ # Helper methods
140
+ def _matmul_with_relative_values(self, x, y):
141
+ return torch.matmul(x, y.unsqueeze(0))
142
+
143
+ def _matmul_with_relative_keys(self, x, y):
144
+ return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
145
+
146
+ def _get_relative_embeddings(self, embeddings, length):
147
+ pad_length = max(length - (self.window_size + 1), 0)
148
+ start = max((self.window_size + 1) - length, 0)
149
+ end = start + 2 * length - 1
150
+
151
+ if pad_length > 0:
152
+ embeddings = torch.nn.functional.pad(
153
+ embeddings,
154
+ convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
155
+ )
156
+ return embeddings[:, start:end]
157
+
158
+ def _relative_position_to_absolute_position(self, x):
159
+ batch, heads, length, _ = x.size()
160
+ x = torch.nn.functional.pad(
161
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
162
+ )
163
+ x_flat = x.view(batch, heads, length * 2 * length)
164
+ x_flat = torch.nn.functional.pad(
165
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
166
+ )
167
+ return x_flat.view(batch, heads, length + 1, 2 * length - 1)[
168
+ :, :, :length, length - 1 :
169
+ ]
170
+
171
+ def _absolute_position_to_relative_position(self, x):
172
+ batch, heads, length, _ = x.size()
173
+ x = torch.nn.functional.pad(
174
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
175
+ )
176
+ x_flat = x.view(batch, heads, length**2 + length * (length - 1))
177
+ x_flat = torch.nn.functional.pad(
178
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
179
+ )
180
+ return x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]
181
+
182
+ def _attention_bias_proximal(self, length):
183
+ r = torch.arange(length, dtype=torch.float32)
184
+ diff = r.unsqueeze(0) - r.unsqueeze(1)
185
+ return -torch.log1p(torch.abs(diff)).unsqueeze(0).unsqueeze(0)
186
+
187
+
188
+ class FFN(torch.nn.Module):
189
+ """
190
+ Feed-forward network module.
191
+
192
+ Args:
193
+ in_channels (int): Number of input channels.
194
+ out_channels (int): Number of output channels.
195
+ filter_channels (int): Number of filter channels in the convolution layers.
196
+ kernel_size (int): Kernel size of the convolution layers.
197
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
198
+ activation (str, optional): Activation function to use. Defaults to None.
199
+ causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.
200
+ """
201
+
202
+ def __init__(
203
+ self,
204
+ in_channels,
205
+ out_channels,
206
+ filter_channels,
207
+ kernel_size,
208
+ p_dropout=0.0,
209
+ activation=None,
210
+ causal=False,
211
+ ):
212
+ super().__init__()
213
+ self.padding_fn = self._causal_padding if causal else self._same_padding
214
+
215
+ self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
216
+ self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
217
+ self.drop = torch.nn.Dropout(p_dropout)
218
+
219
+ self.activation = activation
220
+
221
+ def forward(self, x, x_mask):
222
+ x = self.conv_1(self.padding_fn(x * x_mask))
223
+ x = self._apply_activation(x)
224
+ x = self.drop(x)
225
+ x = self.conv_2(self.padding_fn(x * x_mask))
226
+ return x * x_mask
227
+
228
+ def _apply_activation(self, x):
229
+ if self.activation == "gelu":
230
+ return x * torch.sigmoid(1.702 * x)
231
+ return torch.relu(x)
232
+
233
+ def _causal_padding(self, x):
234
+ pad_l, pad_r = self.conv_1.kernel_size[0] - 1, 0
235
+ return torch.nn.functional.pad(
236
+ x, convert_pad_shape([[0, 0], [0, 0], [pad_l, pad_r]])
237
+ )
238
+
239
+ def _same_padding(self, x):
240
+ pad = (self.conv_1.kernel_size[0] - 1) // 2
241
+ return torch.nn.functional.pad(
242
+ x, convert_pad_shape([[0, 0], [0, 0], [pad, pad]])
243
+ )
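As a quick sanity check on the tensor layout these modules expect, the sketch below runs a self-attention pass followed by the feed-forward block; the module path and the channel/time sizes are assumptions for illustration only. Both modules are Conv1d-based, so inputs are (batch, channels, time) rather than the (batch, time, channels) layout used by torch.nn.MultiheadAttention.

import torch
# Assumed module path: attentions.py is taken to sit alongside commons in rvc_cli.rvc.lib.algorithm.
from rvc_cli.rvc.lib.algorithm.attentions import MultiHeadAttention, FFN

attn = MultiHeadAttention(channels=192, out_channels=192, n_heads=2, window_size=10)
ffn = FFN(in_channels=192, out_channels=192, filter_channels=768, kernel_size=3)

x = torch.randn(1, 192, 50)           # (batch, channels, time)
attn_mask = torch.ones(1, 1, 50, 50)  # keep every query/key pair
x_mask = torch.ones(1, 1, 50)         # keep every frame
y = attn(x, x, attn_mask=attn_mask)   # self-attention: x is both the query and the key/value source
z = ffn(y, x_mask)
print(y.shape, z.shape)               # both torch.Size([1, 192, 50])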
commons.py ADDED
@@ -0,0 +1,207 @@
1
+ import math
2
+ import torch
3
+ from typing import List, Optional
4
+
5
+
6
+ def init_weights(m, mean=0.0, std=0.01):
7
+ """
8
+ Initialize the weights of a module.
9
+
10
+ Args:
11
+ m: The module to initialize.
12
+ mean: The mean of the normal distribution.
13
+ std: The standard deviation of the normal distribution.
14
+ """
15
+ classname = m.__class__.__name__
16
+ if classname.find("Conv") != -1:
17
+ m.weight.data.normal_(mean, std)
18
+
19
+
20
+ def get_padding(kernel_size, dilation=1):
21
+ """
22
+ Calculate the padding needed for a convolution.
23
+
24
+ Args:
25
+ kernel_size: The size of the kernel.
26
+ dilation: The dilation of the convolution.
27
+ """
28
+ return int((kernel_size * dilation - dilation) / 2)
29
+
30
+
31
+ def convert_pad_shape(pad_shape):
32
+ """
33
+ Convert the pad shape to a list of integers.
34
+
35
+ Args:
36
+ pad_shape: The pad shape.
37
+ """
38
+ l = pad_shape[::-1]
39
+ pad_shape = [item for sublist in l for item in sublist]
40
+ return pad_shape
41
+
42
+
43
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
44
+ """
45
+ Calculate the KL divergence between two distributions.
46
+
47
+ Args:
48
+ m_p: The mean of the first distribution.
49
+ logs_p: The log of the standard deviation of the first distribution.
50
+ m_q: The mean of the second distribution.
51
+ logs_q: The log of the standard deviation of the second distribution.
52
+ """
53
+ kl = (logs_q - logs_p) - 0.5
54
+ kl += (
55
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
56
+ )
57
+ return kl
58
+
59
+
60
+ def slice_segments(
61
+ x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2
62
+ ):
63
+ """
64
+ Slice segments from a tensor, handling tensors with different numbers of dimensions.
65
+
66
+ Args:
67
+ x (torch.Tensor): The tensor to slice.
68
+ ids_str (torch.Tensor): The starting indices of the segments.
69
+ segment_size (int, optional): The size of each segment. Defaults to 4.
70
+ dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2.
71
+ """
72
+ if dim == 2:
73
+ ret = torch.zeros_like(x[:, :segment_size])
74
+ elif dim == 3:
75
+ ret = torch.zeros_like(x[:, :, :segment_size])
76
+
77
+ for i in range(x.size(0)):
78
+ idx_str = ids_str[i].item()
79
+ idx_end = idx_str + segment_size
80
+ if dim == 2:
81
+ ret[i] = x[i, idx_str:idx_end]
82
+ else:
83
+ ret[i] = x[i, :, idx_str:idx_end]
84
+
85
+ return ret
86
+
87
+
88
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
89
+ """
90
+ Randomly slice segments from a tensor.
91
+
92
+ Args:
93
+ x: The tensor to slice.
94
+ x_lengths: The lengths of the sequences.
95
+ segment_size: The size of each segment.
96
+ """
97
+ b, d, t = x.size()
98
+ if x_lengths is None:
99
+ x_lengths = t
100
+ ids_str_max = x_lengths - segment_size + 1
101
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
102
+ ret = slice_segments(x, ids_str, segment_size, dim=3)
103
+ return ret, ids_str
104
+
105
+
106
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
107
+ """
108
+ Generate a 1D timing signal.
109
+
110
+ Args:
111
+ length: The length of the signal.
112
+ channels: The number of channels of the signal.
113
+ min_timescale: The minimum timescale.
114
+ max_timescale: The maximum timescale.
115
+ """
116
+ position = torch.arange(length, dtype=torch.float)
117
+ num_timescales = channels // 2
118
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
119
+ num_timescales - 1
120
+ )
121
+ inv_timescales = min_timescale * torch.exp(
122
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
123
+ )
124
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
125
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
126
+ signal = torch.nn.functional.pad(signal, [0, 0, 0, channels % 2])
127
+ signal = signal.view(1, channels, length)
128
+ return signal
129
+
130
+
131
+ def subsequent_mask(length):
132
+ """
133
+ Generate a subsequent mask.
134
+
135
+ Args:
136
+ length: The length of the sequence.
137
+ """
138
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
139
+ return mask
140
+
141
+
142
+ @torch.jit.script
143
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
144
+ """
145
+ Fused add tanh sigmoid multiply operation.
146
+
147
+ Args:
148
+ input_a: The first input tensor.
149
+ input_b: The second input tensor.
150
+ n_channels: The number of channels.
151
+ """
152
+ n_channels_int = n_channels[0]
153
+ in_act = input_a + input_b
154
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
155
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
156
+ acts = t_act * s_act
157
+ return acts
158
+
159
+
160
+ def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
161
+ """
162
+ Convert the pad shape to a list of integers.
163
+
164
+ Args:
165
+ pad_shape: The pad shape.
166
+ """
167
+ return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
168
+
169
+
170
+ def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
171
+ """
172
+ Generate a sequence mask.
173
+
174
+ Args:
175
+ length: The lengths of the sequences.
176
+ max_length: The maximum length of the sequences.
177
+ """
178
+ if max_length is None:
179
+ max_length = length.max()
180
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
181
+ return x.unsqueeze(0) < length.unsqueeze(1)
182
+
183
+
184
+ def clip_grad_value(parameters, clip_value, norm_type=2):
185
+ """
186
+ Clip the gradients of a list of parameters.
187
+
188
+ Args:
189
+ parameters: The list of parameters to clip.
190
+ clip_value: The maximum value of the gradients.
191
+ norm_type: The type of norm to use for clipping.
192
+ """
193
+ if isinstance(parameters, torch.Tensor):
194
+ parameters = [parameters]
195
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
196
+ norm_type = float(norm_type)
197
+ if clip_value is not None:
198
+ clip_value = float(clip_value)
199
+
200
+ total_norm = 0
201
+ for p in parameters:
202
+ param_norm = p.grad.data.norm(norm_type)
203
+ total_norm += param_norm.item() ** norm_type
204
+ if clip_value is not None:
205
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
206
+ total_norm = total_norm ** (1.0 / norm_type)
207
+ return total_norm
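Two of these helpers worth highlighting are sequence_mask and rand_slice_segments; a small sketch of their behaviour follows. The tensor sizes are assumptions for illustration, and the import path matches the one used by attentions.py above.

import torch
from rvc_cli.rvc.lib.algorithm.commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([4, 7])
mask = sequence_mask(lengths)       # (2, 7) boolean mask, True inside each sequence
print(mask.int())

feats = torch.randn(2, 192, 40)     # (batch, channels, frames)
segments, starts = rand_slice_segments(feats, x_lengths=torch.tensor([40, 30]), segment_size=8)
print(segments.shape, starts)       # torch.Size([2, 192, 8]) plus one random start frame per item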
config.py ADDED
@@ -0,0 +1,159 @@
1
+ import torch
2
+ import json
3
+ import os
4
+
5
+ CONFIG_BASE_PATH = os.path.join("rvc_cli", "rvc", "configs")
6
+
7
+ version_config_paths = [
8
+ os.path.join("v1", "32000.json"),
9
+ os.path.join("v1", "40000.json"),
10
+ os.path.join("v1", "48000.json"),
11
+ os.path.join("v2", "48000.json"),
12
+ os.path.join("v2", "40000.json"),
13
+ os.path.join("v2", "32000.json"),
14
+ ]
15
+
16
+
17
+ def singleton(cls):
18
+ instances = {}
19
+
20
+ def get_instance(*args, **kwargs):
21
+ if cls not in instances:
22
+ instances[cls] = cls(*args, **kwargs)
23
+ return instances[cls]
24
+
25
+ return get_instance
26
+
27
+
28
+ @singleton
29
+ class Config:
30
+ def __init__(self):
31
+ self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
32
+ self.is_half = self.device.startswith("cuda")
33
+ self.gpu_name = (
34
+ torch.cuda.get_device_name(int(self.device.split(":")[-1]))
35
+ if self.device.startswith("cuda")
36
+ else None
37
+ )
38
+ self.json_config = self.load_config_json()
39
+ self.gpu_mem = None
40
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
41
+
42
+ def load_config_json(self) -> dict:
43
+ configs = {}
44
+ for config_file in version_config_paths:
45
+ config_path = os.path.join(CONFIG_BASE_PATH, config_file)
46
+
47
+ if not os.path.exists(config_path):
48
+ print(f"[WARNING] Config file not found: {config_path}")
49
+ continue # Skip missing config files
50
+
51
+ try:
52
+ with open(config_path, "r") as f:
53
+ configs[config_file] = json.load(f)
54
+ except json.JSONDecodeError:
55
+ print(f"[ERROR] Failed to parse JSON in {config_path}")
56
+
57
+ return configs
58
+
59
+ def has_mps(self) -> bool:
60
+ return torch.backends.mps.is_available()
61
+
62
+ def has_xpu(self) -> bool:
63
+ return hasattr(torch, "xpu") and torch.xpu.is_available()
64
+
65
+ def set_precision(self, precision):
66
+ if precision not in ["fp32", "fp16"]:
67
+ raise ValueError("Invalid precision type. Must be 'fp32' or 'fp16'.")
68
+
69
+ fp16_run_value = precision == "fp16"
70
+ for config_path in version_config_paths:
71
+ full_config_path = os.path.join(CONFIG_BASE_PATH, config_path)
72
+ if not os.path.exists(full_config_path):
73
+ print(f"[WARNING] Config file missing: {full_config_path}")
74
+ continue
75
+
76
+ try:
77
+ with open(full_config_path, "r") as f:
78
+ config = json.load(f)
79
+ config["train"]["fp16_run"] = fp16_run_value
80
+ with open(full_config_path, "w") as f:
81
+ json.dump(config, f, indent=4)
82
+ except (FileNotFoundError, json.JSONDecodeError):
83
+ print(f"[ERROR] Failed to update {full_config_path}")
84
+
85
+ return f"Set precision to {precision} in available config files."
86
+
87
+ def get_precision(self):
88
+ if not version_config_paths:
89
+ raise FileNotFoundError("No configuration paths provided.")
90
+
91
+ full_config_path = os.path.join(CONFIG_BASE_PATH, version_config_paths[0])
92
+ if not os.path.exists(full_config_path):
93
+ print(f"[ERROR] Config file missing: {full_config_path}")
94
+ return None
95
+
96
+ try:
97
+ with open(full_config_path, "r") as f:
98
+ config = json.load(f)
99
+ return "fp16" if config["train"].get("fp16_run", False) else "fp32"
100
+ except json.JSONDecodeError:
101
+ print(f"[ERROR] JSON parsing failed in {full_config_path}")
102
+ return None
103
+
104
+ def device_config(self) -> tuple:
105
+ if self.device.startswith("cuda"):
106
+ self.set_cuda_config()
107
+ elif self.has_mps():
108
+ self.device = "mps"
109
+ self.is_half = False
110
+ self.set_precision("fp32")
111
+ else:
112
+ self.device = "cpu"
113
+ self.is_half = False
114
+ self.set_precision("fp32")
115
+
116
+ x_pad, x_query, x_center, x_max = (
117
+ (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
118
+ )
119
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
120
+ x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
121
+
122
+ return x_pad, x_query, x_center, x_max
123
+
124
+ def set_cuda_config(self):
125
+ i_device = int(self.device.split(":")[-1])
126
+ self.gpu_name = torch.cuda.get_device_name(i_device)
127
+ low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
128
+ if (
129
+ any(gpu in self.gpu_name for gpu in low_end_gpus)
130
+ and "V100" not in self.gpu_name.upper()
131
+ ):
132
+ self.is_half = False
133
+ self.set_precision("fp32")
134
+
135
+ self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
136
+ 1024**3
137
+ )
138
+
139
+
140
+ def max_vram_gpu(gpu):
141
+ if torch.cuda.is_available():
142
+ gpu_properties = torch.cuda.get_device_properties(gpu)
143
+ return round(gpu_properties.total_memory / 1024 / 1024 / 1024)
144
+ return 8
145
+
146
+
147
+ def get_gpu_info():
148
+ ngpu = torch.cuda.device_count()
149
+ gpu_infos = []
150
+ if torch.cuda.is_available() or ngpu != 0:
151
+ for i in range(ngpu):
152
+ gpu_name = torch.cuda.get_device_name(i)
153
+ mem = int(torch.cuda.get_device_properties(i).total_memory / 1024**3 + 0.4)
154
+ gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)")
155
+ return "\n".join(gpu_infos) if gpu_infos else "No compatible GPU found."
156
+
157
+
158
+ def get_number_of_gpus():
159
+ return "-".join(map(str, range(torch.cuda.device_count()))) if torch.cuda.is_available() else "-"
core.py ADDED
@@ -0,0 +1,1519 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import argparse
5
+ import subprocess
6
+ from functools import lru_cache
7
+ from distutils.util import strtobool
8
+ from rvc_cli.rvc.lib.tools.model_download import model_download_pipeline
9
+ from rvc_cli.rvc.lib.tools.prerequisites import prequisites_download_pipeline
10
+
11
+
12
+
13
+ now_dir = os.getcwd()
14
+ sys.path.append(now_dir)
15
+
16
+ current_script_directory = os.path.dirname(os.path.realpath(__file__))
17
+ logs_path = os.path.join(current_script_directory, "logs")
18
+
19
+
20
+ python = sys.executable
21
+
22
+
23
+
24
+ @lru_cache(maxsize=None)
25
+ def import_voice_converter():
26
+ from rvc_cli.rvc.infer.infer import VoiceConverter
27
+
28
+ return VoiceConverter()
29
+
30
+
31
+ @lru_cache(maxsize=1)
32
+ def get_config():
33
+ from rvc_cli.rvc.configs.config import Config
34
+
35
+ return Config()
36
+
37
+
38
+ # Infer
39
+ def run_infer_script(
40
+ pitch: int,
41
+ filter_radius: int,
42
+ index_rate: float,
43
+ volume_envelope: int,
44
+ protect: float,
45
+ hop_length: int,
46
+ f0_method: str,
47
+ input_path: str,
48
+ output_path: str,
49
+ pth_path: str,
50
+ index_path: str,
51
+ split_audio: bool,
52
+ f0_autotune: bool,
53
+ f0_autotune_strength: float,
54
+ clean_audio: bool,
55
+ clean_strength: float,
56
+ export_format: str,
57
+ f0_file: str,
58
+ embedder_model: str,
59
+ embedder_model_custom: str = None,
60
+ formant_shifting: bool = False,
61
+ formant_qfrency: float = 1.0,
62
+ formant_timbre: float = 1.0,
63
+ post_process: bool = False,
64
+ reverb: bool = False,
65
+ pitch_shift: bool = False,
66
+ limiter: bool = False,
67
+ gain: bool = False,
68
+ distortion: bool = False,
69
+ chorus: bool = False,
70
+ bitcrush: bool = False,
71
+ clipping: bool = False,
72
+ compressor: bool = False,
73
+ delay: bool = False,
74
+ reverb_room_size: float = 0.5,
75
+ reverb_damping: float = 0.5,
76
+ reverb_wet_gain: float = 0.5,
77
+ reverb_dry_gain: float = 0.5,
78
+ reverb_width: float = 0.5,
79
+ reverb_freeze_mode: float = 0.5,
80
+ pitch_shift_semitones: float = 0.0,
81
+ limiter_threshold: float = -6,
82
+ limiter_release_time: float = 0.01,
83
+ gain_db: float = 0.0,
84
+ distortion_gain: float = 25,
85
+ chorus_rate: float = 1.0,
86
+ chorus_depth: float = 0.25,
87
+ chorus_center_delay: float = 7,
88
+ chorus_feedback: float = 0.0,
89
+ chorus_mix: float = 0.5,
90
+ bitcrush_bit_depth: int = 8,
91
+ clipping_threshold: float = -6,
92
+ compressor_threshold: float = 0,
93
+ compressor_ratio: float = 1,
94
+ compressor_attack: float = 1.0,
95
+ compressor_release: float = 100,
96
+ delay_seconds: float = 0.5,
97
+ delay_feedback: float = 0.0,
98
+ delay_mix: float = 0.5,
99
+ sid: int = 0,
100
+ ):
101
+ kwargs = {
102
+ "audio_input_path": input_path,
103
+ "audio_output_path": output_path,
104
+ "model_path": pth_path,
105
+ "index_path": index_path,
106
+ "pitch": pitch,
107
+ "filter_radius": filter_radius,
108
+ "index_rate": index_rate,
109
+ "volume_envelope": volume_envelope,
110
+ "protect": protect,
111
+ "hop_length": hop_length,
112
+ "f0_method": f0_method,
113
+ "pth_path": pth_path,
114
+ "index_path": index_path,
115
+ "split_audio": split_audio,
116
+ "f0_autotune": f0_autotune,
117
+ "f0_autotune_strength": f0_autotune_strength,
118
+ "clean_audio": clean_audio,
119
+ "clean_strength": clean_strength,
120
+ "export_format": export_format,
121
+ "f0_file": f0_file,
122
+ "embedder_model": embedder_model,
123
+ "embedder_model_custom": embedder_model_custom,
124
+ "post_process": post_process,
125
+ "formant_shifting": formant_shifting,
126
+ "formant_qfrency": formant_qfrency,
127
+ "formant_timbre": formant_timbre,
128
+ "reverb": reverb,
129
+ "pitch_shift": pitch_shift,
130
+ "limiter": limiter,
131
+ "gain": gain,
132
+ "distortion": distortion,
133
+ "chorus": chorus,
134
+ "bitcrush": bitcrush,
135
+ "clipping": clipping,
136
+ "compressor": compressor,
137
+ "delay": delay,
138
+ "reverb_room_size": reverb_room_size,
139
+ "reverb_damping": reverb_damping,
140
+ "reverb_wet_level": reverb_wet_gain,
141
+ "reverb_dry_level": reverb_dry_gain,
142
+ "reverb_width": reverb_width,
143
+ "reverb_freeze_mode": reverb_freeze_mode,
144
+ "pitch_shift_semitones": pitch_shift_semitones,
145
+ "limiter_threshold": limiter_threshold,
146
+ "limiter_release": limiter_release_time,
147
+ "gain_db": gain_db,
148
+ "distortion_gain": distortion_gain,
149
+ "chorus_rate": chorus_rate,
150
+ "chorus_depth": chorus_depth,
151
+ "chorus_delay": chorus_center_delay,
152
+ "chorus_feedback": chorus_feedback,
153
+ "chorus_mix": chorus_mix,
154
+ "bitcrush_bit_depth": bitcrush_bit_depth,
155
+ "clipping_threshold": clipping_threshold,
156
+ "compressor_threshold": compressor_threshold,
157
+ "compressor_ratio": compressor_ratio,
158
+ "compressor_attack": compressor_attack,
159
+ "compressor_release": compressor_release,
160
+ "delay_seconds": delay_seconds,
161
+ "delay_feedback": delay_feedback,
162
+ "delay_mix": delay_mix,
163
+ "sid": sid,
164
+ }
165
+ infer_pipeline = import_voice_converter()
166
+ infer_pipeline.convert_audio(
167
+ **kwargs,
168
+ )
169
+ return f"File {input_path} inferred successfully.", output_path.replace(
170
+ ".wav", f".{export_format.lower()}"
171
+ )
172
+
173
+
174
+ # Batch infer
175
+ def run_batch_infer_script(
176
+ pitch: int,
177
+ filter_radius: int,
178
+ index_rate: float,
179
+ volume_envelope: int,
180
+ protect: float,
181
+ hop_length: int,
182
+ f0_method: str,
183
+ input_folder: str,
184
+ output_folder: str,
185
+ pth_path: str,
186
+ index_path: str,
187
+ split_audio: bool,
188
+ f0_autotune: bool,
189
+ f0_autotune_strength: float,
190
+ clean_audio: bool,
191
+ clean_strength: float,
192
+ export_format: str,
193
+ f0_file: str,
194
+ embedder_model: str,
195
+ embedder_model_custom: str = None,
196
+ formant_shifting: bool = False,
197
+ formant_qfrency: float = 1.0,
198
+ formant_timbre: float = 1.0,
199
+ post_process: bool = False,
200
+ reverb: bool = False,
201
+ pitch_shift: bool = False,
202
+ limiter: bool = False,
203
+ gain: bool = False,
204
+ distortion: bool = False,
205
+ chorus: bool = False,
206
+ bitcrush: bool = False,
207
+ clipping: bool = False,
208
+ compressor: bool = False,
209
+ delay: bool = False,
210
+ reverb_room_size: float = 0.5,
211
+ reverb_damping: float = 0.5,
212
+ reverb_wet_gain: float = 0.5,
213
+ reverb_dry_gain: float = 0.5,
214
+ reverb_width: float = 0.5,
215
+ reverb_freeze_mode: float = 0.5,
216
+ pitch_shift_semitones: float = 0.0,
217
+ limiter_threshold: float = -6,
218
+ limiter_release_time: float = 0.01,
219
+ gain_db: float = 0.0,
220
+ distortion_gain: float = 25,
221
+ chorus_rate: float = 1.0,
222
+ chorus_depth: float = 0.25,
223
+ chorus_center_delay: float = 7,
224
+ chorus_feedback: float = 0.0,
225
+ chorus_mix: float = 0.5,
226
+ bitcrush_bit_depth: int = 8,
227
+ clipping_threshold: float = -6,
228
+ compressor_threshold: float = 0,
229
+ compressor_ratio: float = 1,
230
+ compressor_attack: float = 1.0,
231
+ compressor_release: float = 100,
232
+ delay_seconds: float = 0.5,
233
+ delay_feedback: float = 0.0,
234
+ delay_mix: float = 0.5,
235
+ sid: int = 0,
236
+ ):
237
+ kwargs = {
238
+ "audio_input_paths": input_folder,
239
+ "audio_output_path": output_folder,
240
+ "model_path": pth_path,
241
+ "index_path": index_path,
242
+ "pitch": pitch,
243
+ "filter_radius": filter_radius,
244
+ "index_rate": index_rate,
245
+ "volume_envelope": volume_envelope,
246
+ "protect": protect,
247
+ "hop_length": hop_length,
248
+ "f0_method": f0_method,
249
+ "pth_path": pth_path,
250
+ "index_path": index_path,
251
+ "split_audio": split_audio,
252
+ "f0_autotune": f0_autotune,
253
+ "f0_autotune_strength": f0_autotune_strength,
254
+ "clean_audio": clean_audio,
255
+ "clean_strength": clean_strength,
256
+ "export_format": export_format,
257
+ "f0_file": f0_file,
258
+ "embedder_model": embedder_model,
259
+ "embedder_model_custom": embedder_model_custom,
260
+ "post_process": post_process,
261
+ "formant_shifting": formant_shifting,
262
+ "formant_qfrency": formant_qfrency,
263
+ "formant_timbre": formant_timbre,
264
+ "reverb": reverb,
265
+ "pitch_shift": pitch_shift,
266
+ "limiter": limiter,
267
+ "gain": gain,
268
+ "distortion": distortion,
269
+ "chorus": chorus,
270
+ "bitcrush": bitcrush,
271
+ "clipping": clipping,
272
+ "compressor": compressor,
273
+ "delay": delay,
274
+ "reverb_room_size": reverb_room_size,
275
+ "reverb_damping": reverb_damping,
276
+ "reverb_wet_level": reverb_wet_gain,
277
+ "reverb_dry_level": reverb_dry_gain,
278
+ "reverb_width": reverb_width,
279
+ "reverb_freeze_mode": reverb_freeze_mode,
280
+ "pitch_shift_semitones": pitch_shift_semitones,
281
+ "limiter_threshold": limiter_threshold,
282
+ "limiter_release": limiter_release_time,
283
+ "gain_db": gain_db,
284
+ "distortion_gain": distortion_gain,
285
+ "chorus_rate": chorus_rate,
286
+ "chorus_depth": chorus_depth,
287
+ "chorus_delay": chorus_center_delay,
288
+ "chorus_feedback": chorus_feedback,
289
+ "chorus_mix": chorus_mix,
290
+ "bitcrush_bit_depth": bitcrush_bit_depth,
291
+ "clipping_threshold": clipping_threshold,
292
+ "compressor_threshold": compressor_threshold,
293
+ "compressor_ratio": compressor_ratio,
294
+ "compressor_attack": compressor_attack,
295
+ "compressor_release": compressor_release,
296
+ "delay_seconds": delay_seconds,
297
+ "delay_feedback": delay_feedback,
298
+ "delay_mix": delay_mix,
299
+ "sid": sid,
300
+ }
301
+ infer_pipeline = import_voice_converter()
302
+ infer_pipeline.convert_audio_batch(
303
+ **kwargs,
304
+ )
305
+
306
+ return f"Files from {input_folder} inferred successfully."
307
+
308
+
309
+
310
+ # Download
311
+ def run_download_script(model_link: str):
312
+ model_download_pipeline(model_link)
313
+ return f"Model downloaded successfully."
314
+
315
+
316
+
317
+
318
+
319
+ # Parse arguments
320
+ def parse_arguments():
321
+ parser = argparse.ArgumentParser(
322
+ description="Run the main.py script with specific parameters."
323
+ )
324
+ subparsers = parser.add_subparsers(
325
+ title="subcommands", dest="mode", help="Choose a mode"
326
+ )
327
+
328
+ # Parser for 'infer' mode
329
+ infer_parser = subparsers.add_parser("infer", help="Run inference")
330
+ pitch_description = (
331
+ "Set the pitch of the audio. Higher values result in a higher pitch."
332
+ )
333
+ infer_parser.add_argument(
334
+ "--pitch",
335
+ type=int,
336
+ help=pitch_description,
337
+ choices=range(-24, 25),
338
+ default=0,
339
+ )
340
+ filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio."
341
+ infer_parser.add_argument(
342
+ "--filter_radius",
343
+ type=int,
344
+ help=filter_radius_description,
345
+ choices=range(11),
346
+ default=3,
347
+ )
348
+ index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
349
+ infer_parser.add_argument(
350
+ "--index_rate",
351
+ type=float,
352
+ help=index_rate_description,
353
+ choices=[i / 100.0 for i in range(0, 101)],
354
+ default=0.3,
355
+ )
356
+ volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
357
+ infer_parser.add_argument(
358
+ "--volume_envelope",
359
+ type=float,
360
+ help=volume_envelope_description,
361
+ choices=[i / 100.0 for i in range(0, 101)],
362
+ default=1,
363
+ )
364
+ protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
365
+ infer_parser.add_argument(
366
+ "--protect",
367
+ type=float,
368
+ help=protect_description,
369
+ choices=[i / 1000.0 for i in range(0, 501)],
370
+ default=0.33,
371
+ )
372
+ hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy."
373
+ infer_parser.add_argument(
374
+ "--hop_length",
375
+ type=int,
376
+ help=hop_length_description,
377
+ choices=range(1, 513),
378
+ default=128,
379
+ )
380
+ f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
381
+ infer_parser.add_argument(
382
+ "--f0_method",
383
+ type=str,
384
+ help=f0_method_description,
385
+ choices=[
386
+ "crepe",
387
+ "crepe-tiny",
388
+ "rmvpe",
389
+ "fcpe",
390
+ "hybrid[crepe+rmvpe]",
391
+ "hybrid[crepe+fcpe]",
392
+ "hybrid[rmvpe+fcpe]",
393
+ "hybrid[crepe+rmvpe+fcpe]",
394
+ ],
395
+ default="rmvpe",
396
+ )
397
+ infer_parser.add_argument(
398
+ "--input_path",
399
+ type=str,
400
+ help="Full path to the input audio file.",
401
+ required=True,
402
+ )
403
+ infer_parser.add_argument(
404
+ "--output_path",
405
+ type=str,
406
+ help="Full path to the output audio file.",
407
+ required=True,
408
+ )
409
+ pth_path_description = "Full path to the RVC model file (.pth)."
410
+ infer_parser.add_argument(
411
+ "--pth_path", type=str, help=pth_path_description, required=True
412
+ )
413
+ index_path_description = "Full path to the index file (.index)."
414
+ infer_parser.add_argument(
415
+ "--index_path", type=str, help=index_path_description, required=True
416
+ )
417
+ split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
418
+ infer_parser.add_argument(
419
+ "--split_audio",
420
+ type=lambda x: bool(strtobool(x)),
421
+ choices=[True, False],
422
+ help=split_audio_description,
423
+ default=False,
424
+ )
425
+ f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
426
+ infer_parser.add_argument(
427
+ "--f0_autotune",
428
+ type=lambda x: bool(strtobool(x)),
429
+ choices=[True, False],
430
+ help=f0_autotune_description,
431
+ default=False,
432
+ )
433
+ f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
434
+ infer_parser.add_argument(
435
+ "--f0_autotune_strength",
436
+ type=float,
437
+ help=f0_autotune_strength_description,
438
+ choices=[(i / 10) for i in range(11)],
439
+ default=1.0,
440
+ )
441
+ clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
442
+ infer_parser.add_argument(
443
+ "--clean_audio",
444
+ type=lambda x: bool(strtobool(x)),
445
+ choices=[True, False],
446
+ help=clean_audio_description,
447
+ default=False,
448
+ )
449
+ clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
450
+ infer_parser.add_argument(
451
+ "--clean_strength",
452
+ type=float,
453
+ help=clean_strength_description,
454
+ choices=[(i / 10) for i in range(11)],
455
+ default=0.7,
456
+ )
457
+ export_format_description = "Select the desired output audio format."
458
+ infer_parser.add_argument(
459
+ "--export_format",
460
+ type=str,
461
+ help=export_format_description,
462
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
463
+ default="WAV",
464
+ )
465
+ embedder_model_description = (
466
+ "Choose the model used for generating speaker embeddings."
467
+ )
468
+ infer_parser.add_argument(
469
+ "--embedder_model",
470
+ type=str,
471
+ help=embedder_model_description,
472
+ choices=[
473
+ "contentvec",
474
+ "chinese-hubert-base",
475
+ "japanese-hubert-base",
476
+ "korean-hubert-base",
477
+ "custom",
478
+ ],
479
+ default="contentvec",
480
+ )
481
+ embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
482
+ infer_parser.add_argument(
483
+ "--embedder_model_custom",
484
+ type=str,
485
+ help=embedder_model_custom_description,
486
+ default=None,
487
+ )
488
+ f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
489
+ infer_parser.add_argument(
490
+ "--f0_file",
491
+ type=str,
492
+ help=f0_file_description,
493
+ default=None,
494
+ )
495
+ formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
496
+ infer_parser.add_argument(
497
+ "--formant_shifting",
498
+ type=lambda x: bool(strtobool(x)),
499
+ choices=[True, False],
500
+ help=formant_shifting_description,
501
+ default=False,
502
+ required=False,
503
+ )
504
+ formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
505
+ infer_parser.add_argument(
506
+ "--formant_qfrency",
507
+ type=float,
508
+ help=formant_qfrency_description,
509
+ default=1.0,
510
+ required=False,
511
+ )
512
+ formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
513
+ infer_parser.add_argument(
514
+ "--formant_timbre",
515
+ type=float,
516
+ help=formant_timbre_description,
517
+ default=1.0,
518
+ required=False,
519
+ )
520
+ sid_description = "Speaker ID for multi-speaker models."
521
+ infer_parser.add_argument(
522
+ "--sid",
523
+ type=int,
524
+ help=sid_description,
525
+ default=0,
526
+ required=False,
527
+ )
528
+ post_process_description = "Apply post-processing effects to the output audio."
529
+ infer_parser.add_argument(
530
+ "--post_process",
531
+ type=lambda x: bool(strtobool(x)),
532
+ choices=[True, False],
533
+ help=post_process_description,
534
+ default=False,
535
+ required=False,
536
+ )
537
+ reverb_description = "Apply reverb effect to the output audio."
538
+ infer_parser.add_argument(
539
+ "--reverb",
540
+ type=lambda x: bool(strtobool(x)),
541
+ choices=[True, False],
542
+ help=reverb_description,
543
+ default=False,
544
+ required=False,
545
+ )
546
+
547
+ pitch_shift_description = "Apply pitch shifting effect to the output audio."
548
+ infer_parser.add_argument(
549
+ "--pitch_shift",
550
+ type=lambda x: bool(strtobool(x)),
551
+ choices=[True, False],
552
+ help=pitch_shift_description,
553
+ default=False,
554
+ required=False,
555
+ )
556
+
557
+ limiter_description = "Apply limiter effect to the output audio."
558
+ infer_parser.add_argument(
559
+ "--limiter",
560
+ type=lambda x: bool(strtobool(x)),
561
+ choices=[True, False],
562
+ help=limiter_description,
563
+ default=False,
564
+ required=False,
565
+ )
566
+
567
+ gain_description = "Apply gain effect to the output audio."
568
+ infer_parser.add_argument(
569
+ "--gain",
570
+ type=lambda x: bool(strtobool(x)),
571
+ choices=[True, False],
572
+ help=gain_description,
573
+ default=False,
574
+ required=False,
575
+ )
576
+
577
+ distortion_description = "Apply distortion effect to the output audio."
578
+ infer_parser.add_argument(
579
+ "--distortion",
580
+ type=lambda x: bool(strtobool(x)),
581
+ choices=[True, False],
582
+ help=distortion_description,
583
+ default=False,
584
+ required=False,
585
+ )
586
+
587
+ chorus_description = "Apply chorus effect to the output audio."
588
+ infer_parser.add_argument(
589
+ "--chorus",
590
+ type=lambda x: bool(strtobool(x)),
591
+ choices=[True, False],
592
+ help=chorus_description,
593
+ default=False,
594
+ required=False,
595
+ )
596
+
597
+ bitcrush_description = "Apply bitcrush effect to the output audio."
598
+ infer_parser.add_argument(
599
+ "--bitcrush",
600
+ type=lambda x: bool(strtobool(x)),
601
+ choices=[True, False],
602
+ help=bitcrush_description,
603
+ default=False,
604
+ required=False,
605
+ )
606
+
607
+ clipping_description = "Apply clipping effect to the output audio."
608
+ infer_parser.add_argument(
609
+ "--clipping",
610
+ type=lambda x: bool(strtobool(x)),
611
+ choices=[True, False],
612
+ help=clipping_description,
613
+ default=False,
614
+ required=False,
615
+ )
616
+
617
+ compressor_description = "Apply compressor effect to the output audio."
618
+ infer_parser.add_argument(
619
+ "--compressor",
620
+ type=lambda x: bool(strtobool(x)),
621
+ choices=[True, False],
622
+ help=compressor_description,
623
+ default=False,
624
+ required=False,
625
+ )
626
+
627
+ delay_description = "Apply delay effect to the output audio."
628
+ infer_parser.add_argument(
629
+ "--delay",
630
+ type=lambda x: bool(strtobool(x)),
631
+ choices=[True, False],
632
+ help=delay_description,
633
+ default=False,
634
+ required=False,
635
+ )
636
+
637
+ reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
638
+ infer_parser.add_argument(
639
+ "--reverb_room_size",
640
+ type=float,
641
+ help=reverb_room_size_description,
642
+ default=0.5,
643
+ required=False,
644
+ )
645
+
646
+ reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
647
+ infer_parser.add_argument(
648
+ "--reverb_damping",
649
+ type=float,
650
+ help=reverb_damping_description,
651
+ default=0.5,
652
+ required=False,
653
+ )
654
+
655
+ reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
656
+ infer_parser.add_argument(
657
+ "--reverb_wet_gain",
658
+ type=float,
659
+ help=reverb_wet_gain_description,
660
+ default=0.5,
661
+ required=False,
662
+ )
663
+
664
+ reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
665
+ infer_parser.add_argument(
666
+ "--reverb_dry_gain",
667
+ type=float,
668
+ help=reverb_dry_gain_description,
669
+ default=0.5,
670
+ required=False,
671
+ )
672
+
673
+ reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
674
+ infer_parser.add_argument(
675
+ "--reverb_width",
676
+ type=float,
677
+ help=reverb_width_description,
678
+ default=0.5,
679
+ required=False,
680
+ )
681
+
682
+ reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
683
+ infer_parser.add_argument(
684
+ "--reverb_freeze_mode",
685
+ type=float,
686
+ help=reverb_freeze_mode_description,
687
+ default=0.5,
688
+ required=False,
689
+ )
690
+
691
+ pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
692
+ infer_parser.add_argument(
693
+ "--pitch_shift_semitones",
694
+ type=float,
695
+ help=pitch_shift_semitones_description,
696
+ default=0.0,
697
+ required=False,
698
+ )
699
+
700
+ limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect."
701
+ infer_parser.add_argument(
702
+ "--limiter_threshold",
703
+ type=float,
704
+ help=limiter_threshold_description,
705
+ default=-6,
706
+ required=False,
707
+ )
708
+
709
+ limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
710
+ infer_parser.add_argument(
711
+ "--limiter_release_time",
712
+ type=float,
713
+ help=limiter_release_time_description,
714
+ default=0.01,
715
+ required=False,
716
+ )
717
+
718
+ gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
719
+ infer_parser.add_argument(
720
+ "--gain_db",
721
+ type=float,
722
+ help=gain_db_description,
723
+ default=0.0,
724
+ required=False,
725
+ )
726
+
727
+ distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
728
+ infer_parser.add_argument(
729
+ "--distortion_gain",
730
+ type=float,
731
+ help=distortion_gain_description,
732
+ default=25,
733
+ required=False,
734
+ )
735
+
736
+ chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
737
+ infer_parser.add_argument(
738
+ "--chorus_rate",
739
+ type=float,
740
+ help=chorus_rate_description,
741
+ default=1.0,
742
+ required=False,
743
+ )
744
+
745
+ chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
746
+ infer_parser.add_argument(
747
+ "--chorus_depth",
748
+ type=float,
749
+ help=chorus_depth_description,
750
+ default=0.25,
751
+ required=False,
752
+ )
753
+
754
+ chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
755
+ infer_parser.add_argument(
756
+ "--chorus_center_delay",
757
+ type=float,
758
+ help=chorus_center_delay_description,
759
+ default=7,
760
+ required=False,
761
+ )
762
+
763
+ chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
764
+ infer_parser.add_argument(
765
+ "--chorus_feedback",
766
+ type=float,
767
+ help=chorus_feedback_description,
768
+ default=0.0,
769
+ required=False,
770
+ )
771
+
772
+ chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
773
+ infer_parser.add_argument(
774
+ "--chorus_mix",
775
+ type=float,
776
+ help=chorus_mix_description,
777
+ default=0.5,
778
+ required=False,
779
+ )
780
+
781
+ bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect."
782
+ infer_parser.add_argument(
783
+ "--bitcrush_bit_depth",
784
+ type=int,
785
+ help=bitcrush_bit_depth_description,
786
+ default=8,
787
+ required=False,
788
+ )
789
+
790
+ clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect."
791
+ infer_parser.add_argument(
792
+ "--clipping_threshold",
793
+ type=float,
794
+ help=clipping_threshold_description,
795
+ default=-6,
796
+ required=False,
797
+ )
798
+
799
+ compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect."
800
+ infer_parser.add_argument(
801
+ "--compressor_threshold",
802
+ type=float,
803
+ help=compressor_threshold_description,
804
+ default=0,
805
+ required=False,
806
+ )
807
+
808
+ compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
809
+ infer_parser.add_argument(
810
+ "--compressor_ratio",
811
+ type=float,
812
+ help=compressor_ratio_description,
813
+ default=1,
814
+ required=False,
815
+ )
816
+
817
+ compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect."
818
+ infer_parser.add_argument(
819
+ "--compressor_attack",
820
+ type=float,
821
+ help=compressor_attack_description,
822
+ default=1.0,
823
+ required=False,
824
+ )
825
+
826
+ compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect."
827
+ infer_parser.add_argument(
828
+ "--compressor_release",
829
+ type=float,
830
+ help=compressor_release_description,
831
+ default=100,
832
+ required=False,
833
+ )
834
+
835
+ delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
836
+ infer_parser.add_argument(
837
+ "--delay_seconds",
838
+ type=float,
839
+ help=delay_seconds_description,
840
+ default=0.5,
841
+ required=False,
842
+ )
843
+ delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
844
+ infer_parser.add_argument(
845
+ "--delay_feedback",
846
+ type=float,
847
+ help=delay_feedback_description,
848
+ default=0.0,
849
+ required=False,
850
+ )
851
+ delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
852
+ infer_parser.add_argument(
853
+ "--delay_mix",
854
+ type=float,
855
+ help=delay_mix_description,
856
+ default=0.5,
857
+ required=False,
858
+ )
859
+
860
+ # Parser for 'batch_infer' mode
861
+ batch_infer_parser = subparsers.add_parser(
862
+ "batch_infer",
863
+ help="Run batch inference",
864
+ )
865
+ batch_infer_parser.add_argument(
866
+ "--pitch",
867
+ type=int,
868
+ help=pitch_description,
869
+ choices=range(-24, 25),
870
+ default=0,
871
+ )
872
+ batch_infer_parser.add_argument(
873
+ "--filter_radius",
874
+ type=int,
875
+ help=filter_radius_description,
876
+ choices=range(11),
877
+ default=3,
878
+ )
879
+ batch_infer_parser.add_argument(
880
+ "--index_rate",
881
+ type=float,
882
+ help=index_rate_description,
883
+ choices=[i / 100.0 for i in range(0, 101)],
884
+ default=0.3,
885
+ )
886
+ batch_infer_parser.add_argument(
887
+ "--volume_envelope",
888
+ type=float,
889
+ help=volume_envelope_description,
890
+ choices=[i / 100.0 for i in range(0, 101)],
891
+ default=1,
892
+ )
893
+ batch_infer_parser.add_argument(
894
+ "--protect",
895
+ type=float,
896
+ help=protect_description,
897
+ choices=[i / 1000.0 for i in range(0, 501)],
898
+ default=0.33,
899
+ )
900
+ batch_infer_parser.add_argument(
901
+ "--hop_length",
902
+ type=int,
903
+ help=hop_length_description,
904
+ choices=range(1, 513),
905
+ default=128,
906
+ )
907
+ batch_infer_parser.add_argument(
908
+ "--f0_method",
909
+ type=str,
910
+ help=f0_method_description,
911
+ choices=[
912
+ "crepe",
913
+ "crepe-tiny",
914
+ "rmvpe",
915
+ "fcpe",
916
+ "hybrid[crepe+rmvpe]",
917
+ "hybrid[crepe+fcpe]",
918
+ "hybrid[rmvpe+fcpe]",
919
+ "hybrid[crepe+rmvpe+fcpe]",
920
+ ],
921
+ default="rmvpe",
922
+ )
923
+ batch_infer_parser.add_argument(
924
+ "--input_folder",
925
+ type=str,
926
+ help="Path to the folder containing input audio files.",
927
+ required=True,
928
+ )
929
+ batch_infer_parser.add_argument(
930
+ "--output_folder",
931
+ type=str,
932
+ help="Path to the folder for saving output audio files.",
933
+ required=True,
934
+ )
935
+ batch_infer_parser.add_argument(
936
+ "--pth_path", type=str, help=pth_path_description, required=True
937
+ )
938
+ batch_infer_parser.add_argument(
939
+ "--index_path", type=str, help=index_path_description, required=True
940
+ )
941
+ batch_infer_parser.add_argument(
942
+ "--split_audio",
943
+ type=lambda x: bool(strtobool(x)),
944
+ choices=[True, False],
945
+ help=split_audio_description,
946
+ default=False,
947
+ )
948
+ batch_infer_parser.add_argument(
949
+ "--f0_autotune",
950
+ type=lambda x: bool(strtobool(x)),
951
+ choices=[True, False],
952
+ help=f0_autotune_description,
953
+ default=False,
954
+ )
955
+ batch_infer_parser.add_argument(
956
+ "--f0_autotune_strength",
957
+ type=float,
958
+ help=f0_autotune_strength_description,
959
+ choices=[(i / 10) for i in range(11)],
960
+ default=1.0,
961
+ )
962
+ batch_infer_parser.add_argument(
963
+ "--clean_audio",
964
+ type=lambda x: bool(strtobool(x)),
965
+ choices=[True, False],
966
+ help=clean_audio_description,
967
+ default=False,
968
+ )
969
+ batch_infer_parser.add_argument(
970
+ "--clean_strength",
971
+ type=float,
972
+ help=clean_strength_description,
973
+ choices=[(i / 10) for i in range(11)],
974
+ default=0.7,
975
+ )
976
+ batch_infer_parser.add_argument(
977
+ "--export_format",
978
+ type=str,
979
+ help=export_format_description,
980
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
981
+ default="WAV",
982
+ )
983
+ batch_infer_parser.add_argument(
984
+ "--embedder_model",
985
+ type=str,
986
+ help=embedder_model_description,
987
+ choices=[
988
+ "contentvec",
989
+ "chinese-hubert-base",
990
+ "japanese-hubert-base",
991
+ "korean-hubert-base",
992
+ "custom",
993
+ ],
994
+ default="contentvec",
995
+ )
996
+ batch_infer_parser.add_argument(
997
+ "--embedder_model_custom",
998
+ type=str,
999
+ help=embedder_model_custom_description,
1000
+ default=None,
1001
+ )
1002
+ batch_infer_parser.add_argument(
1003
+ "--f0_file",
1004
+ type=str,
1005
+ help=f0_file_description,
1006
+ default=None,
1007
+ )
1008
+ batch_infer_parser.add_argument(
1009
+ "--formant_shifting",
1010
+ type=lambda x: bool(strtobool(x)),
1011
+ choices=[True, False],
1012
+ help=formant_shifting_description,
1013
+ default=False,
1014
+ required=False,
1015
+ )
1016
+ batch_infer_parser.add_argument(
1017
+ "--formant_qfrency",
1018
+ type=float,
1019
+ help=formant_qfrency_description,
1020
+ default=1.0,
1021
+ required=False,
1022
+ )
1023
+ batch_infer_parser.add_argument(
1024
+ "--formant_timbre",
1025
+ type=float,
1026
+ help=formant_timbre_description,
1027
+ default=1.0,
1028
+ required=False,
1029
+ )
1030
+ batch_infer_parser.add_argument(
1031
+ "--sid",
1032
+ type=int,
1033
+ help=sid_description,
1034
+ default=0,
1035
+ required=False,
1036
+ )
1037
+ batch_infer_parser.add_argument(
1038
+ "--post_process",
1039
+ type=lambda x: bool(strtobool(x)),
1040
+ choices=[True, False],
1041
+ help=post_process_description,
1042
+ default=False,
1043
+ required=False,
1044
+ )
1045
+ batch_infer_parser.add_argument(
1046
+ "--reverb",
1047
+ type=lambda x: bool(strtobool(x)),
1048
+ choices=[True, False],
1049
+ help=reverb_description,
1050
+ default=False,
1051
+ required=False,
1052
+ )
1053
+
1054
+ batch_infer_parser.add_argument(
1055
+ "--pitch_shift",
1056
+ type=lambda x: bool(strtobool(x)),
1057
+ choices=[True, False],
1058
+ help=pitch_shift_description,
1059
+ default=False,
1060
+ required=False,
1061
+ )
1062
+
1063
+ batch_infer_parser.add_argument(
1064
+ "--limiter",
1065
+ type=lambda x: bool(strtobool(x)),
1066
+ choices=[True, False],
1067
+ help=limiter_description,
1068
+ default=False,
1069
+ required=False,
1070
+ )
1071
+
1072
+ batch_infer_parser.add_argument(
1073
+ "--gain",
1074
+ type=lambda x: bool(strtobool(x)),
1075
+ choices=[True, False],
1076
+ help=gain_description,
1077
+ default=False,
1078
+ required=False,
1079
+ )
1080
+
1081
+ batch_infer_parser.add_argument(
1082
+ "--distortion",
1083
+ type=lambda x: bool(strtobool(x)),
1084
+ choices=[True, False],
1085
+ help=distortion_description,
1086
+ default=False,
1087
+ required=False,
1088
+ )
1089
+
1090
+ batch_infer_parser.add_argument(
1091
+ "--chorus",
1092
+ type=lambda x: bool(strtobool(x)),
1093
+ choices=[True, False],
1094
+ help=chorus_description,
1095
+ default=False,
1096
+ required=False,
1097
+ )
1098
+
1099
+ batch_infer_parser.add_argument(
1100
+ "--bitcrush",
1101
+ type=lambda x: bool(strtobool(x)),
1102
+ choices=[True, False],
1103
+ help=bitcrush_description,
1104
+ default=False,
1105
+ required=False,
1106
+ )
1107
+
1108
+ batch_infer_parser.add_argument(
1109
+ "--clipping",
1110
+ type=lambda x: bool(strtobool(x)),
1111
+ choices=[True, False],
1112
+ help=clipping_description,
1113
+ default=False,
1114
+ required=False,
1115
+ )
1116
+
1117
+ batch_infer_parser.add_argument(
1118
+ "--compressor",
1119
+ type=lambda x: bool(strtobool(x)),
1120
+ choices=[True, False],
1121
+ help=compressor_description,
1122
+ default=False,
1123
+ required=False,
1124
+ )
1125
+
1126
+ batch_infer_parser.add_argument(
1127
+ "--delay",
1128
+ type=lambda x: bool(strtobool(x)),
1129
+ choices=[True, False],
1130
+ help=delay_description,
1131
+ default=False,
1132
+ required=False,
1133
+ )
1134
+
1135
+ batch_infer_parser.add_argument(
1136
+ "--reverb_room_size",
1137
+ type=float,
1138
+ help=reverb_room_size_description,
1139
+ default=0.5,
1140
+ required=False,
1141
+ )
1142
+
1143
+ batch_infer_parser.add_argument(
1144
+ "--reverb_damping",
1145
+ type=float,
1146
+ help=reverb_damping_description,
1147
+ default=0.5,
1148
+ required=False,
1149
+ )
1150
+
1151
+ batch_infer_parser.add_argument(
1152
+ "--reverb_wet_gain",
1153
+ type=float,
1154
+ help=reverb_wet_gain_description,
1155
+ default=0.5,
1156
+ required=False,
1157
+ )
1158
+
1159
+ batch_infer_parser.add_argument(
1160
+ "--reverb_dry_gain",
1161
+ type=float,
1162
+ help=reverb_dry_gain_description,
1163
+ default=0.5,
1164
+ required=False,
1165
+ )
1166
+
1167
+ batch_infer_parser.add_argument(
1168
+ "--reverb_width",
1169
+ type=float,
1170
+ help=reverb_width_description,
1171
+ default=0.5,
1172
+ required=False,
1173
+ )
1174
+
1175
+ batch_infer_parser.add_argument(
1176
+ "--reverb_freeze_mode",
1177
+ type=float,
1178
+ help=reverb_freeze_mode_description,
1179
+ default=0.5,
1180
+ required=False,
1181
+ )
1182
+
1183
+ batch_infer_parser.add_argument(
1184
+ "--pitch_shift_semitones",
1185
+ type=float,
1186
+ help=pitch_shift_semitones_description,
1187
+ default=0.0,
1188
+ required=False,
1189
+ )
1190
+
1191
+ batch_infer_parser.add_argument(
1192
+ "--limiter_threshold",
1193
+ type=float,
1194
+ help=limiter_threshold_description,
1195
+ default=-6,
1196
+ required=False,
1197
+ )
1198
+
1199
+ batch_infer_parser.add_argument(
1200
+ "--limiter_release_time",
1201
+ type=float,
1202
+ help=limiter_release_time_description,
1203
+ default=0.01,
1204
+ required=False,
1205
+ )
1206
+ batch_infer_parser.add_argument(
1207
+ "--gain_db",
1208
+ type=float,
1209
+ help=gain_db_description,
1210
+ default=0.0,
1211
+ required=False,
1212
+ )
1213
+
1214
+ batch_infer_parser.add_argument(
1215
+ "--distortion_gain",
1216
+ type=float,
1217
+ help=distortion_gain_description,
1218
+ default=25,
1219
+ required=False,
1220
+ )
1221
+
1222
+ batch_infer_parser.add_argument(
1223
+ "--chorus_rate",
1224
+ type=float,
1225
+ help=chorus_rate_description,
1226
+ default=1.0,
1227
+ required=False,
1228
+ )
1229
+
1230
+ batch_infer_parser.add_argument(
1231
+ "--chorus_depth",
1232
+ type=float,
1233
+ help=chorus_depth_description,
1234
+ default=0.25,
1235
+ required=False,
1236
+ )
1237
+ batch_infer_parser.add_argument(
1238
+ "--chorus_center_delay",
1239
+ type=float,
1240
+ help=chorus_center_delay_description,
1241
+ default=7,
1242
+ required=False,
1243
+ )
1244
+
1245
+ batch_infer_parser.add_argument(
1246
+ "--chorus_feedback",
1247
+ type=float,
1248
+ help=chorus_feedback_description,
1249
+ default=0.0,
1250
+ required=False,
1251
+ )
1252
+
1253
+ batch_infer_parser.add_argument(
1254
+ "--chorus_mix",
1255
+ type=float,
1256
+ help=chorus_mix_description,
1257
+ default=0.5,
1258
+ required=False,
1259
+ )
1260
+
1261
+ batch_infer_parser.add_argument(
1262
+ "--bitcrush_bit_depth",
1263
+ type=int,
1264
+ help=bitcrush_bit_depth_description,
1265
+ default=8,
1266
+ required=False,
1267
+ )
1268
+
1269
+ batch_infer_parser.add_argument(
1270
+ "--clipping_threshold",
1271
+ type=float,
1272
+ help=clipping_threshold_description,
1273
+ default=-6,
1274
+ required=False,
1275
+ )
1276
+
1277
+ batch_infer_parser.add_argument(
1278
+ "--compressor_threshold",
1279
+ type=float,
1280
+ help=compressor_threshold_description,
1281
+ default=0,
1282
+ required=False,
1283
+ )
1284
+
1285
+ batch_infer_parser.add_argument(
1286
+ "--compressor_ratio",
1287
+ type=float,
1288
+ help=compressor_ratio_description,
1289
+ default=1,
1290
+ required=False,
1291
+ )
1292
+
1293
+ batch_infer_parser.add_argument(
1294
+ "--compressor_attack",
1295
+ type=float,
1296
+ help=compressor_attack_description,
1297
+ default=1.0,
1298
+ required=False,
1299
+ )
1300
+
1301
+ batch_infer_parser.add_argument(
1302
+ "--compressor_release",
1303
+ type=float,
1304
+ help=compressor_release_description,
1305
+ default=100,
1306
+ required=False,
1307
+ )
1308
+ batch_infer_parser.add_argument(
1309
+ "--delay_seconds",
1310
+ type=float,
1311
+ help=delay_seconds_description,
1312
+ default=0.5,
1313
+ required=False,
1314
+ )
1315
+ batch_infer_parser.add_argument(
1316
+ "--delay_feedback",
1317
+ type=float,
1318
+ help=delay_feedback_description,
1319
+ default=0.0,
1320
+ required=False,
1321
+ )
1322
+ batch_infer_parser.add_argument(
1323
+ "--delay_mix",
1324
+ type=float,
1325
+ help=delay_mix_description,
1326
+ default=0.5,
1327
+ required=False,
1328
+ )
1329
+
1330
+
1331
+ # Parser for 'download' mode
1332
+ download_parser = subparsers.add_parser(
1333
+ "download", help="Download a model from a provided link."
1334
+ )
1335
+ download_parser.add_argument(
1336
+ "--model_link", type=str, help="Direct link to the model file.", required=True
1337
+ )
1338
+
1339
+ # Parser for 'prerequisites' mode
1340
+ prerequisites_parser = subparsers.add_parser(
1341
+ "prerequisites", help="Install prerequisites for RVC."
1342
+ )
1343
+ prerequisites_parser.add_argument(
1344
+ "--models",
1345
+ type=lambda x: bool(strtobool(x)),
1346
+ choices=[True, False],
1347
+ default=True,
1348
+ help="Download additional models.",
1349
+ )
1350
+ prerequisites_parser.add_argument(
1351
+ "--exe",
1352
+ type=lambda x: bool(strtobool(x)),
1353
+ choices=[True, False],
1354
+ default=True,
1355
+ help="Download required executables.",
1356
+ )
1357
+
1358
+
1359
+ return parser.parse_args()
1360
+
1361
+
1362
+ def main():
1363
+ if len(sys.argv) == 1:
1364
+ print("Please run the script with '-h' for more information.")
1365
+ sys.exit(1)
1366
+
1367
+ args = parse_arguments()
1368
+
1369
+ try:
1370
+ if args.mode == "infer":
1371
+ run_infer_script(
1372
+ pitch=args.pitch,
1373
+ filter_radius=args.filter_radius,
1374
+ index_rate=args.index_rate,
1375
+ volume_envelope=args.volume_envelope,
1376
+ protect=args.protect,
1377
+ hop_length=args.hop_length,
1378
+ f0_method=args.f0_method,
1379
+ input_path=args.input_path,
1380
+ output_path=args.output_path,
1381
+ pth_path=args.pth_path,
1382
+ index_path=args.index_path,
1383
+ split_audio=args.split_audio,
1384
+ f0_autotune=args.f0_autotune,
1385
+ f0_autotune_strength=args.f0_autotune_strength,
1386
+ clean_audio=args.clean_audio,
1387
+ clean_strength=args.clean_strength,
1388
+ export_format=args.export_format,
1389
+ embedder_model=args.embedder_model,
1390
+ embedder_model_custom=args.embedder_model_custom,
1391
+ f0_file=args.f0_file,
1392
+ formant_shifting=args.formant_shifting,
1393
+ formant_qfrency=args.formant_qfrency,
1394
+ formant_timbre=args.formant_timbre,
1395
+ sid=args.sid,
1396
+ post_process=args.post_process,
1397
+ reverb=args.reverb,
1398
+ pitch_shift=args.pitch_shift,
1399
+ limiter=args.limiter,
1400
+ gain=args.gain,
1401
+ distortion=args.distortion,
1402
+ chorus=args.chorus,
1403
+ bitcrush=args.bitcrush,
1404
+ clipping=args.clipping,
1405
+ compressor=args.compressor,
1406
+ delay=args.delay,
1407
+ reverb_room_size=args.reverb_room_size,
1408
+ reverb_damping=args.reverb_damping,
1409
+ reverb_wet_gain=args.reverb_wet_gain,
1410
+ reverb_dry_gain=args.reverb_dry_gain,
1411
+ reverb_width=args.reverb_width,
1412
+ reverb_freeze_mode=args.reverb_freeze_mode,
1413
+ pitch_shift_semitones=args.pitch_shift_semitones,
1414
+ limiter_threshold=args.limiter_threshold,
1415
+ limiter_release_time=args.limiter_release_time,
1416
+ gain_db=args.gain_db,
1417
+ distortion_gain=args.distortion_gain,
1418
+ chorus_rate=args.chorus_rate,
1419
+ chorus_depth=args.chorus_depth,
1420
+ chorus_center_delay=args.chorus_center_delay,
1421
+ chorus_feedback=args.chorus_feedback,
1422
+ chorus_mix=args.chorus_mix,
1423
+ bitcrush_bit_depth=args.bitcrush_bit_depth,
1424
+ clipping_threshold=args.clipping_threshold,
1425
+ compressor_threshold=args.compressor_threshold,
1426
+ compressor_ratio=args.compressor_ratio,
1427
+ compressor_attack=args.compressor_attack,
1428
+ compressor_release=args.compressor_release,
1429
+ delay_seconds=args.delay_seconds,
1430
+ delay_feedback=args.delay_feedback,
1431
+ delay_mix=args.delay_mix,
1432
+ )
1433
+ elif args.mode == "batch_infer":
1434
+ run_batch_infer_script(
1435
+ pitch=args.pitch,
1436
+ filter_radius=args.filter_radius,
1437
+ index_rate=args.index_rate,
1438
+ volume_envelope=args.volume_envelope,
1439
+ protect=args.protect,
1440
+ hop_length=args.hop_length,
1441
+ f0_method=args.f0_method,
1442
+ input_folder=args.input_folder,
1443
+ output_folder=args.output_folder,
1444
+ pth_path=args.pth_path,
1445
+ index_path=args.index_path,
1446
+ split_audio=args.split_audio,
1447
+ f0_autotune=args.f0_autotune,
1448
+ f0_autotune_strength=args.f0_autotune_strength,
1449
+ clean_audio=args.clean_audio,
1450
+ clean_strength=args.clean_strength,
1451
+ export_format=args.export_format,
1452
+ embedder_model=args.embedder_model,
1453
+ embedder_model_custom=args.embedder_model_custom,
1454
+ f0_file=args.f0_file,
1455
+ formant_shifting=args.formant_shifting,
1456
+ formant_qfrency=args.formant_qfrency,
1457
+ formant_timbre=args.formant_timbre,
1458
+ sid=args.sid,
1459
+ post_process=args.post_process,
1460
+ reverb=args.reverb,
1461
+ pitch_shift=args.pitch_shift,
1462
+ limiter=args.limiter,
1463
+ gain=args.gain,
1464
+ distortion=args.distortion,
1465
+ chorus=args.chorus,
1466
+ bitcrush=args.bitcrush,
1467
+ clipping=args.clipping,
1468
+ compressor=args.compressor,
1469
+ delay=args.delay,
1470
+ reverb_room_size=args.reverb_room_size,
1471
+ reverb_damping=args.reverb_damping,
1472
+ reverb_wet_gain=args.reverb_wet_gain,
1473
+ reverb_dry_gain=args.reverb_dry_gain,
1474
+ reverb_width=args.reverb_width,
1475
+ reverb_freeze_mode=args.reverb_freeze_mode,
1476
+ pitch_shift_semitones=args.pitch_shift_semitones,
1477
+ limiter_threshold=args.limiter_threshold,
1478
+ limiter_release_time=args.limiter_release_time,
1479
+ gain_db=args.gain_db,
1480
+ distortion_gain=args.distortion_gain,
1481
+ chorus_rate=args.chorus_rate,
1482
+ chorus_depth=args.chorus_depth,
1483
+ chorus_center_delay=args.chorus_center_delay,
1484
+ chorus_feedback=args.chorus_feedback,
1485
+ chorus_mix=args.chorus_mix,
1486
+ bitcrush_bit_depth=args.bitcrush_bit_depth,
1487
+ clipping_threshold=args.clipping_threshold,
1488
+ compressor_threshold=args.compressor_threshold,
1489
+ compressor_ratio=args.compressor_ratio,
1490
+ compressor_attack=args.compressor_attack,
1491
+ compressor_release=args.compressor_release,
1492
+ delay_seconds=args.delay_seconds,
1493
+ delay_feedback=args.delay_feedback,
1494
+ delay_mix=args.delay_mix,
1495
+ )
1496
+
1497
+ elif args.mode == "prerequisites":
1498
+ run_prerequisites_script(
1499
+ pretraineds_v1_f0=args.pretraineds_v1_f0,
1500
+ pretraineds_v1_nof0=args.pretraineds_v1_nof0,
1501
+ pretraineds_v2_f0=args.pretraineds_v2_f0,
1502
+ pretraineds_v2_nof0=args.pretraineds_v2_nof0,
1503
+ models=args.models,
1504
+ exe=args.exe,
1505
+ )
1506
+ elif args.mode == "download":
1507
+ run_download_script(
1508
+ model_link=args.model_link,
1509
+ )
1510
+ except Exception as error:
1511
+ print(f"An error occurred during execution: {error}")
1512
+
1513
+ import traceback
1514
+
1515
+ traceback.print_exc()
1516
+
1517
+
1518
+ if __name__ == "__main__":
1519
+ main()
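For orientation, two illustrative invocations of the CLI assembled above. These are sketches only: the entry-point name and all paths are placeholders, and the input/output and model flags are assumed to be spelled the same way as the attributes read back in main() (--input_path, --pth_path, and so on).
# Single-file conversion (remaining options fall back to their parser defaults):
#   python core.py infer --input_path vocals.wav --output_path out.wav --pth_path model.pth --index_path model.index
# Folder conversion using flags from the batch_infer parser above:
#   python core.py batch_infer --input_folder ./songs --output_folder ./converted --pth_path model.pth --index_path model.index --export_format FLAC --sid 0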
discriminators.py ADDED
@@ -0,0 +1,160 @@
1
+ import torch
2
+ from torch.nn.utils.parametrizations import spectral_norm, weight_norm
3
+
4
+ from rvc_cli.rvc.lib.algorithm.commons import get_padding
5
+ from rvc_cli.rvc.lib.algorithm.residuals import LRELU_SLOPE
6
+
7
+
8
+ class MultiPeriodDiscriminator(torch.nn.Module):
9
+ """
10
+ Multi-period discriminator.
11
+
12
+ This class implements a multi-period discriminator, which is used to
13
+ discriminate between real and fake audio signals. The discriminator
14
+ is composed of a series of convolutional layers that are applied to
15
+ the input signal at different periods.
16
+
17
+ Args:
18
+ version (str): Model version; "v1" uses periods [2, 3, 5, 7, 11, 17] and "v2" uses [2, 3, 5, 7, 11, 17, 23, 37].
19
+ use_spectral_norm (bool): Whether to use spectral normalization.
20
+ Defaults to False.
21
+ """
22
+
23
+ def __init__(self, version, use_spectral_norm=False):
24
+ super(MultiPeriodDiscriminator, self).__init__()
25
+ periods = (
26
+ [2, 3, 5, 7, 11, 17] if version == "v1" else [2, 3, 5, 7, 11, 17, 23, 37]
27
+ )
28
+ self.discriminators = torch.nn.ModuleList(
29
+ [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
30
+ + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
31
+ )
32
+
33
+ def forward(self, y, y_hat):
34
+ """
35
+ Forward pass of the multi-period discriminator.
36
+
37
+ Args:
38
+ y (torch.Tensor): Real audio signal.
39
+ y_hat (torch.Tensor): Fake audio signal.
40
+ """
41
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
42
+ for d in self.discriminators:
43
+ y_d_r, fmap_r = d(y)
44
+ y_d_g, fmap_g = d(y_hat)
45
+ y_d_rs.append(y_d_r)
46
+ y_d_gs.append(y_d_g)
47
+ fmap_rs.append(fmap_r)
48
+ fmap_gs.append(fmap_g)
49
+
50
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
51
+
52
+
53
+ class DiscriminatorS(torch.nn.Module):
54
+ """
55
+ Discriminator for the short-term component.
56
+
57
+ This class implements a discriminator for the short-term component
58
+ of the audio signal. The discriminator is composed of a series of
59
+ convolutional layers that are applied to the input signal.
60
+ """
61
+
62
+ def __init__(self, use_spectral_norm=False):
63
+ super(DiscriminatorS, self).__init__()
64
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
65
+ self.convs = torch.nn.ModuleList(
66
+ [
67
+ norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)),
68
+ norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
69
+ norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
70
+ norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
71
+ norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
72
+ norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)),
73
+ ]
74
+ )
75
+ self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
76
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)
77
+
78
+ def forward(self, x):
79
+ """
80
+ Forward pass of the discriminator.
81
+
82
+ Args:
83
+ x (torch.Tensor): Input audio signal.
84
+ """
85
+ fmap = []
86
+ for conv in self.convs:
87
+ x = self.lrelu(conv(x))
88
+ fmap.append(x)
89
+ x = self.conv_post(x)
90
+ fmap.append(x)
91
+ x = torch.flatten(x, 1, -1)
92
+ return x, fmap
93
+
94
+
95
+ class DiscriminatorP(torch.nn.Module):
96
+ """
97
+ Discriminator for the long-term component.
98
+
99
+ This class implements a discriminator for the long-term component
100
+ of the audio signal. The discriminator is composed of a series of
101
+ convolutional layers that are applied to the input signal at a given
102
+ period.
103
+
104
+ Args:
105
+ period (int): Period of the discriminator.
106
+ kernel_size (int): Kernel size of the convolutional layers.
107
+ Defaults to 5.
108
+ stride (int): Stride of the convolutional layers. Defaults to 3.
109
+ use_spectral_norm (bool): Whether to use spectral normalization.
110
+ Defaults to False.
111
+ """
112
+
113
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
114
+ super(DiscriminatorP, self).__init__()
115
+ self.period = period
116
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
117
+
118
+ in_channels = [1, 32, 128, 512, 1024]
119
+ out_channels = [32, 128, 512, 1024, 1024]
120
+
121
+ self.convs = torch.nn.ModuleList(
122
+ [
123
+ norm_f(
124
+ torch.nn.Conv2d(
125
+ in_ch,
126
+ out_ch,
127
+ (kernel_size, 1),
128
+ (stride, 1),
129
+ padding=(get_padding(kernel_size, 1), 0),
130
+ )
131
+ )
132
+ for in_ch, out_ch in zip(in_channels, out_channels)
133
+ ]
134
+ )
135
+
136
+ self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
137
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)
138
+
139
+ def forward(self, x):
140
+ """
141
+ Forward pass of the discriminator.
142
+
143
+ Args:
144
+ x (torch.Tensor): Input audio signal.
145
+ """
146
+ fmap = []
147
+ b, c, t = x.shape
148
+ if t % self.period != 0:
149
+ n_pad = self.period - (t % self.period)
150
+ x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
151
+ x = x.view(b, c, -1, self.period)
152
+
153
+ for conv in self.convs:
154
+ x = self.lrelu(conv(x))
155
+ fmap.append(x)
156
+
157
+ x = self.conv_post(x)
158
+ fmap.append(x)
159
+ x = torch.flatten(x, 1, -1)
160
+ return x, fmap
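For a quick shape check, a usage sketch of the discriminators defined above (illustrative batch size and clip length; the import path assumes these files live under rvc_cli/rvc/lib/algorithm, as the imports suggest):
import torch
from rvc_cli.rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator

mpd = MultiPeriodDiscriminator(version="v2")        # DiscriminatorS plus one DiscriminatorP per period
real = torch.randn(2, 1, 16000)                     # (batch, channels, samples)
fake = torch.randn(2, 1, 16000)                     # generator output with the same shape
y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(real, fake)
print(len(y_d_rs))                                  # 9 sub-discriminator outputs for "v2"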
encoders.py ADDED
@@ -0,0 +1,218 @@
1
+ import math
2
+ import torch
3
+ from typing import Optional
4
+
5
+ from rvc_cli.rvc.lib.algorithm.commons import sequence_mask
6
+ from rvc_cli.rvc.lib.algorithm.modules import WaveNet
7
+ from rvc_cli.rvc.lib.algorithm.normalization import LayerNorm
8
+ from rvc_cli.rvc.lib.algorithm.attentions import FFN, MultiHeadAttention
9
+
10
+
11
+ class Encoder(torch.nn.Module):
12
+ """
13
+ Encoder module for the Transformer model.
14
+
15
+ Args:
16
+ hidden_channels (int): Number of hidden channels in the encoder.
17
+ filter_channels (int): Number of filter channels in the feed-forward network.
18
+ n_heads (int): Number of attention heads.
19
+ n_layers (int): Number of encoder layers.
20
+ kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1.
21
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
22
+ window_size (int, optional): Window size for relative positional encoding. Defaults to 10.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ hidden_channels,
28
+ filter_channels,
29
+ n_heads,
30
+ n_layers,
31
+ kernel_size=1,
32
+ p_dropout=0.0,
33
+ window_size=10,
34
+ ):
35
+ super().__init__()
36
+ self.hidden_channels = hidden_channels
37
+ self.filter_channels = filter_channels
38
+ self.n_heads = n_heads
39
+ self.n_layers = n_layers
40
+ self.kernel_size = kernel_size
41
+ self.p_dropout = p_dropout
42
+ self.window_size = window_size
43
+
44
+ self.drop = torch.nn.Dropout(p_dropout)
45
+ self.attn_layers = torch.nn.ModuleList()
46
+ self.norm_layers_1 = torch.nn.ModuleList()
47
+ self.ffn_layers = torch.nn.ModuleList()
48
+ self.norm_layers_2 = torch.nn.ModuleList()
49
+ for i in range(self.n_layers):
50
+ self.attn_layers.append(
51
+ MultiHeadAttention(
52
+ hidden_channels,
53
+ hidden_channels,
54
+ n_heads,
55
+ p_dropout=p_dropout,
56
+ window_size=window_size,
57
+ )
58
+ )
59
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
60
+ self.ffn_layers.append(
61
+ FFN(
62
+ hidden_channels,
63
+ hidden_channels,
64
+ filter_channels,
65
+ kernel_size,
66
+ p_dropout=p_dropout,
67
+ )
68
+ )
69
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
70
+
71
+ def forward(self, x, x_mask):
72
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
73
+ x = x * x_mask
74
+ for i in range(self.n_layers):
75
+ y = self.attn_layers[i](x, x, attn_mask)
76
+ y = self.drop(y)
77
+ x = self.norm_layers_1[i](x + y)
78
+
79
+ y = self.ffn_layers[i](x, x_mask)
80
+ y = self.drop(y)
81
+ x = self.norm_layers_2[i](x + y)
82
+ x = x * x_mask
83
+ return x
84
+
85
+
86
+ class TextEncoder(torch.nn.Module):
87
+ """Text Encoder with configurable embedding dimension.
88
+
89
+ Args:
90
+ out_channels (int): Output channels of the encoder.
91
+ hidden_channels (int): Hidden channels of the encoder.
92
+ filter_channels (int): Filter channels of the encoder.
93
+ n_heads (int): Number of attention heads.
94
+ n_layers (int): Number of encoder layers.
95
+ kernel_size (int): Kernel size of the convolutional layers.
96
+ p_dropout (float): Dropout probability.
97
+ embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768).
98
+ f0 (bool, optional): Whether to use F0 embedding. Defaults to True.
99
+ """
100
+
101
+ def __init__(
102
+ self,
103
+ out_channels,
104
+ hidden_channels,
105
+ filter_channels,
106
+ n_heads,
107
+ n_layers,
108
+ kernel_size,
109
+ p_dropout,
110
+ embedding_dim,
111
+ f0=True,
112
+ ):
113
+ super(TextEncoder, self).__init__()
114
+ self.out_channels = out_channels
115
+ self.hidden_channels = hidden_channels
116
+ self.filter_channels = filter_channels
117
+ self.n_heads = n_heads
118
+ self.n_layers = n_layers
119
+ self.kernel_size = kernel_size
120
+ self.p_dropout = float(p_dropout)
121
+ self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
122
+ self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
123
+ if f0:
124
+ self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
125
+ self.encoder = Encoder(
126
+ hidden_channels,
127
+ filter_channels,
128
+ n_heads,
129
+ n_layers,
130
+ kernel_size,
131
+ float(p_dropout),
132
+ )
133
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
134
+
135
+ def forward(
136
+ self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
137
+ ):
138
+ if pitch is None:
139
+ x = self.emb_phone(phone)
140
+ else:
141
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
142
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
143
+ x = self.lrelu(x)
144
+ x = torch.transpose(x, 1, -1) # [b, h, t]
145
+ x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
146
+ x = self.encoder(x * x_mask, x_mask)
147
+ stats = self.proj(x) * x_mask
148
+
149
+ m, logs = torch.split(stats, self.out_channels, dim=1)
150
+ return m, logs, x_mask
151
+
152
+
153
+ class PosteriorEncoder(torch.nn.Module):
154
+ """Posterior Encoder for inferring latent representation.
155
+
156
+ Args:
157
+ in_channels (int): Number of channels in the input.
158
+ out_channels (int): Number of channels in the output.
159
+ hidden_channels (int): Number of hidden channels in the encoder.
160
+ kernel_size (int): Kernel size of the convolutional layers.
161
+ dilation_rate (int): Dilation rate of the convolutional layers.
162
+ n_layers (int): Number of layers in the encoder.
163
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
164
+ """
165
+
166
+ def __init__(
167
+ self,
168
+ in_channels,
169
+ out_channels,
170
+ hidden_channels,
171
+ kernel_size,
172
+ dilation_rate,
173
+ n_layers,
174
+ gin_channels=0,
175
+ ):
176
+ super(PosteriorEncoder, self).__init__()
177
+ self.in_channels = in_channels
178
+ self.out_channels = out_channels
179
+ self.hidden_channels = hidden_channels
180
+ self.kernel_size = kernel_size
181
+ self.dilation_rate = dilation_rate
182
+ self.n_layers = n_layers
183
+ self.gin_channels = gin_channels
184
+
185
+ self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
186
+ self.enc = WaveNet(
187
+ hidden_channels,
188
+ kernel_size,
189
+ dilation_rate,
190
+ n_layers,
191
+ gin_channels=gin_channels,
192
+ )
193
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
194
+
195
+ def forward(
196
+ self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
197
+ ):
198
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
199
+ x = self.pre(x) * x_mask
200
+ x = self.enc(x, x_mask, g=g)
201
+ stats = self.proj(x) * x_mask
202
+ m, logs = torch.split(stats, self.out_channels, dim=1)
203
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
204
+ return z, m, logs, x_mask
205
+
206
+ def remove_weight_norm(self):
207
+ """Removes weight normalization from the encoder."""
208
+ self.enc.remove_weight_norm()
209
+
210
+ def __prepare_scriptable__(self):
211
+ """Prepares the module for scripting."""
212
+ for hook in self.enc._forward_pre_hooks.values():
213
+ if (
214
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
215
+ and hook.__class__.__name__ == "WeightNorm"
216
+ ):
217
+ torch.nn.utils.remove_weight_norm(self.enc)
218
+ return self
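A shape-oriented sketch for TextEncoder (the hidden/filter sizes and layer counts below are illustrative values, not requirements of this file):
import torch
from rvc_cli.rvc.lib.algorithm.encoders import TextEncoder

enc = TextEncoder(
    out_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.0,
    embedding_dim=768,                      # 768 for v2 features, 256 for v1
)
phone = torch.randn(1, 100, 768)            # (batch, frames, embedding_dim) content features
pitch = torch.randint(1, 256, (1, 100))     # coarse F0 buckets for the pitch embedding
lengths = torch.tensor([100])
m, logs, x_mask = enc(phone, pitch, lengths)
print(m.shape, logs.shape, x_mask.shape)    # (1, 192, 100), (1, 192, 100), (1, 1, 100)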
generators.py ADDED
@@ -0,0 +1,231 @@
1
+ import torch
2
+ import numpy as np
3
+ from torch.nn.utils import remove_weight_norm
4
+ from torch.nn.utils.parametrizations import weight_norm
5
+ from typing import Optional
6
+
7
+ from rvc_cli.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2
8
+ from rvc_cli.rvc.lib.algorithm.commons import init_weights
9
+
10
+
11
+ class Generator(torch.nn.Module):
12
+ """Generator for synthesizing audio.
13
+
14
+ Args:
15
+ initial_channel (int): Number of channels in the initial convolutional layer.
16
+ resblock (str): Type of residual block to use (1 or 2).
17
+ resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
18
+ resblock_dilation_sizes (list): Dilation rates of the residual blocks.
19
+ upsample_rates (list): Upsampling rates.
20
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
21
+ upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
22
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ initial_channel,
28
+ resblock,
29
+ resblock_kernel_sizes,
30
+ resblock_dilation_sizes,
31
+ upsample_rates,
32
+ upsample_initial_channel,
33
+ upsample_kernel_sizes,
34
+ gin_channels=0,
35
+ ):
36
+ super(Generator, self).__init__()
37
+ self.num_kernels = len(resblock_kernel_sizes)
38
+ self.num_upsamples = len(upsample_rates)
39
+ self.conv_pre = torch.nn.Conv1d(
40
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
41
+ )
42
+ resblock = ResBlock1 if resblock == "1" else ResBlock2
43
+
44
+ self.ups = torch.nn.ModuleList()
45
+ self.resblocks = torch.nn.ModuleList()
46
+
47
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
48
+ self.ups.append(
49
+ weight_norm(
50
+ torch.nn.ConvTranspose1d(
51
+ upsample_initial_channel // (2**i),
52
+ upsample_initial_channel // (2 ** (i + 1)),
53
+ k,
54
+ u,
55
+ padding=(k - u) // 2,
56
+ )
57
+ )
58
+ )
59
+ ch = upsample_initial_channel // (2 ** (i + 1))
60
+ for j, (k, d) in enumerate(
61
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
62
+ ):
63
+ self.resblocks.append(resblock(ch, k, d))
64
+
65
+ self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
66
+ self.ups.apply(init_weights)
67
+
68
+ if gin_channels != 0:
69
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
70
+
71
+ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
72
+ x = self.conv_pre(x)
73
+ if g is not None:
74
+ x = x + self.cond(g)
75
+
76
+ for i in range(self.num_upsamples):
77
+ x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
78
+ x = self.ups[i](x)
79
+ xs = None
80
+ for j in range(self.num_kernels):
81
+ if xs is None:
82
+ xs = self.resblocks[i * self.num_kernels + j](x)
83
+ else:
84
+ xs += self.resblocks[i * self.num_kernels + j](x)
85
+ x = xs / self.num_kernels
86
+
87
+ x = torch.nn.functional.leaky_relu(x)
88
+ x = self.conv_post(x)
89
+ x = torch.tanh(x)
90
+
91
+ return x
92
+
93
+ def __prepare_scriptable__(self):
94
+ """Prepares the module for scripting."""
95
+ for l in [*self.ups, *self.resblocks]:
96
+ for hook in l._forward_pre_hooks.values():
97
+ if (
98
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
99
+ and hook.__class__.__name__ == "WeightNorm"
100
+ ):
101
+ torch.nn.utils.remove_weight_norm(l)
102
+ return self
103
+
104
+ def remove_weight_norm(self):
105
+ """Removes weight normalization from the upsampling and residual blocks."""
106
+ for l in self.ups:
107
+ remove_weight_norm(l)
108
+ for l in self.resblocks:
109
+ l.remove_weight_norm()
110
+
111
+
112
+ class SineGenerator(torch.nn.Module):
113
+ """
114
+ A sine wave generator that synthesizes waveforms with optional harmonic overtones and noise.
115
+
116
+ Args:
117
+ sampling_rate (int): The sampling rate in Hz.
118
+ num_harmonics (int, optional): The number of harmonic overtones to include. Defaults to 0.
119
+ sine_amplitude (float, optional): The amplitude of the sine waveform. Defaults to 0.1.
120
+ noise_stddev (float, optional): The standard deviation of Gaussian noise. Defaults to 0.003.
121
+ voiced_threshold (float, optional): F0 threshold for distinguishing voiced/unvoiced frames. Defaults to 0.
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ sampling_rate: int,
127
+ num_harmonics: int = 0,
128
+ sine_amplitude: float = 0.1,
129
+ noise_stddev: float = 0.003,
130
+ voiced_threshold: float = 0.0,
131
+ ):
132
+ super(SineGenerator, self).__init__()
133
+ self.sampling_rate = sampling_rate
134
+ self.num_harmonics = num_harmonics
135
+ self.sine_amplitude = sine_amplitude
136
+ self.noise_stddev = noise_stddev
137
+ self.voiced_threshold = voiced_threshold
138
+ self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics
139
+
140
+ def _compute_voiced_unvoiced(self, f0: torch.Tensor) -> torch.Tensor:
141
+ """
142
+ Generate a binary mask to indicate voiced/unvoiced frames.
143
+
144
+ Args:
145
+ f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length).
146
+ """
147
+ uv_mask = (f0 > self.voiced_threshold).float()
148
+ return uv_mask
149
+
150
+ def _generate_sine_wave(
151
+ self, f0: torch.Tensor, upsampling_factor: int
152
+ ) -> torch.Tensor:
153
+ """
154
+ Generate sine waves for the fundamental frequency and its harmonics.
155
+
156
+ Args:
157
+ f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1).
158
+ upsampling_factor (int): Upsampling factor.
159
+ """
160
+ batch_size, length, _ = f0.shape
161
+
162
+ # Create an upsampling grid
163
+ upsampling_grid = torch.arange(
164
+ 1, upsampling_factor + 1, dtype=f0.dtype, device=f0.device
165
+ )
166
+
167
+ # Calculate phase increments
168
+ phase_increments = (f0 / self.sampling_rate) * upsampling_grid
169
+ phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5
170
+ cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype)
171
+ phase_increments += torch.nn.functional.pad(
172
+ cumulative_phase, (0, 0, 1, 0), mode="constant"
173
+ )
174
+
175
+ # Reshape to match the sine wave shape
176
+ phase_increments = phase_increments.reshape(batch_size, -1, 1)
177
+
178
+ # Scale for harmonics
179
+ harmonic_scale = torch.arange(
180
+ 1, self.waveform_dim + 1, dtype=f0.dtype, device=f0.device
181
+ ).reshape(1, 1, -1)
182
+ phase_increments *= harmonic_scale
183
+
184
+ # Add random phase offset (except for the fundamental)
185
+ random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device)
186
+ random_phase[..., 0] = 0 # Fundamental frequency has no random offset
187
+ phase_increments += random_phase
188
+
189
+ # Generate sine waves
190
+ sine_waves = torch.sin(2 * np.pi * phase_increments)
191
+ return sine_waves
192
+
193
+ def forward(self, f0: torch.Tensor, upsampling_factor: int):
194
+ """
195
+ Forward pass to generate sine waveforms with noise and voiced/unvoiced masking.
196
+
197
+ Args:
198
+ f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1).
199
+ upsampling_factor (int): Upsampling factor.
200
+ """
201
+ with torch.no_grad():
202
+ # Expand `f0` to include waveform dimensions
203
+ f0 = f0.unsqueeze(-1)
204
+
205
+ # Generate sine waves
206
+ sine_waves = (
207
+ self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude
208
+ )
209
+
210
+ # Compute voiced/unvoiced mask
211
+ voiced_mask = self._compute_voiced_unvoiced(f0)
212
+
213
+ # Upsample voiced/unvoiced mask
214
+ voiced_mask = torch.nn.functional.interpolate(
215
+ voiced_mask.transpose(2, 1),
216
+ scale_factor=float(upsampling_factor),
217
+ mode="nearest",
218
+ ).transpose(2, 1)
219
+
220
+ # Compute noise amplitude
221
+ noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) * (
222
+ self.sine_amplitude / 3
223
+ )
224
+
225
+ # Add Gaussian noise
226
+ noise = noise_amplitude * torch.randn_like(sine_waves)
227
+
228
+ # Combine sine waves and noise
229
+ sine_waveforms = sine_waves * voiced_mask + noise
230
+
231
+ return sine_waveforms, voiced_mask, noise
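A small sketch of SineGenerator in isolation (the 40 kHz rate and upsampling factor of 400 are illustrative only):
import torch
from rvc_cli.rvc.lib.algorithm.generators import SineGenerator

sine_gen = SineGenerator(sampling_rate=40000, num_harmonics=0)
f0 = torch.full((1, 50), 220.0)         # (batch, frames) F0 in Hz; zeros would mark unvoiced frames
waveform, voiced_mask, noise = sine_gen(f0, upsampling_factor=400)
print(waveform.shape)                   # (1, 20000, 1): sample-level sine excitation plus noise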
infer.py ADDED
@@ -0,0 +1,495 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import torch
5
+ import librosa
6
+ import logging
7
+ import traceback
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import noisereduce as nr
11
+ from pedalboard import (
12
+ Pedalboard,
13
+ Chorus,
14
+ Distortion,
15
+ Reverb,
16
+ PitchShift,
17
+ Limiter,
18
+ Gain,
19
+ Bitcrush,
20
+ Clipping,
21
+ Compressor,
22
+ Delay,
23
+ )
24
+
25
+ now_dir = os.getcwd()
26
+ sys.path.append(now_dir)
27
+
28
+ from rvc_cli.rvc.infer.pipeline import Pipeline as VC
29
+ from rvc_cli.rvc.lib.utils import load_audio_infer, load_embedding
30
+ from rvc_cli.rvc.lib.tools.split_audio import process_audio, merge_audio
31
+ from rvc_cli.rvc.lib.algorithm.synthesizers import Synthesizer
32
+ from rvc_cli.rvc.configs.config import Config
33
+
34
+ logging.getLogger("httpx").setLevel(logging.WARNING)
35
+ logging.getLogger("httpcore").setLevel(logging.WARNING)
36
+ logging.getLogger("faiss").setLevel(logging.WARNING)
37
+ logging.getLogger("faiss.loader").setLevel(logging.WARNING)
38
+
39
+
40
+ class VoiceConverter:
41
+ """
42
+ A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
43
+ """
44
+
45
+ def __init__(self):
46
+ """
47
+ Initializes the VoiceConverter with default configuration, and sets up models and parameters.
48
+ """
49
+ self.config = Config() # Load RVC configuration
50
+ self.hubert_model = (
51
+ None # Initialize the Hubert model (for embedding extraction)
52
+ )
53
+ self.last_embedder_model = None # Last used embedder model
54
+ self.tgt_sr = None # Target sampling rate for the output audio
55
+ self.net_g = None # Generator network for voice conversion
56
+ self.vc = None # Voice conversion pipeline instance
57
+ self.cpt = None # Checkpoint for loading model weights
58
+ self.version = None # Model version
59
+ self.n_spk = None # Number of speakers in the model
60
+ self.use_f0 = None # Whether the model uses F0
61
+ self.loaded_model = None
62
+
63
+ def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
64
+ """
65
+ Loads the HuBERT model for speaker embedding extraction.
66
+
67
+ Args:
68
+ embedder_model (str): Path to the pre-trained HuBERT model.
69
+ embedder_model_custom (str): Path to the custom HuBERT model.
70
+ """
71
+ self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
72
+ self.hubert_model.to(self.config.device)
73
+ self.hubert_model = (
74
+ self.hubert_model.half()
75
+ if self.config.is_half
76
+ else self.hubert_model.float()
77
+ )
78
+ self.hubert_model.eval()
79
+
80
+ @staticmethod
81
+ def remove_audio_noise(data, sr, reduction_strength=0.7):
82
+ """
83
+ Removes noise from an audio file using the NoiseReduce library.
84
+
85
+ Args:
86
+ data (numpy.ndarray): The audio data as a NumPy array.
87
+ sr (int): The sample rate of the audio data.
88
+ reduction_strength (float): Strength of the noise reduction. Default is 0.7.
89
+ """
90
+ try:
91
+ reduced_noise = nr.reduce_noise(
92
+ y=data, sr=sr, prop_decrease=reduction_strength
93
+ )
94
+ return reduced_noise
95
+ except Exception as error:
96
+ print(f"An error occurred removing audio noise: {error}")
97
+ return None
98
+
99
+ @staticmethod
100
+ def convert_audio_format(input_path, output_path, output_format):
101
+ """
102
+ Converts an audio file to a specified output format.
103
+
104
+ Args:
105
+ input_path (str): Path to the input audio file.
106
+ output_path (str): Path to the output audio file.
107
+ output_format (str): Desired audio format (e.g., "WAV", "MP3").
108
+ """
109
+ try:
110
+ if output_format != "WAV":
111
+ print(f"Saving audio as {output_format}...")
112
+ audio, sample_rate = librosa.load(input_path, sr=None)
113
+ common_sample_rates = [
114
+ 8000,
115
+ 11025,
116
+ 12000,
117
+ 16000,
118
+ 22050,
119
+ 24000,
120
+ 32000,
121
+ 44100,
122
+ 48000,
123
+ ]
124
+ target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
125
+ audio = librosa.resample(
126
+ audio, orig_sr=sample_rate, target_sr=target_sr
127
+ )
128
+ sf.write(output_path, audio, target_sr, format=output_format.lower())
129
+ return output_path
130
+ except Exception as error:
131
+ print(f"An error occurred converting the audio format: {error}")
132
+
133
+ @staticmethod
134
+ def post_process_audio(
135
+ audio_input,
136
+ sample_rate,
137
+ **kwargs,
138
+ ):
139
+ board = Pedalboard()
140
+ if kwargs.get("reverb", False):
141
+ reverb = Reverb(
142
+ room_size=kwargs.get("reverb_room_size", 0.5),
143
+ damping=kwargs.get("reverb_damping", 0.5),
144
+ wet_level=kwargs.get("reverb_wet_level", 0.33),
145
+ dry_level=kwargs.get("reverb_dry_level", 0.4),
146
+ width=kwargs.get("reverb_width", 1.0),
147
+ freeze_mode=kwargs.get("reverb_freeze_mode", 0),
148
+ )
149
+ board.append(reverb)
150
+ if kwargs.get("pitch_shift", False):
151
+ pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0))
152
+ board.append(pitch_shift)
153
+ if kwargs.get("limiter", False):
154
+ limiter = Limiter(
155
+ threshold_db=kwargs.get("limiter_threshold", -6),
156
+ release_ms=kwargs.get("limiter_release", 0.05),
157
+ )
158
+ board.append(limiter)
159
+ if kwargs.get("gain", False):
160
+ gain = Gain(gain_db=kwargs.get("gain_db", 0))
161
+ board.append(gain)
162
+ if kwargs.get("distortion", False):
163
+ distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25))
164
+ board.append(distortion)
165
+ if kwargs.get("chorus", False):
166
+ chorus = Chorus(
167
+ rate_hz=kwargs.get("chorus_rate", 1.0),
168
+ depth=kwargs.get("chorus_depth", 0.25),
169
+ centre_delay_ms=kwargs.get("chorus_delay", 7),
170
+ feedback=kwargs.get("chorus_feedback", 0.0),
171
+ mix=kwargs.get("chorus_mix", 0.5),
172
+ )
173
+ board.append(chorus)
174
+ if kwargs.get("bitcrush", False):
175
+ bitcrush = Bitcrush(bit_depth=kwargs.get("bitcrush_bit_depth", 8))
176
+ board.append(bitcrush)
177
+ if kwargs.get("clipping", False):
178
+ clipping = Clipping(threshold_db=kwargs.get("clipping_threshold", 0))
179
+ board.append(clipping)
180
+ if kwargs.get("compressor", False):
181
+ compressor = Compressor(
182
+ threshold_db=kwargs.get("compressor_threshold", 0),
183
+ ratio=kwargs.get("compressor_ratio", 1),
184
+ attack_ms=kwargs.get("compressor_attack", 1.0),
185
+ release_ms=kwargs.get("compressor_release", 100),
186
+ )
187
+ board.append(compressor)
188
+ if kwargs.get("delay", False):
189
+ delay = Delay(
190
+ delay_seconds=kwargs.get("delay_seconds", 0.5),
191
+ feedback=kwargs.get("delay_feedback", 0.0),
192
+ mix=kwargs.get("delay_mix", 0.5),
193
+ )
194
+ board.append(delay)
195
+ return board(audio_input, sample_rate)
196
+
197
+ def convert_audio(
198
+ self,
199
+ audio_input_path: str,
200
+ audio_output_path: str,
201
+ model_path: str,
202
+ index_path: str,
203
+ pitch: int = 0,
204
+ f0_file: str = None,
205
+ f0_method: str = "rmvpe",
206
+ index_rate: float = 0.75,
207
+ volume_envelope: float = 1,
208
+ protect: float = 0.5,
209
+ hop_length: int = 128,
210
+ split_audio: bool = False,
211
+ f0_autotune: bool = False,
212
+ f0_autotune_strength: float = 1,
213
+ filter_radius: int = 3,
214
+ embedder_model: str = "contentvec",
215
+ embedder_model_custom: str = None,
216
+ clean_audio: bool = False,
217
+ clean_strength: float = 0.5,
218
+ export_format: str = "WAV",
219
+ upscale_audio: bool = False,
220
+ post_process: bool = False,
221
+ resample_sr: int = 0,
222
+ sid: int = 0,
223
+ **kwargs,
224
+ ):
225
+ """
226
+ Performs voice conversion on the input audio.
227
+
228
+ Args:
229
+ pitch (int): Key for F0 up-sampling.
230
+ filter_radius (int): Radius for filtering.
231
+ index_rate (float): Rate for index matching.
232
+ volume_envelope (int): RMS mix rate.
233
+ protect (float): Protection rate for certain audio segments.
234
+ hop_length (int): Hop length for audio processing.
235
+ f0_method (str): Method for F0 extraction.
236
+ audio_input_path (str): Path to the input audio file.
237
+ audio_output_path (str): Path to the output audio file.
238
+ model_path (str): Path to the voice conversion model.
239
+ index_path (str): Path to the index file.
240
+ split_audio (bool): Whether to split the audio for processing.
241
+ f0_autotune (bool): Whether to use F0 autotune.
242
+ clean_audio (bool): Whether to clean the audio.
243
+ clean_strength (float): Strength of the audio cleaning.
244
+ export_format (str): Format for exporting the audio.
245
+ upscale_audio (bool): Whether to upscale the audio.
246
+ f0_file (str): Path to the F0 file.
247
+ embedder_model (str): Path to the embedder model.
248
+ embedder_model_custom (str): Path to the custom embedder model.
249
+ resample_sr (int, optional): Resample sampling rate. Default is 0.
250
+ sid (int, optional): Speaker ID. Default is 0.
251
+ **kwargs: Additional keyword arguments.
252
+ """
253
+ self.get_vc(model_path, sid)
254
+ try:
255
+ start_time = time.time()
256
+ print(f"Converting audio '{audio_input_path}'...")
257
+
258
+ audio = load_audio_infer(
259
+ audio_input_path,
260
+ 16000,
261
+ **kwargs,
262
+ )
263
+ audio_max = np.abs(audio).max() / 0.95
264
+
265
+ if audio_max > 1:
266
+ audio /= audio_max
267
+
268
+ if not self.hubert_model or embedder_model != self.last_embedder_model:
269
+ self.load_hubert(embedder_model, embedder_model_custom)
270
+ self.last_embedder_model = embedder_model
271
+
272
+ file_index = (
273
+ index_path.strip()
274
+ .strip('"')
275
+ .strip("\n")
276
+ .strip('"')
277
+ .strip()
278
+ .replace("trained", "added")
279
+ )
280
+
281
+ if self.tgt_sr != resample_sr >= 16000:
282
+ self.tgt_sr = resample_sr
283
+
284
+ if split_audio:
285
+ chunks, intervals = process_audio(audio, 16000)
286
+ print(f"Audio split into {len(chunks)} chunks for processing.")
287
+ else:
288
+ chunks = []
289
+ chunks.append(audio)
290
+
291
+ converted_chunks = []
292
+ for c in chunks:
293
+ audio_opt = self.vc.pipeline(
294
+ model=self.hubert_model,
295
+ net_g=self.net_g,
296
+ sid=sid,
297
+ audio=c,
298
+ pitch=pitch,
299
+ f0_method=f0_method,
300
+ file_index=file_index,
301
+ index_rate=index_rate,
302
+ pitch_guidance=self.use_f0,
303
+ filter_radius=filter_radius,
304
+ volume_envelope=volume_envelope,
305
+ version=self.version,
306
+ protect=protect,
307
+ hop_length=hop_length,
308
+ f0_autotune=f0_autotune,
309
+ f0_autotune_strength=f0_autotune_strength,
310
+ f0_file=f0_file,
311
+ )
312
+ converted_chunks.append(audio_opt)
313
+ if split_audio:
314
+ print(f"Converted audio chunk {len(converted_chunks)}")
315
+
316
+ if split_audio:
317
+ audio_opt = merge_audio(converted_chunks, intervals, 16000, self.tgt_sr)
318
+ else:
319
+ audio_opt = converted_chunks[0]
320
+
321
+ if clean_audio:
322
+ cleaned_audio = self.remove_audio_noise(
323
+ audio_opt, self.tgt_sr, clean_strength
324
+ )
325
+ if cleaned_audio is not None:
326
+ audio_opt = cleaned_audio
327
+
328
+ if post_process:
329
+ audio_opt = self.post_process_audio(
330
+ audio_input=audio_opt,
331
+ sample_rate=self.tgt_sr,
332
+ **kwargs,
333
+ )
334
+
335
+ sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
336
+ output_path_format = audio_output_path.replace(
337
+ ".wav", f".{export_format.lower()}"
338
+ )
339
+ audio_output_path = self.convert_audio_format(
340
+ audio_output_path, output_path_format, export_format
341
+ )
342
+
343
+ elapsed_time = time.time() - start_time
344
+ print(
345
+ f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
346
+ )
347
+ except Exception as error:
348
+ print(f"An error occurred during audio conversion: {error}")
349
+ print(traceback.format_exc())
350
+
351
+ def convert_audio_batch(
352
+ self,
353
+ audio_input_paths: str,
354
+ audio_output_path: str,
355
+ **kwargs,
356
+ ):
357
+ """
358
+ Performs voice conversion on a batch of input audio files.
359
+
360
+ Args:
361
+ audio_input_paths (str): List of paths to the input audio files.
362
+ audio_output_path (str): Path to the output audio file.
363
+ resample_sr (int, optional): Resample sampling rate. Default is 0.
364
+ sid (int, optional): Speaker ID. Default is 0.
365
+ **kwargs: Additional keyword arguments.
366
+ """
367
+ pid = os.getpid()
368
+ try:
369
+ with open(
370
+ os.path.join(now_dir, "assets", "infer_pid.txt"), "w"
371
+ ) as pid_file:
372
+ pid_file.write(str(pid))
373
+ start_time = time.time()
374
+ print(f"Converting audio batch '{audio_input_paths}'...")
375
+ audio_files = [
376
+ f
377
+ for f in os.listdir(audio_input_paths)
378
+ if f.endswith(
379
+ (
380
+ "wav",
381
+ "mp3",
382
+ "flac",
383
+ "ogg",
384
+ "opus",
385
+ "m4a",
386
+ "mp4",
387
+ "aac",
388
+ "alac",
389
+ "wma",
390
+ "aiff",
391
+ "webm",
392
+ "ac3",
393
+ )
394
+ )
395
+ ]
396
+ print(f"Detected {len(audio_files)} audio files for inference.")
397
+ for a in audio_files:
398
+ new_input = os.path.join(audio_input_paths, a)
399
+ new_output = os.path.splitext(a)[0] + "_output.wav"
400
+ new_output = os.path.join(audio_output_path, new_output)
401
+ if os.path.exists(new_output):
402
+ continue
403
+ self.convert_audio(
404
+ audio_input_path=new_input,
405
+ audio_output_path=new_output,
406
+ **kwargs,
407
+ )
408
+ print(f"Conversion completed at '{audio_input_paths}'.")
409
+ elapsed_time = time.time() - start_time
410
+ print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
411
+ except Exception as error:
412
+ print(f"An error occurred during audio batch conversion: {error}")
413
+ print(traceback.format_exc())
414
+ finally:
415
+ os.remove(os.path.join(now_dir, "assets", "infer_pid.txt"))
416
+
417
+ def get_vc(self, weight_root, sid):
418
+ """
419
+ Loads the voice conversion model and sets up the pipeline.
420
+
421
+ Args:
422
+ weight_root (str): Path to the model weights.
423
+ sid (int): Speaker ID.
424
+ """
425
+ if sid == "" or sid == []:
426
+ self.cleanup_model()
427
+ if torch.cuda.is_available():
428
+ torch.cuda.empty_cache()
429
+
430
+ if not self.loaded_model or self.loaded_model != weight_root:
431
+ self.load_model(weight_root)
432
+ if self.cpt is not None:
433
+ self.setup_network()
434
+ self.setup_vc_instance()
435
+ self.loaded_model = weight_root
436
+
437
+ def cleanup_model(self):
438
+ """
439
+ Cleans up the model and releases resources.
440
+ """
441
+ if self.hubert_model is not None:
442
+ del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
443
+ self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
444
+ if torch.cuda.is_available():
445
+ torch.cuda.empty_cache()
446
+
447
+ del self.net_g, self.cpt
448
+ if torch.cuda.is_available():
449
+ torch.cuda.empty_cache()
450
+ self.cpt = None
451
+
452
+ def load_model(self, weight_root):
453
+ """
454
+ Loads the model weights from the specified path.
455
+
456
+ Args:
457
+ weight_root (str): Path to the model weights.
458
+ """
459
+ self.cpt = (
460
+ torch.load(weight_root, map_location="cpu")
461
+ if os.path.isfile(weight_root)
462
+ else None
463
+ )
464
+
465
+ def setup_network(self):
466
+ """
467
+ Sets up the network configuration based on the loaded checkpoint.
468
+ """
469
+ if self.cpt is not None:
470
+ self.tgt_sr = self.cpt["config"][-1]
471
+ self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
472
+ self.use_f0 = self.cpt.get("f0", 1)
473
+
474
+ self.version = self.cpt.get("version", "v1")
475
+ self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
476
+ self.net_g = Synthesizer(
477
+ *self.cpt["config"],
478
+ use_f0=self.use_f0,
479
+ text_enc_hidden_dim=self.text_enc_hidden_dim,
480
+ is_half=self.config.is_half,
481
+ )
482
+ del self.net_g.enc_q
483
+ self.net_g.load_state_dict(self.cpt["weight"], strict=False)
484
+ self.net_g.eval().to(self.config.device)
485
+ self.net_g = (
486
+ self.net_g.half() if self.config.is_half else self.net_g.float()
487
+ )
488
+
489
+ def setup_vc_instance(self):
490
+ """
491
+ Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
492
+ """
493
+ if self.cpt is not None:
494
+ self.vc = VC(self.tgt_sr, self.config)
495
+ self.n_spk = self.cpt["config"][-3]
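A minimal usage sketch for the converter above (the module path and all file paths are placeholders; remaining options keep the defaults visible in convert_audio):
from rvc_cli.rvc.infer.infer import VoiceConverter

converter = VoiceConverter()
converter.convert_audio(
    audio_input_path="vocals.wav",
    audio_output_path="vocals_converted.wav",
    model_path="model.pth",
    index_path="model.index",
    pitch=0,                  # semitone shift applied to the extracted F0
    f0_method="rmvpe",
    export_format="WAV",
)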
modules.py ADDED
@@ -0,0 +1,124 @@
1
+ import torch
2
+ from rvc_cli.rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply
3
+
4
+
5
+ class WaveNet(torch.nn.Module):
6
+ """WaveNet residual blocks as used in WaveGlow.
7
+
8
+ Args:
9
+ hidden_channels (int): Number of hidden channels.
10
+ kernel_size (int): Size of the convolutional kernel.
11
+ dilation_rate (int): Dilation rate of the convolution.
12
+ n_layers (int): Number of convolutional layers.
13
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
14
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
15
+ """
16
+
17
+ def __init__(
18
+ self,
19
+ hidden_channels,
20
+ kernel_size,
21
+ dilation_rate,
22
+ n_layers,
23
+ gin_channels=0,
24
+ p_dropout=0,
25
+ ):
26
+ super().__init__()
27
+ assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding."
28
+
29
+ self.hidden_channels = hidden_channels
30
+ self.kernel_size = kernel_size
31
+ self.dilation_rate = dilation_rate
32
+ self.n_layers = n_layers
33
+ self.gin_channels = gin_channels
34
+ self.p_dropout = p_dropout
35
+ self.n_channels_tensor = torch.IntTensor([hidden_channels]) # Static tensor
36
+
37
+ self.in_layers = torch.nn.ModuleList()
38
+ self.res_skip_layers = torch.nn.ModuleList()
39
+ self.drop = torch.nn.Dropout(p_dropout)
40
+
41
+ # Conditional layer for global conditioning
42
+ if gin_channels:
43
+ self.cond_layer = torch.nn.utils.parametrizations.weight_norm(
44
+ torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
45
+ name="weight",
46
+ )
47
+
48
+ # Precompute dilations and paddings
49
+ dilations = [dilation_rate**i for i in range(n_layers)]
50
+ paddings = [(kernel_size * d - d) // 2 for d in dilations]
51
+
52
+ # Initialize layers
53
+ for i in range(n_layers):
54
+ self.in_layers.append(
55
+ torch.nn.utils.parametrizations.weight_norm(
56
+ torch.nn.Conv1d(
57
+ hidden_channels,
58
+ 2 * hidden_channels,
59
+ kernel_size,
60
+ dilation=dilations[i],
61
+ padding=paddings[i],
62
+ ),
63
+ name="weight",
64
+ )
65
+ )
66
+
67
+ res_skip_channels = (
68
+ hidden_channels if i == n_layers - 1 else 2 * hidden_channels
69
+ )
70
+ self.res_skip_layers.append(
71
+ torch.nn.utils.parametrizations.weight_norm(
72
+ torch.nn.Conv1d(hidden_channels, res_skip_channels, 1),
73
+ name="weight",
74
+ )
75
+ )
76
+
77
+ def forward(self, x, x_mask, g=None):
78
+ """Forward pass.
79
+
80
+ Args:
81
+ x (torch.Tensor): Input tensor (batch_size, hidden_channels, time_steps).
82
+ x_mask (torch.Tensor): Mask tensor (batch_size, 1, time_steps).
83
+ g (torch.Tensor, optional): Conditioning tensor (batch_size, gin_channels, time_steps).
84
+ """
85
+ output = x.clone().zero_()
86
+
87
+ # Apply conditional layer if global conditioning is provided
88
+ g = self.cond_layer(g) if g is not None else None
89
+
90
+ for i in range(self.n_layers):
91
+ x_in = self.in_layers[i](x)
92
+ g_l = (
93
+ g[
94
+ :,
95
+ i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels,
96
+ :,
97
+ ]
98
+ if g is not None
99
+ else 0
100
+ )
101
+
102
+ # Activation with fused Tanh-Sigmoid
103
+ acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor)
104
+ acts = self.drop(acts)
105
+
106
+ # Residual and skip connections
107
+ res_skip_acts = self.res_skip_layers[i](acts)
108
+ if i < self.n_layers - 1:
109
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
110
+ x = (x + res_acts) * x_mask
111
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
112
+ else:
113
+ output = output + res_skip_acts
114
+
115
+ return output * x_mask
116
+
117
+ def remove_weight_norm(self):
118
+ """Remove weight normalization from the module."""
119
+ if self.gin_channels:
120
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
121
+ for layer in self.in_layers:
122
+ torch.nn.utils.remove_weight_norm(layer)
123
+ for layer in self.res_skip_layers:
124
+ torch.nn.utils.remove_weight_norm(layer)
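A shape check for the WaveNet block (channel counts and depth are illustrative):
import torch
from rvc_cli.rvc.lib.algorithm.modules import WaveNet

wn = WaveNet(hidden_channels=192, kernel_size=5, dilation_rate=1, n_layers=16, gin_channels=256)
x = torch.randn(1, 192, 100)      # (batch, hidden_channels, frames)
x_mask = torch.ones(1, 1, 100)    # all frames valid
g = torch.randn(1, 256, 1)        # global (speaker) conditioning, broadcast over time
out = wn(x, x_mask, g=g)
print(out.shape)                  # (1, 192, 100)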
normalization.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch
2
+
3
+
4
+ class LayerNorm(torch.nn.Module):
5
+ """Layer normalization module.
6
+
7
+ Args:
8
+ channels (int): Number of channels.
9
+ eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5.
10
+ """
11
+
12
+ def __init__(self, channels, eps=1e-5):
13
+ super().__init__()
14
+ self.eps = eps
15
+ self.gamma = torch.nn.Parameter(torch.ones(channels))
16
+ self.beta = torch.nn.Parameter(torch.zeros(channels))
17
+
18
+ def forward(self, x):
19
+ """Forward pass.
20
+
21
+ Args:
22
+ x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
23
+
24
+ """
25
+ # Transpose to (batch_size, time_steps, channels) for layer_norm
26
+ x = x.transpose(1, -1)
27
+ x = torch.nn.functional.layer_norm(
28
+ x, (x.size(-1),), self.gamma, self.beta, self.eps
29
+ )
30
+ # Transpose back to (batch_size, channels, time_steps)
31
+ return x.transpose(1, -1)
nsf.py ADDED
@@ -0,0 +1,196 @@
1
+ import math
2
+ import torch
3
+ from torch.nn.utils import remove_weight_norm
4
+ from torch.nn.utils.parametrizations import weight_norm
5
+ from typing import Optional
6
+
7
+ from rvc_cli.rvc.lib.algorithm.generators import SineGenerator
8
+ from rvc_cli.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2
9
+ from rvc_cli.rvc.lib.algorithm.commons import init_weights
10
+
11
+
12
+ class SourceModuleHnNSF(torch.nn.Module):
13
+ """
14
+ Source Module for harmonic-plus-noise excitation.
15
+
16
+ Args:
17
+ sample_rate (int): Sampling rate in Hz.
18
+ harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0.
19
+ sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1.
20
+ add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003.
21
+ voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0.
22
+ is_half (bool, optional): Whether to use half precision. Defaults to True.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ sample_rate,
28
+ harmonic_num=0,
29
+ sine_amp=0.1,
30
+ add_noise_std=0.003,
31
+ voiced_threshod=0,
32
+ is_half=True,
33
+ ):
34
+ super(SourceModuleHnNSF, self).__init__()
35
+
36
+ self.sine_amp = sine_amp
37
+ self.noise_std = add_noise_std
38
+ self.is_half = is_half
39
+
40
+ self.l_sin_gen = SineGenerator(
41
+ sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
42
+ )
43
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
44
+ self.l_tanh = torch.nn.Tanh()
45
+
46
+ def forward(self, x: torch.Tensor, upsample_factor: int = 1):
47
+ sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor)
48
+ sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
49
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
50
+ return sine_merge, None, None
51
+
52
+
53
+ class GeneratorNSF(torch.nn.Module):
54
+ """
55
+ Generator for synthesizing audio using the NSF (Neural Source Filter) approach.
56
+
57
+ Args:
58
+ initial_channel (int): Number of channels in the initial convolutional layer.
59
+ resblock (str): Type of residual block to use (1 or 2).
60
+ resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
61
+ resblock_dilation_sizes (list): Dilation rates of the residual blocks.
62
+ upsample_rates (list): Upsampling rates.
63
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
64
+ upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
65
+ gin_channels (int): Number of channels for the global conditioning input.
66
+ sr (int): Sampling rate.
67
+ is_half (bool, optional): Whether to use half precision. Defaults to False.
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ initial_channel,
73
+ resblock,
74
+ resblock_kernel_sizes,
75
+ resblock_dilation_sizes,
76
+ upsample_rates,
77
+ upsample_initial_channel,
78
+ upsample_kernel_sizes,
79
+ gin_channels,
80
+ sr,
81
+ is_half=False,
82
+ ):
83
+ super(GeneratorNSF, self).__init__()
84
+
85
+ self.num_kernels = len(resblock_kernel_sizes)
86
+ self.num_upsamples = len(upsample_rates)
87
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
88
+ self.m_source = SourceModuleHnNSF(
89
+ sample_rate=sr, harmonic_num=0, is_half=is_half
90
+ )
91
+
92
+ self.conv_pre = torch.nn.Conv1d(
93
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
94
+ )
95
+ resblock_cls = ResBlock1 if resblock == "1" else ResBlock2
96
+
97
+ self.ups = torch.nn.ModuleList()
98
+ self.noise_convs = torch.nn.ModuleList()
99
+
100
+ channels = [
101
+ upsample_initial_channel // (2 ** (i + 1))
102
+ for i in range(len(upsample_rates))
103
+ ]
104
+ stride_f0s = [
105
+ math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
106
+ for i in range(len(upsample_rates))
107
+ ]
108
+
109
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
110
+ self.ups.append(
111
+ weight_norm(
112
+ torch.nn.ConvTranspose1d(
113
+ upsample_initial_channel // (2**i),
114
+ channels[i],
115
+ k,
116
+ u,
117
+ padding=(k - u) // 2,
118
+ )
119
+ )
120
+ )
121
+
122
+ self.noise_convs.append(
123
+ torch.nn.Conv1d(
124
+ 1,
125
+ channels[i],
126
+ kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1),
127
+ stride=stride_f0s[i],
128
+ padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0),
129
+ )
130
+ )
131
+
132
+ self.resblocks = torch.nn.ModuleList(
133
+ [
134
+ resblock_cls(channels[i], k, d)
135
+ for i in range(len(self.ups))
136
+ for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)
137
+ ]
138
+ )
139
+
140
+ self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
141
+ self.ups.apply(init_weights)
142
+
143
+ if gin_channels != 0:
144
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
145
+
146
+ self.upp = math.prod(upsample_rates)
147
+ self.lrelu_slope = LRELU_SLOPE
148
+
149
+ def forward(self, x, f0, g: Optional[torch.Tensor] = None):
150
+ har_source, _, _ = self.m_source(f0, self.upp)
151
+ har_source = har_source.transpose(1, 2)
152
+ x = self.conv_pre(x)
153
+
154
+ if g is not None:
155
+ x = x + self.cond(g)
156
+
157
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
158
+ x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
159
+ x = ups(x)
160
+ x = x + noise_convs(har_source)
161
+
162
+ xs = sum(
163
+ [
164
+ resblock(x)
165
+ for j, resblock in enumerate(self.resblocks)
166
+ if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)
167
+ ]
168
+ )
169
+ x = xs / self.num_kernels
170
+
171
+ x = torch.nn.functional.leaky_relu(x)
172
+ x = torch.tanh(self.conv_post(x))
173
+ return x
174
+
175
+ def remove_weight_norm(self):
176
+ for l in self.ups:
177
+ remove_weight_norm(l)
178
+ for l in self.resblocks:
179
+ l.remove_weight_norm()
180
+
181
+ def __prepare_scriptable__(self):
182
+ for l in self.ups:
183
+ for hook in l._forward_pre_hooks.values():
184
+ if (
185
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
186
+ and hook.__class__.__name__ == "WeightNorm"
187
+ ):
188
+ remove_weight_norm(l)
189
+ for l in self.resblocks:
190
+ for hook in l._forward_pre_hooks.values():
191
+ if (
192
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
193
+ and hook.__class__.__name__ == "WeightNorm"
194
+ ):
195
+ remove_weight_norm(l)
196
+ return self
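
To make the upsampling bookkeeping in GeneratorNSF concrete, here is the channels / stride_f0s arithmetic for a hypothetical upsample_rates=[10, 10, 2, 2] and upsample_initial_channel=512 (illustrative values, not taken from this repository's configs):

import math

upsample_rates = [10, 10, 2, 2]
upsample_initial_channel = 512

channels = [
    upsample_initial_channel // (2 ** (i + 1)) for i in range(len(upsample_rates))
]
stride_f0s = [
    math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
    for i in range(len(upsample_rates))
]

print(channels)                   # [256, 128, 64, 32]: feature width halves per stage
print(stride_f0s)                 # [40, 4, 2, 1]: stride used to bring the sample-rate
                                  # harmonic source down to each stage's frame rate
print(math.prod(upsample_rates))  # 400: total upsampling factor stored as self.upp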
pipeline.py ADDED
@@ -0,0 +1,708 @@
import os
import gc
import re
import sys
import torch
import torch.nn.functional as F
import torchcrepe
import faiss
import librosa
import numpy as np
from scipy import signal
from torch import Tensor

now_dir = os.getcwd()
sys.path.append(now_dir)

from rvc_cli.rvc.lib.predictors.RMVPE import RMVPE0Predictor
from rvc_cli.rvc.lib.predictors.FCPE import FCPEF0Predictor

import logging

logging.getLogger("faiss").setLevel(logging.WARNING)

# Constants for high-pass filter
FILTER_ORDER = 5
CUTOFF_FREQUENCY = 48  # Hz
SAMPLE_RATE = 16000  # Hz
bh, ah = signal.butter(
    N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
)

input_audio_path2wav = {}


class AudioProcessor:
    """
    A class for processing audio signals, specifically for adjusting RMS levels.
    """

    def change_rms(
        source_audio: np.ndarray,
        source_rate: int,
        target_audio: np.ndarray,
        target_rate: int,
        rate: float,
    ) -> np.ndarray:
        """
        Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.

        Args:
            source_audio: The source audio signal as a NumPy array.
            source_rate: The sampling rate of the source audio.
            target_audio: The target audio signal to adjust.
            target_rate: The sampling rate of the target audio.
            rate: The blending rate between the source and target RMS levels.
        """
        # Calculate RMS of both audio data
        rms1 = librosa.feature.rms(
            y=source_audio,
            frame_length=source_rate // 2 * 2,
            hop_length=source_rate // 2,
        )
        rms2 = librosa.feature.rms(
            y=target_audio,
            frame_length=target_rate // 2 * 2,
            hop_length=target_rate // 2,
        )

        # Interpolate RMS to match target audio length
        rms1 = F.interpolate(
            torch.from_numpy(rms1).float().unsqueeze(0),
            size=target_audio.shape[0],
            mode="linear",
        ).squeeze()
        rms2 = F.interpolate(
            torch.from_numpy(rms2).float().unsqueeze(0),
            size=target_audio.shape[0],
            mode="linear",
        ).squeeze()
        rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)

        # Adjust target audio RMS based on the source audio RMS
        adjusted_audio = (
            target_audio
            * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
        )
        return adjusted_audio


class Autotune:
    """
    A class for applying autotune to a given fundamental frequency (F0) contour.
    """

    def __init__(self, ref_freqs):
        """
        Initializes the Autotune class with a set of reference frequencies.

        Args:
            ref_freqs: A list of reference frequencies representing musical notes.
        """
        self.ref_freqs = ref_freqs
        self.note_dict = self.ref_freqs  # No interpolation needed

    def autotune_f0(self, f0, f0_autotune_strength):
        """
        Autotunes a given F0 contour by snapping each frequency to the closest reference frequency.

        Args:
            f0: The input F0 contour as a NumPy array.
        """
        autotuned_f0 = np.zeros_like(f0)
        for i, freq in enumerate(f0):
            closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
            autotuned_f0[i] = freq + (closest_note - freq) * f0_autotune_strength
        return autotuned_f0


class Pipeline:
    """
    The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
    voice conversion using a model, and post-processing.
    """

    def __init__(self, tgt_sr, config):
        """
        Initializes the Pipeline class with target sampling rate and configuration parameters.

        Args:
            tgt_sr: The target sampling rate for the output audio.
            config: A configuration object containing various parameters for the pipeline.
        """
        self.x_pad = config.x_pad
        self.x_query = config.x_query
        self.x_center = config.x_center
        self.x_max = config.x_max
        self.is_half = config.is_half
        self.sample_rate = 16000
        self.window = 160
        self.t_pad = self.sample_rate * self.x_pad
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sample_rate * self.x_query
        self.t_center = self.sample_rate * self.x_center
        self.t_max = self.sample_rate * self.x_max
        self.time_step = self.window / self.sample_rate * 1000
        self.f0_min = 50
        self.f0_max = 1100
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.device = config.device
        self.ref_freqs = [
            49.00,  # G1
            51.91,  # G#1 / Ab1
            55.00,  # A1
            58.27,  # A#1 / Bb1
            61.74,  # B1
            65.41,  # C2
            69.30,  # C#2 / Db2
            73.42,  # D2
            77.78,  # D#2 / Eb2
            82.41,  # E2
            87.31,  # F2
            92.50,  # F#2 / Gb2
            98.00,  # G2
            103.83,  # G#2 / Ab2
            110.00,  # A2
            116.54,  # A#2 / Bb2
            123.47,  # B2
            130.81,  # C3
            138.59,  # C#3 / Db3
            146.83,  # D3
            155.56,  # D#3 / Eb3
            164.81,  # E3
            174.61,  # F3
            185.00,  # F#3 / Gb3
            196.00,  # G3
            207.65,  # G#3 / Ab3
            220.00,  # A3
            233.08,  # A#3 / Bb3
            246.94,  # B3
            261.63,  # C4
            277.18,  # C#4 / Db4
            293.66,  # D4
            311.13,  # D#4 / Eb4
            329.63,  # E4
            349.23,  # F4
            369.99,  # F#4 / Gb4
            392.00,  # G4
            415.30,  # G#4 / Ab4
            440.00,  # A4
            466.16,  # A#4 / Bb4
            493.88,  # B4
            523.25,  # C5
            554.37,  # C#5 / Db5
            587.33,  # D5
            622.25,  # D#5 / Eb5
            659.25,  # E5
            698.46,  # F5
            739.99,  # F#5 / Gb5
            783.99,  # G5
            830.61,  # G#5 / Ab5
            880.00,  # A5
            932.33,  # A#5 / Bb5
            987.77,  # B5
            1046.50,  # C6
        ]
        self.autotune = Autotune(self.ref_freqs)
        self.note_dict = self.autotune.note_dict
        self.model_rmvpe = RMVPE0Predictor(
            os.path.join("rvc_cli", "rvc", "models", "predictors", "rmvpe.pt"),
            is_half=self.is_half,
            device=self.device,
        )

    def get_f0_crepe(
        self,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length,
        model="full",
    ):
        """
        Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model.

        Args:
            x: The input audio signal as a NumPy array.
            f0_min: Minimum F0 value to consider.
            f0_max: Maximum F0 value to consider.
            p_len: Desired length of the F0 output.
            hop_length: Hop length for the Crepe model.
            model: Crepe model size to use ("full" or "tiny").
        """
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)
        audio = torch.from_numpy(x).to(self.device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        pitch: Tensor = torchcrepe.predict(
            audio,
            self.sample_rate,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=hop_length * 2,
            device=self.device,
            pad=True,
        )
        p_len = p_len or x.shape[0] // hop_length
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        f0 = np.nan_to_num(target)
        return f0

    def get_f0_hybrid(
        self,
        methods_str,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length,
    ):
        """
        Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods.

        Args:
            methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]").
            x: The input audio signal as a NumPy array.
            f0_min: Minimum F0 value to consider.
            f0_max: Maximum F0 value to consider.
            p_len: Desired length of the F0 output.
            hop_length: Hop length for F0 estimation methods.
        """
        methods_str = re.search("hybrid\[(.+)\]", methods_str)
        if methods_str:
            methods = [method.strip() for method in methods_str.group(1).split("+")]
        f0_computation_stack = []
        print(f"Calculating f0 pitch estimations for methods: {', '.join(methods)}")
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)
        for method in methods:
            f0 = None
            if method == "crepe":
                f0 = self.get_f0_crepe_computation(
                    x, f0_min, f0_max, p_len, int(hop_length)
                )
            elif method == "rmvpe":
                f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
                f0 = f0[1:]
            elif method == "fcpe":
                self.model_fcpe = FCPEF0Predictor(
                    os.path.join("rvc_cli", "rvc", "models", "predictors", "fcpe.pt"),
                    f0_min=int(f0_min),
                    f0_max=int(f0_max),
                    dtype=torch.float32,
                    device=self.device,
                    sample_rate=self.sample_rate,
                    threshold=0.03,
                )
                f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
                del self.model_fcpe
                gc.collect()
            f0_computation_stack.append(f0)

        f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
        f0_median_hybrid = None
        if len(f0_computation_stack) == 1:
            f0_median_hybrid = f0_computation_stack[0]
        else:
            f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
        return f0_median_hybrid

    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        pitch,
        f0_method,
        filter_radius,
        hop_length,
        f0_autotune,
        f0_autotune_strength,
        inp_f0=None,
    ):
        """
        Estimates the fundamental frequency (F0) of a given audio signal using various methods.

        Args:
            input_audio_path: Path to the input audio file.
            x: The input audio signal as a NumPy array.
            p_len: Desired length of the F0 output.
            pitch: Key to adjust the pitch of the F0 contour.
            f0_method: Method to use for F0 estimation (e.g., "crepe").
            filter_radius: Radius for median filtering the F0 contour.
            hop_length: Hop length for F0 estimation methods.
            f0_autotune: Whether to apply autotune to the F0 contour.
            inp_f0: Optional input F0 contour to use instead of estimating.
        """
        global input_audio_path2wav
        if f0_method == "crepe":
            f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
        elif f0_method == "crepe-tiny":
            f0 = self.get_f0_crepe(
                x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny"
            )
        elif f0_method == "rmvpe":
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        elif f0_method == "fcpe":
            self.model_fcpe = FCPEF0Predictor(
                os.path.join("rvc_cli", "rvc", "models", "predictors", "fcpe.pt"),
                f0_min=int(self.f0_min),
                f0_max=int(self.f0_max),
                dtype=torch.float32,
                device=self.device,
                sample_rate=self.sample_rate,
                threshold=0.03,
            )
            f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
            del self.model_fcpe
            gc.collect()
        elif "hybrid" in f0_method:
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = self.get_f0_hybrid(
                f0_method,
                x,
                self.f0_min,
                self.f0_max,
                p_len,
                hop_length,
            )

        if f0_autotune is True:
            f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength)

        f0 *= pow(2, pitch / 12)
        tf0 = self.sample_rate // self.window
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
            self.f0_mel_max - self.f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(int)

        return f0_coarse, f0bak

    def voice_conversion(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        """
        Performs voice conversion on a given audio segment.

        Args:
            model: The feature extractor model.
            net_g: The generative model for synthesizing speech.
            sid: Speaker ID for the target voice.
            audio0: The input audio segment.
            pitch: Quantized F0 contour for pitch guidance.
            pitchf: Original F0 contour for pitch guidance.
            index: FAISS index for speaker embedding retrieval.
            big_npy: Speaker embeddings stored in a NumPy array.
            index_rate: Blending rate for speaker embedding retrieval.
            version: Model version ("v1" or "v2").
            protect: Protection level for preserving the original pitch.
        """
        with torch.no_grad():
            pitch_guidance = pitch != None and pitchf != None
            # prepare source audio
            feats = (
                torch.from_numpy(audio0).half()
                if self.is_half
                else torch.from_numpy(audio0).float()
            )
            feats = feats.mean(-1) if feats.dim() == 2 else feats
            assert feats.dim() == 1, feats.dim()
            feats = feats.view(1, -1).to(self.device)
            # extract features
            feats = model(feats)["last_hidden_state"]
            feats = (
                model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
            )
            # make a copy for pitch guidance and protection
            feats0 = feats.clone() if pitch_guidance else None
            if (
                index
            ):  # set by parent function, only true if index is available, loaded, and index rate > 0
                feats = self._retrieve_speaker_embeddings(
                    feats, index, big_npy, index_rate
                )
            # feature upsampling
            feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
            # adjust the length if the audio is short
            p_len = min(audio0.shape[0] // self.window, feats.shape[1])
            if pitch_guidance:
                feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                    0, 2, 1
                )
                pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
                # Pitch protection blending
                if protect < 0.5:
                    pitchff = pitchf.clone()
                    pitchff[pitchf > 0] = 1
                    pitchff[pitchf < 1] = protect
                    feats = feats * pitchff.unsqueeze(-1) + feats0 * (
                        1 - pitchff.unsqueeze(-1)
                    )
                    feats = feats.to(feats0.dtype)
            else:
                pitch, pitchf = None, None
            p_len = torch.tensor([p_len], device=self.device).long()
            audio1 = (
                (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                .data.cpu()
                .float()
                .numpy()
            )
            # clean up
            del feats, feats0, p_len
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return audio1

    def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
        npy = feats[0].cpu().numpy()
        npy = npy.astype("float32") if self.is_half else npy
        score, ix = index.search(npy, k=8)
        weight = np.square(1 / score)
        weight /= weight.sum(axis=1, keepdims=True)
        npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
        npy = npy.astype("float16") if self.is_half else npy
        feats = (
            torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
            + (1 - index_rate) * feats
        )
        return feats

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        pitch,
        f0_method,
        file_index,
        index_rate,
        pitch_guidance,
        filter_radius,
        volume_envelope,
        version,
        protect,
        hop_length,
        f0_autotune,
        f0_autotune_strength,
        f0_file,
    ):
        """
        The main pipeline function for performing voice conversion.

        Args:
            model: The feature extractor model.
            net_g: The generative model for synthesizing speech.
            sid: Speaker ID for the target voice.
            audio: The input audio signal.
            input_audio_path: Path to the input audio file.
            pitch: Key to adjust the pitch of the F0 contour.
            f0_method: Method to use for F0 estimation.
            file_index: Path to the FAISS index file for speaker embedding retrieval.
            index_rate: Blending rate for speaker embedding retrieval.
            pitch_guidance: Whether to use pitch guidance during voice conversion.
            filter_radius: Radius for median filtering the F0 contour.
            tgt_sr: Target sampling rate for the output audio.
            resample_sr: Resampling rate for the output audio.
            volume_envelope: Blending rate for adjusting the RMS level of the output audio.
            version: Model version.
            protect: Protection level for preserving the original pitch.
            hop_length: Hop length for F0 estimation methods.
            f0_autotune: Whether to apply autotune to the F0 contour.
            f0_file: Path to a file containing an F0 contour to use.
        """
        if file_index != "" and os.path.exists(file_index) and index_rate > 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception as error:
                print(f"An error occurred reading the FAISS index: {error}")
                index = big_npy = None
        else:
            index = big_npy = None
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )
        s = 0
        audio_opt = []
        t = None
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except Exception as error:
                print(f"An error occurred reading the F0 file: {error}")
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        if pitch_guidance:
            pitch, pitchf = self.get_f0(
                "input_audio_path",  # questionable purpose of making a key for an array
                audio_pad,
                p_len,
                pitch,
                f0_method,
                filter_radius,
                hop_length,
                f0_autotune,
                f0_autotune_strength,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        for t in opt_ts:
            t = t // self.window * self.window
            if pitch_guidance:
                audio_opt.append(
                    self.voice_conversion(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.voice_conversion(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if pitch_guidance:
            audio_opt.append(
                self.voice_conversion(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.voice_conversion(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        if volume_envelope != 1:
            audio_opt = AudioProcessor.change_rms(
                audio, self.sample_rate, audio_opt, self.sample_rate, volume_envelope
            )
        # if resample_sr >= self.sample_rate and tgt_sr != resample_sr:
        #     audio_opt = librosa.resample(
        #         audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
        #     )
        # audio_max = np.abs(audio_opt).max() / 0.99
        # max_int16 = 32768
        # if audio_max > 1:
        #     max_int16 /= audio_max
        # audio_opt = (audio_opt * 32768).astype(np.int16)
        audio_max = np.abs(audio_opt).max() / 0.99
        if audio_max > 1:
            audio_opt /= audio_max
        if pitch_guidance:
            del pitch, pitchf
        del sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
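
The tail of get_f0 maps F0 in Hz onto 255 mel-spaced coarse bins; a standalone sketch of that mapping using the same constants as the class above:

import numpy as np

f0_min, f0_max = 50, 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 55.0, 220.0, 440.0, 1046.5])  # Hz; 0.0 marks unvoiced frames
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
    f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)
print(f0_coarse)  # unvoiced frames stay at bin 1; voiced frames land in 2..255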
residuals.py ADDED
@@ -0,0 +1,250 @@
from typing import Optional
import torch
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm

from rvc_cli.rvc.lib.algorithm.modules import WaveNet
from rvc_cli.rvc.lib.algorithm.commons import get_padding, init_weights

LRELU_SLOPE = 0.1


def create_conv1d_layer(channels, kernel_size, dilation):
    return weight_norm(
        torch.nn.Conv1d(
            channels,
            channels,
            kernel_size,
            1,
            dilation=dilation,
            padding=get_padding(kernel_size, dilation),
        )
    )


def apply_mask(tensor, mask):
    return tensor * mask if mask is not None else tensor


class ResBlockBase(torch.nn.Module):
    def __init__(self, channels, kernel_size, dilations):
        super(ResBlockBase, self).__init__()
        self.convs1 = torch.nn.ModuleList(
            [create_conv1d_layer(channels, kernel_size, d) for d in dilations]
        )
        self.convs1.apply(init_weights)

        self.convs2 = torch.nn.ModuleList(
            [create_conv1d_layer(channels, kernel_size, 1) for _ in dilations]
        )
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
            xt = apply_mask(xt, x_mask)
            xt = torch.nn.functional.leaky_relu(c1(xt), LRELU_SLOPE)
            xt = apply_mask(xt, x_mask)
            xt = c2(xt)
            x = xt + x
        return apply_mask(x, x_mask)

    def remove_weight_norm(self):
        for conv in self.convs1 + self.convs2:
            remove_weight_norm(conv)


class ResBlock1(ResBlockBase):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__(channels, kernel_size, dilation)


class ResBlock2(ResBlockBase):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__(channels, kernel_size, dilation)


class Flip(torch.nn.Module):
    """Flip module for flow-based models.

    This module flips the input along the time dimension.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ResidualCouplingBlock(torch.nn.Module):
    """Residual Coupling Block for normalizing flow.

    Args:
        channels (int): Number of channels in the input.
        hidden_channels (int): Number of hidden channels in the coupling layer.
        kernel_size (int): Kernel size of the convolutional layers.
        dilation_rate (int): Dilation rate of the convolutional layers.
        n_layers (int): Number of layers in the coupling layer.
        n_flows (int, optional): Number of coupling layers in the block. Defaults to 4.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super(ResidualCouplingBlock, self).__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = torch.nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            self.flows.append(Flip())

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        g: Optional[torch.Tensor] = None,
        reverse: bool = False,
    ):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow.forward(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Removes weight normalization from the coupling layers."""
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()

    def __prepare_scriptable__(self):
        """Prepares the module for scripting."""
        for i in range(self.n_flows):
            for hook in self.flows[i * 2]._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(self.flows[i * 2])

        return self


class ResidualCouplingLayer(torch.nn.Module):
    """Residual coupling layer for flow-based models.

    Args:
        channels (int): Number of channels.
        hidden_channels (int): Number of hidden channels.
        kernel_size (int): Size of the convolutional kernel.
        dilation_rate (int): Dilation rate of the convolution.
        n_layers (int): Number of convolutional layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.
        gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
        mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        self.post = torch.nn.Conv1d(
            hidden_channels, self.half_channels * (2 - mean_only), 1
        )
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
            x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps).
            g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps).
                Defaults to None.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x

    def remove_weight_norm(self):
        """Remove weight normalization from the module."""
        self.enc.remove_weight_norm()
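
A small round-trip sketch for the ResidualCouplingLayer above; with mean_only=True the forward and reverse passes should invert each other (sizes are illustrative):

import torch

from rvc_cli.rvc.lib.algorithm.residuals import ResidualCouplingLayer

layer = ResidualCouplingLayer(192, 192, 5, 1, 3, mean_only=True)
x = torch.randn(1, 192, 40)
x_mask = torch.ones(1, 1, 40)

y, logdet = layer(x, x_mask)                # forward returns (output, log-determinant)
x_rec = layer(y, x_mask, reverse=True)      # reverse returns only the output
print(torch.allclose(x, x_rec, atol=1e-5))  # expected: True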
synthesizers.py ADDED
@@ -0,0 +1,237 @@
import torch
from typing import Optional

from rvc_cli.rvc.lib.algorithm.nsf import GeneratorNSF
from rvc_cli.rvc.lib.algorithm.generators import Generator
from rvc_cli.rvc.lib.algorithm.commons import slice_segments, rand_slice_segments
from rvc_cli.rvc.lib.algorithm.residuals import ResidualCouplingBlock
from rvc_cli.rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder


class Synthesizer(torch.nn.Module):
    """
    Base Synthesizer model.

    Args:
        spec_channels (int): Number of channels in the spectrogram.
        segment_size (int): Size of the audio segment.
        inter_channels (int): Number of channels in the intermediate layers.
        hidden_channels (int): Number of channels in the hidden layers.
        filter_channels (int): Number of channels in the filter layers.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of layers in the encoder.
        kernel_size (int): Size of the convolution kernel.
        p_dropout (float): Dropout probability.
        resblock (str): Type of residual block.
        resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
        resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
        upsample_rates (list): Upsampling rates for the decoder.
        upsample_initial_channel (int): Number of channels in the initial upsampling layer.
        upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
        spk_embed_dim (int): Dimension of the speaker embedding.
        gin_channels (int): Number of channels in the global conditioning vector.
        sr (int): Sampling rate of the audio.
        use_f0 (bool): Whether to use F0 information.
        text_enc_hidden_dim (int): Hidden dimension for the text encoder.
        kwargs: Additional keyword arguments.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        use_f0,
        text_enc_hidden_dim=768,
        **kwargs
    ):
        super(Synthesizer, self).__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = float(p_dropout)
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.use_f0 = use_f0

        self.enc_p = TextEncoder(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
            text_enc_hidden_dim,
            f0=use_f0,
        )

        if use_f0:
            self.dec = GeneratorNSF(
                inter_channels,
                resblock,
                resblock_kernel_sizes,
                resblock_dilation_sizes,
                upsample_rates,
                upsample_initial_channel,
                upsample_kernel_sizes,
                gin_channels=gin_channels,
                sr=sr,
                is_half=kwargs["is_half"],
            )
        else:
            self.dec = Generator(
                inter_channels,
                resblock,
                resblock_kernel_sizes,
                resblock_dilation_sizes,
                upsample_rates,
                upsample_initial_channel,
                upsample_kernel_sizes,
                gin_channels=gin_channels,
            )

        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels)

    def remove_weight_norm(self):
        """Removes weight normalization from the model."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def __prepare_scriptable__(self):
        for hook in self.dec._forward_pre_hooks.values():
            if (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(self.dec)
        for hook in self.flow._forward_pre_hooks.values():
            if (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(self.flow)
        if hasattr(self, "enc_q"):
            for hook in self.enc_q._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(self.enc_q)
        return self

    @torch.jit.ignore
    def forward(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: Optional[torch.Tensor] = None,
        pitchf: Optional[torch.Tensor] = None,
        y: torch.Tensor = None,
        y_lengths: torch.Tensor = None,
        ds: Optional[torch.Tensor] = None,
    ):
        """
        Forward pass of the model.

        Args:
            phone (torch.Tensor): Phoneme sequence.
            phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
            pitch (torch.Tensor, optional): Pitch sequence.
            pitchf (torch.Tensor, optional): Fine-grained pitch sequence.
            y (torch.Tensor, optional): Target spectrogram.
            y_lengths (torch.Tensor, optional): Lengths of the target spectrograms.
            ds (torch.Tensor, optional): Speaker embedding. Defaults to None.
        """
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        if y is not None:
            z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
            z_p = self.flow(z, y_mask, g=g)
            z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
            if self.use_f0:
                pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
                o = self.dec(z_slice, pitchf, g=g)
            else:
                o = self.dec(z_slice, g=g)
            return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
        else:
            return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)

    @torch.jit.export
    def infer(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: Optional[torch.Tensor] = None,
        nsff0: Optional[torch.Tensor] = None,
        sid: torch.Tensor = None,
        rate: Optional[torch.Tensor] = None,
    ):
        """
        Inference of the model.

        Args:
            phone (torch.Tensor): Phoneme sequence.
            phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
            pitch (torch.Tensor, optional): Pitch sequence.
            nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
            sid (torch.Tensor): Speaker embedding.
            rate (torch.Tensor, optional): Rate for time-stretching. Defaults to None.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate is not None:
            assert isinstance(rate, torch.Tensor)
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p = z_p[:, :, head:]
            x_mask = x_mask[:, :, head:]
            if self.use_f0:
                nsff0 = nsff0[:, head:]
        if self.use_f0:
            z = self.flow(z_p, x_mask, g=g, reverse=True)
            o = self.dec(z * x_mask, nsff0, g=g)
        else:
            z = self.flow(z_p, x_mask, g=g, reverse=True)
            o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
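
The prior sampling line in infer() is a reparameterized draw around the text-encoder mean with a fixed 0.66666 temperature; isolated, it looks like this (toy shapes, illustrative only):

import torch

m_p = torch.zeros(1, 192, 10)     # prior mean from the text encoder
logs_p = torch.zeros(1, 192, 10)  # prior log-std (0 -> standard deviation of 1)
x_mask = torch.ones(1, 1, 10)

z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
print(z_p.std())  # roughly 0.67: the temperature shrinks the sampled latent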
utils.py ADDED
@@ -0,0 +1,137 @@
import os, sys
import librosa
import soundfile as sf
import numpy as np
import re
import unicodedata
import wget
from pydub import AudioSegment
from torch import nn

import logging
from transformers import HubertModel
import warnings

# Remove this to see warnings about transformers models
warnings.filterwarnings("ignore")

logging.getLogger("fairseq").setLevel(logging.ERROR)
logging.getLogger("faiss.loader").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)

now_dir = os.getcwd()
sys.path.append(now_dir)

base_path = os.path.join(now_dir, "rvc_cli", "rvc", "models", "formant", "stftpitchshift")
stft = base_path + ".exe" if sys.platform == "win32" else base_path


class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)
        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)


def load_audio(file, sample_rate):
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")

    return audio.flatten()


def load_audio_infer(
    file,
    sample_rate,
    **kwargs,
):
    formant_shifting = kwargs.get("formant_shifting", False)
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not os.path.isfile(file):
            raise FileNotFoundError(f"File not found: {file}")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
        if formant_shifting:
            formant_qfrency = kwargs.get("formant_qfrency", 0.8)
            formant_timbre = kwargs.get("formant_timbre", 0.8)

            from stftpitchshift import StftPitchShift

            pitchshifter = StftPitchShift(1024, 32, sample_rate)
            audio = pitchshifter.shiftpitch(
                audio,
                factors=1,
                quefrency=formant_qfrency * 1e-3,
                distortion=formant_timbre,
            )
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")
    return np.array(audio).flatten()


def format_title(title):
    formatted_title = (
        unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
    )
    formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
    formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
    formatted_title = re.sub(r"\s+", "_", formatted_title)
    return formatted_title


def load_embedding(embedder_model, custom_embedder=None):
    embedder_root = os.path.join(now_dir, "rvc", "models", "embedders")
    embedding_list = {
        "contentvec": os.path.join(embedder_root, "contentvec"),
        "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"),
        "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"),
        "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"),
    }

    online_embedders = {
        "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin",
        "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin",
        "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin",
        "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin",
    }

    config_files = {
        "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json",
        "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json",
        "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json",
        "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json",
    }

    if embedder_model == "custom":
        if os.path.exists(custom_embedder):
            model_path = custom_embedder
        else:
            print(f"Custom embedder not found: {custom_embedder}, using contentvec")
            model_path = embedding_list["contentvec"]
    else:
        model_path = embedding_list[embedder_model]
        bin_file = os.path.join(model_path, "pytorch_model.bin")
        json_file = os.path.join(model_path, "config.json")
        os.makedirs(model_path, exist_ok=True)
        if not os.path.exists(bin_file):
            url = online_embedders[embedder_model]
            print(f"Downloading {url} to {model_path}...")
            wget.download(url, out=bin_file)
        if not os.path.exists(json_file):
            url = config_files[embedder_model]
            print(f"Downloading {url} to {model_path}...")
            wget.download(url, out=json_file)

    models = HubertModelWithFinalProj.from_pretrained(model_path)
    return models
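
A short usage sketch for the loaders above (the import path is an assumption based on the repository layout, and the exact format_title output is approximate):

from rvc_cli.rvc.lib.utils import format_title, load_audio

print(format_title("Mi Canción – demo #1!"))   # something like "Mi_Cancion__demo_1"

audio = load_audio("path/to/input.wav", 16000)  # mono float array resampled to 16 kHz
print(audio.shape)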
zluda.py ADDED
@@ -0,0 +1,43 @@
import torch

if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"):
    _torch_stft = torch.stft

    def z_stft(
        audio: torch.Tensor,
        n_fft: int,
        hop_length: int = None,
        win_length: int = None,
        window: torch.Tensor = None,
        center: bool = True,
        pad_mode: str = "reflect",
        normalized: bool = False,
        onesided: bool = None,
        return_complex: bool = None,
    ):
        sd = audio.device
        return _torch_stft(
            audio.to("cpu"),
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window.to("cpu"),
            center=center,
            pad_mode=pad_mode,
            normalized=normalized,
            onesided=onesided,
            return_complex=return_complex,
        ).to(sd)

    def z_jit(f, *_, **__):
        f.graph = torch._C.Graph()
        return f

    # hijacks
    torch.stft = z_stft
    torch.jit.script = z_jit
    # disabling unsupported cudnn
    torch.backends.cudnn.enabled = False
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
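
These patches only take effect for code that runs after this module is imported; a minimal usage sketch (the import path is assumed from the repository layout):

import torch
import rvc_cli.rvc.lib.zluda  # noqa: F401  (applies the ZLUDA workarounds as an import side effect)

# On a ZLUDA device, torch.stft now runs on the CPU and moves the result back:
x = torch.randn(16000)
spec = torch.stft(x, n_fft=1024, window=torch.hann_window(1024), return_complex=True)
print(spec.shape)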