def cf0(f0, *, f0_bin=256, f0_min=50.0, f0_max=1100.0):
    """Convert an F0 contour in Hz to coarse (quantized) F0 bin indices.

    The contour is mapped to the mel scale (``1127 * ln(1 + f / 700)``) and
    rescaled linearly so that ``f0_min`` lands on bin 1 and ``f0_max`` lands
    on bin ``f0_bin - 1``. The result is clipped to that range and rounded to
    the nearest integer, so every value at or below ``f0_min`` (including 0)
    ends up in bin 1.

    Args:
        f0: NumPy array of frequencies in Hz.
        f0_bin: Number of bins; the highest index produced is ``f0_bin - 1``.
        f0_min: Frequency in Hz mapped to bin 1.
        f0_max: Frequency in Hz mapped to bin ``f0_bin - 1``.

    Returns:
        Integer NumPy array with values in ``[1, f0_bin - 1]``.
    """
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel = np.clip(
        (f0_mel - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1,
        1,
        f0_bin - 1,
    )
    return np.rint(f0_mel).astype(int)
+print("f0", f0.shape) +f0c = cf0(f0) +print("f0c", f0c.shape) + +cv_path = r"rvc\models\embedders\contentvec" +cv_model = HubertModel.from_pretrained(cv_path) + +spin_path = r"rvc\models\embedders\spin" +spin_model = HubertModel.from_pretrained(spin_path) + +spin2_path = r"rvc\models\embedders\spin-v2" +spin2_model = HubertModel.from_pretrained(spin2_path) + +feats = torch.from_numpy(audio).to(torch.float32).to("cpu") +feats = torch.nn.functional.pad(feats.unsqueeze(0), (40, 40), mode="reflect") +feats = feats.view(1, -1) + +with torch.no_grad(): + cv_feats = cv_model(feats)["last_hidden_state"] + cv_feats = cv_feats.squeeze(0).float().cpu().numpy() + print("cv", cv_feats.shape) + + spin_feats = spin_model(feats)["last_hidden_state"] + spin_feats = spin_feats.squeeze(0).float().cpu().numpy() + print("spin", spin_feats.shape) + + spin2_feats = spin2_model(feats)["last_hidden_state"] + spin2_feats = spin2_feats.squeeze(0).float().cpu().numpy() + print("spin-v2", spin2_feats.shape) + +np.save(r"logs\reference\contentvec\feats.npy", cv_feats) +np.save(r"logs\reference\spin\feats.npy", spin_feats) +np.save(r"logs\reference\spin-v2\feats.npy", spin2_feats) +np.save(r"logs\reference\pitch_coarse.npy", f0c) +np.save(r"logs\reference\pitch_fine.npy", f0) diff --git a/logs/reference/pitch_coarse.npy b/logs/reference/pitch_coarse.npy new file mode 100644 index 0000000000000000000000000000000000000000..a4c385ef4382a1e7cc7b92122d7536b6e3b7e5b9 --- /dev/null +++ b/logs/reference/pitch_coarse.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8164c4678a265c064b26d8927fda314b75db407e13c7ff59a8645d8a911ae70 +size 14076 diff --git a/logs/reference/pitch_fine.npy b/logs/reference/pitch_fine.npy new file mode 100644 index 0000000000000000000000000000000000000000..03d0b47e404f1660bdc48a801ef34d12de98ff10 --- /dev/null +++ b/logs/reference/pitch_fine.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:83408eddb152bfc49214693d8e6eec5127fd5f308aecfd40edc69b6119f818eb +size 28024 diff --git a/logs/reference/reference.wav b/logs/reference/reference.wav new file mode 100644 index 0000000000000000000000000000000000000000..c9a883543f1617f2c0e6aec574b5d0bb1c226c5a --- /dev/null +++ b/logs/reference/reference.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d84b4a9de2592488f5ec75b1072f87114bfac80fa625d323f941273f5a4f32ff +size 2231084 diff --git a/logs/reference/spin-v2/feats.npy b/logs/reference/spin-v2/feats.npy new file mode 100644 index 0000000000000000000000000000000000000000..187881ff0182800c6edb4fac9501f13bc107e639 --- /dev/null +++ b/logs/reference/spin-v2/feats.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b539aa9aae0bbfe80d0fe54c496c4eada98890a12185f20ebd5b2bff2d32e5 +size 5354624 diff --git a/logs/reference/spin/feats.npy b/logs/reference/spin/feats.npy new file mode 100644 index 0000000000000000000000000000000000000000..ddfa4c27f8337297193dfe54453c7906cd499ac4 --- /dev/null +++ b/logs/reference/spin/feats.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3ec6620a520b4f9a77d7dc170d06b63efa31b9ec2ac6410447ea6aaa3f5aafb +size 5354624 diff --git a/rvc/configs/32000.json b/rvc/configs/32000.json new file mode 100644 index 0000000000000000000000000000000000000000..803a948e9f4d033dc7d7ccf08deee7706ebceb91 --- /dev/null +++ b/rvc/configs/32000.json @@ -0,0 +1,42 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, 
+ "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [20,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/configs/40000.json b/rvc/configs/40000.json new file mode 100644 index 0000000000000000000000000000000000000000..dd9bcc85b96a5e71478354c525189a4832c2b78a --- /dev/null +++ b/rvc/configs/40000.json @@ -0,0 +1,42 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/configs/48000.json b/rvc/configs/48000.json new file mode 100644 index 0000000000000000000000000000000000000000..968ad3b3b15963beb1cf77f16f4b3dfd8ca9ca08 --- /dev/null +++ b/rvc/configs/48000.json @@ -0,0 +1,42 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "lr_decay": 0.999875, + "segment_size": 17280, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 
def singleton(cls):
    """Class decorator that makes ``cls`` yield one shared instance.

    The first call constructs the instance with the arguments supplied; every
    later call returns that same object and ignores its arguments.

    Args:
        cls: The class to wrap.

    Returns:
        A callable that returns the single instance of ``cls``.
    """
    import functools

    instances = {}

    # updated=() copies only the name/doc/module metadata; merging the class
    # __dict__ into the wrapper function's __dict__ is not wanted here.
    @functools.wraps(cls, updated=())
    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance
def max_vram_gpu(gpu):
    """Return the total memory of a CUDA device in GiB.

    Args:
        gpu: Device index (or device) passed to
            ``torch.cuda.get_device_properties``.

    Returns:
        The device's total memory rounded to the nearest whole GiB, or ``8``
        when CUDA is not available. The fallback is an ``int`` so both
        branches return the same type (it used to be the string ``"8"``).
    """
    if torch.cuda.is_available():
        gpu_properties = torch.cuda.get_device_properties(gpu)
        return round(gpu_properties.total_memory / 1024 / 1024 / 1024)
    return 8
def get_number_of_gpus():
    """Return the CUDA device indices joined by ``-``.

    Returns:
        A string such as ``"0-1"`` listing every device index reported by
        ``torch.cuda.device_count()``, or ``"-"`` when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return "-"
    device_ids = [str(index) for index in range(torch.cuda.device_count())]
    return "-".join(device_ids)
+ """ + self.config = Config() # Load configuration + self.hubert_model = ( + None # Initialize the Hubert model (for embedding extraction) + ) + self.last_embedder_model = None # Last used embedder model + self.tgt_sr = None # Target sampling rate for the output audio + self.net_g = None # Generator network for voice conversion + self.vc = None # Voice conversion pipeline instance + self.cpt = None # Checkpoint for loading model weights + self.version = None # Model version + self.n_spk = None # Number of speakers in the model + self.use_f0 = None # Whether the model uses F0 + self.loaded_model = None + + def load_hubert(self, embedder_model: str, embedder_model_custom: str = None): + """ + Loads the HuBERT model for speaker embedding extraction. + + Args: + embedder_model (str): Path to the pre-trained HuBERT model. + embedder_model_custom (str): Path to the custom HuBERT model. + """ + self.hubert_model = load_embedding(embedder_model, embedder_model_custom) + self.hubert_model = self.hubert_model.to(self.config.device).float() + self.hubert_model.eval() + + @staticmethod + def remove_audio_noise(data, sr, reduction_strength=0.7): + """ + Removes noise from an audio file using the NoiseReduce library. + + Args: + data (numpy.ndarray): The audio data as a NumPy array. + sr (int): The sample rate of the audio data. + reduction_strength (float): Strength of the noise reduction. Default is 0.7. + """ + try: + reduced_noise = nr.reduce_noise( + y=data, sr=sr, prop_decrease=reduction_strength + ) + return reduced_noise + except Exception as error: + print(f"An error occurred removing audio noise: {error}") + return None + + @staticmethod + def convert_audio_format(input_path, output_path, output_format): + """ + Converts an audio file to a specified output format. + + Args: + input_path (str): Path to the input audio file. + output_path (str): Path to the output audio file. + output_format (str): Desired audio format (e.g., "WAV", "MP3"). 
+ """ + try: + if output_format != "WAV": + print(f"Saving audio as {output_format}...") + audio, sample_rate = librosa.load(input_path, sr=None) + common_sample_rates = [ + 8000, + 11025, + 12000, + 16000, + 22050, + 24000, + 32000, + 44100, + 48000, + ] + target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate)) + audio = librosa.resample( + audio, orig_sr=sample_rate, target_sr=target_sr, res_type="soxr_vhq" + ) + sf.write(output_path, audio, target_sr, format=output_format.lower()) + return output_path + except Exception as error: + print(f"An error occurred converting the audio format: {error}") + + @staticmethod + def post_process_audio( + audio_input, + sample_rate, + **kwargs, + ): + board = Pedalboard() + if kwargs.get("reverb", False): + reverb = Reverb( + room_size=kwargs.get("reverb_room_size", 0.5), + damping=kwargs.get("reverb_damping", 0.5), + wet_level=kwargs.get("reverb_wet_level", 0.33), + dry_level=kwargs.get("reverb_dry_level", 0.4), + width=kwargs.get("reverb_width", 1.0), + freeze_mode=kwargs.get("reverb_freeze_mode", 0), + ) + board.append(reverb) + if kwargs.get("pitch_shift", False): + pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0)) + board.append(pitch_shift) + if kwargs.get("limiter", False): + limiter = Limiter( + threshold_db=kwargs.get("limiter_threshold", -6), + release_ms=kwargs.get("limiter_release", 0.05), + ) + board.append(limiter) + if kwargs.get("gain", False): + gain = Gain(gain_db=kwargs.get("gain_db", 0)) + board.append(gain) + if kwargs.get("distortion", False): + distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25)) + board.append(distortion) + if kwargs.get("chorus", False): + chorus = Chorus( + rate_hz=kwargs.get("chorus_rate", 1.0), + depth=kwargs.get("chorus_depth", 0.25), + centre_delay_ms=kwargs.get("chorus_delay", 7), + feedback=kwargs.get("chorus_feedback", 0.0), + mix=kwargs.get("chorus_mix", 0.5), + ) + board.append(chorus) + if kwargs.get("bitcrush", 
def convert_audio(
    self,
    audio_input_path: str,
    audio_output_path: str,
    model_path: str,
    index_path: str,
    pitch: int = 0,
    f0_method: str = "rmvpe",
    index_rate: float = 0.75,
    volume_envelope: float = 1.0,
    protect: float = 0.5,
    hop_length: int = 128,
    split_audio: bool = False,
    f0_autotune: bool = False,
    f0_autotune_strength: float = 1,
    embedder_model: str = "contentvec",
    embedder_model_custom: str = None,
    clean_audio: bool = False,
    clean_strength: float = 0.5,
    export_format: str = "WAV",
    post_process: bool = False,
    resample_sr: int = 0,
    sid: int = 0,
    proposed_pitch: bool = False,
    proposed_pitch_threshold: float = 155.0,
    **kwargs,
):
    """
    Performs voice conversion on the input audio.

    The input is loaded at 16 kHz, peak-normalized, optionally split into
    chunks, converted chunk by chunk through the pipeline, optionally
    resampled / denoised / post-processed, written as WAV and finally
    converted to ``export_format``. Errors during conversion are printed
    (with traceback) rather than raised.

    Args:
        audio_input_path (str): Path to the input audio file.
        audio_output_path (str): Path to the output audio file.
        model_path (str): Path to the voice conversion model.
        index_path (str): Path to the index file.
        pitch (int): Key for F0 up-sampling.
        f0_method (str): Method for F0 extraction.
        index_rate (float): Rate for index matching.
        volume_envelope (float): RMS mix rate.
        protect (float): Protection rate for certain audio segments.
        hop_length (int): Hop length for audio processing (accepted for
            compatibility; not used by this method).
        split_audio (bool): Whether to split the audio for processing.
        f0_autotune (bool): Whether to use F0 autotune.
        f0_autotune_strength (float): Strength of the autotune snapping.
        embedder_model (str): Path to the embedder model.
        embedder_model_custom (str): Path to the custom embedder model.
        clean_audio (bool): Whether to clean the audio.
        clean_strength (float): Strength of the audio cleaning.
        export_format (str): Format for exporting the audio.
        post_process (bool): Whether to apply the effects selected in kwargs.
        resample_sr (int, optional): Output sampling rate; values below
            16000 (default 0) keep the model's own rate.
        sid (int, optional): Speaker ID. Default is 0.
        proposed_pitch (bool): Whether to apply the automatic pitch offset.
        proposed_pitch_threshold (float): Target frequency for that offset.
        **kwargs: Additional keyword arguments, forwarded to the audio
            loader and to the post-processing effects.
    """
    if not model_path:
        print("No model path provided. Aborting conversion.")
        return

    self.get_vc(model_path, sid)

    try:
        start_time = time.time()
        print(f"Converting audio '{audio_input_path}'...")

        audio = load_audio_infer(
            audio_input_path,
            16000,
            **kwargs,
        )
        # Scale down only when the peak would exceed 0.95 full scale.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max

        if not self.hubert_model or embedder_model != self.last_embedder_model:
            self.load_hubert(embedder_model, embedder_model_custom)
            self.last_embedder_model = embedder_model

        file_index = (
            index_path.strip()
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip()
            .replace("trained", "added")
        )

        if split_audio:
            chunks, intervals = process_audio(audio, 16000)
            print(f"Audio split into {len(chunks)} chunks for processing.")
        else:
            chunks = [audio]

        converted_chunks = []
        for c in chunks:
            audio_opt = self.vc.pipeline(
                model=self.hubert_model,
                net_g=self.net_g,
                sid=sid,
                audio=c,
                pitch=pitch,
                f0_method=f0_method,
                file_index=file_index,
                index_rate=index_rate,
                pitch_guidance=self.use_f0,
                volume_envelope=volume_envelope,
                version=self.version,
                protect=protect,
                f0_autotune=f0_autotune,
                f0_autotune_strength=f0_autotune_strength,
                proposed_pitch=proposed_pitch,
                proposed_pitch_threshold=proposed_pitch_threshold,
            )
            converted_chunks.append(audio_opt)
            if split_audio:
                print(f"Converted audio chunk {len(converted_chunks)}")

        if split_audio:
            audio_opt = merge_audio(
                chunks, converted_chunks, intervals, 16000, self.tgt_sr
            )
        else:
            audio_opt = converted_chunks[0]

        # The pipeline produces audio at the model's rate (self.tgt_sr).
        # Resample the samples themselves instead of overwriting
        # self.tgt_sr: relabelling the rate wrote mis-pitched files and left
        # the cached model/pipeline with a wrong rate for later calls.
        # NOTE(review): assumes the pipeline returns floating-point audio,
        # which librosa.resample requires - confirm.
        output_sr = self.tgt_sr
        if resample_sr >= 16000 and resample_sr != output_sr:
            audio_opt = librosa.resample(
                audio_opt,
                orig_sr=output_sr,
                target_sr=resample_sr,
                res_type="soxr_vhq",
            )
            output_sr = resample_sr

        if clean_audio:
            cleaned_audio = self.remove_audio_noise(
                audio_opt, output_sr, clean_strength
            )
            if cleaned_audio is not None:
                audio_opt = cleaned_audio

        if post_process:
            audio_opt = self.post_process_audio(
                audio_input=audio_opt,
                sample_rate=output_sr,
                **kwargs,
            )

        sf.write(audio_output_path, audio_opt, output_sr, format="WAV")

        # Swap only the file extension; str.replace would also rewrite a
        # ".wav" that appears in a directory name.
        output_root, output_ext = os.path.splitext(audio_output_path)
        if output_ext.lower() == ".wav":
            output_path_format = f"{output_root}.{export_format.lower()}"
        else:
            output_path_format = audio_output_path
        converted_path = self.convert_audio_format(
            audio_output_path, output_path_format, export_format
        )
        # If the format conversion failed, keep reporting the WAV that was
        # written above instead of 'None'.
        if converted_path:
            audio_output_path = converted_path

        elapsed_time = time.time() - start_time
        print(
            f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
        )
    except Exception as error:
        print(f"An error occurred during audio conversion: {error}")
        print(traceback.format_exc())
def cleanup_model(self):
    """
    Cleans up the model and releases resources.

    Drops the references to the embedder, the generator, the pipeline and
    the checkpoint, then asks CUDA to release cached memory when a GPU is
    available. Attributes are reset to ``None`` rather than deleted, so the
    method can be called repeatedly and later reads of e.g. ``self.net_g``
    do not raise ``AttributeError``.
    """
    self.hubert_model = None
    self.net_g = None
    self.n_spk = None
    self.vc = None
    self.tgt_sr = None
    self.cpt = None
    # Nothing is loaded any more; clearing this lets get_vc reload the same
    # weights file instead of skipping the load as "already loaded".
    self.loaded_model = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
# High-pass filter design: a 5th-order Butterworth filter with a 48 Hz cutoff
# for audio sampled at 16 kHz.
FILTER_ORDER = 5
CUTOFF_FREQUENCY = 48  # Hz
SAMPLE_RATE = 16000  # Hz
# signal.butter returns the transfer-function coefficients as
# (numerator, denominator); they are kept as bh and ah.
# NOTE(review): where the filter is applied is outside the visible part of
# this file - presumably to the input audio before conversion; confirm.
bh, ah = signal.butter(
    N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
)
class Autotune:
    """
    A class for applying autotune to a given fundamental frequency (F0) contour.
    """

    def __init__(self):
        """
        Initializes the Autotune class with a set of reference frequencies.

        ``note_dict`` holds the reference note frequencies in Hz, in
        ascending order from G1 (49.00) to C6 (1046.50), one per semitone.
        """
        self.note_dict = [
            # G1 .. B1
            49.00, 51.91, 55.00, 58.27, 61.74,
            # C2 .. B2
            65.41, 69.30, 73.42, 77.78, 82.41, 87.31,
            92.50, 98.00, 103.83, 110.00, 116.54, 123.47,
            # C3 .. B3
            130.81, 138.59, 146.83, 155.56, 164.81, 174.61,
            185.00, 196.00, 207.65, 220.00, 233.08, 246.94,
            # C4 .. B4
            261.63, 277.18, 293.66, 311.13, 329.63, 349.23,
            369.99, 392.00, 415.30, 440.00, 466.16, 493.88,
            # C5 .. B5
            523.25, 554.37, 587.33, 622.25, 659.25, 698.46,
            739.99, 783.99, 830.61, 880.00, 932.33, 987.77,
            # C6
            1046.50,
        ]

    def autotune_f0(self, f0, f0_autotune_strength):
        """
        Autotunes a given F0 contour by moving each voiced frequency towards
        the closest reference frequency.

        Frames whose F0 is not positive (unvoiced frames, conventionally 0)
        are returned unchanged. Snapping them to the lowest note would turn
        silence/noise frames into voiced frames at 49 Hz.

        Args:
            f0: The input F0 contour as a 1-D NumPy array (Hz).
            f0_autotune_strength: Blend factor; 0 leaves the contour as is,
                1 snaps fully onto the closest note.

        Returns:
            A new array with the same shape and dtype as ``f0``.
        """
        f0 = np.asarray(f0)
        autotuned_f0 = f0.copy()
        voiced = f0 > 0
        if voiced.any():
            notes = np.asarray(self.note_dict)
            voiced_f0 = f0[voiced]
            # Distance of every voiced frame to every note; argmin returns
            # the first minimum, i.e. the lower note on an exact tie, which
            # matches min() over the ascending list.
            nearest = notes[
                np.abs(voiced_f0[:, None] - notes[None, :]).argmin(axis=1)
            ]
            autotuned_f0[voiced] = (
                voiced_f0 + (nearest - voiced_f0) * f0_autotune_strength
            )
        return autotuned_f0
+ """ + self.x_pad = config.x_pad + self.x_query = config.x_query + self.x_center = config.x_center + self.x_max = config.x_max + self.sample_rate = 16000 + self.tgt_sr = tgt_sr + self.window = 160 + self.t_pad = self.sample_rate * self.x_pad + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sample_rate * self.x_query + self.t_center = self.sample_rate * self.x_center + self.t_max = self.sample_rate * self.x_max + self.time_step = self.window / self.sample_rate * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.device = config.device + self.autotune = Autotune() + + def get_f0( + self, + x, + p_len, + f0_method: str = "rmvpe", + pitch: int = 0, + f0_autotune: bool = False, + f0_autotune_strength: float = 1.0, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + ): + """ + Estimates the fundamental frequency (F0) of a given audio signal using various methods. + + Args: + x: The input audio signal as a NumPy array. + p_len: Desired length of the F0 output. + pitch: Key to adjust the pitch of the F0 contour. + f0_method: Method to use for F0 estimation (e.g., "crepe"). + f0_autotune: Whether to apply autotune to the F0 contour. 
+ proposed_pitch: whether to apply proposed pitch adjustment + proposed_pitch_threshold: target frequency, 155.0 for male, 255.0 for female + """ + if f0_method == "crepe": + model = CREPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, self.f0_min, self.f0_max, p_len, "full") + del model + elif f0_method == "crepe-tiny": + model = CREPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, self.f0_min, self.f0_max, p_len, "tiny") + del model + elif f0_method == "rmvpe": + model = RMVPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, filter_radius=0.03) + del model + elif f0_method == "fcpe": + model = FCPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, p_len, filter_radius=0.006) + del model + elif f0_method == "swift": + model = SWIFT( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0( + x, self.f0_min, self.f0_max, p_len, confidence_threshold=0.887 + ) + del model + + # f0 adjustments + if f0_autotune is True: + f0 = self.autotune.autotune_f0(f0, f0_autotune_strength) + elif proposed_pitch is True: + limit = 12 + # calculate median f0 of the audio + valid_f0 = np.where(f0 > 0)[0] + if len(valid_f0) < 2: + # no valid f0 detected + up_key = 0 + else: + median_f0 = float( + np.median(np.interp(np.arange(len(f0)), valid_f0, f0[valid_f0])) + ) + if median_f0 <= 0 or np.isnan(median_f0): + up_key = 0 + else: + # calculate proposed shift + up_key = max( + -limit, + min( + limit, + int( + np.round( + 12 * np.log2(proposed_pitch_threshold / median_f0) + ) + ), + ), + ) + print("calculated pitch offset:", up_key) + f0 *= pow(2, (pitch + up_key) / 12) + else: + f0 *= pow(2, pitch / 12) + # quantizing f0 to 255 buckets to make coarse f0 + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = 
(f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(int) + + return f0_coarse, f0bak + + def voice_conversion( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + index, + big_npy, + index_rate, + version, + protect, + ): + """ + Performs voice conversion on a given audio segment. + + Args: + model: The feature extractor model. + net_g: The generative model for synthesizing speech. + sid: Speaker ID for the target voice. + audio0: The input audio segment. + pitch: Quantized F0 contour for pitch guidance. + pitchf: Original F0 contour for pitch guidance. + index: FAISS index for speaker embedding retrieval. + big_npy: Speaker embeddings stored in a NumPy array. + index_rate: Blending rate for speaker embedding retrieval. + version: Model version (Keep to support old models). + protect: Protection level for preserving the original pitch. + """ + with torch.no_grad(): + pitch_guidance = pitch != None and pitchf != None + # prepare source audio + feats = torch.from_numpy(audio0).float() + feats = feats.mean(-1) if feats.dim() == 2 else feats + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1).to(self.device) + # extract features + feats = model(feats)["last_hidden_state"] + feats = ( + model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats + ) + # make a copy for pitch guidance and protection + feats0 = feats.clone() if pitch_guidance else None + if ( + index + ): # set by parent function, only true if index is available, loaded, and index rate > 0 + feats = self._retrieve_speaker_embeddings( + feats, index, big_npy, index_rate + ) + # feature upsampling + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + # adjust the length if the audio is short + p_len = min(audio0.shape[0] // self.window, feats.shape[1]) + if pitch_guidance: + feats0 = 
F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len] + # Pitch protection blending + if protect < 0.5: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + feats = feats * pitchff.unsqueeze(-1) + feats0 * ( + 1 - pitchff.unsqueeze(-1) + ) + feats = feats.to(feats0.dtype) + else: + pitch, pitchf = None, None + p_len = torch.tensor([p_len], device=self.device).long() + audio1 = ( + (net_g.infer(feats.float(), p_len, pitch, pitchf.float(), sid)[0][0, 0]) + .data.cpu() + .float() + .numpy() + ) + # clean up + del feats, feats0, p_len + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio1 + + def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate): + npy = feats[0].cpu().numpy() + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + return feats + + def pipeline( + self, + model, + net_g, + sid, + audio, + pitch, + f0_method, + file_index, + index_rate, + pitch_guidance, + volume_envelope, + version, + protect, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ): + """ + The main pipeline function for performing voice conversion. + + Args: + model: The feature extractor model. + net_g: The generative model for synthesizing speech. + sid: Speaker ID for the target voice. + audio: The input audio signal. + input_audio_path: Path to the input audio file. + pitch: Key to adjust the pitch of the F0 contour. + f0_method: Method to use for F0 estimation. + file_index: Path to the FAISS index file for speaker embedding retrieval. + index_rate: Blending rate for speaker embedding retrieval. 
+ pitch_guidance: Whether to use pitch guidance during voice conversion. + tgt_sr: Target sampling rate for the output audio. + resample_sr: Resampling rate for the output audio. + version: Model version. + protect: Protection level for preserving the original pitch. + hop_length: Hop length for F0 estimation methods. + f0_autotune: Whether to apply autotune to the F0 contour. + """ + if file_index != "" and os.path.exists(file_index) and index_rate > 0: + try: + index = faiss.read_index(file_index) + big_npy = index.reconstruct_n(0, index.ntotal) + except Exception as error: + print(f"An error occurred reading the FAISS index: {error}") + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + if pitch_guidance: + pitch, pitchf = self.get_f0( + audio_pad, + p_len, + f0_method, + pitch, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + for t in opt_ts: + t = t // self.window * self.window + if pitch_guidance: + audio_opt.append( 
+ self.voice_conversion( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if pitch_guidance: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if volume_envelope != 1: + audio_opt = AudioProcessor.change_rms( + audio, self.sample_rate, audio_opt, self.tgt_sr, volume_envelope + ) + audio_max = np.abs(audio_opt).max() / 0.99 + if audio_max > 1: + audio_opt /= audio_max + if pitch_guidance: + del pitch, pitchf + del sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/rvc/lib/algorithm/__init__.py b/rvc/lib/algorithm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/rvc/lib/algorithm/attentions.py b/rvc/lib/algorithm/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..d698afc8aef31097772e8aafc5aca0189043e0ed --- /dev/null +++ b/rvc/lib/algorithm/attentions.py @@ -0,0 +1,243 @@ +import math +import torch +from 
class MultiHeadAttention(torch.nn.Module):
    """
    Multi-head attention module with optional relative positional encoding and proximal bias.

    Args:
        channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_heads (int): Number of attention heads.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to None.
        heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
        block_length (int, optional): Block length for local attention. Defaults to None.
        proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
        proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.
    """

    def __init__(
        self,
        channels: int,
        out_channels: int,
        n_heads: int,
        p_dropout: float = 0.0,
        window_size: int = None,
        heads_share: bool = True,
        block_length: int = None,
        proximal_bias: bool = False,
        proximal_init: bool = False,
    ):
        super().__init__()
        assert (
            channels % n_heads == 0
        ), "Channels must be divisible by the number of heads."

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.k_channels = channels // n_heads
        self.window_size = window_size
        self.block_length = block_length
        self.proximal_bias = proximal_bias

        # Define projections
        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)

        self.drop = torch.nn.Dropout(p_dropout)

        # Relative positional encodings: one table per head unless shared
        if window_size:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = torch.nn.Parameter(
                torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = torch.nn.Parameter(
                torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
                * rel_stddev
            )

        # Initialize weights
        torch.nn.init.xavier_uniform_(self.conv_q.weight)
        torch.nn.init.xavier_uniform_(self.conv_k.weight)
        torch.nn.init.xavier_uniform_(self.conv_v.weight)
        torch.nn.init.xavier_uniform_(self.conv_o.weight)

        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values); stores the weights in ``self.attn``."""
        # Compute query, key, value projections
        q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)

        # Compute attention
        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        # Final output projection
        return self.conv_o(x)

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns (output [b, d, t_t], weights [b, h, t_t, t_s])."""
        # Reshape [b, d, t] -> [b, n_heads, t, k_channels]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))

        if self.window_size:
            assert t_s == t_t, "Relative attention only supports self-attention."
            scores += self._compute_relative_scores(query, t_s)

        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias only supports self-attention."
            scores += self._attention_bias_proximal(t_s).to(scores.device, scores.dtype)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length:
                # keep only a band of +/- block_length around the diagonal
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)

        # Apply softmax and dropout
        p_attn = self.drop(torch.nn.functional.softmax(scores, dim=-1))

        # Compute attention output
        output = torch.matmul(p_attn, value)

        if self.window_size:
            output += self._apply_relative_values(p_attn, t_s)

        return output.transpose(2, 3).contiguous().view(b, d, t_t), p_attn

    def _compute_relative_scores(self, query, length):
        """Relative-position logits for ``query``, re-indexed to absolute positions."""
        rel_emb = self._get_relative_embeddings(self.emb_rel_k, length)
        rel_logits = self._matmul_with_relative_keys(
            query / math.sqrt(self.k_channels), rel_emb
        )
        return self._relative_position_to_absolute_position(rel_logits)

    def _apply_relative_values(self, p_attn, length):
        """Contribution of the relative value embeddings for attention weights ``p_attn``."""
        rel_weights = self._absolute_position_to_relative_position(p_attn)
        rel_emb = self._get_relative_embeddings(self.emb_rel_v, length)
        return self._matmul_with_relative_values(rel_weights, rel_emb)

    # Helper methods
    def _matmul_with_relative_values(self, x, y):
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, embeddings, length):
        """Slice (zero-padding if needed) the table to the 2*length-1 offsets in use."""
        pad_length = max(length - (self.window_size + 1), 0)
        start = max((self.window_size + 1) - length, 0)
        end = start + 2 * length - 1

        if pad_length > 0:
            # pad the offset axis (dim 1) on both sides; last axis untouched
            embeddings = torch.nn.functional.pad(
                embeddings, (0, 0, pad_length, pad_length)
            )
        return embeddings[:, start:end]

    def _relative_position_to_absolute_position(self, x):
        """[b, h, l, 2l-1] relative logits -> [b, h, l, l] absolute logits (pad-and-reshape skew)."""
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(x, (0, 1))
        x_flat = x.view(batch, heads, length * 2 * length)
        x_flat = torch.nn.functional.pad(x_flat, (0, length - 1))
        return x_flat.view(batch, heads, length + 1, 2 * length - 1)[
            :, :, :length, length - 1 :
        ]

    def _absolute_position_to_relative_position(self, x):
        """[b, h, l, l] absolute weights -> [b, h, l, 2l-1] relative weights."""
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(x, (0, length - 1))
        x_flat = x.view(batch, heads, length**2 + length * (length - 1))
        x_flat = torch.nn.functional.pad(x_flat, (length, 0))
        return x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Bias of -log(1 + |i - j|), shaped [1, 1, length, length]."""
        r = torch.arange(length, dtype=torch.float32)
        diff = r.unsqueeze(0) - r.unsqueeze(1)
        return -torch.log1p(torch.abs(diff)).unsqueeze(0).unsqueeze(0)


class FFN(torch.nn.Module):
    """
    Feed-forward network module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        filter_channels (int): Number of filter channels in the convolution layers.
        kernel_size (int): Kernel size of the convolution layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        activation (str, optional): Activation function to use ("gelu" or ReLU otherwise). Defaults to None.
        causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float = 0.0,
        activation: str = None,
        causal: bool = False,
    ):
        super().__init__()
        self.padding_fn = self._causal_padding if causal else self._same_padding

        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = torch.nn.Dropout(p_dropout)

        self.activation = activation

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking before each conv and at the end."""
        x = self.conv_1(self.padding_fn(x * x_mask))
        x = self._apply_activation(x)
        x = self.drop(x)
        x = self.conv_2(self.padding_fn(x * x_mask))
        return x * x_mask

    def _apply_activation(self, x):
        if self.activation == "gelu":
            # sigmoid approximation of GELU
            return x * torch.sigmoid(1.702 * x)
        return torch.relu(x)

    def _causal_padding(self, x):
        """Left-pad by kernel_size - 1 so outputs never see future frames."""
        pad_l = self.conv_1.kernel_size[0] - 1
        return torch.nn.functional.pad(x, (pad_l, 0))

    def _same_padding(self, x):
        """Pad so the convolution preserves the sequence length."""
        kernel_size = self.conv_1.kernel_size[0]
        # Total padding must be kernel_size - 1. Padding (k - 1) // 2 on both
        # sides is one short for even kernels, which shortened the sequence and
        # broke the mask multiplication; put the extra frame on the right.
        pad_l = (kernel_size - 1) // 2
        pad_r = kernel_size // 2
        return torch.nn.functional.pad(x, (pad_l, pad_r))
def init_weights(m, mean=0.0, std=0.01):
    """
    Initialize the weights of a module.

    Only modules whose class name contains "Conv" are touched.

    Args:
        m: The module to initialize.
        mean: The mean of the normal distribution.
        std: The standard deviation of the normal distribution.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    """
    Calculate the padding needed for a convolution.

    Args:
        kernel_size: The size of the kernel.
        dilation: The dilation of the convolution.
    """
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    """
    Convert a per-dimension pad specification to the flat list ``torch.nn.functional.pad`` expects.

    Args:
        pad_shape: List of ``[before, after]`` pairs, first dimension first.

    Returns:
        Flat list of ints, last dimension first.
    """
    return [amount for pair in pad_shape[::-1] for amount in pair]


def slice_segments(
    x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2
):
    """
    Slice segments from a tensor, handling tensors with different numbers of dimensions.

    Args:
        x (torch.Tensor): The tensor to slice.
        ids_str (torch.Tensor): The starting indices of the segments.
        segment_size (int, optional): The size of each segment. Defaults to 4.
        dim (int, optional): Rank of ``x``: 2 slices ``x[i, start:end]``,
            3 slices ``x[i, :, start:end]``. Defaults to 2.

    Raises:
        ValueError: If ``dim`` is neither 2 nor 3.
    """
    if dim == 2:
        ret = torch.zeros_like(x[:, :segment_size])
    elif dim == 3:
        ret = torch.zeros_like(x[:, :, :segment_size])
    else:
        # previously fell through and failed with UnboundLocalError on `ret`
        raise ValueError(f"dim must be 2 or 3, got {dim}")

    for i in range(x.size(0)):
        idx_str = ids_str[i].item()
        idx_end = idx_str + segment_size
        if dim == 2:
            ret[i] = x[i, idx_str:idx_end]
        else:
            ret[i] = x[i, :, idx_str:idx_end]

    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """
    Randomly slice segments from a tensor.

    Args:
        x: The 3-D tensor to slice along its last dimension.
        x_lengths: The lengths of the sequences (defaults to the full length).
        segment_size: The size of each segment.

    Returns:
        Tuple of the sliced segments and their start indices.
    """
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size, dim=3)
    return ret, ids_str


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """
    Fused add tanh sigmoid multiply operation.

    Args:
        input_a: The first input tensor.
        input_b: The second input tensor.
        n_channels: Tensor whose first element is the channel split point.
    """
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
    """
    Generate a sequence mask.

    Args:
        length: The lengths of the sequences.
        max_length: The maximum length of the sequences (defaults to ``length.max()``).

    Returns:
        Boolean tensor of shape [len(length), max_length], True where position < length.
    """
    if max_length is None:
        # convert to a plain int so the variable really is Optional[int]
        max_length = int(length.max())
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def grad_norm(parameters, norm_type: float = 2.0):
    """
    Calculates the total norm of the parameter gradients (no clipping is performed).

    Args:
        parameters: A tensor or an iterable of tensors; entries without a gradient are ignored.
        norm_type: The type of norm to use.

    Returns:
        The norm as a float, 0.0 when no parameter has a gradient.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    parameters = [p for p in parameters if p.grad is not None]

    if not parameters:
        return 0.0

    return torch.linalg.vector_norm(
        torch.stack([p.grad.norm(norm_type) for p in parameters]), ord=norm_type
    ).item()
class MultiPeriodDiscriminator(torch.nn.Module):
    """
    Multi-period discriminator.

    Holds one DiscriminatorS followed by one DiscriminatorP per period and
    runs every sub-discriminator on both the real and the generated signal.

    Args:
        use_spectral_norm (bool): Whether to use spectral normalization.
            Defaults to False.
        checkpointing (bool): Whether to run the sub-discriminators through
            ``torch.utils.checkpoint`` while training. Defaults to False.
    """

    def __init__(self, use_spectral_norm: bool = False, checkpointing: bool = False):
        super().__init__()
        periods = [2, 3, 5, 7, 11, 17, 23, 37]
        self.checkpointing = checkpointing
        self.discriminators = torch.nn.ModuleList(
            [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
            + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
        )

    def forward(self, y, y_hat):
        """
        Args:
            y: Real signal.
            y_hat: Generated signal.

        Returns:
            Four lists (real outputs, generated outputs, real feature maps,
            generated feature maps), one entry per sub-discriminator.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        use_checkpoint = self.training and self.checkpointing
        for d in self.discriminators:
            if use_checkpoint:
                y_d_r, fmap_r = checkpoint(d, y, use_reentrant=False)
                y_d_g, fmap_g = checkpoint(d, y_hat, use_reentrant=False)
            else:
                y_d_r, fmap_r = d(y)
                y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    """
    Discriminator operating directly on the 1-D signal.

    A stack of (grouped) 1-D convolutions followed by a single-channel
    output convolution.

    Args:
        use_spectral_norm (bool): Whether to use spectral normalization
            instead of weight normalization. Defaults to False.
    """

    def __init__(self, use_spectral_norm: bool = False):
        super().__init__()

        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = torch.nn.ModuleList(
            [
                norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """Returns the flattened score and the list of intermediate feature maps."""
        fmap = []
        for conv in self.convs:
            x = self.lrelu(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap


class DiscriminatorP(torch.nn.Module):
    """
    Period discriminator.

    Folds the signal into a 2-D tensor whose last axis has ``period`` entries
    and applies a stack of 2-D convolutions along the other axis.

    Args:
        period (int): Period of the discriminator.
        kernel_size (int): Kernel size of the convolutional layers. Defaults to 5.
        stride (int): Stride of the first four convolutional layers (the last
            one always uses stride 1). Defaults to 3.
        use_spectral_norm (bool): Whether to use spectral normalization. Defaults to False.
    """

    def __init__(
        self,
        period: int,
        kernel_size: int = 5,
        stride: int = 3,
        use_spectral_norm: bool = False,
    ):
        super().__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        in_channels = [1, 32, 128, 512, 1024]
        out_channels = [32, 128, 512, 1024, 1024]
        # `stride` used to be accepted but ignored in favour of a hard-coded
        # [3, 3, 3, 3, 1]; honour it (the default reproduces the old values).
        strides = [stride] * 4 + [1]

        self.convs = torch.nn.ModuleList(
            [
                norm_f(
                    torch.nn.Conv2d(
                        in_ch,
                        out_ch,
                        (kernel_size, 1),
                        (s, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                )
                for in_ch, out_ch, s in zip(in_channels, out_channels, strides)
            ]
        )

        self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """Returns the flattened score and the list of intermediate feature maps."""
        fmap = []
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            # reflect-pad on the right so the length is a multiple of the period
            x = torch.nn.functional.pad(x, (0, self.period - remainder), "reflect")
        x = x.view(b, c, -1, self.period)

        for conv in self.convs:
            x = self.lrelu(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap
class Encoder(torch.nn.Module):
    """
    Stack of self-attention + feed-forward layers with post-layer normalization.

    Args:
        hidden_channels (int): Number of hidden channels in the encoder.
        filter_channels (int): Number of filter channels in the feed-forward network.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int, optional): Kernel size of the feed-forward convolutions. Defaults to 1.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to 10.
    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int = 1,
        p_dropout: float = 0.0,
        window_size: int = 10,
    ):
        super().__init__()

        self.hidden_channels = hidden_channels
        self.n_layers = n_layers
        self.drop = torch.nn.Dropout(p_dropout)

        def make_attention():
            return MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )

        def make_ffn():
            return FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )

        def make_norm():
            return LayerNorm(hidden_channels)

        layer_ids = range(n_layers)
        # The four lists are built one after another (not interleaved per layer)
        # so parameters are created in the same order as before.
        self.attn_layers = torch.nn.ModuleList([make_attention() for _ in layer_ids])
        self.norm_layers_1 = torch.nn.ModuleList([make_norm() for _ in layer_ids])
        self.ffn_layers = torch.nn.ModuleList([make_ffn() for _ in layer_ids])
        self.norm_layers_2 = torch.nn.ModuleList([make_norm() for _ in layer_ids])

    def forward(self, x, x_mask):
        """Encode ``x`` under ``x_mask``; the output is masked again before returning."""
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask

        layers = zip(
            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
        )
        for attention, norm_1, ffn, norm_2 in layers:
            # residual self-attention sub-layer
            x = norm_1(x + self.drop(attention(x, x, attn_mask)))
            # residual feed-forward sub-layer
            x = norm_2(x + self.drop(ffn(x, x_mask)))

        return x * x_mask


class TextEncoder(torch.nn.Module):
    """
    Encoder for phone features with an optional coarse-pitch embedding.

    Args:
        out_channels (int): Output channels of the encoder.
        hidden_channels (int): Hidden channels of the encoder.
        filter_channels (int): Filter channels of the encoder.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int): Kernel size of the convolutional layers.
        p_dropout (float): Dropout probability.
        embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768).
        f0 (bool, optional): Whether to use F0 embedding. Defaults to True.
    """

    def __init__(
        self,
        out_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        embedding_dim: int,
        f0: bool = True,
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
        self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
        else:
            self.emb_pitch = None

        self.encoder = Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
    ):
        """Returns ``(m, logs, x_mask)``: the two halves of the projection and the mask."""
        hidden = self.emb_phone(phone)
        if pitch is not None and self.emb_pitch is not None:
            hidden = hidden + self.emb_pitch(pitch)

        hidden = self.lrelu(hidden * math.sqrt(self.hidden_channels))
        hidden = hidden.transpose(1, -1)  # [B, H, T]

        x_mask = sequence_mask(lengths, hidden.size(2)).unsqueeze(1).to(hidden.dtype)
        encoded = self.encoder(hidden, x_mask)

        m, logs = torch.split(self.proj(encoded) * x_mask, self.out_channels, dim=1)
        return m, logs, x_mask


class PosteriorEncoder(torch.nn.Module):
    """
    Posterior Encoder for inferring latent representation.

    Args:
        in_channels (int): Number of channels in the input.
        out_channels (int): Number of channels in the output.
        hidden_channels (int): Number of hidden channels in the encoder.
        kernel_size (int): Kernel size of the convolutional layers.
        dilation_rate (int): Dilation rate of the convolutional layers.
        n_layers (int): Number of layers in the encoder.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        gin_channels: int = 0,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
    ):
        """Returns ``(z, m, logs, x_mask)`` where ``z`` is sampled as m + noise * exp(logs)."""
        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)

        hidden = self.enc(self.pre(x) * x_mask, x_mask, g=g)

        m, logs = torch.split(self.proj(hidden) * x_mask, self.out_channels, dim=1)

        # sample around the mean, then zero the masked positions
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask

        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Delegates weight-norm removal to the WaveNet encoder."""
        self.enc.remove_weight_norm()

    def __prepare_scriptable__(self):
        """Removes weight norm from ``self.enc`` for each matching pre-forward hook; returns self."""
        for hook in self.enc._forward_pre_hooks.values():
            from_weight_norm = (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
            )
            if from_weight_norm and hook.__class__.__name__ == "WeightNorm":
                torch.nn.utils.remove_weight_norm(self.enc)
        return self
rvc.lib.algorithm.commons import init_weights + + +class HiFiGANGenerator(torch.nn.Module): + """ + HiFi-GAN Generator module for audio synthesis. + + This module implements the generator part of the HiFi-GAN architecture, + which uses transposed convolutions for upsampling and residual blocks for + refining the audio output. It can also incorporate global conditioning. + + Args: + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. + resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int, optional): Number of input channels for the global conditioning. If 0, no global conditioning is used. Defaults to 0. 
+ """ + + def __init__( + self, + initial_channel: int, + resblock_kernel_sizes: list, + resblock_dilation_sizes: list, + upsample_rates: list, + upsample_initial_channel: int, + upsample_kernel_sizes: list, + gin_channels: int = 0, + ): + super(HiFiGANGenerator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = torch.nn.Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + + self.ups = torch.nn.ModuleList() + self.resblocks = torch.nn.ModuleList() + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(ResBlock(ch, k, d)) + + self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): + # new tensor + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + # in-place call + x = torch.nn.functional.leaky_relu(x) + x = self.conv_post(x) + # in-place call + x = torch.tanh(x) + + return x + + def __prepare_scriptable__(self): + for l in self.ups_and_resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == 
"torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGenerator(torch.nn.Module): + """ + Sine wave generator with optional harmonic overtones and noise. + + This module generates sine waves for a fundamental frequency and its harmonics. + It can also add Gaussian noise and apply a voiced/unvoiced mask. + + Args: + sampling_rate (int): The sampling rate of the audio in Hz. + num_harmonics (int, optional): The number of harmonic overtones to generate. Defaults to 0. + sine_amplitude (float, optional): The amplitude of the sine wave components. Defaults to 0.1. + noise_stddev (float, optional): The standard deviation of the additive Gaussian noise. Defaults to 0.003. + voiced_threshold (float, optional): The threshold for the fundamental frequency (F0) to determine if a frame is voiced. Defaults to 0.0. + """ + + def __init__( + self, + sampling_rate: int, + num_harmonics: int = 0, + sine_amplitude: float = 0.1, + noise_stddev: float = 0.003, + voiced_threshold: float = 0.0, + ): + super(SineGenerator, self).__init__() + self.sampling_rate = sampling_rate + self.num_harmonics = num_harmonics + self.sine_amplitude = sine_amplitude + self.noise_stddev = noise_stddev + self.voiced_threshold = voiced_threshold + self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics + + def _compute_voiced_unvoiced(self, f0: torch.Tensor): + """ + Generates a binary mask indicating voiced/unvoiced frames based on the fundamental frequency. + + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length). 
+ """ + uv_mask = (f0 > self.voiced_threshold).float() + return uv_mask + + def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int): + """ + Generates sine waves for the fundamental frequency and its harmonics. + + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + upsampling_factor (int): The factor by which to upsample the sine wave. + """ + batch_size, length, _ = f0.shape + + # Create an upsampling grid + upsampling_grid = torch.arange( + 1, upsampling_factor + 1, dtype=f0.dtype, device=f0.device + ) + + # Calculate phase increments + phase_increments = (f0 / self.sampling_rate) * upsampling_grid + phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5 + cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype) + phase_increments += torch.nn.functional.pad( + cumulative_phase, (0, 0, 1, 0), mode="constant" + ) + + # Reshape to match the sine wave shape + phase_increments = phase_increments.reshape(batch_size, -1, 1) + + # Scale for harmonics + harmonic_scale = torch.arange( + 1, self.waveform_dim + 1, dtype=f0.dtype, device=f0.device + ).reshape(1, 1, -1) + phase_increments *= harmonic_scale + + # Add random phase offset (except for the fundamental) + random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device) + random_phase[..., 0] = 0 # Fundamental frequency has no random offset + phase_increments += random_phase + + # Generate sine waves + sine_waves = torch.sin(2 * np.pi * phase_increments) + return sine_waves + + def forward(self, f0: torch.Tensor, upsampling_factor: int): + with torch.no_grad(): + # Expand `f0` to include waveform dimensions + f0 = f0.unsqueeze(-1) + + # Generate sine waves + sine_waves = ( + self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude + ) + + # Compute voiced/unvoiced mask + voiced_mask = self._compute_voiced_unvoiced(f0) + + # Upsample voiced/unvoiced mask + voiced_mask = torch.nn.functional.interpolate( + 
voiced_mask.transpose(2, 1), + scale_factor=float(upsampling_factor), + mode="nearest", + ).transpose(2, 1) + + # Compute noise amplitude + noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) * ( + self.sine_amplitude / 3 + ) + + # Add Gaussian noise + noise = noise_amplitude * torch.randn_like(sine_waves) + + # Combine sine waves and noise + sine_waveforms = sine_waves * voiced_mask + noise + + return sine_waveforms, voiced_mask, noise diff --git a/rvc/lib/algorithm/generators/hifigan_mrf.py b/rvc/lib/algorithm/generators/hifigan_mrf.py new file mode 100644 index 0000000000000000000000000000000000000000..70b9de13a688b688b6386f22224a78f58f6b38ae --- /dev/null +++ b/rvc/lib/algorithm/generators/hifigan_mrf.py @@ -0,0 +1,374 @@ +import math +from typing import Optional + +import numpy as np +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.utils.checkpoint import checkpoint + +LRELU_SLOPE = 0.1 + + +class MRFLayer(torch.nn.Module): + """ + A single layer of the Multi-Receptive Field (MRF) block. + + This layer consists of two 1D convolutional layers with weight normalization + and Leaky ReLU activation in between. The first convolution has a dilation, + while the second has a dilation of 1. A skip connection is added from the input + to the output. + + Args: + channels (int): The number of input and output channels. + kernel_size (int): The kernel size of the convolutional layers. + dilation (int): The dilation rate for the first convolutional layer. 
+ """ + + def __init__(self, channels, kernel_size, dilation): + super().__init__() + self.conv1 = weight_norm( + torch.nn.Conv1d( + channels, + channels, + kernel_size, + padding=(kernel_size * dilation - dilation) // 2, + dilation=dilation, + ) + ) + self.conv2 = weight_norm( + torch.nn.Conv1d( + channels, channels, kernel_size, padding=kernel_size // 2, dilation=1 + ) + ) + + def forward(self, x: torch.Tensor): + y = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + y = self.conv1(y) + y = torch.nn.functional.leaky_relu(y, LRELU_SLOPE) + y = self.conv2(y) + return x + y + + def remove_weight_norm(self): + remove_weight_norm(self.conv1) + remove_weight_norm(self.conv2) + + +class MRFBlock(torch.nn.Module): + """ + A Multi-Receptive Field (MRF) block. + + This block consists of multiple MRFLayers with different dilation rates. + It applies each layer sequentially to the input. + + Args: + channels (int): The number of input and output channels for the MRFLayers. + kernel_size (int): The kernel size for the convolutional layers in the MRFLayers. + dilations (list[int]): A list of dilation rates for the MRFLayers. + """ + + def __init__(self, channels, kernel_size, dilations): + super().__init__() + self.layers = torch.nn.ModuleList() + for dilation in dilations: + self.layers.append(MRFLayer(channels, kernel_size, dilation)) + + def forward(self, x: torch.Tensor): + for layer in self.layers: + x = layer(x) + return x + + def remove_weight_norm(self): + for layer in self.layers: + layer.remove_weight_norm() + + +class SineGenerator(torch.nn.Module): + """ + Definition of sine generator + + Generates sine waveforms with optional harmonics and additive noise. + Can be used to create harmonic noise source for neural vocoders. + + Args: + samp_rate (int): Sampling rate in Hz. + harmonic_num (int): Number of harmonic overtones (default 0). + sine_amp (float): Amplitude of sine-waveform (default 0.1). 
+ noise_std (float): Standard deviation of Gaussian noise (default 0.003). + voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0). + """ + + def __init__( + self, + samp_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + noise_std: float = 0.003, + voiced_threshold: float = 0, + ): + super(SineGenerator, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0: torch.Tensor): + """ + Generates voiced/unvoiced (UV) signal based on the fundamental frequency (F0). + + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + """ + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values: torch.Tensor): + """ + Generates sine waveforms based on the fundamental frequency (F0) and its harmonics. + + Args: + f0_values (torch.Tensor): Tensor of fundamental frequency and its harmonics, + shape (batch_size, length, dim), where dim indicates + the fundamental tone and overtones. + """ + # convert to F0 in rad. 
The integer part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand( + f0_values.shape[0], f0_values.shape[2], device=f0_values.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) + + return sines + + def forward(self, f0: torch.Tensor): + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + uv = self._f02uv(f0) + + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ + Generates harmonic and noise source features. + + This module uses the SineGenerator to create harmonic signals based on the + fundamental frequency (F0) and merges them into a single excitation signal. + + Args: + sample_rate (int): Sampling rate in Hz. + harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. + sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. 
+ """ + + def __init__( + self, + sampling_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + add_noise_std: float = 0.003, + voiced_threshold: float = 0, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGenerator( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + return sine_merge, None, None + + +class HiFiGANMRFGenerator(torch.nn.Module): + """ + HiFi-GAN generator with Multi-Receptive Field (MRF) blocks. + + This generator takes an input feature sequence and fundamental frequency (F0) + as input and generates an audio waveform. It utilizes transposed convolutions + for upsampling and MRF blocks for feature refinement. It can also condition + on global conditioning features. + + Args: + in_channel (int): Number of input channels. + upsample_initial_channel (int): Number of channels after the initial convolution. + upsample_rates (list[int]): List of upsampling rates for the transposed convolutions. + upsample_kernel_sizes (list[int]): List of kernel sizes for the transposed convolutions. + resblock_kernel_sizes (list[int]): List of kernel sizes for the convolutional layers in the MRF blocks. + resblock_dilations (list[list[int]]): List of lists of dilation rates for the MRF blocks. + gin_channels (int): Number of global conditioning input channels (0 if no global conditioning). + sample_rate (int): Sampling rate of the audio. + harmonic_num (int): Number of harmonics to generate. 
+ checkpointing (bool): Whether to use checkpointing to save memory during training (default: False). + """ + + def __init__( + self, + in_channel: int, + upsample_initial_channel: int, + upsample_rates: list[int], + upsample_kernel_sizes: list[int], + resblock_kernel_sizes: list[int], + resblock_dilations: list[list[int]], + gin_channels: int, + sample_rate: int, + harmonic_num: int, + checkpointing: bool = False, + ): + super().__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.checkpointing = checkpointing + + self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num) + + self.conv_pre = weight_norm( + torch.nn.Conv1d( + in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3 + ) + ) + self.upsamples = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + + self.upsamples.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=k, + stride=u, + padding=padding, + output_padding=u % 2, + ) + ) + ) + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 + + self.noise_convs.append( + torch.nn.Conv1d( + 1, + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel, + stride=stride, + padding=padding, + ) + ) + self.mrfs = torch.nn.ModuleList() + for i in range(len(self.upsamples)): + channel 
= upsample_initial_channel // (2 ** (i + 1)) + self.mrfs.append( + torch.nn.ModuleList( + [ + MRFBlock(channel, kernel_size=k, dilations=d) + for k, d in zip(resblock_kernel_sizes, resblock_dilations) + ] + ) + ) + self.conv_post = weight_norm( + torch.nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3) + ) + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward( + self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None + ): + f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) + har_source, _, _ = self.m_source(f0) + har_source = har_source.transpose(-1, -2) + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for ups, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs): + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + + if self.training and self.checkpointing: + x = checkpoint(ups, x, use_reentrant=False) + x = x + noise_conv(har_source) + xs = sum([checkpoint(layer, x, use_reentrant=False) for layer in mrf]) + else: + x = ups(x) + x = x + noise_conv(har_source) + xs = sum([layer(x) for layer in mrf]) + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = torch.tanh(self.conv_post(x)) + + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv_pre) + for up in self.upsamples: + remove_weight_norm(up) + for mrf in self.mrfs: + mrf.remove_weight_norm() + remove_weight_norm(self.conv_post) diff --git a/rvc/lib/algorithm/generators/hifigan_nsf.py b/rvc/lib/algorithm/generators/hifigan_nsf.py new file mode 100644 index 0000000000000000000000000000000000000000..20928af2d153e814beda3c0b27b1f0920a00006b --- /dev/null +++ b/rvc/lib/algorithm/generators/hifigan_nsf.py @@ -0,0 +1,235 @@ +import math +from typing import Optional + +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.utils.checkpoint import checkpoint + +from 
rvc.lib.algorithm.commons import init_weights +from rvc.lib.algorithm.generators.hifigan import SineGenerator +from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock + + +class SourceModuleHnNSF(torch.nn.Module): + """ + Source Module for generating harmonic and noise components for audio synthesis. + + This module generates a harmonic source signal using sine waves and adds + optional noise. It's often used in neural vocoders as a source of excitation. + + Args: + sample_rate (int): Sampling rate of the audio in Hz. + harmonic_num (int, optional): Number of harmonic overtones to generate above the fundamental frequency (F0). Defaults to 0. + sine_amp (float, optional): Amplitude of the sine wave components. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of the additive white Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold for the fundamental frequency (F0) to determine if a frame is voiced. If F0 is below this threshold, it's considered unvoiced. Defaults to 0. + """ + + def __init__( + self, + sample_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + add_noise_std: float = 0.003, + voiced_threshod: float = 0, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + self.l_sin_gen = SineGenerator( + sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor, upsample_factor: int = 1): + sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None + + +class HiFiGANNSFGenerator(torch.nn.Module): + """ + Generator module based on the Neural Source Filter (NSF) architecture. 
+ + This generator synthesizes audio by first generating a source excitation signal + (harmonic and noise) and then filtering it through a series of upsampling and + residual blocks. Global conditioning can be applied to influence the generation. + + Args: + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. + resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int): Number of input channels for the global conditioning. If 0, no global conditioning is used. + sr (int): Sampling rate of the audio. + checkpointing (bool, optional): Whether to use gradient checkpointing to save memory during training. Defaults to False. 
+ """ + + def __init__( + self, + initial_channel: int, + resblock_kernel_sizes: list, + resblock_dilation_sizes: list, + upsample_rates: list, + upsample_initial_channel: int, + upsample_kernel_sizes: list, + gin_channels: int, + sr: int, + checkpointing: bool = False, + ): + super(HiFiGANNSFGenerator, self).__init__() + + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.checkpointing = checkpointing + self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0) + + self.conv_pre = torch.nn.Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + + self.ups = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + + channels = [ + upsample_initial_channel // (2 ** (i + 1)) + for i in range(len(upsample_rates)) + ] + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + + self.ups.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + channels[i], + k, + u, + padding=padding, + output_padding=u % 2, + ) + ) + ) + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 + + self.noise_convs.append( + torch.nn.Conv1d( + 1, + channels[i], + kernel_size=kernel, + stride=stride, + padding=padding, + ) + ) + + self.resblocks = torch.nn.ModuleList( + [ + ResBlock(channels[i], k, d) + for i in range(len(self.ups)) + for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes) + ] + ) 
+ + self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = math.prod(upsample_rates) + self.lrelu_slope = LRELU_SLOPE + + def forward( + self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None + ): + har_source, _, _ = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + # new tensor + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): + x = torch.nn.functional.leaky_relu(x, self.lrelu_slope) + # Apply upsampling layer + if self.training and self.checkpointing: + x = checkpoint(ups, x, use_reentrant=False) + x = x + noise_convs(har_source) + xs = sum( + [ + checkpoint(resblock, x, use_reentrant=False) + for j, resblock in enumerate(self.resblocks) + if j in range(i * self.num_kernels, (i + 1) * self.num_kernels) + ] + ) + else: + x = ups(x) + x = x + noise_convs(har_source) + xs = sum( + [ + resblock(x) + for j, resblock in enumerate(self.resblocks) + if j in range(i * self.num_kernels, (i + 1) * self.num_kernels) + ] + ) + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = torch.tanh(self.conv_post(x)) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + remove_weight_norm(l) + for l in self.resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + remove_weight_norm(l) + return self diff --git 
a/rvc/lib/algorithm/generators/refinegan.py b/rvc/lib/algorithm/generators/refinegan.py new file mode 100644 index 0000000000000000000000000000000000000000..eca303ca8ba0f5cc0087e186a3c4e80320d38ed3 --- /dev/null +++ b/rvc/lib/algorithm/generators/refinegan.py @@ -0,0 +1,451 @@ +import numpy as np +import torch +import torchaudio +from torch import nn +from torch.nn import functional as F +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils import remove_weight_norm +from torch.utils.checkpoint import checkpoint + +from rvc.lib.algorithm.commons import init_weights, get_padding + + +class ResBlock(nn.Module): + """ + Residual block with multiple dilated convolutions. + + This block applies a sequence of dilated convolutional layers with Leaky ReLU activation. + It's designed to capture information at different scales due to the varying dilation rates. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int, optional): Kernel size for the convolutional layers. Defaults to 7. + dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers. Defaults to (1, 3, 5). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. 
+ """ + + def __init__( + self, + channels: int, + kernel_size: int = 7, + dilation: tuple[int] = (1, 3, 5), + leaky_relu_slope: float = 0.2, + ): + super().__init__() + + self.leaky_relu_slope = leaky_relu_slope + + self.convs1 = nn.ModuleList( + [ + weight_norm( + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=d, + padding=get_padding(kernel_size, d), + ) + ) + for d in dilation + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ) + for d in dilation + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x: torch.Tensor): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, self.leaky_relu_slope) + xt = c1(xt) + xt = F.leaky_relu(xt, self.leaky_relu_slope) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for c1, c2 in zip(self.convs1, self.convs2): + remove_weight_norm(c1) + remove_weight_norm(c2) + + +class AdaIN(nn.Module): + """ + Adaptive Instance Normalization layer. + + This layer applies a scaling factor to the input based on a learnable weight. + + Args: + channels (int): Number of input channels. + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation applied after scaling. Defaults to 0.2. + """ + + def __init__( + self, + *, + channels: int, + leaky_relu_slope: float = 0.2, + ): + super().__init__() + + self.weight = nn.Parameter(torch.ones(channels) * 1e-4) + # safe to use in-place as it is used on a new x+gaussian tensor + self.activation = nn.LeakyReLU(leaky_relu_slope) + + def forward(self, x: torch.Tensor): + gaussian = torch.randn_like(x) * self.weight[None, :, None] + + return self.activation(x + gaussian) + + +class ParallelResBlock(nn.Module): + """ + Parallel residual block that applies multiple residual blocks with different kernel sizes in parallel. 
class SineGenerator(nn.Module):
    """
    Sine-wave excitation source driven by an F0 contour.

    One sine is synthesised per harmonic (the fundamental plus
    ``harmonic_num`` overtones). The sines are zeroed wherever F0 is not above
    ``voiced_threshold``, Gaussian noise is added, and the harmonics are mixed
    down to a single channel by a bias-free linear layer followed by tanh.

    Args:
        samp_rate (int): Sampling rate in Hz.
        harmonic_num (int): Number of harmonic overtones (default 0).
        sine_amp (float): Amplitude of sine-waveform (default 0.1).
        noise_std (float): Standard deviation of Gaussian noise (default 0.003).
        voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0).
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
    ):
        super().__init__()
        self.sampling_rate = samp_rate
        self.harmonic_num = harmonic_num
        # one channel for the fundamental plus one per overtone
        self.dim = harmonic_num + 1
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold

        # learnable mix-down of all harmonics into one excitation channel
        self.merge = nn.Sequential(nn.Linear(self.dim, 1, bias=False), nn.Tanh())

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where f0 > voiced_threshold, else 0."""
        voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * voiced

    def _f02sine(self, f0_values):
        """Turn per-harmonic frequencies into unit-amplitude sines.

        f0_values: (batchsize, length, dim), where dim indexes the
        fundamental tone and its overtones.
        """
        # Per-sample phase increment in cycles. Whole cycles are dropped
        # because adding 2 * pi * n never changes the sine value.
        phase_step = (f0_values / self.sampling_rate) % 1

        # Random start phase for every overtone; the fundamental starts at 0.
        n_batch, _, n_harm = f0_values.shape
        start_phase = torch.rand(n_batch, n_harm, device=f0_values.device)
        start_phase[:, 0] = 0
        phase_step[:, 0, :] += start_phase

        # Instantaneous phase is the running sum of the increments. Wherever
        # the wrapped running sum drops, one full cycle was completed, so a
        # cycle is subtracted there to keep the accumulated phase small.
        wrapped = torch.cumsum(phase_step, 1) % 1
        completed_cycle = torch.diff(wrapped, dim=1) < 0
        cycle_shift = torch.zeros_like(phase_step)
        cycle_shift[:, 1:, :] = completed_cycle * -1.0

        phase = torch.cumsum(phase_step + cycle_shift, dim=1)
        return torch.sin(phase * 2 * np.pi)

    def forward(self, f0):
        """Build the excitation signal for ``f0``.

        Only channel 0 of the last axis of ``f0`` is read as the fundamental.
        Everything up to the final mix-down runs without gradient tracking.
        """
        with torch.no_grad():
            harmonics = torch.zeros(
                f0.shape[0], f0.shape[1], self.dim, device=f0.device
            )
            # channel 0 carries the fundamental, channel k the (k + 1)-th multiple
            harmonics[:, :, 0] = f0[:, :, 0]
            for order in range(2, self.dim + 1):
                harmonics[:, :, order - 1] = harmonics[:, :, 0] * order

            sine_waves = self._f02sine(harmonics) * self.sine_amp

            uv = self._f02uv(f0)

            # voiced frames get noise_std; unvoiced frames get sine_amp / 3
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)

            sine_waves = sine_waves * uv + noise

        # the mix-down layer is trained, so it runs with gradients enabled
        return self.merge(sine_waves)
+ """ + + def __init__( + self, + *, + sample_rate: int = 44100, + downsample_rates: tuple[int] = (2, 2, 8, 8), # unused + upsample_rates: tuple[int] = (8, 8, 2, 2), + leaky_relu_slope: float = 0.2, + num_mels: int = 128, + start_channels: int = 16, # unused + gin_channels: int = 256, + checkpointing: bool = False, + upsample_initial_channel=512, + ): + super().__init__() + self.upsample_rates = upsample_rates + self.leaky_relu_slope = leaky_relu_slope + self.checkpointing = checkpointing + + self.upp = np.prod(upsample_rates) + self.m_source = SineGenerator(sample_rate) + + # expanded f0 sinegen -> match mel_conv + # (8, 1, 17280) -> (8, 16, 17280) + self.pre_conv = weight_norm( + nn.Conv1d( + 1, + 16, + 7, + 1, + padding=3, + ) + ) + + # (8, 16, 17280) = 4th upscale + # (8, 32, 8640) = 3rd upscale + # (8, 64, 4320) = 2nd upscale + # (8, 128, 432) = 1st upscale + # (8, 256, 36) merged to mel + + # f0 downsampling and upchanneling + channels = start_channels + size = self.upp + self.downsample_blocks = nn.ModuleList([]) + self.df0 = [] + for i, u in enumerate(upsample_rates): + + new_size = int(size / upsample_rates[-i - 1]) + # T dimension factors for torchaudio.functional.resample + self.df0.append([size, new_size]) + size = new_size + + new_channels = channels * 2 + self.downsample_blocks.append( + weight_norm(nn.Conv1d(channels, new_channels, 7, 1, padding=3)) + ) + channels = new_channels + + # mel handling + channels = upsample_initial_channel + + self.mel_conv = weight_norm( + nn.Conv1d( + num_mels, + channels // 2, + 7, + 1, + padding=3, + ) + ) + + self.mel_conv.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(256, channels // 2, 1) + + self.upsample_blocks = nn.ModuleList([]) + self.upsample_conv_blocks = nn.ModuleList([]) + + for rate in upsample_rates: + new_channels = channels // 2 + + self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear")) + + self.upsample_conv_blocks.append( + ParallelResBlock( + 
in_channels=channels + channels // 4, + out_channels=new_channels, + kernel_sizes=(3, 7, 11), + dilation=(1, 3, 5), + leaky_relu_slope=leaky_relu_slope, + ) + ) + + channels = new_channels + + self.conv_post = weight_norm( + nn.Conv1d(channels, 1, 7, 1, padding=3, bias=False) + ) + self.conv_post.apply(init_weights) + + def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None): + f0_size = mel.shape[-1] + # change f0 helper to full size + f0 = F.interpolate(f0.unsqueeze(1), size=f0_size * self.upp, mode="linear") + # get f0 turned into sines harmonics + har_source = self.m_source(f0.transpose(1, 2)).transpose(1, 2) + # prepare for fusion to mel + x = self.pre_conv(har_source) + # downsampled/upchanneled versions for each upscale + downs = [] + for block, (old_size, new_size) in zip(self.downsample_blocks, self.df0): + x = F.leaky_relu(x, self.leaky_relu_slope) + downs.append(x) + # attempt to cancel spectral aliasing + x = torchaudio.functional.resample( + x.contiguous(), + orig_freq=int(f0_size * old_size), + new_freq=int(f0_size * new_size), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="sinc_interp_kaiser", + beta=14.769656459379492, + ) + x = block(x) + + # expanding spectrogram from 192 to 256 channels + mel = self.mel_conv(mel) + if g is not None: + # adding expanded speaker embedding + mel = mel + self.cond(g) + + x = torch.cat([mel, x], dim=1) + + for ups, res, down in zip( + self.upsample_blocks, + self.upsample_conv_blocks, + reversed(downs), + ): + x = F.leaky_relu(x, self.leaky_relu_slope) + + if self.training and self.checkpointing: + x = checkpoint(ups, x, use_reentrant=False) + x = torch.cat([x, down], dim=1) + x = checkpoint(res, x, use_reentrant=False) + else: + x = ups(x) + x = torch.cat([x, down], dim=1) + x = res(x) + + x = F.leaky_relu(x, self.leaky_relu_slope) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + remove_weight_norm(self.pre_conv) + 
remove_weight_norm(self.mel_conv) + remove_weight_norm(self.conv_post) + + for block in self.downsample_blocks: + block.remove_weight_norm() + + for block in self.upsample_conv_blocks: + block.remove_weight_norm() diff --git a/rvc/lib/algorithm/modules.py b/rvc/lib/algorithm/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..611c45d3830051923effd0319f66d39e3126d4d2 --- /dev/null +++ b/rvc/lib/algorithm/modules.py @@ -0,0 +1,117 @@ +import torch +from rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply + + +class WaveNet(torch.nn.Module): + """ + WaveNet residual blocks as used in WaveGlow. + + Args: + hidden_channels (int): Number of hidden channels. + kernel_size (int): Size of the convolutional kernel. + dilation_rate (int): Dilation rate of the convolution. + n_layers (int): Number of convolutional layers. + gin_channels (int, optional): Number of conditioning channels. Defaults to 0. + p_dropout (float, optional): Dropout probability. Defaults to 0. + """ + + def __init__( + self, + hidden_channels: int, + kernel_size: int, + dilation_rate, + n_layers: int, + gin_channels: int = 0, + p_dropout: int = 0, + ): + super().__init__() + assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding." 
+ + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + self.n_channels_tensor = torch.IntTensor([hidden_channels]) # Static tensor + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = torch.nn.Dropout(p_dropout) + + # Conditional layer for global conditioning + if gin_channels: + self.cond_layer = torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), + name="weight", + ) + + # Precompute dilations and paddings + dilations = [dilation_rate**i for i in range(n_layers)] + paddings = [(kernel_size * d - d) // 2 for d in dilations] + + # Initialize layers + for i in range(n_layers): + self.in_layers.append( + torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilations[i], + padding=paddings[i], + ), + name="weight", + ) + ) + + res_skip_channels = ( + hidden_channels if i == n_layers - 1 else 2 * hidden_channels + ) + self.res_skip_layers.append( + torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d(hidden_channels, res_skip_channels, 1), + name="weight", + ) + ) + + def forward(self, x, x_mask, g=None): + output = x.clone().zero_() + + # Apply conditional layer if global conditioning is provided + g = self.cond_layer(g) if g is not None else None + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + g_l = ( + g[ + :, + i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels, + :, + ] + if g is not None + else 0 + ) + + # Activation with fused Tanh-Sigmoid + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor) + acts = self.drop(acts) + + # Residual and skip connections + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : 
self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for layer in self.in_layers: + torch.nn.utils.remove_weight_norm(layer) + for layer in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(layer) diff --git a/rvc/lib/algorithm/normalization.py b/rvc/lib/algorithm/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..94a29bac9fbe9bce34f7a3fe253b8f231fcec78a --- /dev/null +++ b/rvc/lib/algorithm/normalization.py @@ -0,0 +1,26 @@ +import torch + + +class LayerNorm(torch.nn.Module): + """ + Layer normalization module. + + Args: + channels (int): Number of channels. + eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5. + """ + + def __init__(self, channels: int, eps: float = 1e-5): + super().__init__() + self.eps = eps + self.gamma = torch.nn.Parameter(torch.ones(channels)) + self.beta = torch.nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + # Transpose to (batch_size, time_steps, channels) for layer_norm + x = x.transpose(1, -1) + x = torch.nn.functional.layer_norm( + x, (x.size(-1),), self.gamma, self.beta, self.eps + ) + # Transpose back to (batch_size, channels, time_steps) + return x.transpose(1, -1) diff --git a/rvc/lib/algorithm/residuals.py b/rvc/lib/algorithm/residuals.py new file mode 100644 index 0000000000000000000000000000000000000000..33c3b08ffd1fef17657c7fe5f267eecd9d847e35 --- /dev/null +++ b/rvc/lib/algorithm/residuals.py @@ -0,0 +1,261 @@ +import torch +from itertools import chain +from typing import Optional, Tuple +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm + +from rvc.lib.algorithm.modules import WaveNet +from rvc.lib.algorithm.commons import get_padding, 
def apply_mask(tensor: torch.Tensor, mask: Optional[torch.Tensor]):
    """
    Return ``tensor * mask``, or ``tensor`` itself when no mask is given.

    The mask is tested with ``is not None``. Taking the truth value of a
    tensor raises for any multi-element mask, and would skip masking for a
    single-element mask equal to zero.
    """
    return tensor * mask if mask is not None else tensor


def apply_mask_(tensor: torch.Tensor, mask: Optional[torch.Tensor]):
    """
    In-place variant of ``apply_mask``.

    Multiplies ``tensor`` by ``mask`` in place and returns it; returns
    ``tensor`` untouched when ``mask`` is ``None``.
    """
    return tensor.mul_(mask) if mask is not None else tensor
class Flip(torch.nn.Module):
    """
    Flip module for flow-based models.

    Reverses the order of the entries along dimension 1 of the input. That is
    the axis the coupling layers split into two halves, so a flip between two
    coupling layers swaps which half gets transformed. It is not the time
    axis. The operation is its own inverse, so both directions do the same
    flip.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        """
        Flip ``x`` along dimension 1.

        Extra positional and keyword arguments (mask, conditioning) are
        accepted and ignored so the module can stand in for any flow layer.

        Returns:
            ``(flipped, logdet)`` when ``reverse`` is false, where ``logdet``
            is a zero vector with one entry per batch item; only ``flipped``
            when ``reverse`` is true.
        """
        x = torch.flip(x, [1])
        if reverse:
            return x
        # a permutation does not change volume, so the log-determinant is 0
        logdet = torch.zeros(x.size(0), dtype=x.dtype, device=x.device)
        return x, logdet
+ """ + + def __init__( + self, + channels: int, + hidden_channels: int, + kernel_size: int, + dilation_rate: int, + n_layers: int, + n_flows: int = 4, + gin_channels: int = 0, + ): + super(ResidualCouplingBlock, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = torch.nn.ModuleList() + for _ in range(n_flows): + self.flows.append( + ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(Flip()) + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow.forward(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + def __prepare_scriptable__(self): + for i in range(self.n_flows): + for hook in self.flows[i * 2]._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flows[i * 2]) + + return self + + +class ResidualCouplingLayer(torch.nn.Module): + """ + Residual coupling layer for flow-based models. + + Args: + channels (int): Number of channels. + hidden_channels (int): Number of hidden channels. + kernel_size (int): Size of the convolutional kernel. + dilation_rate (int): Dilation rate of the convolution. + n_layers (int): Number of convolutional layers. + p_dropout (float, optional): Dropout probability. Defaults to 0. + gin_channels (int, optional): Number of conditioning channels. 
Defaults to 0. + mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False. + """ + + def __init__( + self, + channels: int, + hidden_channels: int, + kernel_size: int, + dilation_rate: int, + n_layers: int, + p_dropout: float = 0, + gin_channels: int = 0, + mean_only: bool = False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WaveNet( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = torch.nn.Conv1d( + hidden_channels, self.half_channels * (2 - mean_only), 1 + ) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + self.enc.remove_weight_norm() diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py new file mode 100644 index 0000000000000000000000000000000000000000..e920b535e9649106283dc3ad48d595b9ba6f9531 --- /dev/null +++ b/rvc/lib/algorithm/synthesizers.py @@ -0,0 +1,243 @@ +import torch +from 
typing import Optional +from rvc.lib.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator +from rvc.lib.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator +from rvc.lib.algorithm.generators.hifigan import HiFiGANGenerator +from rvc.lib.algorithm.generators.refinegan import RefineGANGenerator +from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments +from rvc.lib.algorithm.residuals import ResidualCouplingBlock +from rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder + + +class Synthesizer(torch.nn.Module): + """ + Base Synthesizer model. + + Args: + spec_channels (int): Number of channels in the spectrogram. + segment_size (int): Size of the audio segment. + inter_channels (int): Number of channels in the intermediate layers. + hidden_channels (int): Number of channels in the hidden layers. + filter_channels (int): Number of channels in the filter layers. + n_heads (int): Number of attention heads. + n_layers (int): Number of layers in the encoder. + kernel_size (int): Size of the convolution kernel. + p_dropout (float): Dropout probability. + resblock (str): Type of residual block. + resblock_kernel_sizes (list): Kernel sizes for the residual blocks. + resblock_dilation_sizes (list): Dilation sizes for the residual blocks. + upsample_rates (list): Upsampling rates for the decoder. + upsample_initial_channel (int): Number of channels in the initial upsampling layer. + upsample_kernel_sizes (list): Kernel sizes for the upsampling layers. + spk_embed_dim (int): Dimension of the speaker embedding. + gin_channels (int): Number of channels in the global conditioning vector. + sr (int): Sampling rate of the audio. + use_f0 (bool): Whether to use F0 information. + text_enc_hidden_dim (int): Hidden dimension for the text encoder. + kwargs: Additional keyword arguments. 
+ """ + + def __init__( + self, + spec_channels: int, + segment_size: int, + inter_channels: int, + hidden_channels: int, + filter_channels: int, + n_heads: int, + n_layers: int, + kernel_size: int, + p_dropout: float, + resblock: str, + resblock_kernel_sizes: list, + resblock_dilation_sizes: list, + upsample_rates: list, + upsample_initial_channel: int, + upsample_kernel_sizes: list, + spk_embed_dim: int, + gin_channels: int, + sr: int, + use_f0: bool, + text_enc_hidden_dim: int = 768, + vocoder: str = "HiFi-GAN", + randomized: bool = True, + checkpointing: bool = False, + **kwargs, + ): + super().__init__() + self.segment_size = segment_size + self.use_f0 = use_f0 + self.randomized = randomized + + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + text_enc_hidden_dim, + f0=use_f0, + ) + print(f"Using {vocoder} vocoder") + if use_f0: + if vocoder == "MRF HiFi-GAN": + self.dec = HiFiGANMRFGenerator( + in_channel=inter_channels, + upsample_initial_channel=upsample_initial_channel, + upsample_rates=upsample_rates, + upsample_kernel_sizes=upsample_kernel_sizes, + resblock_kernel_sizes=resblock_kernel_sizes, + resblock_dilations=resblock_dilation_sizes, + gin_channels=gin_channels, + sample_rate=sr, + harmonic_num=8, + checkpointing=checkpointing, + ) + elif vocoder == "RefineGAN": + self.dec = RefineGANGenerator( + sample_rate=sr, + downsample_rates=upsample_rates[::-1], + upsample_rates=upsample_rates, + start_channels=16, + num_mels=inter_channels, + checkpointing=checkpointing, + ) + else: + self.dec = HiFiGANNSFGenerator( + inter_channels, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + checkpointing=checkpointing, + ) + else: + if vocoder == "MRF HiFi-GAN": + print("MRF HiFi-GAN does not support training without pitch guidance.") + self.dec = None + elif vocoder == 
"RefineGAN": + print("RefineGAN does not support training without pitch guidance.") + self.dec = None + else: + self.dec = HiFiGANGenerator( + inter_channels, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, + hidden_channels, + 5, + 1, + 3, + gin_channels=gin_channels, + ) + self.emb_g = torch.nn.Embedding(spk_embed_dim, gin_channels) + + def _remove_weight_norm_from(self, module): + for hook in module._forward_pre_hooks.values(): + if getattr(hook, "__class__", None).__name__ == "WeightNorm": + torch.nn.utils.remove_weight_norm(module) + + def remove_weight_norm(self): + for module in [self.dec, self.flow, self.enc_q]: + self._remove_weight_norm_from(module) + + def __prepare_scriptable__(self): + self.remove_weight_norm() + return self + + def forward( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: Optional[torch.Tensor] = None, + pitchf: Optional[torch.Tensor] = None, + y: Optional[torch.Tensor] = None, + y_lengths: Optional[torch.Tensor] = None, + ds: Optional[torch.Tensor] = None, + ): + g = self.emb_g(ds).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + + if y is not None: + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + # regular old training method using random slices + if self.randomized: + z_slice, ids_slice = rand_slice_segments( + z, y_lengths, self.segment_size + ) + if self.use_f0: + pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) + o = self.dec(z_slice, pitchf, g=g) + else: + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + # future use for finetuning using the entire dataset each pass + 
else: + if self.use_f0: + o = self.dec(z, pitchf, g=g) + else: + o = self.dec(z, g=g) + return o, None, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + else: + return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: Optional[torch.Tensor] = None, + nsff0: Optional[torch.Tensor] = None, + sid: torch.Tensor = None, + rate: Optional[torch.Tensor] = None, + ): + """ + Inference of the model. + + Args: + phone (torch.Tensor): Phoneme sequence. + phone_lengths (torch.Tensor): Lengths of the phoneme sequences. + pitch (torch.Tensor, optional): Pitch sequence. + nsff0 (torch.Tensor, optional): Fine-grained pitch sequence. + sid (torch.Tensor): Speaker embedding. + rate (torch.Tensor, optional): Rate for time-stretching. + """ + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + + if rate is not None: + head = int(z_p.shape[2] * (1.0 - rate.item())) + z_p, x_mask = z_p[:, :, head:], x_mask[:, :, head:] + if self.use_f0 and nsff0 is not None: + nsff0 = nsff0[:, head:] + + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = ( + self.dec(z * x_mask, nsff0, g=g) + if self.use_f0 + else self.dec(z * x_mask, g=g) + ) + + return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/rvc/lib/predictors/F0Extractor.py b/rvc/lib/predictors/F0Extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..7768893f416613d8a531c9410ebf67dd6a395ff7 --- /dev/null +++ b/rvc/lib/predictors/F0Extractor.py @@ -0,0 +1,105 @@ +import dataclasses +import pathlib +import librosa +import numpy as np +import resampy +import torch +import torchcrepe +import torchfcpe +import os + +# from tools.anyf0.rmvpe import RMVPE +from rvc.lib.predictors.RMVPE import RMVPE0Predictor +from rvc.configs.config import Config + +config = Config() + + 
+@dataclasses.dataclass +class F0Extractor: + wav_path: pathlib.Path + sample_rate: int = 44100 + hop_length: int = 512 + f0_min: int = 50 + f0_max: int = 1600 + method: str = "rmvpe" + x: np.ndarray = dataclasses.field(init=False) + + def __post_init__(self): + self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate) + + @property + def hop_size(self): + return self.hop_length / self.sample_rate + + @property + def wav16k(self): + return resampy.resample(self.x, self.sample_rate, 16000) + + def extract_f0(self): + f0 = None + method = self.method + if method == "crepe": + wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device) + f0 = torchcrepe.predict( + wav16k_torch, + sample_rate=16000, + hop_length=160, + batch_size=512, + fmin=self.f0_min, + fmax=self.f0_max, + device=config.device, + ) + f0 = f0[0].cpu().numpy() + elif method == "fcpe": + audio = librosa.to_mono(self.x) + audio_length = len(audio) + f0_target_length = (audio_length // self.hop_length) + 1 + audio = ( + torch.from_numpy(audio) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .to(config.device) + ) + model = torchfcpe.spawn_bundled_infer_model(device=config.device) + + f0 = model.infer( + audio, + sr=self.sample_rate, + decoder_mode="local_argmax", + threshold=0.006, + f0_min=self.f0_min, + f0_max=self.f0_max, + interp_uv=False, + output_interp_target_length=f0_target_length, + ) + f0 = f0.squeeze().cpu().numpy() + elif method == "rmvpe": + model_rmvpe = RMVPE0Predictor( + os.path.join("rvc", "models", "predictors", "rmvpe.pt"), + device=config.device, + # hop_length=80 + ) + f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03) + + else: + raise ValueError(f"Unknown method: {self.method}") + return self.hz_to_cents(f0, librosa.midi_to_hz(0)) + + def plot_f0(self, f0): + from matplotlib import pyplot as plt + + plt.figure(figsize=(10, 4)) + plt.plot(f0) + plt.title(self.method) + plt.xlabel("Time (frames)") + plt.ylabel("F0 (cents)") + plt.show() + + 
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """
    Load the first channel of an audio file as a float tensor.

    Args:
        full_path: Path handed to ``sf.read``.
        target_sr: If given and different from the file's rate, the signal is
            resampled to this rate.
        return_empty_on_exception: When true, a failed read or a signal
            containing inf/NaN yields ``([], rate)`` instead of an exception.
            ``rate`` is the file's rate if known, else ``target_sr``, else 48000.

    Returns:
        ``(data, sample_rate)``, with ``data`` a 1-D ``torch`` float tensor.

    Raises:
        Exception: Whatever the read raised, when ``return_empty_on_exception``
            is false.
    """
    # Bound before the read: if sf.read fails, the handler below still has to
    # evaluate sample_rate. Leaving it unbound made the best-effort path raise
    # UnboundLocalError instead of returning the empty result.
    sample_rate = None
    try:
        data, sample_rate = sf.read(full_path, always_2d=True)
    except Exception as error:
        print(f"An error occurred loading {full_path}: {error}")
        if return_empty_on_exception:
            return [], sample_rate or target_sr or 48000
        else:
            raise

    # keep the first channel only
    data = data[:, 0] if len(data.shape) > 1 else data
    assert len(data) > 2

    # Normalize data: pick the divisor from the observed magnitude
    # (32-bit range, 16-bit range, or already within [-1, 1]).
    max_mag = (
        -np.iinfo(data.dtype).min
        if np.issubdtype(data.dtype, np.integer)
        else max(np.amax(data), -np.amin(data))
    )
    max_mag = (
        (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
    )
    data = torch.FloatTensor(data.astype(np.float32)) / max_mag

    # Handle exceptions and resample
    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
        return [], sample_rate or target_sr or 48000
    if target_sr is not None and sample_rate != target_sr:
        data = torch.from_numpy(
            librosa.core.resample(
                data.numpy(), orig_sr=sample_rate, target_sr=target_sr
            )
        )
        sample_rate = target_sr

    return data, sample_rate
// 2 + pad_right = max( + (win_size_new - hop_length_new + 1) // 2, + win_size_new - y.size(-1) - pad_left, + ) + mode = "reflect" if pad_right < y.size(-1) else "constant" + y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_size_new, + window=hann_window[keyshift_key], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) + + # Handle keyshift and mel conversion + if keyshift != 0: + size = n_fft // 2 + 1 + resize = spec.size(1) + spec = ( + F.pad(spec, (0, 0, 0, size - resize)) + if resize < size + else spec[:, :size, :] + ) + spec = spec * win_size / win_size_new + spec = torch.matmul(mel_basis[mel_basis_key], spec) + spec = dynamic_range_compression_torch(spec, clip_val=clip_val) + return spec + + def __call__(self, audiopath): + audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) + spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) + return spect + + +stft = STFT() + + +def softmax_kernel( + data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None +): + b, h, *_ = data.shape + + # Normalize data + data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 + + # Project data + ratio = projection_matrix.shape[0] ** -0.5 + projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h) + projection = projection.type_as(data) + data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection) + + # Calculate diagonal data + diag_data = data**2 + diag_data = torch.sum(diag_data, dim=-1) + diag_data = (diag_data / 2.0) * (data_normalizer**2) + diag_data = diag_data.unsqueeze(dim=-1) + + # Apply softmax + if is_query: + data_dash = ratio * ( + torch.exp( + data_dash + - diag_data + - torch.max(data_dash, dim=-1, keepdim=True).values + ) + + eps 
def exists(val):
    """Tell whether ``val`` is anything other than ``None``."""
    if val is None:
        return False
    return True


def empty(tensor):
    """Tell whether ``tensor`` holds zero elements."""
    return not tensor.numel()


def default(val, d):
    """Return ``val`` unless it is ``None``, in which case return ``d``."""
    if val is None:
        return d
    return val


def cast_tuple(val):
    """Return ``val`` unchanged if it is a tuple, otherwise wrap it in a 1-tuple."""
    if isinstance(val, tuple):
        return val
    return (val,)
def linear_attention(q, k, v):
    """Non-causal linear attention over already kernel-mapped queries and keys.

    When ``v`` is ``None`` the raw query/key similarity (contracted over the
    last axis) is returned. Otherwise keys and values are first reduced to a
    ``d x e`` summary, which is then read out per query position and divided
    by the query's dot product with the summed keys (plus ``1e-8`` to avoid
    division by zero).
    """
    if v is None:
        return torch.einsum("...ed,...nd->...ne", k, q)

    key_totals = k.sum(dim=-2)
    normaliser = torch.einsum("...nd,...d->...n", q, key_totals.type_as(q)) + 1e-8
    inv_normaliser = 1.0 / normaliser
    kv_summary = torch.einsum("...nd,...ne->...de", k, v)
    return torch.einsum("...de,...nd,...n->...ne", kv_summary, q, inv_normaliser)
class FastAttention(nn.Module):
    """Linear-complexity attention using a random-feature softmax kernel.

    A projection matrix is drawn at construction time and stored as the
    ``projection_matrix`` buffer; :meth:`redraw_projection_matrix` replaces
    its contents in place.

    Args:
        dim_heads: Size of each attention head.
        nb_features: Number of random features; defaults to
            ``int(dim_heads * log(dim_heads))``.
        ortho_scaling: Forwarded as ``scaling`` to
            ``gaussian_orthogonal_random_matrix``.
        causal: Request causal attention. No causal kernel is defined in this
            class, so ``forward`` only works with ``causal=True`` if a
            ``causal_linear_fn`` attribute has been attached to the instance.
        generalized_attention: Stored on the instance; not read by ``forward``.
        kernel_fn: Stored on the instance; not read by ``forward``.
        qr_uniform_q: Forwarded to ``gaussian_orthogonal_random_matrix``.
        no_projection: Skip the random-feature kernel and use plain
            softmax/exp feature maps instead.
    """

    def __init__(
        self,
        dim_heads,
        nb_features=None,
        ortho_scaling=0,
        causal=False,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        no_projection=False,
    ):
        super().__init__()
        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))

        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling

        self.create_projection = partial(
            gaussian_orthogonal_random_matrix,
            nb_rows=self.nb_features,
            nb_columns=dim_heads,
            scaling=ortho_scaling,
            qr_uniform_q=qr_uniform_q,
        )
        projection_matrix = self.create_projection()
        self.register_buffer("projection_matrix", projection_matrix)

        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn
        self.no_projection = no_projection
        self.causal = causal

    @torch.no_grad()
    def redraw_projection_matrix(self):
        """Draw a fresh projection and copy it into the existing buffer."""
        projections = self.create_projection()
        self.projection_matrix.copy_(projections)
        del projections

    def forward(self, q, k, v):
        """Apply linear attention to ``q``, ``k`` and (optionally) ``v``.

        Raises:
            NotImplementedError: If the module is causal and no
                ``causal_linear_fn`` has been attached to it.
        """
        # Pick the attention routine first so an unsupported configuration
        # fails before any kernel work is done.
        if self.causal:
            attn_fn = getattr(self, "causal_linear_fn", None)
            if attn_fn is None:
                raise NotImplementedError(
                    "FastAttention was built with causal=True but no "
                    "causal_linear_fn is available"
                )
        else:
            attn_fn = linear_attention

        device = q.device

        if self.no_projection:
            q = q.softmax(dim=-1)
            k = torch.exp(k) if self.causal else k.softmax(dim=-2)
        else:
            create_kernel = partial(
                softmax_kernel, projection_matrix=self.projection_matrix, device=device
            )
            q = create_kernel(q, is_query=True)
            k = create_kernel(k, is_query=False)

        # ``v`` may be None; the attention routine handles that case itself.
        return attn_fn(q, k, v)
class FCPE(nn.Module):
    """Pitch estimator: conv input stack, PCmer encoder and a per-frame cent-bin head.

    The head produces ``out_dims`` sigmoid activations per frame. At inference
    they are decoded to a cent value with one of the two decoders below and
    converted to F0; in training mode a scaled binary-cross-entropy loss
    against a Gaussian-blurred target is returned instead.

    Every loss/range/threshold argument may be passed as ``None``, in which
    case the default from the signature is used.

    Raises:
        ValueError: If ``use_siren`` or ``use_full`` is ``True``.
    """

    def __init__(
        self,
        input_channel=128,
        out_dims=360,
        n_layers=12,
        n_chans=512,
        use_siren=False,
        use_full=False,
        loss_mse_scale=10,
        loss_l2_regularization=False,
        loss_l2_regularization_scale=1,
        loss_grad1_mse=False,
        loss_grad1_mse_scale=1,
        f0_max=1975.5,
        f0_min=32.70,
        confidence=False,
        threshold=0.05,
        use_input_conv=True,
    ):
        super().__init__()
        if use_siren is True:
            raise ValueError("Siren is not supported yet.")
        if use_full is True:
            raise ValueError("Full model is not supported yet.")

        self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
        self.loss_l2_regularization = (
            loss_l2_regularization if (loss_l2_regularization is not None) else False
        )
        self.loss_l2_regularization_scale = (
            loss_l2_regularization_scale
            if (loss_l2_regularization_scale is not None)
            else 1
        )
        self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
        self.loss_grad1_mse_scale = (
            loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
        )
        self.f0_max = f0_max if (f0_max is not None) else 1975.5
        self.f0_min = f0_min if (f0_min is not None) else 32.70
        self.confidence = confidence if (confidence is not None) else False
        self.threshold = threshold if (threshold is not None) else 0.05
        self.use_input_conv = use_input_conv if (use_input_conv is not None) else True

        # Build the bin centres from the None-resolved range so that a config
        # without f0_min/f0_max falls back to the defaults instead of crashing.
        self.cent_table_b = torch.Tensor(
            np.linspace(
                self.f0_to_cent(torch.Tensor([self.f0_min]))[0],
                self.f0_to_cent(torch.Tensor([self.f0_max]))[0],
                out_dims,
            )
        )
        self.register_buffer("cent_table", self.cent_table_b)

        # conv in stack
        _leaky = nn.LeakyReLU()
        self.stack = nn.Sequential(
            nn.Conv1d(input_channel, n_chans, 3, 1, 1),
            nn.GroupNorm(4, n_chans),
            _leaky,
            nn.Conv1d(n_chans, n_chans, 3, 1, 1),
        )

        # transformer
        self.decoder = PCmer(
            num_layers=n_layers,
            num_heads=8,
            dim_model=n_chans,
            dim_keys=n_chans,
            dim_values=n_chans,
            residual_dropout=0.1,
            attention_dropout=0.1,
        )
        self.norm = nn.LayerNorm(n_chans)

        # out
        self.n_out = out_dims
        self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))

    def forward(
        self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
    ):
        """Run the network.

        Args:
            mel: Input features; transposed on axes 1 and 2 around the conv
                stack when ``use_input_conv`` is set.
            infer: If true, decode to F0; otherwise return the training loss.
            gt_f0: Ground-truth F0, required when ``infer`` is false.
            return_hz_f0: If false, the decoded F0 is mapped through
                ``log(1 + f0 / 700)`` before being returned.
            cdecoder: ``"argmax"`` or ``"local_argmax"``.

        Raises:
            ValueError: If ``infer`` is true and ``cdecoder`` is not one of
                the two supported names.
        """
        decoders = {
            "argmax": self.cents_decoder,
            "local_argmax": self.cents_local_decoder,
        }
        if cdecoder in decoders:
            self.cdecoder = decoders[cdecoder]
        elif infer:
            # Previously this silently reused a stale decoder or failed with
            # AttributeError further down.
            raise ValueError(
                f"Unknown cdecoder {cdecoder!r}; expected 'argmax' or 'local_argmax'"
            )

        x = (
            self.stack(mel.transpose(1, 2)).transpose(1, 2)
            if self.use_input_conv
            else mel
        )
        x = self.decoder(x)
        x = self.norm(x)
        x = self.dense_out(x)
        x = torch.sigmoid(x)

        if not infer:
            gt_cent_f0 = self.f0_to_cent(gt_f0)
            gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
            loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
            if self.loss_l2_regularization:
                loss_all = loss_all + l2_regularization(
                    model=self, l2_alpha=self.loss_l2_regularization_scale
                )
            x = loss_all
        if infer:
            # NOTE(review): with confidence=True the decoder returns a tuple,
            # which cent_to_f0 cannot consume — confirm the intended return
            # shape before enabling confidence at inference.
            x = self.cdecoder(x)
            x = self.cent_to_f0(x)
            x = (1 + x / 700).log() if not return_hz_f0 else x

        return x

    def cents_decoder(self, y, mask=True):
        """Decode cents as the activation-weighted mean over all bins.

        With ``mask`` set, frames whose peak activation is at or below
        ``self.threshold`` are multiplied by ``-inf``. Returns
        ``(cents, peak)`` when ``self.confidence`` is set, else ``cents``.
        """
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
            y, dim=-1, keepdim=True
        )
        # Computed unconditionally: it is also part of the return value when
        # confidence is requested with mask=False.
        confident = torch.max(y, dim=-1, keepdim=True)[0]
        if mask:
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        return (rtn, confident) if self.confidence else rtn

    def cents_local_decoder(self, y, mask=True):
        """Decode cents as the weighted mean over the 9 bins around the peak.

        Bin indices are clamped to the valid range, so a peak near either edge
        reuses the edge bin. Masking and the return value follow
        :meth:`cents_decoder`.
        """
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        confident, max_index = torch.max(y, dim=-1, keepdim=True)
        local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
        local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
        ci_l = torch.gather(ci, -1, local_argmax_index)
        y_l = torch.gather(y, -1, local_argmax_index)
        rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
            y_l, dim=-1, keepdim=True
        )
        if mask:
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        return (rtn, confident) if self.confidence else rtn

    def cent_to_f0(self, cent):
        """Convert cents (relative to 10 Hz) to frequency."""
        return 10.0 * 2 ** (cent / 1200.0)

    def f0_to_cent(self, f0):
        """Convert frequency to cents relative to 10 Hz."""
        return 1200.0 * torch.log2(f0 / 10.0)

    def gaussian_blurred_cent(self, cents):
        """Build a Gaussian target over the cent bins, zeroed for out-of-range frames."""
        mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
        B, N, _ = cents.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
+ n_layers=self.args.model.n_layers, + n_chans=self.args.model.n_chans, + use_siren=self.args.model.use_siren, + use_full=self.args.model.use_full, + loss_mse_scale=self.args.loss.loss_mse_scale, + loss_l2_regularization=self.args.loss.loss_l2_regularization, + loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, + loss_grad1_mse=self.args.loss.loss_grad1_mse, + loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, + f0_max=self.args.model.f0_max, + f0_min=self.args.model.f0_min, + confidence=self.args.model.confidence, + ) + model.to(self.device).to(self.dtype) + model.load_state_dict(ckpt["model"]) + model.eval() + self.model = model + self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) + + @torch.no_grad() + def __call__(self, audio, sr, threshold=0.05): + self.model.threshold = threshold + audio = audio[None, :] + mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) + f0 = self.model(mel=mel, infer=True, return_hz_f0=True) + return f0 + + +class Wav2Mel: + def __init__(self, args, device=None, dtype=torch.float32): + self.sample_rate = args.mel.sampling_rate + self.hop_size = args.mel.hop_size + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + self.dtype = dtype + self.stft = STFT( + args.mel.sampling_rate, + args.mel.num_mels, + args.mel.n_fft, + args.mel.win_size, + args.mel.hop_size, + args.mel.fmin, + args.mel.fmax, + ) + self.resample_kernel = {} + + def extract_nvstft(self, audio, keyshift=0, train=False): + mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) + return mel + + def extract_mel(self, audio, sample_rate, keyshift=0, train=False): + audio = audio.to(self.dtype).to(self.device) + if sample_rate == self.sample_rate: + audio_res = audio + else: + key_str = str(sample_rate) + if key_str not in self.resample_kernel: + self.resample_kernel[key_str] = Resample( + sample_rate, self.sample_rate, lowpass_filter_width=128 
class DotDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Reading a missing key as an attribute yields ``None`` rather than raising.
    A value that is exactly a ``dict`` is wrapped in a new ``DotDict`` on
    access, so nested configuration can be chained (``cfg.model.n_layers``).
    """

    def __getattr__(self, name):
        # Double-underscore names are probes from Python protocols (copy,
        # pickle, ...) looking for optional hooks. Answering None makes those
        # protocols try to call None; raising AttributeError tells them the
        # hook is absent, which is what they expect.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        val = dict.get(self, name)
        # Exact type check on purpose: an existing DotDict (or other dict
        # subclass) is returned as-is instead of being copied.
        return DotDict(val) if type(val) is dict else val

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results + + def post_process(self, x, sample_rate, f0, pad_to): + f0 = ( + torch.from_numpy(f0).float().to(x.device) + if isinstance(f0, np.ndarray) + else f0 + ) + f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0 + + vuv_vector = torch.zeros_like(f0) + vuv_vector[f0 > 0.0] = 1.0 + vuv_vector[f0 <= 0.0] = 0.0 + + nzindex = torch.nonzero(f0).squeeze() + f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() + time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() + time_frame = np.arange(pad_to) * self.hop_length / sample_rate + + vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] + + if f0.shape[0] <= 0: + return np.zeros(pad_to), vuv_vector.cpu().numpy() + if f0.shape[0] == 1: + return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy() + + f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) + return f0, vuv_vector.cpu().numpy() + + def compute_f0(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len) + return self.post_process(x, self.sample_rate, f0, p_len)[0] + + def compute_f0_uv(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len), ( + f0.cpu().numpy() if p_len is None else np.zeros(p_len) + ) + return self.post_process(x, self.sample_rate, f0, p_len) diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py new file mode 100644 index 
class ConvBlockRes(nn.Module):
    """
    Two 3x3 conv / batch-norm / ReLU stages with a residual connection.

    When the channel count changes, the skip path goes through a 1x1
    convolution; otherwise the input is added back unchanged.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()
        stages = []
        # Same layer order as a hand-written Sequential, so parameter names
        # (conv.0, conv.1, conv.3, conv.4) are unchanged.
        for stage_in in (in_channels, out_channels):
            stages.append(
                nn.Conv2d(
                    in_channels=stage_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                )
            )
            stages.append(nn.BatchNorm2d(out_channels, momentum=momentum))
            stages.append(nn.ReLU())
        self.conv = nn.Sequential(*stages)

        self.is_shortcut = in_channels != out_channels
        if self.is_shortcut:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x):
        main = self.conv(x)
        skip = self.shortcut(x) if self.is_shortcut else x
        return main + skip
class Encoder(nn.Module):
    """
    The encoder part of the DeepUnet.

    A batch norm followed by ``n_encoders`` residual encoder blocks. The
    channel width doubles and the tracked size halves after every block.

    Args:
        in_channels (int): Number of input channels.
        in_size (int): Size of the input tensor.
        n_encoders (int): Number of encoder blocks.
        kernel_size (tuple): Size of the average pooling kernel.
        n_blocks (int): Number of convolutional blocks in each encoder block.
        out_channels (int): Number of output channels for the first encoder block.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        width_in, width_out, size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            block = ResEncoderBlock(
                width_in, width_out, kernel_size, n_blocks, momentum=momentum
            )
            self.layers.append(block)
            self.latent_channels.append([width_out, size])
            width_in = width_out
            width_out = width_out * 2
            size = size // 2

        # Values after the final doubling/halving step.
        self.out_size = size
        self.out_channel = width_out

    def forward(self, x: torch.Tensor):
        """Return the pooled output and the list of pre-pooling skip tensors."""
        skips: List[torch.Tensor] = []
        x = self.bn(x)
        for block in self.layers:
            skip, x = block(x)
            skips.append(skip)
        return x, skips
class Decoder(nn.Module):
    """
    The decoder part of the DeepUnet.

    A chain of ``n_decoders`` residual decoder blocks, each halving the
    channel count and consuming one skip tensor, last skip first.

    Args:
        in_channels (int): Number of input channels.
        n_decoders (int): Number of decoder blocks.
        stride (tuple): Stride for transposed convolution.
        n_blocks (int): Number of convolutional blocks in each decoder block.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders

        width = in_channels
        for _ in range(self.n_decoders):
            narrower = width // 2
            self.layers.append(
                ResDecoderBlock(width, narrower, stride, n_blocks, momentum)
            )
            width = narrower

    def forward(self, x, concat_tensors):
        # Block number `depth` (1-based) pairs with the depth-th skip from the end.
        for depth, block in enumerate(self.layers, start=1):
            x = block(x, concat_tensors[-depth])
        return x
+ + Args: + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in each encoder/decoder block. + en_de_layers (int): Number of encoder/decoder layers. + inter_layers (int): Number of convolutional blocks in the intermediate layer. + in_channels (int): Number of input channels. + en_out_channels (int): Number of output channels for the first encoder block. + """ + + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + """ + The end-to-end model. + + Args: + n_blocks (int): Number of convolutional blocks in each encoder/decoder block. + n_gru (int): Number of GRU layers. + kernel_size (tuple): Size of the average pooling kernel. + en_de_layers (int): Number of encoder/decoder layers. + inter_layers (int): Number of convolutional blocks in the intermediate layer. + in_channels (int): Number of input channels. + en_out_channels (int): Number of output channels for the first encoder block. 
+ """ + + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, N_CLASS), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + return x + + +class MelSpectrogram(torch.nn.Module): + """ + Extracts Mel-spectrogram features from audio. + + Args: + n_mel_channels (int): Number of Mel-frequency bands. + sample_rate (int): Sampling rate of the audio. + win_length (int): Length of the window function in samples. + hop_length (int): Hop size between frames in samples. + n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length. + mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0. + mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None. + clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5. 
+ """ + + def __init__( + self, + n_mel_channels, + sample_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sample_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sample_rate = sample_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +class RMVPE0Predictor: + """ + A predictor for fundamental frequency (F0) based on the RMVPE0 model. + + Args: + model_path (str): Path to the RMVPE0 model file. + device (str, optional): Device to use for computation. 
class RMVPE0Predictor:
    """
    A predictor for fundamental frequency (F0) based on the RMVPE0 model.

    Args:
        model_path (str): Path to the RMVPE0 model file.
        device (str, optional): Device to use for computation. Defaults to None,
            which uses CUDA if available and the CPU otherwise.
    """

    def __init__(self, model_path, device=None):
        # Honour the documented default instead of passing None through to
        # ``.to()``, which would silently keep everything on the CPU.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.resample_kernel = {}

        model = E2E(4, 1, (2, 2))
        ckpt = torch.load(model_path, map_location="cpu", weights_only=True)
        model.load_state_dict(ckpt)
        model.eval()
        self.model = model.to(device)
        self.mel_extractor = MelSpectrogram(
            N_MELS, 16000, 1024, 160, None, 30, 8000
        ).to(device)

        # Cents value of every output class, zero-padded by 4 on both sides so
        # the 9-bin window used in to_local_average_cents() never goes out of
        # range.
        cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    def mel2hidden(self, mel, chunk_size=32000):
        """
        Converts Mel-spectrogram features to hidden representation.

        The frame axis is padded up to a multiple of 32, the model is run on
        consecutive chunks of ``chunk_size`` frames, and the concatenated
        output is trimmed back to the original number of frames.

        Args:
            mel (torch.Tensor): Mel-spectrogram features, frames on the last axis.
            chunk_size (int, optional): Frames per model call; must be a
                positive multiple of 32. Defaults to 32000.

        Raises:
            ValueError: If ``chunk_size`` is not a positive multiple of 32.
        """
        # Validate up front with a real exception; an ``assert`` would vanish
        # under ``python -O``.
        if chunk_size <= 0 or chunk_size % 32 != 0:
            raise ValueError("chunk_size must be divisible by 32")

        with torch.no_grad():
            n_frames = mel.shape[-1]
            pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            # Reflect padding requires the pad to be smaller than the padded
            # dimension; very short inputs fall back to constant padding.
            mode = "reflect" if pad < n_frames else "constant"
            mel = F.pad(mel, (0, pad), mode=mode)

            padded_frames = mel.shape[-1]
            output_chunks = [
                self.model(mel[..., start : start + chunk_size])
                for start in range(0, padded_frames, chunk_size)
            ]
            hidden = torch.cat(output_chunks, dim=1)
        return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """
        Decodes hidden representation to F0.

        Args:
            hidden (np.ndarray): Hidden representation, one row per frame.
            thred (float, optional): Threshold for salience. Defaults to 0.03.

        Returns:
            np.ndarray: F0 per frame, 0 where the frame was below the threshold.
        """
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # 0 cents maps to exactly 10, which marks "no pitch".
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """
        Infers F0 from audio.

        Args:
            audio (np.ndarray): Audio signal.
            thred (float, optional): Threshold for salience. Defaults to 0.03.
        """
        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
        mel = self.mel_extractor(audio, center=True)
        del audio
        with torch.no_grad():
            torch.cuda.empty_cache()
            hidden = self.mel2hidden(mel)
        hidden = hidden.squeeze(0).cpu().numpy()
        return self.decode(hidden, thred=thred)

    def to_local_average_cents(self, salience, thred=0.05):
        """
        Converts salience to local average cents.

        For every frame the salience-weighted mean of the cents mapping is
        taken over the 9 bins centred on the strongest bin.

        Args:
            salience (np.ndarray): Salience values, shape (frames, classes).
            thred (float, optional): Threshold for salience. Defaults to 0.05.

        Returns:
            np.ndarray: Cents per frame; 0 where the peak salience is at or
            below ``thred``. Zero frames in gives an empty array back.
        """
        center = np.argmax(salience, axis=1)
        padded = np.pad(salience, ((0, 0), (4, 4)))
        # In padded coordinates the peak sits at center + 4, so the window
        # [peak - 4, peak + 4] is simply center + 0..8.
        window = center[:, None] + np.arange(9)
        local_salience = np.take_along_axis(padded, window, axis=1)
        local_cents = self.cents_mapping[window]

        product_sum = np.sum(local_salience * local_cents, 1)
        weight_sum = np.sum(local_salience, 1)
        # All-zero frames divide 0 by 0; they are zeroed by the threshold
        # below, so the warning is just noise.
        with np.errstate(divide="ignore", invalid="ignore"):
            cents = product_sum / weight_sum
        cents[np.max(padded, axis=1) <= thred] = 0
        return cents
class CREPE:
    """
    F0 extractor that delegates to ``torchcrepe.predict``.

    Args:
        device: Device the audio is moved to and on which torchcrepe runs.
        sample_rate (int, optional): Sampling rate of the input audio. Defaults to 16000.
        hop_size (int, optional): Hop between F0 frames in samples. Defaults to 160.
    """

    def __init__(self, device, sample_rate=16000, hop_size=160):
        self.device = device
        self.sample_rate = sample_rate
        self.hop_size = hop_size

    def get_f0(self, x, f0_min=50, f0_max=1100, p_len=None, model="full"):
        """
        Estimate F0 for a mono signal.

        Args:
            x (np.ndarray | torch.Tensor): One-dimensional audio signal.
            f0_min (float, optional): Lower pitch bound passed to torchcrepe. Defaults to 50.
            f0_max (float, optional): Upper pitch bound passed to torchcrepe. Defaults to 1100.
            p_len (int, optional): Accepted for signature compatibility with the
                other predictors; it does not influence the result.
            model (str, optional): torchcrepe model name. Defaults to "full".

        Returns:
            np.ndarray: F0 per frame, 0 where periodicity is below 0.1.
        """
        if not torch.is_tensor(x):
            x = torch.from_numpy(x)

        f0, pd = torchcrepe.predict(
            x.float().to(self.device).unsqueeze(dim=0),
            self.sample_rate,
            self.hop_size,
            f0_min,
            f0_max,
            model=model,
            batch_size=512,
            device=self.device,
            return_periodicity=True,
        )
        # Smooth both tracks over 3 frames, then silence low-periodicity frames.
        pd = torchcrepe.filter.median(pd, 3)
        f0 = torchcrepe.filter.mean(f0, 3)
        f0[pd < 0.1] = 0
        return f0[0].cpu().numpy()
class SWIFT:
    """
    F0 extractor built on ``SwiftF0``.

    Args:
        device: Accepted for signature parity with the other predictors; the
            instance always records "cpu".
        sample_rate (int, optional): Sampling rate of the input audio. Defaults to 16000.
        hop_size (int, optional): Hop between output frames in samples. Defaults to 160.
    """

    def __init__(self, device, sample_rate=16000, hop_size=160):
        self.device = "cpu"
        self.sample_rate = sample_rate
        self.hop_size = hop_size

    def get_f0(self, x, f0_min=50, f0_max=1100, p_len=None, confidence_threshold=0.9):
        """
        Estimate F0 and resample it onto the hop grid.

        Args:
            x (np.ndarray | torch.Tensor): One-dimensional audio signal.
            f0_min (float, optional): Lower pitch bound, raised to at least 46.875.
            f0_max (float, optional): Upper pitch bound, lowered to at most 2093.75.
            p_len (int, optional): Number of output frames. Defaults to
                ``len(x) // hop_size``.
            confidence_threshold (float, optional): Forwarded to ``SwiftF0``. Defaults to 0.9.

        Returns:
            np.ndarray: ``p_len`` F0 values; all zeros when the detector
            returns no timestamps.
        """
        samples = x.cpu().numpy() if torch.is_tensor(x) else x
        n_frames = samples.shape[0] // self.hop_size if p_len is None else p_len

        detector = SwiftF0(
            fmin=max(f0_min, 46.875),
            fmax=min(f0_max, 2093.75),
            confidence_threshold=confidence_threshold,
        )
        result = detector.detect_from_array(samples, self.sample_rate)
        if len(result.timestamps) == 0:
            return np.zeros(n_frames)

        # Centre time (seconds) of every output frame.
        frame_times = (
            np.arange(n_frames) * self.hop_size + self.hop_size / 2
        ) / self.sample_rate
        pitch = np.nan_to_num(result.pitch_hz, nan=0.0)
        pitch[~result.voicing] = 0.0
        return np.interp(frame_times, result.timestamps, pitch, left=0.0, right=0.0)
def calculate_features(y, sr):
    """
    Compute the magnitude STFT and the spectral summaries used by the plots.

    Returns:
        tuple: (stft magnitude, duration in seconds, spectral centroid,
        spectral bandwidth, spectral rolloff); the three features are the
        first row of the corresponding librosa result.
    """
    stft = np.abs(librosa.stft(y))
    duration = librosa.get_duration(y=y, sr=sr)
    cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
    bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
    rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
    return stft, duration, cent, bw, rolloff


def plot_title(title):
    """Set the figure-level title."""
    plt.suptitle(title, fontsize=16, fontweight="bold")


def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
    """Draw the dB-scaled spectrogram in the first of three stacked panels."""
    plt.subplot(3, 1, 1)
    plt.imshow(
        librosa.amplitude_to_db(stft, ref=np.max),
        origin="lower",
        # The STFT rows span 0..sr/2 (Nyquist), so the axis must end at
        # sr / 2 expressed in kHz, not at sr.
        extent=[0, duration, 0, sr / 2 / 1000],
        aspect="auto",
        cmap=cmap,
    )
    plt.colorbar(format="%+2.0f dB")
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    plt.title("Spectrogram")


def plot_waveform(y, sr, duration):
    """Draw the waveform in the second of three stacked panels."""
    plt.subplot(3, 1, 2)
    librosa.display.waveshow(y, sr=sr)
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.title("Waveform")


def plot_features(times, cent, bw, rolloff, duration):
    """
    Draw the three spectral features in the third of three stacked panels.

    The legend labels state kHz, so callers must pass values in kHz.
    """
    plt.subplot(3, 1, 3)
    plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b")
    plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g")
    plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r")
    plt.xlabel("Time (s)")
    plt.title("Spectral Features")
    plt.legend()


def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
    """
    Plot a spectrogram, waveform and spectral features for an audio file.

    Args:
        audio_file (str): Path of the audio file to load with librosa.
        save_plot_path (str, optional): Where to save the figure; a falsy
            value skips saving.

    Returns:
        tuple: (multi-line text summary, ``save_plot_path``).
    """
    y, sr = librosa.load(audio_file)
    stft, duration, cent, bw, rolloff = calculate_features(y, sr)

    plt.figure(figsize=(12, 10))

    plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
    plot_spectrogram(y, sr, stft, duration)
    plot_waveform(y, sr, duration)
    # librosa reports these features in Hz; convert so the values match the
    # kHz legend, and use the real sample rate for the time axis.
    plot_features(
        librosa.times_like(cent, sr=sr),
        cent / 1000,
        bw / 1000,
        rolloff / 1000,
        duration,
    )

    plt.tight_layout()

    if save_plot_path:
        plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
        plt.close()

    duration_text = (
        str(round(duration, 2)) + " seconds"
        if duration < 60
        else str(round(duration / 60, 2)) + " minutes"
    )
    channels_text = "Mono (1)" if y.ndim == 1 else "Stereo (2)"
    # librosa.get_samplerate() returns the file's own sampling rate, so it is
    # labelled as such rather than as a bit depth.
    audio_info = (
        f"Sample Rate: {sr}\n"
        f"Duration: {duration_text}\n"
        f"Number of Samples: {len(y)}\n"
        f"Native Sample Rate: {librosa.get_samplerate(audio_file)}\n"
        f"Channels: {channels_text}"
    )

    return audio_info, save_plot_path

def _create_session(
    proxy: Optional[str] = None,
    use_cookies: bool = True,
    return_cookies_file: bool = False,
):
    """Build a requests session, optionally proxied and pre-loaded with saved cookies.

    Args:
        proxy: Proxy URL applied to both http and https traffic.
        use_cookies: Load cookies from the cache file when it exists.
        return_cookies_file: Also return the path of the cookie cache file.

    Returns:
        The session, or ``(session, cookies_file)`` when
        ``return_cookies_file`` is true.
    """
    session = requests.session()
    session.headers.update(
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}
    )
    if proxy:
        session.proxies = {"http": proxy, "https": proxy}

    cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
    if use_cookies and os.path.exists(cookies_file):
        try:
            with open(cookies_file) as handle:
                stored = json.load(handle)
            # The cache is a list of (name, value) pairs.
            for name, value in stored:
                session.cookies[name] = value
        except json.JSONDecodeError:
            warnings.warn("Corrupted Cookies file")

    if return_cookies_file:
        return session, cookies_file
    return session
+ """ + if not (id is None) ^ (url is None): + raise ValueError("Either url or id has to be specified") + + if id is not None: + url = f"https://drive.google.com/uc?id={id}" + + url_origin = url + sess, cookies_file = _create_session( + proxy=proxy, use_cookies=use_cookies, return_cookies_file=True + ) + + while True: + res = sess.get(url, stream=True, verify=verify) + res.raise_for_status() + + if url == url_origin and res.status_code == 500: + url = f"https://drive.google.com/open?id={id}" + continue + + if res.headers.get("Content-Type", "").startswith("text/html"): + title_match = re.search("(.+)", res.text) + if title_match: + title = title_match.group(1) + if title.endswith(" - Google Docs"): + url = f"https://docs.google.com/document/d/{id}/export?format={'docx' if format is None else format}" + continue + if title.endswith(" - Google Sheets"): + url = f"https://docs.google.com/spreadsheets/d/{id}/export?format={'xlsx' if format is None else format}" + continue + if title.endswith(" - Google Slides"): + url = f"https://docs.google.com/presentation/d/{id}/export?format={'pptx' if format is None else format}" + continue + if ( + "Content-Disposition" in res.headers + and res.headers["Content-Disposition"].endswith("pptx") + and format not in (None, "pptx") + ): + url = f"https://docs.google.com/presentation/d/{id}/export?format={'pptx' if format is None else format}" + continue + + if use_cookies: + os.makedirs(os.path.dirname(cookies_file), exist_ok=True) + cookies = [ + (k, v) + for k, v in sess.cookies.items() + if not k.startswith("download_warning_") + ] + with open(cookies_file, "w") as f: + json.dump(cookies, f, indent=2) + + if "Content-Disposition" in res.headers: + break + + parsed_url = urlparse(url) + is_gdrive = parsed_url.hostname in ("drive.google.com", "docs.google.com") + is_download_link = parsed_url.path.endswith("/uc") + + if not (is_gdrive and is_download_link and fuzzy): + break + + try: + url = 
_extract_download_url_from_confirmation(res.text, url_origin) + except FileURLRetrievalError as e: + raise FileURLRetrievalError(e) + + content_disposition = res.headers.get("Content-Disposition", "") + filename_match = re.search( + r"filename\*=UTF-8''(.*)", content_disposition + ) or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition) + filename_from_url = ( + unquote(filename_match.group(1)) if filename_match else os.path.basename(url) + ) + download_path = output or filename_from_url + + if isinstance(download_path, str) and download_path.endswith(os.path.sep): + os.makedirs(download_path, exist_ok=True) + download_path = os.path.join(download_path, filename_from_url) + + temp_dir = os.path.dirname(download_path) or "." + prefix = os.path.basename(download_path) + + if isinstance(download_path, str): + existing_tmp_files = [ + os.path.join(temp_dir, file) + for file in os.listdir(temp_dir) + if file.startswith(prefix) + ] + if resume and existing_tmp_files: + if len(existing_tmp_files) > 1: + print( + "There are multiple temporary files to resume:", + file=sys.stderr, + ) + for file in existing_tmp_files: + print(f"\t{file}", file=sys.stderr) + print( + "Please remove them except one to resume downloading.", + file=sys.stderr, + ) + return None + temp_file_path = existing_tmp_files[0] + else: + resume = False + temp_file_path = tempfile.mktemp( + suffix=tempfile.template, prefix=prefix, dir=temp_dir + ) + + try: + file_obj: IO = open(temp_file_path, "ab") + except Exception as e: + print( + f"Could not open the temporary file {temp_file_path}: {e}", + file=sys.stderr, + ) + return None + else: + temp_file_path = None + file_obj = download_path + + if temp_file_path is not None and file_obj.tell() != 0: + headers = {"Range": f"bytes={file_obj.tell()}-"} + res = sess.get(url, headers=headers, stream=True, verify=verify) + res.raise_for_status() + + try: + total = int(res.headers.get("Content-Length", 0)) + if total > 0: + if not quiet: + pbar = tqdm( + 
def launch_tensorboard_pipeline():
    """
    Start TensorBoard on ``log_path`` and keep the process alive.

    Quietens the "root" and "tensorboard" loggers, launches the server,
    prints its URL with a set of pinned scalar cards, then sleeps forever in
    10-minute steps so the caller never returns.
    """
    for logger_name in ("root", "tensorboard"):
        logging.getLogger(logger_name).setLevel(logging.WARNING)

    board = program.TensorBoard()
    board.configure(argv=[None, "--logdir", log_path])
    url = board.launch()

    print(
        f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
    )

    # Block so the TensorBoard server keeps running.
    while True:
        time.sleep(600)
def extract_google_drive_id(url):
    """
    Pull the file id out of a Google Drive URL.

    Two URL shapes are recognised, checked in this order: ``.../file/d/<id>/...``
    and ``...id=<id>&...``. Returns None when neither marker is present.
    """
    # (marker that precedes the id, character that ends it)
    shapes = (("file/d/", "/"), ("id=", "&"))
    for marker, terminator in shapes:
        if marker in url:
            return url.split(marker)[1].split(terminator)[0]
    return None
def save_response_content(response):
    """
    Stream a download response into ``zips_path``.

    The file name is taken from the Content-Disposition header: a quoted
    ``filename="..."`` first, then an unquoted ``filename=...``. When the
    header is missing or carries no usable name the file is saved as
    "downloaded_file".

    Args:
        response: A streamed ``requests`` response.
    """
    content_disposition = unquote(response.headers.get("Content-Disposition", ""))
    # A header without a quoted filename used to crash on ``None.groups()``.
    match = re.search(r'filename="([^"]+)"', content_disposition) or re.search(
        r'filename=([^;"]+)', content_disposition
    )
    file_name = match.group(1).strip() if match else ""
    # The name comes from the server: neutralise every path separator, not
    # only the local one, so it cannot escape zips_path.
    for separator in {"/", "\\", os.path.sep}:
        file_name = file_name.replace(separator, "_")
    if not file_name:
        file_name = "downloaded_file"

    total_size = int(response.headers.get("Content-Length", 0))
    chunk_size = 1024

    with open(os.path.join(zips_path, file_name), "wb") as file, tqdm(
        total=total_size, unit="B", unit_scale=True, desc=file_name
    ) as progress_bar:
        for data in response.iter_content(chunk_size):
            file.write(data)
            progress_bar.update(len(data))
def clean_extracted_files(extract_folder_path, model_name):
    """
    Tidy an extracted model folder in place.

    Steps, in order: delete a ``__MACOSX`` folder if present; if exactly one
    sub-folder remains, move its contents up one level and remove it; rename
    entries whose name contains ".pth" or ".index" to
    ``<model_name>.pth`` / ``<model_name>.index`` unless that target exists.
    """
    def inside(name):
        return os.path.join(extract_folder_path, name)

    if os.path.exists(inside("__MACOSX")):
        shutil.rmtree(inside("__MACOSX"))

    subfolders = [
        name for name in os.listdir(extract_folder_path) if os.path.isdir(inside(name))
    ]
    if len(subfolders) == 1:
        # Flatten a single wrapping directory.
        wrapper = inside(subfolders[0])
        for entry in os.listdir(wrapper):
            shutil.move(os.path.join(wrapper, entry), inside(entry))
        os.rmdir(wrapper)

    for entry in os.listdir(extract_folder_path):
        # ".pth" takes precedence over ".index" when both occur in a name.
        if ".pth" in entry:
            extension = ".pth"
        elif ".index" in entry:
            extension = ".index"
        else:
            continue
        target = inside(model_name + extension)
        if not os.path.exists(target):
            os.rename(inside(entry), target)
def get_file_size_if_missing(file_list):
    """
    Calculate the total size of files to be downloaded only if they do not exist locally.

    Args:
        file_list: Iterable of ``(remote_folder, [file names])`` pairs.

    Returns:
        int: Sum of the ``content-length`` values reported for every file that
        is not already present at its mapped local path (0 when the header is
        absent).
    """
    total_size = 0
    for remote_folder, files in file_list:
        local_folder = folder_mapping_list.get(remote_folder, "")
        for file in files:
            if os.path.exists(os.path.join(local_folder, file)):
                continue
            url = f"{url_base}/{remote_folder}{file}"
            # requests.head() does not follow redirects by default, so a
            # redirected download would report the redirect's length instead
            # of the file's.
            response = requests.head(url, allow_redirects=True)
            total_size += int(response.headers.get("content-length", 0))
    return total_size
def split_pretraineds(pretrained_list):
    """
    Partition pretrained file lists by whether the file name starts with "f0".

    Args:
        pretrained_list: Iterable of ``(folder, [file names])`` pairs.

    Returns:
        tuple: ``(f0_list, non_f0_list)`` in the same pair format; a folder is
        left out of a list when it contributes no files to it.
    """
    with_f0, without_f0 = [], []
    for folder, files in pretrained_list:
        matching, remaining = [], []
        for name in files:
            (matching if name.startswith("f0") else remaining).append(name)
        if matching:
            with_f0.append((folder, matching))
        if remaining:
            without_f0.append((folder, remaining))
    return with_f0, without_f0
def pretrained_selector(vocoder, sample_rate):
    """
    Locate the pretrained generator/discriminator pair for a vocoder.

    Files are looked up under ``rvc/models/pretraineds/<vocoder lowercased>``
    as ``f0G<NN>k.pth`` and ``f0D<NN>k.pth``, where ``NN`` is the first two
    characters of ``sample_rate``.

    Returns:
        tuple: ``(path_g, path_d)`` when both files exist, otherwise ``("", "")``.
    """
    folder = os.path.join("rvc", "models", "pretraineds", f"{vocoder.lower()}")
    rate_tag = str(sample_rate)[:2]
    path_g, path_d = (
        os.path.join(folder, f"f0{kind}{rate_tag}k.pth") for kind in ("G", "D")
    )
    if not (os.path.exists(path_g) and os.path.exists(path_d)):
        return "", ""
    return path_g, path_d
def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
    """
    Split an audio signal into its non-silent segments.

    Parameters:
    - audio (np.ndarray): The audio signal to split.
    - sr (int): The sample rate of the input audio (default is 16000).
    - silence_thresh (int): Silence threshold in dB (default -60); its
      negation is passed to librosa as ``top_db``.
    - min_silence_len (int): Analysis frame length in milliseconds (default
      250); the hop is half a frame.

    Returns:
    - list of np.ndarray: The non-silent audio segments.
    - np.ndarray: The (start, end) sample intervals of those segments.
    """
    frame_length = int(min_silence_len / 1000 * sr)
    intervals = librosa.effects.split(
        audio,
        top_db=-silence_thresh,
        frame_length=frame_length,
        hop_length=frame_length // 2,
    )
    segments = [audio[begin:stop] for begin, stop in intervals]
    return segments, intervals
+ """ + merged_audio = np.array([], dtype=audio_segments_new[0].dtype) + sr_ratio = sr_new / sr_orig + + for i, (start, end) in enumerate(intervals): + + start_new = int(start * sr_ratio) + end_new = int(end * sr_ratio) + + original_duration = len(audio_segments_org[i]) / sr_orig + new_duration = len(audio_segments_new[i]) / sr_new + duration_diff = new_duration - original_duration + + silence_samples = int(abs(duration_diff) * sr_new) + silence_compensation = np.zeros( + silence_samples, dtype=audio_segments_new[0].dtype + ) + + if i == 0 and start_new > 0: + initial_silence = np.zeros(start_new, dtype=audio_segments_new[0].dtype) + merged_audio = np.concatenate((merged_audio, initial_silence)) + + if duration_diff > 0: + merged_audio = np.concatenate((merged_audio, silence_compensation)) + + merged_audio = np.concatenate((merged_audio, audio_segments_new[i])) + + if duration_diff < 0: + merged_audio = np.concatenate((merged_audio, silence_compensation)) + + if i < len(intervals) - 1: + next_start_new = int(intervals[i + 1][0] * sr_ratio) + silence_duration = next_start_new - end_new + if silence_duration > 0: + silence = np.zeros(silence_duration, dtype=audio_segments_new[0].dtype) + merged_audio = np.concatenate((merged_audio, silence)) + + return merged_audio diff --git a/rvc/lib/tools/tts.py b/rvc/lib/tools/tts.py new file mode 100644 index 0000000000000000000000000000000000000000..c3252369cb12c10a6c7d11e0e10f754ee78e947f --- /dev/null +++ b/rvc/lib/tools/tts.py @@ -0,0 +1,29 @@ +import sys +import asyncio +import edge_tts +import os + + +async def main(): + # Parse command line arguments + tts_file = str(sys.argv[1]) + text = str(sys.argv[2]) + voice = str(sys.argv[3]) + rate = int(sys.argv[4]) + output_file = str(sys.argv[5]) + + rates = f"+{rate}%" if rate >= 0 else f"{rate}%" + if tts_file and os.path.exists(tts_file): + text = "" + try: + with open(tts_file, "r", encoding="utf-8") as file: + text = file.read() + except UnicodeDecodeError: + with 
open(tts_file, "r") as file: + text = file.read() + await edge_tts.Communicate(text, voice, rate=rates).save(output_file) + # print(f"TTS with {voice} completed. Output TTS file: '{output_file}'") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/rvc/lib/tools/tts_voices.json b/rvc/lib/tools/tts_voices.json new file mode 100644 index 0000000000000000000000000000000000000000..5f5fea686dbcbcf850e37f3e010f92696a7a6258 --- /dev/null +++ b/rvc/lib/tools/tts_voices.json @@ -0,0 +1,5820 @@ +[ + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)", + "ShortName": "af-ZA-AdriNeural", + "Gender": "Female", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Adri Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, WillemNeural)", + "ShortName": "af-ZA-WillemNeural", + "Gender": "Male", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Willem Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, AnilaNeural)", + "ShortName": "sq-AL-AnilaNeural", + "Gender": "Female", + "Locale": "sq-AL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anila Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, IlirNeural)", + "ShortName": "sq-AL-IlirNeural", + "Gender": "Male", + "Locale": "sq-AL", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ilir Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, AmehaNeural)", + "ShortName": "am-ET-AmehaNeural", + "Gender": "Male", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ameha Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, MekdesNeural)", + "ShortName": "am-ET-MekdesNeural", + "Gender": "Female", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mekdes Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, AminaNeural)", + "ShortName": "ar-DZ-AminaNeural", + "Gender": "Female", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amina Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, IsmaelNeural)", + "ShortName": "ar-DZ-IsmaelNeural", + "Gender": "Male", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ismael Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": 
"Microsoft Server Speech Text to Speech Voice (ar-BH, AliNeural)", + "ShortName": "ar-BH-AliNeural", + "Gender": "Male", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ali Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, LailaNeural)", + "ShortName": "ar-BH-LailaNeural", + "Gender": "Female", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laila Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, SalmaNeural)", + "ShortName": "ar-EG-SalmaNeural", + "Gender": "Female", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salma Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, ShakirNeural)", + "ShortName": "ar-EG-ShakirNeural", + "Gender": "Male", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Shakir Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, BasselNeural)", + "ShortName": "ar-IQ-BasselNeural", + "Gender": "Male", + "Locale": "ar-IQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bassel Online (Natural) - Arabic (Iraq)", + 
"Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, RanaNeural)", + "ShortName": "ar-IQ-RanaNeural", + "Gender": "Female", + "Locale": "ar-IQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rana Online (Natural) - Arabic (Iraq)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, SanaNeural)", + "ShortName": "ar-JO-SanaNeural", + "Gender": "Female", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sana Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, TaimNeural)", + "ShortName": "ar-JO-TaimNeural", + "Gender": "Male", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taim Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, FahedNeural)", + "ShortName": "ar-KW-FahedNeural", + "Gender": "Male", + "Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fahed Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, NouraNeural)", + "ShortName": "ar-KW-NouraNeural", + "Gender": "Female", + 
"Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noura Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, LaylaNeural)", + "ShortName": "ar-LB-LaylaNeural", + "Gender": "Female", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Layla Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, RamiNeural)", + "ShortName": "ar-LB-RamiNeural", + "Gender": "Male", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rami Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, ImanNeural)", + "ShortName": "ar-LY-ImanNeural", + "Gender": "Female", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Iman Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, OmarNeural)", + "ShortName": "ar-LY-OmarNeural", + "Gender": "Male", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Omar Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, 
+ { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, JamalNeural)", + "ShortName": "ar-MA-JamalNeural", + "Gender": "Male", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jamal Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, MounaNeural)", + "ShortName": "ar-MA-MounaNeural", + "Gender": "Female", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mouna Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AbdullahNeural)", + "ShortName": "ar-OM-AbdullahNeural", + "Gender": "Male", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abdullah Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AyshaNeural)", + "ShortName": "ar-OM-AyshaNeural", + "Gender": "Female", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aysha Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, AmalNeural)", + "ShortName": "ar-QA-AmalNeural", + "Gender": "Female", + "Locale": "ar-QA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amal Online (Natural) - 
Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, MoazNeural)", + "ShortName": "ar-QA-MoazNeural", + "Gender": "Male", + "Locale": "ar-QA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Moaz Online (Natural) - Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, HamedNeural)", + "ShortName": "ar-SA-HamedNeural", + "Gender": "Male", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamed Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, ZariyahNeural)", + "ShortName": "ar-SA-ZariyahNeural", + "Gender": "Female", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zariyah Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, AmanyNeural)", + "ShortName": "ar-SY-AmanyNeural", + "Gender": "Female", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amany Online (Natural) - Arabic (Syria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, LaithNeural)", + "ShortName": 
"ar-SY-LaithNeural", + "Gender": "Male", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laith Online (Natural) - Arabic (Syria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, HediNeural)", + "ShortName": "ar-TN-HediNeural", + "Gender": "Male", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hedi Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, ReemNeural)", + "ShortName": "ar-TN-ReemNeural", + "Gender": "Female", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Reem Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, FatimaNeural)", + "ShortName": "ar-AE-FatimaNeural", + "Gender": "Female", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fatima Online (Natural) - Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, HamdanNeural)", + "ShortName": "ar-AE-HamdanNeural", + "Gender": "Male", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamdan Online (Natural) - Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + 
"General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, MaryamNeural)", + "ShortName": "ar-YE-MaryamNeural", + "Gender": "Female", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maryam Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, SalehNeural)", + "ShortName": "ar-YE-SalehNeural", + "Gender": "Male", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saleh Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BabekNeural)", + "ShortName": "az-AZ-BabekNeural", + "Gender": "Male", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Babek Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BanuNeural)", + "ShortName": "az-AZ-BanuNeural", + "Gender": "Female", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Banu Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, NabanitaNeural)", + "ShortName": "bn-BD-NabanitaNeural", + "Gender": "Female", + "Locale": "bn-BD", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nabanita Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, PradeepNeural)", + "ShortName": "bn-BD-PradeepNeural", + "Gender": "Male", + "Locale": "bn-BD", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pradeep Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, BashkarNeural)", + "ShortName": "bn-IN-BashkarNeural", + "Gender": "Male", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bashkar Online (Natural) - Bangla (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, TanishaaNeural)", + "ShortName": "bn-IN-TanishaaNeural", + "Gender": "Female", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tanishaa Online (Natural) - Bengali (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, VesnaNeural)", + "ShortName": "bs-BA-VesnaNeural", + "Gender": "Female", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vesna Online (Natural) - Bosnian (Bosnia and Herzegovina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + 
"Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, GoranNeural)", + "ShortName": "bs-BA-GoranNeural", + "Gender": "Male", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Goran Online (Natural) - Bosnian (Bosnia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, BorislavNeural)", + "ShortName": "bg-BG-BorislavNeural", + "Gender": "Male", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Borislav Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, KalinaNeural)", + "ShortName": "bg-BG-KalinaNeural", + "Gender": "Female", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kalina Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, NilarNeural)", + "ShortName": "my-MM-NilarNeural", + "Gender": "Female", + "Locale": "my-MM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nilar Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, ThihaNeural)", + "ShortName": "my-MM-ThihaNeural", + "Gender": "Male", + "Locale": "my-MM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Thiha Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, EnricNeural)", + "ShortName": "ca-ES-EnricNeural", + "Gender": "Male", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Enric Online (Natural) - Catalan", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, JoanaNeural)", + "ShortName": "ca-ES-JoanaNeural", + "Gender": "Female", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joana Online (Natural) - Catalan", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuGaaiNeural)", + "ShortName": "zh-HK-HiuGaaiNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuGaai Online (Natural) - Chinese (Cantonese Traditional)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)", + "ShortName": "zh-HK-HiuMaanNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuMaan Online (Natural) - Chinese (Hong Kong SAR)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text 
to Speech Voice (zh-HK, WanLungNeural)", + "ShortName": "zh-HK-WanLungNeural", + "Gender": "Male", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft WanLung Online (Natural) - Chinese (Hong Kong SAR)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)", + "ShortName": "zh-CN-XiaoxiaoNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Warm" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoyiNeural)", + "ShortName": "zh-CN-XiaoyiNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoyi Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunjianNeural)", + "ShortName": "zh-CN-YunjianNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunjian Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Sports", + " Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiNeural)", + "ShortName": "zh-CN-YunxiNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunxi Online (Natural) - Chinese (Mainland)", + "Status": 
"GA", + "VoiceTag": { + "ContentCategories": [ + "Novel" + ], + "VoicePersonalities": [ + "Lively", + "Sunshine" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiaNeural)", + "ShortName": "zh-CN-YunxiaNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunxia Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunyangNeural)", + "ShortName": "zh-CN-YunyangNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunyang Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News" + ], + "VoicePersonalities": [ + "Professional", + "Reliable" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-liaoning, XiaobeiNeural)", + "ShortName": "zh-CN-liaoning-XiaobeiNeural", + "Gender": "Female", + "Locale": "zh-CN-liaoning", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaobei Online (Natural) - Chinese (Northeastern Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Humorous" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoChenNeural)", + "ShortName": "zh-TW-HsiaoChenNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoChen Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, YunJheNeural)", + 
"ShortName": "zh-TW-YunJheNeural", + "Gender": "Male", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft YunJhe Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoYuNeural)", + "ShortName": "zh-TW-HsiaoYuNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoYu Online (Natural) - Chinese (Taiwanese Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-shaanxi, XiaoniNeural)", + "ShortName": "zh-CN-shaanxi-XiaoniNeural", + "Gender": "Female", + "Locale": "zh-CN-shaanxi", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoni Online (Natural) - Chinese (Zhongyuan Mandarin Shaanxi)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Bright" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, GabrijelaNeural)", + "ShortName": "hr-HR-GabrijelaNeural", + "Gender": "Female", + "Locale": "hr-HR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gabrijela Online (Natural) - Croatian (Croatia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, SreckoNeural)", + "ShortName": "hr-HR-SreckoNeural", + "Gender": "Male", + "Locale": "hr-HR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Srecko Online (Natural) - Croatian (Croatia)", 
+ "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, AntoninNeural)", + "ShortName": "cs-CZ-AntoninNeural", + "Gender": "Male", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonin Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, VlastaNeural)", + "ShortName": "cs-CZ-VlastaNeural", + "Gender": "Female", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vlasta Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, ChristelNeural)", + "ShortName": "da-DK-ChristelNeural", + "Gender": "Female", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christel Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, JeppeNeural)", + "ShortName": "da-DK-JeppeNeural", + "Gender": "Male", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jeppe Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, ArnaudNeural)", + "ShortName": "nl-BE-ArnaudNeural", 
+ "Gender": "Male", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Arnaud Online (Natural) - Dutch (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, DenaNeural)", + "ShortName": "nl-BE-DenaNeural", + "Gender": "Female", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dena Online (Natural) - Dutch (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, ColetteNeural)", + "ShortName": "nl-NL-ColetteNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colette Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, FennaNeural)", + "ShortName": "nl-NL-FennaNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fenna Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, MaartenNeural)", + "ShortName": "nl-NL-MaartenNeural", + "Gender": "Male", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maarten Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + 
"VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, NatashaNeural)", + "ShortName": "en-AU-NatashaNeural", + "Gender": "Female", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Natasha Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, WilliamNeural)", + "ShortName": "en-AU-WilliamNeural", + "Gender": "Male", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft William Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, ClaraNeural)", + "ShortName": "en-CA-ClaraNeural", + "Gender": "Female", + "Locale": "en-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Clara Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, LiamNeural)", + "ShortName": "en-CA-LiamNeural", + "Gender": "Male", + "Locale": "en-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Liam Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, YanNeural)", + "ShortName": "en-HK-YanNeural", + "Gender": "Female", + "Locale": "en-HK", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yan Online (Natural) - English (Hong Kong SAR)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, SamNeural)", + "ShortName": "en-HK-SamNeural", + "Gender": "Male", + "Locale": "en-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sam Online (Natural) - English (Hongkong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaExpressiveNeural)", + "ShortName": "en-IN-NeerjaExpressiveNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India) (Preview)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaNeural)", + "ShortName": "en-IN-NeerjaNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, PrabhatNeural)", + "ShortName": "en-IN-PrabhatNeural", + "Gender": "Male", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Prabhat Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, ConnorNeural)", + "ShortName": "en-IE-ConnorNeural", + "Gender": "Male", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Connor Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, EmilyNeural)", + "ShortName": "en-IE-EmilyNeural", + "Gender": "Female", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emily Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, AsiliaNeural)", + "ShortName": "en-KE-AsiliaNeural", + "Gender": "Female", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asilia Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, ChilembaNeural)", + "ShortName": "en-KE-ChilembaNeural", + "Gender": "Male", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chilemba Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MitchellNeural)", + "ShortName": "en-NZ-MitchellNeural", + "Gender": "Male", + "Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Mitchell Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MollyNeural)", + "ShortName": "en-NZ-MollyNeural", + "Gender": "Female", + "Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Molly Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, AbeoNeural)", + "ShortName": "en-NG-AbeoNeural", + "Gender": "Male", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abeo Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, EzinneNeural)", + "ShortName": "en-NG-EzinneNeural", + "Gender": "Female", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ezinne Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, JamesNeural)", + "ShortName": "en-PH-JamesNeural", + "Gender": "Male", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft James Online (Natural) - English (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech 
Text to Speech Voice (en-PH, RosaNeural)", + "ShortName": "en-PH-RosaNeural", + "Gender": "Female", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rosa Online (Natural) - English (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, LunaNeural)", + "ShortName": "en-SG-LunaNeural", + "Gender": "Female", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luna Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, WayneNeural)", + "ShortName": "en-SG-WayneNeural", + "Gender": "Male", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Wayne Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaMultilingualNeural)", + "ShortName": "en-US-AvaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AvaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewMultilingualNeural)", + "ShortName": "en-US-AndrewMultilingualNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AndrewMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", + "ShortName": "en-US-EmmaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft EmmaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianMultilingualNeural)", + "ShortName": "en-US-BrianMultilingualNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft BrianMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)", + "ShortName": "en-US-AvaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ava Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewNeural)", + "ShortName": "en-US-AndrewNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrew Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaNeural)", + "ShortName": "en-US-EmmaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emma Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianNeural)", + "ShortName": "en-US-BrianNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Brian Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LeahNeural)", + "ShortName": "en-ZA-LeahNeural", + "Gender": "Female", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leah Online (Natural) - English (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LukeNeural)", + "ShortName": "en-ZA-LukeNeural", + "Gender": "Male", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luke Online (Natural) - English (South Africa)", + "Status": "GA", + "VoiceTag": 
{ + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ElimuNeural)", + "ShortName": "en-TZ-ElimuNeural", + "Gender": "Male", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elimu Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ImaniNeural)", + "ShortName": "en-TZ-ImaniNeural", + "Gender": "Female", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Imani Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, LibbyNeural)", + "ShortName": "en-GB-LibbyNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Libby Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, MaisieNeural)", + "ShortName": "en-GB-MaisieNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maisie Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)", + "ShortName": "en-GB-RyanNeural", + "Gender": "Male", + 
"Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ryan Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, SoniaNeural)", + "ShortName": "en-GB-SoniaNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sonia Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, ThomasNeural)", + "ShortName": "en-GB-ThomasNeural", + "Gender": "Male", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thomas Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AnaNeural)", + "ShortName": "en-US-AnaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ana Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Conversation" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)", + "ShortName": "en-US-AriaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aria Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + 
"VoicePersonalities": [ + "Positive", + "Confident" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, ChristopherNeural)", + "ShortName": "en-US-ChristopherNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christopher Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Reliable", + "Authority" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EricNeural)", + "ShortName": "en-US-EricNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eric Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)", + "ShortName": "en-US-GuyNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Guy Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)", + "ShortName": "en-US-JennyNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jenny Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Considerate", + "Comfort" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, MichelleNeural)", + "ShortName": "en-US-MichelleNeural", + "Gender": "Female", + "Locale": 
"en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Michelle Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Friendly", + "Pleasant" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, RogerNeural)", + "ShortName": "en-US-RogerNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roger Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, SteffanNeural)", + "ShortName": "en-US-SteffanNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Steffan Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, AnuNeural)", + "ShortName": "et-EE-AnuNeural", + "Gender": "Female", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anu Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, KertNeural)", + "ShortName": "et-EE-KertNeural", + "Gender": "Male", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kert Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, AngeloNeural)", + "ShortName": "fil-PH-AngeloNeural", + "Gender": "Male", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Angelo Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, BlessicaNeural)", + "ShortName": "fil-PH-BlessicaNeural", + "Gender": "Female", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Blessica Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, HarriNeural)", + "ShortName": "fi-FI-HarriNeural", + "Gender": "Male", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Harri Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, NooraNeural)", + "ShortName": "fi-FI-NooraNeural", + "Gender": "Female", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noora Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, CharlineNeural)", + "ShortName": "fr-BE-CharlineNeural", + "Gender": "Female", + "Locale": "fr-BE", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Charline Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, GerardNeural)", + "ShortName": "fr-BE-GerardNeural", + "Gender": "Male", + "Locale": "fr-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gerard Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, ThierryNeural)", + "ShortName": "fr-CA-ThierryNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thierry Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, AntoineNeural)", + "ShortName": "fr-CA-AntoineNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antoine Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, JeanNeural)", + "ShortName": "fr-CA-JeanNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jean Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": 
"Microsoft Server Speech Text to Speech Voice (fr-CA, SylvieNeural)", + "ShortName": "fr-CA-SylvieNeural", + "Gender": "Female", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sylvie Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, VivienneMultilingualNeural)", + "ShortName": "fr-FR-VivienneMultilingualNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft VivienneMultilingual Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, RemyMultilingualNeural)", + "ShortName": "fr-FR-RemyMultilingualNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft RemyMultilingual Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)", + "ShortName": "fr-FR-DeniseNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Denise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, EloiseNeural)", + "ShortName": "fr-FR-EloiseNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eloise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, HenriNeural)", + "ShortName": "fr-FR-HenriNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Henri Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, ArianeNeural)", + "ShortName": "fr-CH-ArianeNeural", + "Gender": "Female", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ariane Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, FabriceNeural)", + "ShortName": "fr-CH-FabriceNeural", + "Gender": "Male", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fabrice Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, RoiNeural)", + "ShortName": "gl-ES-RoiNeural", + "Gender": "Male", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roi Online (Natural) - Galician", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft 
Server Speech Text to Speech Voice (gl-ES, SabelaNeural)", + "ShortName": "gl-ES-SabelaNeural", + "Gender": "Female", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sabela Online (Natural) - Galician", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, EkaNeural)", + "ShortName": "ka-GE-EkaNeural", + "Gender": "Female", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eka Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, GiorgiNeural)", + "ShortName": "ka-GE-GiorgiNeural", + "Gender": "Male", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Giorgi Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, IngridNeural)", + "ShortName": "de-AT-IngridNeural", + "Gender": "Female", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ingrid Online (Natural) - German (Austria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, JonasNeural)", + "ShortName": "de-AT-JonasNeural", + "Gender": "Male", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jonas Online (Natural) - German (Austria)", + "Status": 
"GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, SeraphinaMultilingualNeural)", + "ShortName": "de-DE-SeraphinaMultilingualNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SeraphinaMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, FlorianMultilingualNeural)", + "ShortName": "de-DE-FlorianMultilingualNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft FlorianMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, AmalaNeural)", + "ShortName": "de-DE-AmalaNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amala Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, ConradNeural)", + "ShortName": "de-DE-ConradNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Conrad Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to 
Speech Voice (de-DE, KatjaNeural)", + "ShortName": "de-DE-KatjaNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Katja Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KillianNeural)", + "ShortName": "de-DE-KillianNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Killian Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, JanNeural)", + "ShortName": "de-CH-JanNeural", + "Gender": "Male", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jan Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, LeniNeural)", + "ShortName": "de-CH-LeniNeural", + "Gender": "Female", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leni Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, AthinaNeural)", + "ShortName": "el-GR-AthinaNeural", + "Gender": "Female", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Athina Online (Natural) - Greek (Greece)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, NestorasNeural)", + "ShortName": "el-GR-NestorasNeural", + "Gender": "Male", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nestoras Online (Natural) - Greek (Greece)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, DhwaniNeural)", + "ShortName": "gu-IN-DhwaniNeural", + "Gender": "Female", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dhwani Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, NiranjanNeural)", + "ShortName": "gu-IN-NiranjanNeural", + "Gender": "Male", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niranjan Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, AvriNeural)", + "ShortName": "he-IL-AvriNeural", + "Gender": "Male", + "Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Avri Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, HilaNeural)", + "ShortName": "he-IL-HilaNeural", + "Gender": "Female", + 
"Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hila Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, MadhurNeural)", + "ShortName": "hi-IN-MadhurNeural", + "Gender": "Male", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madhur Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, SwaraNeural)", + "ShortName": "hi-IN-SwaraNeural", + "Gender": "Female", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Swara Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, NoemiNeural)", + "ShortName": "hu-HU-NoemiNeural", + "Gender": "Female", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noemi Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, TamasNeural)", + "ShortName": "hu-HU-TamasNeural", + "Gender": "Male", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tamas Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GudrunNeural)", + "ShortName": "is-IS-GudrunNeural", + "Gender": "Female", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gudrun Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GunnarNeural)", + "ShortName": "is-IS-GunnarNeural", + "Gender": "Male", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gunnar Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, ArdiNeural)", + "ShortName": "id-ID-ArdiNeural", + "Gender": "Male", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ardi Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, GadisNeural)", + "ShortName": "id-ID-GadisNeural", + "Gender": "Female", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gadis Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Latn-CA, SiqiniqNeural)", + "ShortName": "iu-Latn-CA-SiqiniqNeural", + "Gender": "Female", + "Locale": "iu-Latn-CA", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siqiniq Online (Natural) - Inuktitut (Latin, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Latn-CA, TaqqiqNeural)", + "ShortName": "iu-Latn-CA-TaqqiqNeural", + "Gender": "Male", + "Locale": "iu-Latn-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taqqiq Online (Natural) - Inuktitut (Latin, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Cans-CA, SiqiniqNeural)", + "ShortName": "iu-Cans-CA-SiqiniqNeural", + "Gender": "Female", + "Locale": "iu-Cans-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siqiniq Online (Natural) - Inuktitut (Syllabics, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Cans-CA, TaqqiqNeural)", + "ShortName": "iu-Cans-CA-TaqqiqNeural", + "Gender": "Male", + "Locale": "iu-Cans-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taqqiq Online (Natural) - Inuktitut (Syllabics, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, ColmNeural)", + "ShortName": "ga-IE-ColmNeural", + "Gender": "Male", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colm Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" 
+ ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, OrlaNeural)", + "ShortName": "ga-IE-OrlaNeural", + "Gender": "Female", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Orla Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, GiuseppeMultilingualNeural)", + "ShortName": "it-IT-GiuseppeMultilingualNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft GiuseppeMultilingual Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, DiegoNeural)", + "ShortName": "it-IT-DiegoNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Diego Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)", + "ShortName": "it-IT-ElsaNeural", + "Gender": "Female", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elsa Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, IsabellaNeural)", + "ShortName": "it-IT-IsabellaNeural", + "Gender": "Female", + "Locale": "it-IT", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Isabella Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, KeitaNeural)", + "ShortName": "ja-JP-KeitaNeural", + "Gender": "Male", + "Locale": "ja-JP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keita Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, NanamiNeural)", + "ShortName": "ja-JP-NanamiNeural", + "Gender": "Female", + "Locale": "ja-JP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nanami Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, DimasNeural)", + "ShortName": "jv-ID-DimasNeural", + "Gender": "Male", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dimas Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, SitiNeural)", + "ShortName": "jv-ID-SitiNeural", + "Gender": "Female", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siti Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + 
}, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, GaganNeural)", + "ShortName": "kn-IN-GaganNeural", + "Gender": "Male", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gagan Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, SapnaNeural)", + "ShortName": "kn-IN-SapnaNeural", + "Gender": "Female", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sapna Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, AigulNeural)", + "ShortName": "kk-KZ-AigulNeural", + "Gender": "Female", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aigul Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, DauletNeural)", + "ShortName": "kk-KZ-DauletNeural", + "Gender": "Male", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daulet Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, PisethNeural)", + "ShortName": "km-KH-PisethNeural", + "Gender": "Male", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Piseth Online 
(Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, SreymomNeural)", + "ShortName": "km-KH-SreymomNeural", + "Gender": "Female", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sreymom Online (Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, HyunsuMultilingualNeural)", + "ShortName": "ko-KR-HyunsuMultilingualNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HyunsuMultilingual Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, InJoonNeural)", + "ShortName": "ko-KR-InJoonNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft InJoon Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, SunHiNeural)", + "ShortName": "ko-KR-SunHiNeural", + "Gender": "Female", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SunHi Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice 
(lo-LA, ChanthavongNeural)", + "ShortName": "lo-LA-ChanthavongNeural", + "Gender": "Male", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chanthavong Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, KeomanyNeural)", + "ShortName": "lo-LA-KeomanyNeural", + "Gender": "Female", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keomany Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, EveritaNeural)", + "ShortName": "lv-LV-EveritaNeural", + "Gender": "Female", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Everita Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, NilsNeural)", + "ShortName": "lv-LV-NilsNeural", + "Gender": "Male", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nils Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, LeonasNeural)", + "ShortName": "lt-LT-LeonasNeural", + "Gender": "Male", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leonas Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, OnaNeural)", + "ShortName": "lt-LT-OnaNeural", + "Gender": "Female", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ona Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, AleksandarNeural)", + "ShortName": "mk-MK-AleksandarNeural", + "Gender": "Male", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aleksandar Online (Natural) - Macedonian (North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, MarijaNeural)", + "ShortName": "mk-MK-MarijaNeural", + "Gender": "Female", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marija Online (Natural) - Macedonian (North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, OsmanNeural)", + "ShortName": "ms-MY-OsmanNeural", + "Gender": "Male", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Osman Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, YasminNeural)", + "ShortName": 
"ms-MY-YasminNeural", + "Gender": "Female", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yasmin Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, MidhunNeural)", + "ShortName": "ml-IN-MidhunNeural", + "Gender": "Male", + "Locale": "ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Midhun Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, SobhanaNeural)", + "ShortName": "ml-IN-SobhanaNeural", + "Gender": "Female", + "Locale": "ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sobhana Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, GraceNeural)", + "ShortName": "mt-MT-GraceNeural", + "Gender": "Female", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Grace Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, JosephNeural)", + "ShortName": "mt-MT-JosephNeural", + "Gender": "Male", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joseph Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + 
"General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, AarohiNeural)", + "ShortName": "mr-IN-AarohiNeural", + "Gender": "Female", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aarohi Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, ManoharNeural)", + "ShortName": "mr-IN-ManoharNeural", + "Gender": "Male", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Manohar Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, BataaNeural)", + "ShortName": "mn-MN-BataaNeural", + "Gender": "Male", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bataa Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, YesuiNeural)", + "ShortName": "mn-MN-YesuiNeural", + "Gender": "Female", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yesui Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, HemkalaNeural)", + "ShortName": "ne-NP-HemkalaNeural", + "Gender": "Female", + "Locale": "ne-NP", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hemkala Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, SagarNeural)", + "ShortName": "ne-NP-SagarNeural", + "Gender": "Male", + "Locale": "ne-NP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sagar Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, FinnNeural)", + "ShortName": "nb-NO-FinnNeural", + "Gender": "Male", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Finn Online (Natural) - Norwegian (Bokmål Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, PernilleNeural)", + "ShortName": "nb-NO-PernilleNeural", + "Gender": "Female", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pernille Online (Natural) - Norwegian (Bokmål, Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, GulNawazNeural)", + "ShortName": "ps-AF-GulNawazNeural", + "Gender": "Male", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft GulNawaz Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", 
+ "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, LatifaNeural)", + "ShortName": "ps-AF-LatifaNeural", + "Gender": "Female", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Latifa Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, DilaraNeural)", + "ShortName": "fa-IR-DilaraNeural", + "Gender": "Female", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dilara Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, FaridNeural)", + "ShortName": "fa-IR-FaridNeural", + "Gender": "Male", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Farid Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, MarekNeural)", + "ShortName": "pl-PL-MarekNeural", + "Gender": "Male", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marek Online (Natural) - Polish (Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, ZofiaNeural)", + "ShortName": "pl-PL-ZofiaNeural", + "Gender": "Female", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": 
"Microsoft Zofia Online (Natural) - Polish (Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, ThalitaMultilingualNeural)", + "ShortName": "pt-BR-ThalitaMultilingualNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft ThalitaMultilingual Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, AntonioNeural)", + "ShortName": "pt-BR-AntonioNeural", + "Gender": "Male", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonio Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, FranciscaNeural)", + "ShortName": "pt-BR-FranciscaNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Francisca Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, DuarteNeural)", + "ShortName": "pt-PT-DuarteNeural", + "Gender": "Male", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Duarte Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + 
{ + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, RaquelNeural)", + "ShortName": "pt-PT-RaquelNeural", + "Gender": "Female", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Raquel Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, AlinaNeural)", + "ShortName": "ro-RO-AlinaNeural", + "Gender": "Female", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alina Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, EmilNeural)", + "ShortName": "ro-RO-EmilNeural", + "Gender": "Male", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emil Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, DmitryNeural)", + "ShortName": "ru-RU-DmitryNeural", + "Gender": "Male", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dmitry Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, SvetlanaNeural)", + "ShortName": "ru-RU-SvetlanaNeural", + "Gender": "Female", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Svetlana 
Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, NicholasNeural)", + "ShortName": "sr-RS-NicholasNeural", + "Gender": "Male", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nicholas Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, SophieNeural)", + "ShortName": "sr-RS-SophieNeural", + "Gender": "Female", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sophie Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, SameeraNeural)", + "ShortName": "si-LK-SameeraNeural", + "Gender": "Male", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sameera Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, ThiliniNeural)", + "ShortName": "si-LK-ThiliniNeural", + "Gender": "Female", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thilini Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice 
(sk-SK, LukasNeural)", + "ShortName": "sk-SK-LukasNeural", + "Gender": "Male", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lukas Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, ViktoriaNeural)", + "ShortName": "sk-SK-ViktoriaNeural", + "Gender": "Female", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Viktoria Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, PetraNeural)", + "ShortName": "sl-SI-PetraNeural", + "Gender": "Female", + "Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Petra Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, RokNeural)", + "ShortName": "sl-SI-RokNeural", + "Gender": "Male", + "Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rok Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, MuuseNeural)", + "ShortName": "so-SO-MuuseNeural", + "Gender": "Male", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Muuse Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { 
+ "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, UbaxNeural)", + "ShortName": "so-SO-UbaxNeural", + "Gender": "Female", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ubax Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, ElenaNeural)", + "ShortName": "es-AR-ElenaNeural", + "Gender": "Female", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elena Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, TomasNeural)", + "ShortName": "es-AR-TomasNeural", + "Gender": "Male", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tomas Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, MarceloNeural)", + "ShortName": "es-BO-MarceloNeural", + "Gender": "Male", + "Locale": "es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marcelo Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, SofiaNeural)", + "ShortName": "es-BO-SofiaNeural", + "Gender": "Female", + "Locale": 
"es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofia Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, CatalinaNeural)", + "ShortName": "es-CL-CatalinaNeural", + "Gender": "Female", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Catalina Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, LorenzoNeural)", + "ShortName": "es-CL-LorenzoNeural", + "Gender": "Male", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lorenzo Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, GonzaloNeural)", + "ShortName": "es-CO-GonzaloNeural", + "Gender": "Male", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gonzalo Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, SalomeNeural)", + "ShortName": "es-CO-SalomeNeural", + "Gender": "Female", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salome Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + 
"Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, XimenaNeural)", + "ShortName": "es-ES-XimenaNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ximena Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, JuanNeural)", + "ShortName": "es-CR-JuanNeural", + "Gender": "Male", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Juan Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, MariaNeural)", + "ShortName": "es-CR-MariaNeural", + "Gender": "Female", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maria Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, BelkysNeural)", + "ShortName": "es-CU-BelkysNeural", + "Gender": "Female", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Belkys Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, ManuelNeural)", + "ShortName": "es-CU-ManuelNeural", + "Gender": "Male", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Manuel Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, EmilioNeural)", + "ShortName": "es-DO-EmilioNeural", + "Gender": "Male", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emilio Online (Natural) - Spanish (Dominican Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, RamonaNeural)", + "ShortName": "es-DO-RamonaNeural", + "Gender": "Female", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ramona Online (Natural) - Spanish (Dominican Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, AndreaNeural)", + "ShortName": "es-EC-AndreaNeural", + "Gender": "Female", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrea Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, LuisNeural)", + "ShortName": "es-EC-LuisNeural", + "Gender": "Male", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luis Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server 
Speech Text to Speech Voice (es-SV, LorenaNeural)", + "ShortName": "es-SV-LorenaNeural", + "Gender": "Female", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lorena Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, RodrigoNeural)", + "ShortName": "es-SV-RodrigoNeural", + "Gender": "Male", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rodrigo Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, JavierNeural)", + "ShortName": "es-GQ-JavierNeural", + "Gender": "Male", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Javier Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, TeresaNeural)", + "ShortName": "es-GQ-TeresaNeural", + "Gender": "Female", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Teresa Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, AndresNeural)", + "ShortName": "es-GT-AndresNeural", + "Gender": "Male", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andres Online 
(Natural) - Spanish (Guatemala)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, MartaNeural)", + "ShortName": "es-GT-MartaNeural", + "Gender": "Female", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marta Online (Natural) - Spanish (Guatemala)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, CarlosNeural)", + "ShortName": "es-HN-CarlosNeural", + "Gender": "Male", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Carlos Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, KarlaNeural)", + "ShortName": "es-HN-KarlaNeural", + "Gender": "Female", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karla Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)", + "ShortName": "es-MX-DaliaNeural", + "Gender": "Female", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dalia Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)", + 
"ShortName": "es-MX-JorgeNeural", + "Gender": "Male", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jorge Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, FedericoNeural)", + "ShortName": "es-NI-FedericoNeural", + "Gender": "Male", + "Locale": "es-NI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Federico Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, YolandaNeural)", + "ShortName": "es-NI-YolandaNeural", + "Gender": "Female", + "Locale": "es-NI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yolanda Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, MargaritaNeural)", + "ShortName": "es-PA-MargaritaNeural", + "Gender": "Female", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Margarita Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, RobertoNeural)", + "ShortName": "es-PA-RobertoNeural", + "Gender": "Male", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roberto Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { 
+ "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, MarioNeural)", + "ShortName": "es-PY-MarioNeural", + "Gender": "Male", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mario Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, TaniaNeural)", + "ShortName": "es-PY-TaniaNeural", + "Gender": "Female", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tania Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, AlexNeural)", + "ShortName": "es-PE-AlexNeural", + "Gender": "Male", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alex Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, CamilaNeural)", + "ShortName": "es-PE-CamilaNeural", + "Gender": "Female", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Camila Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, KarinaNeural)", + "ShortName": "es-PR-KarinaNeural", + "Gender": "Female", + "Locale": "es-PR", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karina Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, VictorNeural)", + "ShortName": "es-PR-VictorNeural", + "Gender": "Male", + "Locale": "es-PR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Victor Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, AlvaroNeural)", + "ShortName": "es-ES-AlvaroNeural", + "Gender": "Male", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alvaro Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)", + "ShortName": "es-ES-ElviraNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elvira Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, AlonsoNeural)", + "ShortName": "es-US-AlonsoNeural", + "Gender": "Male", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alonso Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, PalomaNeural)", + "ShortName": "es-US-PalomaNeural", + "Gender": "Female", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paloma Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, MateoNeural)", + "ShortName": "es-UY-MateoNeural", + "Gender": "Male", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mateo Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, ValentinaNeural)", + "ShortName": "es-UY-ValentinaNeural", + "Gender": "Female", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valentina Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, PaolaNeural)", + "ShortName": "es-VE-PaolaNeural", + "Gender": "Female", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paola Online (Natural) - Spanish (Venezuela)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, SebastianNeural)", + "ShortName": "es-VE-SebastianNeural", + "Gender": "Male", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", 
+ "FriendlyName": "Microsoft Sebastian Online (Natural) - Spanish (Venezuela)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, JajangNeural)", + "ShortName": "su-ID-JajangNeural", + "Gender": "Male", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jajang Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, TutiNeural)", + "ShortName": "su-ID-TutiNeural", + "Gender": "Female", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tuti Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, RafikiNeural)", + "ShortName": "sw-KE-RafikiNeural", + "Gender": "Male", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rafiki Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, ZuriNeural)", + "ShortName": "sw-KE-ZuriNeural", + "Gender": "Female", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zuri Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text 
to Speech Voice (sw-TZ, DaudiNeural)", + "ShortName": "sw-TZ-DaudiNeural", + "Gender": "Male", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daudi Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, RehemaNeural)", + "ShortName": "sw-TZ-RehemaNeural", + "Gender": "Female", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rehema Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, MattiasNeural)", + "ShortName": "sv-SE-MattiasNeural", + "Gender": "Male", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mattias Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, SofieNeural)", + "ShortName": "sv-SE-SofieNeural", + "Gender": "Female", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofie Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, PallaviNeural)", + "ShortName": "ta-IN-PallaviNeural", + "Gender": "Female", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pallavi Online (Natural) - Tamil (India)", + "Status": 
"GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, ValluvarNeural)", + "ShortName": "ta-IN-ValluvarNeural", + "Gender": "Male", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valluvar Online (Natural) - Tamil (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, KaniNeural)", + "ShortName": "ta-MY-KaniNeural", + "Gender": "Female", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kani Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, SuryaNeural)", + "ShortName": "ta-MY-SuryaNeural", + "Gender": "Male", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Surya Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, AnbuNeural)", + "ShortName": "ta-SG-AnbuNeural", + "Gender": "Male", + "Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anbu Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, VenbaNeural)", + "ShortName": "ta-SG-VenbaNeural", + "Gender": "Female", + 
"Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Venba Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, KumarNeural)", + "ShortName": "ta-LK-KumarNeural", + "Gender": "Male", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kumar Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, SaranyaNeural)", + "ShortName": "ta-LK-SaranyaNeural", + "Gender": "Female", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saranya Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, MohanNeural)", + "ShortName": "te-IN-MohanNeural", + "Gender": "Male", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mohan Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, ShrutiNeural)", + "ShortName": "te-IN-ShrutiNeural", + "Gender": "Female", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Shruti Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, NiwatNeural)", + "ShortName": "th-TH-NiwatNeural", + "Gender": "Male", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niwat Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, PremwadeeNeural)", + "ShortName": "th-TH-PremwadeeNeural", + "Gender": "Female", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Premwadee Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, EmelNeural)", + "ShortName": "tr-TR-EmelNeural", + "Gender": "Female", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emel Online (Natural) - Turkish (Turkey)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, AhmetNeural)", + "ShortName": "tr-TR-AhmetNeural", + "Gender": "Male", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ahmet Online (Natural) - Turkish (Türkiye)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, OstapNeural)", + "ShortName": "uk-UA-OstapNeural", + "Gender": "Male", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft 
Ostap Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, PolinaNeural)", + "ShortName": "uk-UA-PolinaNeural", + "Gender": "Female", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Polina Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, GulNeural)", + "ShortName": "ur-IN-GulNeural", + "Gender": "Female", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gul Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, SalmanNeural)", + "ShortName": "ur-IN-SalmanNeural", + "Gender": "Male", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salman Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, AsadNeural)", + "ShortName": "ur-PK-AsadNeural", + "Gender": "Male", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asad Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, UzmaNeural)", + 
"ShortName": "ur-PK-UzmaNeural", + "Gender": "Female", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Uzma Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, MadinaNeural)", + "ShortName": "uz-UZ-MadinaNeural", + "Gender": "Female", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madina Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, SardorNeural)", + "ShortName": "uz-UZ-SardorNeural", + "Gender": "Male", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sardor Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, HoaiMyNeural)", + "ShortName": "vi-VN-HoaiMyNeural", + "Gender": "Female", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HoaiMy Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, NamMinhNeural)", + "ShortName": "vi-VN-NamMinhNeural", + "Gender": "Male", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft NamMinh Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, AledNeural)", + "ShortName": "cy-GB-AledNeural", + "Gender": "Male", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aled Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, NiaNeural)", + "ShortName": "cy-GB-NiaNeural", + "Gender": "Female", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nia Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThandoNeural)", + "ShortName": "zu-ZA-ThandoNeural", + "Gender": "Female", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thando Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThembaNeural)", + "ShortName": "zu-ZA-ThembaNeural", + "Gender": "Male", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Themba Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + } +] \ No newline at end of file diff --git a/rvc/lib/utils.py b/rvc/lib/utils.py new file mode 100644 index 
def load_audio(file, sample_rate):
    """Read an audio file and return it as a flat mono array at ``sample_rate``.

    Args:
        file: Path of the audio file. Surrounding spaces, double quotes and
            newlines are stripped before the file is opened.
        sample_rate: Sample rate the returned signal must have. The audio is
            resampled (``soxr_vhq``) only when the file's rate differs.

    Returns:
        The flattened samples, down-mixed to mono when the file has more than
        one channel.

    Raises:
        RuntimeError: If cleaning the path, reading, down-mixing or resampling
            fails for any reason; the original error text is embedded in the
            message.
    """
    try:
        path = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        samples, native_rate = sf.read(path)
        # soundfile returns (frames, channels) for multi-channel files, while
        # librosa.to_mono expects channels first, hence the transpose.
        if samples.ndim > 1:
            samples = librosa.to_mono(samples.T)
        if native_rate != sample_rate:
            samples = librosa.resample(
                samples,
                orig_sr=native_rate,
                target_sr=sample_rate,
                res_type="soxr_vhq",
            )
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")
    else:
        return samples.flatten()
def format_title(title):
    """Turn an arbitrary title into a compact, underscore-separated name.

    The title is first normalised to Unicode NFC, then three substitutions are
    applied in order:

    1. characters in the U+2500-U+257F range are removed,
    2. every character that is not a word character, whitespace, ``.`` or
       ``-`` is removed,
    3. each run of whitespace is replaced by a single ``_``.

    Args:
        title: The text to clean.

    Returns:
        The cleaned title.
    """
    substitutions = (
        (r"[\u2500-\u257F]+", ""),
        (r"[^\w\s.-]", ""),
        (r"\s+", "_"),
    )
    cleaned = unicodedata.normalize("NFC", title)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.UNICODE)
    return cleaned
"https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/spin-v2/pytorch_model.bin", + "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin", + "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin", + "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin", + } + + config_files = { + "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json", + "spin": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/spin/config.json", + "spin-v2": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/spin-v2/config.json", + "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json", + "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json", + "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json", + } + + if embedder_model == "custom": + if os.path.exists(custom_embedder): + model_path = custom_embedder + else: + print(f"Custom embedder not found: {custom_embedder}, using contentvec") + model_path = embedding_list["contentvec"] + else: + model_path = embedding_list[embedder_model] + bin_file = os.path.join(model_path, "pytorch_model.bin") + json_file = os.path.join(model_path, "config.json") + os.makedirs(model_path, exist_ok=True) + if not os.path.exists(bin_file): + url = online_embedders[embedder_model] + print(f"Downloading {url} to {model_path}...") + wget.download(url, out=bin_file) + if not os.path.exists(json_file): + url = config_files[embedder_model] + print(f"Downloading {url} to {model_path}...") + 
class STFT:
    """Short-time Fourier transform computed as a matrix product.

    Instead of calling an FFT routine on the device, each windowed frame is
    multiplied by a pre-computed DFT matrix. The matrices are cached per
    ``n_fft`` so they are built only once.
    """

    def __init__(self):
        self.device = "cuda"
        # n_fft -> stacked real/imaginary DFT rows, shape (2 * (n_fft // 2 + 1), n_fft)
        self.fourier_bases = {}

    def _get_fourier_basis(self, n_fft):
        """Return the cached one-sided DFT matrix for ``n_fft``, building it on first use."""
        cached = self.fourier_bases.get(n_fft)
        if cached is not None:
            return cached

        # The FFT of the identity matrix is the DFT matrix itself. It is
        # computed on the CPU and only the result is moved to the target device.
        dft = torch.fft.fft(torch.eye(n_fft, device="cpu")).to(self.device)
        bins = n_fft // 2 + 1
        # Keep the non-redundant bins; real rows first, imaginary rows after.
        stacked = torch.cat([dft.real[:bins], dft.imag[:bins]], dim=0)
        self.fourier_bases[n_fft] = stacked
        return stacked

    def transform(self, input, n_fft, hop_length, window):
        """Compute the centred, reflect-padded STFT of ``input``.

        Args:
            input: Signal batch; frames are taken along dimension 1.
            n_fft: Frame length and DFT size.
            hop_length: Step between consecutive frames.
            window: Window multiplied into every DFT row (broadcast over the
                ``n_fft`` axis).

        Returns:
            A complex tensor whose dimension 1 holds ``n_fft // 2 + 1``
            frequency bins and whose last dimension indexes frames.
        """
        half = n_fft // 2
        # Applying the window to the basis is equivalent to windowing every frame.
        windowed_basis = self._get_fourier_basis(n_fft) * window
        # Reflect-pad so that frames are centred on their time step.
        padded = torch.nn.functional.pad(input, (half, half), mode="reflect")
        # (batch, frames, n_fft) -> (batch, n_fft, frames)
        frames = padded.unfold(1, n_fft, hop_length).permute(0, 2, 1)
        spectrum = torch.matmul(windowed_basis, frames)
        bins = half + 1
        return torch.complex(spectrum[:, :bins, :], spectrum[:, bins:, :])
def z_jit(f, *_, **__):
    """Stand-in for ``torch.jit.script`` that performs no compilation.

    This file assigns it over ``torch.jit.script``. ``f`` is returned as the
    same object it was passed in; any further positional or keyword arguments
    are accepted and ignored.

    Args:
        f: The object that would have been scripted.

    Returns:
        ``f`` itself, with a ``graph`` attribute added.
    """
    # An empty graph is attached as `f.graph` -- presumably so code that reads
    # `.graph` on a scripted object keeps working (TODO confirm with callers).
    f.graph = torch._C.Graph()
    return f
def check_the_device(device, type: str = "input"):
    """Report whether a stream can actually be opened on ``device``.

    Args:
        device: Mapping with at least the keys ``"index"`` and
            ``"default_samplerate"``.
        type: ``"input"`` probes with an input stream; any other value probes
            with an output stream.

    Returns:
        ``True`` when a float32 stream at the device's default sample rate
        opens and closes without raising, ``False`` otherwise.
    """
    if type == "input":
        opener = sd.InputStream
    else:
        opener = sd.OutputStream

    try:
        probe = opener(
            device=device["index"],
            dtype=np.float32,
            samplerate=device["default_samplerate"],
        )
        # Entering and leaving the context is the whole test: a failure in
        # either step means the device is not usable.
        with probe:
            pass
    except Exception:
        return False
    return True
class Audio:
    """Runs the real-time audio streams around a voice-conversion callback.

    A duplex ``sounddevice`` stream feeds every captured block to
    ``callbacks.change_voice`` and plays the result. When monitoring is
    enabled, each converted block is also queued for a second, output-only
    stream.
    """

    def __init__(
        self,
        callbacks,
        f0_up_key: int = 0,
        index_rate: float = 0.5,
        protect: float = 0.5,
        volume_envelope: float = 1,
        f0_autotune: bool = False,
        f0_autotune_strength: float = 1,
        proposed_pitch=False,
        proposed_pitch_threshold: float = 155.0,
        input_audio_gain: float = 1.0,
        output_audio_gain: float = 1.0,
        monitor_audio_gain: float = 1.0,
        monitor: bool = False,
    ):
        """Store the conversion parameters and gains; no stream is opened here.

        Args:
            callbacks: Object exposing ``change_voice(...)``; it receives every
                mono input block together with the conversion parameters below.
            f0_up_key, index_rate, protect, volume_envelope, f0_autotune,
            f0_autotune_strength, proposed_pitch, proposed_pitch_threshold:
                Forwarded unchanged to ``callbacks.change_voice`` on every block.
            input_audio_gain: Factor applied to captured audio before conversion.
            output_audio_gain: Factor applied to converted audio on the main output.
            monitor_audio_gain: Factor applied to converted audio on the monitor output.
            monitor: Whether the monitor output stream is used.
        """
        self.callbacks = callbacks
        self.mon_queue = Queue()
        self.stream = None
        self.monitor = None
        self.running = False
        self.input_audio_gain = input_audio_gain
        self.output_audio_gain = output_audio_gain
        self.monitor_audio_gain = monitor_audio_gain
        self.use_monitor = monitor
        self.f0_up_key = f0_up_key
        self.index_rate = index_rate
        self.protect = protect
        self.volume_envelope = volume_envelope
        self.f0_autotune = f0_autotune
        self.f0_autotune_strength = f0_autotune_strength
        self.proposed_pitch = proposed_pitch
        self.proposed_pitch_threshold = proposed_pitch_threshold

    def get_input_audio_device(self, index: int):
        """Return the usable input device with this ``index``, or ``None``."""
        audioinput, _ = list_audio_device()
        return next((dev for dev in audioinput if dev.index == index), None)

    def get_output_audio_device(self, index: int):
        """Return the usable output device with this ``index``, or ``None``."""
        _, audiooutput = list_audio_device()
        return next((dev for dev in audiooutput if dev.index == index), None)

    def process_data(self, indata: np.ndarray):
        """Apply the input gain, down-mix to mono and run the voice conversion.

        Returns:
            Whatever ``callbacks.change_voice`` returns.
        """
        indata = indata * self.input_audio_gain
        # sounddevice delivers (frames, channels); librosa wants channels first.
        unpacked_data = librosa.to_mono(indata.T)

        return self.callbacks.change_voice(
            unpacked_data,
            self.f0_up_key,
            self.index_rate,
            self.protect,
            self.volume_envelope,
            self.f0_autotune,
            self.f0_autotune_strength,
            self.proposed_pitch,
            self.proposed_pitch_threshold,
        )

    def process_data_with_time(self, indata: np.ndarray):
        """Convert one block and record its processing time in ``self.latency``.

        Returns:
            The converted waveform (first element of the conversion result).
        """
        out_wav, _, perf, _ = self.process_data(indata)
        # latency to display on the application interface
        self.latency = perf[1]

        return out_wav

    def audio_stream_callback(
        self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
    ):
        """Duplex stream callback: convert ``indata`` and write it to ``outdata``.

        The converted mono signal is copied to every output channel and scaled
        by the output gain. With monitoring enabled the unscaled signal is also
        queued for the monitor stream. Errors are printed, never raised, and
        the block is then played as silence.
        """
        try:
            out_wav = self.process_data_with_time(indata)

            output_channels = outdata.shape[1]
            if self.use_monitor:
                self.mon_queue.put(out_wav)

            outdata[:] = (
                np.repeat(out_wav, output_channels).reshape(-1, output_channels)
                * self.output_audio_gain
            )
        except Exception as error:
            # The callback owns the output buffer: if it is left untouched the
            # device plays whatever the buffer happened to contain.
            outdata.fill(0)
            print(f"An error occurred while running the audio stream: {error}")
            print(traceback.format_exc())

    def audio_queue(self, outdata: np.ndarray, frames, times, status):
        """Monitor stream callback: play one queued block on every channel.

        Takes one block from the queue (blocking until one is available) and
        discards anything else that is queued at that moment, so the monitor
        does not accumulate a backlog. Errors are printed, never raised, and
        the block is then played as silence.
        """
        try:
            mon_wav = self.mon_queue.get()

            while self.mon_queue.qsize() > 0:
                self.mon_queue.get()

            output_channels = outdata.shape[1]
            outdata[:] = (
                np.repeat(mon_wav, output_channels).reshape(-1, output_channels)
                * self.monitor_audio_gain
            )
        except Exception as error:
            outdata.fill(0)
            print(f"An error occurred while running the audio queue: {error}")
            print(traceback.format_exc())

    def run_audio_stream(
        self,
        block_frame: int,
        input_device_id: int,
        output_device_id: int,
        output_monitor_id: int,
        input_max_channel: int,
        output_max_channel: int,
        output_monitor_max_channel: int,
        input_extra_setting,
        output_extra_setting,
        output_monitor_extra_setting,
    ):
        """Open and start the duplex stream and, if enabled, the monitor stream.

        Both streams use float32 samples, ``AUDIO_SAMPLE_RATE`` and a block
        size of ``block_frame``. Exceptions raised while opening or starting a
        stream propagate to the caller.
        """
        self.stream = sd.Stream(
            callback=self.audio_stream_callback,
            latency="low",
            dtype=np.float32,
            device=(input_device_id, output_device_id),
            blocksize=block_frame,
            samplerate=AUDIO_SAMPLE_RATE,
            channels=(input_max_channel, output_max_channel),
            extra_settings=(input_extra_setting, output_extra_setting),
        )
        self.stream.start()

        if self.use_monitor:
            self.monitor = sd.OutputStream(
                callback=self.audio_queue,
                dtype=np.float32,
                device=output_monitor_id,
                blocksize=block_frame,
                samplerate=AUDIO_SAMPLE_RATE,
                channels=output_monitor_max_channel,
                extra_settings=output_monitor_extra_setting,
            )
            self.monitor.start()

    def stop(self):
        """Close any open stream and mark the instance as not running."""
        self.running = False

        if self.stream is not None:
            self.stream.close()
            self.stream = None

        if self.monitor is not None:
            self.monitor.close()
            self.monitor = None

    @staticmethod
    def _stream_settings(device, channel_attr, exclusive, asio_channel):
        """Return ``(extra_settings, channels)`` for one side of a stream.

        ``device`` is the looked-up device (or ``None`` when the lookup found
        nothing) and ``channel_attr`` names the attribute holding its channel
        count. WASAPI devices get ``WasapiSettings``. ASIO devices with an
        explicit channel (anything but ``-1``) get ``AsioSettings`` and are
        opened with a single channel. Everything else needs no extra settings.
        For an unknown device both values are ``None``, which leaves the
        choice to sounddevice.
        """
        if device is None:
            return None, None

        channels = getattr(device, channel_attr)
        if "WASAPI" in device.host_api:
            settings = sd.WasapiSettings(
                exclusive=exclusive, auto_convert=not exclusive
            )
            return settings, channels
        if "ASIO" in device.host_api and asio_channel != -1:
            return sd.AsioSettings(channel_selectors=[asio_channel]), 1
        return None, channels

    def start(
        self,
        input_device_id: int,
        output_device_id: int,
        output_monitor_id: int = None,
        exclusive_mode: bool = False,
        asio_input_channel: int = -1,
        asio_output_channel: int = -1,
        asio_output_monitor_channel: int = -1,
        read_chunk_size: int = 192,
    ):
        """(Re)start streaming on the given devices.

        Any running stream is stopped first. Host-API specific settings are
        derived separately for the input, output and monitor device. On
        success ``self.running`` becomes ``True``. If opening fails, the error
        is printed, every partially opened stream is closed and
        ``self.running`` stays ``False``.

        Args:
            input_device_id: Index of the capture device.
            output_device_id: Index of the playback device.
            output_monitor_id: Index of the monitor device; only used when
                monitoring was enabled in the constructor.
            exclusive_mode: Request WASAPI exclusive mode (auto-conversion is
                enabled only in shared mode).
            asio_input_channel: ASIO channel to capture from, ``-1`` for none.
            asio_output_channel: ASIO channel to play on, ``-1`` for none.
            asio_output_monitor_channel: ASIO channel for the monitor, ``-1``
                for none.
            read_chunk_size: Block size in units of 128 samples at 48 kHz.
        """
        self.stop()

        wasapi_exclusive_mode = bool(exclusive_mode)
        input_audio_device = self.get_input_audio_device(input_device_id)
        output_audio_device = self.get_output_audio_device(output_device_id)

        input_extra_setting, input_channels = self._stream_settings(
            input_audio_device,
            "max_input_channels",
            wasapi_exclusive_mode,
            asio_input_channel,
        )
        # The output side is configured from the OUTPUT device's host API.
        output_extra_setting, output_channels = self._stream_settings(
            output_audio_device,
            "max_output_channels",
            wasapi_exclusive_mode,
            asio_output_channel,
        )

        output_monitor_extra_setting, monitor_channels = None, None
        if self.use_monitor:
            output_monitor_device = self.get_output_audio_device(output_monitor_id)
            output_monitor_extra_setting, monitor_channels = self._stream_settings(
                output_monitor_device,
                "max_output_channels",
                wasapi_exclusive_mode,
                asio_output_monitor_channel,
            )

        block_frame = int((read_chunk_size * 128 / 48000) * AUDIO_SAMPLE_RATE)

        try:
            self.run_audio_stream(
                block_frame,
                input_device_id,
                output_device_id,
                output_monitor_id,
                input_channels,
                output_channels,
                monitor_channels,
                input_extra_setting,
                output_extra_setting,
                output_monitor_extra_setting,
            )
            self.running = True
        except Exception as error:
            print(f"An error occurred while streaming audio: {error}")
            print(traceback.format_exc())
            # Do not leave the main stream running when e.g. only the monitor
            # stream failed to open.
            self.stop()
def change_voice(
    self,
    received_data: np.ndarray,
    f0_up_key: int = 0,
    index_rate: float = 0.5,
    protect: float = 0.5,
    volume_envelope: float = 1,
    f0_autotune: bool = False,
    f0_autotune_strength: float = 1,
    proposed_pitch: bool = False,
    proposed_pitch_threshold: float = 155.0,
):
    """Convert one block of audio, or hand it back untouched in pass-through mode.

    Args:
        received_data: The input block.
        f0_up_key, index_rate, protect, volume_envelope, f0_autotune,
        f0_autotune_strength, proposed_pitch, proposed_pitch_threshold:
            Forwarded positionally, in this order, to ``self.vc.on_request``.

    Returns:
        A 4-tuple ``(audio, volume, perf, None)``:

        * pass-through: the input itself, its RMS level and ``[0, 0, 0]``;
        * normal operation: the three values returned by ``self.vc.on_request``;
        * after a ``RuntimeError`` in the conversion: a single float32 zero
          sample, volume ``0`` and ``[0, 0, 0]`` (the error and its traceback
          are printed).
    """
    if not self.pass_through:
        try:
            # The conversion runs under the instance lock.
            with self.lock:
                converted = self.vc.on_request(
                    received_data,
                    f0_up_key,
                    index_rate,
                    protect,
                    volume_envelope,
                    f0_autotune,
                    f0_autotune_strength,
                    proposed_pitch,
                    proposed_pitch_threshold,
                )
            out_audio, level, timings = converted
            return out_audio, level, timings, None
        except RuntimeError as error:
            import traceback

            print(f"An error occurred during real-time voice conversion: {error}")
            print(traceback.format_exc())

            return np.zeros(1, dtype=np.float32), 0, [0, 0, 0], None

    # Pass-through: report the RMS level of the untouched input.
    rms = float(np.sqrt(np.mean(np.square(received_data), dtype=np.float32)))
    return received_data, rms, [0, 0, 0], None
None, + f0_method: str = "rmvpe", + embedder_model: str = None, + embedder_model_custom: str = None, + silent_threshold: int = 0, + vad_enabled: bool = False, + vad_sensitivity: int = 3, + vad_frame_ms: int = 30, + sid: int = 0, + # device: str = "cuda", + ): + self.sample_rate = SAMPLE_RATE + self.convert_buffer = None + self.pitch_buffer = None + self.pitchf_buffer = None + self.return_length = 0 + self.skip_head = 0 + self.silence_front = 0 + # Convert dB to RMS + self.input_sensitivity = 10 ** (silent_threshold / 20) + self.window_size = self.sample_rate // 100 + self.dtype = torch.float32 # torch.float16 if config.is_half else torch.float32 + + self.vad = ( + VADProcessor( + sensitivity_mode=vad_sensitivity, + sample_rate=self.sample_rate, + frame_duration_ms=vad_frame_ms, + ) + if vad_enabled + else None + ) + # Create conversion pipelines + self.pipeline = create_pipeline( + model_path, + index_path, + f0_method, + embedder_model, + embedder_model_custom, + # device, + sid, + ) + self.device = self.pipeline.device + # Resampling of inputs and outputs. 
+ self.resample_in = tat.Resample( + orig_freq=AUDIO_SAMPLE_RATE, new_freq=self.sample_rate, dtype=torch.float32 + ).to(self.device) + self.resample_out = tat.Resample( + orig_freq=self.pipeline.tgt_sr, + new_freq=AUDIO_SAMPLE_RATE, + dtype=torch.float32, + ).to(self.device) + + def realloc( + self, + block_frame: int, + extra_frame: int, + crossfade_frame: int, + sola_search_frame: int, + ): + # Calculate frame sizes based on DEVICE sample rate (f.e., 48000Hz) and convert to 16000Hz + block_frame_16k = int(block_frame / AUDIO_SAMPLE_RATE * self.sample_rate) + crossfade_frame_16k = int( + crossfade_frame / AUDIO_SAMPLE_RATE * self.sample_rate + ) + sola_search_frame_16k = int( + sola_search_frame / AUDIO_SAMPLE_RATE * self.sample_rate + ) + extra_frame_16k = int(extra_frame / AUDIO_SAMPLE_RATE * self.sample_rate) + + convert_size_16k = ( + block_frame_16k + + sola_search_frame_16k + + extra_frame_16k + + crossfade_frame_16k + ) + if ( + modulo := convert_size_16k % self.window_size + ) != 0: # Compensate for truncation due to hop size in model output. + convert_size_16k = convert_size_16k + (self.window_size - modulo) + self.convert_feature_size_16k = convert_size_16k // self.window_size + + self.skip_head = extra_frame_16k // self.window_size + self.return_length = self.convert_feature_size_16k - self.skip_head + self.silence_front = ( + extra_frame_16k - (self.window_size * 5) if self.silence_front else 0 + ) + # Audio buffer to measure volume between chunks + audio_buffer_size = block_frame_16k + crossfade_frame_16k + self.audio_buffer = torch.zeros( + audio_buffer_size, dtype=self.dtype, device=self.device + ) + # Audio buffer for conversion without silence + self.convert_buffer = torch.zeros( + convert_size_16k, dtype=self.dtype, device=self.device + ) + # Additional +1 is to compensate for pitch extraction algorithm + # that can output additional feature. 
+ self.pitch_buffer = torch.zeros( + self.convert_feature_size_16k + 1, dtype=torch.int64, device=self.device + ) + self.pitchf_buffer = torch.zeros( + self.convert_feature_size_16k + 1, dtype=self.dtype, device=self.device + ) + + def inference( + self, + audio_input: np.ndarray, + f0_up_key: int = 0, + index_rate: float = 0.5, + protect: float = 0.5, + volume_envelope: float = 1, + f0_autotune: bool = False, + f0_autotune_strength: float = 1, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + ): + if self.pipeline is None: + raise RuntimeError("Pipeline is not initialized.") + + # Input audio is always float32 + audio_input_16k = self.resample_in( + torch.as_tensor(audio_input, dtype=torch.float32, device=self.device) + ).to(self.dtype) + circular_write(audio_input_16k, self.audio_buffer) + + vol_t = torch.sqrt(torch.square(self.audio_buffer).mean()) + vol = max(vol_t.item(), 0) + + if self.vad is not None: + is_speech = self.vad.is_speech(audio_input_16k.cpu().numpy().copy()) + if not is_speech: + # Busy wait to keep power manager happy and clocks stable. Running pipeline on-demand seems to lag when the delay between + # voice changer activation is too high. + # https://forums.developer.nvidia.com/t/why-kernel-calculate-speed-got-slower-after-waiting-for-a-while/221059/9 + self.pipeline.voice_conversion( + self.convert_buffer, + self.pitch_buffer, + self.pitchf_buffer, + f0_up_key, + index_rate, + self.convert_feature_size_16k, + self.silence_front, + self.skip_head, + self.return_length, + protect, + volume_envelope, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + return None, vol + + if vol < self.input_sensitivity: + # Busy wait to keep power manager happy and clocks stable. Running pipeline on-demand seems to lag when the delay between + # voice changer activation is too high. 
+ # https://forums.developer.nvidia.com/t/why-kernel-calculate-speed-got-slower-after-waiting-for-a-while/221059/9 + self.pipeline.voice_conversion( + self.convert_buffer, + self.pitch_buffer, + self.pitchf_buffer, + f0_up_key, + index_rate, + self.convert_feature_size_16k, + self.silence_front, + self.skip_head, + self.return_length, + protect, + volume_envelope, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + + return None, vol + + circular_write(audio_input_16k, self.convert_buffer) + + audio_model = self.pipeline.voice_conversion( + self.convert_buffer, + self.pitch_buffer, + self.pitchf_buffer, + f0_up_key, + index_rate, + self.convert_feature_size_16k, + self.silence_front, + self.skip_head, + self.return_length, + protect, + volume_envelope, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + + audio_out: torch.Tensor = self.resample_out(audio_model * torch.sqrt(vol_t)) + return audio_out, vol + + def __del__(self): + del self.pipeline + + +class VoiceChanger: + def __init__( + self, + read_chunk_size: int, + cross_fade_overlap_size: float, + extra_convert_size: float, + model_path: str = None, + index_path: str = None, + f0_method: str = "rmvpe", + embedder_model: str = None, + embedder_model_custom: str = None, + silent_threshold: int = 0, + vad_enabled: bool = False, + vad_sensitivity: int = 3, + vad_frame_ms: int = 30, + sid: int = 0, + # device: str = "cuda", + ): + self.block_frame = read_chunk_size * 128 + self.crossfade_frame = int(cross_fade_overlap_size * AUDIO_SAMPLE_RATE) + self.extra_frame = int(extra_convert_size * AUDIO_SAMPLE_RATE) + self.sola_search_frame = AUDIO_SAMPLE_RATE // 100 + self.sola_buffer = None + self.vc_model = Realtime( + model_path, + index_path, + f0_method, + embedder_model, + embedder_model_custom, + silent_threshold, + vad_enabled, + vad_sensitivity, + vad_frame_ms, + sid, + # device + ) + self.device = self.vc_model.device + 
self.vc_model.realloc( + self.block_frame, + self.extra_frame, + self.crossfade_frame, + self.sola_search_frame, + ) + self.generate_strength() + + def generate_strength(self): + self.fade_in_window: torch.Tensor = ( + torch.sin( + 0.5 + * np.pi + * torch.linspace( + 0.0, + 1.0, + steps=self.crossfade_frame, + device=self.device, + dtype=torch.float32, + ) + ) + ** 2 + ) + + self.fade_out_window: torch.Tensor = 1 - self.fade_in_window + # The size will change from the previous result, so the record will be deleted. + self.sola_buffer = torch.zeros( + self.crossfade_frame, device=self.device, dtype=torch.float32 + ) + + def process_audio( + self, + audio_input: np.ndarray, + f0_up_key: int = 0, + index_rate: float = 0.5, + protect: float = 0.5, + volume_envelope: float = 1, + f0_autotune: bool = False, + f0_autotune_strength: float = 1, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + ): + block_size = audio_input.shape[0] + + audio, vol = self.vc_model.inference( + audio_input, + f0_up_key, + index_rate, + protect, + volume_envelope, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + + if audio is None: + # In case there's an actual silence - send full block with zeros + return np.zeros(block_size, dtype=np.float32), vol + + conv_input = audio[None, None, : self.crossfade_frame + self.sola_search_frame] + cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) + cor_den = torch.sqrt( + F.conv1d( + conv_input**2, + torch.ones(1, 1, self.crossfade_frame, device=self.device), + ) + + 1e-8 + ) + sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) + + audio = audio[sola_offset:] + audio[: self.crossfade_frame] *= self.fade_in_window + audio[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window + + self.sola_buffer[:] = audio[block_size : block_size + self.crossfade_frame] + return audio[:block_size].detach().cpu().numpy(), vol + + @torch.no_grad() + def on_request( + self, + 
audio_input: np.ndarray, + f0_up_key: int = 0, + index_rate: float = 0.5, + protect: float = 0.5, + volume_envelope: float = 1, + f0_autotune: bool = False, + f0_autotune_strength: float = 1, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + ): + if self.vc_model is None: + raise RuntimeError("Voice Changer is not selected.") + + start = ( + time.perf_counter() + ) # Using perf_counter to measure real-time voice conversion latency. + result, vol = self.process_audio( + audio_input, + f0_up_key, + index_rate, + protect, + volume_envelope, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + end = time.perf_counter() + + return result, vol, [0, (end - start) * 1000, 0] diff --git a/rvc/realtime/pipeline.py b/rvc/realtime/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..337e7e83431baa6c795a75946b27a1e9020479d5 --- /dev/null +++ b/rvc/realtime/pipeline.py @@ -0,0 +1,414 @@ +import os +import sys +import faiss +import numpy as np +import torch +import torch.nn.utils.parametrize +import torch.nn.functional as F +import torchaudio.transforms as tat +from torch import Tensor + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from rvc.realtime.utils.torch import circular_write +from rvc.configs.config import Config +from rvc.infer.pipeline import Autotune, AudioProcessor +from rvc.lib.algorithm.synthesizers import Synthesizer +from rvc.lib.predictors.f0 import FCPE, RMVPE, SWIFT +from rvc.lib.utils import load_embedding, HubertModelWithFinalProj + + +class RealtimeVoiceConverter: + """ + A class for performing realtime voice conversion using the Retrieval-Based Voice Conversion (RVC) method. + """ + + def __init__(self, weight_root): + """ + Initializes the RealtimeVoiceConverter with default configuration, and sets up models and parameters. 
+ """ + self.config = Config() # Load configuration + self.tgt_sr = None # Target sampling rate for the output audio + self.net_g = None # Generator network for voice conversion + self.cpt = None # Checkpoint for loading model weights + self.version = None # Model version + self.use_f0 = None # Whether the model uses F0 + # load weights and setup model network. + self.load_model(weight_root) + self.setup_network() + + def load_model(self, weight_root): + """ + Loads the model weights from the specified path. + + Args: + weight_root (str): Path to the model weights. + """ + self.cpt = ( + torch.load(weight_root, map_location="cpu", weights_only=True) + if os.path.isfile(weight_root) + else None + ) + + def setup_network(self): + """ + Sets up the network configuration based on the loaded checkpoint. + """ + if self.cpt is not None: + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] + self.use_f0 = self.cpt.get("f0", 1) + + self.version = self.cpt.get("version", "v1") + self.text_enc_hidden_dim = 768 if self.version == "v2" else 256 + self.vocoder = self.cpt.get("vocoder", "HiFi-GAN") + self.net_g = Synthesizer( + *self.cpt["config"], + use_f0=self.use_f0, + text_enc_hidden_dim=self.text_enc_hidden_dim, + vocoder=self.vocoder, + ) + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + strip_parametrizations(self.net_g) + self.net_g = self.net_g.to(self.config.device).float() + self.net_g.eval() + # self.net_g.remove_weight_norm() + + def inference( + self, + feats: Tensor, + p_len: Tensor, + sid: Tensor, + pitch: Tensor, + pitchf: Tensor, + ): + output = self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] + + return torch.clip(output, -1.0, 1.0, out=output) + + +class Realtime_Pipeline: + def __init__( + self, + vc: RealtimeVoiceConverter, + hubert_model: HubertModelWithFinalProj = None, + index=None, + big_npy=None, + f0_method: str = "rmvpe", + sid: int = 0, + ): + self.vc = vc + 
self.hubert_model = hubert_model + self.index = index + self.big_npy = big_npy + self.use_f0 = vc.use_f0 + self.version = vc.version + self.f0_method = f0_method + self.sample_rate = 16000 + self.tgt_sr = vc.tgt_sr + self.window = 160 + self.model_window = self.tgt_sr // 100 + self.f0_min = 50.0 + self.f0_max = 1100.0 + self.device = vc.config.device + self.sid = torch.tensor([sid], device=self.device, dtype=torch.int64) + self.autotune = Autotune() + self.resamplers = {} + self.f0_model = None + + def get_f0( + self, + x: Tensor, + pitch: Tensor = None, + pitchf: Tensor = None, + f0_up_key: int = 0, + f0_autotune: bool = False, + f0_autotune_strength: float = 1.0, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + ): + """ + Estimates the fundamental frequency (F0) of a given audio signal using various methods. + """ + + if torch.is_tensor(x): + # If the input is a tensor, it will need to be converted to numpy array to calculate with RMVPE and FCPE. + x = x.cpu().numpy() + + if self.f0_method == "rmvpe": + if self.f0_model is None: + self.f0_model = RMVPE( + device=self.device, + sample_rate=self.sample_rate, + hop_size=self.window, + ) + f0 = self.f0_model.get_f0(x, filter_radius=0.03) + elif self.f0_method == "fcpe": + if self.f0_model is None: + self.f0_model = FCPE( + device=self.device, + sample_rate=self.sample_rate, + hop_size=self.window, + ) + f0 = self.f0_model.get_f0(x, x.shape[0] // self.window, filter_radius=0.006) + elif self.f0_method == "swift": + if self.f0_model is None: + self.f0_model = SWIFT( + device=self.device, + sample_rate=self.sample_rate, + hop_size=self.window, + ) + f0 = self.f0_model.get_f0( + x, + self.f0_min, + self.f0_max, + x.shape[0] // self.window, + confidence_threshold=0.887, + ) + + # f0 adjustments + if f0_autotune is True: + f0 = self.autotune.autotune_f0(f0, f0_autotune_strength) + elif proposed_pitch is True: + limit = 12 + # calculate median f0 of the audio + valid_f0 = np.where(f0 > 0)[0] + if 
len(valid_f0) < 2: + # no valid f0 detected + up_key = 0 + else: + median_f0 = float( + np.median(np.interp(np.arange(len(f0)), valid_f0, f0[valid_f0])) + ) + if median_f0 <= 0 or np.isnan(median_f0): + up_key = 0 + else: + # calculate proposed shift + up_key = max( + -limit, + min( + limit, + int( + np.round( + 12 * np.log2(proposed_pitch_threshold / median_f0) + ) + ), + ), + ) + print( + "calculated pitch offset:", up_key + ) # Might need to hide so terminal output doesn't become a mess + f0 *= pow(2, (f0_up_key + up_key) / 12) + else: + f0 *= pow(2, f0_up_key / 12) + + # Convert to Tensor for computational use + f0 = torch.from_numpy(f0).to(self.device).float() + + # quantizing f0 to 255 buckets to make coarse f0 + f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0) + f0_mel = torch.clip( + (f0_mel - self.f0_min) * 254 / (self.f0_max - self.f0_min) + 1, + 1, + 255, + out=f0_mel, + ) + f0_coarse = torch.round(f0_mel, out=f0_mel).long() + + if pitch is not None and pitchf is not None: + circular_write(f0_coarse, pitch) + circular_write(f0, pitchf) + else: + pitch = f0_coarse + pitchf = f0 + + return pitch.unsqueeze(0), pitchf.unsqueeze(0) + + def voice_conversion( + self, + audio: Tensor, + pitch: Tensor = None, + pitchf: Tensor = None, + f0_up_key: int = 0, + index_rate: float = 0.5, + p_len: int = 0, + silence_front: int = 0, + skip_head: int = None, + return_length: int = None, + protect: float = 0.5, + volume_envelope: float = 1, + f0_autotune: bool = False, + f0_autotune_strength: float = 1, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + ): + """ + Performs realtime voice conversion on a given audio segment. 
+ """ + assert audio.dim() == 1, audio.dim() + feats = audio.view(1, -1).to(self.device) + + formant_length = int(np.ceil(return_length * 1.0)) + + pitch, pitchf = ( + self.get_f0( + audio[silence_front:], + pitch, + pitchf, + f0_up_key, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + if self.use_f0 + else (None, None) + ) + + # extract features + feats = self.hubert_model(feats)["last_hidden_state"] + feats = ( + self.hubert_model.final_proj(feats[0]).unsqueeze(0) + if self.version == "v1" + else feats + ) + + feats = torch.cat((feats, feats[:, -1:, :]), 1) + # make a copy for pitch guidance and protection + feats0 = feats.detach().clone() if self.use_f0 else None + + if ( + self.index + ): # set by parent function, only true if index is available, loaded, and index rate > 0 + feats = self._retrieve_speaker_embeddings( + skip_head, feats, self.index, self.big_npy, index_rate + ) + # feature upsampling + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)[ + :, :p_len, : + ] + + if self.use_f0: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + )[:, :p_len, :] + pitch, pitchf = pitch[:, -p_len:], pitchf[:, -p_len:] * ( + formant_length / return_length + ) + + # Pitch protection blending + if protect < 0.5: + pitchff = pitchf.detach().clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + feats = feats * pitchff.unsqueeze(-1) + feats0 * ( + 1 - pitchff.unsqueeze(-1) + ) + feats = feats.to(feats0.dtype) + else: + pitch, pitchf = None, None + + p_len = torch.tensor([p_len], device=self.device, dtype=torch.int64) + out_audio = self.vc.inference(feats, p_len, self.sid, pitch, pitchf).float() + if volume_envelope != 1: + out_audio = AudioProcessor.change_rms( + audio, self.sample_rate, out_audio, self.tgt_sr, volume_envelope + ) + + scaled_window = int(np.floor(1.0 * self.model_window)) + + if scaled_window != self.model_window: + if scaled_window not in 
self.resamplers: + self.resamplers[scaled_window] = tat.Resample( + orig_freq=scaled_window, + new_freq=self.model_window, + dtype=torch.float32, + ).to(self.device) + out_audio = self.resamplers[scaled_window]( + out_audio[: return_length * scaled_window] + ) + + return out_audio + + def _retrieve_speaker_embeddings( + self, skip_head, feats, index, big_npy, index_rate + ): + skip_offset = skip_head // 2 + npy = feats[0][skip_offset:].cpu().numpy() + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + feats[0][skip_offset:] = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats[0][skip_offset:] + ) + return feats + + +def load_faiss_index(file_index): + if file_index != "" and os.path.exists(file_index): + try: + index = faiss.read_index(file_index) + big_npy = index.reconstruct_n(0, index.ntotal) + except Exception as error: + print(f"An error occurred reading the FAISS index: {error}") + index = big_npy = None + else: + index = big_npy = None + + return index, big_npy + + +def create_pipeline( + model_path: str = None, + index_path: str = None, + f0_method: str = "rmvpe", + embedder_model: str = None, + embedder_model_custom: str = None, + # device: str = "cuda", + sid: int = 0, +): + """ + Initialize real-time voice conversion pipeline. 
+ """ + + vc = RealtimeVoiceConverter(model_path) + index, big_npy = load_faiss_index( + index_path.strip() + .strip('"') + .strip("\n") + .strip('"') + .strip() + .replace("trained", "added") + ) + + hubert_model = load_embedding(embedder_model, embedder_model_custom) + hubert_model = hubert_model.to(vc.config.device).float() + hubert_model.eval() + + pipeline = Realtime_Pipeline( + vc, + hubert_model, + index, + big_npy, + f0_method, + sid, + ) + + return pipeline + + +def strip_parametrizations(module: torch.nn.Module): + """ + Remove all parametrizations (e.g., weight norm) from a module and log each removal. + """ + for name, submodule in module.named_modules(): + if hasattr(submodule, "parametrizations"): + for pname, plist in list(submodule.parametrizations.items()): + # print(f"Removing parametrizations from {name}.{pname}: {[p.__class__.__name__ for p in plist]}") + torch.nn.utils.parametrize.remove_parametrizations( + submodule, pname, leave_parametrized=True + ) diff --git a/rvc/realtime/utils/torch.py b/rvc/realtime/utils/torch.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae22e95f43a9e92da88c2f1128ba8960a78920d --- /dev/null +++ b/rvc/realtime/utils/torch.py @@ -0,0 +1,8 @@ +import torch + + +def circular_write(new_data: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + offset = new_data.shape[0] + target[:-offset] = target[offset:].detach().clone() + target[-offset:] = new_data + return target diff --git a/rvc/realtime/utils/vad.py b/rvc/realtime/utils/vad.py new file mode 100644 index 0000000000000000000000000000000000000000..670169c61119bad57f807b8862ba561b75d531a0 --- /dev/null +++ b/rvc/realtime/utils/vad.py @@ -0,0 +1,85 @@ +import webrtcvad +import numpy as np + + +class VADProcessor: + def __init__(self, sensitivity_mode=3, sample_rate=16000, frame_duration_ms=30): + """ + Initializes the VADProcessor. + + Args: + sensitivity_mode (int): VAD sensitivity (0-3). 3 is most aggressive. 
+ sample_rate (int): Sample rate of the audio. Must be 8000, 16000, 32000, or 48000 Hz. + WebRTC VAD internally works best with 16000 Hz. + frame_duration_ms (int): Duration of each audio frame in ms. Must be 10, 20, or 30. + """ + + if sample_rate not in [8000, 16000, 32000, 48000]: + raise ValueError("VAD sample rate must be 8000, 16000, 32000, or 48000 Hz") + if frame_duration_ms not in [10, 20, 30]: + raise ValueError("VAD frame duration must be 10, 20, or 30 ms") + + self.vad = webrtcvad.Vad(sensitivity_mode) + self.sample_rate = sample_rate + self.frame_length = int(sample_rate * (frame_duration_ms / 1000.0)) + # print(f"VAD Initialized: SR={sample_rate}, Frame Duration={frame_duration_ms}ms, Frame Length={self.frame_length} samples") + + def is_speech(self, audio_chunk_float32): + """ + Detects if the given audio chunk contains speech. + + Args: + audio_chunk_float32 (np.ndarray): A chunk of audio data in float32 format, mono. + The sample rate must match the one VAD was initialized with. + + Returns: + bool: True if speech is detected in the chunk, False otherwise. + """ + + if audio_chunk_float32.ndim > 1 and audio_chunk_float32.shape[1] == 1: + audio_chunk_float32 = audio_chunk_float32.flatten() + elif audio_chunk_float32.ndim > 1: + # If stereo, average to mono. This is a simple approach. + # For better results, ensure mono input from the source. + print("VAD Warning: Received stereo audio, averaging to mono.") + audio_chunk_float32 = np.mean(audio_chunk_float32, axis=1) + + # Convert float32 audio to int16 PCM + # WebRTC VAD expects 16-bit linear PCM audio. + if np.max(np.abs(audio_chunk_float32)) > 1.0: + # print( + # f"VAD Warning: Input audio chunk has values outside [-1.0, 1.0]: min={np.min(audio_chunk_float32)}, max={np.max(audio_chunk_float32)}. Clipping." 
+ # ) + audio_chunk_float32 = np.clip(audio_chunk_float32, -1.0, 1.0) + + audio_chunk_int16 = (audio_chunk_float32 * 32767).astype(np.int16) + + num_frames = len(audio_chunk_int16) // self.frame_length + if num_frames == 0 and len(audio_chunk_int16) > 0: + # If the chunk is smaller than one frame, pad it for VAD analysis + # This might not be ideal but handles small initial chunks + padding = np.zeros( + self.frame_length - len(audio_chunk_int16), dtype=np.int16 + ) + audio_chunk_int16 = np.concatenate((audio_chunk_int16, padding)) + num_frames = 1 + elif num_frames == 0 and len(audio_chunk_int16) == 0: + return False # Empty chunk + + try: + for i in range(num_frames): + start = i * self.frame_length + end = start + self.frame_length + frame = audio_chunk_int16[start:end] + # The VAD expects bytes, not a NumPy array. + if self.vad.is_speech(frame.tobytes(), self.sample_rate): + return True # Speech detected in at least one frame + return False # No speech detected in any frame + except Exception as e: + # webrtcvad can sometimes throw "Error talking to VAD" or similar + # if frame length is not perfect. + print( + f"VAD processing error: {e}. Chunk length: {len(audio_chunk_int16)}, Frame length: {self.frame_length}" + ) + # Fallback: assume no speech on error to avoid processing noise + return False diff --git a/rvc/train/anyprecision_optimizer.py b/rvc/train/anyprecision_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..40c50d542207bf809e6cf0a9ea77a4b30c583646 --- /dev/null +++ b/rvc/train/anyprecision_optimizer.py @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# AnyPrecisionAdamW: a flexible precision AdamW optimizer +# with optional Kahan summation for high precision weight updates. 
+# Allows direct control over momentum, variance and auxiliary compensation +# buffer dtypes. +# Optional Kahan summation is used to offset precision reduction for +# the weight updates. This allows full training in BFloat16 (equal or +# better than FP32 results in many cases) due to high precision weight upates. + +import torch +from torch.optim.optimizer import Optimizer + + +class AnyPrecisionAdamW(Optimizer): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0.0, + use_kahan_summation=True, + momentum_dtype=torch.bfloat16, + variance_dtype=torch.bfloat16, + compensation_buffer_dtype=torch.bfloat16, + ): + """ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + + # Any Precision specific + use_kahan_summation = creates auxiliary buffer to ensure high precision + model param updates (default: False) + momentum_dtype = dtype for momentum (default: BFloat32) + variance_dtype = dtype for uncentered variance (default: BFloat16) + compensation_buffer_dtype = dtype for Kahan summation + buffer (default: BFloat16). Only used if + ``use_kahan_summation=True``. + + # Usage + This optimizer implements optimizer states, and Kahan summation + for high precision updates, all in user controlled dtypes. + Defaults are variance in BF16, Momentum in FP32. + This can be run in FSDP mixed precision, amp, or full precision, + depending on what training pipeline you wish to work with. 
+ + Setting to use_kahan_summation = False, and changing momentum and + variance dtypes to FP32, reverts this to a standard AdamW optimizer. + """ + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + use_kahan_summation=use_kahan_summation, + momentum_dtype=momentum_dtype, + variance_dtype=variance_dtype, + compensation_buffer_dtype=compensation_buffer_dtype, + ) + + super().__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + + if closure is not None: + with torch.enable_grad(): + # to fix linter, we do not keep the returned loss for use atm. + closure() + + for group in self.param_groups: + + beta1, beta2 = group["betas"] + lr = group["lr"] + weight_decay = group["weight_decay"] + eps = group["eps"] + use_kahan_summation = group["use_kahan_summation"] + + momentum_dtype = group["momentum_dtype"] + variance_dtype = group["variance_dtype"] + compensation_buffer_dtype = group["compensation_buffer_dtype"] + + for p in group["params"]: + if p.grad is None: + continue + + if p.grad.is_sparse: + raise RuntimeError( + "AnyPrecisionAdamW does not support sparse gradients" + ) + + state = self.state[p] + + # State initialization + if len(state) == 0: + + state["step"] = torch.tensor(0.0) + + # momentum - EMA of gradient values + state["exp_avg"] = torch.zeros_like( + p, + dtype=momentum_dtype, + ) + + # variance uncentered - EMA of squared gradient values + state["exp_avg_sq"] = torch.zeros_like( + p, + dtype=variance_dtype, + ) + + # optional Kahan summation - accumulated error tracker + if use_kahan_summation: + state["compensation"] = torch.zeros_like( + p, + dtype=compensation_buffer_dtype, + ) + + # main processing ------------------------- + + # update the steps for each param group update + state["step"] += 1 + step = state["step"] + + exp_avg = 
state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + + grad = p.grad + + # weight decay, AdamW style + if weight_decay: + p.data.mul_(1 - lr * weight_decay) + + # update momentum + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + + # update uncentered variance + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + # adjust using bias1 + bias_correction1 = 1 - beta1**step + + step_size = lr / bias_correction1 + + # adjust using bias2 + denom_correction = (1 - beta2**step) ** 0.5 # avoids math import + + centered_variance = (exp_avg_sq.sqrt() / denom_correction).add_( + eps, alpha=1 + ) + + # lr update to compensation + if use_kahan_summation: + compensation = state["compensation"] + + compensation.addcdiv_(exp_avg, centered_variance, value=-step_size) + + # update weights with compensation (Kahan summation) + # save error back to compensation for next iteration + temp_buffer = p.detach().clone() + p.data.add_(compensation) + compensation.add_(temp_buffer.sub_(p.data)) + + else: + # usual AdamW updates + p.data.addcdiv_(exp_avg, centered_variance, value=-step_size) diff --git a/rvc/train/data_utils.py b/rvc/train/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7b33d718030a702c347766ab8619f77859790c2c --- /dev/null +++ b/rvc/train/data_utils.py @@ -0,0 +1,379 @@ +import os +import numpy as np +import torch +import torch.utils.data + +from mel_processing import spectrogram_torch +from utils import load_filepaths_and_text, load_wav_to_torch + + +class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): + """ + Dataset that loads text and audio pairs. + + Args: + hparams: Hyperparameters. 
+ """ + + def __init__(self, hparams): + self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files) + self.max_wav_value = hparams.max_wav_value + self.sample_rate = hparams.sample_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sample_rate = hparams.sample_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filters audio paths and text pairs based on text length. + """ + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + """ + Converts speaker ID to a LongTensor. + + Args: + sid (str): Speaker ID. + """ + try: + sid = torch.LongTensor([int(sid)]) + except ValueError as error: + print(f"Error converting speaker ID '{sid}' to integer. Exception: {error}") + sid = torch.LongTensor([0]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + """ + Loads and processes audio and text data for a single pair. + + Args: + audiopath_and_text (list): List containing audio path, text, pitch, pitchf, and speaker ID. 
def get_audio(self, filename):
    """
    Load a waveform and its linear spectrogram, using an on-disk cache.

    The spectrogram is cached next to the audio file as ``<stem>.spec.pt``.
    When the cache is missing or cannot be loaded, the spectrogram is
    recomputed with ``spectrogram_torch`` and the cache is (re)written.

    Args:
        filename (str): Path to audio file.

    Returns:
        tuple: ``(spec, audio_norm)`` where ``audio_norm`` is the loaded audio
        with a leading dimension added by ``unsqueeze(0)`` and ``spec`` is the
        cached tensor or the freshly computed spectrogram with dim 0 squeezed.

    Raises:
        ValueError: If the file's sample rate differs from ``self.sample_rate``.
    """
    audio, sample_rate = load_wav_to_torch(filename)
    if sample_rate != self.sample_rate:
        raise ValueError(
            f"{sample_rate} SR doesn't match target {self.sample_rate} SR"
        )
    audio_norm = audio.unsqueeze(0)

    # Swap only the final extension. A blanket str.replace(".wav", ...) also
    # rewrites ".wav" inside directory names and leaves names without ".wav"
    # unchanged, in which case the cache path would be the audio file itself
    # and torch.save would overwrite it.
    spec_filename = os.path.splitext(filename)[0] + ".spec.pt"

    spec = None
    if os.path.exists(spec_filename):
        try:
            spec = torch.load(spec_filename, weights_only=True)
        except Exception as error:
            # Unreadable cache: report it and fall through to recompute.
            print(f"An error occurred getting spec from {spec_filename}: {error}")

    if spec is None:
        spec = spectrogram_torch(
            audio_norm,
            self.filter_length,
            self.hop_length,
            self.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
    return spec, audio_norm
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Distributed sampler that groups data into buckets based on length.

    Sample ``i`` goes to bucket ``b`` when
    ``boundaries[b] < lengths[i] <= boundaries[b + 1]``; samples outside every
    interval are dropped. Each bucket is padded by repeating its own indices
    so that it splits evenly into batches across all replicas.

    Args:
        dataset (torch.utils.data.Dataset): Dataset to sample from; must expose ``lengths``.
        batch_size (int): Batch size.
        boundaries (list): List of length boundaries for buckets. Not modified.
        num_replicas (int, optional): Number of processes participating in distributed training. Defaults to None.
        rank (int, optional): Rank of the current process. Defaults to None.
        shuffle (bool, optional): Whether to shuffle the data. Defaults to True.
    """

    def __init__(
        self,
        dataset,
        batch_size,
        boundaries,
        num_replicas=None,
        rank=None,
        shuffle=True,
    ):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        # Work on a private copy: _create_buckets() removes the boundaries of
        # empty buckets in place, which must not leak into the caller's list.
        self.boundaries = list(boundaries)

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        """
        Creates buckets of data samples based on length.

        Returns:
            tuple: ``(buckets, num_samples_per_bucket)`` where each bucket is a
            list of dataset indices and each count is the bucket size rounded
            up to a multiple of ``num_replicas * batch_size``.
        """
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for index, length in enumerate(self.lengths):
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(index)

        # Walk backwards so popping does not shift the indices still to visit.
        for i in range(len(buckets) - 1, -1, -1):
            if not buckets[i]:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        total_batch_size = self.num_replicas * self.batch_size
        num_samples_per_bucket = []
        for bucket in buckets:
            len_bucket = len(bucket)
            rem = (
                total_batch_size - (len_bucket % total_batch_size)
            ) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket

    def __iter__(self):
        """
        Iterates over batches of data samples.

        The generator is seeded with ``self.epoch``, so the order is
        reproducible per epoch. Each yielded item is a list of dataset indices.
        """
        g = torch.Generator()
        g.manual_seed(self.epoch)

        if self.shuffle:
            indices = [
                torch.randperm(len(bucket), generator=g).tolist()
                for bucket in self.buckets
            ]
        else:
            indices = [list(range(len(bucket))) for bucket in self.buckets]

        batches = []
        for bucket, ids_bucket, num_samples_bucket in zip(
            self.buckets, indices, self.num_samples_per_bucket
        ):
            len_bucket = len(bucket)

            # Pad the bucket up to its target size by repeating its own ids.
            rem = num_samples_bucket - len_bucket
            ids_bucket = (
                ids_bucket
                + ids_bucket * (rem // len_bucket)
                + ids_bucket[: (rem % len_bucket)]
            )

            # Keep only this replica's share.
            ids_bucket = ids_bucket[self.rank :: self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [
                    bucket[idx]
                    for idx in ids_bucket[
                        j * self.batch_size : (j + 1) * self.batch_size
                    ]
                ]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """
        Performs binary search to find the bucket index for a given length.

        Args:
            x (int): Length to find the bucket for.
            lo (int, optional): Lower bound of the search range. Defaults to 0.
            hi (int, optional): Upper bound of the search range. Defaults to None.

        Returns:
            int: Bucket index ``mid`` with
            ``boundaries[mid] < x <= boundaries[mid + 1]``, or -1 if none.
        """
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1

    def __len__(self):
        """
        Returns the number of batches produced per epoch for this replica.
        """
        return self.num_samples // self.batch_size
def run_pitch_extraction(files, devices, f0_method, threads):
    """
    Run pitch extraction with one worker process per device.

    ``files`` is split round-robin across ``devices``; each share is handed to
    ``process_files`` in its own process. Worker failures are printed rather
    than raised, so one failing device does not abort the others.

    Args:
        files (list): File-info entries to process.
        devices (list): Device strings; one process is started per entry.
        f0_method (str): Name of the pitch extraction method.
        threads (int): Total thread budget, divided evenly between devices.
    """
    devices_str = ", ".join(devices)
    print(f"Starting pitch extraction on {devices_str} using {f0_method}...")
    start_time = time.time()

    with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor:
        tasks = [
            executor.submit(
                process_files,
                files[i :: len(devices)],
                f0_method,
                devices[i],
                threads // len(devices),
            )
            for i in range(len(devices))
        ]
        concurrent.futures.wait(tasks)

    # wait() never re-raises; inspect each future so a crashed worker is
    # reported instead of being silently counted as finished.
    for device, task in zip(devices, tasks):
        error = task.exception()
        if error is not None:
            print(f"Pitch extraction worker on {device} failed: {error}")

    print(f"Pitch extraction completed in {time.time() - start_time:.2f} seconds.")
def run_embedding_extraction(
    files, devices, embedder_model, embedder_model_custom, threads
):
    """
    Run embedding extraction with one worker process per device.

    ``files`` is split round-robin across ``devices``; each share is handed to
    ``process_file_embedding`` in its own process. Worker failures are printed
    rather than raised.

    Args:
        files (list): File-info entries to process.
        devices (list): Device strings; one process is started per entry.
        embedder_model (str): Embedder model name passed to each worker.
        embedder_model_custom (str | None): Custom embedder passed to each worker.
        threads (int): Total thread budget, divided evenly between devices.
    """
    devices_str = ", ".join(devices)
    # Report the value actually passed in; the previous message read a
    # module-level name that only exists when this file runs as a script.
    print(f"Starting embedding extraction with {threads} cores on {devices_str}...")
    start_time = time.time()
    with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor:
        tasks = [
            executor.submit(
                process_file_embedding,
                files[i :: len(devices)],
                embedder_model,
                embedder_model_custom,
                i,
                devices[i],
                threads // len(devices),
            )
            for i in range(len(devices))
        ]
        concurrent.futures.wait(tasks)

    # wait() never re-raises; surface worker crashes explicitly.
    for device, task in zip(devices, tasks):
        error = task.exception()
        if error is not None:
            print(f"Embedding extraction worker on {device} failed: {error}")

    print(f"Embedding extraction completed in {time.time() - start_time:.2f} seconds.")
def generate_filelist(model_path: str, sample_rate: int, include_mutes: int = 2):
    """
    Write ``filelist.txt`` for a model directory and record its speaker count.

    A sample is listed only when its stem (file name up to the first ``.``) is
    present in all four of ``sliced_audios``, ``extracted``, ``f0`` and
    ``f0_voiced``. Each line has the form
    ``wav|feature|f0|f0_voiced|sid`` where ``sid`` is the stem's prefix before
    the first ``_``. The lines are shuffled before being written.

    Args:
        model_path: Model directory containing the four sub-directories.
        sample_rate: Used to pick the mute audio file name (``mute{sample_rate}.wav``).
        include_mutes: Number of mute entries appended per speaker ID;
            0 or less appends none.

    Side effects:
        Writes ``filelist.txt`` and sets ``speakers_id`` in ``model_info.json``
        (creating the file if missing, keeping its other keys).
    """
    gt_wavs_dir = os.path.join(model_path, "sliced_audios")
    feature_dir = os.path.join(model_path, "extracted")
    f0_dir = os.path.join(model_path, "f0")
    f0nsf_dir = os.path.join(model_path, "f0_voiced")
    model_info_path = os.path.join(model_path, "model_info.json")

    def _stems(directory):
        # Stem = text before the first dot, so "x.wav" and "x.wav.npy" match.
        return {entry.split(".")[0] for entry in os.listdir(directory)}

    names = (
        _stems(gt_wavs_dir)
        & _stems(feature_dir)
        & _stems(f0_dir)
        & _stems(f0nsf_dir)
    )

    options = []
    sids = []  # first-seen order, one entry per speaker
    seen_sids = set()  # O(1) membership companion to `sids`
    for name in names:
        sid = name.split("_")[0]
        if sid not in seen_sids:
            seen_sids.add(sid)
            sids.append(sid)
        options.append(
            f"{os.path.join(gt_wavs_dir, name)}.wav|{os.path.join(feature_dir, name)}.npy|{os.path.join(f0_dir, name)}.wav.npy|{os.path.join(f0nsf_dir, name)}.wav.npy|{sid}"
        )

    if include_mutes > 0:
        # The embedder only decides which mute assets to reference, so it is
        # looked up only when mutes are requested. A missing, unreadable or
        # malformed model_info.json falls back to the default embedder.
        try:
            with open(model_info_path, "r") as f:
                embedder_name = json.load(f)["embedder_model"]
        except (OSError, ValueError, KeyError, TypeError):
            embedder_name = "contentvec"

        if embedder_name == "spin":
            mute_dir_name = "mute_spin"
        elif embedder_name == "spin-v2":
            mute_dir_name = "mute_spin-v2"
        else:
            mute_dir_name = "mute"
        mute_base_path = os.path.join(current_directory, "logs", mute_dir_name)

        mute_audio_path = os.path.join(
            mute_base_path, "sliced_audios", f"mute{sample_rate}.wav"
        )
        mute_feature_path = os.path.join(mute_base_path, "extracted", "mute.npy")
        mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy")
        mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy")

        # adding x files per sid
        for sid in sids * include_mutes:
            options.append(
                f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}"
            )

    if os.path.exists(model_info_path):
        with open(model_info_path, "r") as f:
            data = json.load(f)
    else:
        data = {}
    data.update(
        {
            "speakers_id": len(sids),
        }
    )
    with open(model_info_path, "w") as f:
        json.dump(data, f, indent=4)

    shuffle(options)

    with open(os.path.join(model_path, "filelist.txt"), "w") as f:
        f.write("\n".join(options))
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """
    Compute the least-squares discriminator loss over paired outputs.

    For every (real, generated) pair the term
    ``mean((1 - real)^2) + mean(generated^2)`` is added, after casting both
    tensors to float.

    Args:
        disc_real_outputs (list of torch.Tensor): List of discriminator outputs for real samples.
        disc_generated_outputs (list of torch.Tensor): List of discriminator outputs for generated samples.

    Returns:
        tuple: ``(loss, r_losses, g_losses)``. The two lists are always empty;
        per-discriminator values are not collected.
    """
    total = 0
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = (1 - real_out.float()).pow(2).mean()
        fake_term = fake_out.float().pow(2).mean()
        total = total + (real_term + fake_term)

    return total, [], []
def generator_loss_scaled(disc_outputs, scale=1.0):
    """
    Compute the scaled least-squares generator loss.

    Each output contributes ``mean((1 - d)^2)``; contributions whose index is
    at or past ``len(disc_outputs) // 2`` are multiplied by ``scale``.

    Args:
        disc_outputs (list of torch.Tensor): List of discriminator outputs for generated samples.
        scale (float, optional): Scaling factor applied to losses from the midpoint onward. Default is 1.0.

    Returns:
        tuple: ``(loss, None, None)``.
    """
    half = len(disc_outputs) // 2
    total = 0
    for position, fake_out in enumerate(disc_outputs):
        term = torch.mean((1 - fake_out) ** 2)
        total = total + (term * scale if position >= half else term)
    return total, None, None
def spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax):
    """
    Convert a spectrogram to a log mel-spectrogram.

    The mel filterbank is built once per distinct combination of filterbank
    parameters, dtype and device, and cached in the module-level ``mel_basis``.

    Args:
        spec (torch.Tensor): Magnitude spectrogram.
        n_fft (int): FFT window size.
        num_mels (int): Number of mel frequency bins.
        sample_rate (int): Sampling rate of the audio signal.
        fmin (float): Minimum frequency.
        fmax (float): Maximum frequency.

    Returns:
        torch.Tensor: ``log(clamp(mel_basis @ spec, min=1e-5))``.
    """
    global mel_basis
    # Key on every input that shapes the filterbank. Keying on fmax alone
    # returned a stale basis when n_fft, num_mels, sample_rate or fmin changed
    # between calls.
    cache_key = (
        n_fft,
        num_mels,
        sample_rate,
        fmin,
        fmax,
        str(spec.dtype),
        str(spec.device),
    )
    if cache_key not in mel_basis:
        mel = librosa_mel_fn(
            sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[cache_key] = torch.from_numpy(mel).to(
            dtype=spec.dtype, device=spec.device
        )

    melspec = torch.matmul(mel_basis[cache_key], spec)
    # Clamp before the log so silent bins do not produce -inf.
    melspec = torch.log(melspec.clamp(min=1e-5))
    return melspec
def compute_window_length(n_mels: int, sample_rate: int):
    """
    Derive a power-of-two window length from a mel-bin count.

    The window lasts ``8 * n_mels / (sample_rate / 2)`` seconds; that duration
    is converted to whole samples and rounded down to a power of two.

    Args:
        n_mels: Number of mel bins.
        sample_rate: Sampling rate in Hz.

    Returns:
        ``2 ** (bit_length(samples) - 1)`` for the truncated sample count.
    """
    nyquist = sample_rate / 2
    duration_seconds = 8 * n_mels / nyquist
    samples = int(duration_seconds * sample_rate)
    highest_bit = samples.bit_length() - 1
    return 2**highest_bit
torch.from_numpy( + librosa_mel_fn( + sr=self.sample_rate, + n_mels=n_mels, + n_fft=window_length, + fmin=0, + fmax=None, + ) + ).to(device=wav.device, dtype=torch.float32) + + mel_spectrogram = torch.matmul( + self.mel_banks[mel_dtype_device], magnitude + ) # torch(B, n_mels, stft.frames) + return mel_spectrogram + + def forward( + self, real: torch.Tensor, fake: torch.Tensor + ): # real: torch(B, 1, T) , fake: torch(B, 1, T) + loss = 0.0 + for p in self.stft_params: + real_mels = self.mel_spectrogram(real, *p) + fake_mels = self.mel_spectrogram(fake, *p) + real_logmels = torch.log(real_mels.clamp(min=1e-5)) / self.log_base + fake_logmels = torch.log(fake_mels.clamp(min=1e-5)) / self.log_base + loss += self.loss_fn(real_logmels, fake_logmels) + return loss diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c5a3f5d7bf1648528f835d452ffd0d064da18369 --- /dev/null +++ b/rvc/train/preprocess/preprocess.py @@ -0,0 +1,368 @@ +import os +import sys +import time +from scipy import signal +from scipy.io import wavfile +import numpy as np +import concurrent.futures +from tqdm import tqdm +import json +from distutils.util import strtobool +import librosa +import multiprocessing +import noisereduce as nr +import soxr + +now_directory = os.getcwd() +sys.path.append(now_directory) + +from rvc.lib.utils import load_audio +from rvc.train.preprocess.slicer import Slicer + +import logging + +logging.getLogger("numba.core.byteflow").setLevel(logging.WARNING) +logging.getLogger("numba.core.ssa").setLevel(logging.WARNING) +logging.getLogger("numba.core.interpreter").setLevel(logging.WARNING) + +OVERLAP = 0.3 +PERCENTAGE = 3.0 +MAX_AMPLITUDE = 0.9 +ALPHA = 0.75 +HIGH_PASS_CUTOFF = 48 +SAMPLE_RATE_16K = 16000 +RES_TYPE = "soxr_vhq" + + +class PreProcess: + def __init__(self, sr: int, exp_dir: str): + self.slicer = Slicer( + sr=sr, + threshold=-42, + min_length=1500, + min_interval=400, 
+ hop_size=15, + max_sil_kept=500, + ) + self.sr = sr + self.b_high, self.a_high = signal.butter( + N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr + ) + self.exp_dir = exp_dir + self.device = "cpu" + self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios") + self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k") + os.makedirs(self.gt_wavs_dir, exist_ok=True) + os.makedirs(self.wavs16k_dir, exist_ok=True) + + def _normalize_audio(self, audio: np.ndarray): + tmp_max = np.abs(audio).max() + if tmp_max > 2.5: + return None + return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio + + def process_audio_segment( + self, + normalized_audio: np.ndarray, + sid: int, + idx0: int, + idx1: int, + normalization_mode: str, + ): + if normalized_audio is None: + print(f"{sid}-{idx0}-{idx1}-filtered") + return + if normalization_mode == "post": + normalized_audio = self._normalize_audio(normalized_audio) + wavfile.write( + os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{idx1}.wav"), + self.sr, + normalized_audio.astype(np.float32), + ) + audio_16k = librosa.resample( + normalized_audio, + orig_sr=self.sr, + target_sr=SAMPLE_RATE_16K, + res_type=RES_TYPE, + ) + wavfile.write( + os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"), + SAMPLE_RATE_16K, + audio_16k.astype(np.float32), + ) + + def simple_cut( + self, + audio: np.ndarray, + sid: int, + idx0: int, + chunk_len: float, + overlap_len: float, + normalization_mode: str, + ): + chunk_length = int(self.sr * chunk_len) + overlap_length = int(self.sr * overlap_len) + i = 0 + while i < len(audio): + chunk = audio[i : i + chunk_length] + if normalization_mode == "post": + chunk = self._normalize_audio(chunk) + if len(chunk) == chunk_length: + # full SR for training + wavfile.write( + os.path.join( + self.gt_wavs_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), + self.sr, + chunk.astype(np.float32), + ) + # 16KHz for feature extraction + chunk_16k = librosa.resample( + chunk, 
orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type=RES_TYPE + ) + wavfile.write( + os.path.join( + self.wavs16k_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), + SAMPLE_RATE_16K, + chunk_16k.astype(np.float32), + ) + i += chunk_length - overlap_length + + def process_audio( + self, + path: str, + idx0: int, + sid: int, + cut_preprocess: str, + process_effects: bool, + noise_reduction: bool, + reduction_strength: float, + chunk_len: float, + overlap_len: float, + normalization_mode: str, + ): + audio_length = 0 + try: + audio = load_audio(path, self.sr) + audio_length = librosa.get_duration(y=audio, sr=self.sr) + + if process_effects: + audio = signal.lfilter(self.b_high, self.a_high, audio) + if normalization_mode == "pre": + audio = self._normalize_audio(audio) + if noise_reduction: + audio = nr.reduce_noise( + y=audio, sr=self.sr, prop_decrease=reduction_strength + ) + if cut_preprocess == "Skip": + # no cutting + self.process_audio_segment( + audio, + sid, + idx0, + 0, + normalization_mode, + ) + elif cut_preprocess == "Simple": + # simple + self.simple_cut( + audio, + sid, + idx0, + chunk_len, + overlap_len, + normalization_mode, + ) + elif cut_preprocess == "Automatic": + idx1 = 0 + # legacy + for audio_segment in self.slicer.slice(audio): + i = 0 + while True: + start = int(self.sr * (PERCENTAGE - OVERLAP) * i) + i += 1 + if ( + len(audio_segment[start:]) + > (PERCENTAGE + OVERLAP) * self.sr + ): + tmp_audio = audio_segment[ + start : start + int(PERCENTAGE * self.sr) + ] + self.process_audio_segment( + tmp_audio, + sid, + idx0, + idx1, + normalization_mode, + ) + idx1 += 1 + else: + tmp_audio = audio_segment[start:] + self.process_audio_segment( + tmp_audio, + sid, + idx0, + idx1, + normalization_mode, + ) + idx1 += 1 + break + + except Exception as error: + print(f"Error processing audio: {error}") + return audio_length + + +def format_duration(seconds): + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + 
def save_dataset_duration(file_path, dataset_duration):
    """
    Store the total dataset duration in a JSON info file.

    Existing keys in the file are kept; ``total_dataset_duration`` (formatted
    with ``format_duration``) and ``total_seconds`` are added or overwritten.

    Args:
        file_path (str): Path of the JSON file to update or create.
        dataset_duration (float): Total duration of the dataset in seconds.
    """
    try:
        with open(file_path, "r") as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # A missing file and an unreadable one (e.g. truncated by an
        # interrupted run) are treated alike: start from an empty record
        # rather than losing the result of the whole preprocessing pass.
        data = {}
    if not isinstance(data, dict):
        # Valid JSON that is not an object cannot be merged into.
        data = {}

    formatted_duration = format_duration(dataset_duration)
    new_data = {
        "total_dataset_duration": formatted_duration,
        "total_seconds": dataset_duration,
    }
    data.update(new_data)

    with open(file_path, "w") as f:
        json.dump(data, f, indent=4)
class Slicer:
    """
    A class for slicing audio waveforms into segments based on silence detection.

    Attributes:
        threshold (float): Linear RMS amplitude below which a frame counts as silent.
        hop_size (int): Hop between RMS frames, in samples.
        win_size (int): RMS analysis window, in samples.
        min_length (int): Minimum length of a segment, in frames.
        min_interval (int): Minimum length of a silence that may be cut, in frames.
        max_sil_kept (int): Maximum silence kept around a segment, in frames.

    Methods:
        slice(waveform): Slices the given waveform into segments.
    """

    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 5000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 5000,
    ):
        """
        Initializes a Slicer object.

        Args:
            sr (int): Sampling rate of the audio waveform.
            threshold (float, optional): RMS threshold for silence detection, in dB. Defaults to -40.0.
            min_length (int, optional): Minimum length of a segment, in milliseconds. Defaults to 5000.
            min_interval (int, optional): Minimum interval between segments, in milliseconds. Defaults to 300.
            hop_size (int, optional): Hop size for RMS calculation, in milliseconds. Defaults to 20.
            max_sil_kept (int, optional): Maximum length of silence to keep at the beginning or end of a segment, in milliseconds. Defaults to 5000.

        Raises:
            ValueError: If the input parameters are not valid.
        """
        if not min_length >= min_interval >= hop_size:
            raise ValueError("min_length >= min_interval >= hop_size is required")
        if not max_sil_kept >= hop_size:
            raise ValueError("max_sil_kept >= hop_size is required")

        # Convert time-based parameters to sample-based parameters
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        # From here on, lengths are expressed in frames (multiples of hop_size).
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """
        Applies a slice to the waveform.

        Args:
            waveform (numpy.ndarray): The waveform to slice.
            begin (int): Start frame index.
            end (int): End frame index.

        Returns:
            numpy.ndarray: The samples between the two frame indices, clamped to
            the waveform length. For multi-dimensional input the last-but-one
            axis is kept whole and the slice is taken along axis 1.
        """
        start_idx = begin * self.hop_size
        if len(waveform.shape) > 1:
            end_idx = min(waveform.shape[1], end * self.hop_size)
            return waveform[:, start_idx:end_idx]
        else:
            end_idx = min(waveform.shape[0], end * self.hop_size)
            return waveform[start_idx:end_idx]

    def slice(self, waveform):
        """
        Slices the given waveform into segments.

        Args:
            waveform (numpy.ndarray): The waveform to slice.

        Returns:
            list: The segments between the detected silences. The waveform is
            returned unsliced, as a one-element list, when it is no longer
            than the minimum segment length or when no cut point is found.
        """
        # Calculate RMS for each frame
        samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform

        # min_length is stored in frames, so convert the sample count to a
        # (rounded-up) frame count before comparing the two.
        frame_count = (samples.shape[0] + self.hop_size - 1) // self.hop_size
        if frame_count <= self.min_length:
            return [waveform]

        rms_list = get_rms(
            y=samples, frame_length=self.win_size, hop_length=self.hop_size
        ).squeeze(0)

        # Detect silence segments and mark them
        sil_tags = []
        silence_start, clip_start = None, 0
        for i, rms in enumerate(rms_list):
            # If current frame is silent
            if rms < self.threshold:
                if silence_start is None:
                    silence_start = i
                continue

            # If current frame is not silent
            if silence_start is None:
                continue

            # Check if current silence segment is leading silence or need to slice
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = (
                i - silence_start >= self.min_interval
                and i - clip_start >= self.min_length
            )

            # If not leading silence and not need to slice middle
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue

            # Handle different cases of silence segments
            if i - silence_start <= self.max_sil_kept:
                # Short silence: cut once, at the quietest frame
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                # Medium silence
                pos = rms_list[
                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
                ].argmin()
                pos += i - self.max_sil_kept
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                # Long silence: keep at most max_sil_kept frames on each side
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None

        # Handle trailing silence
        total_frames = rms_list.shape[0]
        if (
            silence_start is not None
            and total_frames - silence_start >= self.min_interval
        ):
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))

        # Extract segments based on silence tags
        if not sil_tags:
            return [waveform]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))

            for i in range(len(sil_tags) - 1):
                chunks.append(
                    self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
                )

            if sil_tags[-1][1] < total_frames:
                chunks.append(
                    self._apply_slice(waveform, sil_tags[-1][1], total_frames)
                )

            return chunks
def change_info(path, info, name):
    """
    Rewrite the "info" entry of a checkpoint and save the result under ``logs``.

    Args:
        path: Checkpoint file to read.
        info: New value for the checkpoint's "info" entry.
        name: Output name; when empty, the input file name without its
            extension is used. The copy is written to ``logs/<name>/<name>.pth``.

    Returns:
        str: "Success." when the copy was written, otherwise "Error: <reason>".
    """
    try:
        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
        checkpoint["info"] = info

        # No explicit name given: reuse the stem of the source file.
        name = name or os.path.splitext(os.path.basename(path))[0]

        output_dir = os.path.join("logs", name)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{name}.pth")
        torch.save(checkpoint, output_path)
    except Exception as error:
        print(f"An error occurred while changing the info: {error}")
        return f"Error: {error}"

    return "Success."
def extract_model(
    ckpt,
    sr,
    name,
    model_path,
    epoch,
    step,
    hps,
    overtrain_info,
    vocoder,
    pitch_guidance=True,
    version="v2",
):
    """
    Save an inference-ready copy of a generator checkpoint with its metadata.

    Weights whose key contains "enc_q" are dropped and the rest are converted
    to half precision. Keys are renamed on save so that
    ``.parametrizations.weight.original1`` becomes ``.weight_v`` and
    ``.parametrizations.weight.original0`` becomes ``.weight_g``.

    Args:
        ckpt: Mapping of parameter names to tensors.
        sr: Sample rate stored under "sr".
        name: Model name stored under "model_name" and used in the model hash.
        model_path: Destination file; its directory is created if needed and is
            also where ``model_info.json`` is looked up.
        epoch: Epoch number stored with the model.
        step: Step number stored with the model.
        hps: Hyper-parameters; the ``data`` and ``model`` attributes read below
            are stored as the "config" list.
        overtrain_info: Stored under "overtrain_info".
        vocoder: Stored under "vocoder".
        pitch_guidance: Stored under "f0". Defaults to True.
        version: Stored under "version". Defaults to "v2".

    Errors are reported with ``print`` and not raised.
    """
    try:
        model_dir = os.path.dirname(model_path)
        os.makedirs(model_dir, exist_ok=True)

        # Defaults used when model_info.json is absent. Without them the
        # assignments further down hit unbound names and the save is lost.
        dataset_length = None
        embedder_model = None
        speakers_id = 1

        model_info_file = os.path.join(model_dir, "model_info.json")
        if os.path.exists(model_info_file):
            with open(model_info_file, "r") as f:
                data = json.load(f)
            dataset_length = data.get("total_dataset_duration", None)
            embedder_model = data.get("embedder_model", None)
            speakers_id = data.get("speakers_id", 1)

        with open(os.path.join(now_dir, "assets", "config.json"), "r") as f:
            data = json.load(f)
        model_author = data.get("model_author", None)

        opt = OrderedDict(
            weight={
                key: value.half() for key, value in ckpt.items() if "enc_q" not in key
            }
        )
        opt["config"] = [
            hps.data.filter_length // 2 + 1,
            32,
            hps.model.inter_channels,
            hps.model.hidden_channels,
            hps.model.filter_channels,
            hps.model.n_heads,
            hps.model.n_layers,
            hps.model.kernel_size,
            hps.model.p_dropout,
            hps.model.resblock,
            hps.model.resblock_kernel_sizes,
            hps.model.resblock_dilation_sizes,
            hps.model.upsample_rates,
            hps.model.upsample_initial_channel,
            hps.model.upsample_kernel_sizes,
            hps.model.spk_embed_dim,
            hps.model.gin_channels,
            hps.data.sample_rate,
        ]

        opt["epoch"] = epoch
        opt["step"] = step
        opt["sr"] = sr
        opt["f0"] = pitch_guidance
        opt["version"] = version
        opt["creation_date"] = datetime.datetime.now().isoformat()

        hash_input = f"{name}-{epoch}-{step}-{sr}-{version}-{opt['config']}"
        opt["model_hash"] = hashlib.sha256(hash_input.encode()).hexdigest()
        opt["overtrain_info"] = overtrain_info
        opt["dataset_length"] = dataset_length
        opt["model_name"] = name
        opt["author"] = model_author
        opt["embedder_model"] = embedder_model
        opt["speakers_id"] = speakers_id
        opt["vocoder"] = vocoder

        torch.save(
            replace_keys_in_dict(
                replace_keys_in_dict(
                    opt, ".parametrizations.weight.original1", ".weight_v"
                ),
                ".parametrizations.weight.original0",
                ".weight_g",
            ),
            model_path,
        )

        print(f"Saved model '{model_path}' (epoch {epoch} and step {step})")

    except Exception as error:
        print(f"An error occurred extracting the model: {error}")
opt["weight"] = {} + for key in a.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = a[key] + return opt + + +def model_blender(name, path1, path2, ratio): + try: + message = f"Model {path1} and {path2} are merged with alpha {ratio}." + ckpt1 = torch.load(path1, map_location="cpu", weights_only=True) + ckpt2 = torch.load(path2, map_location="cpu", weights_only=True) + + sr1 = str(ckpt1["sr"]).lower().replace("k", "000") + sr2 = str(ckpt2["sr"]).lower().replace("k", "000") + + if sr1 != sr2: + print( + f"Sample rate of {path1} {sr1} does not match the sample rate of {path2} {sr2}." + ) + return "The sample rates of the two models are not the same." + + cfg = ckpt1["config"] + cfg_f0 = ckpt1["f0"] + cfg_version = ckpt1["version"] + cfg_sr = sr1 + vocoder = ckpt1.get("vocoder", "HiFi-GAN") + + if "model" in ckpt1: + ckpt1 = extract(ckpt1) + else: + ckpt1 = ckpt1["weight"] + if "model" in ckpt2: + ckpt2 = extract(ckpt2) + else: + ckpt2 = ckpt2["weight"] + + if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): + return "Fail to merge the models. The model architectures are not the same." 
def model_information(path):
    """
    Load a model file and return a human-readable summary of its metadata.

    Args:
        path: Path of the model file, loaded on the CPU with ``weights_only=True``.

    Returns:
        str: One "Label: value" line per metadata field, each ending with a
        newline. Fields missing from the file are shown as "None", except the
        speaker count, which falls back to 0.
    """
    model_data = torch.load(path, map_location="cpu", weights_only=True)

    print(f"Loaded model from {path}")

    model_name = model_data.get("model_name", "None")
    epochs = model_data.get("epoch", "None")
    steps = model_data.get("step", "None")
    sr = model_data.get("sr", "None")
    f0 = model_data.get("f0", "None")
    dataset_length = model_data.get("dataset_length", "None")
    vocoder = model_data.get("vocoder", "None")
    creation_date = model_data.get("creation_date", "None")
    model_hash = model_data.get("model_hash", None)
    overtrain_info = model_data.get("overtrain_info", "None")
    model_author = model_data.get("author", "None")
    embedder_model = model_data.get("embedder_model", "None")
    speakers_id = model_data.get("speakers_id", 0)

    creation_date_str = prettify_date(creation_date) if creation_date else "None"

    return (
        f"Model Name: {model_name}\n"
        f"Model Creator: {model_author}\n"
        f"Epochs: {epochs}\n"
        f"Steps: {steps}\n"
        f"Vocoder: {vocoder}\n"
        f"Sampling Rate: {sr}\n"
        f"Dataset Length: {dataset_length}\n"
        f"Creation Date: {creation_date_str}\n"
        f"Overtrain Info: {overtrain_info}\n"
        f"Embedder Model: {embedder_model}\n"
        # The newline keeps this line from running into the hash line.
        f"Max Speakers ID: {speakers_id}\n"
        f"Hash: {model_hash}\n"
    )
commons +from rvc.train.process.extract_model import extract_model + +# Parse command line arguments +model_name = sys.argv[1] +save_every_epoch = int(sys.argv[2]) +total_epoch = int(sys.argv[3]) +pretrainG = sys.argv[4] +pretrainD = sys.argv[5] +gpus = sys.argv[6] +batch_size = int(sys.argv[7]) +sample_rate = int(sys.argv[8]) +save_only_latest = strtobool(sys.argv[9]) +save_every_weights = strtobool(sys.argv[10]) +cache_data_in_gpu = strtobool(sys.argv[11]) +overtraining_detector = strtobool(sys.argv[12]) +overtraining_threshold = int(sys.argv[13]) +cleanup = strtobool(sys.argv[14]) +vocoder = sys.argv[15] +checkpointing = strtobool(sys.argv[16]) +# experimental settings +randomized = True +d_lr_coeff = 1.0 +g_lr_coeff = 1.0 +d_step_per_g_step = 1 +multiscale_mel_loss = False +bf16_adamw = False + +current_dir = os.getcwd() + +try: + with open(os.path.join(current_dir, "assets", "config.json"), "r") as f: + config = json.load(f) + precision = config["precision"] + if ( + precision == "bf16" + and torch.cuda.is_available() + and torch.cuda.is_bf16_supported() + ): + train_dtype = torch.bfloat16 + elif precision == "fp16" and torch.cuda.is_available(): + train_dtype = torch.float16 + else: + train_dtype = torch.float32 +except (FileNotFoundError, json.JSONDecodeError, KeyError): + train_dtype = torch.float32 + +experiment_dir = os.path.join(current_dir, "logs", model_name) +config_save_path = os.path.join(experiment_dir, "config.json") +dataset_path = os.path.join(experiment_dir, "sliced_audios") +model_info_path = os.path.join(experiment_dir, "model_info.json") + +try: + with open(config_save_path, "r") as f: + config = json.load(f) + config = HParams(**config) +except FileNotFoundError: + print( + f"Config file not found at {config_save_path}. Did you run preprocessing and feature extraction steps?" 
class EpochRecorder:
    """
    Stopwatch that reports how long each epoch took.
    """

    def __init__(self):
        # Reference point for the first call to record().
        self.last_time = ttime()

    def record(self):
        """
        Restart the stopwatch and describe the interval that just ended.

        Returns:
            str: ``time=<HH:MM:SS now> | training_speed=<elapsed as H:MM:SS>``.
        """
        previous, self.last_time = self.last_time, ttime()
        # Round to a tenth of a second first, then drop the fraction.
        whole_seconds = int(round(self.last_time - previous, 1))
        elapsed_time_str = str(datetime.timedelta(seconds=whole_seconds))
        current_time = datetime.datetime.now().strftime("%H:%M:%S")
        return f"time={current_time} | training_speed={elapsed_time_str}"
+ """ + global training_file_path, last_loss_gen_all, smoothed_loss_gen_history, loss_gen_history, loss_disc_history, smoothed_loss_disc_history, overtrain_save_epoch, gpus + + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) + # Check sample rate + wavs = glob.glob( + os.path.join(os.path.join(experiment_dir, "sliced_audios"), "*.wav") + ) + if wavs: + _, sr = load_wav_to_torch(wavs[0]) + if sr != config.data.sample_rate: + print( + f"Error: Pretrained model sample rate ({config.data.sample_rate} Hz) does not match dataset audio sample rate ({sr} Hz)." + ) + os._exit(1) + else: + print("No wav file found.") + + if torch.cuda.is_available(): + device = torch.device("cuda") + gpus = [int(item) for item in gpus.split("-")] + n_gpus = len(gpus) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + gpus = [0] + n_gpus = 1 + else: + device = torch.device("cpu") + gpus = [0] + n_gpus = 1 + print("Training with CPU, this will take a long time.") + + def start(): + """ + Starts the training process with multi-GPU support or CPU. + """ + children = [] + pid_data = {"process_pids": []} + with open(config_save_path, "r") as pid_file: + try: + existing_data = json.load(pid_file) + pid_data.update(existing_data) + except json.JSONDecodeError: + pass + with open(config_save_path, "w") as pid_file: + for rank, device_id in enumerate(gpus): + subproc = mp.Process( + target=run, + args=( + rank, + n_gpus, + experiment_dir, + pretrainG, + pretrainD, + total_epoch, + save_every_weights, + config, + device, + device_id, + ), + ) + children.append(subproc) + subproc.start() + pid_data["process_pids"].append(subproc.pid) + json.dump(pid_data, pid_file, indent=4) + + for i in range(n_gpus): + children[i].join() + + def load_from_json(file_path): + """ + Load data from a JSON file. + + Args: + file_path (str): The path to the JSON file. 
+ """ + if os.path.exists(file_path): + with open(file_path, "r") as f: + data = json.load(f) + return ( + data.get("loss_disc_history", []), + data.get("smoothed_loss_disc_history", []), + data.get("loss_gen_history", []), + data.get("smoothed_loss_gen_history", []), + ) + return [], [], [], [] + + def continue_overtrain_detector(training_file_path): + """ + Continues the overtrain detector by loading the training history from a JSON file. + + Args: + training_file_path (str): The file path of the JSON file containing the training history. + """ + if overtraining_detector: + if os.path.exists(training_file_path): + ( + loss_disc_history, + smoothed_loss_disc_history, + loss_gen_history, + smoothed_loss_gen_history, + ) = load_from_json(training_file_path) + + if cleanup: + print("Removing files from the prior training attempt...") + + # Clean up unnecessary files + for root, dirs, files in os.walk( + os.path.join(now_dir, "logs", model_name), topdown=False + ): + for name in files: + file_path = os.path.join(root, name) + file_name, file_extension = os.path.splitext(name) + if ( + file_extension == ".0" + or (file_name.startswith("D_") and file_extension == ".pth") + or (file_name.startswith("G_") and file_extension == ".pth") + or (file_name.startswith("added") and file_extension == ".index") + ): + os.remove(file_path) + for name in dirs: + if name == "eval": + folder_path = os.path.join(root, name) + for item in os.listdir(folder_path): + item_path = os.path.join(folder_path, item) + if os.path.isfile(item_path): + os.remove(item_path) + os.rmdir(folder_path) + + print("Cleanup done!") + + continue_overtrain_detector(training_file_path) + start() + + +def run( + rank, + n_gpus, + experiment_dir, + pretrainG, + pretrainD, + custom_total_epoch, + custom_save_every_weights, + config, + device, + device_id, +): + """ + Runs the training loop on a specific GPU or CPU. + + Args: + rank (int): The rank of the current process within the distributed training setup. 
+ n_gpus (int): The total number of GPUs available for training. + experiment_dir (str): The directory where experiment logs and checkpoints will be saved. + pretrainG (str): Path to the pre-trained generator model. + pretrainD (str): Path to the pre-trained discriminator model. + custom_total_epoch (int): The total number of epochs for training. + custom_save_every_weights (int): The interval (in epochs) at which to save model weights. + config (object): Configuration object containing training parameters. + device (torch.device): The device to use for training (CPU or GPU). + """ + global global_step, smoothed_value_gen, smoothed_value_disc + + smoothed_value_gen = 0 + smoothed_value_disc = 0 + + if rank == 0: + writer_eval = SummaryWriter(log_dir=os.path.join(experiment_dir, "eval")) + else: + writer_eval = None + + dist.init_process_group( + backend="gloo" if sys.platform == "win32" or device.type != "cuda" else "nccl", + init_method="env://", + world_size=n_gpus if device.type == "cuda" else 1, + rank=rank if device.type == "cuda" else 0, + ) + + torch.manual_seed(config.train.seed) + + if torch.cuda.is_available(): + torch.cuda.set_device(device_id) + + # Create datasets and dataloaders + from data_utils import ( + DistributedBucketSampler, + TextAudioCollateMultiNSFsid, + TextAudioLoaderMultiNSFsid, + ) + + train_dataset = TextAudioLoaderMultiNSFsid(config.data) + collate_fn = TextAudioCollateMultiNSFsid() + train_sampler = DistributedBucketSampler( + train_dataset, + batch_size * n_gpus, + [50, 100, 200, 300, 400, 500, 600, 700, 800, 900], + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=8, + ) + + # Validations + if len(train_loader) < 3: + print( + "Not enough data present in the training set. 
Perhaps you forgot to slice the audio files in preprocess?" + ) + os._exit(2333333) + + # defaults + embedder_name = "contentvec" + spk_dim = config.model.spk_embed_dim # 109 default speakers + + try: + with open(model_info_path, "r") as f: + model_info = json.load(f) + embedder_name = model_info["embedder_model"] + spk_dim = model_info["speakers_id"] + except Exception as e: + print(f"Could not load model info file: {e}. Using defaults.") + + # Try to load speaker dim from latest checkpoint or pretrainG + try: + last_g = latest_checkpoint_path(experiment_dir, "G_*.pth") + chk_path = ( + last_g if last_g else (pretrainG if pretrainG not in ("", "None") else None) + ) + + if chk_path: + ckpt = torch.load(chk_path, map_location="cpu", weights_only=True) + spk_dim = ckpt["model"]["emb_g.weight"].shape[0] + del ckpt + except Exception as e: + print(f"Failed to load checkpoint: {e}. Using default number of speakers.") + + # update config before the model init + print(f"Initializing the generator with {spk_dim} speakers.") + config.model.spk_embed_dim = spk_dim + + # Initialize models and optimizers + from rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator + from rvc.lib.algorithm.synthesizers import Synthesizer + + net_g = Synthesizer( + config.data.filter_length // 2 + 1, + config.train.segment_size // config.data.hop_length, + **config.model, + use_f0=True, + sr=config.data.sample_rate, + vocoder=vocoder, + checkpointing=checkpointing, + randomized=randomized, + ) + + net_d = MultiPeriodDiscriminator( + config.model.use_spectral_norm, checkpointing=checkpointing + ) + + if torch.cuda.is_available(): + net_g = net_g.cuda(device_id) + net_d = net_d.cuda(device_id) + else: + net_g = net_g.to(device) + net_d = net_d.to(device) + + if bf16_adamw == True and train_dtype == torch.bfloat16: + print("Using BFload16 AdamW optimizer") + from rvc.train.anyprecision_optimizer import AnyPrecisionAdamW + + optimizer = AnyPrecisionAdamW + else: + print("Using AdamW 
optimizer") + optimizer = torch.optim.AdamW + + optim_g = optimizer( + net_g.parameters(), + config.train.learning_rate * g_lr_coeff, + betas=config.train.betas, + eps=config.train.eps, + ) + optim_d = optimizer( + net_d.parameters(), + config.train.learning_rate * d_lr_coeff, + betas=config.train.betas, + eps=config.train.eps, + ) + if multiscale_mel_loss: + fn_mel_loss = MultiScaleMelSpectrogramLoss(sample_rate=config.data.sample_rate) + print("Using Multi-Scale Mel loss function") + else: + fn_mel_loss = torch.nn.L1Loss() + print("Using Single-Scale Mel loss function") + + # Wrap models with DDP for multi-gpu processing + if n_gpus > 1 and device.type == "cuda": + net_g = DDP(net_g, device_ids=[device_id]) + net_d = DDP(net_d, device_ids=[device_id]) + + if rank == 0 and train_dtype == torch.bfloat16: + print("Using BFloat16 for training.") + elif rank == 0 and train_dtype == torch.float16: + print("Using Float16 for training.") + + # Load checkpoint if available + scaler_dict = {} + try: + print("Starting training...") + _, _, _, epoch_str, scaler_dict = load_checkpoint( + latest_checkpoint_path(experiment_dir, "D_*.pth"), net_d, optim_d + ) + _, _, _, epoch_str, _ = load_checkpoint( + latest_checkpoint_path(experiment_dir, "G_*.pth"), net_g, optim_g + ) + epoch_str += 1 + global_step = (epoch_str - 1) * len(train_loader) + + except Exception as e: + epoch_str = 1 + global_step = 0 + + if pretrainG not in ("", "None"): + if rank == 0: + print(f"Loaded pretrained (G) '{pretrainG}'") + try: + ckpt = torch.load(pretrainG, map_location="cpu", weights_only=True)[ + "model" + ] + if hasattr(net_g, "module"): + net_g.module.load_state_dict(ckpt) + else: + net_g.load_state_dict(ckpt) + del ckpt + except Exception as e: + print( + "The parameters of the pretrain model such as the sample rate or architecture do not match the selected model." 
+ ) + print(e) + sys.exit(1) + + if pretrainD not in ("", "None"): + if rank == 0: + print(f"Loaded pretrained (D) '{pretrainD}'") + try: + ckpt = torch.load(pretrainD, map_location="cpu", weights_only=True)[ + "model" + ] + if hasattr(net_d, "module"): + net_d.module.load_state_dict(ckpt) + else: + net_d.load_state_dict(ckpt) + del ckpt + except Exception as e: + print( + "The parameters of the pretrain model such as the sample rate or architecture do not match the selected model." + ) + print(e) + sys.exit(1) + + # Initialize schedulers + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=config.train.lr_decay, last_epoch=epoch_str - 2 + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=config.train.lr_decay, last_epoch=epoch_str - 2 + ) + + use_scaler = device.type == "cuda" and train_dtype == torch.float16 + scaler = torch.amp.GradScaler(enabled=use_scaler) + if len(scaler_dict) > 0: + scaler.load_state_dict(scaler_dict) + + cache = [] + # collect the reference audio for tensorboard evaluation + if os.path.isfile(os.path.join("logs", "reference", embedder_name, "feats.npy")): + print("Using", embedder_name, "reference set for validation") + phone = np.load(os.path.join("logs", "reference", embedder_name, "feats.npy")) + # expanding x2 to match pitch size + phone = np.repeat(phone, 2, axis=0) + phone_lengths = torch.LongTensor([phone.shape[0]]).to(device) + phone = torch.FloatTensor(phone).unsqueeze(0).to(device) + pitch = np.load(os.path.join("logs", "reference", "pitch_coarse.npy")) + # removed last frame to match features + pitch = torch.LongTensor(pitch[:-1]).unsqueeze(0).to(device) + pitchf = np.load(os.path.join("logs", "reference", "pitch_fine.npy")) + # removed last frame to match features + pitchf = torch.FloatTensor(pitchf[:-1]).unsqueeze(0).to(device) + sid = torch.LongTensor([0]).to(device) + reference = ( + phone, + phone_lengths, + pitch, + pitchf, + sid, + ) + else: + print("No custom reference found, 
using a default audio sample for validation") + info = next(iter(train_loader)) + phone, phone_lengths, pitch, pitchf, _, _, _, _, sid = info + reference = ( + phone.to(device), + phone_lengths.to(device), + pitch.to(device), + pitchf.to(device), + sid.to(device), + ) + + for epoch in range(epoch_str, total_epoch + 1): + train_and_evaluate( + rank, + epoch, + config, + [net_g, net_d], + [optim_g, optim_d], + [train_loader, None], + [writer_eval], + cache, + custom_save_every_weights, + custom_total_epoch, + device, + device_id, + reference, + fn_mel_loss, + scaler, + ) + + scheduler_g.step() + scheduler_d.step() + + +def train_and_evaluate( + rank, + epoch, + hps, + nets, + optims, + loaders, + writers, + cache, + custom_save_every_weights, + custom_total_epoch, + device, + device_id, + reference, + fn_mel_loss, + scaler, +): + """ + Trains and evaluates the model for one epoch. + + Args: + rank (int): Rank of the current process. + epoch (int): Current epoch number. + hps (Namespace): Hyperparameters. + nets (list): List of models [net_g, net_d]. + optims (list): List of optimizers [optim_g, optim_d]. + loaders (list): List of dataloaders [train_loader, eval_loader]. + writers (list): List of TensorBoard writers [writer_eval]. + cache (list): List to cache data in GPU memory. + use_cpu (bool): Whether to use CPU for training. 
+ """ + global global_step, lowest_value, loss_disc, consecutive_increases_gen, consecutive_increases_disc, smoothed_value_gen, smoothed_value_disc + + if epoch == 1: + lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} + consecutive_increases_gen = 0 + consecutive_increases_disc = 0 + + net_g, net_d = nets + optim_g, optim_d = optims + train_loader = loaders[0] if loaders is not None else None + if writers is not None: + writer = writers[0] + + train_loader.batch_sampler.set_epoch(epoch) + + net_g.train() + net_d.train() + + use_amp = device.type == "cuda" and ( + train_dtype == torch.bfloat16 or train_dtype == torch.float16 + ) + + # Data caching + if device.type == "cuda" and cache_data_in_gpu: + data_iterator = cache + if cache == []: + for batch_idx, info in enumerate(train_loader): + # phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid + info = [tensor.cuda(device_id, non_blocking=True) for tensor in info] + cache.append((batch_idx, info)) + else: + shuffle(cache) + else: + data_iterator = enumerate(train_loader) + + epoch_recorder = EpochRecorder() + with tqdm(total=len(train_loader), leave=False) as pbar: + for batch_idx, info in data_iterator: + if device.type == "cuda" and not cache_data_in_gpu: + info = [tensor.cuda(device_id, non_blocking=True) for tensor in info] + elif device.type != "cuda": + info = [tensor.to(device) for tensor in info] + # else iterator is going thru a cached list with a device already assigned + + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + + with torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ): + # Forward pass + model_output = net_g( + phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid + ) + y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = ( + model_output + ) + # slice of the original waveform to match a generate slice + if randomized: + wave = 
commons.slice_segments( + wave, + ids_slice * config.data.hop_length, + config.train.segment_size, + dim=3, + ) + for _ in range(d_step_per_g_step): # default x1 + with torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ): + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) + loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) + # Discriminator backward and update + optim_d.zero_grad() + if train_dtype == torch.float16: + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.grad_norm(net_d.parameters()) + scaler.step(optim_d) + else: + loss_disc.backward() + grad_norm_d = commons.grad_norm(net_d.parameters()) + optim_d.step() + + with torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ): + # Generator backward and update + _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) + + if multiscale_mel_loss: + loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 + else: + wave_mel = mel_spectrogram_torch( + wave.float().squeeze(1), + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.hop_length, + config.data.win_length, + config.data.mel_fmin, + config.data.mel_fmax, + ) + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.hop_length, + config.data.win_length, + config.data.mel_fmin, + config.data.mel_fmax, + ) + loss_mel = fn_mel_loss(wave_mel, y_hat_mel) * config.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, _ = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + + if loss_gen_all < lowest_value["value"]: + lowest_value = { + "step": global_step, + "value": loss_gen_all, + "epoch": epoch, + } + optim_g.zero_grad() + if train_dtype == torch.float16: + 
scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.grad_norm(net_g.parameters()) + scaler.step(optim_g) + scaler.update() + else: + loss_gen_all.backward() + grad_norm_g = commons.grad_norm(net_g.parameters()) + optim_g.step() + + global_step += 1 + + # queue for rolling losses over 50 steps + avg_losses["grad_d_50"].append(grad_norm_d) + avg_losses["grad_g_50"].append(grad_norm_g) + avg_losses["disc_loss_50"].append(loss_disc.detach()) + avg_losses["adv_loss_50"].append(loss_gen.detach()) + avg_losses["fm_loss_50"].append(loss_fm.detach()) + avg_losses["kl_loss_50"].append(loss_kl.detach()) + avg_losses["mel_loss_50"].append(loss_mel.detach()) + avg_losses["gen_loss_50"].append(loss_gen_all.detach()) + + if rank == 0 and global_step % 50 == 0: + # logging rolling averages + scalar_dict = { + "grad_avg_50/norm_d": sum(avg_losses["grad_d_50"]) + / len(avg_losses["grad_d_50"]), + "grad_avg_50/norm_g": sum(avg_losses["grad_g_50"]) + / len(avg_losses["grad_g_50"]), + "loss_avg_50/d/adv": torch.mean( + torch.stack(list(avg_losses["disc_loss_50"])) + ), + "loss_avg_50/g/adv": torch.mean( + torch.stack(list(avg_losses["adv_loss_50"])) + ), + "loss_avg_50/g/fm": torch.mean( + torch.stack(list(avg_losses["fm_loss_50"])) + ), + "loss_avg_50/g/kl": torch.mean( + torch.stack(list(avg_losses["kl_loss_50"])) + ), + "loss_avg_50/g/mel": torch.mean( + torch.stack(list(avg_losses["mel_loss_50"])) + ), + "loss_avg_50/g/total": torch.mean( + torch.stack(list(avg_losses["gen_loss_50"])) + ), + } + summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) + + pbar.update(1) + # end of batch train + # end of tqdm + with torch.no_grad(): + torch.cuda.empty_cache() + + # Logging and checkpointing + if rank == 0: + # used for tensorboard chart - all/mel + mel = spec_to_mel_torch( + spec, + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.mel_fmin, + config.data.mel_fmax, + ) + # 
used for tensorboard chart - slice/mel_org + if randomized: + y_mel = commons.slice_segments( + mel, + ids_slice, + config.train.segment_size // config.data.hop_length, + dim=3, + ) + else: + y_mel = mel + # used for tensorboard chart - slice/mel_gen + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.hop_length, + config.data.win_length, + config.data.mel_fmin, + config.data.mel_fmax, + ) + + lr = optim_g.param_groups[0]["lr"] + + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/adv": loss_disc, + "learning_rate": lr, + "grad/norm_d": grad_norm_d, + "grad/norm_g": grad_norm_g, + "loss/g/adv": loss_gen, + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } + + image_dict = { + "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), + "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), + "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), + } + + if epoch % save_every_epoch == 0: + with torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ): + with torch.no_grad(): + if hasattr(net_g, "module"): + o, *_ = net_g.module.infer(*reference) + else: + o, *_ = net_g.infer(*reference) + audio_dict = {f"gen/audio_{global_step:07d}": o[0, :, :]} + summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + audios=audio_dict, + audio_sample_rate=config.data.sample_rate, + ) + else: + summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + + # Save checkpoint + model_add = [] + model_del = [] + done = False + + if rank == 0: + overtrain_info = "" + # Check overtraining + if overtraining_detector and rank == 0 and epoch > 1: + # Add the current loss to the history + current_loss_disc = float(loss_disc) + loss_disc_history.append(current_loss_disc) + # Update smoothed 
loss history with loss_disc + smoothed_value_disc = update_exponential_moving_average( + smoothed_loss_disc_history, current_loss_disc + ) + # Check overtraining with smoothed loss_disc + is_overtraining_disc = check_overtraining( + smoothed_loss_disc_history, overtraining_threshold * 2 + ) + if is_overtraining_disc: + consecutive_increases_disc += 1 + else: + consecutive_increases_disc = 0 + # Add the current loss_gen to the history + current_loss_gen = float(lowest_value["value"]) + loss_gen_history.append(current_loss_gen) + # Update the smoothed loss_gen history + smoothed_value_gen = update_exponential_moving_average( + smoothed_loss_gen_history, current_loss_gen + ) + # Check for overtraining with the smoothed loss_gen + is_overtraining_gen = check_overtraining( + smoothed_loss_gen_history, overtraining_threshold, 0.01 + ) + if is_overtraining_gen: + consecutive_increases_gen += 1 + else: + consecutive_increases_gen = 0 + overtrain_info = f"Smoothed loss_g {smoothed_value_gen:.3f} and loss_d {smoothed_value_disc:.3f}" + # Save the data in the JSON file if the epoch is divisible by save_every_epoch + if epoch % save_every_epoch == 0: + save_to_json( + training_file_path, + loss_disc_history, + smoothed_loss_disc_history, + loss_gen_history, + smoothed_loss_gen_history, + ) + + if ( + is_overtraining_gen + and consecutive_increases_gen == overtraining_threshold + or is_overtraining_disc + and consecutive_increases_disc == overtraining_threshold * 2 + ): + print( + f"Overtraining detected at epoch {epoch} with smoothed loss_g {smoothed_value_gen:.3f} and loss_d {smoothed_value_disc:.3f}" + ) + done = True + else: + print( + f"New best epoch {epoch} with smoothed loss_g {smoothed_value_gen:.3f} and loss_d {smoothed_value_disc:.3f}" + ) + old_model_files = glob.glob( + os.path.join(experiment_dir, f"{model_name}_*e_*s_best_epoch.pth") + ) + for file in old_model_files: + model_del.append(file) + model_add.append( + os.path.join( + experiment_dir, + 
f"{model_name}_{epoch}e_{global_step}s_best_epoch.pth", + ) + ) + + # Print training progress + lowest_value_rounded = float(lowest_value["value"]) + lowest_value_rounded = round(lowest_value_rounded, 3) + + record = f"{model_name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()}" + if epoch > 1: + record = ( + record + + f" | lowest_value={lowest_value_rounded} (epoch {lowest_value['epoch']} and step {lowest_value['step']})" + ) + + if overtraining_detector: + remaining_epochs_gen = overtraining_threshold - consecutive_increases_gen + remaining_epochs_disc = ( + overtraining_threshold * 2 - consecutive_increases_disc + ) + record = ( + record + + f" | Number of epochs remaining for overtraining: g/total: {remaining_epochs_gen} d/total: {remaining_epochs_disc} | smoothed_loss_gen={smoothed_value_gen:.3f} | smoothed_loss_disc={smoothed_value_disc:.3f}" + ) + print(record) + + # Save weights every N epochs + if epoch % save_every_epoch == 0: + checkpoint_suffix = f"{2333333 if save_only_latest else global_step}.pth" + save_checkpoint( + net_g, + optim_g, + config.train.learning_rate, + epoch, + os.path.join(experiment_dir, "G_" + checkpoint_suffix), + scaler, + ) + save_checkpoint( + net_d, + optim_d, + config.train.learning_rate, + epoch, + os.path.join(experiment_dir, "D_" + checkpoint_suffix), + scaler, + ) + if custom_save_every_weights: + model_add.append( + os.path.join( + experiment_dir, f"{model_name}_{epoch}e_{global_step}s.pth" + ) + ) + + # Check completion + if epoch >= custom_total_epoch: + lowest_value_rounded = float(lowest_value["value"]) + lowest_value_rounded = round(lowest_value_rounded, 3) + print( + f"Training has been successfully completed with {epoch} epoch, {global_step} steps and {round(loss_gen_all.item(), 3)} loss gen." 
def check_overtraining(smoothed_loss_history, threshold, epsilon=0.004):
    """
    Decide whether the tail of a smoothed loss curve indicates overtraining.

    The newest `threshold` entries are walked oldest-to-newest as adjacent
    pairs, and the first pair that either rises or moves by at least
    `epsilon` settles the outcome.

    Args:
        smoothed_loss_history (list): Smoothed loss values, oldest first.
        threshold (int): Size of the trailing window to inspect. At least `threshold + 1` entries must exist before any verdict other than False is given.
        epsilon (float): Smallest absolute change between neighbours that counts as a significant move.

    Returns:
        bool: False if the history is too short or the first decisive pair is a significant drop. True if the first decisive pair is a rise, or if no pair in the window changes by `epsilon` or more.
    """
    enough_history = len(smoothed_loss_history) >= threshold + 1
    if not enough_history:
        return False

    for older_idx in range(-threshold, -1):
        older = smoothed_loss_history[older_idx]
        newer = smoothed_loss_history[older_idx + 1]
        # Any rise, however small, is treated as a sign of overtraining.
        if newer > older:
            return True
        # A drop of at least epsilon means the model is still improving.
        if abs(newer - older) >= epsilon:
            return False

    # Every inspected pair was flat (within epsilon): the loss has plateaued.
    return True
def save_to_json(
    file_path,
    loss_disc_history,
    smoothed_loss_disc_history,
    loss_gen_history,
    smoothed_loss_gen_history,
):
    """
    Write the four loss histories to `file_path` as one JSON object.

    Args:
        file_path (str): Destination file; it is opened in write mode, so any existing content is replaced.
        loss_disc_history (list): Raw discriminator loss values.
        smoothed_loss_disc_history (list): Smoothed discriminator loss values.
        loss_gen_history (list): Raw generator loss values.
        smoothed_loss_gen_history (list): Smoothed generator loss values.
    """
    # Each history is stored under a key equal to its parameter name.
    payload = dict(
        loss_disc_history=loss_disc_history,
        smoothed_loss_disc_history=smoothed_loss_disc_history,
        loss_gen_history=loss_gen_history,
        smoothed_loss_gen_history=smoothed_loss_gen_history,
    )
    with open(file_path, "w") as handle:
        json.dump(payload, handle)
def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
    """
    Load a checkpoint into a model and optionally the optimizer.

    Key parts `.weight_v` / `.weight_g` found in the checkpoint are renamed to
    `.parametrizations.weight.original1` / `.parametrizations.weight.original0`
    before loading. Parameters the checkpoint does not contain keep the values
    the model currently holds.

    Args:
        checkpoint_path (str): Path to the checkpoint file.
        model (torch.nn.Module): The model to load the checkpoint into. If it exposes a `module` attribute, that inner module is the one loaded.
        optimizer (torch.optim.Optimizer, optional): The optimizer to load the state from. Defaults to None.
        load_opt (int, optional): Whether to load the optimizer state. Defaults to 1.

    Returns:
        tuple: `(model, optimizer, learning_rate, iteration, scaler_state)`, where `learning_rate` falls back to 0 and `scaler_state` to `{}` when the checkpoint has no such entry.

    Raises:
        FileNotFoundError: If `checkpoint_path` is None or does not point to an existing file.
    """
    # An explicit check (instead of `assert`) still runs under `python -O` and
    # turns a None path into a clear error rather than a TypeError from os.path.
    if checkpoint_path is None or not os.path.isfile(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint file not found: {checkpoint_path}")

    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    checkpoint_dict = replace_keys_in_dict(
        replace_keys_in_dict(
            checkpoint_dict, ".weight_v", ".parametrizations.weight.original1"
        ),
        ".weight_g",
        ".parametrizations.weight.original0",
    )

    # Unwrap a wrapper that exposes `.module` so keys match the bare model.
    target = model.module if hasattr(model, "module") else model

    # Start from the model's own state and overlay whatever the checkpoint has.
    saved_weights = checkpoint_dict["model"]
    new_state_dict = {
        k: saved_weights.get(k, v) for k, v in target.state_dict().items()
    }
    target.load_state_dict(new_state_dict, strict=False)

    if optimizer and load_opt == 1:
        # NOTE(review): the `{}` fallback is passed straight to
        # `load_state_dict`; confirm checkpoints always carry "optimizer".
        optimizer.load_state_dict(checkpoint_dict.get("optimizer", {}))

    print(
        f"Loaded checkpoint '{checkpoint_path}' (epoch {checkpoint_dict['iteration']})"
    )
    return (
        model,
        optimizer,
        checkpoint_dict.get("learning_rate", 0),
        checkpoint_dict["iteration"],
        checkpoint_dict.get("scaler", {}),
    )
save_checkpoint( + model, optimizer, learning_rate, iteration, checkpoint_path, scaler +): + """ + Save the model and optimizer state to a checkpoint file. + + Args: + model (torch.nn.Module): The model to save. + optimizer (torch.optim.Optimizer): The optimizer to save the state of. + learning_rate (float): The current learning rate. + iteration (int): The current iteration. + checkpoint_path (str): The path to save the checkpoint to. + """ + state_dict = ( + model.module.state_dict() if hasattr(model, "module") else model.state_dict() + ) + checkpoint_data = { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + "scaler": scaler.state_dict(), + } + + # Create a backwards-compatible checkpoint + torch.save( + replace_keys_in_dict( + replace_keys_in_dict( + checkpoint_data, ".parametrizations.weight.original1", ".weight_v" + ), + ".parametrizations.weight.original0", + ".weight_g", + ), + checkpoint_path, + ) + + print(f"Saved model '{checkpoint_path}' (epoch {iteration})") + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sample_rate=22050, +): + """ + Log various summaries to a TensorBoard writer. + + Args: + writer (SummaryWriter): The TensorBoard writer. + global_step (int): The current global step. + scalars (dict, optional): Dictionary of scalar values to log. + histograms (dict, optional): Dictionary of histogram values to log. + images (dict, optional): Dictionary of image values to log. + audios (dict, optional): Dictionary of audio values to log. + audio_sample_rate (int, optional): Sampling rate of the audio data. 
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    """
    Get the latest checkpoint file in a directory.

    "Latest" means the file whose name contains the largest number, formed by
    concatenating the digits of the file name only. Digits in `dir_path` are
    ignored so that directory names cannot distort the ordering.

    Args:
        dir_path (str): The directory to search for checkpoints.
        regex (str, optional): A `glob` pattern (not a regular expression, despite the parameter name) selecting candidate files.

    Returns:
        str or None: Path of the highest-numbered match, or None when nothing matches the pattern.
    """

    def _step_number(path):
        # Rank by the file name alone; names without any digit sort first
        # instead of raising ValueError from int("").
        digits = "".join(filter(str.isdigit, os.path.basename(path)))
        return int(digits) if digits else -1

    checkpoints = sorted(
        glob.glob(os.path.join(dir_path, regex)),
        key=_step_number,
    )
    return checkpoints[-1] if checkpoints else None
class HParams:
    """
    Hyperparameter container backed by the instance `__dict__`.

    Values can be read as attributes or through a dict-like interface
    (`keys`, `items`, `values`, `len`, `in`, `[]`). Dict values passed to the
    constructor are wrapped recursively into nested `HParams` objects.
    """

    def __init__(self, **kwargs):
        for name, raw in kwargs.items():
            # Nested dicts become nested HParams so they offer the same interface.
            if isinstance(raw, dict):
                self[name] = HParams(**raw)
            else:
                self[name] = raw

    def keys(self):
        return vars(self).keys()

    def items(self):
        return vars(self).items()

    def values(self):
        return vars(self).values()

    def __len__(self):
        return len(vars(self))

    def __getitem__(self, key):
        return vars(self)[key]

    def __setitem__(self, key, value):
        # Writing into the instance dict also makes the entry an attribute.
        vars(self)[key] = value

    def __contains__(self, key):
        return key in vars(self)

    def __repr__(self):
        return repr(vars(self))