|
|
import os |
|
|
import gc |
|
|
import ast |
|
|
import requests |
|
|
import sys |
|
|
import shutil |
|
|
import zipfile |
|
|
import gradio as gr |
|
|
import urllib.request |
|
|
import gdown |
|
|
import tempfile |
|
|
from datetime import datetime |
|
|
|
|
|
current_dir = os.getcwd() |
|
|
dirs = [
    "voice_models",
    "vbach",
    os.path.join("vbach", "cli"),
    os.path.join("vbach", "infer"),
    os.path.join("vbach", "lib"),
    os.path.join("vbach", "lib", "algorithm"),
    os.path.join("vbach", "lib", "predictors"),
    os.path.join("vbach", "models"),
    os.path.join("vbach", "models", "predictors"),
    os.path.join("vbach", "models", "embedders"),
    os.path.join("vbach", "scripts"),
    os.path.join("vbach", "utils"),
]
|
|
|
|
|
RMVPE_PATH = os.path.join(dirs[8], "rmvpe.pt") |
|
|
FCPE_PATH = os.path.join(dirs[8], "fcpe.pt") |
|
|
RVC_MODELS_DIR = dirs[0] |
|
|
HUBERT_MODEL_PATH = os.path.join( |
|
|
dirs[9], "hubert_base.pt" |
|
|
) |
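
# rmvpe.pt and fcpe.pt are pretrained pitch (F0) predictors; hubert_base.pt is
# the HuBERT content encoder that RVC uses to extract speech features.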
|
|
CURRENT_LANG = "ru" |
|
|
OUTPUT_FORMAT = ["mp3", "wav", "flac", "aiff", "m4a", "aac", "ogg", "opus"] |
|
|
TRANSLATIONS = { |
|
|
"ru": { |
|
|
"app_title": "VBach", |
|
|
"inference": "Инференс", |
|
|
"select_file": "Выберите файл", |
|
|
"audio_path": "Путь к файлу", |
|
|
"audio_path_info": "Здесь можно ввести путь к файлу/список путей к файлам , либо загрузить его/их выше и получить путь к нему/их список", |
|
|
"audio_processing": "Режим обработки аудио", |
|
|
"output_format": "Формат вывода", |
|
|
"name_format": "Шаблон", |
|
|
"name_format_info": """Доступные ключи для формата: |
|
|
NAME - Имя входного файла |
|
|
MODEL - Название модели |
|
|
PITCH - Высота тона |
|
|
F0_METHOD - Метод извлечения тона |
|
|
DATETIME - Время и дата создания результата |
|
|
|
|
|
Пример - NAME_MODEL_PITCH → name_your-model_12""", |
|
|
"convert_single": "Конвертировать один", |
|
|
"convert_batch": "Конвертировать несколько", |
|
|
"model_name": "Имя модели", |
|
|
"pitch_method": "Метод извлечения тона", |
|
|
"pitch": "Высота тона", |
|
|
"hop_length": "Длина шага", |
|
|
"bitrate": "Битрейт (Кбит/сек)", |
|
|
"f0_min": "Нижний лимит определения высоты тона", |
|
|
"f0_max": "Верхний лимит определения высоты тона", |
|
|
"advanced_settings": "Дополнительные настройки", |
|
|
"filter_radius": "Радиус фильтра", |
|
|
"index_rate": "Влияние индекса", |
|
|
"rms": "Огибающая громкости", |
|
|
"protect": "Защита согласных", |
|
|
"model_manager": "Менеджер моделей", |
|
|
"download_url": "Загрузить по ссылке", |
|
|
"download_zip": "Загрузить ZIP архивом", |
|
|
"download_files": "Загрузить файлами", |
|
|
"delete_model": "Удалить модель", |
|
|
"download_link": "Ссылка на загрузку модели", |
|
|
"unique_name": "Дайте вашей загружаемой модели уникальное имя, отличное от других голосовых моделей.", |
|
|
"download_button": "Загрузить модель", |
|
|
"supported_sites": "Поддерживаемые сайты", |
|
|
"output_message": "Сообщение вывода", |
|
|
"zip_file": "Zip-файл", |
|
|
"upload_steps": "<h3>1. Найдите и скачайте файлы: .pth и необязательный файл .index</h3><h3>2. Закиньте файл(-ы) в ZIP-архив и поместите его в область загрузки</h3><h3>3. Дождитесь полной загрузки ZIP-архива в интерфейс</h3>", |
|
|
"pth_file": "pth-файл", |
|
|
"index_file": "index-файл", |
|
|
"delete_info": "Выберите модель, которую надо удалить", |
|
|
"refresh_button": "Обновить список моделей", |
|
|
"delete_button": "Удалить модель", |
|
|
"batch_upload": "Пакетная загрузка", |
|
|
"single_upload": "Одиночная загрузка", |
|
|
"converted_voice": "Преобразованный вокал", |
|
|
"converted_voices": "Преобразованные вокалы", |
|
|
"update_button": "Обновить", |
|
|
"processing": "Сейчас обрабатывается - {namefile}", |
|
|
"files": "файлов", |
|
|
"error_no_audio": "Не удалось найти аудиофайл(ы). Убедитесь, что файл загрузился или проверьте правильность пути к нему.", |
|
|
"error_no_model": "Выберите модель голоса для преобразования голоса", |
|
|
"warning_file_not_found": "Файл {file} не найден.", |
|
|
"success_single": "Вокал успешно преобразован", |
|
|
"success_batch": "Вокалы успешно преобразованы", |
|
|
"language": "Язык", |
|
|
"stereo_modes": { |
|
|
"mono": "Моно", |
|
|
"left/right": "Левый/Правый", |
|
|
"sim/dif": "Сходство/Различия" |
|
|
}, |
|
|
|
|
|
'downloading_google': "[~] Загрузка модели с Google Drive...", |
|
|
'downloading_huggingface': "[~] Загрузка модели с HuggingFace...", |
|
|
'downloading_pixeldrain': "[~] Загрузка модели с Pixeldrain...", |
|
|
'downloading_yandex': "[~] Загрузка модели с Яндекс Диска...", |
|
|
'downloading_model': "[~] Загрузка голосовой модели {dir_name}...", |
|
|
'unpacking_zip': "[~] Распаковка zip-файла...", |
|
|
|
|
|
|
|
|
'unsupported_source': "Неподдерживаемый источник: {url}", |
|
|
'download_error': "Ошибка при скачивании: {error}", |
|
|
'yandex_api_error': "Ошибка при получении ссылки с Яндекс Диска: {status}", |
|
|
'pth_not_found': "Не найден файл модели .pth в распакованном zip-файле. Проверьте содержимое в {folder}.", |
|
|
'model_exists': "Директория голосовой модели {dir_name} уже существует! Выберите другое имя.", |
|
|
'model_load_error': "Ошибка при загрузке модели: {error}", |
|
|
'model_delete_error': "Ошибка при удалении модели: {error}", |
|
|
|
|
|
|
|
|
'mega_unsupported': "Mega не поддерживается!", |
|
|
'model_uploaded': "[+] Модель {dir_name} успешно загружена!", |
|
|
'model_deleted': "[-] Модель {dir_name} успешно удалена!", |
|
|
'model_not_found': "[-] Модели {dir_name} не существует", |
|
|
"error_strlist_is_not_list": "Эта строка не является списком файлов", |
|
|
"error_path_is_list": "Путь к файлу является списком" |
|
|
}, |
|
|
"en": { |
|
|
"app_title": "VBach", |
|
|
"inference": "Inference", |
|
|
"select_file": "Select File", |
|
|
"audio_path": "Audio path", |
|
|
"audio_path_info": "You can enter a file path or a list of file paths here, or upload the file(s) above to obtain their path(s)", |
|
|
"audio_processing": "Audio Processing Mode", |
|
|
"output_format": "Output Format", |
|
|
"name_format": "Template", |
|
|
"name_format_info": """Available format keys: |
|
|
NAME - Input file name |
|
|
MODEL - Model name |
|
|
PITCH - Pitch |
|
|
F0_METHOD - Pitch extraction method
|
|
DATETIME - Date & time create results |
|
|
|
|
|
Example - NAME_MODEL_PITCH → name_your-model_12""", |
|
|
"convert_single": "Convert Single", |
|
|
"convert_batch": "Convert Batch", |
|
|
"model_name": "Model Name", |
|
|
"pitch_method": "Pitch Extraction Method", |
|
|
"pitch": "Pitch", |
|
|
"hop_length": "Hop Length", |
|
|
"bitrate": "Bitrate (Kbit/sec)", |
|
|
"f0_min": "F0 Min", |
|
|
"f0_max": "F0 Max", |
|
|
"advanced_settings": "Advanced Settings", |
|
|
"filter_radius": "Filter Radius", |
|
|
"index_rate": "Index Rate", |
|
|
"rms": "RMS Envelope", |
|
|
"protect": "Consonant Protection", |
|
|
"model_manager": "Model Manager", |
|
|
"download_url": "Download by URL", |
|
|
"download_zip": "Upload ZIP Archive", |
|
|
"download_files": "Upload Files", |
|
|
"delete_model": "Delete Model", |
|
|
"download_link": "Model Download Link", |
|
|
"unique_name": "Give your model a unique name different from other voice models.", |
|
|
"download_button": "Download Model", |
|
|
"supported_sites": "Supported Sites", |
|
|
"output_message": "Output Message", |
|
|
"zip_file": "Zip File", |
|
|
"upload_steps": "<h3>1. Find and download files: .pth and optional .index</h3><h3>2. Put file(s) in a ZIP archive and upload it</h3><h3>3. Wait for the ZIP archive to be fully uploaded</h3>", |
|
|
"pth_file": "PTH File", |
|
|
"index_file": "Index File", |
|
|
"delete_info": "Select the model to delete", |
|
|
"refresh_button": "Refresh Model List", |
|
|
"delete_button": "Delete Model", |
|
|
"batch_upload": "Batch Upload", |
|
|
"single_upload": "Single Upload", |
|
|
"converted_voice": "Converted Voice", |
|
|
"converted_voices": "Converted Voices", |
|
|
"update_button": "Refresh", |
|
|
"processing": "Processing - {namefile}", |
|
|
"files": "files", |
|
|
"error_no_audio": "Could not find audio file(s). Make sure the file is uploaded or check the file path.", |
|
|
"error_no_model": "Select a voice model for voice conversion", |
|
|
"warning_file_not_found": "File {file} not found.", |
|
|
"success_single": "Voice successfully converted", |
|
|
"success_batch": "Voices successfully converted", |
|
|
"language": "Language", |
|
|
"stereo_modes": { |
|
|
"mono": "Mono", |
|
|
"left/right": "Left/Right", |
|
|
"sim/dif": "Similarity/Difference" |
|
|
}, |
|
|
'downloading_google': "[~] Downloading model from Google Drive...", |
|
|
'downloading_huggingface': "[~] Downloading model from HuggingFace...", |
|
|
'downloading_pixeldrain': "[~] Downloading model from Pixeldrain...", |
|
|
'downloading_yandex': "[~] Downloading model from Yandex Disk...", |
|
|
'downloading_model': "[~] Downloading voice model {dir_name}...", |
|
|
'unpacking_zip': "[~] Unpacking zip file...", |
|
|
|
|
|
|
|
|
'unsupported_source': "Unsupported source: {url}", |
|
|
'download_error': "Download error: {error}", |
|
|
'yandex_api_error': "Yandex Disk API error: {status}", |
|
|
'pth_not_found': "Model .pth file not found in unzipped archive. Check contents in {folder}.", |
|
|
'model_exists': "Voice model directory {dir_name} already exists! Choose another name.", |
|
|
'model_load_error': "Error loading model: {error}", |
|
|
'model_delete_error': "Error deleting model: {error}", |
|
|
|
|
|
|
|
|
'mega_unsupported': "Mega is not supported!", |
|
|
'model_uploaded': "[+] Model {dir_name} uploaded successfully!", |
|
|
'model_deleted': "[-] Model {dir_name} deleted successfully!", |
|
|
'model_not_found': "[-] Model {dir_name} does not exist", |
|
|
"error_strlist_is_not_list": "This string is not a file list", |
|
|
"error_path_is_list": "The file path is a list" |
|
|
} |
|
|
} |
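
# UI strings are meant to be looked up as TRANSLATIONS[lang][key]; nested dicts
# such as "stereo_modes" map internal mode ids to display labels.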
|
|
|
|
|
|
|
|
for d in dirs:
    os.makedirs(os.path.join(current_dir, d), exist_ok=True)
|
|
|
|
|
for url, file in [["https://huggingface.co/Politrees/RVC_resources/resolve/main/predictors/rmvpe.pt", RMVPE_PATH], ["https://huggingface.co/Politrees/RVC_resources/resolve/main/predictors/fcpe.pt", FCPE_PATH], ["https://huggingface.co/Politrees/RVC_resources/resolve/main/embedders/hubert_base.pt", HUBERT_MODEL_PATH]]: |
|
|
if not os.path.exists(file): |
|
|
try: |
|
|
r = requests.get(url, stream=True) |
|
|
r.raise_for_status() |
|
|
with open(os.path.join(file), "wb") as f: |
|
|
for chunk in r.iter_content(chunk_size=8192): |
|
|
f.write(chunk) |
|
|
except requests.exceptions.RequestException as e: |
|
|
print(f"Произошла ошибка при загрузке модели: {e}") |
|
|
except Exception as e: |
|
|
print(f"Произошла непредвиденная ошибка: {e}") |
|
|
|
|
|
|
|
|
inference = ''' |
|
|
import torch |
|
|
import numpy as np |
|
|
import librosa |
|
|
from multiprocessing import cpu_count |
|
|
from fairseq import checkpoint_utils |
|
|
|
|
|
from vbach.lib.algorithm.synthesizers import Synthesizer |
|
|
from .pipeline import VC |
|
|
|
|
|
from separator.audio_writer import write_audio_file |
|
|
|
|
|
from vbach.utils.remove_center import remove_center |
|
|
|
|
|
def overlay_mono_on_stereo(mono_audio, stereo_audio, gain=0.5): |
|
|
if mono_audio is None or stereo_audio is None: |
|
|
raise ValueError("Input audio arrays cannot be None") |
|
|
|
|
|
# Ensure float32 for processing |
|
|
mono_audio = mono_audio.astype(np.float32) |
|
|
stereo_audio = stereo_audio.astype(np.float32) |
|
|
|
|
|
# Convert mono to stereo if needed |
|
|
if mono_audio.ndim == 1: |
|
|
mono_audio = np.vstack([mono_audio, mono_audio]) |
|
|
elif mono_audio.shape[0] == 1: |
|
|
mono_audio = np.vstack([mono_audio[0], mono_audio[0]]) |
|
|
|
|
|
if mono_audio.shape[0] != 2 or stereo_audio.shape[0] != 2: |
|
|
raise ValueError("Shapes must be (2, N)") |
|
|
|
|
|
min_len = min(mono_audio.shape[1], stereo_audio.shape[1]) |
|
|
if min_len == 0: |
|
|
raise ValueError("Audio arrays cannot be empty") |
|
|
|
|
|
mono_audio = mono_audio[:, :min_len] |
|
|
stereo_audio = stereo_audio[:, :min_len] |
|
|
|
|
|
result = stereo_audio + mono_audio * gain |
|
|
|
|
|
# Normalize to prevent clipping |
|
|
max_amp = np.max(np.abs(result)) |
|
|
if max_amp > 0: |
|
|
result /= max_amp |
|
|
|
|
|
# Convert back to int16 for output (if needed) |
|
|
result = (result * 32767).astype(np.int16) |
|
|
|
|
|
return result |
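
# Note: overlay_mono_on_stereo always rescales the mix to full scale (peak 1.0)
# before the int16 conversion, so quiet inputs are boosted as well as hot ones.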
|
|
|
|
|
def load_audio( |
|
|
file_path: str, |
|
|
target_sr: int, |
|
|
stereo_mode: str |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Загружает аудиофайл с помощью librosa, обрабатывает и возвращает аудиосигнал |
|
|
|
|
|
Параметры: |
|
|
file_path: Путь к аудиофайлу |
|
|
target_sr: Целевая частота дискретизации |
|
|
mono: Преобразовать в моно (по умолчанию True) |
|
|
normalize: Нормализовать аудио (по умолчанию False) |
|
|
duration: Загрузить только указанную длительность (в секундах) |
|
|
offset: Начальное смещение для загрузки (в секундах) |
|
|
|
|
|
Возвращает: |
|
|
Аудиоданные в виде numpy array (моно: (samples,), стерео: (channels, samples)) |
|
|
|
|
|
Исключения: |
|
|
RuntimeError: При ошибках загрузки или обработки аудио |
|
|
""" |
|
|
try: |
|
|
mid, left, right = None, None, None |
|
|
|
|
|
if stereo_mode == "mono": |
|
|
# Загрузка аудио с помощью librosa |
|
|
mid_audio, sr = librosa.load( |
|
|
file_path, |
|
|
sr=None, |
|
|
mono=True |
|
|
) |
|
|
mid_audio = librosa.resample( |
|
|
mid_audio, # Исправлено: было audio |
|
|
orig_sr=sr, |
|
|
target_sr=target_sr |
|
|
) |
|
|
mid = mid_audio.flatten() |
|
|
|
|
|
elif stereo_mode == "left/right" or stereo_mode == "sim/dif": |
|
|
# Загрузка аудио с помощью librosa |
|
|
stereo_audio, sr = librosa.load( |
|
|
file_path, |
|
|
sr=None, |
|
|
mono=False |
|
|
) |
|
|
|
|
|
if stereo_mode == "left/right": |
|
|
left_audio = stereo_audio[0] # Исправлено: было [:, 0] |
|
|
right_audio = stereo_audio[1] # Исправлено: было [:, 1] |
|
|
left_audio = librosa.resample( |
|
|
left_audio, |
|
|
orig_sr=sr, |
|
|
target_sr=target_sr |
|
|
) |
|
|
right_audio = librosa.resample( |
|
|
right_audio, |
|
|
orig_sr=sr, |
|
|
target_sr=target_sr |
|
|
) |
|
|
|
|
|
left = left_audio.flatten() |
|
|
right = right_audio.flatten() |
|
|
|
|
|
elif stereo_mode == "sim/dif": |
|
|
mid_left, mid_right, dif_left, dif_right = remove_center(input_array=stereo_audio, samplerate=sr) |
|
|
mid_audio = (mid_left + mid_right) * 0.5 |
|
|
|
|
|
mid_audio = librosa.resample( |
|
|
mid_audio, |
|
|
orig_sr=sr, |
|
|
target_sr=target_sr |
|
|
) |
|
|
dif_left = librosa.resample( |
|
|
dif_left, |
|
|
orig_sr=sr, |
|
|
target_sr=target_sr |
|
|
) |
|
|
dif_right = librosa.resample( |
|
|
dif_right, |
|
|
orig_sr=sr, |
|
|
target_sr=target_sr |
|
|
) |
|
|
|
|
|
mid = mid_audio.flatten() |
|
|
left = dif_left.flatten() # Исправлено: было left_audio |
|
|
right = dif_right.flatten() # Исправлено: было right_audio |
|
|
|
|
|
return mid, left, right |
|
|
|
|
|
except Exception as e: |
|
|
raise RuntimeError(f"Ошибка загрузки аудио '{file_path}': {str(e)}") |
|
|
|
|
|
class Config: |
|
|
def __init__(self): |
|
|
        self.device = self.get_device()
        # Half precision is only safe on CUDA; CPU and MPS run in float32
        self.is_half = self.device == "cuda"
|
|
self.n_cpu = cpu_count() |
|
|
self.gpu_name = None |
|
|
self.gpu_mem = None |
|
|
self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() |
|
|
|
|
|
def get_device(self): |
|
|
if torch.cuda.is_available(): |
|
|
return "cuda" |
|
|
elif torch.backends.mps.is_available(): |
|
|
return "mps" |
|
|
else: |
|
|
return "cpu" |
|
|
|
|
|
def device_config(self): |
|
|
        if torch.cuda.is_available():
            print("Using CUDA device")
            self._configure_gpu()
        elif torch.backends.mps.is_available():
            print("Using MPS device")
            self.device = "mps"
            self.is_half = False
        else:
            print("Using CPU")
            self.device = "cpu"
            self.is_half = False
|
|
|
|
|
x_pad, x_query, x_center, x_max = ( |
|
|
(3, 10, 60, 65) if self.is_half else (1, 6, 38, 41) |
|
|
) |
|
|
if self.gpu_mem is not None and self.gpu_mem <= 4: |
|
|
x_pad, x_query, x_center, x_max = (1, 5, 30, 32) |
|
|
|
|
|
return x_pad, x_query, x_center, x_max |
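
    # x_pad / x_query / x_center / x_max are chunking parameters in seconds:
    # padding added around each chunk, the search window for quiet split points,
    # the nominal split spacing, and the maximum length before a chunk is split.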
|
|
|
|
|
def _configure_gpu(self): |
|
|
self.gpu_name = torch.cuda.get_device_name(self.device) |
|
|
low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"] |
|
|
if ( |
|
|
any(gpu in self.gpu_name for gpu in low_end_gpus) |
|
|
and "V100" not in self.gpu_name.upper() |
|
|
): |
|
|
self.is_half = False |
|
|
self.gpu_mem = int( |
|
|
torch.cuda.get_device_properties(self.device).total_memory |
|
|
/ 1024 |
|
|
/ 1024 |
|
|
/ 1024 |
|
|
+ 0.4 |
|
|
) |
|
|
|
|
|
# Load the HuBERT content encoder
|
|
def load_hubert(device, is_half, model_path): |
|
|
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( |
|
|
[model_path], suffix="" |
|
|
) |
|
|
hubert = models[0].to(device) |
|
|
hubert = hubert.half() if is_half else hubert.float() |
|
|
hubert.eval() |
|
|
return hubert |
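
# Minimal usage sketch (the path matches this project's layout):
#   config = Config()
#   hubert = load_hubert(config.device, config.is_half,
#                        "vbach/models/embedders/hubert_base.pt")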
|
|
|
|
|
# Build the voice conversion components from an RVC checkpoint
|
|
def get_vc(device, is_half, config, model_path): |
|
|
cpt = torch.load(model_path, map_location="cpu", weights_only=False) |
|
|
if "config" not in cpt or "weight" not in cpt: |
|
|
raise ValueError( |
|
|
f"Некорректный формат для {model_path}. " |
|
|
"Используйте голосовую модель, обученную с использованием RVC v2." |
|
|
) |
|
|
|
|
|
tgt_sr = cpt["config"][-1] |
|
|
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] |
|
|
pitch_guidance = cpt.get("f0", 1) |
|
|
version = cpt.get("version", "v1") |
|
|
input_dim = 768 if version == "v2" else 256 |
|
|
|
|
|
net_g = Synthesizer( |
|
|
*cpt["config"], |
|
|
use_f0=pitch_guidance, |
|
|
input_dim=input_dim, |
|
|
is_half=is_half, |
|
|
) |
|
|
|
|
|
del net_g.enc_q |
|
|
print(net_g.load_state_dict(cpt["weight"], strict=False)) |
|
|
net_g.eval().to(device) |
|
|
net_g = net_g.half() if is_half else net_g.float() |
|
|
|
|
|
vc = VC(tgt_sr, config) |
|
|
return cpt, version, net_g, tgt_sr, vc |
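
# Minimal usage sketch, assuming an RVC v2 voice model downloaded into
# voice_models/ (the .pth filename here is hypothetical):
#   cpt, version, net_g, tgt_sr, vc = get_vc(
#       config.device, config.is_half, config, "voice_models/my_model/model.pth"
#   )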
|
|
|
|
|
def rvc_infer( |
|
|
index_path, |
|
|
index_rate, |
|
|
input_path, |
|
|
output_path, |
|
|
pitch, |
|
|
f0_method, |
|
|
cpt, |
|
|
version, |
|
|
net_g, |
|
|
filter_radius, |
|
|
tgt_sr, |
|
|
volume_envelope, |
|
|
protect, |
|
|
hop_length, |
|
|
vc, |
|
|
hubert_model, |
|
|
f0_min=50, |
|
|
f0_max=1100, |
|
|
format_output="wav", |
|
|
output_bitrate="320k", |
|
|
stereo_mode="mono" |
|
|
): |
|
|
|
|
|
    mid, left, right = load_audio(input_path, 16000, stereo_mode)
    pitch_guidance = cpt.get("f0", 1)

    def convert(channel_audio):
        # Run the RVC pipeline on a single mono channel
        return vc.pipeline(
            hubert_model,
            net_g,
            0,
            channel_audio,
            input_path,
            pitch,
            f0_method,
            index_path,
            index_rate,
            pitch_guidance,
            filter_radius,
            tgt_sr,
            0,
            volume_envelope,
            version,
            protect,
            hop_length,
            f0_file=None,
            f0_min=f0_min,
            f0_max=f0_max,
        )

    if stereo_mode == "mono":
        if mid is None:
            raise ValueError("Mono audio data is None")
        audio_opt = convert(mid)

    elif stereo_mode == "left/right":
        if left is None or right is None:
            raise ValueError("Left or right audio channel is None")

        left_audio_opt = convert(left)
        right_audio_opt = convert(right)

        # Ensure both channels have the same length
        min_len = min(len(left_audio_opt), len(right_audio_opt))
        if min_len == 0:
            raise ValueError("Processed audio is empty")

        audio_opt = np.stack(
            (left_audio_opt[:min_len], right_audio_opt[:min_len]), axis=0
        )

    elif stereo_mode == "sim/dif":
        if mid is None or left is None or right is None:
            raise ValueError("Mid, left or right audio channel is None")

        mid_audio_opt = convert(mid)
        left_audio_opt = convert(left)
        right_audio_opt = convert(right)

        # Ensure all channels have the same length
        min_len = min(len(mid_audio_opt), len(left_audio_opt), len(right_audio_opt))
        if min_len == 0:
            raise ValueError("Processed audio is empty")

        dif_audio_opt = np.stack(
            (left_audio_opt[:min_len], right_audio_opt[:min_len]), axis=0
        )

        # Overlay the converted center vocal back onto the converted sides
        audio_opt = overlay_mono_on_stereo(mid_audio_opt[:min_len], dif_audio_opt)

    write_audio_file(output_path, audio_opt, tgt_sr, format_output, output_bitrate)
    return output_path
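
# Minimal call sketch ("song.wav" and the index path are hypothetical; the other
# objects come from load_hubert/get_vc above):
#   rvc_infer(index_path="voice_models/my_model/added.index", index_rate=0.5,
#             input_path="song.wav", output_path="out.wav", pitch=0,
#             f0_method="rmvpe+", cpt=cpt, version=version, net_g=net_g,
#             filter_radius=3, tgt_sr=tgt_sr, volume_envelope=1, protect=0.33,
#             hop_length=128, vc=vc, hubert_model=hubert)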
|
|
''' |
|
|
|
|
|
pipeline = ''' |
|
|
import os |
|
|
import gc |
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
import torchcrepe |
|
|
import faiss |
|
|
import librosa |
|
|
import numpy as np |
|
|
from scipy import signal |
|
|
|
|
|
from vbach.lib.predictors.FCPE import FCPEF0Predictor |
|
|
from vbach.lib.predictors.RMVPE import RMVPE0Predictor |
|
|
|
|
|
PREDICTORS_DIR = os.path.join(os.getcwd(), "vbach", "models", "predictors") |
|
|
RMVPE_DIR = os.path.join(PREDICTORS_DIR, "rmvpe.pt") |
|
|
FCPE_DIR = os.path.join(PREDICTORS_DIR, "fcpe.pt") |
|
|
|
|
|
# High-pass Butterworth filter (removes DC offset and low-frequency rumble)
FILTER_ORDER = 5  # Filter order
CUTOFF_FREQUENCY = 48  # Cutoff frequency (Hz)
SAMPLE_RATE = 16000  # Sample rate (Hz)
bh, ah = signal.butter(N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE)
|
|
|
|
|
|
|
|
input_audio_path2wav = {} |
|
|
|
|
|
|
|
|
# Audio processing helpers
|
|
class AudioProcessor: |
|
|
@staticmethod |
|
|
def change_rms(source_audio, source_rate, target_audio, target_rate, rate): |
|
|
""" |
|
|
Изменяет RMS (среднеквадратичное значение) аудио. |
|
|
""" |
|
|
rms1 = librosa.feature.rms( |
|
|
y=source_audio, |
|
|
frame_length=source_rate // 2 * 2, |
|
|
hop_length=source_rate // 2, |
|
|
) |
|
|
rms2 = librosa.feature.rms( |
|
|
y=target_audio, |
|
|
frame_length=target_rate // 2 * 2, |
|
|
hop_length=target_rate // 2, |
|
|
) |
|
|
|
|
|
rms1 = F.interpolate( |
|
|
torch.from_numpy(rms1).float().unsqueeze(0), |
|
|
size=target_audio.shape[0], |
|
|
mode="linear", |
|
|
).squeeze() |
|
|
rms2 = F.interpolate( |
|
|
torch.from_numpy(rms2).float().unsqueeze(0), |
|
|
size=target_audio.shape[0], |
|
|
mode="linear", |
|
|
).squeeze() |
|
|
rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6) |
|
|
|
|
|
adjusted_audio = ( |
|
|
target_audio * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy() |
|
|
) |
|
|
return adjusted_audio |
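
    # With rate = 1 the target keeps its own loudness envelope (the factor is 1);
    # with rate = 0 the source envelope fully replaces it (factor rms1 / rms2).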
|
|
|
|
|
|
|
|
# Voice conversion class
|
|
class VC: |
|
|
def __init__(self, tgt_sr, config): |
|
|
""" |
|
|
Инициализация параметров для преобразования голоса. |
|
|
""" |
|
|
self.x_pad = config.x_pad |
|
|
self.x_query = config.x_query |
|
|
self.x_center = config.x_center |
|
|
self.x_max = config.x_max |
|
|
self.is_half = config.is_half |
|
|
self.sample_rate = 16000 |
|
|
self.window = 160 |
|
|
self.t_pad = self.sample_rate * self.x_pad |
|
|
self.t_pad_tgt = tgt_sr * self.x_pad |
|
|
self.t_pad2 = self.t_pad * 2 |
|
|
self.t_query = self.sample_rate * self.x_query |
|
|
self.t_center = self.sample_rate * self.x_center |
|
|
self.t_max = self.sample_rate * self.x_max |
|
|
self.time_step = self.window / self.sample_rate * 1000 |
|
|
self.device = config.device |
|
|
|
|
|
def get_f0_crepe(self, x, f0_min, f0_max, p_len, hop_length, model="full"): |
|
|
""" |
|
|
Получает F0 с использованием модели crepe. |
|
|
""" |
|
|
x = x.astype(np.float32) |
|
|
x /= np.quantile(np.abs(x), 0.999) |
|
|
audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(0) |
|
|
if audio.ndim == 2 and audio.shape[0] > 1: |
|
|
audio = torch.mean(audio, dim=0, keepdim=True) |
|
|
|
|
|
pitch = torchcrepe.predict( |
|
|
audio, |
|
|
self.sample_rate, |
|
|
hop_length, |
|
|
f0_min, |
|
|
f0_max, |
|
|
model, |
|
|
batch_size=hop_length * 2, |
|
|
device=self.device, |
|
|
pad=True, |
|
|
) |
|
|
|
|
|
p_len = p_len or x.shape[0] // hop_length |
|
|
source = np.array(pitch.squeeze(0).cpu().float().numpy()) |
|
|
source[source < 0.001] = np.nan |
|
|
target = np.interp( |
|
|
np.arange(0, len(source) * p_len, len(source)) / p_len, |
|
|
np.arange(0, len(source)), |
|
|
source, |
|
|
) |
|
|
f0 = np.nan_to_num(target) |
|
|
return f0 |
|
|
|
|
|
def get_f0_rmvpe(self, x, f0_min=1, f0_max=40000, *args, **kwargs): |
|
|
""" |
|
|
Получает F0 с использованием модели rmvpe. |
|
|
""" |
|
|
if not hasattr(self, "model_rmvpe"): |
|
|
self.model_rmvpe = RMVPE0Predictor( |
|
|
RMVPE_DIR, is_half=self.is_half, device=self.device |
|
|
) |
|
|
f0 = self.model_rmvpe.infer_from_audio_with_pitch( |
|
|
x, thred=0.03, f0_min=f0_min, f0_max=f0_max |
|
|
) |
|
|
return f0 |
|
|
|
|
|
def get_f0( |
|
|
self, |
|
|
input_audio_path, |
|
|
x, |
|
|
p_len, |
|
|
pitch, |
|
|
f0_method, |
|
|
filter_radius, |
|
|
hop_length, |
|
|
inp_f0=None, |
|
|
f0_min=50, |
|
|
f0_max=1100, |
|
|
): |
|
|
""" |
|
|
Получает F0 с использованием выбранного метода. |
|
|
""" |
|
|
global input_audio_path2wav |
|
|
f0_mel_min = 1127 * np.log(1 + f0_min / 700) |
|
|
f0_mel_max = 1127 * np.log(1 + f0_max / 700) |
|
|
|
|
|
if f0_method == "mangio-crepe": |
|
|
f0 = self.get_f0_crepe(x, f0_min, f0_max, p_len, int(hop_length)) |
|
|
|
|
|
elif f0_method == "rmvpe+": |
|
|
params = { |
|
|
"x": x, |
|
|
"p_len": p_len, |
|
|
"pitch": pitch, |
|
|
"f0_min": f0_min, |
|
|
"f0_max": f0_max, |
|
|
"time_step": self.time_step, |
|
|
"filter_radius": filter_radius, |
|
|
"crepe_hop_length": int(hop_length), |
|
|
"model": "full", |
|
|
} |
|
|
f0 = self.get_f0_rmvpe(**params) |
|
|
|
|
|
elif f0_method == "fcpe": |
|
|
self.model_fcpe = FCPEF0Predictor( |
|
|
FCPE_DIR, |
|
|
f0_min=int(f0_min), |
|
|
f0_max=int(f0_max), |
|
|
dtype=torch.float32, |
|
|
device=self.device, |
|
|
sample_rate=self.sample_rate, |
|
|
threshold=0.03, |
|
|
) |
|
|
f0 = self.model_fcpe.compute_f0(x, p_len=p_len) |
|
|
            del self.model_fcpe
            gc.collect()

        else:
            raise ValueError(f"Unknown F0 method: {f0_method}")

        f0 *= pow(2, pitch / 12)
|
|
tf0 = self.sample_rate // self.window |
|
|
if inp_f0 is not None: |
|
|
delta_t = np.round( |
|
|
(inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 |
|
|
).astype("int16") |
|
|
replace_f0 = np.interp(list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]) |
|
|
shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] |
|
|
f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[:shape] |
|
|
|
|
|
f0bak = f0.copy() |
|
|
f0_mel = 1127 * np.log(1 + f0 / 700) |
|
|
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( |
|
|
f0_mel_max - f0_mel_min |
|
|
) + 1 |
|
|
f0_mel[f0_mel <= 1] = 1 |
|
|
f0_mel[f0_mel > 255] = 255 |
|
|
f0_coarse = np.rint(f0_mel).astype(int) |
|
|
return f0_coarse, f0bak |
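
    # f0 is mapped to a mel-like scale, mel = 1127 * ln(1 + f0 / 700), then
    # quantized into the integer range 1..255 expected by the pitch embedding;
    # the unquantized curve (f0bak) is kept for the NSF source module.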
|
|
|
|
|
def vc( |
|
|
self, |
|
|
model, |
|
|
net_g, |
|
|
sid, |
|
|
audio0, |
|
|
pitch, |
|
|
pitchf, |
|
|
index, |
|
|
big_npy, |
|
|
index_rate, |
|
|
version, |
|
|
protect, |
|
|
): |
|
|
""" |
|
|
Преобразует аудио с использованием модели. |
|
|
""" |
|
|
feats = torch.from_numpy(audio0) |
|
|
feats = feats.half() if self.is_half else feats.float() |
|
|
if feats.dim() == 2: |
|
|
feats = feats.mean(-1) |
|
|
assert feats.dim() == 1, feats.dim() |
|
|
feats = feats.view(1, -1) |
|
|
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) |
|
|
|
|
|
inputs = { |
|
|
"source": feats.to(self.device), |
|
|
"padding_mask": padding_mask, |
|
|
"output_layer": 9 if version == "v1" else 12, |
|
|
} |
|
|
|
|
|
with torch.no_grad(): |
|
|
logits = model.extract_features(**inputs) |
|
|
feats = model.final_proj(logits[0]) if version == "v1" else logits[0] |
|
|
if protect < 0.5 and pitch is not None and pitchf is not None: |
|
|
feats0 = feats.clone() |
|
|
if index is not None and big_npy is not None and index_rate != 0: |
|
|
npy = feats[0].cpu().numpy() |
|
|
npy = npy.astype("float32") if self.is_half else npy |
|
|
score, ix = index.search(npy, k=8) |
|
|
weight = np.square(1 / score) |
|
|
weight /= weight.sum(axis=1, keepdims=True) |
|
|
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) |
|
|
npy = npy.astype("float16") if self.is_half else npy |
|
|
feats = ( |
|
|
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate |
|
|
+ (1 - index_rate) * feats |
|
|
) |
|
|
|
|
|
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) |
|
|
if protect < 0.5 and pitch is not None and pitchf is not None: |
|
|
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( |
|
|
0, 2, 1 |
|
|
) |
|
|
p_len = audio0.shape[0] // self.window |
|
|
if feats.shape[1] < p_len: |
|
|
p_len = feats.shape[1] |
|
|
if pitch is not None and pitchf is not None: |
|
|
pitch = pitch[:, :p_len] |
|
|
pitchf = pitchf[:, :p_len] |
|
|
|
|
|
if protect < 0.5 and pitch is not None and pitchf is not None: |
|
|
pitchff = pitchf.clone() |
|
|
pitchff[pitchf > 0] = 1 |
|
|
pitchff[pitchf < 1] = protect |
|
|
pitchff = pitchff.unsqueeze(-1) |
|
|
feats = feats * pitchff + feats0 * (1 - pitchff) |
|
|
feats = feats.to(feats0.dtype) |
|
|
p_len = torch.tensor([p_len], device=self.device).long() |
|
|
with torch.no_grad(): |
|
|
if pitch is not None and pitchf is not None: |
|
|
audio1 = ( |
|
|
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) |
|
|
.data.cpu() |
|
|
.float() |
|
|
.numpy() |
|
|
) |
|
|
else: |
|
|
audio1 = ( |
|
|
(net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() |
|
|
) |
|
|
del feats, p_len, padding_mask |
|
|
if torch.cuda.is_available(): |
|
|
torch.cuda.empty_cache() |
|
|
return audio1 |
|
|
|
|
|
def pipeline( |
|
|
self, |
|
|
model, |
|
|
net_g, |
|
|
sid, |
|
|
audio, |
|
|
input_audio_path, |
|
|
pitch, |
|
|
f0_method, |
|
|
file_index, |
|
|
index_rate, |
|
|
pitch_guidance, |
|
|
filter_radius, |
|
|
tgt_sr, |
|
|
resample_sr, |
|
|
volume_envelope, |
|
|
version, |
|
|
protect, |
|
|
hop_length, |
|
|
f0_file, |
|
|
f0_min=50, |
|
|
f0_max=1100, |
|
|
): |
|
|
""" |
|
|
Основной конвейер для преобразования аудио. |
|
|
""" |
|
|
if ( |
|
|
file_index is not None |
|
|
and file_index != "" |
|
|
and os.path.exists(file_index) |
|
|
and index_rate != 0 |
|
|
): |
|
|
try: |
|
|
index = faiss.read_index(file_index) |
|
|
big_npy = index.reconstruct_n(0, index.ntotal) |
|
|
except Exception as e: |
|
|
print(f"Произошла ошибка при чтении индекса FAISS: {e}") |
|
|
index = big_npy = None |
|
|
else: |
|
|
index = big_npy = None |
|
|
audio = signal.filtfilt(bh, ah, audio) |
|
|
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") |
|
|
opt_ts = [] |
|
|
if audio_pad.shape[0] > self.t_max: |
|
|
audio_sum = np.zeros_like(audio) |
|
|
for i in range(self.window): |
|
|
audio_sum += audio_pad[i : i - self.window] |
|
|
for t in range(self.t_center, audio.shape[0], self.t_center): |
|
|
opt_ts.append( |
|
|
t |
|
|
- self.t_query |
|
|
+ np.where( |
|
|
np.abs(audio_sum[t - self.t_query : t + self.t_query]) |
|
|
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() |
|
|
)[0][0] |
|
|
) |
|
|
s = 0 |
|
|
audio_opt = [] |
|
|
t = None |
|
|
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") |
|
|
p_len = audio_pad.shape[0] // self.window |
|
|
inp_f0 = None |
|
|
if f0_file and hasattr(f0_file, "name"): |
|
|
try: |
|
|
with open(f0_file.name, "r") as f: |
|
|
lines = f.read().strip("\\n").split("\\n") |
|
|
inp_f0 = np.array( |
|
|
[[float(i) for i in line.split(",")] for line in lines], |
|
|
dtype="float32", |
|
|
) |
|
|
except Exception as e: |
|
|
print(f"Произошла ошибка при чтении файла F0: {e}") |
|
|
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() |
|
|
if pitch_guidance: |
|
|
pitch, pitchf = self.get_f0( |
|
|
input_audio_path, |
|
|
audio_pad, |
|
|
p_len, |
|
|
pitch, |
|
|
f0_method, |
|
|
filter_radius, |
|
|
hop_length, |
|
|
inp_f0, |
|
|
f0_min, |
|
|
f0_max, |
|
|
) |
|
|
pitch = pitch[:p_len] |
|
|
pitchf = pitchf[:p_len] |
|
|
if self.device == "mps": |
|
|
pitchf = pitchf.astype(np.float32) |
|
|
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() |
|
|
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() |
|
|
for t in opt_ts: |
|
|
t = t // self.window * self.window |
|
|
if pitch_guidance: |
|
|
audio_opt.append( |
|
|
self.vc( |
|
|
model, |
|
|
net_g, |
|
|
sid, |
|
|
audio_pad[s : t + self.t_pad2 + self.window], |
|
|
pitch[:, s // self.window : (t + self.t_pad2) // self.window], |
|
|
pitchf[:, s // self.window : (t + self.t_pad2) // self.window], |
|
|
index, |
|
|
big_npy, |
|
|
index_rate, |
|
|
version, |
|
|
protect, |
|
|
)[self.t_pad_tgt : -self.t_pad_tgt] |
|
|
) |
|
|
else: |
|
|
audio_opt.append( |
|
|
self.vc( |
|
|
model, |
|
|
net_g, |
|
|
sid, |
|
|
audio_pad[s : t + self.t_pad2 + self.window], |
|
|
None, |
|
|
None, |
|
|
index, |
|
|
big_npy, |
|
|
index_rate, |
|
|
version, |
|
|
protect, |
|
|
)[self.t_pad_tgt : -self.t_pad_tgt] |
|
|
) |
|
|
s = t |
|
|
if pitch_guidance: |
|
|
audio_opt.append( |
|
|
self.vc( |
|
|
model, |
|
|
net_g, |
|
|
sid, |
|
|
audio_pad[t:], |
|
|
pitch[:, t // self.window :] if t is not None else pitch, |
|
|
pitchf[:, t // self.window :] if t is not None else pitchf, |
|
|
index, |
|
|
big_npy, |
|
|
index_rate, |
|
|
version, |
|
|
protect, |
|
|
)[self.t_pad_tgt : -self.t_pad_tgt] |
|
|
) |
|
|
else: |
|
|
audio_opt.append( |
|
|
self.vc( |
|
|
model, |
|
|
net_g, |
|
|
sid, |
|
|
audio_pad[t:], |
|
|
None, |
|
|
None, |
|
|
index, |
|
|
big_npy, |
|
|
index_rate, |
|
|
version, |
|
|
protect, |
|
|
)[self.t_pad_tgt : -self.t_pad_tgt] |
|
|
) |
|
|
|
|
|
audio_opt = np.concatenate(audio_opt) |
|
|
if volume_envelope != 1: |
|
|
audio_opt = AudioProcessor.change_rms( |
|
|
audio, self.sample_rate, audio_opt, tgt_sr, volume_envelope |
|
|
) |
|
|
if resample_sr >= self.sample_rate and tgt_sr != resample_sr: |
|
|
audio_opt = librosa.resample(audio_opt, orig_sr=tgt_sr, target_sr=resample_sr) |
|
|
|
|
|
audio_max = np.abs(audio_opt).max() / 0.99 |
|
|
        max_int16 = 32767
|
|
if audio_max > 1: |
|
|
max_int16 /= audio_max |
|
|
audio_opt = (audio_opt * max_int16).astype(np.int16) |
|
|
|
|
|
        if pitch_guidance:
            del pitch, pitchf
        del sid
|
|
if torch.cuda.is_available(): |
|
|
torch.cuda.empty_cache() |
|
|
|
|
|
return audio_opt |
|
|
''' |
|
|
|
|
|
for path, text in [
    [os.path.join(current_dir, dirs[3], "infer.py"), inference],
    [os.path.join(current_dir, dirs[3], "pipeline.py"), pipeline],
]:
|
|
with open(path, 'w') as f: |
|
|
f.write(text) |
|
|
|
|
|
remove_center = ''' |
|
|
import numpy as np |
|
|
from scipy import signal |
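

# Center-channel extraction via STFT: per time-frequency bin, the "common"
# (center) component is taken as the smaller of the two channel magnitudes,
# and subtracting it from each channel leaves the "sides" (difference) signal.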
|
|
|
|
|
def remove_center(input_array, samplerate, rdf=0.99999, window_size=2048, overlap=2, window_type="blackman", stereo_mode="stereo"): |
|
|
    # Validate input: expect a stereo array with shape (2, samples)
    if input_array.ndim != 2 or input_array.shape[0] != 2:
        raise ValueError("Input must be a stereo array with shape (2, samples)")

    left = input_array[0]
    right = input_array[1]
|
|
|
|
|
# Adjust window size if input is too short |
|
|
nperseg = min(window_size, len(left)) |
|
|
if nperseg < 16: # Minimum reasonable window size |
|
|
nperseg = 16 |
|
|
if len(left) < 16: |
|
|
# For very short inputs, just return the original with warning |
|
|
import warnings |
|
|
warnings.warn(f"Input too short ({len(left)} samples), returning original audio") |
|
|
return left, right, left, right |
|
|
|
|
|
noverlap = nperseg // overlap # Ensure noverlap < nperseg |
|
|
if noverlap >= nperseg: |
|
|
noverlap = nperseg - 1 # Ensure at least 1 sample difference |
|
|
|
|
|
# Compute STFT |
|
|
f, t, Z_left = signal.stft(left, fs=samplerate, nperseg=nperseg, noverlap=noverlap, window=window_type) |
|
|
f, t, Z_right = signal.stft(right, fs=samplerate, nperseg=nperseg, noverlap=noverlap, window=window_type) |
|
|
    if stereo_mode == "mono":
        # Use the phase of the mono mix for the extracted center component
        mono = 0.5 * (left + right)
        _, _, Z_mono = signal.stft(mono, fs=samplerate, nperseg=nperseg, noverlap=noverlap, window=window_type)
        Z_common_left = np.minimum(np.abs(Z_left), np.abs(Z_right)) * np.exp(1j * np.angle(Z_mono))
        Z_common_right = Z_common_left
    else:
        # Keep per-channel phase: each side uses the opposite channel's phase
        Z_common_left = np.minimum(np.abs(Z_left), np.abs(Z_right)) * np.exp(1j * np.angle(Z_right))
        Z_common_right = np.minimum(np.abs(Z_left), np.abs(Z_right)) * np.exp(1j * np.angle(Z_left))
|
|
|
|
|
reduction_factor = rdf |
|
|
|
|
|
Z_new_left = Z_left - Z_common_left * reduction_factor |
|
|
Z_new_right = Z_right - Z_common_right * reduction_factor |
|
|
|
|
|
# Compute ISTFT |
|
|
_, new_left = signal.istft(Z_new_left, fs=samplerate, nperseg=nperseg, noverlap=noverlap, window=window_type) |
|
|
_, new_right = signal.istft(Z_new_right, fs=samplerate, nperseg=nperseg, noverlap=noverlap, window=window_type) |
|
|
_, common_signal_left = signal.istft(Z_common_left, fs=samplerate, nperseg=nperseg, noverlap=noverlap, window=window_type) |
|
|
_, common_signal_right = signal.istft(Z_common_right, fs=samplerate, nperseg=nperseg, noverlap=noverlap, window=window_type) |
|
|
|
|
|
# Trim to original length |
|
|
new_left = new_left[:len(left)] |
|
|
new_right = new_right[:len(right)] |
|
|
common_signal_left = common_signal_left[:len(left)] |
|
|
    common_signal_right = common_signal_right[:len(right)]
|
|
|
|
|
# Normalize |
|
|
peak = np.max([np.abs(new_left).max(), np.abs(new_right).max()]) |
|
|
if peak > 1.0: |
|
|
new_left = new_left / peak |
|
|
new_right = new_right / peak |
|
|
|
|
|
|
|
|
|
|
return common_signal_left, common_signal_right, new_left, new_right |
|
|
''' |
|
|
|
|
|
for path, text in [[os.path.join(current_dir, dirs[11], "remove_center.py"), remove_center]]:
|
|
with open(path, 'w') as f: |
|
|
f.write(text) |
|
|
|
|
|
lib_algorithm = { |
|
|
"synthesizers" : ["synthesizers.py", ''' |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn.utils.weight_norm import remove_weight_norm |
|
|
from typing import Optional |
|
|
|
|
|
from .commons import slice_segments, rand_slice_segments |
|
|
from .encoders import TextEncoder, PosteriorEncoder |
|
|
from .generators import Generator |
|
|
from .nsf import GeneratorNSF |
|
|
from .residuals import ResidualCouplingBlock |
|
|
|
|
|
|
|
|
class Synthesizer(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
spec_channels, |
|
|
segment_size, |
|
|
inter_channels, |
|
|
hidden_channels, |
|
|
filter_channels, |
|
|
n_heads, |
|
|
n_layers, |
|
|
kernel_size, |
|
|
p_dropout, |
|
|
resblock, |
|
|
resblock_kernel_sizes, |
|
|
resblock_dilation_sizes, |
|
|
upsample_rates, |
|
|
upsample_initial_channel, |
|
|
upsample_kernel_sizes, |
|
|
spk_embed_dim, |
|
|
gin_channels, |
|
|
sr, |
|
|
use_f0, |
|
|
input_dim=768, |
|
|
**kwargs |
|
|
): |
|
|
super(Synthesizer, self).__init__() |
|
|
self.spec_channels = spec_channels |
|
|
self.inter_channels = inter_channels |
|
|
self.hidden_channels = hidden_channels |
|
|
self.filter_channels = filter_channels |
|
|
self.n_heads = n_heads |
|
|
self.n_layers = n_layers |
|
|
self.kernel_size = kernel_size |
|
|
self.p_dropout = float(p_dropout) |
|
|
self.resblock = resblock |
|
|
self.resblock_kernel_sizes = resblock_kernel_sizes |
|
|
self.resblock_dilation_sizes = resblock_dilation_sizes |
|
|
self.upsample_rates = upsample_rates |
|
|
self.upsample_initial_channel = upsample_initial_channel |
|
|
self.upsample_kernel_sizes = upsample_kernel_sizes |
|
|
self.segment_size = segment_size |
|
|
self.gin_channels = gin_channels |
|
|
self.spk_embed_dim = spk_embed_dim |
|
|
self.use_f0 = use_f0 |
|
|
|
|
|
self.enc_p = TextEncoder( |
|
|
inter_channels, |
|
|
hidden_channels, |
|
|
filter_channels, |
|
|
n_heads, |
|
|
n_layers, |
|
|
kernel_size, |
|
|
float(p_dropout), |
|
|
input_dim, |
|
|
f0=use_f0, |
|
|
) |
|
|
|
|
|
if use_f0: |
|
|
self.dec = GeneratorNSF( |
|
|
inter_channels, |
|
|
resblock, |
|
|
resblock_kernel_sizes, |
|
|
resblock_dilation_sizes, |
|
|
upsample_rates, |
|
|
upsample_initial_channel, |
|
|
upsample_kernel_sizes, |
|
|
gin_channels=gin_channels, |
|
|
sr=sr, |
|
|
is_half=kwargs["is_half"], |
|
|
) |
|
|
else: |
|
|
self.dec = Generator( |
|
|
inter_channels, |
|
|
resblock, |
|
|
resblock_kernel_sizes, |
|
|
resblock_dilation_sizes, |
|
|
upsample_rates, |
|
|
upsample_initial_channel, |
|
|
upsample_kernel_sizes, |
|
|
gin_channels=gin_channels, |
|
|
) |
|
|
|
|
|
self.enc_q = PosteriorEncoder( |
|
|
spec_channels, |
|
|
inter_channels, |
|
|
hidden_channels, |
|
|
5, |
|
|
1, |
|
|
16, |
|
|
gin_channels=gin_channels, |
|
|
) |
|
|
self.flow = ResidualCouplingBlock( |
|
|
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels |
|
|
) |
|
|
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
self.dec.remove_weight_norm() |
|
|
self.flow.remove_weight_norm() |
|
|
self.enc_q.remove_weight_norm() |
|
|
|
|
|
def __prepare_scriptable__(self): |
|
|
for hook in self.dec._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(self.dec) |
|
|
for hook in self.flow._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(self.flow) |
|
|
if hasattr(self, "enc_q"): |
|
|
for hook in self.enc_q._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(self.enc_q) |
|
|
return self |
|
|
|
|
|
@torch.jit.ignore |
|
|
def forward( |
|
|
self, |
|
|
phone: torch.Tensor, |
|
|
phone_lengths: torch.Tensor, |
|
|
pitch: Optional[torch.Tensor] = None, |
|
|
pitchf: Optional[torch.Tensor] = None, |
|
|
        y: Optional[torch.Tensor] = None,
        y_lengths: Optional[torch.Tensor] = None,
|
|
ds: Optional[torch.Tensor] = None, |
|
|
): |
|
|
g = self.emb_g(ds).unsqueeze(-1) |
|
|
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) |
|
|
if y is not None: |
|
|
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) |
|
|
z_p = self.flow(z, y_mask, g=g) |
|
|
z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) |
|
|
if self.use_f0: |
|
|
pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) |
|
|
o = self.dec(z_slice, pitchf, g=g) |
|
|
else: |
|
|
o = self.dec(z_slice, g=g) |
|
|
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) |
|
|
else: |
|
|
return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) |
|
|
|
|
|
@torch.jit.export |
|
|
def infer( |
|
|
self, |
|
|
phone: torch.Tensor, |
|
|
phone_lengths: torch.Tensor, |
|
|
pitch: Optional[torch.Tensor] = None, |
|
|
nsff0: Optional[torch.Tensor] = None, |
|
|
        sid: Optional[torch.Tensor] = None,
|
|
rate: Optional[torch.Tensor] = None, |
|
|
): |
|
|
g = self.emb_g(sid).unsqueeze(-1) |
|
|
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) |
|
|
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask |
|
|
if rate is not None: |
|
|
assert isinstance(rate, torch.Tensor) |
|
|
head = int(z_p.shape[2] * (1.0 - rate.item())) |
|
|
z_p = z_p[:, :, head:] |
|
|
x_mask = x_mask[:, :, head:] |
|
|
if self.use_f0: |
|
|
nsff0 = nsff0[:, head:] |
|
|
if self.use_f0: |
|
|
z = self.flow(z_p, x_mask, g=g, reverse=True) |
|
|
o = self.dec(z * x_mask, nsff0, g=g) |
|
|
else: |
|
|
z = self.flow(z_p, x_mask, g=g, reverse=True) |
|
|
o = self.dec(z * x_mask, g=g) |
|
|
return o, x_mask, (z, z_p, m_p, logs_p) |
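
    # Inference samples z_p = (m_p + exp(logs_p) * noise * 0.66666) from the
    # prior, inverts the flow, and decodes with the (NSF) generator.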
|
|
|
|
|
'''], |
|
|
"residuals" : ["residuals.py", ''' |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn import functional as F |
|
|
from torch.nn.utils.weight_norm import remove_weight_norm |
|
|
from torch.nn.utils.parametrizations import weight_norm |
|
|
from typing import Optional |
|
|
|
|
|
from .commons import get_padding, init_weights |
|
|
from .modules import WaveNet |
|
|
|
|
|
|
|
|
LRELU_SLOPE = 0.1 |
|
|
|
|
|
|
|
|
def create_conv1d_layer(channels, kernel_size, dilation): |
|
|
return weight_norm( |
|
|
nn.Conv1d( |
|
|
channels, |
|
|
channels, |
|
|
kernel_size, |
|
|
1, |
|
|
dilation=dilation, |
|
|
padding=get_padding(kernel_size, dilation), |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
def apply_mask(tensor, mask): |
|
|
return tensor * mask if mask is not None else tensor |
|
|
|
|
|
|
|
|
class ResBlockBase(nn.Module): |
|
|
def __init__(self, channels, kernel_size, dilations): |
|
|
super(ResBlockBase, self).__init__() |
|
|
self.convs1 = nn.ModuleList( |
|
|
[create_conv1d_layer(channels, kernel_size, d) for d in dilations] |
|
|
) |
|
|
self.convs1.apply(init_weights) |
|
|
|
|
|
self.convs2 = nn.ModuleList( |
|
|
[create_conv1d_layer(channels, kernel_size, 1) for _ in dilations] |
|
|
) |
|
|
self.convs2.apply(init_weights) |
|
|
|
|
|
def forward(self, x, x_mask=None): |
|
|
for c1, c2 in zip(self.convs1, self.convs2): |
|
|
xt = F.leaky_relu(x, LRELU_SLOPE) |
|
|
xt = apply_mask(xt, x_mask) |
|
|
xt = F.leaky_relu(c1(xt), LRELU_SLOPE) |
|
|
xt = apply_mask(xt, x_mask) |
|
|
xt = c2(xt) |
|
|
x = xt + x |
|
|
return apply_mask(x, x_mask) |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
for conv in self.convs1 + self.convs2: |
|
|
remove_weight_norm(conv) |
|
|
|
|
|
|
|
|
class ResBlock1(ResBlockBase): |
|
|
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): |
|
|
super(ResBlock1, self).__init__(channels, kernel_size, dilation) |
|
|
|
|
|
|
|
|
class ResBlock2(ResBlockBase): |
|
|
def __init__(self, channels, kernel_size=3, dilation=(1, 3)): |
|
|
super(ResBlock2, self).__init__(channels, kernel_size, dilation) |
|
|
|
|
|
|
|
|
class Log(nn.Module): |
|
|
def forward(self, x, x_mask, reverse=False, **kwargs): |
|
|
if not reverse: |
|
|
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask |
|
|
logdet = torch.sum(-y, [1, 2]) |
|
|
return y, logdet |
|
|
else: |
|
|
x = torch.exp(x) * x_mask |
|
|
return x |
|
|
|
|
|
|
|
|
class Flip(nn.Module): |
|
|
def forward(self, x, *args, reverse=False, **kwargs): |
|
|
x = torch.flip(x, [1]) |
|
|
if not reverse: |
|
|
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) |
|
|
return x, logdet |
|
|
else: |
|
|
return x |
|
|
|
|
|
|
|
|
class ElementwiseAffine(nn.Module): |
|
|
def __init__(self, channels): |
|
|
super().__init__() |
|
|
self.channels = channels |
|
|
self.m = nn.Parameter(torch.zeros(channels, 1)) |
|
|
self.logs = nn.Parameter(torch.zeros(channels, 1)) |
|
|
|
|
|
def forward(self, x, x_mask, reverse=False, **kwargs): |
|
|
if not reverse: |
|
|
y = self.m + torch.exp(self.logs) * x |
|
|
y = y * x_mask |
|
|
logdet = torch.sum(self.logs * x_mask, [1, 2]) |
|
|
return y, logdet |
|
|
else: |
|
|
x = (x - self.m) * torch.exp(-self.logs) * x_mask |
|
|
return x |
|
|
|
|
|
|
|
|
class ResidualCouplingBlock(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
channels, |
|
|
hidden_channels, |
|
|
kernel_size, |
|
|
dilation_rate, |
|
|
n_layers, |
|
|
n_flows=4, |
|
|
gin_channels=0, |
|
|
): |
|
|
super(ResidualCouplingBlock, self).__init__() |
|
|
self.channels = channels |
|
|
self.hidden_channels = hidden_channels |
|
|
self.kernel_size = kernel_size |
|
|
self.dilation_rate = dilation_rate |
|
|
self.n_layers = n_layers |
|
|
self.n_flows = n_flows |
|
|
self.gin_channels = gin_channels |
|
|
|
|
|
self.flows = nn.ModuleList() |
|
|
for i in range(n_flows): |
|
|
self.flows.append( |
|
|
ResidualCouplingLayer( |
|
|
channels, |
|
|
hidden_channels, |
|
|
kernel_size, |
|
|
dilation_rate, |
|
|
n_layers, |
|
|
gin_channels=gin_channels, |
|
|
mean_only=True, |
|
|
) |
|
|
) |
|
|
self.flows.append(Flip()) |
|
|
|
|
|
def forward( |
|
|
self, |
|
|
x: torch.Tensor, |
|
|
x_mask: torch.Tensor, |
|
|
g: Optional[torch.Tensor] = None, |
|
|
reverse: bool = False, |
|
|
): |
|
|
if not reverse: |
|
|
for flow in self.flows: |
|
|
x, _ = flow(x, x_mask, g=g, reverse=reverse) |
|
|
else: |
|
|
for flow in reversed(self.flows): |
|
|
x = flow.forward(x, x_mask, g=g, reverse=reverse) |
|
|
return x |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
for i in range(self.n_flows): |
|
|
self.flows[i * 2].remove_weight_norm() |
|
|
|
|
|
def __prepare_scriptable__(self): |
|
|
for i in range(self.n_flows): |
|
|
for hook in self.flows[i * 2]._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(self.flows[i * 2]) |
|
|
|
|
|
return self |
|
|
|
|
|
|
|
|
class ResidualCouplingLayer(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
channels, |
|
|
hidden_channels, |
|
|
kernel_size, |
|
|
dilation_rate, |
|
|
n_layers, |
|
|
p_dropout=0, |
|
|
gin_channels=0, |
|
|
mean_only=False, |
|
|
): |
|
|
assert channels % 2 == 0, "channels should be divisible by 2" |
|
|
super().__init__() |
|
|
self.channels = channels |
|
|
self.hidden_channels = hidden_channels |
|
|
self.kernel_size = kernel_size |
|
|
self.dilation_rate = dilation_rate |
|
|
self.n_layers = n_layers |
|
|
self.half_channels = channels // 2 |
|
|
self.mean_only = mean_only |
|
|
|
|
|
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) |
|
|
self.enc = WaveNet( |
|
|
hidden_channels, |
|
|
kernel_size, |
|
|
dilation_rate, |
|
|
n_layers, |
|
|
p_dropout=p_dropout, |
|
|
gin_channels=gin_channels, |
|
|
) |
|
|
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) |
|
|
self.post.weight.data.zero_() |
|
|
self.post.bias.data.zero_() |
|
|
|
|
|
def forward(self, x, x_mask, g=None, reverse=False): |
|
|
x0, x1 = torch.split(x, [self.half_channels] * 2, 1) |
|
|
h = self.pre(x0) * x_mask |
|
|
h = self.enc(h, x_mask, g=g) |
|
|
stats = self.post(h) * x_mask |
|
|
if not self.mean_only: |
|
|
m, logs = torch.split(stats, [self.half_channels] * 2, 1) |
|
|
else: |
|
|
m = stats |
|
|
logs = torch.zeros_like(m) |
|
|
|
|
|
if not reverse: |
|
|
x1 = m + x1 * torch.exp(logs) * x_mask |
|
|
x = torch.cat([x0, x1], 1) |
|
|
logdet = torch.sum(logs, [1, 2]) |
|
|
return x, logdet |
|
|
else: |
|
|
x1 = (x1 - m) * torch.exp(-logs) * x_mask |
|
|
x = torch.cat([x0, x1], 1) |
|
|
return x |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
self.enc.remove_weight_norm() |
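

# Each coupling layer transforms one half of the channels conditioned on the
# other half, so the transform stays cheaply invertible in both directions.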
|
|
|
|
|
'''], |
|
|
"nsf" : ["nsf.py", ''' |
|
|
import math |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn import functional as F |
|
|
from torch.nn.utils.weight_norm import remove_weight_norm |
|
|
from torch.nn.utils.parametrizations import weight_norm |
|
|
from typing import Optional |
|
|
|
|
|
from .commons import init_weights |
|
|
from .generators import SineGen |
|
|
from .residuals import LRELU_SLOPE, ResBlock1, ResBlock2 |
|
|
|
|
|
|
|
|
class SourceModuleHnNSF(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
sample_rate, |
|
|
harmonic_num=0, |
|
|
sine_amp=0.1, |
|
|
add_noise_std=0.003, |
|
|
voiced_threshod=0, |
|
|
is_half=True, |
|
|
): |
|
|
super(SourceModuleHnNSF, self).__init__() |
|
|
|
|
|
self.sine_amp = sine_amp |
|
|
self.noise_std = add_noise_std |
|
|
self.is_half = is_half |
|
|
|
|
|
self.l_sin_gen = SineGen( |
|
|
sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod |
|
|
) |
|
|
self.l_linear = nn.Linear(harmonic_num + 1, 1) |
|
|
self.l_tanh = nn.Tanh() |
|
|
|
|
|
def forward(self, x: torch.Tensor, upsample_factor: int = 1): |
|
|
sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor) |
|
|
sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) |
|
|
sine_merge = self.l_tanh(self.l_linear(sine_wavs)) |
|
|
return sine_merge, None, None |
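
    # The sine generator produces an F0-locked (harmonic) excitation; the linear
    # layer plus tanh merges the sine channels into a single source waveform.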
|
|
|
|
|
|
|
|
class GeneratorNSF(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
initial_channel, |
|
|
resblock, |
|
|
resblock_kernel_sizes, |
|
|
resblock_dilation_sizes, |
|
|
upsample_rates, |
|
|
upsample_initial_channel, |
|
|
upsample_kernel_sizes, |
|
|
gin_channels, |
|
|
sr, |
|
|
is_half=False, |
|
|
): |
|
|
super(GeneratorNSF, self).__init__() |
|
|
|
|
|
self.num_kernels = len(resblock_kernel_sizes) |
|
|
self.num_upsamples = len(upsample_rates) |
|
|
self.f0_upsamp = nn.Upsample(scale_factor=math.prod(upsample_rates)) |
|
|
self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0, is_half=is_half) |
|
|
|
|
|
self.conv_pre = nn.Conv1d( |
|
|
initial_channel, upsample_initial_channel, 7, 1, padding=3 |
|
|
) |
|
|
resblock_cls = ResBlock1 if resblock == "1" else ResBlock2 |
|
|
|
|
|
self.ups = nn.ModuleList() |
|
|
self.noise_convs = nn.ModuleList() |
|
|
|
|
|
channels = [ |
|
|
upsample_initial_channel // (2 ** (i + 1)) for i in range(len(upsample_rates)) |
|
|
] |
|
|
stride_f0s = [ |
|
|
math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 |
|
|
for i in range(len(upsample_rates)) |
|
|
] |
|
|
|
|
|
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): |
|
|
self.ups.append( |
|
|
weight_norm( |
|
|
nn.ConvTranspose1d( |
|
|
upsample_initial_channel // (2**i), |
|
|
channels[i], |
|
|
k, |
|
|
u, |
|
|
padding=(k - u) // 2, |
|
|
) |
|
|
) |
|
|
) |
|
|
|
|
|
self.noise_convs.append( |
|
|
nn.Conv1d( |
|
|
1, |
|
|
channels[i], |
|
|
kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1), |
|
|
stride=stride_f0s[i], |
|
|
padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0), |
|
|
) |
|
|
) |
|
|
|
|
|
self.resblocks = nn.ModuleList( |
|
|
[ |
|
|
resblock_cls(channels[i], k, d) |
|
|
for i in range(len(self.ups)) |
|
|
for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes) |
|
|
] |
|
|
) |
|
|
|
|
|
self.conv_post = nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False) |
|
|
self.ups.apply(init_weights) |
|
|
|
|
|
if gin_channels != 0: |
|
|
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) |
|
|
|
|
|
self.upp = math.prod(upsample_rates) |
|
|
self.lrelu_slope = LRELU_SLOPE |
|
|
|
|
|
def forward(self, x, f0, g: Optional[torch.Tensor] = None): |
|
|
har_source, _, _ = self.m_source(f0, self.upp) |
|
|
har_source = har_source.transpose(1, 2) |
|
|
x = self.conv_pre(x) |
|
|
|
|
|
if g is not None: |
|
|
x = x + self.cond(g) |
|
|
|
|
|
for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): |
|
|
x = F.leaky_relu(x, self.lrelu_slope) |
|
|
x = ups(x) |
|
|
x = x + noise_convs(har_source) |
|
|
|
|
|
            # Average the parallel residual blocks attached to this upsampling stage
            xs = sum(
                self.resblocks[j](x)
                for j in range(i * self.num_kernels, (i + 1) * self.num_kernels)
            )
            x = xs / self.num_kernels
|
|
|
|
|
x = F.leaky_relu(x) |
|
|
x = torch.tanh(self.conv_post(x)) |
|
|
return x |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
for l in self.ups: |
|
|
remove_weight_norm(l) |
|
|
for l in self.resblocks: |
|
|
l.remove_weight_norm() |
|
|
|
|
|
def __prepare_scriptable__(self): |
|
|
for l in self.ups: |
|
|
for hook in l._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(l) |
|
|
for l in self.resblocks: |
|
|
for hook in l._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(l) |
|
|
return self |
|
|
|
|
|
'''], |
|
|
"normalization" : ["normalization.py", ''' |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn import functional as F |
|
|
|
|
|
|
|
|
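# Channel-first LayerNorm: transposes (B, C, T) so channels sit last, applies |
# F.layer_norm over the channel dimension, then transposes back. |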
class LayerNorm(nn.Module): |
|
|
def __init__(self, channels, eps=1e-5): |
|
|
super().__init__() |
|
|
self.eps = eps |
|
|
self.gamma = nn.Parameter(torch.ones(channels)) |
|
|
self.beta = nn.Parameter(torch.zeros(channels)) |
|
|
|
|
|
def forward(self, x): |
|
|
x = x.transpose(1, -1) |
|
|
x = F.layer_norm(x, (x.size(-1),), self.gamma, self.beta, self.eps) |
|
|
return x.transpose(1, -1) |
|
|
'''], |
|
|
"modules" : ["modules.py", ''' |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn.utils.weight_norm import remove_weight_norm |
|
|
from torch.nn.utils.parametrizations import weight_norm |
|
|
|
|
|
from .commons import fused_add_tanh_sigmoid_multiply |
|
|
|
|
|
|
|
|
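# Non-causal WaveNet stack with gated tanh/sigmoid activations and fused |
# residual/skip 1x1 convolutions; the optional cond_layer injects the global |
# conditioning g into every layer. Used by PosteriorEncoder in encoders.py. |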
class WaveNet(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
hidden_channels, |
|
|
kernel_size, |
|
|
dilation_rate, |
|
|
n_layers, |
|
|
gin_channels=0, |
|
|
p_dropout=0, |
|
|
): |
|
|
super(WaveNet, self).__init__() |
|
|
assert kernel_size % 2 == 1 |
|
|
self.hidden_channels = hidden_channels |
|
|
self.kernel_size = (kernel_size,) |
|
|
self.dilation_rate = dilation_rate |
|
|
self.n_layers = n_layers |
|
|
self.gin_channels = gin_channels |
|
|
self.p_dropout = p_dropout |
|
|
|
|
|
self.in_layers = nn.ModuleList() |
|
|
self.res_skip_layers = nn.ModuleList() |
|
|
self.drop = nn.Dropout(p_dropout) |
|
|
|
|
|
if gin_channels != 0: |
|
|
cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) |
|
|
self.cond_layer = weight_norm(cond_layer, name="weight") |
|
|
|
|
|
dilations = [dilation_rate**i for i in range(n_layers)] |
|
|
paddings = [(kernel_size * d - d) // 2 for d in dilations] |
|
|
|
|
|
for i in range(n_layers): |
|
|
in_layer = nn.Conv1d( |
|
|
hidden_channels, |
|
|
2 * hidden_channels, |
|
|
kernel_size, |
|
|
dilation=dilations[i], |
|
|
padding=paddings[i], |
|
|
) |
|
|
in_layer = weight_norm(in_layer, name="weight") |
|
|
self.in_layers.append(in_layer) |
|
|
|
|
|
res_skip_channels = ( |
|
|
hidden_channels if i == n_layers - 1 else 2 * hidden_channels |
|
|
) |
|
|
|
|
|
res_skip_layer = nn.Conv1d(hidden_channels, res_skip_channels, 1) |
|
|
res_skip_layer = weight_norm(res_skip_layer, name="weight") |
|
|
self.res_skip_layers.append(res_skip_layer) |
|
|
|
|
|
def forward(self, x, x_mask, g=None, **kwargs): |
|
|
output = torch.zeros_like(x) |
|
|
n_channels_tensor = torch.IntTensor([self.hidden_channels]) |
|
|
|
|
|
if g is not None: |
|
|
g = self.cond_layer(g) |
|
|
|
|
|
for i in range(self.n_layers): |
|
|
x_in = self.in_layers[i](x) |
|
|
if g is not None: |
|
|
cond_offset = i * 2 * self.hidden_channels |
|
|
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] |
|
|
else: |
|
|
g_l = torch.zeros_like(x_in) |
|
|
|
|
|
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) |
|
|
|
|
|
acts = self.drop(acts) |
|
|
|
|
|
res_skip_acts = self.res_skip_layers[i](acts) |
|
|
if i < self.n_layers - 1: |
|
|
res_acts = res_skip_acts[:, : self.hidden_channels, :] |
|
|
x = (x + res_acts) * x_mask |
|
|
output = output + res_skip_acts[:, self.hidden_channels :, :] |
|
|
else: |
|
|
output = output + res_skip_acts |
|
|
return output * x_mask |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
if self.gin_channels != 0: |
|
|
remove_weight_norm(self.cond_layer) |
|
|
for l in self.in_layers: |
|
|
remove_weight_norm(l) |
|
|
for l in self.res_skip_layers: |
|
|
remove_weight_norm(l) |
|
|
|
|
|
'''], |
|
|
"generators" : ["generators.py", ''' |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn import functional as F |
|
|
from torch.nn.utils.weight_norm import remove_weight_norm |
|
|
from torch.nn.utils.parametrizations import weight_norm |
|
|
from typing import Optional |
|
|
|
|
|
from .commons import init_weights |
|
|
from .residuals import LRELU_SLOPE, ResBlock1, ResBlock2 |
|
|
|
|
|
|
|
|
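# Plain HiFi-GAN generator (no F0 source): alternating transposed-conv |
# upsampling and multi-receptive-field ResBlocks, flattened into a single |
# ModuleList so the forward loop stays TorchScript-friendly. |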
class Generator(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
initial_channel, |
|
|
resblock, |
|
|
resblock_kernel_sizes, |
|
|
resblock_dilation_sizes, |
|
|
upsample_rates, |
|
|
upsample_initial_channel, |
|
|
upsample_kernel_sizes, |
|
|
gin_channels=0, |
|
|
): |
|
|
super(Generator, self).__init__() |
|
|
self.num_kernels = len(resblock_kernel_sizes) |
|
|
self.num_upsamples = len(upsample_rates) |
|
|
self.conv_pre = nn.Conv1d( |
|
|
initial_channel, upsample_initial_channel, 7, 1, padding=3 |
|
|
) |
|
|
resblock = ResBlock1 if resblock == "1" else ResBlock2 |
|
|
|
|
|
self.ups_and_resblocks = nn.ModuleList() |
|
|
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): |
|
|
self.ups_and_resblocks.append( |
|
|
weight_norm( |
|
|
nn.ConvTranspose1d( |
|
|
upsample_initial_channel // (2**i), |
|
|
upsample_initial_channel // (2 ** (i + 1)), |
|
|
k, |
|
|
u, |
|
|
padding=(k - u) // 2, |
|
|
) |
|
|
) |
|
|
) |
|
|
ch = upsample_initial_channel // (2 ** (i + 1)) |
|
|
for j, (k, d) in enumerate( |
|
|
zip(resblock_kernel_sizes, resblock_dilation_sizes) |
|
|
): |
|
|
self.ups_and_resblocks.append(resblock(ch, k, d)) |
|
|
|
|
|
self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) |
|
|
self.ups_and_resblocks.apply(init_weights) |
|
|
|
|
|
if gin_channels != 0: |
|
|
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) |
|
|
|
|
|
def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): |
|
|
x = self.conv_pre(x) |
|
|
if g is not None: |
|
|
x = x + self.cond(g) |
|
|
|
|
|
resblock_idx = 0 |
|
|
for _ in range(self.num_upsamples): |
|
|
x = F.leaky_relu(x, LRELU_SLOPE) |
|
|
x = self.ups_and_resblocks[resblock_idx](x) |
|
|
resblock_idx += 1 |
|
|
xs = 0 |
|
|
for _ in range(self.num_kernels): |
|
|
xs += self.ups_and_resblocks[resblock_idx](x) |
|
|
resblock_idx += 1 |
|
|
x = xs / self.num_kernels |
|
|
|
|
|
x = F.leaky_relu(x) |
|
|
x = self.conv_post(x) |
|
|
x = torch.tanh(x) |
|
|
|
|
|
return x |
|
|
|
|
|
def __prepare_scriptable__(self): |
|
|
for l in self.ups_and_resblocks: |
|
|
for hook in l._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(l) |
|
|
return self |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
for l in self.ups_and_resblocks: |
|
|
remove_weight_norm(l) |
|
|
|
|
|
|
|
|
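# Generates harmonic sine waves from an F0 contour: phase is accumulated per |
# harmonic at frame rate, upsampled by `upp` to sample rate, and unvoiced |
# regions (f0 below voiced_threshold) are replaced with low-level noise. |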
class SineGen(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
samp_rate, |
|
|
harmonic_num=0, |
|
|
sine_amp=0.1, |
|
|
noise_std=0.003, |
|
|
voiced_threshold=0, |
|
|
flag_for_pulse=False, |
|
|
): |
|
|
super(SineGen, self).__init__() |
|
|
self.sine_amp = sine_amp |
|
|
self.noise_std = noise_std |
|
|
self.harmonic_num = harmonic_num |
|
|
self.dim = self.harmonic_num + 1 |
|
|
self.sample_rate = samp_rate |
|
|
self.voiced_threshold = voiced_threshold |
|
|
|
|
|
def _f02uv(self, f0): |
|
|
uv = torch.ones_like(f0) |
|
|
uv = uv * (f0 > self.voiced_threshold) |
|
|
return uv |
|
|
|
|
|
def forward(self, f0: torch.Tensor, upp: int): |
|
|
with torch.no_grad(): |
|
|
f0 = f0[:, None].transpose(1, 2) |
|
|
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) |
|
|
f0_buf[:, :, 0] = f0[:, :, 0] |
|
|
f0_buf[:, :, 1:] = ( |
|
|
f0_buf[:, :, 0:1] |
|
|
* torch.arange(2, self.harmonic_num + 2, device=f0.device)[None, None, :] |
|
|
) |
|
|
rad_values = (f0_buf / float(self.sample_rate)) % 1 |
|
|
rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device) |
|
|
rand_ini[:, 0] = 0 |
|
|
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini |
|
|
tmp_over_one = torch.cumsum(rad_values, 1) |
|
|
tmp_over_one *= upp |
|
|
tmp_over_one = F.interpolate( |
|
|
tmp_over_one.transpose(2, 1), |
|
|
scale_factor=float(upp), |
|
|
mode="linear", |
|
|
align_corners=True, |
|
|
).transpose(2, 1) |
|
|
rad_values = F.interpolate( |
|
|
rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" |
|
|
).transpose(2, 1) |
|
|
tmp_over_one %= 1 |
|
|
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 |
|
|
cumsum_shift = torch.zeros_like(rad_values) |
|
|
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 |
|
|
sine_waves = torch.sin( |
|
|
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi |
|
|
) |
|
|
sine_waves = sine_waves * self.sine_amp |
|
|
uv = self._f02uv(f0) |
|
|
uv = F.interpolate( |
|
|
uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" |
|
|
).transpose(2, 1) |
|
|
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 |
|
|
noise = noise_amp * torch.randn_like(sine_waves) |
|
|
sine_waves = sine_waves * uv + noise |
|
|
return sine_waves, uv, noise |
|
|
|
|
|
'''], |
|
|
"encoders" : ["encoders.py", ''' |
|
|
import math |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn.utils.weight_norm import remove_weight_norm |
|
|
from typing import Optional |
|
|
|
|
|
from .attentions import FFN, MultiHeadAttention |
|
|
from .commons import sequence_mask |
|
|
from .modules import WaveNet |
|
|
from .normalization import LayerNorm |
|
|
|
|
|
|
|
|
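# Transformer encoder with windowed relative-position attention |
# (MultiHeadAttention) and convolutional FFN blocks; x_mask keeps padded |
# frames from attending or being attended to. |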
class Encoder(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
hidden_channels, |
|
|
filter_channels, |
|
|
n_heads, |
|
|
n_layers, |
|
|
kernel_size=1, |
|
|
p_dropout=0.0, |
|
|
window_size=10, |
|
|
**kwargs |
|
|
): |
|
|
super().__init__() |
|
|
self.hidden_channels = hidden_channels |
|
|
self.filter_channels = filter_channels |
|
|
self.n_heads = n_heads |
|
|
self.n_layers = n_layers |
|
|
self.kernel_size = kernel_size |
|
|
self.p_dropout = p_dropout |
|
|
self.window_size = window_size |
|
|
|
|
|
self.drop = nn.Dropout(p_dropout) |
|
|
self.attn_layers = nn.ModuleList() |
|
|
self.norm_layers_1 = nn.ModuleList() |
|
|
self.ffn_layers = nn.ModuleList() |
|
|
self.norm_layers_2 = nn.ModuleList() |
|
|
for i in range(self.n_layers): |
|
|
self.attn_layers.append( |
|
|
MultiHeadAttention( |
|
|
hidden_channels, |
|
|
hidden_channels, |
|
|
n_heads, |
|
|
p_dropout=p_dropout, |
|
|
window_size=window_size, |
|
|
) |
|
|
) |
|
|
self.norm_layers_1.append(LayerNorm(hidden_channels)) |
|
|
self.ffn_layers.append( |
|
|
FFN( |
|
|
hidden_channels, |
|
|
hidden_channels, |
|
|
filter_channels, |
|
|
kernel_size, |
|
|
p_dropout=p_dropout, |
|
|
) |
|
|
) |
|
|
self.norm_layers_2.append(LayerNorm(hidden_channels)) |
|
|
|
|
|
def forward(self, x, x_mask): |
|
|
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) |
|
|
x = x * x_mask |
|
|
for i in range(self.n_layers): |
|
|
y = self.attn_layers[i](x, x, attn_mask) |
|
|
y = self.drop(y) |
|
|
x = self.norm_layers_1[i](x + y) |
|
|
|
|
|
y = self.ffn_layers[i](x, x_mask) |
|
|
y = self.drop(y) |
|
|
x = self.norm_layers_2[i](x + y) |
|
|
x = x * x_mask |
|
|
return x |
|
|
|
|
|
|
|
|
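# Encodes phone (content) features, optionally adding a coarse-pitch |
# embedding, then projects the encoder output to the mean / log-variance |
# of the prior distribution. |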
class TextEncoder(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
out_channels, |
|
|
hidden_channels, |
|
|
filter_channels, |
|
|
n_heads, |
|
|
n_layers, |
|
|
kernel_size, |
|
|
p_dropout, |
|
|
embedding_dim, |
|
|
f0=True, |
|
|
): |
|
|
super(TextEncoder, self).__init__() |
|
|
self.out_channels = out_channels |
|
|
self.hidden_channels = hidden_channels |
|
|
self.filter_channels = filter_channels |
|
|
self.n_heads = n_heads |
|
|
self.n_layers = n_layers |
|
|
self.kernel_size = kernel_size |
|
|
self.p_dropout = float(p_dropout) |
|
|
self.emb_phone = nn.Linear(embedding_dim, hidden_channels) |
|
|
self.lrelu = nn.LeakyReLU(0.1, inplace=True) |
|
|
if f0: |
|
|
self.emb_pitch = nn.Embedding(256, hidden_channels) |
|
|
self.encoder = Encoder( |
|
|
hidden_channels, |
|
|
filter_channels, |
|
|
n_heads, |
|
|
n_layers, |
|
|
kernel_size, |
|
|
float(p_dropout), |
|
|
) |
|
|
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) |
|
|
|
|
|
def forward( |
|
|
self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor |
|
|
): |
|
|
if pitch is None: |
|
|
x = self.emb_phone(phone) |
|
|
else: |
|
|
x = self.emb_phone(phone) + self.emb_pitch(pitch) |
|
|
x = x * math.sqrt(self.hidden_channels) |
|
|
x = self.lrelu(x) |
|
|
x = torch.transpose(x, 1, -1) |
|
|
x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype) |
|
|
x = self.encoder(x * x_mask, x_mask) |
|
|
stats = self.proj(x) * x_mask |
|
|
|
|
|
m, logs = torch.split(stats, self.out_channels, dim=1) |
|
|
return m, logs, x_mask |
|
|
|
|
|
|
|
|
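# VITS-style posterior encoder: a 1x1 pre-net, a WaveNet stack, and a |
# projection to (m, logs); returns a reparameterized sample z alongside them. |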
class PosteriorEncoder(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
in_channels, |
|
|
out_channels, |
|
|
hidden_channels, |
|
|
kernel_size, |
|
|
dilation_rate, |
|
|
n_layers, |
|
|
gin_channels=0, |
|
|
): |
|
|
super(PosteriorEncoder, self).__init__() |
|
|
self.in_channels = in_channels |
|
|
self.out_channels = out_channels |
|
|
self.hidden_channels = hidden_channels |
|
|
self.kernel_size = kernel_size |
|
|
self.dilation_rate = dilation_rate |
|
|
self.n_layers = n_layers |
|
|
self.gin_channels = gin_channels |
|
|
|
|
|
self.pre = nn.Conv1d(in_channels, hidden_channels, 1) |
|
|
self.enc = WaveNet( |
|
|
hidden_channels, |
|
|
kernel_size, |
|
|
dilation_rate, |
|
|
n_layers, |
|
|
gin_channels=gin_channels, |
|
|
) |
|
|
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) |
|
|
|
|
|
def forward( |
|
|
self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None |
|
|
): |
|
|
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) |
|
|
x = self.pre(x) * x_mask |
|
|
x = self.enc(x, x_mask, g=g) |
|
|
stats = self.proj(x) * x_mask |
|
|
m, logs = torch.split(stats, self.out_channels, dim=1) |
|
|
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask |
|
|
return z, m, logs, x_mask |
|
|
|
|
|
def remove_weight_norm(self): |
|
|
self.enc.remove_weight_norm() |
|
|
|
|
|
def __prepare_scriptable__(self): |
|
|
for hook in self.enc._forward_pre_hooks.values(): |
|
|
if ( |
|
|
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" |
|
|
and hook.__class__.__name__ == "_WeightNorm" |
|
|
): |
|
|
remove_weight_norm(self.enc) |
|
|
return self |
|
|
|
|
|
'''], |
|
|
"discriminators" : ["discriminators.py", ''' |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn import functional as F |
|
|
from torch.nn.utils.parametrizations import spectral_norm, weight_norm |
|
|
|
|
|
from .commons import get_padding |
|
|
from .residuals import LRELU_SLOPE |
|
|
|
|
|
|
|
|
PERIODS_V1 = [2, 3, 5, 7, 11, 17] |
|
|
PERIODS_V2 = [2, 3, 5, 7, 11, 17, 23, 37] |
|
|
IN_CHANNELS = [1, 32, 128, 512, 1024] |
|
|
OUT_CHANNELS = [32, 128, 512, 1024, 1024] |
|
|
|
|
|
|
|
|
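# Ensemble of one scale discriminator (DiscriminatorS) plus one period |
# discriminator per prime period; the V1 and V2 variants differ only in |
# their period lists. |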
class MultiPeriodDiscriminator(nn.Module): |
|
|
def __init__(self, use_spectral_norm=False): |
|
|
super(MultiPeriodDiscriminator, self).__init__() |
|
|
self.discriminators = nn.ModuleList( |
|
|
[DiscriminatorS(use_spectral_norm=use_spectral_norm)] |
|
|
+ [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in PERIODS_V1] |
|
|
) |
|
|
|
|
|
def forward(self, y, y_hat): |
|
|
y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] |
|
|
for d in self.discriminators: |
|
|
y_d_r, fmap_r = d(y) |
|
|
y_d_g, fmap_g = d(y_hat) |
|
|
y_d_rs.append(y_d_r) |
|
|
y_d_gs.append(y_d_g) |
|
|
fmap_rs.append(fmap_r) |
|
|
fmap_gs.append(fmap_g) |
|
|
|
|
|
return y_d_rs, y_d_gs, fmap_rs, fmap_gs |
|
|
|
|
|
|
|
|
class MultiPeriodDiscriminatorV2(nn.Module): |
|
|
def __init__(self, use_spectral_norm=False): |
|
|
super(MultiPeriodDiscriminatorV2, self).__init__() |
|
|
self.discriminators = nn.ModuleList( |
|
|
[DiscriminatorS(use_spectral_norm=use_spectral_norm)] |
|
|
+ [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in PERIODS_V2] |
|
|
) |
|
|
|
|
|
def forward(self, y, y_hat): |
|
|
y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] |
|
|
for d in self.discriminators: |
|
|
y_d_r, fmap_r = d(y) |
|
|
y_d_g, fmap_g = d(y_hat) |
|
|
y_d_rs.append(y_d_r) |
|
|
y_d_gs.append(y_d_g) |
|
|
fmap_rs.append(fmap_r) |
|
|
fmap_gs.append(fmap_g) |
|
|
|
|
|
return y_d_rs, y_d_gs, fmap_rs, fmap_gs |
|
|
|
|
|
|
|
|
class DiscriminatorS(nn.Module): |
|
|
def __init__(self, use_spectral_norm=False): |
|
|
super(DiscriminatorS, self).__init__() |
|
|
norm_f = spectral_norm if use_spectral_norm else weight_norm |
|
|
self.convs = nn.ModuleList( |
|
|
[ |
|
|
norm_f(nn.Conv1d(1, 16, 15, 1, padding=7)), |
|
|
norm_f(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), |
|
|
norm_f(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), |
|
|
norm_f(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), |
|
|
norm_f(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), |
|
|
norm_f(nn.Conv1d(1024, 1024, 5, 1, padding=2)), |
|
|
] |
|
|
) |
|
|
self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1)) |
|
|
self.lrelu = nn.LeakyReLU(LRELU_SLOPE) |
|
|
|
|
|
def forward(self, x): |
|
|
fmap = [] |
|
|
for conv in self.convs: |
|
|
x = self.lrelu(conv(x)) |
|
|
fmap.append(x) |
|
|
x = self.conv_post(x) |
|
|
fmap.append(x) |
|
|
x = torch.flatten(x, 1, -1) |
|
|
return x, fmap |
|
|
|
|
|
|
|
|
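# Period discriminator: pads the waveform to a multiple of `period`, folds it |
# into a 2D (time // period, period) grid and applies strided 2D convolutions, |
# returning logits plus per-layer feature maps for the feature-matching loss. |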
class DiscriminatorP(nn.Module): |
|
|
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): |
|
|
super(DiscriminatorP, self).__init__() |
|
|
self.period = period |
|
|
norm_f = spectral_norm if use_spectral_norm else weight_norm |
|
|
|
|
|
self.convs = nn.ModuleList( |
|
|
[ |
|
|
norm_f( |
|
|
nn.Conv2d( |
|
|
in_ch, |
|
|
out_ch, |
|
|
(kernel_size, 1), |
|
|
(stride, 1), |
|
|
padding=(get_padding(kernel_size, 1), 0), |
|
|
) |
|
|
) |
|
|
for in_ch, out_ch in zip(IN_CHANNELS, OUT_CHANNELS) |
|
|
] |
|
|
) |
|
|
|
|
|
self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) |
|
|
self.lrelu = nn.LeakyReLU(LRELU_SLOPE) |
|
|
|
|
|
def forward(self, x): |
|
|
fmap = [] |
|
|
b, c, t = x.shape |
|
|
if t % self.period != 0: |
|
|
n_pad = self.period - (t % self.period) |
|
|
x = F.pad(x, (0, n_pad), "reflect") |
|
|
x = x.view(b, c, -1, self.period) |
|
|
|
|
|
for conv in self.convs: |
|
|
x = self.lrelu(conv(x)) |
|
|
fmap.append(x) |
|
|
|
|
|
x = self.conv_post(x) |
|
|
fmap.append(x) |
|
|
x = torch.flatten(x, 1, -1) |
|
|
return x, fmap |
|
|
|
|
|
'''], |
|
|
"commons" : ["commons.py", ''' |
|
|
import math |
|
|
import torch |
|
|
from torch.nn import functional as F |
|
|
from typing import List, Optional |
|
|
|
|
|
|
|
|
def init_weights(m, mean=0.0, std=0.01): |
|
|
classname = m.__class__.__name__ |
|
|
if classname.find("Conv") != -1: |
|
|
m.weight.data.normal_(mean, std) |
|
|
|
|
|
|
|
|
def get_padding(kernel_size, dilation=1): |
|
|
return int((kernel_size * dilation - dilation) / 2) |
|
|
|
|
|
|
|
|
def convert_pad_shape(pad_shape): |
|
|
l = pad_shape[::-1] |
|
|
pad_shape = [item for sublist in l for item in sublist] |
|
|
return pad_shape |
|
|
|
|
|
|
|
|
def kl_divergence(m_p, logs_p, m_q, logs_q): |
|
|
kl = (logs_q - logs_p) - 0.5 |
|
|
kl += 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) |
|
|
return kl |
|
|
|
|
|
|
|
|
def slice_segments( |
|
|
x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2 |
|
|
): |
|
|
if dim == 2: |
|
|
ret = torch.zeros_like(x[:, :segment_size]) |
|
|
elif dim == 3: |
|
|
ret = torch.zeros_like(x[:, :, :segment_size]) |
|
|
|
|
|
for i in range(x.size(0)): |
|
|
idx_str = ids_str[i].item() |
|
|
idx_end = idx_str + segment_size |
|
|
if dim == 2: |
|
|
ret[i] = x[i, idx_str:idx_end] |
|
|
else: |
|
|
ret[i] = x[i, :, idx_str:idx_end] |
|
|
|
|
|
return ret |
|
|
|
|
|
|
|
|
def rand_slice_segments(x, x_lengths=None, segment_size=4): |
|
|
b, d, t = x.size() |
|
|
if x_lengths is None: |
|
|
x_lengths = t |
|
|
ids_str_max = x_lengths - segment_size + 1 |
|
|
ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) |
|
|
ret = slice_segments(x, ids_str, segment_size, dim=3) |
|
|
return ret, ids_str |
|
|
|
|
|
|
|
|
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): |
|
|
position = torch.arange(length, dtype=torch.float) |
|
|
num_timescales = channels // 2 |
|
|
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( |
|
|
num_timescales - 1 |
|
|
) |
|
|
inv_timescales = min_timescale * torch.exp( |
|
|
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment |
|
|
) |
|
|
scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) |
|
|
signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) |
|
|
signal = F.pad(signal, [0, 0, 0, channels % 2]) |
|
|
signal = signal.view(1, channels, length) |
|
|
return signal |
|
|
|
|
|
|
|
|
def subsequent_mask(length): |
|
|
mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) |
|
|
return mask |
|
|
|
|
|
|
|
|
@torch.jit.script |
|
|
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): |
|
|
n_channels_int = n_channels[0] |
|
|
in_act = input_a + input_b |
|
|
t_act = torch.tanh(in_act[:, :n_channels_int, :]) |
|
|
s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) |
|
|
acts = t_act * s_act |
|
|
return acts |
|
|
|
|
|
|
|
|
def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): |
|
|
if max_length is None: |
|
|
max_length = length.max() |
|
|
x = torch.arange(max_length, dtype=length.dtype, device=length.device) |
|
|
return x.unsqueeze(0) < length.unsqueeze(1) |
|
|
|
|
|
|
|
|
def clip_grad_value(parameters, clip_value, norm_type=2): |
|
|
if isinstance(parameters, torch.Tensor): |
|
|
parameters = [parameters] |
|
|
    parameters = list(filter(lambda p: p.grad is not None, parameters))  # builtin list; typing.List is not callable |
|
|
norm_type = float(norm_type) |
|
|
if clip_value is not None: |
|
|
clip_value = float(clip_value) |
|
|
|
|
|
total_norm = 0 |
|
|
for p in parameters: |
|
|
param_norm = p.grad.data.norm(norm_type) |
|
|
total_norm += param_norm.item() ** norm_type |
|
|
if clip_value is not None: |
|
|
p.grad.data.clamp_(min=-clip_value, max=clip_value) |
|
|
total_norm = total_norm ** (1.0 / norm_type) |
|
|
return total_norm |
|
|
|
|
|
'''], |
|
|
"attentions" : ["attentions.py", ''' |
|
|
import math |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn import functional as F |
|
|
|
|
|
from .commons import convert_pad_shape |
|
|
|
|
|
|
|
|
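# Multi-head attention with optional relative-position embeddings inside a |
# window (emb_rel_k / emb_rel_v), optional proximal bias, and optional |
# block-local masking; all three extras require self-attention (t_s == t_t). |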
class MultiHeadAttention(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
channels, |
|
|
out_channels, |
|
|
n_heads, |
|
|
p_dropout=0.0, |
|
|
window_size=None, |
|
|
heads_share=True, |
|
|
block_length=None, |
|
|
proximal_bias=False, |
|
|
proximal_init=False, |
|
|
): |
|
|
super().__init__() |
|
|
assert channels % n_heads == 0 |
|
|
|
|
|
self.channels = channels |
|
|
self.out_channels = out_channels |
|
|
self.n_heads = n_heads |
|
|
self.p_dropout = p_dropout |
|
|
self.window_size = window_size |
|
|
self.heads_share = heads_share |
|
|
self.block_length = block_length |
|
|
self.proximal_bias = proximal_bias |
|
|
self.proximal_init = proximal_init |
|
|
self.attn = None |
|
|
|
|
|
self.k_channels = channels // n_heads |
|
|
self.conv_q = nn.Conv1d(channels, channels, 1) |
|
|
self.conv_k = nn.Conv1d(channels, channels, 1) |
|
|
self.conv_v = nn.Conv1d(channels, channels, 1) |
|
|
self.conv_o = nn.Conv1d(channels, out_channels, 1) |
|
|
self.drop = nn.Dropout(p_dropout) |
|
|
|
|
|
if window_size is not None: |
|
|
n_heads_rel = 1 if heads_share else n_heads |
|
|
rel_stddev = self.k_channels**-0.5 |
|
|
self.emb_rel_k = nn.Parameter( |
|
|
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) |
|
|
* rel_stddev |
|
|
) |
|
|
self.emb_rel_v = nn.Parameter( |
|
|
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) |
|
|
* rel_stddev |
|
|
) |
|
|
|
|
|
nn.init.xavier_uniform_(self.conv_q.weight) |
|
|
nn.init.xavier_uniform_(self.conv_k.weight) |
|
|
nn.init.xavier_uniform_(self.conv_v.weight) |
|
|
if proximal_init: |
|
|
with torch.no_grad(): |
|
|
self.conv_k.weight.copy_(self.conv_q.weight) |
|
|
self.conv_k.bias.copy_(self.conv_q.bias) |
|
|
|
|
|
def forward(self, x, c, attn_mask=None): |
|
|
q = self.conv_q(x) |
|
|
k = self.conv_k(c) |
|
|
v = self.conv_v(c) |
|
|
|
|
|
x, self.attn = self.attention(q, k, v, mask=attn_mask) |
|
|
|
|
|
x = self.conv_o(x) |
|
|
return x |
|
|
|
|
|
def attention(self, query, key, value, mask=None): |
|
|
b, d, t_s, t_t = (*key.size(), query.size(2)) |
|
|
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) |
|
|
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) |
|
|
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) |
|
|
|
|
|
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) |
|
|
if self.window_size is not None: |
|
|
assert t_s == t_t, "Relative attention is only available for self-attention." |
|
|
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) |
|
|
rel_logits = self._matmul_with_relative_keys( |
|
|
query / math.sqrt(self.k_channels), key_relative_embeddings |
|
|
) |
|
|
scores_local = self._relative_position_to_absolute_position(rel_logits) |
|
|
scores = scores + scores_local |
|
|
if self.proximal_bias: |
|
|
assert t_s == t_t, "Proximal bias is only available for self-attention." |
|
|
scores = scores + self._attention_bias_proximal(t_s).to( |
|
|
device=scores.device, dtype=scores.dtype |
|
|
) |
|
|
if mask is not None: |
|
|
scores = scores.masked_fill(mask == 0, -1e4) |
|
|
if self.block_length is not None: |
|
|
assert t_s == t_t, "Local attention is only available for self-attention." |
|
|
block_mask = ( |
|
|
torch.ones_like(scores) |
|
|
.triu(-self.block_length) |
|
|
.tril(self.block_length) |
|
|
) |
|
|
scores = scores.masked_fill(block_mask == 0, -1e4) |
|
|
p_attn = F.softmax(scores, dim=-1) |
|
|
p_attn = self.drop(p_attn) |
|
|
output = torch.matmul(p_attn, value) |
|
|
if self.window_size is not None: |
|
|
relative_weights = self._absolute_position_to_relative_position(p_attn) |
|
|
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) |
|
|
output = output + self._matmul_with_relative_values( |
|
|
relative_weights, value_relative_embeddings |
|
|
) |
|
|
output = output.transpose(2, 3).contiguous().view(b, d, t_t) |
|
|
return output, p_attn |
|
|
|
|
|
def _matmul_with_relative_values(self, x, y): |
|
|
ret = torch.matmul(x, y.unsqueeze(0)) |
|
|
return ret |
|
|
|
|
|
def _matmul_with_relative_keys(self, x, y): |
|
|
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) |
|
|
return ret |
|
|
|
|
|
def _get_relative_embeddings(self, relative_embeddings, length): |
|
|
pad_length = max(length - (self.window_size + 1), 0) |
|
|
slice_start_position = max((self.window_size + 1) - length, 0) |
|
|
slice_end_position = slice_start_position + 2 * length - 1 |
|
|
if pad_length > 0: |
|
|
padded_relative_embeddings = F.pad( |
|
|
relative_embeddings, |
|
|
convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), |
|
|
) |
|
|
else: |
|
|
padded_relative_embeddings = relative_embeddings |
|
|
used_relative_embeddings = padded_relative_embeddings[ |
|
|
:, slice_start_position:slice_end_position |
|
|
] |
|
|
return used_relative_embeddings |
|
|
|
|
|
def _relative_position_to_absolute_position(self, x): |
|
|
batch, heads, length, _ = x.size() |
|
|
|
|
|
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) |
|
|
|
|
|
x_flat = x.view([batch, heads, length * 2 * length]) |
|
|
x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) |
|
|
|
|
|
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ |
|
|
:, :, :length, length - 1 : |
|
|
] |
|
|
return x_final |
|
|
|
|
|
def _absolute_position_to_relative_position(self, x): |
|
|
batch, heads, length, _ = x.size() |
|
|
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) |
|
|
x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) |
|
|
x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])) |
|
|
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] |
|
|
return x_final |
|
|
|
|
|
def _attention_bias_proximal(self, length): |
|
|
r = torch.arange(length, dtype=torch.float32) |
|
|
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) |
|
|
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) |
|
|
|
|
|
|
|
|
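# Position-wise feed-forward block built from two 1D convolutions with same |
# or causal padding; "gelu" here is the fast sigmoid-based approximation. |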
class FFN(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
in_channels, |
|
|
out_channels, |
|
|
filter_channels, |
|
|
kernel_size, |
|
|
p_dropout=0.0, |
|
|
activation=None, |
|
|
causal=False, |
|
|
): |
|
|
super().__init__() |
|
|
self.in_channels = in_channels |
|
|
self.out_channels = out_channels |
|
|
self.filter_channels = filter_channels |
|
|
self.kernel_size = kernel_size |
|
|
self.p_dropout = p_dropout |
|
|
self.activation = activation |
|
|
self.causal = causal |
|
|
|
|
|
if causal: |
|
|
self.padding = self._causal_padding |
|
|
else: |
|
|
self.padding = self._same_padding |
|
|
|
|
|
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) |
|
|
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) |
|
|
self.drop = nn.Dropout(p_dropout) |
|
|
|
|
|
def forward(self, x, x_mask): |
|
|
x = self.conv_1(self.padding(x * x_mask)) |
|
|
if self.activation == "gelu": |
|
|
x = x * torch.sigmoid(1.702 * x) |
|
|
else: |
|
|
x = torch.relu(x) |
|
|
x = self.drop(x) |
|
|
x = self.conv_2(self.padding(x * x_mask)) |
|
|
return x * x_mask |
|
|
|
|
|
def _causal_padding(self, x): |
|
|
if self.kernel_size == 1: |
|
|
return x |
|
|
pad_l = self.kernel_size - 1 |
|
|
pad_r = 0 |
|
|
padding = [[0, 0], [0, 0], [pad_l, pad_r]] |
|
|
x = F.pad(x, convert_pad_shape(padding)) |
|
|
return x |
|
|
|
|
|
def _same_padding(self, x): |
|
|
if self.kernel_size == 1: |
|
|
return x |
|
|
pad_l = (self.kernel_size - 1) // 2 |
|
|
pad_r = self.kernel_size // 2 |
|
|
padding = [[0, 0], [0, 0], [pad_l, pad_r]] |
|
|
x = F.pad(x, convert_pad_shape(padding)) |
|
|
return x |
|
|
|
|
|
'''], |
|
|
"init" : ["__init__.py", ''' |
|
|
'''] |
|
|
} |
|
|
|
|
|
# Write every embedded module in lib_algorithm out to vbach/lib/algorithm. |
for filename, source in lib_algorithm.values(): |
    with open(os.sep.join([current_dir, dirs[5], filename]), 'w') as f: |
        f.write(source) |
|
|
|
|
|
RMVPE = ''' |
|
|
import torch |
|
|
import numpy as np |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
from librosa.filters import mel |
|
|
from scipy.signal import get_window |
|
|
from librosa.util import pad_center, tiny, normalize |
|
|
|
|
|
|
|
|
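# Sum of squared, hop-shifted analysis windows, used by STFT.inverse to |
# normalize the overlap-add (mirrors librosa's window_sumsquare). |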
def window_sumsquare( |
|
|
window, |
|
|
n_frames, |
|
|
hop_length=200, |
|
|
win_length=800, |
|
|
n_fft=800, |
|
|
dtype=np.float32, |
|
|
norm=None, |
|
|
): |
|
|
if win_length is None: |
|
|
win_length = n_fft |
|
|
|
|
|
n = n_fft + hop_length * (n_frames - 1) |
|
|
x = np.zeros(n, dtype=dtype) |
|
|
|
|
|
win_sq = get_window(window, win_length, fftbins=True) |
|
|
win_sq = normalize(win_sq, norm=norm) ** 2 |
|
|
    win_sq = pad_center(win_sq, size=n_fft)  # keyword form required by newer librosa, matching the call in STFT below |
|
|
|
|
|
for i in range(n_frames): |
|
|
sample = i * hop_length |
|
|
x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] |
|
|
return x |
|
|
|
|
|
|
|
|
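# Conv1d-based STFT: the DFT basis is baked into fixed convolution weights, |
# so transform/inverse avoid torch.stft entirely (useful on backends where |
# torch.stft is unavailable). transform returns magnitude, optionally phase. |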
class STFT(nn.Module): |
|
|
def __init__( |
|
|
self, filter_length=1024, hop_length=512, win_length=None, window="hann" |
|
|
): |
|
|
super(STFT, self).__init__() |
|
|
self.filter_length = filter_length |
|
|
self.hop_length = hop_length |
|
|
self.win_length = win_length if win_length else filter_length |
|
|
self.window = window |
|
|
self.pad_amount = int(self.filter_length / 2) |
|
|
scale = self.filter_length / self.hop_length |
|
|
fourier_basis = np.fft.fft(np.eye(self.filter_length)) |
|
|
|
|
|
cutoff = int((self.filter_length / 2 + 1)) |
|
|
fourier_basis = np.vstack( |
|
|
[np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] |
|
|
) |
|
|
forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) |
|
|
inverse_basis = torch.FloatTensor( |
|
|
np.linalg.pinv(scale * fourier_basis).T[:, None, :] |
|
|
) |
|
|
|
|
|
assert filter_length >= self.win_length |
|
|
fft_window = get_window(window, self.win_length, fftbins=True) |
|
|
fft_window = pad_center(fft_window, size=filter_length) |
|
|
fft_window = torch.from_numpy(fft_window).float() |
|
|
|
|
|
forward_basis *= fft_window |
|
|
inverse_basis *= fft_window |
|
|
|
|
|
self.register_buffer("forward_basis", forward_basis.float()) |
|
|
self.register_buffer("inverse_basis", inverse_basis.float()) |
|
|
|
|
|
    def transform(self, input_data, return_phase=False): |
|
|
num_batches = input_data.shape[0] |
|
|
        num_samples = input_data.shape[-1] |
        self.num_samples = num_samples  # remembered so inverse() can trim its output |
|
|
|
|
|
input_data = input_data.view(num_batches, 1, num_samples) |
|
|
input_data = F.pad( |
|
|
input_data.unsqueeze(1), |
|
|
(self.pad_amount, self.pad_amount, 0, 0, 0, 0), |
|
|
mode="reflect", |
|
|
).squeeze(1) |
|
|
forward_transform = F.conv1d( |
|
|
input_data, self.forward_basis, stride=self.hop_length, padding=0 |
|
|
) |
|
|
|
|
|
cutoff = int((self.filter_length / 2) + 1) |
|
|
        real_part = forward_transform[:, :cutoff, :] |
        imag_part = forward_transform[:, cutoff:, :] |
        magnitude = torch.sqrt(real_part**2 + imag_part**2) |
        if return_phase: |
            phase = torch.atan2(imag_part.data, real_part.data) |
            return magnitude, phase |
        return magnitude |
|
|
|
|
|
def inverse(self, magnitude, phase): |
|
|
recombine_magnitude_phase = torch.cat( |
|
|
[magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 |
|
|
) |
|
|
inverse_transform = F.conv_transpose1d( |
|
|
recombine_magnitude_phase, |
|
|
self.inverse_basis, |
|
|
stride=self.hop_length, |
|
|
padding=0, |
|
|
) |
|
|
|
|
|
if self.window is not None: |
|
|
window_sum = window_sumsquare( |
|
|
self.window, |
|
|
magnitude.size(-1), |
|
|
hop_length=self.hop_length, |
|
|
win_length=self.win_length, |
|
|
n_fft=self.filter_length, |
|
|
dtype=np.float32, |
|
|
) |
|
|
approx_nonzero_indices = torch.from_numpy( |
|
|
np.where(window_sum > tiny(window_sum))[0] |
|
|
) |
|
|
window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) |
|
|
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ |
|
|
approx_nonzero_indices |
|
|
] |
|
|
inverse_transform *= float(self.filter_length) / self.hop_length |
|
|
|
|
|
inverse_transform = inverse_transform[..., self.pad_amount :] |
|
|
inverse_transform = inverse_transform[..., : self.num_samples] |
|
|
return inverse_transform.squeeze(1) |
|
|
|
|
|
def forward(self, input_data): |
|
|
        self.magnitude, self.phase = self.transform(input_data, return_phase=True) |
|
|
return self.inverse(self.magnitude, self.phase) |
|
|
|
|
|
|
|
|
class BiGRU(nn.Module): |
|
|
def __init__(self, input_features, hidden_features, num_layers): |
|
|
super(BiGRU, self).__init__() |
|
|
self.gru = nn.GRU( |
|
|
input_features, |
|
|
hidden_features, |
|
|
num_layers=num_layers, |
|
|
batch_first=True, |
|
|
bidirectional=True, |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
return self.gru(x)[0] |
|
|
|
|
|
|
|
|
class ConvBlockRes(nn.Module): |
|
|
def __init__(self, in_channels, out_channels, momentum=0.01): |
|
|
super(ConvBlockRes, self).__init__() |
|
|
self.conv = nn.Sequential( |
|
|
nn.Conv2d( |
|
|
in_channels=in_channels, |
|
|
out_channels=out_channels, |
|
|
kernel_size=(3, 3), |
|
|
stride=(1, 1), |
|
|
padding=(1, 1), |
|
|
bias=False, |
|
|
), |
|
|
nn.BatchNorm2d(out_channels, momentum=momentum), |
|
|
nn.ReLU(), |
|
|
nn.Conv2d( |
|
|
in_channels=out_channels, |
|
|
out_channels=out_channels, |
|
|
kernel_size=(3, 3), |
|
|
stride=(1, 1), |
|
|
padding=(1, 1), |
|
|
bias=False, |
|
|
), |
|
|
nn.BatchNorm2d(out_channels, momentum=momentum), |
|
|
nn.ReLU(), |
|
|
) |
|
|
self.shortcut = ( |
|
|
nn.Conv2d(in_channels, out_channels, (1, 1)) |
|
|
if in_channels != out_channels |
|
|
else None |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
out = self.conv(x) |
|
|
if self.shortcut is not None: |
|
|
x = self.shortcut(x) |
|
|
return out + x |
|
|
|
|
|
|
|
|
class ResEncoderBlock(nn.Module): |
|
|
def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01): |
|
|
super(ResEncoderBlock, self).__init__() |
|
|
self.conv = nn.ModuleList( |
|
|
[ |
|
|
ConvBlockRes( |
|
|
in_channels if i == 0 else out_channels, out_channels, momentum |
|
|
) |
|
|
for i in range(n_blocks) |
|
|
] |
|
|
) |
|
|
self.pool = ( |
|
|
nn.AvgPool2d(kernel_size=kernel_size) if kernel_size is not None else None |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
for conv in self.conv: |
|
|
x = conv(x) |
|
|
pooled = self.pool(x) if self.pool is not None else x |
|
|
return pooled, x |
|
|
|
|
|
|
|
|
class Encoder(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
in_channels, |
|
|
in_size, |
|
|
n_encoders, |
|
|
kernel_size, |
|
|
n_blocks, |
|
|
out_channels=16, |
|
|
momentum=0.01, |
|
|
): |
|
|
super(Encoder, self).__init__() |
|
|
self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) |
|
|
self.layers = nn.ModuleList() |
|
|
self.latent_channels = [] |
|
|
for _ in range(n_encoders): |
|
|
self.layers.append( |
|
|
ResEncoderBlock( |
|
|
in_channels, out_channels, kernel_size, n_blocks, momentum=momentum |
|
|
) |
|
|
) |
|
|
self.latent_channels.append([out_channels, in_size]) |
|
|
in_channels = out_channels |
|
|
out_channels *= 2 |
|
|
in_size //= 2 |
|
|
self.out_size = in_size |
|
|
self.out_channel = out_channels |
|
|
|
|
|
def forward(self, x): |
|
|
concat_tensors = [] |
|
|
x = self.bn(x) |
|
|
        for layer in self.layers: |
            x, skip = layer(x)  # each block returns (pooled output, pre-pool features) |
            concat_tensors.append(skip)  # pre-pool features feed the decoder skip connections |
|
|
return x, concat_tensors |
|
|
|
|
|
|
|
|
class Intermediate(nn.Module): |
|
|
def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): |
|
|
super(Intermediate, self).__init__() |
|
|
self.layers = nn.ModuleList( |
|
|
[ |
|
|
ResEncoderBlock( |
|
|
in_channels if i == 0 else out_channels, |
|
|
out_channels, |
|
|
None, |
|
|
n_blocks, |
|
|
momentum, |
|
|
) |
|
|
for i in range(n_inters) |
|
|
] |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
for layer in self.layers: |
|
|
_, x = layer(x) |
|
|
return x |
|
|
|
|
|
|
|
|
class ResDecoderBlock(nn.Module): |
|
|
def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): |
|
|
super(ResDecoderBlock, self).__init__() |
|
|
out_padding = (0, 1) if stride == (1, 2) else (1, 1) |
|
|
self.conv1 = nn.Sequential( |
|
|
nn.ConvTranspose2d( |
|
|
in_channels=in_channels, |
|
|
out_channels=out_channels, |
|
|
kernel_size=(3, 3), |
|
|
stride=stride, |
|
|
padding=(1, 1), |
|
|
output_padding=out_padding, |
|
|
bias=False, |
|
|
), |
|
|
nn.BatchNorm2d(out_channels, momentum=momentum), |
|
|
nn.ReLU(), |
|
|
) |
|
|
self.conv2 = nn.ModuleList( |
|
|
[ |
|
|
ConvBlockRes( |
|
|
out_channels * 2 if i == 0 else out_channels, out_channels, momentum |
|
|
) |
|
|
for i in range(n_blocks) |
|
|
] |
|
|
) |
|
|
|
|
|
def forward(self, x, concat_tensor): |
|
|
x = self.conv1(x) |
|
|
x = torch.cat((x, concat_tensor), dim=1) |
|
|
for conv in self.conv2: |
|
|
x = conv(x) |
|
|
return x |
|
|
|
|
|
|
|
|
class Decoder(nn.Module): |
|
|
def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): |
|
|
super(Decoder, self).__init__() |
|
|
self.layers = nn.ModuleList() |
|
|
for _ in range(n_decoders): |
|
|
out_channels = in_channels // 2 |
|
|
self.layers.append( |
|
|
ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) |
|
|
) |
|
|
in_channels = out_channels |
|
|
|
|
|
def forward(self, x, concat_tensors): |
|
|
for layer, concat_tensor in zip(self.layers, reversed(concat_tensors)): |
|
|
x = layer(x, concat_tensor) |
|
|
return x |
|
|
|
|
|
|
|
|
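# U-Net over mel spectrograms: Encoder halves the frequency axis per stage, |
# Intermediate refines at the bottleneck, and Decoder upsamples while |
# concatenating the matching pre-pool encoder features. |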
class DeepUnet(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
kernel_size, |
|
|
n_blocks, |
|
|
en_de_layers=5, |
|
|
inter_layers=4, |
|
|
in_channels=1, |
|
|
en_out_channels=16, |
|
|
): |
|
|
super(DeepUnet, self).__init__() |
|
|
self.encoder = Encoder( |
|
|
in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels |
|
|
) |
|
|
self.intermediate = Intermediate( |
|
|
self.encoder.out_channel // 2, |
|
|
self.encoder.out_channel, |
|
|
inter_layers, |
|
|
n_blocks, |
|
|
) |
|
|
self.decoder = Decoder( |
|
|
self.encoder.out_channel, en_de_layers, kernel_size, n_blocks |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
x, concat_tensors = self.encoder(x) |
|
|
x = self.intermediate(x) |
|
|
return self.decoder(x, concat_tensors) |
|
|
|
|
|
|
|
|
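# Full RMVPE pitch model: DeepUnet features -> 3-channel conv -> BiGRU -> |
# a 360-way sigmoid salience map over 20-cent pitch bins. |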
class E2E(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
n_blocks, |
|
|
n_gru, |
|
|
kernel_size, |
|
|
en_de_layers=5, |
|
|
inter_layers=4, |
|
|
in_channels=1, |
|
|
en_out_channels=16, |
|
|
): |
|
|
super(E2E, self).__init__() |
|
|
self.unet = DeepUnet( |
|
|
kernel_size, |
|
|
n_blocks, |
|
|
en_de_layers, |
|
|
inter_layers, |
|
|
in_channels, |
|
|
en_out_channels, |
|
|
) |
|
|
self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) |
|
|
if n_gru: |
|
|
self.fc = nn.Sequential( |
|
|
BiGRU(3 * 128, 256, n_gru), |
|
|
nn.Linear(512, 360), |
|
|
nn.Dropout(0.25), |
|
|
nn.Sigmoid(), |
|
|
) |
|
|
else: |
|
|
self.fc = nn.Sequential( |
|
|
                nn.Linear(3 * 128, 360), nn.Dropout(0.25), nn.Sigmoid()  # torch.nn has no N_MELS/N_CLASS; 128 mel bins, 360 pitch classes |
|
|
) |
|
|
|
|
|
def forward(self, mel): |
|
|
mel = mel.transpose(-1, -2).unsqueeze(1) |
|
|
x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) |
|
|
return self.fc(x) |
|
|
|
|
|
|
|
|
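# Log-mel front end built on the conv-based STFT above; keyshift rescales the |
# FFT size to emulate pitch shifting, and outputs are clamped before the log. |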
class MelSpectrogram(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
is_half, |
|
|
n_mel_channels, |
|
|
sample_rate, |
|
|
win_length, |
|
|
hop_length, |
|
|
n_fft=None, |
|
|
mel_fmin=0, |
|
|
mel_fmax=None, |
|
|
clamp=1e-5, |
|
|
): |
|
|
super(MelSpectrogram, self).__init__() |
|
|
n_fft = win_length if n_fft is None else n_fft |
|
|
self.hann_window = {} |
|
|
mel_basis = mel( |
|
|
sr=sample_rate, |
|
|
n_fft=n_fft, |
|
|
n_mels=n_mel_channels, |
|
|
fmin=mel_fmin, |
|
|
fmax=mel_fmax, |
|
|
htk=True, |
|
|
) |
|
|
self.register_buffer("mel_basis", torch.from_numpy(mel_basis).float()) |
|
|
self.n_fft = n_fft |
|
|
self.hop_length = hop_length |
|
|
self.win_length = win_length |
|
|
self.sample_rate = sample_rate |
|
|
self.n_mel_channels = n_mel_channels |
|
|
self.clamp = clamp |
|
|
self.is_half = is_half |
|
|
|
|
|
def forward(self, audio, keyshift=0, speed=1, center=True): |
|
|
factor = 2 ** (keyshift / 12) |
|
|
n_fft_new = int(np.round(self.n_fft * factor)) |
|
|
win_length_new = int(np.round(self.win_length * factor)) |
|
|
hop_length_new = int(np.round(self.hop_length * speed)) |
|
|
keyshift_key = f"{keyshift}_{audio.device}" |
|
|
if keyshift_key not in self.hann_window: |
|
|
self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( |
|
|
audio.device |
|
|
) |
|
|
if not hasattr(self, "stft"): |
|
|
self.stft = STFT( |
|
|
filter_length=n_fft_new, |
|
|
hop_length=hop_length_new, |
|
|
win_length=win_length_new, |
|
|
window="hann", |
|
|
).to(audio.device) |
|
|
magnitude = self.stft.transform(audio) |
|
|
if keyshift != 0: |
|
|
size = self.n_fft // 2 + 1 |
|
|
resize = magnitude.size(1) |
|
|
if resize < size: |
|
|
magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) |
|
|
magnitude = magnitude[:, :size, :] * self.win_length / win_length_new |
|
|
mel_output = torch.matmul(self.mel_basis, magnitude) |
|
|
if self.is_half: |
|
|
mel_output = mel_output.half() |
|
|
return torch.log(torch.clamp(mel_output, min=self.clamp)) |
|
|
|
|
|
|
|
|
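# Inference wrapper: loads the RMVPE checkpoint, extracts 16 kHz log-mels, |
# pads frames to a multiple of 32 for the U-Net, and decodes the salience |
# map to F0 via a local weighted average around the argmax bin. |
# Typical use (a sketch; the path is illustrative): |
#     predictor = RMVPE0Predictor("vbach/models/predictors/rmvpe.pt", is_half=False) |
#     f0 = predictor.infer_from_audio(audio_16k, thred=0.03) |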
class RMVPE0Predictor: |
|
|
def __init__(self, model_path, is_half, device=None): |
|
|
self.resample_kernel = {} |
|
|
self.is_half = is_half |
|
|
if device is None: |
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
self.device = device |
|
|
self.mel_extractor = MelSpectrogram( |
|
|
is_half, 128, 16000, 1024, 160, None, 30, 8000 |
|
|
).to(device) |
|
|
model = E2E(4, 1, (2, 2)) |
|
|
ckpt = torch.load(model_path, map_location="cpu", weights_only=True) |
|
|
model.load_state_dict(ckpt) |
|
|
model.eval() |
|
|
if is_half: |
|
|
model = model.half() |
|
|
self.model = model.to(device) |
|
|
self.cents_mapping = np.pad(20 * np.arange(360) + 1997.3794084376191, (4, 4)) |
|
|
|
|
|
def mel2hidden(self, mel): |
|
|
with torch.no_grad(): |
|
|
n_frames = mel.shape[-1] |
|
|
mel = mel.float() |
|
|
padding = min(32 * ((n_frames - 1) // 32 + 1) - n_frames, n_frames) |
|
|
mel = F.pad(mel, (0, padding), mode="reflect") |
|
|
if self.is_half: |
|
|
mel = mel.half() |
|
|
hidden = self.model(mel) |
|
|
return hidden[:, :n_frames] |
|
|
|
|
|
def decode(self, hidden, thred=0.03): |
|
|
cents_pred = self.to_local_average_cents(hidden, thred=thred) |
|
|
f0 = 10 * (2 ** (cents_pred / 1200)) |
|
|
f0[f0 == 10] = 0 |
|
|
return f0 |
|
|
|
|
|
def infer_from_audio(self, audio, thred=0.03): |
|
|
audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) |
|
|
mel = self.mel_extractor(audio, center=True) |
|
|
hidden = self.mel2hidden(mel) |
|
|
hidden = hidden.squeeze(0).cpu().numpy() |
|
|
if self.is_half: |
|
|
hidden = hidden.astype("float32") |
|
|
return self.decode(hidden, thred=thred) |
|
|
|
|
|
def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100): |
|
|
audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) |
|
|
mel = self.mel_extractor(audio, center=True) |
|
|
hidden = self.mel2hidden(mel) |
|
|
hidden = hidden.squeeze(0).cpu().numpy() |
|
|
if self.is_half: |
|
|
hidden = hidden.astype("float32") |
|
|
f0 = self.decode(hidden, thred=thred) |
|
|
f0[(f0 < f0_min) | (f0 > f0_max)] = 0 |
|
|
return f0 |
|
|
|
|
|
def to_local_average_cents(self, salience, thred=0.05): |
|
|
center = np.argmax(salience, axis=1) |
|
|
salience = np.pad(salience, ((0, 0), (4, 4))) |
|
|
center += 4 |
|
|
todo_salience = [] |
|
|
todo_cents_mapping = [] |
|
|
starts = center - 4 |
|
|
ends = center + 5 |
|
|
for idx in range(salience.shape[0]): |
|
|
todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) |
|
|
todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) |
|
|
todo_salience = np.array(todo_salience) |
|
|
todo_cents_mapping = np.array(todo_cents_mapping) |
|
|
product_sum = np.sum(todo_salience * todo_cents_mapping, 1) |
|
|
weight_sum = np.sum(todo_salience, 1) |
|
|
divided = product_sum / weight_sum |
|
|
maxx = np.max(salience, axis=1) |
|
|
divided[maxx <= thred] = 0 |
|
|
return divided |
|
|
|
|
|
''' |
|
|
with open(os.sep.join([current_dir, dirs[6], "RMVPE.py"]), 'w') as f: |
|
|
f.write(RMVPE) |
|
|
|
|
|
FCPE = ''' |
|
|
from typing import Union |
|
|
|
|
|
import torch.nn.functional as F |
|
|
import numpy as np |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
from torch.nn.utils.parametrizations import weight_norm |
|
|
from torchaudio.transforms import Resample |
|
|
import os |
|
|
import librosa |
|
|
import soundfile as sf |
|
|
import torch.utils.data |
|
|
from librosa.filters import mel as librosa_mel_fn |
|
|
import math |
|
|
from functools import partial |
|
|
|
|
|
from einops import rearrange, repeat |
|
|
from local_attention import LocalAttention |
|
|
|
|
|
os.environ["LRU_CACHE_CAPACITY"] = "3" |
|
|
|
|
|
|
|
|
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): |
|
|
try: |
|
|
data, sample_rate = sf.read(full_path, always_2d=True) |
|
|
except Exception as error: |
|
|
print(f"An error occurred loading {full_path}: {error}") |
|
|
if return_empty_on_exception: |
|
|
            return [], target_sr or 48000  # sample_rate is unbound when sf.read itself raised |
|
|
else: |
|
|
raise |
|
|
|
|
|
data = data[:, 0] if len(data.shape) > 1 else data |
|
|
assert len(data) > 2 |
|
|
|
|
|
max_mag = ( |
|
|
-np.iinfo(data.dtype).min |
|
|
if np.issubdtype(data.dtype, np.integer) |
|
|
else max(np.amax(data), -np.amin(data)) |
|
|
) |
|
|
max_mag = ( |
|
|
(2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0) |
|
|
) |
|
|
data = torch.FloatTensor(data.astype(np.float32)) / max_mag |
|
|
|
|
|
if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception: |
|
|
return [], sample_rate or target_sr or 48000 |
|
|
if target_sr is not None and sample_rate != target_sr: |
|
|
data = torch.from_numpy( |
|
|
librosa.core.resample(data.numpy(), orig_sr=sample_rate, target_sr=target_sr) |
|
|
) |
|
|
sample_rate = target_sr |
|
|
|
|
|
return data, sample_rate |
|
|
|
|
|
|
|
|
def dynamic_range_compression(x, C=1, clip_val=1e-5): |
|
|
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) |
|
|
|
|
|
|
|
|
def dynamic_range_decompression(x, C=1): |
|
|
return np.exp(x) / C |
|
|
|
|
|
|
|
|
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): |
|
|
return torch.log(torch.clamp(x, min=clip_val) * C) |
|
|
|
|
|
|
|
|
def dynamic_range_decompression_torch(x, C=1): |
|
|
return torch.exp(x) / C |
|
|
|
|
|
|
|
|
class STFT: |
|
|
def __init__( |
|
|
self, |
|
|
sr=22050, |
|
|
n_mels=80, |
|
|
n_fft=1024, |
|
|
win_size=1024, |
|
|
hop_length=256, |
|
|
fmin=20, |
|
|
fmax=11025, |
|
|
clip_val=1e-5, |
|
|
): |
|
|
self.target_sr = sr |
|
|
self.n_mels = n_mels |
|
|
self.n_fft = n_fft |
|
|
self.win_size = win_size |
|
|
self.hop_length = hop_length |
|
|
self.fmin = fmin |
|
|
self.fmax = fmax |
|
|
self.clip_val = clip_val |
|
|
self.mel_basis = {} |
|
|
self.hann_window = {} |
|
|
|
|
|
def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): |
|
|
sample_rate = self.target_sr |
|
|
n_mels = self.n_mels |
|
|
n_fft = self.n_fft |
|
|
win_size = self.win_size |
|
|
hop_length = self.hop_length |
|
|
fmin = self.fmin |
|
|
fmax = self.fmax |
|
|
clip_val = self.clip_val |
|
|
|
|
|
factor = 2 ** (keyshift / 12) |
|
|
n_fft_new = int(np.round(n_fft * factor)) |
|
|
win_size_new = int(np.round(win_size * factor)) |
|
|
hop_length_new = int(np.round(hop_length * speed)) |
|
|
|
|
|
mel_basis = self.mel_basis if not train else {} |
|
|
hann_window = self.hann_window if not train else {} |
|
|
|
|
|
mel_basis_key = str(fmax) + "_" + str(y.device) |
|
|
if mel_basis_key not in mel_basis: |
|
|
mel = librosa_mel_fn( |
|
|
sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax |
|
|
) |
|
|
mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) |
|
|
|
|
|
keyshift_key = str(keyshift) + "_" + str(y.device) |
|
|
if keyshift_key not in hann_window: |
|
|
hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) |
|
|
|
|
|
pad_left = (win_size_new - hop_length_new) // 2 |
|
|
pad_right = max( |
|
|
(win_size_new - hop_length_new + 1) // 2, |
|
|
win_size_new - y.size(-1) - pad_left, |
|
|
) |
|
|
mode = "reflect" if pad_right < y.size(-1) else "constant" |
|
|
y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode) |
|
|
y = y.squeeze(1) |
|
|
|
|
|
spec = torch.stft( |
|
|
y, |
|
|
n_fft_new, |
|
|
hop_length=hop_length_new, |
|
|
win_length=win_size_new, |
|
|
window=hann_window[keyshift_key], |
|
|
center=center, |
|
|
pad_mode="reflect", |
|
|
normalized=False, |
|
|
onesided=True, |
|
|
return_complex=True, |
|
|
) |
|
|
spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) |
|
|
|
|
|
if keyshift != 0: |
|
|
size = n_fft // 2 + 1 |
|
|
resize = spec.size(1) |
|
|
spec = ( |
|
|
F.pad(spec, (0, 0, 0, size - resize)) |
|
|
if resize < size |
|
|
else spec[:, :size, :] |
|
|
) |
|
|
spec = spec * win_size / win_size_new |
|
|
spec = torch.matmul(mel_basis[mel_basis_key], spec) |
|
|
spec = dynamic_range_compression_torch(spec, clip_val=clip_val) |
|
|
return spec |
|
|
|
|
|
def __call__(self, audiopath): |
|
|
audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) |
|
|
spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) |
|
|
return spect |
|
|
|
|
|
|
|
|
stft = STFT() |
|
|
|
|
|
|
|
|
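# Performer-style FAVOR+ kernel: projects queries/keys through a random |
# orthogonal matrix to obtain positive features whose dot products |
# approximate softmax attention. |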
def softmax_kernel( |
|
|
data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None |
|
|
): |
|
|
b, h, *_ = data.shape |
|
|
|
|
|
data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 |
|
|
|
|
|
ratio = projection_matrix.shape[0] ** -0.5 |
|
|
projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h) |
|
|
projection = projection.type_as(data) |
|
|
data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection) |
|
|
|
|
|
diag_data = data**2 |
|
|
diag_data = torch.sum(diag_data, dim=-1) |
|
|
diag_data = (diag_data / 2.0) * (data_normalizer**2) |
|
|
diag_data = diag_data.unsqueeze(dim=-1) |
|
|
|
|
|
if is_query: |
|
|
data_dash = ratio * ( |
|
|
torch.exp( |
|
|
data_dash - diag_data - torch.max(data_dash, dim=-1, keepdim=True).values |
|
|
) |
|
|
+ eps |
|
|
) |
|
|
else: |
|
|
data_dash = ratio * (torch.exp(data_dash - diag_data + eps)) |
|
|
|
|
|
return data_dash.type_as(data) |
|
|
|
|
|
|
|
|
def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None): |
|
|
unstructured_block = torch.randn((cols, cols), device=device) |
|
|
q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced") |
|
|
q, r = map(lambda t: t.to(device), (q, r)) |
|
|
|
|
|
if qr_uniform_q: |
|
|
d = torch.diag(r, 0) |
|
|
q *= d.sign() |
|
|
return q.t() |
|
|
|
|
|
|
|
|
def exists(val): |
|
|
return val is not None |
|
|
|
|
|
|
|
|
def empty(tensor): |
|
|
return tensor.numel() == 0 |
|
|
|
|
|
|
|
|
def default(val, d): |
|
|
return val if exists(val) else d |
|
|
|
|
|
|
|
|
def cast_tuple(val): |
|
|
return (val,) if not isinstance(val, tuple) else val |
|
|
|
|
|
|
|
|
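# PCmer: a stack of conformer-style encoder layers (SelfAttention plus |
# ConformerConvModule) serving as the FCPE feature backbone. |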
class PCmer(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
num_layers, |
|
|
num_heads, |
|
|
dim_model, |
|
|
dim_keys, |
|
|
dim_values, |
|
|
residual_dropout, |
|
|
attention_dropout, |
|
|
): |
|
|
super().__init__() |
|
|
self.num_layers = num_layers |
|
|
self.num_heads = num_heads |
|
|
self.dim_model = dim_model |
|
|
self.dim_values = dim_values |
|
|
self.dim_keys = dim_keys |
|
|
self.residual_dropout = residual_dropout |
|
|
self.attention_dropout = attention_dropout |
|
|
|
|
|
self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)]) |
|
|
|
|
|
def forward(self, phone, mask=None): |
|
|
for layer in self._layers: |
|
|
phone = layer(phone, mask) |
|
|
return phone |
|
|
|
|
|
|
|
|
class _EncoderLayer(nn.Module): |
|
|
def __init__(self, parent: PCmer): |
|
|
super().__init__() |
|
|
self.conformer = ConformerConvModule(parent.dim_model) |
|
|
self.norm = nn.LayerNorm(parent.dim_model) |
|
|
self.dropout = nn.Dropout(parent.residual_dropout) |
|
|
self.attn = SelfAttention( |
|
|
dim=parent.dim_model, heads=parent.num_heads, causal=False |
|
|
) |
|
|
|
|
|
def forward(self, phone, mask=None): |
|
|
phone = phone + (self.attn(self.norm(phone), mask=mask)) |
|
|
phone = phone + (self.conformer(phone)) |
|
|
return phone |
|
|
|
|
|
|
|
|
def calc_same_padding(kernel_size): |
|
|
pad = kernel_size // 2 |
|
|
return (pad, pad - (kernel_size + 1) % 2) |
|
|
|
|
|
|
|
|
class Swish(nn.Module): |
|
|
def forward(self, x): |
|
|
return x * x.sigmoid() |
|
|
|
|
|
|
|
|
class Transpose(nn.Module): |
|
|
def __init__(self, dims): |
|
|
super().__init__() |
|
|
assert len(dims) == 2, "dims must be a tuple of two dimensions" |
|
|
self.dims = dims |
|
|
|
|
|
def forward(self, x): |
|
|
return x.transpose(*self.dims) |
|
|
|
|
|
|
|
|
class GLU(nn.Module): |
|
|
def __init__(self, dim): |
|
|
super().__init__() |
|
|
self.dim = dim |
|
|
|
|
|
def forward(self, x): |
|
|
out, gate = x.chunk(2, dim=self.dim) |
|
|
return out * gate.sigmoid() |
|
|
|
|
|
|
|
|
class DepthWiseConv1d(nn.Module): |
|
|
def __init__(self, chan_in, chan_out, kernel_size, padding): |
|
|
super().__init__() |
|
|
self.padding = padding |
|
|
self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in) |
|
|
|
|
|
def forward(self, x): |
|
|
x = F.pad(x, self.padding) |
|
|
return self.conv(x) |
|
|
|
|
|
|
|
|
class ConformerConvModule(nn.Module): |
|
|
def __init__( |
|
|
self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0 |
|
|
): |
|
|
super().__init__() |
|
|
|
|
|
inner_dim = dim * expansion_factor |
|
|
padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) |
|
|
|
|
|
self.net = nn.Sequential( |
|
|
nn.LayerNorm(dim), |
|
|
Transpose((1, 2)), |
|
|
nn.Conv1d(dim, inner_dim * 2, 1), |
|
|
GLU(dim=1), |
|
|
DepthWiseConv1d( |
|
|
inner_dim, inner_dim, kernel_size=kernel_size, padding=padding |
|
|
), |
|
|
Swish(), |
|
|
nn.Conv1d(inner_dim, dim, 1), |
|
|
Transpose((1, 2)), |
|
|
nn.Dropout(dropout), |
|
|
) |
|
|
|
|
|
def forward(self, x): |
|
|
return self.net(x) |
|
|
|
|
|
|
|
|
def linear_attention(q, k, v): |
|
|
if v is None: |
|
|
out = torch.einsum("...ed,...nd->...ne", k, q) |
|
|
return out |
|
|
else: |
|
|
k_cumsum = k.sum(dim=-2) |
|
|
D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8) |
|
|
context = torch.einsum("...nd,...ne->...de", k, v) |
|
|
out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv) |
|
|
return out |
|
|
|
|
|
|
|
|
def gaussian_orthogonal_random_matrix( |
|
|
nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None |
|
|
): |
|
|
nb_full_blocks = int(nb_rows / nb_columns) |
|
|
block_list = [] |
|
|
|
|
|
for _ in range(nb_full_blocks): |
|
|
q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device) |
|
|
block_list.append(q) |
|
|
|
|
|
remaining_rows = nb_rows - nb_full_blocks * nb_columns |
|
|
if remaining_rows > 0: |
|
|
q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device) |
|
|
block_list.append(q[:remaining_rows]) |
|
|
|
|
|
final_matrix = torch.cat(block_list) |
|
|
|
|
|
if scaling == 0: |
|
|
multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1) |
|
|
elif scaling == 1: |
|
|
multiplier = math.sqrt((float(nb_columns))) * torch.ones( |
|
|
(nb_rows,), device=device |
|
|
) |
|
|
else: |
|
|
raise ValueError(f"Invalid scaling {scaling}") |
|
|
|
|
|
return torch.diag(multiplier) @ final_matrix |
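

# Illustrative sanity check: rows within each d x d block are mutually
# orthogonal, e.g.
#   m = gaussian_orthogonal_random_matrix(8, 4)
#   m[:4] @ m[:4].t()  # approximately diagonal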
|
|
|
|
|
|
|
|
class FastAttention(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
dim_heads, |
|
|
nb_features=None, |
|
|
ortho_scaling=0, |
|
|
causal=False, |
|
|
generalized_attention=False, |
|
|
kernel_fn=nn.ReLU(), |
|
|
qr_uniform_q=False, |
|
|
no_projection=False, |
|
|
): |
|
|
super().__init__() |
|
|
nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) |
|
|
|
|
|
self.dim_heads = dim_heads |
|
|
self.nb_features = nb_features |
|
|
self.ortho_scaling = ortho_scaling |
|
|
|
|
|
self.create_projection = partial( |
|
|
gaussian_orthogonal_random_matrix, |
|
|
nb_rows=self.nb_features, |
|
|
nb_columns=dim_heads, |
|
|
scaling=ortho_scaling, |
|
|
qr_uniform_q=qr_uniform_q, |
|
|
) |
|
|
projection_matrix = self.create_projection() |
|
|
self.register_buffer("projection_matrix", projection_matrix) |
|
|
|
|
|
self.generalized_attention = generalized_attention |
|
|
self.kernel_fn = kernel_fn |
|
|
self.no_projection = no_projection |
|
|
self.causal = causal |
|
|
|
|
|
@torch.no_grad() |
|
|
def redraw_projection_matrix(self): |
|
|
projections = self.create_projection() |
|
|
self.projection_matrix.copy_(projections) |
|
|
del projections |
|
|
|
|
|
def forward(self, q, k, v): |
|
|
device = q.device |
|
|
|
|
|
if self.no_projection: |
|
|
q = q.softmax(dim=-1) |
|
|
k = torch.exp(k) if self.causal else k.softmax(dim=-2) |
|
|
else: |
|
|
create_kernel = partial( |
|
|
softmax_kernel, projection_matrix=self.projection_matrix, device=device |
|
|
) |
|
|
q = create_kernel(q, is_query=True) |
|
|
k = create_kernel(k, is_query=False) |
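
        # NOTE: the causal branch below references self.causal_linear_fn, which
        # is never defined in this file, so only causal=False works as written.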
|
|
|
|
|
attn_fn = linear_attention if not self.causal else self.causal_linear_fn |
|
|
|
|
|
        return attn_fn(q, k, v)
|
|
|
|
|
|
|
|
class SelfAttention(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
dim, |
|
|
causal=False, |
|
|
heads=8, |
|
|
dim_head=64, |
|
|
local_heads=0, |
|
|
local_window_size=256, |
|
|
nb_features=None, |
|
|
feature_redraw_interval=1000, |
|
|
generalized_attention=False, |
|
|
kernel_fn=nn.ReLU(), |
|
|
qr_uniform_q=False, |
|
|
dropout=0.0, |
|
|
no_projection=False, |
|
|
): |
|
|
super().__init__() |
|
|
assert dim % heads == 0, "dimension must be divisible by number of heads" |
|
|
dim_head = default(dim_head, dim // heads) |
|
|
inner_dim = dim_head * heads |
|
|
self.fast_attention = FastAttention( |
|
|
dim_head, |
|
|
nb_features, |
|
|
causal=causal, |
|
|
generalized_attention=generalized_attention, |
|
|
kernel_fn=kernel_fn, |
|
|
qr_uniform_q=qr_uniform_q, |
|
|
no_projection=no_projection, |
|
|
) |
|
|
|
|
|
self.heads = heads |
|
|
self.global_heads = heads - local_heads |
|
|
self.local_attn = ( |
|
|
LocalAttention( |
|
|
window_size=local_window_size, |
|
|
causal=causal, |
|
|
autopad=True, |
|
|
dropout=dropout, |
|
|
look_forward=int(not causal), |
|
|
rel_pos_emb_config=(dim_head, local_heads), |
|
|
) |
|
|
if local_heads > 0 |
|
|
else None |
|
|
) |
|
|
|
|
|
self.to_q = nn.Linear(dim, inner_dim) |
|
|
self.to_k = nn.Linear(dim, inner_dim) |
|
|
self.to_v = nn.Linear(dim, inner_dim) |
|
|
self.to_out = nn.Linear(inner_dim, dim) |
|
|
self.dropout = nn.Dropout(dropout) |
|
|
|
|
|
@torch.no_grad() |
|
|
def redraw_projection_matrix(self): |
|
|
self.fast_attention.redraw_projection_matrix() |
|
|
|
|
|
def forward( |
|
|
self, |
|
|
x, |
|
|
context=None, |
|
|
mask=None, |
|
|
context_mask=None, |
|
|
name=None, |
|
|
inference=False, |
|
|
**kwargs, |
|
|
): |
|
|
_, _, _, h, gh = *x.shape, self.heads, self.global_heads |
|
|
|
|
|
cross_attend = exists(context) |
|
|
context = default(context, x) |
|
|
context_mask = default(context_mask, mask) if not cross_attend else context_mask |
|
|
q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) |
|
|
|
|
|
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) |
|
|
(q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) |
|
|
|
|
|
attn_outs = [] |
|
|
if not empty(q): |
|
|
if exists(context_mask): |
|
|
global_mask = context_mask[:, None, :, None] |
|
|
v.masked_fill_(~global_mask, 0.0) |
|
|
if cross_attend: |
|
|
pass |
|
|
else: |
|
|
out = self.fast_attention(q, k, v) |
|
|
attn_outs.append(out) |
|
|
|
|
|
if not empty(lq): |
|
|
assert ( |
|
|
not cross_attend |
|
|
), "local attention is not compatible with cross attention" |
|
|
out = self.local_attn(lq, lk, lv, input_mask=mask) |
|
|
attn_outs.append(out) |
|
|
|
|
|
out = torch.cat(attn_outs, dim=1) |
|
|
out = rearrange(out, "b h n d -> b n (h d)") |
|
|
out = self.to_out(out) |
|
|
return self.dropout(out) |
|
|
|
|
|
|
|
|
def l2_regularization(model, l2_alpha): |
|
|
l2_loss = [] |
|
|
for module in model.modules(): |
|
|
if type(module) is nn.Conv2d: |
|
|
l2_loss.append((module.weight**2).sum() / 2.0) |
|
|
return l2_alpha * sum(l2_loss) |
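

# Note: only nn.Conv2d weights are penalized, so for the Conv1d-based FCPE
# stack below this regularizer currently contributes sum([]) == 0.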
|
|
|
|
|
|
|
|
class FCPE(nn.Module): |
|
|
def __init__( |
|
|
self, |
|
|
input_channel=128, |
|
|
out_dims=360, |
|
|
n_layers=12, |
|
|
n_chans=512, |
|
|
use_siren=False, |
|
|
use_full=False, |
|
|
loss_mse_scale=10, |
|
|
loss_l2_regularization=False, |
|
|
loss_l2_regularization_scale=1, |
|
|
loss_grad1_mse=False, |
|
|
loss_grad1_mse_scale=1, |
|
|
f0_max=1975.5, |
|
|
f0_min=32.70, |
|
|
confidence=False, |
|
|
threshold=0.05, |
|
|
use_input_conv=True, |
|
|
): |
|
|
super().__init__() |
|
|
if use_siren is True: |
|
|
raise ValueError("Siren is not supported yet.") |
|
|
if use_full is True: |
|
|
raise ValueError("Full model is not supported yet.") |
|
|
|
|
|
self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 |
|
|
self.loss_l2_regularization = ( |
|
|
loss_l2_regularization if (loss_l2_regularization is not None) else False |
|
|
) |
|
|
self.loss_l2_regularization_scale = ( |
|
|
loss_l2_regularization_scale |
|
|
if (loss_l2_regularization_scale is not None) |
|
|
else 1 |
|
|
) |
|
|
self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False |
|
|
self.loss_grad1_mse_scale = ( |
|
|
loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 |
|
|
) |
|
|
self.f0_max = f0_max if (f0_max is not None) else 1975.5 |
|
|
self.f0_min = f0_min if (f0_min is not None) else 32.70 |
|
|
self.confidence = confidence if (confidence is not None) else False |
|
|
self.threshold = threshold if (threshold is not None) else 0.05 |
|
|
self.use_input_conv = use_input_conv if (use_input_conv is not None) else True |
|
|
|
|
|
self.cent_table_b = torch.Tensor( |
|
|
np.linspace( |
|
|
self.f0_to_cent(torch.Tensor([f0_min]))[0], |
|
|
self.f0_to_cent(torch.Tensor([f0_max]))[0], |
|
|
out_dims, |
|
|
) |
|
|
) |
|
|
self.register_buffer("cent_table", self.cent_table_b) |
|
|
|
|
|
_leaky = nn.LeakyReLU() |
|
|
self.stack = nn.Sequential( |
|
|
nn.Conv1d(input_channel, n_chans, 3, 1, 1), |
|
|
nn.GroupNorm(4, n_chans), |
|
|
_leaky, |
|
|
nn.Conv1d(n_chans, n_chans, 3, 1, 1), |
|
|
) |
|
|
|
|
|
self.decoder = PCmer( |
|
|
num_layers=n_layers, |
|
|
num_heads=8, |
|
|
dim_model=n_chans, |
|
|
dim_keys=n_chans, |
|
|
dim_values=n_chans, |
|
|
residual_dropout=0.1, |
|
|
attention_dropout=0.1, |
|
|
) |
|
|
self.norm = nn.LayerNorm(n_chans) |
|
|
|
|
|
self.n_out = out_dims |
|
|
self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out)) |
|
|
|
|
|
def forward( |
|
|
self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax" |
|
|
): |
|
|
if cdecoder == "argmax": |
|
|
self.cdecoder = self.cents_decoder |
|
|
elif cdecoder == "local_argmax": |
|
|
self.cdecoder = self.cents_local_decoder |
|
|
|
|
|
x = ( |
|
|
self.stack(mel.transpose(1, 2)).transpose(1, 2) |
|
|
if self.use_input_conv |
|
|
else mel |
|
|
) |
|
|
x = self.decoder(x) |
|
|
x = self.norm(x) |
|
|
x = self.dense_out(x) |
|
|
x = torch.sigmoid(x) |
|
|
|
|
|
if not infer: |
|
|
gt_cent_f0 = self.f0_to_cent(gt_f0) |
|
|
gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) |
|
|
loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0) |
|
|
if self.loss_l2_regularization: |
|
|
loss_all = loss_all + l2_regularization( |
|
|
model=self, l2_alpha=self.loss_l2_regularization_scale |
|
|
) |
|
|
x = loss_all |
|
|
if infer: |
|
|
x = self.cdecoder(x) |
|
|
x = self.cent_to_f0(x) |
|
|
x = (1 + x / 700).log() if not return_hz_f0 else x |
|
|
|
|
|
return x |
|
|
|
|
|
def cents_decoder(self, y, mask=True): |
|
|
B, N, _ = y.size() |
|
|
ci = self.cent_table[None, None, :].expand(B, N, -1) |
|
|
rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True) |
|
|
if mask: |
|
|
confident = torch.max(y, dim=-1, keepdim=True)[0] |
|
|
confident_mask = torch.ones_like(confident) |
|
|
confident_mask[confident <= self.threshold] = float("-INF") |
|
|
rtn = rtn * confident_mask |
|
|
return (rtn, confident) if self.confidence else rtn |
|
|
|
|
|
def cents_local_decoder(self, y, mask=True): |
|
|
B, N, _ = y.size() |
|
|
ci = self.cent_table[None, None, :].expand(B, N, -1) |
|
|
confident, max_index = torch.max(y, dim=-1, keepdim=True) |
|
|
local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4) |
|
|
local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1) |
|
|
ci_l = torch.gather(ci, -1, local_argmax_index) |
|
|
y_l = torch.gather(y, -1, local_argmax_index) |
|
|
rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum( |
|
|
y_l, dim=-1, keepdim=True |
|
|
) |
|
|
if mask: |
|
|
confident_mask = torch.ones_like(confident) |
|
|
confident_mask[confident <= self.threshold] = float("-INF") |
|
|
rtn = rtn * confident_mask |
|
|
return (rtn, confident) if self.confidence else rtn |
|
|
|
|
|
def cent_to_f0(self, cent): |
|
|
return 10.0 * 2 ** (cent / 1200.0) |
|
|
|
|
|
def f0_to_cent(self, f0): |
|
|
return 1200.0 * torch.log2(f0 / 10.0) |
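

# f0_to_cent and cent_to_f0 are inverses on a 10 Hz reference:
#   cent = 1200 * log2(f0 / 10),   f0 = 10 * 2 ** (cent / 1200)
# e.g. 440 Hz -> ~6551.3 cents -> 440 Hz.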
|
|
|
|
|
def gaussian_blurred_cent(self, cents): |
|
|
mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0))) |
|
|
B, N, _ = cents.size() |
|
|
ci = self.cent_table[None, None, :].expand(B, N, -1) |
|
|
return torch.exp(-torch.square(ci - cents) / 1250) * mask.float() |
|
|
|
|
|
|
|
|
class FCPEInfer: |
|
|
def __init__(self, model_path, device=None, dtype=torch.float32): |
|
|
if device is None: |
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
self.device = device |
|
|
ckpt = torch.load(model_path, map_location=torch.device(self.device)) |
|
|
self.args = DotDict(ckpt["config"]) |
|
|
self.dtype = dtype |
|
|
model = FCPE( |
|
|
input_channel=self.args.model.input_channel, |
|
|
out_dims=self.args.model.out_dims, |
|
|
n_layers=self.args.model.n_layers, |
|
|
n_chans=self.args.model.n_chans, |
|
|
use_siren=self.args.model.use_siren, |
|
|
use_full=self.args.model.use_full, |
|
|
loss_mse_scale=self.args.loss.loss_mse_scale, |
|
|
loss_l2_regularization=self.args.loss.loss_l2_regularization, |
|
|
loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, |
|
|
loss_grad1_mse=self.args.loss.loss_grad1_mse, |
|
|
loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, |
|
|
f0_max=self.args.model.f0_max, |
|
|
f0_min=self.args.model.f0_min, |
|
|
confidence=self.args.model.confidence, |
|
|
) |
|
|
model.to(self.device).to(self.dtype) |
|
|
model.load_state_dict(ckpt["model"]) |
|
|
model.eval() |
|
|
self.model = model |
|
|
self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) |
|
|
|
|
|
@torch.no_grad() |
|
|
def __call__(self, audio, sr, threshold=0.05): |
|
|
self.model.threshold = threshold |
|
|
audio = audio[None, :] |
|
|
mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) |
|
|
f0 = self.model(mel=mel, infer=True, return_hz_f0=True) |
|
|
return f0 |
|
|
|
|
|
|
|
|
class Wav2Mel: |
|
|
def __init__(self, args, device=None, dtype=torch.float32): |
|
|
self.sample_rate = args.mel.sampling_rate |
|
|
self.hop_size = args.mel.hop_size |
|
|
if device is None: |
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
self.device = device |
|
|
self.dtype = dtype |
|
|
self.stft = STFT( |
|
|
args.mel.sampling_rate, |
|
|
args.mel.num_mels, |
|
|
args.mel.n_fft, |
|
|
args.mel.win_size, |
|
|
args.mel.hop_size, |
|
|
args.mel.fmin, |
|
|
args.mel.fmax, |
|
|
) |
|
|
self.resample_kernel = {} |
|
|
|
|
|
def extract_nvstft(self, audio, keyshift=0, train=False): |
|
|
mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) |
|
|
return mel |
|
|
|
|
|
def extract_mel(self, audio, sample_rate, keyshift=0, train=False): |
|
|
audio = audio.to(self.dtype).to(self.device) |
|
|
if sample_rate == self.sample_rate: |
|
|
audio_res = audio |
|
|
else: |
|
|
key_str = str(sample_rate) |
|
|
if key_str not in self.resample_kernel: |
|
|
self.resample_kernel[key_str] = Resample( |
|
|
sample_rate, self.sample_rate, lowpass_filter_width=128 |
|
|
) |
|
|
self.resample_kernel[key_str] = ( |
|
|
self.resample_kernel[key_str].to(self.dtype).to(self.device) |
|
|
) |
|
|
audio_res = self.resample_kernel[key_str](audio) |
|
|
|
|
|
mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) |
|
|
n_frames = int(audio.shape[1] // self.hop_size) + 1 |
|
|
mel = torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel |
|
|
mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel |
|
|
return mel |
|
|
|
|
|
def __call__(self, audio, sample_rate, keyshift=0, train=False): |
|
|
return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) |
|
|
|
|
|
|
|
|
class DotDict(dict): |
|
|
def __getattr__(*args): |
|
|
val = dict.get(*args) |
|
|
return DotDict(val) if type(val) is dict else val |
|
|
|
|
|
__setattr__ = dict.__setitem__ |
|
|
__delattr__ = dict.__delitem__ |
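

# DotDict gives attribute-style access to nested config dicts, e.g.
#   cfg = DotDict({"mel": {"sampling_rate": 16000}})
#   cfg.mel.sampling_rate  # -> 16000
# Missing keys resolve to None instead of raising AttributeError.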
|
|
|
|
|
|
|
|
class F0Predictor(object): |
|
|
def compute_f0(self, wav, p_len): |
|
|
pass |
|
|
|
|
|
def compute_f0_uv(self, wav, p_len): |
|
|
pass |
|
|
|
|
|
|
|
|
class FCPEF0Predictor(F0Predictor): |
|
|
def __init__( |
|
|
self, |
|
|
model_path, |
|
|
hop_length=512, |
|
|
f0_min=50, |
|
|
f0_max=1100, |
|
|
dtype=torch.float32, |
|
|
device=None, |
|
|
sample_rate=44100, |
|
|
threshold=0.05, |
|
|
): |
|
|
self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) |
|
|
self.hop_length = hop_length |
|
|
self.f0_min = f0_min |
|
|
self.f0_max = f0_max |
|
|
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") |
|
|
self.threshold = threshold |
|
|
self.sample_rate = sample_rate |
|
|
self.dtype = dtype |
|
|
self.name = "fcpe" |
|
|
|
|
|
def repeat_expand( |
|
|
self, |
|
|
content: Union[torch.Tensor, np.ndarray], |
|
|
target_len: int, |
|
|
mode: str = "nearest", |
|
|
): |
|
|
ndim = content.ndim |
|
|
content = ( |
|
|
content[None, None] if ndim == 1 else content[None] if ndim == 2 else content |
|
|
) |
|
|
assert content.ndim == 3 |
|
|
is_np = isinstance(content, np.ndarray) |
|
|
content = torch.from_numpy(content) if is_np else content |
|
|
results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) |
|
|
results = results.numpy() if is_np else results |
|
|
return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results |
|
|
|
|
|
def post_process(self, x, sample_rate, f0, pad_to): |
|
|
f0 = ( |
|
|
torch.from_numpy(f0).float().to(x.device) |
|
|
if isinstance(f0, np.ndarray) |
|
|
else f0 |
|
|
) |
|
|
f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0 |
|
|
|
|
|
vuv_vector = torch.zeros_like(f0) |
|
|
vuv_vector[f0 > 0.0] = 1.0 |
|
|
vuv_vector[f0 <= 0.0] = 0.0 |
|
|
|
|
|
nzindex = torch.nonzero(f0).squeeze() |
|
|
f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() |
|
|
time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() |
|
|
time_frame = np.arange(pad_to) * self.hop_length / sample_rate |
|
|
|
|
|
vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] |
|
|
|
|
|
if f0.shape[0] <= 0: |
|
|
return np.zeros(pad_to), vuv_vector.cpu().numpy() |
|
|
if f0.shape[0] == 1: |
|
|
return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy() |
|
|
|
|
|
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) |
|
|
return f0, vuv_vector.cpu().numpy() |
|
|
|
|
|
def compute_f0(self, wav, p_len=None): |
|
|
x = torch.FloatTensor(wav).to(self.dtype).to(self.device) |
|
|
p_len = x.shape[0] // self.hop_length if p_len is None else p_len |
|
|
f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] |
|
|
        if torch.all(f0 == 0):
            # p_len is always set above, so return a silent (all-zero) contour
            return np.zeros(p_len)
        return self.post_process(x, self.sample_rate, f0, p_len)[0]
|
|
|
|
|
def compute_f0_uv(self, wav, p_len=None): |
|
|
x = torch.FloatTensor(wav).to(self.dtype).to(self.device) |
|
|
p_len = x.shape[0] // self.hop_length if p_len is None else p_len |
|
|
f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] |
|
|
        if torch.all(f0 == 0):
            rtn = np.zeros(p_len)
            return rtn, rtn
        return self.post_process(x, self.sample_rate, f0, p_len)
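

# Minimal usage sketch (assumes an fcpe.pt checkpoint and a mono float
# waveform at `sample_rate`; the names here are illustrative):
#   predictor = FCPEF0Predictor("fcpe.pt", sample_rate=16000)
#   f0, uv = predictor.compute_f0_uv(wav)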
|
|
|
|
|
''' |
|
|
|
|
|
with open(os.sep.join([current_dir, dirs[6], "FCPE.py"]), 'w') as f: |
|
|
f.write(FCPE) |
|
|
|
|
|
|
|
|
VBACH_CLI = ''' |
|
|
import gc
import os
import argparse
from datetime import datetime

import torch

from vbach.infer.infer import Config, load_hubert, get_vc, rvc_infer
|
|
|
|
|
# Constants
|
|
|
|
|
RVC_MODELS_DIR = os.path.join(os.getcwd(), "voice_models") |
|
|
HUBERT_MODEL_PATH = os.path.join( |
|
|
os.getcwd(), "vbach", "models", "embedders", "hubert_base.pt" |
|
|
) |
|
|
OUTPUT_FORMAT = ["mp3", "wav", "flac", "aiff", "m4a", "aac", "ogg", "opus"] |
|
|
|
|
|
audio_extensions = {".mp3", ".wav", ".flac", ".aiff", ".m4a", ".aac", ".ogg", ".opus"} |
|
|
|
|
|
|
|
|
# Core helpers
|
|
|
|
|
def load_rvc_model(voice_model): |
|
|
model_dir = os.path.join(RVC_MODELS_DIR, voice_model) |
|
|
model_files = os.listdir(model_dir) |
|
|
rvc_model_path = next( |
|
|
(os.path.join(model_dir, f) for f in model_files if f.endswith(".pth")), None |
|
|
) |
|
|
rvc_index_path = next( |
|
|
(os.path.join(model_dir, f) for f in model_files if f.endswith(".index")), None |
|
|
) |
|
|
|
|
|
if not rvc_model_path: |
|
|
raise ValueError( |
|
|
f"\033[91mМодели {voice_model} не существует. " |
|
|
"Возможно, вы неправильно ввели имя.\033[0m" |
|
|
) |
|
|
|
|
|
return rvc_model_path, rvc_index_path |
|
|
|
|
|
def voice_conversion( |
|
|
voice_model, |
|
|
vocals_path, |
|
|
output_path, |
|
|
pitch, |
|
|
f0_method, |
|
|
index_rate, |
|
|
filter_radius, |
|
|
volume_envelope, |
|
|
protect, |
|
|
hop_length, |
|
|
f0_min, |
|
|
f0_max, |
|
|
format_output, |
|
|
output_bitrate, |
|
|
stereo_mode |
|
|
): |
|
|
rvc_model_path, rvc_index_path = load_rvc_model(voice_model) |
|
|
|
|
|
config = Config() |
|
|
hubert_model = load_hubert(config.device, config.is_half, HUBERT_MODEL_PATH) |
|
|
cpt, version, net_g, tgt_sr, vc = get_vc( |
|
|
config.device, config.is_half, config, rvc_model_path |
|
|
) |
|
|
|
|
|
output_audio = rvc_infer( |
|
|
rvc_index_path, |
|
|
index_rate, |
|
|
vocals_path, |
|
|
output_path, |
|
|
pitch, |
|
|
f0_method, |
|
|
cpt, |
|
|
version, |
|
|
net_g, |
|
|
filter_radius, |
|
|
tgt_sr, |
|
|
volume_envelope, |
|
|
protect, |
|
|
hop_length, |
|
|
vc, |
|
|
hubert_model, |
|
|
f0_min, |
|
|
f0_max, |
|
|
format_output, |
|
|
output_bitrate, |
|
|
stereo_mode |
|
|
) |
|
|
|
|
|
del hubert_model, cpt, net_g, vc |
|
|
gc.collect() |
|
|
torch.cuda.empty_cache() |
|
|
return output_audio |
|
|
|
|
|
def cli_conversion(
    input_audios,
    template="NAME_MODEL_F0METHOD_PITCH",
    output_dir="output",
    model_name="",
    index_rate=0,
    output_format="wav",
    stereo_mode="mono",
    method_pitch="rmvpe+",
    pitch=0,
    hop_length=128,
    filter_radius=3,
    rms=0.25,
    protect=0.33,
    f0_min=50,
    f0_max=1100,
):
|
|
if not input_audios: |
|
|
raise ValueError( |
|
|
"Не удалось найти аудиофайл(ы). " |
|
|
"Убедитесь, что файл загрузился или проверьте правильность пути к нему." |
|
|
) |
|
|
    if not model_name:
        raise ValueError("Выберите модель голоса для преобразования.")
    if not os.path.exists(input_audios):
        raise FileNotFoundError(f"Ошибка: '{input_audios}' не существует.")
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
    if os.path.isfile(input_audios):
        # Make sure the file is an audio file
        ext = os.path.splitext(input_audios)[1].lower()
        if ext not in audio_extensions:
            raise ValueError(f"Ошибка: '{input_audios}' не является аудиофайлом (допустимые расширения: {audio_extensions}).")
        print(f"Найден аудиофайл: {input_audios}")

        file_name = os.path.basename(input_audios)
        namefile = os.path.splitext(file_name)[0]
        time_create_file = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Substitute the template keys to build the output file name
        output_name = (
            template
            .replace("DATETIME", time_create_file)
            .replace("NAME", namefile)
            .replace("MODEL", model_name)
            .replace("F0METHOD", method_pitch)
            .replace("PITCH", f"{pitch}")
        )
        output_path = os.path.join(output_dir, f"{output_name}.{output_format}")
        voice_conversion(model_name, input_audios, output_path, pitch, method_pitch, index_rate, filter_radius, rms, protect, hop_length, f0_min, f0_max, output_format, "320k", stereo_mode)
        print("Вокал успешно преобразован")
|
|
|
|
|
    elif os.path.isdir(input_audios):
        # Collect audio files from the folder
        audio_files = []
        for file in os.listdir(input_audios):
            ext = os.path.splitext(file)[1].lower()
            if ext in audio_extensions:
                audio_files.append(os.path.join(input_audios, file))

        if not audio_files:
            raise FileNotFoundError(f"Ошибка: в папке '{input_audios}' нет аудиофайлов (допустимые расширения: {audio_extensions}).")

        print(f"Найдены аудиофайлы: {audio_files}")

        output_paths = []
        for file in audio_files:
            file_name = os.path.basename(file)
            namefile = os.path.splitext(file_name)[0]
            time_create_file = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_name = (
                template
                .replace("DATETIME", time_create_file)
                .replace("NAME", namefile)
                .replace("MODEL", model_name)
                .replace("F0METHOD", method_pitch)
                .replace("PITCH", f"{pitch}")
            )
            output_path = os.path.join(output_dir, f"{output_name}.{output_format}")
            voice_conversion(model_name, file, output_path, pitch, method_pitch, index_rate, filter_radius, rms, protect, hop_length, f0_min, f0_max, output_format, "320k", stereo_mode)
            output_paths.append(output_path)
        print("Вокалы успешно преобразованы")
|
|
else: |
|
|
raise ValueError(f"Ошибка: '{input_audios}' не является ни файлом, ни папкой.") |
|
|
|
|
|
def setup_args(): |
|
|
parser = argparse.ArgumentParser(description='Vbach CLI') |
|
|
|
|
|
    # Required arguments
|
|
parser.add_argument( |
|
|
'input_audios', |
|
|
type=str, |
|
|
help='Путь к аудиофайлу или папке с аудиофайлами для обработки' |
|
|
) |
|
|
parser.add_argument( |
|
|
'output_dir', |
|
|
type=str, |
|
|
help='Папка для сохранения результатов конвертации' |
|
|
) |
|
|
parser.add_argument( |
|
|
'model_name', |
|
|
type=str, |
|
|
help='Название голосовой модели RVC для преобразования' |
|
|
) |
|
|
|
|
|
    # Optional arguments with default values
|
|
parser.add_argument( |
|
|
'--template', |
|
|
type=str, |
|
|
default="NAME_MODEL_F0METHOD_PITCH", |
|
|
help='Шаблон имени выходного файла (доступные замены: DATETIME, NAME, MODEL, F0METHOD, PITCH)' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--index_rate', |
|
|
type=float, |
|
|
default=0, |
|
|
help='Интенсивность использования индексного файла (от 0.0 до 1.0)', |
|
|
metavar='[0.0-1.0]' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--output_format', |
|
|
type=str, |
|
|
default="wav", |
|
|
choices=OUTPUT_FORMAT, |
|
|
help='Формат выходного аудиофайла' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--stereo_mode', |
|
|
type=str, |
|
|
default="mono", |
|
|
choices=["mono", "left/right", "sim/dif"], |
|
|
help='Режим каналов: моно или стерео' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--method_pitch', |
|
|
type=str, |
|
|
default="rmvpe+", |
|
|
help='Метод извлечения pitch (тона)' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--pitch', |
|
|
type=int, |
|
|
default=0, |
|
|
help='Корректировка тона в полутонах' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--hop_length', |
|
|
type=int, |
|
|
default=128, |
|
|
help='Длина hop (в семплах) для обработки' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--filter_radius', |
|
|
type=int, |
|
|
default=3, |
|
|
help='Радиус фильтра для сглаживания' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--rms', |
|
|
type=float, |
|
|
default=0.25, |
|
|
help='Масштабирование огибающей громкости (RMS)' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--protect', |
|
|
type=float, |
|
|
default=0.33, |
|
|
help='Защита для глухих согласных звуков' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--f0_min', |
|
|
type=int, |
|
|
default=50, |
|
|
help='Минимальная частота pitch (F0) в Hz' |
|
|
) |
|
|
parser.add_argument( |
|
|
'--f0_max', |
|
|
type=int, |
|
|
default=1100, |
|
|
help='Максимальная частота pitch (F0) в Hz' |
|
|
) |
|
|
|
|
|
return parser.parse_args() |
|
|
|
|
|
# Example usage:
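# A hypothetical invocation (the paths and model name are placeholders):
#   python vbach/cli/vbach.py song.wav output MyModel --pitch 12 --output_format mp3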
|
|
if __name__ == "__main__": |
|
|
args = setup_args() |
|
|
cli_conversion( |
|
|
input_audios=args.input_audios, |
|
|
output_dir=args.output_dir, |
|
|
model_name=args.model_name, |
|
|
template=args.template, |
|
|
index_rate=args.index_rate, |
|
|
output_format=args.output_format, |
|
|
stereo_mode=args.stereo_mode, |
|
|
method_pitch=args.method_pitch, |
|
|
pitch=args.pitch, |
|
|
hop_length=args.hop_length, |
|
|
filter_radius=args.filter_radius, |
|
|
rms=args.rms, |
|
|
protect=args.protect, |
|
|
f0_min=args.f0_min, |
|
|
f0_max=args.f0_max |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
|
|
with open(os.sep.join([current_dir, dirs[2], "vbach.py"]), 'w') as f: |
|
|
f.write(VBACH_CLI) |
|
|
|
|
|
def set_language(lang): |
|
|
global CURRENT_LANG |
|
|
CURRENT_LANG = lang |
|
|
|
|
|
|
|
|
def t(key, **kwargs): |
|
|
translation = TRANSLATIONS[CURRENT_LANG].get(key, key) |
|
|
if isinstance(translation, dict): |
|
|
return translation |
|
|
return translation.format(**kwargs) if kwargs else translation |
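

# Example: t("inference") returns "Инференс" while CURRENT_LANG == "ru"; unknown
# keys fall back to the key itself, so missing translations degrade gracefully.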
|
|
|
|
|
def download_file(url, zip_name, progress): |
|
|
try: |
|
|
if "drive.google.com" in url: |
|
|
progress(0.5, desc=t('downloading_google')) |
|
|
download_from_google_drive(url, zip_name, progress) |
|
|
elif "huggingface.co" in url: |
|
|
progress(0.5, desc=t('downloading_huggingface')) |
|
|
download_from_huggingface(url, zip_name, progress) |
|
|
elif "pixeldrain.com" in url: |
|
|
progress(0.5, desc=t('downloading_pixeldrain')) |
|
|
download_from_pixeldrain(url, zip_name, progress) |
|
|
elif "mega.nz" in url: |
|
|
print(t('mega_unsupported')) |
|
|
elif "disk.yandex.ru" in url or "yadi.sk" in url: |
|
|
progress(0.5, desc=t('downloading_yandex')) |
|
|
download_from_yandex(url, zip_name, progress) |
|
|
else: |
|
|
raise ValueError(t('unsupported_source', url=url)) |
|
|
except Exception as e: |
|
|
raise gr.Error(t('download_error', error=str(e))) |
|
|
|
|
|
def download_from_google_drive(url, zip_name, progress): |
|
|
file_id = ( |
|
|
url.split("file/d/")[1].split("/")[0] |
|
|
if "file/d/" in url |
|
|
else url.split("id=")[1].split("&")[0] |
|
|
) |
|
|
gdown.download(id=file_id, output=str(zip_name), quiet=False) |
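

# Both .../file/d/<id>/... and ...?id=<id> Google Drive URL forms are accepted.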
|
|
|
|
|
def download_from_huggingface(url, zip_name, progress): |
|
|
urllib.request.urlretrieve(url, zip_name) |
|
|
|
|
|
def download_from_pixeldrain(url, zip_name, progress):
    file_id = url.split("pixeldrain.com/u/")[1]
    response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
    response.raise_for_status()
    with open(zip_name, "wb") as f:
        f.write(response.content)
|
|
|
|
|
def download_from_yandex(url, zip_name, progress): |
|
|
yandex_public_key = f"download?public_key={url}" |
|
|
yandex_api_url = f"https://cloud-api.yandex.net/v1/disk/public/resources/{yandex_public_key}" |
|
|
response = requests.get(yandex_api_url) |
|
|
if response.status_code == 200: |
|
|
download_link = response.json().get("href") |
|
|
urllib.request.urlretrieve(download_link, zip_name) |
|
|
else: |
|
|
raise gr.Error(t('yandex_api_error', status=response.status_code)) |
|
|
|
|
|
def extract_zip(extraction_folder, zip_name): |
|
|
os.makedirs(extraction_folder, exist_ok=True) |
|
|
with zipfile.ZipFile(zip_name, "r") as zip_ref: |
|
|
zip_ref.extractall(extraction_folder) |
|
|
os.remove(zip_name) |
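
    # Heuristics: a usable .index file is assumed to be larger than 100 KB and a
    # usable .pth model larger than 40 MB; smaller matches are skipped as junk.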
|
|
|
|
|
index_filepath, model_filepath = None, None |
|
|
for root, _, files in os.walk(extraction_folder): |
|
|
for name in files: |
|
|
file_path = os.path.join(root, name) |
|
|
if name.endswith(".index") and os.stat(file_path).st_size > 1024 * 100: |
|
|
index_filepath = file_path |
|
|
if name.endswith(".pth") and os.stat(file_path).st_size > 1024 * 1024 * 40: |
|
|
model_filepath = file_path |
|
|
|
|
|
if not model_filepath: |
|
|
raise gr.Error(t('pth_not_found', folder=extraction_folder)) |
|
|
|
|
|
rename_and_cleanup(extraction_folder, model_filepath, index_filepath) |
|
|
|
|
|
def rename_and_cleanup(extraction_folder, model_filepath, index_filepath): |
|
|
os.rename( |
|
|
model_filepath, |
|
|
os.path.join(extraction_folder, os.path.basename(model_filepath)), |
|
|
) |
|
|
if index_filepath: |
|
|
os.rename( |
|
|
index_filepath, |
|
|
os.path.join(extraction_folder, os.path.basename(index_filepath)), |
|
|
) |
|
|
|
|
|
for filepath in os.listdir(extraction_folder): |
|
|
full_path = os.path.join(extraction_folder, filepath) |
|
|
if os.path.isdir(full_path): |
|
|
shutil.rmtree(full_path) |
|
|
|
|
|
def download_from_url(url, dir_name, progress=gr.Progress()): |
|
|
try: |
|
|
progress(0, desc=t('downloading_model', dir_name=dir_name)) |
|
|
zip_name = os.path.join(dirs[0], dir_name + ".zip") |
|
|
extraction_folder = os.path.join(current_dir, dirs[0], dir_name) |
|
|
|
|
|
if os.path.exists(extraction_folder): |
|
|
raise gr.Error(t('model_exists', dir_name=dir_name)) |
|
|
|
|
|
download_file(url, zip_name, progress) |
|
|
progress(0.8, desc=t('unpacking_zip')) |
|
|
extract_zip(extraction_folder, zip_name) |
|
|
return t('model_uploaded', dir_name=dir_name) |
|
|
except Exception as e: |
|
|
raise gr.Error(t('model_load_error', error=str(e))) |
|
|
|
|
|
def upload_zip_file(zip_path, dir_name, progress=gr.Progress()): |
|
|
try: |
|
|
extraction_folder = os.path.join(current_dir, dirs[0], dir_name) |
|
|
if os.path.exists(extraction_folder): |
|
|
raise gr.Error(t('model_exists', dir_name=dir_name)) |
|
|
|
|
|
zip_name = zip_path.name |
|
|
progress(0.8, desc=t('unpacking_zip')) |
|
|
extract_zip(extraction_folder, zip_name) |
|
|
return t('model_uploaded', dir_name=dir_name) |
|
|
except Exception as e: |
|
|
raise gr.Error(t('model_load_error', error=str(e))) |
|
|
|
|
|
def upload_separate_files(pth_file, index_file, dir_name, progress=gr.Progress()): |
|
|
try: |
|
|
extraction_folder = os.path.join(current_dir, dirs[0], dir_name) |
|
|
if os.path.exists(extraction_folder): |
|
|
raise gr.Error(t('model_exists', dir_name=dir_name)) |
|
|
|
|
|
os.makedirs(extraction_folder, exist_ok=True) |
|
|
|
|
|
if pth_file: |
|
|
pth_path = os.path.join(extraction_folder, os.path.basename(pth_file.name)) |
|
|
shutil.copyfile(pth_file.name, pth_path) |
|
|
|
|
|
if index_file: |
|
|
index_path = os.path.join(extraction_folder, os.path.basename(index_file.name)) |
|
|
shutil.copyfile(index_file.name, index_path) |
|
|
|
|
|
return t('model_uploaded', dir_name=dir_name) |
|
|
except Exception as e: |
|
|
raise gr.Error(t('model_load_error', error=str(e))) |
|
|
|
|
|
def delete_model_name(dir_name):
    model_dir = os.path.join(current_dir, dirs[0], dir_name)
    if not os.path.isdir(model_dir):
        return t('model_not_found', dir_name=dir_name)
    try:
        shutil.rmtree(model_dir)
        return t('model_deleted', dir_name=dir_name)
    except Exception as e:
        raise gr.Error(t('model_delete_error', error=str(e)))
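

# Imported from the CLI module generated above; this must run after the file
# has been written to disk.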
|
|
|
|
|
from vbach.cli.vbach import voice_conversion |
|
|
|
|
|
def process_audio( |
|
|
input_file: str = None, |
|
|
input_list: str = None, |
|
|
template: str = "NAME_MODEL_F0METHOD_PITCH", |
|
|
model_name: str = "", |
|
|
index_rate: float = 0, |
|
|
output_format: str = "wav", |
|
|
output_bitrate: int = 320, |
|
|
stereo_mode: str = "mono", |
|
|
method_pitch: str = "rmvpe+", |
|
|
pitch: float = 0, |
|
|
hop_length: int = 128, |
|
|
filter_radius: int = 3, |
|
|
rms: float = 0.25, |
|
|
protect: float = 0.33, |
|
|
f0_min: int = 50, |
|
|
f0_max: int = 1100 |
|
|
): |
|
|
|
|
|
keys = ["NAME", "PITCH", "F0_METHOD", "DATETIME", "MODEL"] |
|
|
|
|
|
if any(key in template for key in keys): |
|
|
pass |
|
|
else: |
|
|
template = "DATETIME_Vbach_F0METHOD_PITCH" |
|
|
|
|
|
if not isinstance(input_list, list) and not input_file: |
|
|
try: |
|
|
print(input_list) |
|
|
input_list = ast.literal_eval(input_list) |
|
|
except Exception as e: |
|
|
print(e) |
|
|
gr.Warning(t("error_strlist_is_not_list")) |
|
|
return None |
|
|
|
|
|
if input_file is not None: |
|
|
try: |
|
|
print(input_file) |
|
|
input_list = ast.literal_eval(input_file) |
|
|
gr.Warning(t("error_path_is_list")) |
|
|
return None |
|
|
except Exception as e: |
|
|
pass |
|
|
|
|
|
|
|
|
output_bitrate = f"{output_bitrate}k" |
|
|
if not input_file and not input_list: |
|
|
raise gr.Error(t("error_no_audio")) |
|
|
if not model_name: |
|
|
raise gr.Error(t("error_no_model")) |
|
|
    if input_file is not None and isinstance(input_file, str) and input_list is None:
|
|
if not os.path.exists(input_file): |
|
|
gr.Warning(t("warning_file_not_found", file=input_file)) |
|
|
return None |
|
|
|
|
|
file_name = os.path.basename(input_file) |
|
|
namefile = os.path.splitext(file_name)[0] |
|
|
        time_create_file = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = tempfile.mkdtemp(prefix="converted_voice_")
        print(output_dir)
|
|
output_name = ( |
|
|
template |
|
|
.replace("DATETIME", time_create_file) |
|
|
.replace("NAME", namefile) |
|
|
.replace("MODEL", model_name) |
|
|
.replace("F0METHOD", method_pitch) |
|
|
.replace("PITCH", f"{pitch}") |
|
|
) |
|
|
output_path = os.path.join(output_dir, f"{output_name}.{output_format}") |
|
|
try: |
|
|
output_path = voice_conversion( |
|
|
model_name, |
|
|
input_file, |
|
|
output_path, |
|
|
pitch, |
|
|
method_pitch, |
|
|
index_rate, |
|
|
filter_radius, |
|
|
rms, |
|
|
protect, |
|
|
hop_length, |
|
|
f0_min, |
|
|
f0_max, |
|
|
output_format, |
|
|
output_bitrate, |
|
|
stereo_mode |
|
|
) |
|
|
        except Exception as e:
            print(e)
            return None
        print(t("success_single"))
        return output_path
|
|
|
|
|
if input_file is None and input_list is not None and isinstance(input_list, list): |
|
|
output_dir = tempfile.mkdtemp(prefix="converted_voice_") |
|
|
print(output_dir) |
|
|
output_paths = [] |
|
|
progress = gr.Progress() |
|
|
for i, file in enumerate(input_list): |
|
|
|
|
|
if not os.path.exists(file): |
|
|
gr.Warning(t("warning_file_not_found", file=file)) |
|
|
continue |
|
|
|
|
|
total_steps = len(input_list) |
|
|
file_name = os.path.basename(file) |
|
|
namefile = os.path.splitext(file_name)[0] |
|
|
time_create_file = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
progress( |
|
|
(i+1, total_steps), |
|
|
desc=t("processing", namefile=namefile), |
|
|
unit=t("files") |
|
|
) |
|
|
output_name = ( |
|
|
template |
|
|
.replace("DATETIME", time_create_file) |
|
|
.replace("NAME", namefile) |
|
|
.replace("MODEL", model_name) |
|
|
.replace("F0METHOD", method_pitch) |
|
|
.replace("PITCH", f"{pitch}") |
|
|
) |
|
|
output_path = os.path.join(output_dir, f"{output_name}.{output_format}") |
|
|
try: |
|
|
output_path = voice_conversion( |
|
|
model_name, |
|
|
file, |
|
|
output_path, |
|
|
pitch, |
|
|
method_pitch, |
|
|
index_rate, |
|
|
filter_radius, |
|
|
rms, |
|
|
protect, |
|
|
hop_length, |
|
|
f0_min, |
|
|
f0_max, |
|
|
output_format, |
|
|
output_bitrate, |
|
|
stereo_mode |
|
|
) |
|
|
            except Exception as e:
                print(e)
                continue
            output_paths.append(output_path)
        print(t("success_batch"))
        return output_paths
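

def get_voice_models():
    # Helper: list the subfolders of the voice-model directory for dropdown choices
    models_root = os.path.join(current_dir, dirs[0])
    return [d for d in os.listdir(models_root) if os.path.isdir(os.path.join(models_root, d))]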
|
|
|
|
|
def vbach_plugin_name(): |
|
|
return "VBach" |
|
|
|
|
|
def vbach_plugin(lang="ru"): |
|
|
set_language(lang) |
|
|
|
|
|
with gr.TabItem(t("inference")): |
|
|
with gr.Column(): |
|
|
with gr.Column(scale=3) as input_voice_group: |
|
|
with gr.Group() as single_voice_file: |
|
|
input_voice = gr.Audio(label=t("select_file"), interactive=True, type="filepath") |
|
|
batch_upload_btn = gr.Button(t("batch_upload")) |
|
|
with gr.Group(visible=False) as batch_voice_file: |
|
|
input_voices = gr.Files(type="filepath", interactive=True, show_label=False) |
|
|
single_upload_btn = gr.Button(t("single_upload")) |
|
|
input_voice_path = gr.Textbox(label=t("audio_path"), info=t("audio_path_info"), interactive=True) |
|
|
input_voice.upload(fn=(lambda x: gr.update(value=x)), inputs=input_voice, outputs=input_voice_path) |
|
|
input_voices.upload(fn=(lambda x: gr.update(value=str(x))), inputs=input_voices, outputs=input_voice_path) |
|
|
with gr.Column(): |
|
|
with gr.Row(equal_height=True): |
|
|
model_name = gr.Dropdown(label=t("model_name"), choices=[d for d in os.listdir(os.path.join(current_dir, dirs[0])) if os.path.isdir(os.path.join(os.path.join(current_dir, dirs[0]), d))], interactive=True, filterable=False, scale=6) |
|
|
model_update_btn = gr.Button(t("update_button"), variant="primary", scale=3, size="lg") |
|
|
                    model_update_btn.click(fn=(lambda: gr.update(choices=get_voice_models())), inputs=None, outputs=model_name)
|
|
with gr.Row(): |
|
|
method_pitch = gr.Dropdown(label=t("pitch_method"), choices=["mangio-crepe", "rmvpe+", "fcpe"], value="rmvpe+", interactive=True, filterable=False) |
|
|
hop_length = gr.Slider(minimum=2, maximum=512, step=1, value=128, label=t("hop_length"), interactive=True, visible=False) |
|
|
with gr.Row(): |
|
|
pitch = gr.Slider(minimum=-48, maximum=48, step=12, value=0, label=t("pitch"), interactive=True) |
|
|
with gr.Row(): |
|
|
f0_min = gr.Slider(minimum=50, maximum=3500, step=1, value=50, label=t("f0_min"), interactive=True) |
|
|
f0_max = gr.Slider(minimum=500, maximum=3500, step=1, value=1100, label=t("f0_max"), interactive=True) |
|
|
|
|
|
with gr.Column(variant="panel"): |
|
|
with gr.Group(): |
|
|
with gr.Row(equal_height=True): |
|
|
with gr.Column(scale=3): |
|
|
stereo_mode = gr.Dropdown( |
|
|
label=t("audio_processing"), |
|
|
choices=list(t("stereo_modes").keys()), |
|
|
value="mono", |
|
|
interactive=True, |
|
|
filterable=False |
|
|
) |
|
|
output_format = gr.Dropdown(label=t("output_format"), choices=OUTPUT_FORMAT) |
|
|
output_bitrate = gr.Slider(32, 320, step=1, label=t("bitrate"), value=320, interactive=True) |
|
|
with gr.Column(scale=6) as single_output_group: |
|
|
converted_voice = gr.Audio(label=t("converted_voice"), type="filepath", interactive=False, show_download_button=True, elem_classes="fixed-height") |
|
|
with gr.Column(scale=6, visible=False) as batch_output_group: |
|
|
converted_voices = gr.Files(label=t("converted_voices"), type="filepath", interactive=False, height="100%", elem_classes="fixed-height") |
|
|
convert_btn = gr.Button(t("convert_single"), variant="primary", scale=3) |
|
|
convert_batch_btn = gr.Button(t("convert_batch"), variant="primary", visible=False, scale=3) |
|
|
|
|
|
|
|
|
with gr.Column(): |
|
|
with gr.Tab(t("name_format")): |
|
|
template_info = gr.Markdown(t("name_format_info"), line_breaks=True) |
|
|
template = gr.Text(label=t("name_format"), value="NAME_MODEL_F0METHOD_PITCH", interactive=True) |
|
|
|
|
|
with gr.Tab(t("advanced_settings")): |
|
|
with gr.Row(): |
|
|
with gr.Column(scale=3): |
|
|
filter_radius = gr.Slider(minimum=0, maximum=7, step=1, value=3, label=t("filter_radius"), interactive=True) |
|
|
index_rate = gr.Slider(minimum=0, maximum=1, step=0.01, value=0, label=t("index_rate"), interactive=True) |
|
|
rms = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.25, label=t("rms"), interactive=True) |
|
|
protect = gr.Slider(minimum=0, maximum=0.5, step=0.01, value=0.33, label=t("protect"), interactive=True) |
|
|
|
|
|
|
|
|
with gr.TabItem(t("model_manager")): |
|
|
with gr.TabItem(t("download_url")): |
|
|
with gr.Row(): |
|
|
with gr.Column(variant="panel"): |
|
|
gr.HTML(f"<center><h3>{t('download_link')}</h3></center>") |
|
|
model_zip_link = gr.Text(label=t("download_link")) |
|
|
with gr.Group(): |
|
|
zip_model_name = gr.Text( |
|
|
label=t("model_name"), |
|
|
info=t("unique_name"), |
|
|
) |
|
|
download_btn = gr.Button(t("download_button"), variant="primary") |
|
|
|
|
|
gr.HTML( |
|
|
f"<h3>{t('supported_sites')}: " |
|
|
"<a href='https://huggingface.co/' target='_blank'>HuggingFace</a>, " |
|
|
"<a href='https://pixeldrain.com/' target='_blank'>Pixeldrain</a>, " |
|
|
"<a href='https://drive.google.com/' target='_blank'>Google Drive</a>, " |
|
|
"<a href='https://disk.yandex.ru/' target='_blank'>Яндекс Диск</a>" |
|
|
"</h3>" |
|
|
) |
|
|
|
|
|
dl_output_message = gr.Text(label=t("output_message"), interactive=False) |
|
|
download_btn.click( |
|
|
download_from_url, |
|
|
inputs=[model_zip_link, zip_model_name], |
|
|
outputs=dl_output_message, |
|
|
) |
|
|
|
|
|
with gr.Tab(t("download_zip")): |
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
zip_file = gr.File( |
|
|
label=t("zip_file"), file_types=[".zip"], file_count="single" |
|
|
) |
|
|
with gr.Column(variant="panel"): |
|
|
gr.HTML(t("upload_steps")) |
|
|
with gr.Group(): |
|
|
local_model_name = gr.Text( |
|
|
label=t("model_name"), |
|
|
info=t("unique_name"), |
|
|
) |
|
|
model_upload_button = gr.Button(t("download_button"), variant="primary") |
|
|
|
|
|
local_upload_output_message = gr.Text(label=t("output_message"), interactive=False) |
|
|
model_upload_button.click( |
|
|
upload_zip_file, |
|
|
inputs=[zip_file, local_model_name], |
|
|
outputs=local_upload_output_message, |
|
|
) |
|
|
|
|
|
with gr.TabItem(t("download_files")): |
|
|
with gr.Group(): |
|
|
with gr.Row(): |
|
|
pth_file = gr.File( |
|
|
label=t("pth_file"), file_types=[".pth"], file_count="single" |
|
|
) |
|
|
index_file = gr.File( |
|
|
label=t("index_file"), file_types=[".index"], file_count="single" |
|
|
) |
|
|
with gr.Column(variant="panel"): |
|
|
with gr.Group(): |
|
|
separate_model_name = gr.Text( |
|
|
label=t("model_name"), |
|
|
info=t("unique_name"), |
|
|
) |
|
|
separate_upload_button = gr.Button(t("download_button"), variant="primary") |
|
|
|
|
|
separate_upload_output_message = gr.Text( |
|
|
label=t("output_message"), interactive=False |
|
|
) |
|
|
separate_upload_button.click( |
|
|
upload_separate_files, |
|
|
inputs=[pth_file, index_file, separate_model_name], |
|
|
outputs=separate_upload_output_message, |
|
|
) |
|
|
|
|
|
with gr.TabItem(t("delete_model")): |
|
|
with gr.Column(variant="panel"): |
|
|
with gr.Group(): |
|
|
delete_voicemodel_name = gr.Dropdown( |
|
|
label=t("model_name"), |
|
|
info=t("delete_info"), |
|
|
                    choices=get_voice_models(),
|
|
interactive=True, |
|
|
filterable=False |
|
|
) |
|
|
refresh_delete_btn = gr.Button(t("refresh_button")) |
|
|
                refresh_delete_btn.click(fn=(lambda: gr.update(choices=get_voice_models())), inputs=None, outputs=delete_voicemodel_name)
|
|
delete_model_output_message = gr.Text( |
|
|
label=t("output_message"), interactive=False |
|
|
) |
|
|
delete_model_btn = gr.Button(t("delete_button")) |
|
|
delete_model_btn.click( |
|
|
fn=delete_model_name, |
|
|
inputs=delete_voicemodel_name, |
|
|
outputs=delete_model_output_message |
|
|
) |
|
|
|
|
|
|
|
|
    method_pitch.change(fn=lambda x: gr.update(visible=(x == "mangio-crepe")), inputs=method_pitch, outputs=hop_length)
|
|
batch_upload_btn.click(fn=(lambda : (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True))), inputs=None, outputs=[single_voice_file, batch_voice_file, single_output_group, batch_output_group, convert_btn, convert_batch_btn]) |
|
|
single_upload_btn.click(fn=(lambda : (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True))), inputs=None, outputs=[batch_voice_file, single_voice_file, batch_output_group, single_output_group, convert_batch_btn, convert_btn]) |
|
|
convert_btn.click(fn=process_audio, inputs=[input_voice_path, gr.State(None), template, model_name, index_rate, output_format, output_bitrate, stereo_mode, method_pitch, pitch, hop_length, filter_radius, rms, protect, f0_min, f0_max], outputs=converted_voice) |
|
|
convert_batch_btn.click(fn=process_audio, inputs=[gr.State(None), input_voice_path, template, model_name, index_rate, output_format, output_bitrate, stereo_mode, method_pitch, pitch, hop_length, filter_radius, rms, protect, f0_min, f0_max], outputs=converted_voices) |