diff --git a/main/app/app.py b/main/app/app.py
deleted file mode 100644
index cad92bf2082820970caad35dd8975389b1d9d378..0000000000000000000000000000000000000000
--- a/main/app/app.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import os
-import sys
-import torch
-import shutil
-import librosa
-import logging
-import requests
-import subprocess
-import numpy as np
-import gradio as gr
-import soundfile as sf
-from time import sleep
-from multiprocessing import cpu_count
-
-sys.path.append(os.getcwd())
-from main.app.tabs.inference.inference import inference_tabs
-from main.app.tabs.models.model import model_tabs
-from main.app.tabs.utils.utils import utils_tabs
-
-from main.tools import huggingface
-from main.configs.config import Config
-from main.app.based.utils import *
-
-with gr.Blocks(title="Ultimate RVC Maker ⚡", theme=theme) as app:
- gr.HTML("
Ultimate RVC Maker ⚡
")
- gr.Markdown(
- f"""
-        If you like this HF Space, you can give me a ❤️
-
- Try Ultimate RVC Maker WebUI using Colab [here](https://colab.research.google.com/github/TheNeodev/Notebook/blob/main/RVC-MAKER.ipynb)
- """
- )
- with gr.Tabs():
- with gr.TabItem("Inference"):
- inference_tabs()
- with gr.TabItem("Model Options"):
- model_tabs()
- with gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)):
- gr.Markdown(f"## {translations['separator_tab']}")
- with gr.Row():
- gr.Markdown(translations["4_part"])
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row(equal_height=True):
- cleaner = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True, min_width=140)
- backing = gr.Checkbox(label=translations["separator_backing"], value=False, interactive=True, min_width=140)
- reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True, min_width=140)
- backing_reverb = gr.Checkbox(label=translations["dereveb_backing"], value=False, interactive=False, min_width=140)
- denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False, min_width=140)
- with gr.Row(equal_height=True):
- separator_model = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True)
- separator_backing_model = gr.Dropdown(label=translations["separator_backing_model"], value="Version-1", choices=["Version-1", "Version-2"], interactive=True, visible=backing.value)
-
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row(equal_height=True):
- shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True)
- segment_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
- with gr.Row():
- mdx_batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=backing.value or reverb.value or separator_model.value in mdx_model)
- with gr.Column():
- with gr.Group():
- with gr.Row():
- overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
- with gr.Row():
- mdx_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=8192, value=1024, step=1, interactive=True, visible=backing.value or reverb.value or separator_model.value in mdx_model)
- with gr.Column():
- with gr.Row():
- clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner.value)
- sample_rate1 = gr.Slider(minimum=8000, maximum=96000, step=1, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True)
- input = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
- audio_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Column():
- with gr.Accordion(translations["use_url"], open=False):
- url = gr.Textbox(label=translations["url_audio"], value="", placeholder="https://www.youtube.com/...", scale=6)
- download_button = gr.Button(translations["downloads"])
- with gr.Column():
- with gr.Accordion(translations["input_output"], open=False):
- format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
- input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
- refesh_separator = gr.Button(translations["refesh"])
- output_separator = gr.Textbox(label=translations["output_folder"], value="audios", placeholder="audios", info=translations["output_folder_info"], interactive=True)
- separator_button = gr.Button(translations["separator_tab"], variant="primary")
- with gr.Row():
- gr.Markdown(translations["output_separator"])
- with gr.Row():
- instruments_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["instruments"])
- original_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["original_vocal"])
- main_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["main_vocal"], visible=backing.value)
- backing_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["backing_vocal"], visible=backing.value)
- with gr.Row():
- separator_model.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), visible(c not in mdx_model)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, shifts])
- backing.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), visible(a), visible(a), visible(a), valueFalse_interactive(a and b)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, separator_backing_model, main_vocals, backing_vocals, backing_reverb])
- reverb.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), valueFalse_interactive(a and b)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, backing_reverb])
- with gr.Row():
- input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[audio_input])
- cleaner.change(fn=visible, inputs=[cleaner], outputs=[clean_strength])
- with gr.Row():
- input.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("audios")), inputs=[input], outputs=[input_audio])
- refesh_separator.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
- with gr.Row():
- download_button.click(
- fn=download_url,
- inputs=[url],
- outputs=[input_audio, audio_input, url],
- api_name='download_url'
- )
- separator_button.click(
- fn=separator_music,
- inputs=[
- input_audio,
- output_separator,
- format,
- shifts,
- segment_size,
- overlap,
- cleaner,
- clean_strength,
- denoise,
- separator_model,
- separator_backing_model,
- backing,
- reverb,
- backing_reverb,
- mdx_hop_length,
- mdx_batch_size,
- sample_rate1
- ],
- outputs=[original_vocals, instruments_audio, main_vocals, backing_vocals],
- api_name='separator_music'
- )
- utils_tabs()
- with gr.TabItem(translations["settings"], visible=configs.get("settings_tab", True)):
- gr.Markdown(translations["settings_markdown"])
- with gr.Row():
- gr.Markdown(translations["settings_markdown_2"])
- with gr.Row():
- toggle_button = gr.Button(translations["change_light_dark"], variant="secondary", scale=2)
- with gr.Row():
- with gr.Column():
-                    language_dropdown = gr.Dropdown(label=translations["lang"], interactive=True, info=translations["lang_restart"], choices=configs.get("support_language", ["vi-VN"]), value=language)
- change_lang = gr.Button(translations["change_lang"], variant="primary", scale=2)
- with gr.Column():
-                    theme_dropdown = gr.Dropdown(label=translations["theme"], interactive=True, info=translations["theme_restart"], choices=configs.get("themes", [theme]), value=theme, allow_custom_value=True)
- changetheme = gr.Button(translations["theme_button"], variant="primary", scale=2)
- with gr.Row():
- with gr.Column():
- fp_choice = gr.Radio(choices=["fp16","fp32"], value="fp16" if configs.get("fp16", False) else "fp32", label=translations["precision"], info=translations["precision_info"], interactive=True)
- fp_button = gr.Button(translations["update_precision"], variant="secondary", scale=2)
- with gr.Column():
- font_choice = gr.Textbox(label=translations["font"], info=translations["font_info"], value=font, interactive=True)
- font_button = gr.Button(translations["change_font"])
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["stop"], open=False):
- separate_stop = gr.Button(translations["stop_separate"])
- convert_stop = gr.Button(translations["stop_convert"])
- create_dataset_stop = gr.Button(translations["stop_create_dataset"])
- audioldm2_stop = gr.Button(translations["stop_audioldm2"])
- with gr.Accordion(translations["stop_training"], open=False):
- model_name_stop = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
- preprocess_stop = gr.Button(translations["stop_preprocess"])
- extract_stop = gr.Button(translations["stop_extract"])
- train_stop = gr.Button(translations["stop_training"])
- with gr.Row():
- toggle_button.click(fn=None, js="() => {document.body.classList.toggle('dark')}")
- fp_button.click(fn=change_fp, inputs=[fp_choice], outputs=[fp_choice])
- with gr.Row():
- change_lang.click(fn=change_language, inputs=[language_dropdown], outputs=[])
- changetheme.click(fn=change_theme, inputs=[theme_dropdown], outputs=[])
- font_button.click(fn=change_font, inputs=[font_choice], outputs=[])
- with gr.Row():
- change_lang.click(fn=None, js="setTimeout(function() {location.reload()}, 15000)", inputs=[], outputs=[])
- changetheme.click(fn=None, js="setTimeout(function() {location.reload()}, 15000)", inputs=[], outputs=[])
- font_button.click(fn=None, js="setTimeout(function() {location.reload()}, 15000)", inputs=[], outputs=[])
- with gr.Row():
- separate_stop.click(fn=lambda: stop_pid("separate_pid", None, False), inputs=[], outputs=[])
- convert_stop.click(fn=lambda: stop_pid("convert_pid", None, False), inputs=[], outputs=[])
- create_dataset_stop.click(fn=lambda: stop_pid("create_dataset_pid", None, False), inputs=[], outputs=[])
- with gr.Row():
- preprocess_stop.click(fn=lambda model_name_stop: stop_pid("preprocess_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
- extract_stop.click(fn=lambda model_name_stop: stop_pid("extract_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
- train_stop.click(fn=lambda model_name_stop: stop_pid("train_pid", model_name_stop, True), inputs=[model_name_stop], outputs=[])
- with gr.Row():
- audioldm2_stop.click(fn=lambda: stop_pid("audioldm2_pid", None, False), inputs=[], outputs=[])
-
-
- with gr.Row():
- gr.Markdown(translations["terms_of_use"])
- gr.Markdown(translations["exemption"])
-
- logger.info(translations["start_app"])
- logger.info(translations["set_lang"].format(lang=language))
-
- port = configs.get("app_port", 7860)
-
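-# Launch with retries: an OSError means the port is taken, so step down one port
-# and try again, up to configs["num_of_restart"] attempts; any other error aborts.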
- for i in range(configs.get("num_of_restart", 5)):
- try:
- app.queue().launch(
- favicon_path=os.path.join("assets", "ico.png"),
- server_name=configs.get("server_name", "0.0.0.0"),
- server_port=port,
- show_error=configs.get("app_show_error", False),
- inbrowser="--open" in sys.argv,
- share="--share" in sys.argv,
- allowed_paths=allow_disk
- )
- break
- except OSError:
- logger.debug(translations["port"].format(port=port))
- port -= 1
- except Exception as e:
- logger.error(translations["error_occurred"].format(e=e))
- sys.exit(1)
\ No newline at end of file
diff --git a/main/app/based/utils.py b/main/app/based/utils.py
deleted file mode 100644
index 777b04aa856b6fd8046d08652ecda11447065594..0000000000000000000000000000000000000000
--- a/main/app/based/utils.py
+++ /dev/null
@@ -1,1534 +0,0 @@
-import os
-import re
-import ssl
-import sys
-import json
-import torch
-import codecs
-import shutil
-import asyncio
-import librosa
-import logging
-import datetime
-import platform
-import requests
-import warnings
-import threading
-import subprocess
-import logging.handlers
-
-import numpy as np
-import gradio as gr
-import pandas as pd
-import soundfile as sf
-
-from time import sleep
-from multiprocessing import cpu_count
-
-sys.path.append(os.getcwd())
-
-from main.tools import huggingface
-from main.configs.config import Config
-
-ssl._create_default_https_context = ssl._create_unverified_context
-logger = logging.getLogger(__name__)
-logger.propagate = False
-
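-# One console handler (INFO) plus a 5 MB rotating file handler (DEBUG, 3 backups) under assets/logs.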
-if logger.hasHandlers(): logger.handlers.clear()
-
-console_handler = logging.StreamHandler()
-console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-console_handler.setFormatter(console_formatter)
-console_handler.setLevel(logging.INFO)
-file_handler = logging.handlers.RotatingFileHandler(os.path.join("assets", "logs", "app.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
-file_handler.setFormatter(console_formatter)
-file_handler.setLevel(logging.DEBUG)
-logger.addHandler(console_handler)
-logger.addHandler(file_handler)
-logger.setLevel(logging.DEBUG)
-
-warnings.filterwarnings("ignore")
-for l in ["httpx", "gradio", "uvicorn", "httpcore", "urllib3"]:
- logging.getLogger(l).setLevel(logging.ERROR)
-
-config = Config()
-python = sys.executable
-translations = config.translations
-configs_json = os.path.join("main", "configs", "config.json")
-configs = json.load(open(configs_json, "r"))
-
-os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
-os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
-
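-# Half precision is only usable on CUDA here: on CPU/MPS the fp16 flag is forced
-# off and written back to config.json so the choice persists across restarts.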
-if config.device in ["cpu", "mps"] and configs.get("fp16", False):
- logger.warning(translations["fp16_not_support"])
- configs["fp16"] = config.is_half = False
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
-models, model_options = {}, {}
-
-method_f0 = ["mangio-crepe-full", "crepe-full", "fcpe", "rmvpe", "harvest", "pyin"]
-method_f0_full = ["pm", "dio", "mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "crepe-tiny", "crepe-small", "crepe-medium", "crepe-large", "crepe-full", "fcpe", "fcpe-legacy", "rmvpe", "rmvpe-legacy", "harvest", "yin", "pyin", "swipe"]
-
-embedders_mode = ["fairseq", "onnx", "transformers", "spin"]
-embedders_model = ["contentvec_base", "hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "custom"]
-
-paths_for_files = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk("audios") for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")])
-model_name = sorted(model for model in os.listdir(os.path.join("assets", "weights")) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_"))
-index_path = sorted(os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".index") and "trained" not in name)
-delete_index = sorted(os.path.join("assets", "logs", f) for f in os.listdir(os.path.join("assets", "logs")) if "mute" not in f and os.path.isdir(os.path.join("assets", "logs", f)))
-pretrainedD = [model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "D" in model]
-pretrainedG = [model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "G" in model]
-Allpretrained = [os.path.join("assets", "models", path, model) for path in ["pretrained_v1", "pretrained_v2", "pretrained_custom"] for model in os.listdir(os.path.join("assets", "models", path)) if model.endswith(".pth") and ("D" in model or "G" in model)]
-
-separate_model = sorted([os.path.join("assets", "models", "uvr5", models) for models in os.listdir(os.path.join("assets", "models", "uvr5")) if models.endswith((".th", ".yaml", ".onnx"))])
-presets_file = sorted(list(f for f in os.listdir(os.path.join("assets", "presets")) if f.endswith(".json")))
-f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(os.path.join("assets", "f0")) for f in files if f.endswith(".txt")])
-
-language = configs.get("language", "vi-VN")
-theme = configs.get("theme", "NoCrypt/miku")
-edgetts = configs.get("edge_tts", ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"])
-google_tts_voice = configs.get("google_tts_voice", ["vi", "en"])
-mdx_model = configs.get("mdx_model", "MDXNET_Main")
-uvr_model = configs.get("demucs_model", "HD_MMI") + configs.get("mdx_model", "MDXNET_Main")
-font = configs.get("font", "https://fonts.googleapis.com/css2?family=Courgette&display=swap")
-
-csv_path = os.path.join("assets", "spreadsheet.csv")
-logger.info(config.device)
-
-if "--allow_all_disk" in sys.argv:
- import win32api
-
- allow_disk = win32api.GetLogicalDriveStrings().split('\x00')[:-1]
-else: allow_disk = []
-
-if language == "vi-VN":
- import gradio.strings
- gradio.strings.en = {"RUNNING_LOCALLY": "* Chạy trên liên kết nội bộ: {}://{}:{}", "RUNNING_LOCALLY_SSR": "* Chạy trên liên kết nội bộ: {}://{}:{}, với SSR ⚡ (thử nghiệm, để tắt hãy dùng `ssr=False` trong `launch()`)", "SHARE_LINK_DISPLAY": "* Chạy trên liên kết công khai: {}", "COULD_NOT_GET_SHARE_LINK": "\nKhông thể tạo liên kết công khai. Vui lòng kiểm tra kết nối mạng của bạn hoặc trang trạng thái của chúng tôi: https://status.gradio.app.", "COULD_NOT_GET_SHARE_LINK_MISSING_FILE": "\nKhông thể tạo liên kết công khai. Thiếu tập tin: {}. \n\nVui lòng kiểm tra kết nối internet của bạn. Điều này có thể xảy ra nếu phần mềm chống vi-rút của bạn chặn việc tải xuống tệp này. Bạn có thể cài đặt thủ công bằng cách làm theo các bước sau: \n\n1. Tải xuống tệp này: {}\n2. Đổi tên tệp đã tải xuống thành: {}\n3. Di chuyển tệp đến vị trí này: {}", "COLAB_NO_LOCAL": "Không thể hiển thị giao diện nội bộ trên google colab, liên kết công khai đã được tạo.", "PUBLIC_SHARE_TRUE": "\nĐể tạo một liên kết công khai, hãy đặt `share=True` trong `launch()`.", "MODEL_PUBLICLY_AVAILABLE_URL": "Mô hình được cung cấp công khai tại: {} (có thể mất tới một phút để sử dụng được liên kết)", "GENERATING_PUBLIC_LINK": "Đang tạo liên kết công khai (có thể mất vài giây...):", "BETA_INVITE": "\nCảm ơn bạn đã là người dùng Gradio! Nếu bạn có thắc mắc hoặc phản hồi, vui lòng tham gia máy chủ Discord của chúng tôi và trò chuyện với chúng tôi: https://discord.gg/feTf9x3ZSB", "COLAB_DEBUG_TRUE": "Đã phát hiện thấy sổ tay Colab. Ô này sẽ chạy vô thời hạn để bạn có thể xem lỗi và nhật ký. " "Để tắt, hãy đặt debug=False trong launch().", "COLAB_DEBUG_FALSE": "Đã phát hiện thấy sổ tay Colab. Để hiển thị lỗi trong sổ ghi chép colab, hãy đặt debug=True trong launch()", "COLAB_WARNING": "Lưu ý: việc mở Chrome Inspector có thể làm hỏng bản demo trong sổ tay Colab.", "SHARE_LINK_MESSAGE": "\nLiên kết công khai sẽ hết hạn sau 72 giờ. Để nâng cấp GPU và lưu trữ vĩnh viễn miễn phí, hãy chạy `gradio deploy` từ terminal trong thư mục làm việc để triển khai lên huggingface (https://huggingface.co/spaces)", "INLINE_DISPLAY_BELOW": "Đang tải giao diện bên dưới...", "COULD_NOT_GET_SHARE_LINK_CHECKSUM": "\nKhông thể tạo liên kết công khai. Tổng kiểm tra không khớp cho tập tin: {}."}
-
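-# The model spreadsheet URL is stored rot13-obfuscated; codecs.decode(..., "rot13")
-# recovers it at runtime, and the sheet is cached locally as assets/spreadsheet.csv.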
-if os.path.exists(csv_path): cached_data = pd.read_csv(csv_path)
-else:
- cached_data = pd.read_csv(codecs.decode("uggcf://qbpf.tbbtyr.pbz/fcernqfurrgf/q/1gNHnDeRULtEfz1Yieaw14USUQjWJy0Oq9k0DrCrjApb/rkcbeg?sbezng=pfi&tvq=1977693859", "rot13"))
- cached_data.to_csv(csv_path, index=False)
-
-for _, row in cached_data.iterrows():
- filename = row['Filename']
- url = None
-
- for value in row.values:
- if isinstance(value, str) and "huggingface" in value:
- url = value
- break
-
- if url: models[filename] = url
-
-
-
-def gr_info(message):
- gr.Info(message, duration=2)
- logger.info(message)
-
-def gr_warning(message):
- gr.Warning(message, duration=2)
- logger.warning(message)
-
-def gr_error(message):
- gr.Error(message=message, duration=6)
- logger.error(message)
-
-def get_gpu_info():
- ngpu = torch.cuda.device_count()
-    gpu_infos = [f"{i}: {torch.cuda.get_device_name(i)} ({int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)} GB)" for i in range(ngpu)]
- return "\n".join(gpu_infos) if len(gpu_infos) > 0 else translations["no_support_gpu"]
-
-def change_f0_choices():
- f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(os.path.join("assets", "f0")) for f in files if f.endswith(".txt")])
- return {"value": f0_file[0] if len(f0_file) >= 1 else "", "choices": f0_file, "__type__": "update"}
-
-def change_audios_choices(input_audio):
- audios = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk("audios") for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")])
- return {"value": input_audio if input_audio != "" else (audios[0] if len(audios) >= 1 else ""), "choices": audios, "__type__": "update"}
-
-def change_separate_choices():
- return [{"choices": sorted([os.path.join("assets", "models", "uvr5", models) for models in os.listdir(os.path.join("assets", "models", "uvr5")) if model.endswith((".th", ".yaml", ".onnx"))]), "__type__": "update"}]
-
-def change_models_choices():
- model, index = sorted(list(model for model in os.listdir(os.path.join("assets", "weights")) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_"))), sorted([os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".index") and "trained" not in name])
- return [{"value": model[0] if len(model) >= 1 else "", "choices": model, "__type__": "update"}, {"value": index[0] if len(index) >= 1 else "", "choices": index, "__type__": "update"}]
-
-def change_allpretrained_choices():
- return [{"choices": sorted([os.path.join("assets", "models", path, model) for path in ["pretrained_v1", "pretrained_v2", "pretrained_custom"] for model in os.listdir(os.path.join("assets", "models", path)) if model.endswith(".pth") and ("D" in model or "G" in model)]), "__type__": "update"}]
-
-def change_pretrained_choices():
- return [{"choices": sorted([model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "D" in model]), "__type__": "update"}, {"choices": sorted([model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "G" in model]), "__type__": "update"}]
-
-def change_choices_del():
- return [{"choices": sorted(list(model for model in os.listdir(os.path.join("assets", "weights")) if model.endswith(".pth") and not model.startswith("G_") and not model.startswith("D_"))), "__type__": "update"}, {"choices": sorted([os.path.join("assets", "logs", f) for f in os.listdir(os.path.join("assets", "logs")) if "mute" not in f and os.path.isdir(os.path.join("assets", "logs", f))]), "__type__": "update"}]
-
-def change_preset_choices():
- return {"value": "", "choices": sorted(list(f for f in os.listdir(os.path.join("assets", "presets")) if f.endswith(".json"))), "__type__": "update"}
-
-def change_tts_voice_choices(google):
- return {"choices": google_tts_voice if google else edgetts, "value": google_tts_voice[0] if google else edgetts[0], "__type__": "update"}
-
-def change_backing_choices(backing, merge):
-    if backing or merge: return {"value": False, "interactive": False, "__type__": "update"}
-    return {"interactive": True, "__type__": "update"}
-
-def change_download_choices(select):
- selects = [False]*10
-
- if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True
- elif select == translations["download_from_csv"]: selects[3] = selects[4] = True
- elif select == translations["search_models"]: selects[5] = selects[6] = True
- elif select == translations["upload"]: selects[9] = True
- else: gr_warning(translations["option_not_valid"])
-
- return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))]
-
-def change_download_pretrained_choices(select):
- selects = [False]*8
-
- if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True
- elif select == translations["list_model"]: selects[3] = selects[4] = selects[5] = True
- elif select == translations["upload"]: selects[6] = selects[7] = True
- else: gr_warning(translations["option_not_valid"])
-
- return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))]
-
-def get_index(model):
- model = os.path.basename(model).split("_")[0]
- return {"value": next((f for f in [os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".index") and "trained" not in name] if model.split(".")[0] in f), ""), "__type__": "update"} if model else None
-
-def index_strength_show(index):
- return {"visible": index != "" and os.path.exists(index), "value": 0.5, "__type__": "update"}
-
-def hoplength_show(method, hybrid_method=None):
-    show_hop_length_method = ["mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "fcpe", "fcpe-legacy", "yin", "pyin"]
-    visible = False
-
-    if method in show_hop_length_method: visible = True
-    elif method == "hybrid":
-        methods_str = re.search(r"hybrid\[(.+)\]", hybrid_method or "")
-        if methods_str: visible = any(m.strip() in show_hop_length_method for m in methods_str.group(1).split("+"))
-
-    return {"visible": visible, "__type__": "update"}
-
-def visible(value):
- return {"visible": value, "__type__": "update"}
-
-def valueFalse_interactive(inp):
- return {"value": False, "interactive": inp, "__type__": "update"}
-
-def valueEmpty_visible1(inp1):
- return {"value": "", "visible": inp1, "__type__": "update"}
-
-def process_input(file_path):
- file_contents = ""
-
- if not file_path.endswith(".srt"):
- with open(file_path, "r", encoding="utf-8") as file:
- file_contents = file.read()
-
- gr_info(translations["upload_success"].format(name=translations["text"]))
- return file_contents
-
-def fetch_pretrained_data():
- response = requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/wfba/phfgbz_cergenvarq.wfba", "rot13"))
- response.raise_for_status()
-
- return response.json()
-
-def update_sample_rate_dropdown(model):
- data = fetch_pretrained_data()
- if model != translations["success"]: return {"choices": list(data[model].keys()), "value": list(data[model].keys())[0], "__type__": "update"}
-
-def if_done(done, p):
-    while p.poll() is None: sleep(0.5)
-
- done[0] = True
-
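-# Restart helper: shows a ~15 s notice, closes the running Blocks app, then
-# relaunches app.py with the original CLI arguments (used after config changes).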
-def restart_app():
- global app
-
- gr_info(translations["15s"])
- os.system("cls" if platform.system() == "Windows" else "clear")
-
- app.close()
- subprocess.run([python, os.path.join("main", "app", "app.py")] + sys.argv[1:])
-
-def change_language(lang):
- configs = json.load(open(configs_json, "r"))
- configs["language"] = lang
-
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
- restart_app()
-
-def change_theme(theme):
- with open(configs_json, "r") as f:
- configs = json.load(f)
-
- configs["theme"] = theme
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
- restart_app()
-
-def change_font(font):
- with open(configs_json, "r") as f:
- configs = json.load(f)
-
- configs["font"] = font
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
- restart_app()
-
-def zip_file(name, pth, index):
- pth_path = os.path.join("assets", "weights", pth)
- if not pth or not os.path.exists(pth_path) or not pth.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
- zip_file_path = os.path.join("assets", "logs", name, name + ".zip")
- gr_info(translations["start"].format(start=translations["zip"]))
-
- import zipfile
- with zipfile.ZipFile(zip_file_path, 'w') as zipf:
- zipf.write(pth_path, os.path.basename(pth_path))
- if index: zipf.write(index, os.path.basename(index))
-
- gr_info(translations["success"])
- return {"visible": True, "value": zip_file_path, "__type__": "update"}
-
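-# Paginated scrape: POST {"page": n, "search": ...} until the server returns an
-# empty table, collecting the raw HTML fragments for search_models() to parse.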
-def fetch_models_data(search):
- all_table_data = []
- page = 1
-
- while 1:
- try:
- response = requests.post(url=codecs.decode("uggcf://ibvpr-zbqryf.pbz/srgpu_qngn.cuc", "rot13"), data={"page": page, "search": search})
-
- if response.status_code == 200:
- table_data = response.json().get("table", "")
- if not table_data.strip(): break
- all_table_data.append(table_data)
- page += 1
- else:
- logger.debug(f"{translations['code_error']} {response.status_code}")
- break
- except json.JSONDecodeError:
- logger.debug(translations["json_error"])
- break
- except requests.RequestException as e:
- logger.debug(translations["requests_error"].format(e=e))
- break
- return all_table_data
-
-def search_models(name):
- gr_info(translations["start"].format(start=translations["search"]))
- tables = fetch_models_data(name)
-
- if len(tables) == 0:
- gr_info(translations["not_found"].format(name=name))
- return [None]*2
- else:
- model_options.clear()
-
- from bs4 import BeautifulSoup
-
- for table in tables:
- for row in BeautifulSoup(table, "html.parser").select("tr"):
-                name_tag, url_tag = row.find("a", {"class": "fs-5"}), row.find("a", {"class": "btn btn-sm fw-bold btn-light ms-0 p-1 ps-2 pe-2"})
-                if not name_tag or not url_tag: continue
-
-                url = url_tag["href"].replace("https://easyaivoice.com/run?url=", "")
-                if "huggingface" in url: model_options[name_tag.text.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", "").replace(" ", "_").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip()] = url
-
- gr_info(translations["found"].format(results=len(model_options)))
- return [{"value": "", "choices": model_options, "interactive": True, "visible": True, "__type__": "update"}, {"value": translations["downloads"], "visible": True, "__type__": "update"}]
-
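-# Sorts an extracted download: .index files go to assets/logs/<model_name>/ and
-# .pth/.onnx weights (minus G_/D_ training checkpoints) go to assets/weights/.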
-def move_files_from_directory(src_dir, dest_weights, dest_logs, model_name):
- for root, _, files in os.walk(src_dir):
- for file in files:
- file_path = os.path.join(root, file)
- if file.endswith(".index"):
- model_log_dir = os.path.join(dest_logs, model_name)
- os.makedirs(model_log_dir, exist_ok=True)
-
- filepath = os.path.join(model_log_dir, file.replace(' ', '_').replace('(', '').replace(')', '').replace('[', '').replace(']', '').replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip())
- if os.path.exists(filepath): os.remove(filepath)
-
- shutil.move(file_path, filepath)
- elif file.endswith(".pth") and not file.startswith("D_") and not file.startswith("G_"):
- pth_path = os.path.join(dest_weights, model_name + ".pth")
- if os.path.exists(pth_path): os.remove(pth_path)
-
- shutil.move(file_path, pth_path)
- elif file.endswith(".onnx") and not file.startswith("D_") and not file.startswith("G_"):
- pth_path = os.path.join(dest_weights, model_name + ".onnx")
- if os.path.exists(pth_path): os.remove(pth_path)
-
- shutil.move(file_path, pth_path)
-
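-# yt-dlp fetches the best audio stream and an FFmpeg post-processor converts it
-# to WAV; the video title is stripped of punctuation and spaces become dashes.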
-def download_url(url):
- import yt_dlp
-
- if not url: return gr_warning(translations["provide_url"])
- if not os.path.exists("audios"): os.makedirs("audios", exist_ok=True)
-
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore")
- ydl_opts = {"format": "bestaudio/best", "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav", "preferredquality": "192"}], "quiet": True, "no_warnings": True, "noplaylist": True, "verbose": False}
-
- gr_info(translations["start"].format(start=translations["download_music"]))
-
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
- audio_output = os.path.join("audios", re.sub(r'\s+', '-', re.sub(r'[^\w\s\u4e00-\u9fff\uac00-\ud7af\u0400-\u04FF\u1100-\u11FF]', '', ydl.extract_info(url, download=False).get('title', 'video')).strip()))
- if os.path.exists(audio_output): shutil.rmtree(audio_output, ignore_errors=True)
-
- ydl_opts['outtmpl'] = audio_output
-
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
- audio_output = audio_output + ".wav"
- if os.path.exists(audio_output): os.remove(audio_output)
-
- ydl.download([url])
-
- gr_info(translations["success"])
- return [audio_output, audio_output, translations["success"]]
-
-def download_model(url=None, model=None):
- if not url: return gr_warning(translations["provide_url"])
- if not model: return gr_warning(translations["provide_name_is_save"])
-
- model = model.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", "").replace(" ", "_").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip()
- url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip()
-
- download_dir = os.path.join("download_model")
- weights_dir = os.path.join("assets", "weights")
- logs_dir = os.path.join("assets", "logs")
-
- if not os.path.exists(download_dir): os.makedirs(download_dir, exist_ok=True)
- if not os.path.exists(weights_dir): os.makedirs(weights_dir, exist_ok=True)
- if not os.path.exists(logs_dir): os.makedirs(logs_dir, exist_ok=True)
-
- try:
- gr_info(translations["start"].format(start=translations["download"]))
-
- if url.endswith(".pth"): huggingface.HF_download_file(url, os.path.join(weights_dir, f"{model}.pth"))
- elif url.endswith(".onnx"): huggingface.HF_download_file(url, os.path.join(weights_dir, f"{model}.onnx"))
- elif url.endswith(".index"):
- model_log_dir = os.path.join(logs_dir, model)
- os.makedirs(model_log_dir, exist_ok=True)
-
- huggingface.HF_download_file(url, os.path.join(model_log_dir, f"{model}.index"))
- elif url.endswith(".zip"):
- output_path = huggingface.HF_download_file(url, os.path.join(download_dir, model + ".zip"))
- shutil.unpack_archive(output_path, download_dir)
-
- move_files_from_directory(download_dir, weights_dir, logs_dir, model)
- else:
- if "drive.google.com" in url or "drive.usercontent.google.com" in url:
- file_id = None
-
- from main.tools import gdown
-
- if "/file/d/" in url: file_id = url.split("/d/")[1].split("/")[0]
- elif "open?id=" in url: file_id = url.split("open?id=")[1].split("/")[0]
- elif "/download?id=" in url: file_id = url.split("/download?id=")[1].split("&")[0]
-
- if file_id:
- file = gdown.gdown_download(id=file_id, output=download_dir)
- if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
-
- move_files_from_directory(download_dir, weights_dir, logs_dir, model)
- elif "mega.nz" in url:
- from main.tools import meganz
-
- meganz.mega_download_url(url, download_dir)
-
-                file_download = next((f for f in os.listdir(download_dir)), None)
-                if file_download and file_download.endswith(".zip"): shutil.unpack_archive(os.path.join(download_dir, file_download), download_dir)
-
- move_files_from_directory(download_dir, weights_dir, logs_dir, model)
- elif "mediafire.com" in url:
- from main.tools import mediafire
-
- file = mediafire.Mediafire_Download(url, download_dir)
- if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
-
- move_files_from_directory(download_dir, weights_dir, logs_dir, model)
- elif "pixeldrain.com" in url:
- from main.tools import pixeldrain
-
- file = pixeldrain.pixeldrain(url, download_dir)
- if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
-
- move_files_from_directory(download_dir, weights_dir, logs_dir, model)
- else:
- gr_warning(translations["not_support_url"])
- return translations["not_support_url"]
-
- gr_info(translations["success"])
- return translations["success"]
- except Exception as e:
- gr_error(message=translations["error_occurred"].format(e=e))
- logger.debug(e)
- return translations["error_occurred"].format(e=e)
- finally:
- shutil.rmtree(download_dir, ignore_errors=True)
-
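-# Dropped files are staged in save_model_temp, unpacked if zipped, routed to the
-# weights/logs folders, and the staging folder is always removed in finally.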
-def save_drop_model(dropbox):
- weight_folder = os.path.join("assets", "weights")
- logs_folder = os.path.join("assets", "logs")
- save_model_temp = os.path.join("save_model_temp")
-
- if not os.path.exists(weight_folder): os.makedirs(weight_folder, exist_ok=True)
- if not os.path.exists(logs_folder): os.makedirs(logs_folder, exist_ok=True)
- if not os.path.exists(save_model_temp): os.makedirs(save_model_temp, exist_ok=True)
-
- shutil.move(dropbox, save_model_temp)
-
- try:
- file_name = os.path.basename(dropbox)
-
-        if not file_name.endswith((".pth", ".onnx", ".index", ".zip")): gr_warning(translations["not_model"])
- else:
- if file_name.endswith(".zip"):
- shutil.unpack_archive(os.path.join(save_model_temp, file_name), save_model_temp)
- move_files_from_directory(save_model_temp, weight_folder, logs_folder, file_name.replace(".zip", ""))
- elif file_name.endswith((".pth", ".onnx")):
- output_file = os.path.join(weight_folder, file_name)
- if os.path.exists(output_file): os.remove(output_file)
-
- shutil.move(os.path.join(save_model_temp, file_name), output_file)
- elif file_name.endswith(".index"):
- def extract_name_model(filename):
- match = re.search(r"([A-Za-z]+)(?=_v|\.|$)", filename)
- return match.group(1) if match else None
-
- model_logs = os.path.join(logs_folder, extract_name_model(file_name))
- if not os.path.exists(model_logs): os.makedirs(model_logs, exist_ok=True)
- shutil.move(os.path.join(save_model_temp, file_name), model_logs)
- else:
- gr_warning(translations["unable_analyze_model"])
- return None
-
- gr_info(translations["upload_success"].format(name=translations["model"]))
- return None
- except Exception as e:
- gr_error(message=translations["error_occurred"].format(e=e))
- logger.debug(e)
- return None
- finally:
- shutil.rmtree(save_model_temp, ignore_errors=True)
-
-def download_pretrained_model(choices, model, sample_rate):
- pretraineds_custom_path = os.path.join("assets", "models", "pretrained_custom")
- if choices == translations["list_model"]:
- paths = fetch_pretrained_data()[model][sample_rate]
-
- if not os.path.exists(pretraineds_custom_path): os.makedirs(pretraineds_custom_path, exist_ok=True)
- url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_phfgbz/", "rot13") + paths
-
- gr_info(translations["download_pretrain"])
- file = huggingface.HF_download_file(url.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), os.path.join(pretraineds_custom_path, paths))
-
- if file.endswith(".zip"):
- shutil.unpack_archive(file, pretraineds_custom_path)
- os.remove(file)
-
- gr_info(translations["success"])
- return translations["success"]
- elif choices == translations["download_url"]:
- if not model: return gr_warning(translations["provide_pretrain"].format(dg="D"))
- if not sample_rate: return gr_warning(translations["provide_pretrain"].format(dg="G"))
-
- gr_info(translations["download_pretrain"])
-
- huggingface.HF_download_file(model.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), pretraineds_custom_path)
- huggingface.HF_download_file(sample_rate.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), pretraineds_custom_path)
-
- gr_info(translations["success"])
- return translations["success"]
-
-def fushion_model_pth(name, pth_1, pth_2, ratio):
- if not name.endswith(".pth"): name = name + ".pth"
-
- if not pth_1 or not os.path.exists(pth_1) or not pth_1.endswith(".pth"):
- gr_warning(translations["provide_file"].format(filename=translations["model"] + " 1"))
- return [translations["provide_file"].format(filename=translations["model"] + " 1"), None]
-
- if not pth_2 or not os.path.exists(pth_2) or not pth_2.endswith(".pth"):
- gr_warning(translations["provide_file"].format(filename=translations["model"] + " 2"))
- return [translations["provide_file"].format(filename=translations["model"] + " 2"), None]
-
- from collections import OrderedDict
-
- def extract(ckpt):
- a = ckpt["model"]
- opt = OrderedDict()
- opt["weight"] = {}
-
- for key in a.keys():
- if "enc_q" in key: continue
-
- opt["weight"][key] = a[key]
-
- return opt
-
- try:
- ckpt1 = torch.load(pth_1, map_location="cpu")
- ckpt2 = torch.load(pth_2, map_location="cpu")
-
- if ckpt1["sr"] != ckpt2["sr"]:
- gr_warning(translations["sr_not_same"])
- return [translations["sr_not_same"], None]
-
- cfg = ckpt1["config"]
- cfg_f0 = ckpt1["f0"]
- cfg_version = ckpt1["version"]
- cfg_sr = ckpt1["sr"]
-
- vocoder = ckpt1.get("vocoder", "Default")
-
- ckpt1 = extract(ckpt1) if "model" in ckpt1 else ckpt1["weight"]
- ckpt2 = extract(ckpt2) if "model" in ckpt2 else ckpt2["weight"]
-
- if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
- gr_warning(translations["architectures_not_same"])
- return [translations["architectures_not_same"], None]
-
- gr_info(translations["start"].format(start=translations["fushion_model"]))
-
- opt = OrderedDict()
- opt["weight"] = {}
-
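-        # Linear interpolation of each tensor: ratio weights model 1, (1 - ratio)
-        # weights model 2; mismatched emb_g.weight rows are truncated to the smaller size first.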
- for key in ckpt1.keys():
- if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
- min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
- opt["weight"][key] = (ratio * (ckpt1[key][:min_shape0].float()) + (1 - ratio) * (ckpt2[key][:min_shape0].float())).half()
- else: opt["weight"][key] = (ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())).half()
-
- opt["config"] = cfg
- opt["sr"] = cfg_sr
- opt["f0"] = cfg_f0
- opt["version"] = cfg_version
- opt["infos"] = translations["model_fushion_info"].format(name=name, pth_1=pth_1, pth_2=pth_2, ratio=ratio)
- opt["vocoder"] = vocoder
-
- output_model = os.path.join("assets", "weights")
- if not os.path.exists(output_model): os.makedirs(output_model, exist_ok=True)
-
- torch.save(opt, os.path.join(output_model, name))
-
- gr_info(translations["success"])
- return [translations["success"], os.path.join(output_model, name)]
- except Exception as e:
- gr_error(message=translations["error_occurred"].format(e=e))
- logger.debug(e)
- return [e, None]
-
-def fushion_model(name, path_1, path_2, ratio):
- if not name:
- gr_warning(translations["provide_name_is_save"])
- return [translations["provide_name_is_save"], None]
-
- if path_1.endswith(".pth") and path_2.endswith(".pth"): return fushion_model_pth(name.replace(".onnx", ".pth"), path_1, path_2, ratio)
- else:
- gr_warning(translations["format_not_valid"])
- return [None, None]
-
-def onnx_export(model_path):
- from main.library.algorithm.onnx_export import onnx_exporter
-
-    if not model_path.endswith(".pth"): model_path += ".pth"
- if not model_path or not os.path.exists(model_path) or not model_path.endswith(".pth"):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- return [None, translations["provide_file"].format(filename=translations["model"])]
-
- try:
- gr_info(translations["start_onnx_export"])
- output = onnx_exporter(model_path, model_path.replace(".pth", ".onnx"), is_half=config.is_half, device=config.device)
-
- gr_info(translations["success"])
- return [output, translations["success"]]
- except Exception as e:
- return [None, e]
-
-def model_info(path):
- if not path or not os.path.exists(path) or os.path.isdir(path) or not path.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
- def prettify_date(date_str):
- if date_str == translations["not_found_create_time"]: return None
-
- try:
- return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f").strftime("%Y-%m-%d %H:%M:%S")
- except ValueError as e:
- logger.debug(e)
- return translations["format_not_valid"]
-
- if path.endswith(".pth"): model_data = torch.load(path, map_location=torch.device("cpu"))
- else:
- import onnx
-
- model = onnx.load(path)
- model_data = None
-
- for prop in model.metadata_props:
- if prop.key == "model_info":
- model_data = json.loads(prop.value)
- break
-
- gr_info(translations["read_info"])
-
- epochs = model_data.get("epoch", None)
-    if epochs is None:
-        epochs = model_data.get("info", None)
-        if epochs is None or not str(epochs).replace("epoch", "").replace("e", "").strip().isdigit(): epochs = translations["not_found"].format(name=translations["epoch"])
-
- steps = model_data.get("step", translations["not_found"].format(name=translations["step"]))
- sr = model_data.get("sr", translations["not_found"].format(name=translations["sr"]))
- f0 = model_data.get("f0", translations["not_found"].format(name=translations["f0"]))
- version = model_data.get("version", translations["not_found"].format(name=translations["version"]))
- creation_date = model_data.get("creation_date", translations["not_found_create_time"])
- model_hash = model_data.get("model_hash", translations["not_found"].format(name="model_hash"))
- pitch_guidance = translations["trained_f0"] if f0 else translations["not_f0"]
- creation_date_str = prettify_date(creation_date) if creation_date else translations["not_found_create_time"]
- model_name = model_data.get("model_name", translations["unregistered"])
- model_author = model_data.get("author", translations["not_author"])
- vocoder = model_data.get("vocoder", "Default")
-
- gr_info(translations["success"])
- return translations["model_info"].format(model_name=model_name, model_author=model_author, epochs=epochs, steps=steps, version=version, sr=sr, pitch_guidance=pitch_guidance, model_hash=model_hash, creation_date_str=creation_date_str, vocoder=vocoder)
-
-def audio_effects(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input):
- if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path):
- gr_warning(translations["input_not_valid"])
- return None
-
- if not output_path:
- gr_warning(translations["output_not_valid"])
- return None
-
- if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_effects.{export_format}")
-    output_dir = os.path.dirname(output_path) or "."
-
- if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
- if os.path.exists(output_path): os.remove(output_path)
-
- gr_info(translations["start"].format(start=translations["apply_effect"]))
- subprocess.run([python, "main/inference/audio_effects.py", "--input_path", input_path, "--output_path", output_path, "--resample", str(resample), "--resample_sr", str(resample_sr), "--chorus_depth", str(chorus_depth), "--chorus_rate", str(chorus_rate), "--chorus_mix", str(chorus_mix), "--chorus_delay", str(chorus_delay), "--chorus_feedback", str(chorus_feedback), "--drive_db", str(distortion_drive), "--reverb_room_size", str(reverb_room_size), "--reverb_damping", str(reverb_damping), "--reverb_wet_level", str(reverb_wet_level), "--reverb_dry_level", str(reverb_dry_level), "--reverb_width", str(reverb_width), "--reverb_freeze_mode", str(reverb_freeze_mode), "--pitch_shift", str(pitch_shift), "--delay_seconds", str(delay_seconds), "--delay_feedback", str(delay_feedback), "--delay_mix", str(delay_mix), "--compressor_threshold", str(compressor_threshold), "--compressor_ratio", str(compressor_ratio), "--compressor_attack_ms", str(compressor_attack_ms), "--compressor_release_ms", str(compressor_release_ms), "--limiter_threshold", str(limiter_threshold), "--limiter_release", str(limiter_release), "--gain_db", str(gain_db), "--bitcrush_bit_depth", str(bitcrush_bit_depth), "--clipping_threshold", str(clipping_threshold), "--phaser_rate_hz", str(phaser_rate_hz), "--phaser_depth", str(phaser_depth), "--phaser_centre_frequency_hz", str(phaser_centre_frequency_hz), "--phaser_feedback", str(phaser_feedback), "--phaser_mix", str(phaser_mix), "--bass_boost_db", str(bass_boost_db), "--bass_boost_frequency", str(bass_boost_frequency), "--treble_boost_db", str(treble_boost_db), "--treble_boost_frequency", str(treble_boost_frequency), "--fade_in_duration", str(fade_in_duration), "--fade_out_duration", str(fade_out_duration), "--export_format", export_format, "--chorus", str(chorus), "--distortion", str(distortion), "--reverb", str(reverb), "--pitchshift", str(pitch_shift != 0), "--delay", str(delay), "--compressor", str(compressor), "--limiter", str(limiter), "--gain", str(gain), "--bitcrush", str(bitcrush), "--clipping", str(clipping), "--phaser", str(phaser), "--treble_bass_boost", str(treble_bass_boost), "--fade_in_out", str(fade_in_out), "--audio_combination", str(audio_combination), "--audio_combination_input", audio_combination_input])
-
- gr_info(translations["success"])
- return output_path.replace("wav", export_format)
-
-def synthesize_tts(prompt, voice, speed, output, pitch, google):
- if not google:
- from edge_tts import Communicate
-
- asyncio.run(Communicate(text=prompt, voice=voice, rate=f"+{speed}%" if speed >= 0 else f"{speed}%", pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz").save(output))
- else:
- response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"), params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"}, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"})
-
- if response.status_code == 200:
- with open(output, "wb") as f:
- f.write(response.content)
-
- if pitch != 0 or speed != 0:
- y, sr = librosa.load(output, sr=None)
-
- if pitch != 0: y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
- if speed != 0: y = librosa.effects.time_stretch(y, rate=speed)
-
- sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''))
- else: gr_error(f"{response.status_code}, {response.text}")
-
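-# Stretch/compress a clip to an exact duration: rate = source_len / target_len,
-# then pad with silence or truncate so the sample count matches exactly.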
-def time_stretch(y, sr, target_duration):
- rate = (len(y) / sr) / target_duration
- if rate != 1.0: y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate)
-
- n_target = int(round(target_duration * sr))
- return np.pad(y, (0, n_target - len(y))) if len(y) < n_target else y[:n_target]
-
-def pysrttime_to_seconds(t):
- return (t.hours * 60 + t.minutes) * 60 + t.seconds + t.milliseconds / 1000
-
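-# SRT-driven TTS: synthesize each subtitle, resample it to a common rate,
-# time-stretch it to the cue's duration, and mix it in at the cue's start offset.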
-def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False):
- import pysrt
- import tempfile
-
- subs = pysrt.open(srt_file)
- if not subs: raise ValueError(translations["srt"])
-
- final_audio = np.zeros(int(round(pysrttime_to_seconds(subs[-1].end) * sr)), dtype=np.float32)
-
- with tempfile.TemporaryDirectory() as tempdir:
- for idx, seg in enumerate(subs):
- wav_path = os.path.join(tempdir, f"seg_{idx}.wav")
- synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, wav_path, rate, google)
-
- audio, file_sr = sf.read(wav_path, dtype=np.float32)
- if file_sr != sr: audio = np.interp(np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)), np.arange(len(audio)), audio)
- adjusted = time_stretch(audio, sr, pysrttime_to_seconds(seg.duration))
-
- start_sample = int(round(pysrttime_to_seconds(seg.start) * sr))
- end_sample = start_sample + adjusted.shape[0]
-
- if end_sample > final_audio.shape[0]:
- adjusted = adjusted[: final_audio.shape[0] - start_sample]
- end_sample = final_audio.shape[0]
-
- final_audio[start_sample:end_sample] += adjusted
-
- sf.write(out_file, final_audio, sr)
-
-def TTS(prompt, voice, speed, output, pitch, google, srt_input):
- if not srt_input: srt_input = ""
-
- if not prompt and not srt_input.endswith(".srt"):
- gr_warning(translations["enter_the_text"])
- return None
-
- if not voice:
- gr_warning(translations["choose_voice"])
- return None
-
- if not output:
- gr_warning(translations["output_not_valid"])
- return None
-
-    if os.path.isdir(output): output = os.path.join(output, "tts.wav")
- gr_info(translations["convert"].format(name=translations["text"]))
-
-    output_dir = os.path.dirname(output) or "."
- if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-
- if srt_input.endswith(".srt"): srt_tts(srt_input, output, voice, 0, 24000, google)
- else: synthesize_tts(prompt, voice, speed, output, pitch, google)
-
- gr_info(translations["success"])
- return output
-
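-# Separation runs in a subprocess (main/inference/separator_music.py) so the UI
-# stays responsive; the Settings tab exposes stop buttons (stop_pid) for these jobs.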
-def separator_music(input, output_audio, format, shifts, segments_size, overlap, clean_audio, clean_strength, denoise, separator_model, kara_model, backing, reverb, backing_reverb, hop_length, batch_size, sample_rate):
- output = os.path.dirname(output_audio) or output_audio
-
- if not input or not os.path.exists(input) or os.path.isdir(input):
- gr_warning(translations["input_not_valid"])
- return [None]*4
-
-    if not output:
-        gr_warning(translations["output_not_valid"])
-        return [None]*4
-
-    if not os.path.exists(output): os.makedirs(output, exist_ok=True)
- gr_info(translations["start"].format(start=translations["separator_music"]))
-
- subprocess.run([python, "main/inference/separator_music.py", "--input_path", input, "--output_path", output, "--format", format, "--shifts", str(shifts), "--segments_size", str(segments_size), "--overlap", str(overlap), "--mdx_hop_length", str(hop_length), "--mdx_batch_size", str(batch_size), "--clean_audio", str(clean_audio), "--clean_strength", str(clean_strength), "--kara_model", kara_model, "--backing", str(backing), "--mdx_denoise", str(denoise), "--reverb", str(reverb), "--backing_reverb", str(backing_reverb), "--model_name", separator_model, "--sample_rate", str(sample_rate)])
- gr_info(translations["success"])
-
- filename, _ = os.path.splitext(os.path.basename(input))
- output = os.path.join(output, filename)
-
- return [os.path.join(output, f"Original_Vocals_No_Reverb.{format}") if reverb else os.path.join(output, f"Original_Vocals.{format}"), os.path.join(output, f"Instruments.{format}"), (os.path.join(output, f"Main_Vocals_No_Reverb.{format}") if reverb else os.path.join(output, f"Main_Vocals.{format}") if backing else None), (os.path.join(output, f"Backing_Vocals_No_Reverb.{format}") if backing_reverb else os.path.join(output, f"Backing_Vocals.{format}") if backing else None)] if os.path.isfile(input) else [None]*4
-
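-# Voice conversion is likewise delegated to main/inference/convert.py via CLI,
-# with every option serialized to a string flag.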
-def convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file):
- subprocess.run([python, "main/inference/convert.py", "--pitch", str(pitch), "--filter_radius", str(filter_radius), "--index_rate", str(index_rate), "--volume_envelope", str(volume_envelope), "--protect", str(protect), "--hop_length", str(hop_length), "--f0_method", f0_method, "--input_path", input_path, "--output_path", output_path, "--pth_path", pth_path, "--index_path", index_path if index_path else "", "--f0_autotune", str(f0_autotune), "--clean_audio", str(clean_audio), "--clean_strength", str(clean_strength), "--export_format", export_format, "--embedder_model", embedder_model, "--resample_sr", str(resample_sr), "--split_audio", str(split_audio), "--f0_autotune_strength", str(f0_autotune_strength), "--checkpointing", str(checkpointing), "--f0_onnx", str(onnx_f0_mode), "--embedders_mode", embedders_mode, "--formant_shifting", str(formant_shifting), "--formant_qfrency", str(formant_qfrency), "--formant_timbre", str(formant_timbre), "--f0_file", f0_file])
-
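-# convert() is a thin shell-out; calling it is equivalent to invoking the CLI
-# directly with the same flags (illustrative values, hypothetical paths):
-#
-#   python main/inference/convert.py --pitch 0 --filter_radius 3 --index_rate 0.5 \
-#       --f0_method rmvpe --input_path audios/input.wav --output_path audios/output.wav \
-#       --pth_path assets/weights/model.pth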
-def convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, input_audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode):
- model_path = os.path.join("assets", "weights", model)
-
- return_none = [None]*6
- return_none[5] = {"visible": True, "__type__": "update"}
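- # Dicts shaped like {"__type__": "update", ...} are Gradio's (pre-4.x) component-update
- # payloads; slot 5 presumably re-shows a hidden button after an early return.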
-
- if not use_audio:
- if merge_instrument or not_merge_backing or convert_backing or use_original:
- gr_warning(translations["turn_on_use_audio"])
- return return_none
-
- if use_original:
- if convert_backing:
- gr_warning(translations["turn_off_convert_backup"])
- return return_none
- elif not_merge_backing:
- gr_warning(translations["turn_off_merge_backup"])
- return return_none
-
- if not model or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- return return_none
-
- f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
-
- if use_audio:
- output_audio = os.path.join("audios", input_audio_name)
-
- from main.library.utils import pydub_convert, pydub_load
-
- def get_audio_file(label):
- matching_files = [f for f in os.listdir(output_audio) if label in f]
-
- if not matching_files: return translations["notfound"]
- return os.path.join(output_audio, matching_files[0])
-
- output_path = os.path.join(output_audio, f"Convert_Vocals.{format}")
- output_backing = os.path.join(output_audio, f"Convert_Backing.{format}")
- output_merge_backup = os.path.join(output_audio, f"Vocals+Backing.{format}")
- output_merge_instrument = os.path.join(output_audio, f"Vocals+Instruments.{format}")
-
- if not os.path.exists(output_audio): os.makedirs(output_audio, exist_ok=True)
- if os.path.exists(output_path): os.remove(output_path)
-
- if use_original:
- original_vocal = get_audio_file('Original_Vocals_No_Reverb.')
-
- if original_vocal == translations["notfound"]: original_vocal = get_audio_file('Original_Vocals.')
-
- if original_vocal == translations["notfound"]:
- gr_warning(translations["not_found_original_vocal"])
- return return_none
-
- input_path = original_vocal
- else:
- main_vocal = get_audio_file('Main_Vocals_No_Reverb.')
- backing_vocal = get_audio_file('Backing_Vocals_No_Reverb.')
-
- if main_vocal == translations["notfound"]: main_vocal = get_audio_file('Main_Vocals.')
- if not not_merge_backing and backing_vocal == translations["notfound"]: backing_vocal = get_audio_file('Backing_Vocals.')
-
- if main_vocal == translations["notfound"]:
- gr_warning(translations["not_found_main_vocal"])
- return return_none
-
- if not not_merge_backing and backing_vocal == translations["notfound"]:
- gr_warning(translations["not_found_backing_vocal"])
- return return_none
-
- input_path = main_vocal
- backing_path = backing_vocal
-
- gr_info(translations["convert_vocal"])
-
- convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input_path, output_path, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file)
-
- gr_info(translations["convert_success"])
-
- if convert_backing:
- if os.path.exists(output_backing): os.remove(output_backing)
-
- gr_info(translations["convert_backup"])
-
- convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, backing_path, output_backing, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file)
-
- gr_info(translations["convert_backup_success"])
-
- try:
- if not not_merge_backing and not use_original:
- backing_source = output_backing if convert_backing else backing_vocal
-
- if os.path.exists(output_merge_backup): os.remove(output_merge_backup)
-
- gr_info(translations["merge_backup"])
-
- pydub_convert(pydub_load(output_path)).overlay(pydub_convert(pydub_load(backing_source))).export(output_merge_backup, format=format)
-
- gr_info(translations["merge_success"])
-
- if merge_instrument:
- vocals = output_merge_backup if not not_merge_backing and not use_original else output_path
-
- if os.path.exists(output_merge_instrument): os.remove(output_merge_instrument)
-
- gr_info(translations["merge_instruments_process"])
-
- instruments = get_audio_file('Instruments.')
-
- if instruments == translations["notfound"]:
- gr_warning(translations["not_found_instruments"])
- output_merge_instrument = None
- else: pydub_convert(pydub_load(instruments)).overlay(pydub_convert(pydub_load(vocals))).export(output_merge_instrument, format=format)
-
- gr_info(translations["merge_success"])
- except Exception as e:
- logger.debug(e)
- return return_none
-
- return [(None if use_original else output_path), output_backing, (None if not_merge_backing and use_original else output_merge_backup), (output_path if use_original else None), (output_merge_instrument if merge_instrument else None), {"visible": True, "__type__": "update"}]
- else:
- if not input or not os.path.exists(input):
- gr_warning(translations["input_not_valid"])
- return return_none
-
- if not output:
- gr_warning(translations["output_not_valid"])
- return return_none
-
- output = output.replace("wav", format)
-
- if os.path.isdir(input):
- gr_info(translations["is_folder"])
-
- if not [f for f in os.listdir(input) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]:
- gr_warning(translations["not_found_in_folder"])
- return return_none
-
- gr_info(translations["batch_convert"])
-
- output_dir = os.path.dirname(output) or output
- convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input, output_dir, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file)
-
- gr_info(translations["batch_convert_success"])
-
- return return_none
- else:
- output_dir = os.path.dirname(output) or output
-
- if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
- if os.path.exists(output): os.remove(output)
-
- gr_info(translations["convert_vocal"])
-
- convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file)
-
- gr_info(translations["convert_success"])
-
- return_none[0] = output
- return return_none
-
-def convert_selection(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode):
- if use_audio:
- gr_info(translations["search_separate"])
-
- choice = [f for f in os.listdir("audios") if os.path.isdir(os.path.join("audios", f))]
-
- gr_info(translations["found_choice"].format(choice=len(choice)))
-
- if len(choice) == 0:
- gr_warning(translations["separator==0"])
-
- return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, None, None, None, None, None, {"visible": True, "__type__": "update"}]
- elif len(choice) == 1:
- convert_output = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, None, None, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, choice[0], checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode)
-
- return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, convert_output[0], convert_output[1], convert_output[2], convert_output[3], convert_output[4], {"visible": True, "__type__": "update"}]
- else: return [{"choices": choice, "value": "", "interactive": True, "visible": True, "__type__": "update"}, None, None, None, None, None, {"visible": False, "__type__": "update"}]
- else:
- main_convert = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, None, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode)
-
- return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, main_convert[0], None, None, None, None, {"visible": True, "__type__": "update"}]
-
-def convert_with_whisper(num_spk, model_size, cleaner, clean_strength, autotune, f0_autotune_strength, checkpointing, model_1, model_2, model_index_1, model_index_2, pitch_1, pitch_2, index_strength_1, index_strength_2, export_format, input_audio, output_audio, onnx_f0_mode, method, hybrid_method, hop_length, embed_mode, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, formant_shifting, formant_qfrency_1, formant_timbre_1, formant_qfrency_2, formant_timbre_2):
- from pydub import AudioSegment
- from sklearn.cluster import AgglomerativeClustering
-
- from main.library.speaker_diarization.audio import Audio
- from main.library.speaker_diarization.segment import Segment
- from main.library.speaker_diarization.whisper import load_model
- from main.library.utils import check_spk_diarization, pydub_convert, pydub_load
- from main.library.speaker_diarization.embedding import SpeechBrainPretrainedSpeakerEmbedding
-
- check_spk_diarization(model_size)
- model_pth_1, model_pth_2 = os.path.join("assets", "weights", model_1), os.path.join("assets", "weights", model_2)
-
- if (not model_1 or not os.path.exists(model_pth_1) or os.path.isdir(model_pth_1) or not model_pth_1.endswith((".pth", ".onnx"))) and (not model_2 or not os.path.exists(model_pth_2) or os.path.isdir(model_pth_2) or not model_pth_2.endswith((".pth", ".onnx"))):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- return None
-
- if not model_1: model_pth_1 = model_pth_2
- if not model_2: model_pth_2 = model_pth_1
-
- if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
- gr_warning(translations["input_not_valid"])
- return None
-
- if not output_audio:
- gr_warning(translations["output_not_valid"])
- return None
-
- if os.path.exists(output_audio): os.remove(output_audio)
- gr_info(translations["start_whisper"])
-
- try:
- audio = Audio()
-
- embedding_model = SpeechBrainPretrainedSpeakerEmbedding(device=config.device)
- segments = load_model(model_size, device=config.device).transcribe(input_audio, fp16=configs.get("fp16", False), word_timestamps=True)["segments"]
-
- y, sr = librosa.load(input_audio, sr=None)
- duration = len(y) / sr
-
- def segment_embedding(segment):
- waveform, _ = audio.crop(input_audio, Segment(segment["start"], min(duration, segment["end"])))
- return embedding_model(waveform.mean(dim=0, keepdim=True)[None] if waveform.shape[0] == 2 else waveform[None])
-
- def time(secs):
- return datetime.timedelta(seconds=round(secs))
-
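- # merge_audio() below rebuilds the original timeline: each converted segment is
- # laid back at its source offset and gaps are padded with silence, so a 10s file
- # with segments at 0-2s and 5-7s becomes [seg1][3s silence][seg2][3s silence]
- # (illustrative numbers).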
- def merge_audio(files_list, time_stamps, original_file_path, output_path, format):
- def extract_number(filename):
- match = re.search(r'_(\d+)', filename)
- return int(match.group(1)) if match else 0
-
- total_duration = len(pydub_load(original_file_path))
- combined = AudioSegment.empty()
- current_position = 0
-
- for file, (start_i, end_i) in zip(sorted(files_list, key=extract_number), time_stamps):
- if start_i > current_position: combined += AudioSegment.silent(duration=start_i - current_position)
-
- combined += pydub_load(file)
- current_position = end_i
-
- if current_position < total_duration: combined += AudioSegment.silent(duration=total_duration - current_position)
- combined.export(output_path, format=format)
-
- return output_path
-
- embeddings = np.zeros(shape=(len(segments), 192))
- for i, segment in enumerate(segments):
- embeddings[i] = segment_embedding(segment)
-
- labels = AgglomerativeClustering(num_spk).fit(np.nan_to_num(embeddings)).labels_
- for i in range(len(segments)):
- segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
- merged_segments, current_text = [], []
- current_speaker, current_start = None, None
-
- for i, segment in enumerate(segments):
- speaker = segment["speaker"]
- start_time = segment["start"]
- text = segment["text"][1:]
-
- if speaker == current_speaker:
- current_text.append(text)
- end_time = segment["end"]
- else:
- if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})
-
- current_speaker = speaker
- current_start = start_time
- current_text = [text]
- end_time = segment["end"]
-
- if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})
-
- gr_info(translations["whisper_done"])
-
- x = ""
- for segment in merged_segments:
- x += f"\n{segment['speaker']} {str(time(segment['start']))} - {str(time(segment['end']))}\n"
- x += segment["text"] + "\n"
-
- logger.info(x)
-
- gr_info(translations["process_audio"])
-
- audio = pydub_convert(pydub_load(input_audio))
- output_folder = "audios_temp"
-
- if os.path.exists(output_folder): shutil.rmtree(output_folder, ignore_errors=True)
- for f in [output_folder, os.path.join(output_folder, "1"), os.path.join(output_folder, "2")]:
- os.makedirs(f, exist_ok=True)
-
- time_stamps, processed_segments = [], []
- for i, segment in enumerate(merged_segments):
- start_ms = int(segment["start"] * 1000)
- end_ms = int(segment["end"] * 1000)
-
- index = i + 1
-
- segment_filename = os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}.wav")
- audio[start_ms:end_ms].export(segment_filename, format="wav")
-
- processed_segments.append(os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}_output.wav"))
- time_stamps.append((start_ms, end_ms))
-
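- # Consecutive merged segments always alternate speaker (the merge loop above fuses
- # same-speaker runs), so with two speakers the odd/even split routes each speaker's
- # segments to folder "1" or "2" and therefore to its own voice model.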
- f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
-
- gr_info(translations["process_done_start_convert"])
-
- convert(pitch_1, filter_radius, index_strength_1, volume_envelope, protect, hop_length, f0method, os.path.join(output_folder, "1"), output_folder, model_pth_1, model_index_1, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_1, formant_timbre_1, "")
- convert(pitch_2, filter_radius, index_strength_2, volume_envelope, protect, hop_length, f0method, os.path.join(output_folder, "2"), output_folder, model_pth_2, model_index_2, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_2, formant_timbre_2, "")
-
- gr_info(translations["convert_success"])
- return merge_audio(processed_segments, time_stamps, input_audio, output_audio.replace("wav", export_format), export_format)
- except Exception as e:
- gr_error(translations["error_occurred"].format(e=e))
- import traceback
- logger.debug(traceback.format_exc())
- return None
- finally:
- if os.path.exists("audios_temp"): shutil.rmtree("audios_temp", ignore_errors=True)
-
-def convert_tts(clean, autotune, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode):
- model_path = os.path.join("assets", "weights", model)
-
- if not model_path or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- return None
-
- if not input or not os.path.exists(input):
- gr_warning(translations["input_not_valid"])
- return None
-
- if os.path.isdir(input):
- input_audio = [f for f in os.listdir(input) if "tts" in f and f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]
-
- if not input_audio:
- gr_warning(translations["not_found_in_folder"])
- return None
-
- input = os.path.join(input, input_audio[0])
-
- if not output:
- gr_warning(translations["output_not_valid"])
- return None
-
- if os.path.isdir(output): output = os.path.join(output, f"tts.{format}")
- else: output = output.replace("wav", format)
-
- output_dir = os.path.dirname(output) or "."
- if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-
- if os.path.exists(output): os.remove(output)
-
- f0method = method if method != "hybrid" else hybrid_method
- embedder_model = embedders if embedders != "custom" else custom_embedders
-
- gr_info(translations["convert_vocal"])
-
- convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file)
-
- gr_info(translations["convert_success"])
- return output
-
-def log_read(log_file, done):
- open(log_file, "w", encoding="utf-8").close() # create/truncate so stale logs are not replayed
-
- while True:
- with open(log_file, "r", encoding="utf-8") as f:
- yield "".join(line for line in f.readlines() if "DEBUG" not in line and line.strip() != "")
-
- sleep(1)
- if done[0]: break
-
- with open(log_file, "r", encoding="utf-8") as f:
- log = "".join(line for line in f.readlines() if "DEBUG" not in line and line.strip() != "")
-
- yield log
-
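-# log_read() is a poll-based tail: a worker thread (if_done) presumably flips done[0]
-# when the child process exits while the caller drains the generator, e.g.
-# (names taken from this file):
-#
-#   done = [False]
-#   threading.Thread(target=if_done, args=(done, p)).start()
-#   for log in log_read(os.path.join("assets", "logs", "create_dataset.log"), done):
-#       yield log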
-def create_dataset(input_audio, output_dataset, clean_dataset, clean_strength, separator_reverb, kim_vocals_version, overlap, segments_size, denoise_mdx, skip, skip_start, skip_end, hop_length, batch_size, sample_rate):
- version = 1 if kim_vocals_version == "Version-1" else 2
-
- gr_info(translations["start"].format(start=translations["create"]))
-
- p = subprocess.Popen(f'{python} main/inference/create_dataset.py --input_audio "{input_audio}" --output_dataset "{output_dataset}" --clean_dataset {clean_dataset} --clean_strength {clean_strength} --separator_reverb {separator_reverb} --kim_vocal_version {version} --overlap {overlap} --segments_size {segments_size} --mdx_hop_length {hop_length} --mdx_batch_size {batch_size} --denoise_mdx {denoise_mdx} --skip {skip} --skip_start_audios "{skip_start}" --skip_end_audios "{skip_end}" --sample_rate {sample_rate}', shell=True)
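- # Note: shell=True relies on the f-string quoting above; arguments containing
- # double quotes would break the assembled command line.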
- done = [False]
-
- threading.Thread(target=if_done, args=(done, p)).start()
-
- for log in log_read(os.path.join("assets", "logs", "create_dataset.log"), done):
- yield log
-
-def preprocess(model_name, sample_rate, cpu_core, cut_preprocess, process_effects, path, clean_dataset, clean_strength):
- dataset = path
- sr = int(float(sample_rate.rstrip("k")) * 1000)
-
- if not model_name: return gr_warning(translations["provide_name"])
- if not any(f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f))): return gr_warning(translations["not_found_data"])
-
- model_dir = os.path.join("assets", "logs", model_name)
- if os.path.exists(model_dir): shutil.rmtree(model_dir, ignore_errors=True)
-
- p = subprocess.Popen(f'{python} main/inference/preprocess.py --model_name "{model_name}" --dataset_path "{dataset}" --sample_rate {sr} --cpu_cores {cpu_core} --cut_preprocess {cut_preprocess} --process_effects {process_effects} --clean_dataset {clean_dataset} --clean_strength {clean_strength}', shell=True)
- done = [False]
-
- threading.Thread(target=if_done, args=(done, p)).start()
- os.makedirs(model_dir, exist_ok=True)
-
- for log in log_read(os.path.join(model_dir, "preprocess.log"), done):
- yield log
-
-def extract(model_name, version, method, pitch_guidance, hop_length, cpu_cores, gpu, sample_rate, embedders, custom_embedders, onnx_f0_mode, embedders_mode):
- embedder_model = embedders if embedders != "custom" else custom_embedders
- sr = int(float(sample_rate.rstrip("k")) * 1000)
-
- if not model_name: return gr_warning(translations["provide_name"])
-
- model_dir = os.path.join("assets", "logs", model_name)
- if not any(os.path.isfile(os.path.join(model_dir, "sliced_audios", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios"))) or not any(os.path.isfile(os.path.join(model_dir, "sliced_audios_16k", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios_16k"))): return gr_warning(translations["not_found_data_preprocess"])
-
- p = subprocess.Popen(f'{python} main/inference/extract.py --model_name "{model_name}" --rvc_version {version} --f0_method {method} --pitch_guidance {pitch_guidance} --hop_length {hop_length} --cpu_cores {cpu_cores} --gpu {gpu} --sample_rate {sr} --embedder_model {embedder_model} --f0_onnx {onnx_f0_mode} --embedders_mode {embedders_mode}', shell=True)
- done = [False]
-
- threading.Thread(target=if_done, args=(done, p)).start()
- os.makedirs(model_dir, exist_ok=True)
-
- for log in log_read(os.path.join(model_dir, "extract.log"), done):
- yield log
-
-def create_index(model_name, rvc_version, index_algorithm):
- if not model_name: return gr_warning(translations["provide_name"])
- model_dir = os.path.join("assets", "logs", model_name)
-
- if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"])
-
- p = subprocess.Popen(f'{python} main/inference/create_index.py --model_name "{model_name}" --rvc_version {rvc_version} --index_algorithm {index_algorithm}', shell=True)
- done = [False]
-
- threading.Thread(target=if_done, args=(done, p)).start()
- os.makedirs(model_dir, exist_ok=True)
-
- for log in log_read(os.path.join(model_dir, "create_index.log"), done):
- yield log
-
-def training(model_name, rvc_version, save_every_epoch, save_only_latest, save_every_weights, total_epoch, sample_rate, batch_size, gpu, pitch_guidance, not_pretrain, custom_pretrained, pretrain_g, pretrain_d, detector, threshold, clean_up, cache, model_author, vocoder, checkpointing, deterministic, benchmark):
- sr = int(float(sample_rate.rstrip("k")) * 1000)
- if not model_name: return gr_warning(translations["provide_name"])
-
- model_dir = os.path.join("assets", "logs", model_name)
- if os.path.exists(os.path.join(model_dir, "train_pid.txt")): os.remove(os.path.join(model_dir, "train_pid.txt"))
-
- if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"])
-
- if not not_pretrain:
- if not custom_pretrained:
- pretrained_selector = {True: {32000: ("f0G32k.pth", "f0D32k.pth"), 40000: ("f0G40k.pth", "f0D40k.pth"), 48000: ("f0G48k.pth", "f0D48k.pth")}, False: {32000: ("G32k.pth", "D32k.pth"), 40000: ("G40k.pth", "D40k.pth"), 48000: ("G48k.pth", "D48k.pth")}}
-
- pg, pd = pretrained_selector[pitch_guidance][sr]
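- # e.g. pitch_guidance=True with a 48k sample rate resolves to ("f0G48k.pth", "f0D48k.pth").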
- else:
- if not pretrain_g: return gr_warning(translations["provide_pretrained"].format(dg="G"))
- if not pretrain_d: return gr_warning(translations["provide_pretrained"].format(dg="D"))
-
- pg, pd = pretrain_g, pretrain_d
-
- pretrained_G, pretrained_D = (os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder}_{pg}" if vocoder != 'Default' else pg), os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder}_{pd}" if vocoder != 'Default' else pd)) if not custom_pretrained else (os.path.join("assets", "models", f"pretrained_custom", pg), os.path.join("assets", "models", f"pretrained_custom", pd))
- download_version = codecs.decode(f"uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_i{'2' if rvc_version == 'v2' else '1'}/", "rot13")
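- # The download URL is stored ROT13-obfuscated and decoded at runtime,
- # e.g. codecs.decode("uggcf://", "rot13") == "https://".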
-
- if not custom_pretrained:
- try:
- if not os.path.exists(pretrained_G):
- gr_info(translations["download_pretrained"].format(dg="G", rvc_version=rvc_version))
- huggingface.HF_download_file("".join([download_version, vocoder, "_", pg]) if vocoder != 'Default' else (download_version + pg), os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder}_{pg}" if vocoder != 'Default' else pg))
-
- if not os.path.exists(pretrained_D):
- gr_info(translations["download_pretrained"].format(dg="D", rvc_version=rvc_version))
- huggingface.HF_download_file("".join([download_version, vocoder, "_", pd]) if vocoder != 'Default' else (download_version + pd), os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder}_{pd}" if vocoder != 'Default' else pd))
- except:
- gr_warning(translations["not_use_pretrain_error_download"])
- pretrained_G, pretrained_D = None, None
- else:
- if not os.path.exists(pretrained_G): return gr_warning(translations["not_found_pretrain"].format(dg="G"))
- if not os.path.exists(pretrained_D): return gr_warning(translations["not_found_pretrain"].format(dg="D"))
- else: gr_warning(translations["not_use_pretrain"])
-
- gr_info(translations["start"].format(start=translations["training"]))
-
- p = subprocess.Popen(f'{python} main/inference/train.py --model_name "{model_name}" --rvc_version {rvc_version} --save_every_epoch {save_every_epoch} --save_only_latest {save_only_latest} --save_every_weights {save_every_weights} --total_epoch {total_epoch} --sample_rate {sr} --batch_size {batch_size} --gpu {gpu} --pitch_guidance {pitch_guidance} --overtraining_detector {detector} --overtraining_threshold {threshold} --cleanup {clean_up} --cache_data_in_gpu {cache} --g_pretrained_path "{pretrained_G}" --d_pretrained_path "{pretrained_D}" --model_author "{model_author}" --vocoder "{vocoder}" --checkpointing {checkpointing} --deterministic {deterministic} --benchmark {benchmark}', shell=True)
- done = [False]
-
- with open(os.path.join(model_dir, "train_pid.txt"), "w") as pid_file:
- pid_file.write(str(p.pid))
-
- threading.Thread(target=if_done, args=(done, p)).start()
-
- for log in log_read(os.path.join(model_dir, "train.log"), done):
- log_lines = log.split("\n")
- if len(log_lines) > 100: log = "\n".join(log_lines[-100:])
- yield log
-
-def stop_pid(pid_file, model_name=None, train=False):
- try:
- pid_file_path = os.path.join("assets", f"{pid_file}.txt") if model_name is None else os.path.join("assets", "logs", model_name, f"{pid_file}.txt")
-
- if not os.path.exists(pid_file_path): return gr_warning(translations["not_found_pid"])
- else:
- with open(pid_file_path, "r") as pid_file:
- pids = [int(pid) for pid in pid_file.readlines()]
-
- for pid in pids:
- os.kill(pid, 9)
-
- if os.path.exists(pid_file_path): os.remove(pid_file_path)
-
- pid_file_path = os.path.join("assets", "logs", model_name, "config.json")
-
- if train and os.path.exists(pid_file_path):
- with open(pid_file_path, "r") as pid_file:
- pid_data = json.load(pid_file)
- pids = pid_data.get("process_pids", [])
-
- with open(pid_file_path, "w") as pid_file:
- pid_data.pop("process_pids", None)
-
- json.dump(pid_data, pid_file, indent=4)
-
- for pid in pids:
- os.kill(pid, 9)
-
- gr_info(translations["end_pid"])
- except:
- pass
-
-def load_presets(presets, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, formant_shifting, formant_qfrency, formant_timbre):
- if not presets: return gr_warning(translations["provide_file_settings"])
-
- with open(os.path.join("assets", "presets", presets)) as f:
- file = json.load(f)
-
- gr_info(translations["load_presets"].format(presets=presets))
- return file.get("cleaner", cleaner), file.get("autotune", autotune), file.get("pitch", pitch), file.get("clean_strength", clean_strength), file.get("index_strength", index_strength), file.get("resample_sr", resample_sr), file.get("filter_radius", filter_radius), file.get("volume_envelope", volume_envelope), file.get("protect", protect), file.get("split_audio", split_audio), file.get("f0_autotune_strength", f0_autotune_strength), file.get("formant_shifting", formant_shifting), file.get("formant_qfrency", formant_qfrency), file.get("formant_timbre", formant_timbre)
-
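-# A preset is a flat JSON file holding any subset of the keys read above and
-# written by save_presets() below, e.g. (hypothetical values):
-#
-#   {"cleaner": true, "clean_strength": 0.7, "pitch": 2, "index_strength": 0.5}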
-def save_presets(name, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, volume_envelope_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox, formant_shifting, formant_qfrency, formant_timbre):
- if not name: return gr_warning(translations["provide_filename_settings"])
- if not any([cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, volume_envelope_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox]): return gr_warning(translations["choose1"])
-
- settings = {}
-
- for checkbox, data in [(cleaner_chbox, {"cleaner": cleaner, "clean_strength": clean_strength}), (autotune_chbox, {"autotune": autotune, "f0_autotune_strength": f0_autotune_strength}), (pitch_chbox, {"pitch": pitch}), (index_strength_chbox, {"index_strength": index_strength}), (resample_sr_chbox, {"resample_sr": resample_sr}), (filter_radius_chbox, {"filter_radius": filter_radius}), (volume_envelope_chbox, {"volume_envelope": volume_envelope}), (protect_chbox, {"protect": protect}), (split_audio_chbox, {"split_audio": split_audio}), (formant_shifting_chbox, {"formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre})]:
- if checkbox: settings.update(data)
-
- with open(os.path.join("assets", "presets", name + ".json"), "w") as f:
- json.dump(settings, f, indent=4)
-
- gr_info(translations["export_settings"])
- return change_preset_choices()
-
-def report_bug(error_info, provide):
- report_path = os.path.join("assets", "logs", "report_bugs.log")
- if os.path.exists(report_path): os.remove(report_path)
-
- report_url = codecs.decode(requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/jroubbx.gkg", "rot13")).text, "rot13")
- if not error_info: error_info = "None provided"
-
- gr_info(translations["thank"])
-
- if provide:
- try:
- for log in [os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".log")]:
- with open(log, "r", encoding="utf-8") as r:
- with open(report_path, "a", encoding="utf-8") as w:
- w.write(str(r.read()))
- w.write("\n")
- except Exception as e:
- gr_error(translations["error_read_log"])
- logger.debug(e)
-
- try:
- with open(report_path, "r", encoding="utf-8") as f:
- content = f.read()
-
- requests.post(report_url, json={"embeds": [{"title": "Báo Cáo Lỗi", "description": f"Mô tả lỗi: {error_info}", "color": 15158332, "author": {"name": "Vietnamese_RVC", "icon_url": codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/vpb.cat", "rot13"), "url": codecs.decode("uggcf://tvguho.pbz/CunzUhlauNau16/Ivrganzrfr-EIP/gerr/znva","rot13")}, "thumbnail": {"url": codecs.decode("uggcf://p.grabe.pbz/7dADJbv-36fNNNNq/grabe.tvs", "rot13")}, "fields": [{"name": "Số Lượng Gỡ Lỗi", "value": content.count("DEBUG")}, {"name": "Số Lượng Thông Tin", "value": content.count("INFO")}, {"name": "Số Lượng Cảnh Báo", "value": content.count("WARNING")}, {"name": "Số Lượng Lỗi", "value": content.count("ERROR")}], "footer": {"text": f"Tên Máy: {platform.uname().node} - Hệ Điều Hành: {platform.system()}-{platform.version()}\nThời Gian Báo Cáo Lỗi: {datetime.datetime.now()}."}}]})
-
- with open(report_path, "rb") as f:
- requests.post(report_url, files={"file": f})
- except Exception as e:
- gr_error(translations["error_send"])
- logger.debug(e)
- finally:
- if os.path.exists(report_path): os.remove(report_path)
- else: requests.post(report_url, json={"embeds": [{"title": "Bug Report", "description": error_info}]})
-
-def f0_extract(audio, f0_method, f0_onnx):
- if not audio or not os.path.exists(audio) or os.path.isdir(audio):
- gr_warning(translations["input_not_valid"])
- return [None]*2
-
- from matplotlib import pyplot as plt
- from main.library.utils import check_predictors
- from main.inference.extract import FeatureInput
-
- check_predictors(f0_method, f0_onnx)
-
- f0_path = os.path.join("assets", "f0", os.path.splitext(os.path.basename(audio))[0])
- image_path = os.path.join(f0_path, "f0.png")
- txt_path = os.path.join(f0_path, "f0.txt")
-
- gr_info(translations["start_extract"])
-
- if not os.path.exists(f0_path): os.makedirs(f0_path, exist_ok=True)
-
- y, sr = librosa.load(audio, sr=None)
-
- feats = FeatureInput(sample_rate=sr, is_half=config.is_half, device=config.device)
- feats.f0_max = 1600.0
-
- F_temp = np.array(feats.compute_f0(y.flatten(), f0_method, 160, f0_onnx), dtype=np.float32)
- F_temp[F_temp == 0] = np.nan
-
- f0 = 1200 * np.log2(F_temp / librosa.midi_to_hz(0))
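- # Hz -> cents relative to MIDI note 0 (librosa.midi_to_hz(0) ~= 8.176 Hz):
- # cents = 1200 * log2(f / f_ref).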
-
- plt.figure(figsize=(10, 4))
- plt.plot(f0)
- plt.title(f0_method)
- plt.xlabel(translations["time_frames"])
- plt.ylabel(translations["Frequency"])
- plt.savefig(image_path)
- plt.close()
-
- with open(txt_path, "w") as f:
- for i, f0_value in enumerate(f0):
- f.write(f"{i * sr / 160},{f0_value}\n")
-
- gr_info(translations["extract_done"])
-
- return [txt_path, image_path]
-
-def pitch_guidance_lock(vocoders):
- return {"value": True, "interactive": vocoders == "Default", "__type__": "update"}
-
-def vocoders_lock(pitch, vocoders):
- return {"value": vocoders if pitch else "Default", "interactive": pitch, "__type__": "update"}
-
-def run_audioldm2(input_path, output_path, export_format, sample_rate, audioldm_model, source_prompt, target_prompt, steps, cfg_scale_src, cfg_scale_tar, t_start, save_compute):
- if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path):
- gr_warning(translations["input_not_valid"])
- return None
-
- if not output_path:
- gr_warning(translations["output_not_valid"])
- return None
-
- output_path = output_path.replace("wav", export_format)
-
- if os.path.exists(output_path): os.remove(output_path)
-
- gr_info(translations["start_edit"].format(input_path=input_path))
- subprocess.run([python, "main/inference/audioldm2.py", "--input_path", input_path, "--output_path", output_path, "--export_format", str(export_format), "--sample_rate", str(sample_rate), "--audioldm_model", audioldm_model, "--source_prompt", source_prompt, "--target_prompt", target_prompt, "--steps", str(steps), "--cfg_scale_src", str(cfg_scale_src), "--cfg_scale_tar", str(cfg_scale_tar), "--t_start", str(t_start), "--save_compute", str(save_compute)])
-
- gr_info(translations["success"])
- return output_path
-
-def change_fp(fp):
- fp16 = fp == "fp16"
-
- if fp16 and config.device == "cpu":
- gr_warning(translations["fp16_not_support"])
- return "fp32"
- else:
- gr_info(translations["start_update_precision"])
-
- with open(configs_json, "r") as f:
- configs = json.load(f)
- configs["fp16"] = config.is_half = fp16
-
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
- gr_info(translations["success"])
- return "fp16" if fp16 else "fp32"
-
-def unlock_f0(value):
- return {"choices": method_f0_full if value else method_f0, "value": "rmvpe", "__type__": "update"}
-
-def unlock_vocoder(value, vocoder):
- return {"value": vocoder if value == "v2" else "Default", "interactive": value == "v2", "__type__": "update"}
-
-def unlock_ver(value, vocoder):
- return {"value": "v2" if vocoder == "Default" else value, "interactive": vocoder == "Default", "__type__": "update"}
-
-def visible_embedders(value):
- return {"visible": value != "spin", "__type__": "update"}
-
diff --git a/main/app/parser.py b/main/app/parser.py
deleted file mode 100644
index b98fde8f963138d2a00962804b883018521e5ebe..0000000000000000000000000000000000000000
--- a/main/app/parser.py
+++ /dev/null
@@ -1,339 +0,0 @@
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-try:
- argv = sys.argv[1]
-except IndexError:
- argv = None
-
-argv_is_allows = ["--audio_effects", "--audioldm2", "--convert", "--create_dataset", "--create_index", "--extract", "--preprocess", "--separator_music", "--train", "--help_audio_effects", "--help_audioldm2", "--help_convert", "--help_create_dataset", "--help_create_index", "--help_extract", "--help_preprocess", "--help_separator_music", "--help_train", "--help"]
-
-if argv not in argv_is_allows:
- print("Invalid syntax! Use --help for more information")
- quit()
-
-if argv_is_allows[0] in argv: from main.inference.audio_effects import main
-elif argv_is_allows[1] in argv: from main.inference.audioldm2 import main
-elif argv_is_allows[2] in argv: from main.inference.convert import main
-elif argv_is_allows[3] in argv: from main.inference.create_dataset import main
-elif argv_is_allows[4] in argv: from main.inference.create_index import main
-elif argv_is_allows[5] in argv: from main.inference.extract import main
-elif argv_is_allows[6] in argv: from main.inference.preprocess import main
-elif argv_is_allows[7] in argv: from main.inference.separator_music import main
-elif argv_is_allows[8] in argv: from main.inference.train import main
-elif argv_is_allows[9] in argv:
- print("""Parameters for `--audio_effects`:
- 1. File paths:
- - `--input_path` (required): Path to the input audio file.
- - `--output_path` (default: `./audios/apply_effects.wav`): Path to save the output file.
- - `--export_format` (default: `wav`): Output file format (`wav`, `mp3`, ...).
-
- 2. Resampling:
- - `--resample` (default: `False`): Whether to resample or not.
- - `--resample_sr` (default: `0`): New sampling frequency (Hz).
-
- 3. Chorus effect:
- - `--chorus`: Enable/disable chorus.
- - `--chorus_depth`, `--chorus_rate`, `--chorus_mix`, `--chorus_delay`, `--chorus_feedback`: Parameters to adjust chorus.
-
- 4. Distortion effect:
- - `--distortion`: Enable/disable distortion.
- - `--drive_db`: Degree of audio distortion.
-
- 5. Reverb effect:
- - `--reverb`: Enable/disable reverb.
- - `--reverb_room_size`, `--reverb_damping`, `--reverb_wet_level`, `--reverb_dry_level`, `--reverb_width`, `--reverb_freeze_mode`: Adjust reverb.
-
- 6. Pitch shift effect:
- - `--pitchshift`: Enable/disable pitch shift.
- - `--pitch_shift`: Pitch shift value.
-
- 7. Delay effect:
- - `--delay`: Enable/disable delay.
- - `--delay_seconds`, `--delay_feedback`, `--delay_mix`: Adjust delay time, feedback, and mix.
-
- 8. Compressor:
- - `--compressor`: Enable/disable compressor.
- - `--compressor_threshold`, `--compressor_ratio`, `--compressor_attack_ms`, `--compressor_release_ms`: Compression parameters.
-
- 9. Limiter:
- - `--limiter`: Enable/disable audio level limiter.
- - `--limiter_threshold`, `--limiter_release`: Limiter threshold and release time.
-
- 10. Gain (Amplification):
- - `--gain`: Enable/disable gain.
- - `--gain_db`: Gain level (dB).
-
- 11. Bitcrush:
- - `--bitcrush`: Enable/disable bit resolution reduction effect.
- - `--bitcrush_bit_depth`: Bit depth for bitcrush.
-
- 12. Clipping:
- - `--clipping`: Enable/disable audio clipping.
- - `--clipping_threshold`: Clipping threshold.
-
- 13. Phaser:
- - `--phaser`: Enable/disable phaser effect.
- - `--phaser_rate_hz`, `--phaser_depth`, `--phaser_centre_frequency_hz`, `--phaser_feedback`, `--phaser_mix`: Adjust phaser effect.
-
- 14. Boost bass & treble:
- - `--treble_bass_boost`: Enable/disable bass and treble boost.
- - `--bass_boost_db`, `--bass_boost_frequency`, `--treble_boost_db`, `--treble_boost_frequency`: Bass and treble boost parameters.
-
- 15. Fade in & fade out:
- - `--fade_in_out`: Enable/disable fade effect.
- - `--fade_in_duration`, `--fade_out_duration`: Fade in/out duration.
-
- 16. Audio combination:
- - `--audio_combination`: Enable/disable combining multiple audio files.
- - `--audio_combination_input`: Path to additional audio files.
- """)
- quit()
-elif argv_is_allows[10] in argv:
- print("""Parameters for `--audioldm2`:
- 1. File paths:
- - `--input_path` (required): Path to the input audio file.
- - `--output_path` (default: `./output.wav`): Path to save the output file.
- - `--export_format` (default: `wav`): Output file format.
-
- 2. Audio configuration:
- - `--sample_rate` (default: `44100`): Sampling frequency (Hz).
-
- 3. AudioLDM model configuration:
- - `--audioldm_model` (default: `audioldm2-music`): Select AudioLDM model for processing.
-
- 4. Model guidance prompt:
- - `--source_prompt` (default: ``): Description of source audio.
- - `--target_prompt` (default: ``): Description of target audio.
-
- 5. Processing algorithm configuration:
- - `--steps` (default: `200`): Number of steps in audio synthesis process.
- - `--cfg_scale_src` (default: `3.5`): Guidance scale for source audio.
- - `--cfg_scale_tar` (default: `12`): Guidance scale for target audio.
- - `--t_start` (default: `45`): Editing level.
-
- 6. Computation optimization:
- - `--save_compute` (default: `False`): Whether to enable compute optimization mode.
- """)
- quit()
-elif argv_is_allows[11] in argv:
- print("""Parameters for `--convert`:
- 1. Voice processing configuration:
- - `--pitch` (default: `0`): Adjust pitch.
- - `--filter_radius` (default: `3`): F0 curve smoothness.
- - `--index_rate` (default: `0.5`): Voice index usage rate.
- - `--volume_envelope` (default: `1`): Volume amplitude adjustment factor.
- - `--protect` (default: `0.33`): Consonant protection.
-
- 2. Frame hop configuration:
- - `--hop_length` (default: `64`): Hop length during audio processing.
-
- 3. F0 configuration:
- - `--f0_method` (default: `rmvpe`): F0 prediction method (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
- - `--f0_autotune` (default: `False`): Whether to auto-tune F0.
- - `--f0_autotune_strength` (default: `1`): Strength of F0 auto-tuning.
- - `--f0_file` (default: ``): Path to existing F0 file.
- - `--f0_onnx` (default: `False`): Whether to use ONNX version of F0.
-
- 4. Embedding model:
- - `--embedder_model` (default: `contentvec_base`): Embedding model used.
- - `--embedders_mode` (default: `fairseq`): Embedding mode (`fairseq`, `transformers`, `onnx`).
-
- 5. File paths:
- - `--input_path` (required): Path to input audio file.
- - `--output_path` (default: `./audios/output.wav`): Path to save output file.
- - `--export_format` (default: `wav`): Output file format.
- - `--pth_path` (required): Path to `.pth` model file.
- - `--index_path` (default: `None`): Path to index file (if any).
-
- 6. Audio cleaning:
- - `--clean_audio` (default: `False`): Whether to apply audio cleaning.
- - `--clean_strength` (default: `0.7`): Cleaning strength.
-
- 7. Resampling & audio splitting:
- - `--resample_sr` (default: `0`): New sampling frequency (0 means keep original).
- - `--split_audio` (default: `False`): Whether to split audio before processing.
-
- 8. Testing & optimization:
- - `--checkpointing` (default: `False`): Enable/disable checkpointing to save RAM.
-
- 9. Formant shifting:
- - `--formant_shifting` (default: `False`): Whether to enable formant shifting effect.
- - `--formant_qfrency` (default: `0.8`): Formant shift frequency factor.
- - `--formant_timbre` (default: `0.8`): Voice timbre change factor.
- """)
- quit()
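-# Illustrative end-to-end call for the branch above (hypothetical file names; this
-# mirrors what the WebUI shells out to):
-#   python main/inference/convert.py --input_path ./audios/song.wav \
-#       --output_path ./audios/output.wav --pth_path ./assets/weights/voice.pth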
-elif argv_is_allows[12] in argv:
- print("""Parameters for `--create_dataset`:
- 1. Dataset paths & configuration:
- - `--input_audio` (required): Path to audio link (YouTube link, can use `,` for multiple links).
- - `--output_dataset` (default: `./dataset`): Output data directory.
- - `--sample_rate` (default: `44100`): Audio sampling frequency.
-
- 2. Data cleaning:
- - `--clean_dataset` (default: `False`): Whether to apply data cleaning.
- - `--clean_strength` (default: `0.7`): Data cleaning strength.
-
- 3. Voice separation & effects:
- - `--separator_reverb` (default: `False`): Whether to separate voice reverb.
- - `--kim_vocal_version` (default: `2`): Kim Vocal model version for separation (`1`, `2`).
-
- 4. Audio segmentation configuration:
- - `--overlap` (default: `0.25`): Overlap level between segments during separation.
- - `--segments_size` (default: `256`): Size of each segment.
-
- 5. MDX (Music Demixing) configuration:
- - `--mdx_hop_length` (default: `1024`): MDX hop length during processing.
- - `--mdx_batch_size` (default: `1`): Batch size during MDX processing.
- - `--denoise_mdx` (default: `False`): Whether to apply denoising during MDX separation.
-
- 6. Skip audio sections:
- - `--skip` (default: `False`): Whether to skip any audio seconds.
- - `--skip_start_audios` (default: `0`): Time (seconds) to skip at the start of audio.
- - `--skip_end_audios` (default: `0`): Time (seconds) to skip at the end of audio.
- """)
- quit()
-elif argv_is_allows[13] in argv:
- print("""Parameters for `--create_index`:
- 1. Model information:
- - `--model_name` (required): Model name.
- - `--rvc_version` (default: `v2`): Version (`v1`, `v2`).
- - `--index_algorithm` (default: `Auto`): Index algorithm used (`Auto`, `Faiss`, `KMeans`).
- """)
- quit()
-elif argv_is_allows[14] in argv:
- print("""Parameters for `--extract`:
- 1. Model information:
- - `--model_name` (required): Model name.
- - `--rvc_version` (default: `v2`): RVC version (`v1`, `v2`).
-
- 2. F0 configuration:
- - `--f0_method` (default: `rmvpe`): F0 prediction method (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
- - `--pitch_guidance` (default: `True`): Whether to use pitch guidance.
-
- 3. Processing configuration:
- - `--hop_length` (default: `128`): Hop length during processing.
- - `--cpu_cores` (default: `2`): Number of CPU threads used.
- - `--gpu` (default: `-`): Specify GPU to use (e.g., `0` for first GPU, `-` to disable GPU).
- - `--sample_rate` (required): Input audio sampling frequency.
-
- 4. Embedding configuration:
- - `--embedder_model` (default: `contentvec_base`): Embedding model name.
- - `--f0_onnx` (default: `False`): Whether to use ONNX version of F0.
- - `--embedders_mode` (default: `fairseq`): Embedding mode (`fairseq`, `transformers`, `onnx`).
- """)
- quit()
-elif argv_is_allows[15] in argv:
- print("""Parameters for `--preprocess`:
- 1. Model information:
- - `--model_name` (required): Model name.
-
- 2. Data configuration:
- - `--dataset_path` (default: `./dataset`): Path to directory containing data files.
- - `--sample_rate` (required): Audio data sampling frequency.
-
- 3. Processing configuration:
- - `--cpu_cores` (default: `2`): Number of CPU threads used.
- - `--cut_preprocess` (default: `True`): Whether to cut data files.
- - `--process_effects` (default: `False`): Whether to apply preprocessing.
- - `--clean_dataset` (default: `False`): Whether to clean data files.
- - `--clean_strength` (default: `0.7`): Data cleaning strength.
- """)
- quit()
-elif argv_is_allows[16] in argv:
- print("""Parameters for `--separator_music`:
- 1. Data paths:
- - `--input_path` (required): Path to input audio file.
- - `--output_path` (default: `./audios`): Directory to save output files.
- - `--format` (default: `wav`): Output file format (`wav`, `mp3`, ...).
-
- 2. Audio processing configuration:
- - `--shifts` (default: `2`): Number of predictions.
- - `--segments_size` (default: `256`): Audio segment size.
- - `--overlap` (default: `0.25`): Overlap level between segments.
- - `--mdx_hop_length` (default: `1024`): MDX hop length during processing.
- - `--mdx_batch_size` (default: `1`): Batch size.
-
- 3. Cleaning processing:
- - `--clean_audio` (default: `False`): Whether to clean audio.
- - `--clean_strength` (default: `0.7`): Cleaning filter strength.
-
- 4. Model configuration:
- - `--model_name` (default: `HT-Normal`): Music separation model (`Main_340`, `Main_390`, `Main_406`, `Main_427`, `Main_438`, `Inst_full_292`, `Inst_HQ_1`, `Inst_HQ_2`, `Inst_HQ_3`, `Inst_HQ_4`, `Inst_HQ_5`, `Kim_Vocal_1`, `Kim_Vocal_2`, `Kim_Inst`, `Inst_187_beta`, `Inst_82_beta`, `Inst_90_beta`, `Voc_FT`, `Crowd_HQ`, `Inst_1`, `Inst_2`, `Inst_3`, `MDXNET_1_9703`, `MDXNET_2_9682`, `MDXNET_3_9662`, `Inst_Main`, `MDXNET_Main`, `MDXNET_9482`, `HT-Normal`, `HT-Tuned`, `HD_MMI`, `HT_6S`).
- - `--kara_model` (default: `Version-1`): Backing track separation model version (`Version-1`, `Version-2`).
-
- 5. Effects and post-processing:
- - `--backing` (default: `False`): Whether to separate backing vocals.
- - `--mdx_denoise` (default: `False`): Whether to use MDX denoising.
- - `--reverb` (default: `False`): Whether to separate reverb.
- - `--backing_reverb` (default: `False`): Whether to separate reverb for backing vocals.
-
- 6. Sampling frequency:
- - `--sample_rate` (default: `44100`): Output audio sampling frequency.
- """)
- quit()
-elif argv_is_allows[17] in argv:
- print("""Parameters for `--train`:
- 1. Model configuration:
- - `--model_name` (required): Model name.
- - `--rvc_version` (default: `v2`): RVC version (`v1`, `v2`).
- - `--model_author` (optional): Model author.
-
- 2. Save configuration:
- - `--save_every_epoch` (required): Number of epochs between saves.
- - `--save_only_latest` (default: `True`): Save only the latest checkpoint.
- - `--save_every_weights` (default: `True`): Save all model weights.
-
- 3. Training configuration:
- - `--total_epoch` (default: `300`): Total number of training epochs.
- - `--batch_size` (default: `8`): Batch size during training.
- - `--sample_rate` (required): Audio sampling frequency.
-
- 4. Device configuration:
- `--gpu` (default: `0`): Specify GPU to use (number or `-` if not using GPU).
- - `--cache_data_in_gpu` (default: `False`): Cache data in GPU for faster processing.
-
- 5. Advanced training configuration:
- - `--pitch_guidance` (default: `True`): Use pitch guidance.
- - `--g_pretrained_path` (default: ``): Path to pretrained G weights.
- - `--d_pretrained_path` (default: ``): Path to pretrained D weights.
- - `--vocoder` (default: `Default`): Vocoder used (`Default`, `MRF-HiFi-GAN`, `RefineGAN`).
-
- 6. Overtraining detection:
- - `--overtraining_detector` (default: `False`): Enable/disable overtraining detection.
- - `--overtraining_threshold` (default: `50`): Threshold for detecting overtraining.
-
- 7. Data processing:
- - `--cleanup` (default: `False`): Clean old training files to start training from scratch.
-
- 8. Optimization:
- - `--checkpointing` (default: `False`): Enable/disable checkpointing to save RAM.
- - `--deterministic` (default: `False`): When enabled, uses deterministic algorithms to ensure consistent results for the same input data.
- - `--benchmark` (default: `False`): When enabled, tests and selects the optimal algorithm for the hardware and specific size.
- """)
- quit()
-elif argv_is_allows[18] in argv:
- print("""Usage:
- 1. `--help_audio_effects`: Help for adding audio effects.
- 2. `--help_audioldm2`: Help for music editing.
- 3. `--help_convert`: Help for audio conversion.
- 4. `--help_create_dataset`: Help for creating training data.
- 5. `--help_create_index`: Help for creating an index.
- 6. `--help_extract`: Help for extracting training data.
- 7. `--help_preprocess`: Help for preprocessing data.
- 8. `--help_separator_music`: Help for music separation.
- 9. `--help_train`: Help for model training.
- """)
- quit()
-
-if __name__ == "__main__":
- if "--train" in argv:
- import torch.multiprocessing as mp
- mp.set_start_method("spawn")
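- # "spawn" avoids forking a process that may already hold CUDA state; the
- # default "fork" start method is unsafe with initialized CUDA on Linux.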
-
- try:
- main()
- except Exception:
- import traceback
- traceback.print_exc()
diff --git a/main/app/tabs/inference/inference.py b/main/app/tabs/inference/inference.py
deleted file mode 100644
index d0b5e2833359e8d234e9ac14fe243299f6c5ba09..0000000000000000000000000000000000000000
--- a/main/app/tabs/inference/inference.py
+++ /dev/null
@@ -1,596 +0,0 @@
-import gradio as gr
-from main.tools import huggingface
-from main.configs.config import Config
-from main.app.based.utils import *
-
-def inference_tabs():
- # Audio Conversion Tab
- with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)):
- gr.Markdown(f"## {translations['convert_audio']}")
- with gr.Row():
- gr.Markdown(translations["convert_info"])
-
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["model_accordion"], open=True):
- with gr.Row(equal_height=True):
- model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- refesh = gr.Button(translations["refesh"])
-
- with gr.Row():
- with gr.Column():
- audio_select = gr.Dropdown(label=translations["select_separate"], choices=[], value="", interactive=True, allow_custom_value=True, visible=False)
- convert_button_2 = gr.Button(translations["convert_audio"], visible=False)
- with gr.Row():
- with gr.Column():
- input0 = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
- play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Row():
- with gr.Column():
- with gr.Row():
- index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Column():
- export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
- input_audio0 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- output_audio = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
- with gr.Column():
- refesh0 = gr.Button(translations["refesh"])
- with gr.Accordion(translations["setting"], open=False):
- with gr.Row():
- cleaner0 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- use_audio = gr.Checkbox(label=translations["use_audio"], value=False, interactive=True)
- checkpointing = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- with gr.Row():
- use_original = gr.Checkbox(label=translations["convert_original"], value=False, interactive=True, visible=use_audio.value)
- convert_backing = gr.Checkbox(label=translations["convert_backing"], value=False, interactive=True, visible=use_audio.value)
- not_merge_backing = gr.Checkbox(label=translations["not_merge_backing"], value=False, interactive=True, visible=use_audio.value)
- merge_instrument = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True, visible=use_audio.value)
- with gr.Row():
- pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- clean_strength0 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner0.value)
-
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0+["hybrid"], value="rmvpe", interactive=True)
- hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method.value == "hybrid")
- hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
- with gr.Accordion(translations["f0_file"], open=False):
- upload_f0_file = gr.File(label=translations["upload_f0"], file_types=[".txt"])
- f0_file_dropdown = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
- refesh_f0_file = gr.Button(translations["refesh"])
- with gr.Accordion(translations["hubert_model"], open=False):
- embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
- with gr.Accordion(translations["use_presets"], open=False):
- with gr.Row():
- presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True)
- with gr.Row():
- load_click = gr.Button(translations["load_file"], variant="primary")
- refesh_click = gr.Button(translations["refesh"])
- with gr.Accordion(translations["export_file"], open=False):
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True)
- autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True)
- pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True)
- index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True)
- resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True)
- filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True)
- volume_envelope_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True)
- protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True)
- split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True)
- formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True)
- with gr.Row():
- with gr.Column():
- name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
- save_file_button = gr.Button(translations["export_file"])
- with gr.Row():
- upload_presets = gr.File(label=translations["upload_presets"], file_types=[".json"])
- with gr.Column():
- with gr.Row():
- split_audio = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
- formant_shifting = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
- resample_sr = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True)
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- volume_envelope = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True)
- protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- formant_qfrency = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- convert_button = gr.Button(translations["convert_audio"], variant="primary")
- gr.Markdown(translations["output_convert"])
- with gr.Row():
- main_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["main_convert"])
- backing_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_backing"], visible=convert_backing.value)
- main_backing = gr.Audio(show_download_button=True, interactive=False, label=translations["main_or_backing"], visible=convert_backing.value)
- with gr.Row():
- original_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_original"], visible=use_original.value)
- vocal_instrument = gr.Audio(show_download_button=True, interactive=False, label=translations["voice_or_instruments"], visible=merge_instrument.value)
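- # Event wiring for the conversion tab: file uploads, refresh buttons, preset
- # load/save, and visibility toggles driven by the checkboxes above.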
- with gr.Row():
- upload_f0_file.upload(fn=lambda inp: shutil.move(inp.name, os.path.join("assets", "f0")), inputs=[upload_f0_file], outputs=[f0_file_dropdown])
- refesh_f0_file.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown])
- unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[method])
- with gr.Row():
- load_click.click(
- fn=load_presets,
- inputs=[
- presets_name,
- cleaner0,
- autotune,
- pitch,
- clean_strength0,
- index_strength,
- resample_sr,
- filter_radius,
- volume_envelope,
- protect,
- split_audio,
- f0_autotune_strength,
- formant_qfrency,
- formant_timbre
- ],
- outputs=[
- cleaner0,
- autotune,
- pitch,
- clean_strength0,
- index_strength,
- resample_sr,
- filter_radius,
- volume_envelope,
- protect,
- split_audio,
- f0_autotune_strength,
- formant_shifting,
- formant_qfrency,
- formant_timbre
- ]
- )
- refesh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
- save_file_button.click(
- fn=save_presets,
- inputs=[
- name_to_save_file,
- cleaner0,
- autotune,
- pitch,
- clean_strength0,
- index_strength,
- resample_sr,
- filter_radius,
- volume_envelope,
- protect,
- split_audio,
- f0_autotune_strength,
- cleaner_chbox,
- autotune_chbox,
- pitch_chbox,
- index_strength_chbox,
- resample_sr_chbox,
- filter_radius_chbox,
- volume_envelope_chbox,
- protect_chbox,
- split_audio_chbox,
- formant_shifting_chbox,
- formant_shifting,
- formant_qfrency,
- formant_timbre
- ],
- outputs=[presets_name]
- )
- with gr.Row():
- upload_presets.upload(fn=lambda preset_in: shutil.move(preset_in.name, os.path.join("assets", "presets")), inputs=[upload_presets], outputs=[presets_name])
- autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
- use_audio.change(fn=lambda a: [visible(a), visible(a), visible(a), visible(a), visible(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), visible(not a), visible(not a), visible(not a), visible(not a)], inputs=[use_audio], outputs=[main_backing, use_original, convert_backing, not_merge_backing, merge_instrument, use_original, convert_backing, not_merge_backing, merge_instrument, input_audio0, output_audio, input0, play_audio])
- with gr.Row():
- convert_backing.change(fn=lambda a,b: [change_backing_choices(a, b), visible(a)], inputs=[convert_backing, not_merge_backing], outputs=[use_original, backing_convert])
- use_original.change(fn=lambda audio, original: [visible(original), visible(not original), visible(audio and not original), valueFalse_interactive(not original), valueFalse_interactive(not original)], inputs=[use_audio, use_original], outputs=[original_convert, main_convert, main_backing, convert_backing, not_merge_backing])
- cleaner0.change(fn=visible, inputs=[cleaner0], outputs=[clean_strength0])
- with gr.Row():
- merge_instrument.change(fn=visible, inputs=[merge_instrument], outputs=[vocal_instrument])
- not_merge_backing.change(fn=lambda audio, merge, cvb: [visible(audio and not merge), change_backing_choices(cvb, merge)], inputs=[use_audio, not_merge_backing, convert_backing], outputs=[main_backing, use_original])
- method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method, hybrid_method], outputs=[hybrid_method, hop_length])
- with gr.Row():
- hybrid_method.change(fn=hoplength_show, inputs=[method, hybrid_method], outputs=[hop_length])
- refesh.click(fn=change_models_choices, inputs=[], outputs=[model_pth, model_index])
- model_pth.change(fn=get_index, inputs=[model_pth], outputs=[model_index])
- with gr.Row():
- input0.upload(fn=lambda audio_in: shutil.move(audio_in.name, "audios"), inputs=[input0], outputs=[input_audio0])
- input_audio0.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio0], outputs=[play_audio])
- formant_shifting.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting], outputs=[formant_qfrency, formant_timbre])
- with gr.Row():
- embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[custom_embedders])
- refesh0.click(fn=change_audios_choices, inputs=[input_audio0], outputs=[input_audio0])
- model_index.change(fn=index_strength_show, inputs=[model_index], outputs=[index_strength])
- with gr.Row():
- audio_select.change(fn=lambda: visible(True), inputs=[], outputs=[convert_button_2])
- convert_button.click(fn=lambda: visible(False), inputs=[], outputs=[convert_button])
- convert_button_2.click(fn=lambda: [visible(False), visible(False)], inputs=[], outputs=[audio_select, convert_button_2])
- with gr.Row():
- convert_button.click(
- fn=convert_selection,
- inputs=[
- cleaner0,
- autotune,
- use_audio,
- use_original,
- convert_backing,
- not_merge_backing,
- merge_instrument,
- pitch,
- clean_strength0,
- model_pth,
- model_index,
- index_strength,
- input_audio0,
- output_audio,
- export_format,
- method,
- hybrid_method,
- hop_length,
- embedders,
- custom_embedders,
- resample_sr,
- filter_radius,
- volume_envelope,
- protect,
- split_audio,
- f0_autotune_strength,
- checkpointing,
- onnx_f0_mode,
- formant_shifting,
- formant_qfrency,
- formant_timbre,
- f0_file_dropdown,
- embed_mode
- ],
- outputs=[audio_select, main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button],
- api_name="convert_selection"
- )
- embed_mode.change(fn=visible_embedders, inputs=[embed_mode], outputs=[embedders])
- convert_button_2.click(
- fn=convert_audio,
- inputs=[
- cleaner0,
- autotune,
- use_audio,
- use_original,
- convert_backing,
- not_merge_backing,
- merge_instrument,
- pitch,
- clean_strength0,
- model_pth,
- model_index,
- index_strength,
- input_audio0,
- output_audio,
- export_format,
- method,
- hybrid_method,
- hop_length,
- embedders,
- custom_embedders,
- resample_sr,
- filter_radius,
- volume_envelope,
- protect,
- split_audio,
- f0_autotune_strength,
- audio_select,
- checkpointing,
- onnx_f0_mode,
- formant_shifting,
- formant_qfrency,
- formant_timbre,
- f0_file_dropdown,
- embed_mode
- ],
- outputs=[main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button],
- api_name="convert_audio"
- )
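- # Handlers registered with api_name (e.g. "convert_audio") are also exposed as
- # API endpoints. A minimal client-side sketch, assuming a default local launch
- # and positional arguments matching the inputs list above:
- #   from gradio_client import Client
- #   Client("http://127.0.0.1:7860").predict(..., api_name="/convert_audio")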
-
- # Text-to-Speech Conversion Tab
- with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)):
- gr.Markdown(translations["convert_text_markdown"])
- with gr.Row():
- gr.Markdown(translations["convert_text_markdown_2"])
- with gr.Accordion(translations["model_accordion"], open=True):
- with gr.Row(equal_height=True):
- model_pth0 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index0 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- refesh1 = gr.Button(translations["refesh"])
-
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- use_txt = gr.Checkbox(label=translations["input_txt"], value=False, interactive=True)
- google_tts_check_box = gr.Checkbox(label=translations["googletts"], value=False, interactive=True)
- prompt = gr.Textbox(label=translations["text_to_speech"], value="", placeholder="Hello World", lines=3)
- with gr.Column():
- speed = gr.Slider(label=translations["voice_speed"], info=translations["voice_speed_info"], minimum=-100, maximum=100, value=0, step=1)
- pitch0 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- with gr.Row():
- tts_button = gr.Button(translations["tts_1"], variant="primary", scale=2)
- convert_button0 = gr.Button(translations["tts_2"], variant="secondary", scale=2)
- with gr.Row():
- with gr.Column():
- txt_input = gr.File(label=translations["drop_text"], file_types=[".txt", ".srt"], visible=use_txt.value)
- tts_voice = gr.Dropdown(label=translations["voice"], choices=edgetts, interactive=True, value="vi-VN-NamMinhNeural")
- tts_pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info_2"], label=translations["pitch"], value=0, interactive=True)
- with gr.Column():
-
- with gr.Row():
- index_strength0 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index0.value != "")
- with gr.Accordion(translations["output_path"], open=False):
- export_format0 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
- output_audio0 = gr.Textbox(label=translations["output_tts"], value="audios/tts.wav", placeholder="audios/tts.wav", info=translations["tts_output"], interactive=True)
- output_audio1 = gr.Textbox(label=translations["output_tts_convert"], value="audios/tts-convert.wav", placeholder="audios/tts-convert.wav", info=translations["tts_output"], interactive=True)
- with gr.Accordion(translations["setting"], open=False):
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode1 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method3 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- method0 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0+["hybrid"], value="rmvpe", interactive=True)
- hybrid_method0 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method0.value == "hybrid")
- hop_length0 = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
- with gr.Accordion(translations["f0_file"], open=False):
- upload_f0_file0 = gr.File(label=translations["upload_f0"], file_types=[".txt"])
- f0_file_dropdown0 = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
- refesh_f0_file0 = gr.Button(translations["refesh"])
- with gr.Accordion(translations["hubert_model"], open=False):
- embed_mode1 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders0 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders0 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders0.value == "custom")
- with gr.Group():
- with gr.Row():
- formant_shifting1 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
- split_audio0 = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
- cleaner1 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- autotune3 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- checkpointing0 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- with gr.Column():
- f0_autotune_strength0 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune3.value)
- clean_strength1 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner1.value)
- resample_sr0 = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True)
- filter_radius0 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- volume_envelope0 = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True)
- protect0 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- formant_qfrency1 = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre1 = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- gr.Markdown(translations["output_tts_markdown"])
- with gr.Row():
- tts_voice_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["output_text_to_speech"])
- tts_voice_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
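- # Event wiring for the TTS tab: f0 file handling, visibility toggles, and the
- # TTS/convert_tts button callbacks below.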
- with gr.Row():
- unlock_full_method3.change(fn=unlock_f0, inputs=[unlock_full_method3], outputs=[method0])
- upload_f0_file0.upload(fn=lambda inp: shutil.move(inp.name, os.path.join("assets", "f0")), inputs=[upload_f0_file0], outputs=[f0_file_dropdown0])
- refesh_f0_file0.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown0])
- with gr.Row():
- embed_mode1.change(fn=visible_embedders, inputs=[embed_mode1], outputs=[embedders0])
- autotune3.change(fn=visible, inputs=[autotune3], outputs=[f0_autotune_strength0])
- model_pth0.change(fn=get_index, inputs=[model_pth0], outputs=[model_index0])
- with gr.Row():
- cleaner1.change(fn=visible, inputs=[cleaner1], outputs=[clean_strength1])
- method0.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method0, hybrid_method0], outputs=[hybrid_method0, hop_length0])
- hybrid_method0.change(fn=hoplength_show, inputs=[method0, hybrid_method0], outputs=[hop_length0])
- with gr.Row():
- refesh1.click(fn=change_models_choices, inputs=[], outputs=[model_pth0, model_index0])
- embedders0.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders0], outputs=[custom_embedders0])
- formant_shifting1.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting1], outputs=[formant_qfrency1, formant_timbre1])
- with gr.Row():
- model_index0.change(fn=index_strength_show, inputs=[model_index0], outputs=[index_strength0])
- txt_input.upload(fn=process_input, inputs=[txt_input], outputs=[prompt])
- use_txt.change(fn=visible, inputs=[use_txt], outputs=[txt_input])
- with gr.Row():
- google_tts_check_box.change(fn=change_tts_voice_choices, inputs=[google_tts_check_box], outputs=[tts_voice])
- tts_button.click(
- fn=TTS,
- inputs=[
- prompt,
- tts_voice,
- speed,
- output_audio0,
- tts_pitch,
- google_tts_check_box,
- txt_input
- ],
- outputs=[tts_voice_audio],
- api_name="text-to-speech"
- )
- convert_button0.click(
- fn=convert_tts,
- inputs=[
- cleaner1,
- autotune3,
- pitch0,
- clean_strength1,
- model_pth0,
- model_index0,
- index_strength0,
- output_audio0,
- output_audio1,
- export_format0,
- method0,
- hybrid_method0,
- hop_length0,
- embedders0,
- custom_embedders0,
- resample_sr0,
- filter_radius0,
- volume_envelope0,
- protect0,
- split_audio0,
- f0_autotune_strength0,
- checkpointing0,
- onnx_f0_mode1,
- formant_shifting1,
- formant_qfrency1,
- formant_timbre1,
- f0_file_dropdown0,
- embed_mode1
- ],
- outputs=[tts_voice_convert],
- api_name="convert_tts"
- )
-
- # Whisper Conversion Tab
- with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)):
- gr.Markdown(f"## {translations['convert_with_whisper']}")
- with gr.Row():
- gr.Markdown(translations["convert_with_whisper_info"])
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["model_accordion"] + " 1", open=True):
- with gr.Row(equal_height=True):
- model_pth2 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index2 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- refesh2 = gr.Button(translations["refesh"])
- with gr.Accordion(translations["model_accordion"] + " 2", open=True):
- with gr.Row(equal_height=True):
- model_pth3 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index3 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- refesh3 = gr.Button(translations["refesh"])
- with gr.Group():
- with gr.Row():
- cleaner2 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- autotune2 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- checkpointing2 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- formant_shifting2 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
- with gr.Row():
- num_spk = gr.Slider(minimum=2, maximum=8, step=1, info=translations["num_spk_info"], label=translations["num_spk"], value=2, interactive=True)
- with gr.Row():
- with gr.Column():
- convert_button3 = gr.Button(translations["convert_audio"], variant="primary")
- with gr.Row():
- with gr.Column():
- with gr.Row():
- pitch3 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- index_strength2 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index2.value != "")
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Column():
- export_format2 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
- input_audio1 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- output_audio2 = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
- with gr.Column():
- refesh4 = gr.Button(translations["refesh"])
- with gr.Row():
- input2 = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
- with gr.Column():
- with gr.Row():
- pitch4 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- index_strength3 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index3.value != "")
- with gr.Accordion(translations["setting"], open=False):
- with gr.Row():
- model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True)
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode4 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method2 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- method3 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0+["hybrid"], value="rmvpe", interactive=True)
- hybrid_method3 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method3.value == "hybrid")
- hop_length3 = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
- with gr.Accordion(translations["hubert_model"], open=False):
- embed_mode3 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders3 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders3 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders3.value == "custom")
- with gr.Column():
- clean_strength3 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner2.value)
- f0_autotune_strength3 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune2.value)
- resample_sr3 = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True)
- filter_radius3 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- volume_envelope3 = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True)
- protect3 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- formant_qfrency3 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 1", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre3 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 1", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- formant_qfrency4 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 2", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre4 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 2", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- gr.Markdown(translations["input_output"])
- with gr.Row():
- play_audio2 = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- play_audio3 = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
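- # Event wiring for the Whisper tab: refresh/index handlers for both models,
- # visibility toggles, and the convert_with_whisper callback below.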
- with gr.Row():
- autotune2.change(fn=visible, inputs=[autotune2], outputs=[f0_autotune_strength3])
- cleaner2.change(fn=visible, inputs=[cleaner2], outputs=[clean_strength3])
- method3.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method3, hybrid_method3], outputs=[hybrid_method3, hop_length3])
- with gr.Row():
- hybrid_method3.change(fn=hoplength_show, inputs=[method3, hybrid_method3], outputs=[hop_length3])
- refesh2.click(fn=change_models_choices, inputs=[], outputs=[model_pth2, model_index2])
- model_pth2.change(fn=get_index, inputs=[model_pth2], outputs=[model_index2])
- with gr.Row():
- refesh3.click(fn=change_models_choices, inputs=[], outputs=[model_pth3, model_index3])
- model_pth3.change(fn=get_index, inputs=[model_pth3], outputs=[model_index3])
- input2.upload(fn=lambda audio_in: shutil.move(audio_in.name, "audios"), inputs=[input2], outputs=[input_audio1])
- with gr.Row():
- input_audio1.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio1], outputs=[play_audio2])
- formant_shifting2.change(fn=lambda a: [visible(a)]*4, inputs=[formant_shifting2], outputs=[formant_qfrency3, formant_timbre3, formant_qfrency4, formant_timbre4])
- embedders3.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders3], outputs=[custom_embedders3])
- with gr.Row():
- refesh4.click(fn=change_audios_choices, inputs=[input_audio1], outputs=[input_audio1])
- model_index2.change(fn=index_strength_show, inputs=[model_index2], outputs=[index_strength2])
- model_index3.change(fn=index_strength_show, inputs=[model_index3], outputs=[index_strength3])
- with gr.Row():
- unlock_full_method2.change(fn=unlock_f0, inputs=[unlock_full_method2], outputs=[method3])
- embed_mode3.change(fn=visible_embedders, inputs=[embed_mode3], outputs=[embedders3])
- convert_button3.click(
- fn=convert_with_whisper,
- inputs=[
- num_spk,
- model_size,
- cleaner2,
- clean_strength3,
- autotune2,
- f0_autotune_strength3,
- checkpointing2,
- model_pth2,
- model_pth3,
- model_index2,
- model_index3,
- pitch3,
- pitch4,
- index_strength2,
- index_strength3,
- export_format2,
- input_audio1,
- output_audio2,
- onnx_f0_mode4,
- method3,
- hybrid_method3,
- hop_length3,
- embed_mode3,
- embedders3,
- custom_embedders3,
- resample_sr3,
- filter_radius3,
- volume_envelope3,
- protect3,
- formant_shifting2,
- formant_qfrency3,
- formant_timbre3,
- formant_qfrency4,
- formant_timbre4,
- ],
- outputs=[play_audio3],
- api_name="convert_with_whisper"
- )
\ No newline at end of file
diff --git a/main/app/tabs/models/model.py b/main/app/tabs/models/model.py
deleted file mode 100644
index ed441cb6e339d5e047703e0956f17c7b61c7900c..0000000000000000000000000000000000000000
--- a/main/app/tabs/models/model.py
+++ /dev/null
@@ -1,465 +0,0 @@
-import os
-import shutil
-import torch
-import gradio as gr
-from multiprocessing import cpu_count
-from main.tools import huggingface
-from main.configs.config import Config
-from main.app.based.utils import *
-
-
-def model_tabs():
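- """Build the model-management tabs: downloads, dataset creation, training, model fusion, model reading, and PyTorch-to-ONNX conversion."""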
- with gr.Tabs():
- with gr.Tab(label=translations["downloads"], visible=configs.get("downloads_tab", True)):
- gr.Markdown(translations["download_markdown"])
- with gr.Row():
- gr.Markdown(translations["download_markdown_2"])
- with gr.Row():
- with gr.Accordion(translations["model_download"], open=True):
- with gr.Row():
- downloadmodel = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["download_from_csv"], translations["search_models"], translations["upload"]], interactive=True, value=translations["download_url"])
- with gr.Row():
- gr.Markdown("___")
- with gr.Column():
- with gr.Row():
- url_input = gr.Textbox(label=translations["model_url"], value="", placeholder="https://...", scale=6)
- download_model_name = gr.Textbox(label=translations["modelname"], value="", placeholder=translations["modelname"], scale=2)
- url_download = gr.Button(value=translations["downloads"], scale=2)
- with gr.Column():
- model_browser = gr.Dropdown(choices=list(models.keys()), label=translations["model_warehouse"], scale=8, allow_custom_value=True, visible=False)
- download_from_browser = gr.Button(value=translations["get_model"], scale=2, variant="primary", visible=False)
- with gr.Column():
- search_name = gr.Textbox(label=translations["name_to_search"], placeholder=translations["modelname"], interactive=True, scale=8, visible=False)
- search = gr.Button(translations["search_2"], scale=2, visible=False)
- search_dropdown = gr.Dropdown(label=translations["select_download_model"], value="", choices=[], allow_custom_value=True, interactive=False, visible=False)
- download = gr.Button(translations["downloads"], variant="primary", visible=False)
- with gr.Column():
- model_upload = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx", ".index", ".zip"], visible=False)
- with gr.Row():
- with gr.Accordion(translations["download_pretrained_2"], open=False):
- with gr.Row():
- pretrain_download_choices = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["list_model"], translations["upload"]], value=translations["download_url"], interactive=True)
- with gr.Row():
- gr.Markdown("___")
- with gr.Column():
- with gr.Row():
- pretrainD = gr.Textbox(label=translations["pretrained_url"].format(dg="D"), value="", info=translations["only_huggingface"], placeholder="https://...", interactive=True, scale=4)
- pretrainG = gr.Textbox(label=translations["pretrained_url"].format(dg="G"), value="", info=translations["only_huggingface"], placeholder="https://...", interactive=True, scale=4)
- download_pretrain_button = gr.Button(translations["downloads"], scale=2)
- with gr.Column():
- with gr.Row():
- pretrain_choices = gr.Dropdown(label=translations["select_pretrain"], info=translations["select_pretrain_info"], choices=list(fetch_pretrained_data().keys()), value="Titan_Medium", allow_custom_value=True, interactive=True, scale=6, visible=False)
- sample_rate_pretrain = gr.Dropdown(label=translations["pretrain_sr"], info=translations["pretrain_sr"], choices=["48k", "40k", "32k"], value="48k", interactive=True, visible=False)
- download_pretrain_choices_button = gr.Button(translations["downloads"], scale=2, variant="primary", visible=False)
- with gr.Row():
- pretrain_upload_g = gr.File(label=translations["drop_pretrain"].format(dg="G"), file_types=[".pth"], visible=False)
- pretrain_upload_d = gr.File(label=translations["drop_pretrain"].format(dg="D"), file_types=[".pth"], visible=False)
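- # Event wiring for the downloads tab: switch between URL/CSV/search/upload
- # sources and handle model and pretrained-model downloads below.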
- with gr.Row():
- url_download.click(
- fn=download_model,
- inputs=[
- url_input,
- download_model_name
- ],
- outputs=[url_input],
- api_name="download_model"
- )
- download_from_browser.click(
- fn=lambda model: download_model(models[model], model),
- inputs=[model_browser],
- outputs=[model_browser],
- api_name="download_browser"
- )
- with gr.Row():
- downloadmodel.change(fn=change_download_choices, inputs=[downloadmodel], outputs=[url_input, download_model_name, url_download, model_browser, download_from_browser, search_name, search, search_dropdown, download, model_upload])
- search.click(fn=search_models, inputs=[search_name], outputs=[search_dropdown, download])
- model_upload.upload(fn=save_drop_model, inputs=[model_upload], outputs=[model_upload])
- download.click(
- fn=lambda model: download_model(model_options[model], model),
- inputs=[search_dropdown],
- outputs=[search_dropdown],
- api_name="search_models"
- )
- with gr.Row():
- pretrain_download_choices.change(fn=change_download_pretrained_choices, inputs=[pretrain_download_choices], outputs=[pretrainD, pretrainG, download_pretrain_button, pretrain_choices, sample_rate_pretrain, download_pretrain_choices_button, pretrain_upload_d, pretrain_upload_g])
- pretrain_choices.change(fn=update_sample_rate_dropdown, inputs=[pretrain_choices], outputs=[sample_rate_pretrain])
- with gr.Row():
- download_pretrain_button.click(
- fn=download_pretrained_model,
- inputs=[
- pretrain_download_choices,
- pretrainD,
- pretrainG
- ],
- outputs=[pretrainD],
- api_name="download_pretrain_link"
- )
- download_pretrain_choices_button.click(
- fn=download_pretrained_model,
- inputs=[
- pretrain_download_choices,
- pretrain_choices,
- sample_rate_pretrain
- ],
- outputs=[pretrain_choices],
- api_name="download_pretrain_choices"
- )
- pretrain_upload_g.upload(
- fn=lambda pretrain_upload_g: shutil.move(pretrain_upload_g.name, os.path.join("assets", "models", "pretrained_custom")),
- inputs=[pretrain_upload_g],
- outputs=[],
- api_name="upload_pretrain_g"
- )
- pretrain_upload_d.upload(
- fn=lambda pretrain_upload_d: shutil.move(pretrain_upload_d.name, os.path.join("assets", "models", "pretrained_custom")),
- inputs=[pretrain_upload_d],
- outputs=[],
- api_name="upload_pretrain_d"
- )
-
- with gr.Tab(label=translations["createdataset"], visible=configs.get("create_dataset_tab", True)):
- gr.Markdown(translations["create_dataset_markdown"])
- with gr.Row():
- gr.Markdown(translations["create_dataset_markdown_2"])
- with gr.Row():
- dataset_url = gr.Textbox(label=translations["url_audio"], info=translations["create_dataset_url"], value="", placeholder="https://www.youtube.com/...", interactive=True)
- output_dataset = gr.Textbox(label=translations["output_data"], info=translations["output_data_info"], value="dataset", placeholder="dataset", interactive=True)
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- separator_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True)
- denoise_mdx = gr.Checkbox(label=translations["denoise"], value=False, interactive=True)
- with gr.Row():
- kim_vocal_version = gr.Radio(label=translations["model_ver"], info=translations["model_ver_info"], choices=["Version-1", "Version-2"], value="Version-2", interactive=True)
- kim_vocal_overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
- with gr.Row():
- kim_vocal_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=8192, value=1024, step=1, interactive=True)
- kim_vocal_batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True)
- with gr.Row():
- kim_vocal_segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
- with gr.Row():
- sample_rate0 = gr.Slider(minimum=8000, maximum=96000, step=1, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True)
- with gr.Column():
- create_button = gr.Button(translations["createdataset"], variant="primary", scale=2, min_width=4000)
- with gr.Group():
- with gr.Row():
- clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- skip = gr.Checkbox(label=translations["skip"], value=False, interactive=True)
- with gr.Row():
- dataset_clean_strength = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.5, label=translations["clean_strength"], info=translations["clean_strength_info"], interactive=True, visible=clean_audio.value)
- with gr.Row():
- skip_start = gr.Textbox(label=translations["skip_start"], info=translations["skip_start_info"], value="", placeholder="0,...", interactive=True, visible=skip.value)
- skip_end = gr.Textbox(label=translations["skip_end"], info=translations["skip_end_info"], value="", placeholder="0,...", interactive=True, visible=skip.value)
- create_dataset_info = gr.Textbox(label=translations["create_dataset_info"], value="", interactive=False)
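- # Event wiring for the dataset tab: visibility toggles and the create_dataset callback below.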
- with gr.Row():
- clean_audio.change(fn=visible, inputs=[clean_audio], outputs=[dataset_clean_strength])
- skip.change(fn=lambda a: [valueEmpty_visible1(a)]*2, inputs=[skip], outputs=[skip_start, skip_end])
- with gr.Row():
- create_button.click(
- fn=create_dataset,
- inputs=[
- dataset_url,
- output_dataset,
- clean_audio,
- dataset_clean_strength,
- separator_reverb,
- kim_vocal_version,
- kim_vocal_overlap,
- kim_vocal_segments_size,
- denoise_mdx,
- skip,
- skip_start,
- skip_end,
- kim_vocal_hop_length,
- kim_vocal_batch_size,
- sample_rate0
- ],
- outputs=[create_dataset_info],
- api_name="create_dataset"
- )
-
- with gr.Tab(label=translations["training_model"], visible=configs.get("training_tab", True)):
- gr.Markdown(f"## {translations['training_model']}")
- with gr.Row():
- gr.Markdown(translations["training_markdown"])
- with gr.Row():
- with gr.Column():
- with gr.Row():
- with gr.Column():
- training_name = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
- training_sr = gr.Radio(label=translations["sample_rate"], info=translations["sample_rate_info"], choices=["32k", "40k", "48k"], value="48k", interactive=True)
- training_ver = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True)
- with gr.Row():
- clean_dataset = gr.Checkbox(label=translations["clear_dataset"], value=False, interactive=True)
- preprocess_cut = gr.Checkbox(label=translations["split_audio"], value=True, interactive=True)
- process_effects = gr.Checkbox(label=translations["preprocess_effect"], value=False, interactive=True)
- checkpointing1 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- training_f0 = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True)
- upload = gr.Checkbox(label=translations["upload_dataset"], value=False, interactive=True)
- with gr.Row():
- clean_dataset_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.7, step=0.1, interactive=True, visible=clean_dataset.value)
- with gr.Column():
- preprocess_button = gr.Button(translations["preprocess_button"], scale=2)
- upload_dataset = gr.Files(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"], visible=upload.value)
- preprocess_info = gr.Textbox(label=translations["preprocess_info"], value="", interactive=False)
- with gr.Column():
- with gr.Row():
- with gr.Column():
- with gr.Accordion(label=translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode2 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method4 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- extract_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
- extract_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
- with gr.Accordion(label=translations["hubert_model"], open=False):
- with gr.Group():
- embed_mode2 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- extract_embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- with gr.Row():
- extract_embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=extract_embedders.value == "custom")
- with gr.Column():
- extract_button = gr.Button(translations["extract_button"], scale=2)
- extract_info = gr.Textbox(label=translations["extract_info"], value="", interactive=False)
- with gr.Column():
- with gr.Row():
- with gr.Column():
- total_epochs = gr.Slider(label=translations["total_epoch"], info=translations["total_epoch_info"], minimum=1, maximum=10000, value=300, step=1, interactive=True)
- save_epochs = gr.Slider(label=translations["save_epoch"], info=translations["save_epoch_info"], minimum=1, maximum=10000, value=50, step=1, interactive=True)
- with gr.Column():
- with gr.Row():
- index_button = gr.Button(f"3. {translations['create_index']}", variant="primary", scale=2)
- training_button = gr.Button(f"4. {translations['training_model']}", variant="primary", scale=2)
- with gr.Row():
- with gr.Accordion(label=translations["setting"], open=False):
- with gr.Row():
- index_algorithm = gr.Radio(label=translations["index_algorithm"], info=translations["index_algorithm_info"], choices=["Auto", "Faiss", "KMeans"], value="Auto", interactive=True)
- with gr.Row():
- custom_dataset = gr.Checkbox(label=translations["custom_dataset"], info=translations["custom_dataset_info"], value=False, interactive=True)
- overtraining_detector = gr.Checkbox(label=translations["overtraining_detector"], info=translations["overtraining_detector_info"], value=False, interactive=True)
- clean_up = gr.Checkbox(label=translations["cleanup_training"], info=translations["cleanup_training_info"], value=False, interactive=True)
- cache_in_gpu = gr.Checkbox(label=translations["cache_in_gpu"], info=translations["cache_in_gpu_info"], value=False, interactive=True)
- with gr.Column():
- dataset_path = gr.Textbox(label=translations["dataset_folder"], value="dataset", interactive=True, visible=custom_dataset.value)
- with gr.Column():
- threshold = gr.Slider(minimum=1, maximum=100, value=50, step=1, label=translations["threshold"], interactive=True, visible=overtraining_detector.value)
- with gr.Accordion(translations["setting_cpu_gpu"], open=False):
- with gr.Column():
- gpu_number = gr.Textbox(label=translations["gpu_number"], value="-".join(map(str, range(torch.cuda.device_count()))) if torch.cuda.is_available() else "-", info=translations["gpu_number_info"], interactive=True)
- gpu_info = gr.Textbox(label=translations["gpu_info"], value=get_gpu_info(), info=translations["gpu_info_2"], interactive=False)
- cpu_core = gr.Slider(label=translations["cpu_core"], info=translations["cpu_core_info"], minimum=0, maximum=cpu_count(), value=cpu_count(), step=1, interactive=True)
- train_batch_size = gr.Slider(label=translations["batch_size"], info=translations["batch_size_info"], minimum=1, maximum=64, value=8, step=1, interactive=True)
- with gr.Row():
- save_only_latest = gr.Checkbox(label=translations["save_only_latest"], info=translations["save_only_latest_info"], value=True, interactive=True)
- save_every_weights = gr.Checkbox(label=translations["save_every_weights"], info=translations["save_every_weights_info"], value=True, interactive=True)
- not_use_pretrain = gr.Checkbox(label=translations["not_use_pretrain_2"], info=translations["not_use_pretrain_info"], value=False, interactive=True)
- custom_pretrain = gr.Checkbox(label=translations["custom_pretrain"], info=translations["custom_pretrain_info"], value=False, interactive=True)
- with gr.Row():
- vocoders = gr.Radio(label=translations["vocoder"], info=translations["vocoder_info"], choices=["Default", "MRF-HiFi-GAN", "RefineGAN"], value="Default", interactive=True)
- with gr.Row():
- deterministic = gr.Checkbox(label=translations["deterministic"], info=translations["deterministic_info"], value=False, interactive=True)
- benchmark = gr.Checkbox(label=translations["benchmark"], info=translations["benchmark_info"], value=False, interactive=True)
- with gr.Row():
- model_author = gr.Textbox(label=translations["training_author"], info=translations["training_author_info"], value="", placeholder=translations["training_author"], interactive=True)
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["custom_pretrain_info"], open=False, visible=custom_pretrain.value and not not_use_pretrain.value) as pretrain_setting:
- pretrained_D = gr.Dropdown(label=translations["pretrain_file"].format(dg="D"), choices=pretrainedD, value=pretrainedD[0] if len(pretrainedD) > 0 else '', interactive=True, allow_custom_value=True)
- pretrained_G = gr.Dropdown(label=translations["pretrain_file"].format(dg="G"), choices=pretrainedG, value=pretrainedG[0] if len(pretrainedG) > 0 else '', interactive=True, allow_custom_value=True)
- refesh_pretrain = gr.Button(translations["refesh"], scale=2)
- with gr.Row():
- training_info = gr.Textbox(label=translations["train_info"], value="", interactive=False)
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["export_model"], open=False):
- with gr.Row():
- model_file = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- index_file = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- with gr.Row():
- refesh_file = gr.Button(f"1. {translations['refesh']}", scale=2)
- zip_model = gr.Button(translations["zip_model"], variant="primary", scale=2)
- with gr.Row():
- zip_output = gr.File(label=translations["output_zip"], file_types=[".zip"], interactive=False, visible=False)
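- # Event wiring for the training tab: vocoder/pitch-guidance locks, dataset and
- # pretrained handling, and the preprocess -> extract -> index -> train pipeline.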
- with gr.Row():
- vocoders.change(fn=pitch_guidance_lock, inputs=[vocoders], outputs=[training_f0])
- training_f0.change(fn=vocoders_lock, inputs=[training_f0, vocoders], outputs=[vocoders])
- unlock_full_method4.change(fn=unlock_f0, inputs=[unlock_full_method4], outputs=[extract_method])
- with gr.Row():
- refesh_file.click(fn=change_models_choices, inputs=[], outputs=[model_file, index_file])
- zip_model.click(fn=zip_file, inputs=[training_name, model_file, index_file], outputs=[zip_output])
- dataset_path.change(fn=lambda folder: os.makedirs(folder, exist_ok=True) if folder else None, inputs=[dataset_path], outputs=[])
- with gr.Row():
- upload.change(fn=visible, inputs=[upload], outputs=[upload_dataset])
- overtraining_detector.change(fn=visible, inputs=[overtraining_detector], outputs=[threshold])
- clean_dataset.change(fn=visible, inputs=[clean_dataset], outputs=[clean_dataset_strength])
- with gr.Row():
- custom_dataset.change(fn=lambda custom_dataset: [visible(custom_dataset), "dataset"], inputs=[custom_dataset], outputs=[dataset_path, dataset_path])
- training_ver.change(fn=unlock_vocoder, inputs=[training_ver, vocoders], outputs=[vocoders])
- vocoders.change(fn=unlock_ver, inputs=[training_ver, vocoders], outputs=[training_ver])
- upload_dataset.upload(
- fn=lambda files, folder: [shutil.move(f.name, os.path.join(folder, os.path.split(f.name)[1])) for f in files] if folder != "" else gr_warning(translations["dataset_folder1"]),
- inputs=[upload_dataset, dataset_path],
- outputs=[],
- api_name="upload_dataset"
- )
- with gr.Row():
- not_use_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
- custom_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
- refesh_pretrain.click(fn=change_pretrained_choices, inputs=[], outputs=[pretrained_D, pretrained_G])
- with gr.Row():
- preprocess_button.click(
- fn=preprocess,
- inputs=[
- training_name,
- training_sr,
- cpu_core,
- preprocess_cut,
- process_effects,
- dataset_path,
- clean_dataset,
- clean_dataset_strength
- ],
- outputs=[preprocess_info],
- api_name="preprocess"
- )
- with gr.Row():
- embed_mode2.change(fn=visible_embedders, inputs=[embed_mode2], outputs=[extract_embedders])
- extract_method.change(fn=hoplength_show, inputs=[extract_method], outputs=[extract_hop_length])
- extract_embedders.change(fn=lambda extract_embedders: visible(extract_embedders == "custom"), inputs=[extract_embedders], outputs=[extract_embedders_custom])
- with gr.Row():
- extract_button.click(
- fn=extract,
- inputs=[
- training_name,
- training_ver,
- extract_method,
- training_f0,
- extract_hop_length,
- cpu_core,
- gpu_number,
- training_sr,
- extract_embedders,
- extract_embedders_custom,
- onnx_f0_mode2,
- embed_mode2
- ],
- outputs=[extract_info],
- api_name="extract"
- )
- with gr.Row():
- index_button.click(
- fn=create_index,
- inputs=[
- training_name,
- training_ver,
- index_algorithm
- ],
- outputs=[training_info],
- api_name="create_index"
- )
- with gr.Row():
- training_button.click(
- fn=training,
- inputs=[
- training_name,
- training_ver,
- save_epochs,
- save_only_latest,
- save_every_weights,
- total_epochs,
- training_sr,
- train_batch_size,
- gpu_number,
- training_f0,
- not_use_pretrain,
- custom_pretrain,
- pretrained_G,
- pretrained_D,
- overtraining_detector,
- threshold,
- clean_up,
- cache_in_gpu,
- model_author,
- vocoders,
- checkpointing1,
- deterministic,
- benchmark
- ],
- outputs=[training_info],
- api_name="training_model"
- )
-
- with gr.Tab(label=translations["fushion"], visible=configs.get("fushion_tab", True)):
- gr.Markdown(translations["fushion_markdown"])
- with gr.Row():
- gr.Markdown(translations["fushion_markdown_2"])
- with gr.Row():
- name_to_save = gr.Textbox(label=translations["modelname"], placeholder="Model.pth", value="", max_lines=1, interactive=True)
- with gr.Row():
- fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4)
- with gr.Column():
- with gr.Row():
- model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"])
- model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"])
- with gr.Row():
- model_path_a = gr.Textbox(label=f"{translations['model_path']} 1", value="", placeholder="assets/weights/Model_1.pth")
- model_path_b = gr.Textbox(label=f"{translations['model_path']} 2", value="", placeholder="assets/weights/Model_2.pth")
- with gr.Row():
- ratio = gr.Slider(minimum=0, maximum=1, label=translations["model_ratio"], info=translations["model_ratio_info"], value=0.5, interactive=True)
- with gr.Row():
- output_model = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False)
- with gr.Row():
- model_a.upload(fn=lambda model: shutil.move(model.name, os.path.join("assets", "weights")), inputs=[model_a], outputs=[model_path_a])
- model_b.upload(fn=lambda model: shutil.move(model.name, os.path.join("assets", "weights")), inputs=[model_b], outputs=[model_path_b])
- with gr.Row():
- fushion_button.click(
- fn=fushion_model,
- inputs=[
- name_to_save,
- model_path_a,
- model_path_b,
- ratio
- ],
- outputs=[name_to_save, output_model],
- api_name="fushion_model"
- )
- fushion_button.click(fn=lambda: visible(True), inputs=[], outputs=[output_model])
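fushion_model itself is defined outside this diff. As a rough illustration of what the ratio slider controls, fusing two voice models is typically a convex blend of every tensor the two checkpoints share:

    # Illustrative sketch only -- not the fushion_model implementation. Assumes RVC-style
    # .pth checkpoints that keep their tensors under a "weight" sub-dict (an assumption).
    import torch

    def blend_checkpoints(path_a, path_b, ratio=0.5):
        a = torch.load(path_a, map_location="cpu")["weight"]
        b = torch.load(path_b, map_location="cpu")["weight"]
        # ratio=1.0 keeps model A, ratio=0.0 keeps model B, matching the slider's 0..1 range
        return {k: ratio * a[k] + (1.0 - ratio) * b[k] for k in a if k in b and a[k].shape == b[k].shape}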
-
- with gr.Tab(label=translations["read_model"], visible=configs.get("read_tab", True)):
- gr.Markdown(translations["read_model_markdown"])
- with gr.Row():
- gr.Markdown(translations["read_model_markdown_2"])
- with gr.Row():
- model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"])
- with gr.Row():
- read_button = gr.Button(translations["readmodel"], variant="primary", scale=2)
- with gr.Column():
- model_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True)
- output_info = gr.Textbox(label=translations["modelinfo"], value="", interactive=False, scale=6)
- with gr.Row():
- model.upload(fn=lambda model: shutil.move(model.name, os.path.join("assets", "weights")), inputs=[model], outputs=[model_path])
- read_button.click(
- fn=model_info,
- inputs=[model_path],
- outputs=[output_info],
- api_name="read_model"
- )
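model_info is likewise defined elsewhere; in practice, reading an RVC-style checkpoint amounts to loading it on CPU and surfacing everything except the weight tensors. A sketch (the checkpoint layout is an assumption, not confirmed by this diff):

    import torch

    ckpt = torch.load("assets/weights/Model.pth", map_location="cpu")
    # show non-tensor metadata such as sample rate, f0 flag or author, if present
    print({k: v for k, v in ckpt.items() if k != "weight"})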
-
- with gr.Tab(label=translations["convert_model"], visible=configs.get("onnx_tab", True)):
- gr.Markdown(translations["pytorch2onnx"])
- with gr.Row():
- gr.Markdown(translations["pytorch2onnx_markdown"])
- with gr.Row():
- model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"])
- with gr.Row():
- convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2)
- with gr.Row():
- model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True)
- with gr.Row():
- output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False)
- with gr.Row():
- model_pth_upload.upload(fn=lambda model_pth_upload: shutil.move(model_pth_upload.name, os.path.join("assets", "weights")), inputs=[model_pth_upload], outputs=[model_pth_path])
- convert_onnx.click(
- fn=onnx_export,
- inputs=[model_pth_path],
- outputs=[output_model2, output_info],
- api_name="model_onnx_export"
- )
- convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2])
\ No newline at end of file
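onnx_export is not part of this diff either; the standard PyTorch route it presumably wraps is torch.onnx.export over a dummy forward pass. A minimal sketch with a placeholder module (the real tab converts the loaded synthesizer, not a Linear layer):

    import torch

    model = torch.nn.Linear(256, 256).eval()   # stand-in for the loaded .pth model
    dummy = torch.randn(1, 256)                # dummy input used to trace one forward pass
    torch.onnx.export(model, dummy, "assets/weights/Model.onnx", opset_version=17)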
diff --git a/main/app/tabs/utils/utils.py b/main/app/tabs/utils/utils.py
deleted file mode 100644
index a39b9c54b8586ca9f9c8d978cc8eb93466b5f80f..0000000000000000000000000000000000000000
--- a/main/app/tabs/utils/utils.py
+++ /dev/null
@@ -1,305 +0,0 @@
-import gradio as gr
-from main.tools import huggingface
-from main.configs.config import Config
-from main.app.based.utils import *
-
-def utils_tabs():
- with gr.TabItem("utils"):
- with gr.Tabs():
- with gr.TabItem(translations["audio_editing"], visible=False):
- gr.Markdown(translations["audio_editing_info"])
- with gr.Row():
- gr.Markdown(translations["audio_editing_markdown"])
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- save_compute = gr.Checkbox(label=translations["save_compute"], value=True, interactive=True)
- tar_prompt = gr.Textbox(label=translations["target_prompt"], info=translations["target_prompt_info"], placeholder="Piano and violin cover", lines=5, interactive=True)
- with gr.Column():
- cfg_scale_src = gr.Slider(value=3, minimum=0.5, maximum=25, label=translations["cfg_scale_src"], info=translations["cfg_scale_src_info"], interactive=True)
- cfg_scale_tar = gr.Slider(value=12, minimum=0.5, maximum=25, label=translations["cfg_scale_tar"], info=translations["cfg_scale_tar_info"], interactive=True)
- with gr.Row():
- edit_button = gr.Button(translations["editing"], variant="primary")
- with gr.Row():
- with gr.Column():
- drop_audio_file = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
- display_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Column():
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Column():
- export_audio_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
- input_audiopath = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- output_audiopath = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
- with gr.Column():
- refesh_audio = gr.Button(translations["refesh"])
- with gr.Accordion(translations["setting"], open=False):
- audioldm2_model = gr.Radio(label=translations["audioldm2_model"], info=translations["audioldm2_model_info"], choices=["audioldm2", "audioldm2-large", "audioldm2-music"], value="audioldm2-music", interactive=True)
- with gr.Row():
- src_prompt = gr.Textbox(label=translations["source_prompt"], lines=2, interactive=True, info=translations["source_prompt_info"], placeholder="A recording of a happy upbeat classical music piece")
- with gr.Row():
- with gr.Column():
- audioldm2_sample_rate = gr.Slider(minimum=8000, maximum=96000, label=translations["sr"], info=translations["sr_info"], value=44100, step=1, interactive=True)
- t_start = gr.Slider(minimum=15, maximum=85, value=45, step=1, label=translations["t_start"], interactive=True, info=translations["t_start_info"])
- steps = gr.Slider(value=50, step=1, minimum=10, maximum=300, label=translations["steps_label"], info=translations["steps_info"], interactive=True)
- with gr.Row():
- gr.Markdown(translations["output_audio"])
- with gr.Row():
- output_audioldm2 = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
- with gr.Row():
- refesh_audio.click(fn=change_audios_choices, inputs=[input_audiopath], outputs=[input_audiopath])
- drop_audio_file.upload(fn=lambda audio_in: shutil.move(audio_in.name, "audios"), inputs=[drop_audio_file], outputs=[input_audiopath])
- input_audiopath.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audiopath], outputs=[display_audio])
- with gr.Row():
- edit_button.click(
- fn=run_audioldm2,
- inputs=[
- input_audiopath,
- output_audiopath,
- export_audio_format,
- audioldm2_sample_rate,
- audioldm2_model,
- src_prompt,
- tar_prompt,
- steps,
- cfg_scale_src,
- cfg_scale_tar,
- t_start,
- save_compute
- ],
- outputs=[output_audioldm2],
- api_name="audioldm2"
- )
-
- with gr.TabItem(translations["audio_effects"], visible=configs.get("effects_tab", True)):
- gr.Markdown(translations["apply_audio_effects"])
- with gr.Row():
- gr.Markdown(translations["audio_effects_edit"])
- with gr.Row():
- with gr.Column():
- with gr.Row():
- reverb_check_box = gr.Checkbox(label=translations["reverb"], value=False, interactive=True)
- chorus_check_box = gr.Checkbox(label=translations["chorus"], value=False, interactive=True)
- delay_check_box = gr.Checkbox(label=translations["delay"], value=False, interactive=True)
- phaser_check_box = gr.Checkbox(label=translations["phaser"], value=False, interactive=True)
- compressor_check_box = gr.Checkbox(label=translations["compressor"], value=False, interactive=True)
- more_options = gr.Checkbox(label=translations["more_option"], value=False, interactive=True)
- with gr.Row():
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Row():
- upload_audio = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
- with gr.Row():
- audio_in_path = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True)
- audio_out_path = gr.Textbox(label=translations["output_audio"], value="audios/audio_effects.wav", placeholder="audios/audio_effects.wav", info=translations["provide_output"], interactive=True)
- with gr.Row():
- with gr.Column():
- audio_combination = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True)
- audio_combination_input = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True, visible=audio_combination.value)
- with gr.Row():
- audio_effects_refesh = gr.Button(translations["refesh"])
- with gr.Row():
- audio_output_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
- with gr.Row():
- apply_effects_button = gr.Button(translations["apply"], variant="primary", scale=2)
- with gr.Row():
- with gr.Column():
- with gr.Row():
- with gr.Accordion(translations["reverb"], open=False, visible=reverb_check_box.value) as reverb_accordion:
- reverb_freeze_mode = gr.Checkbox(label=translations["reverb_freeze"], info=translations["reverb_freeze_info"], value=False, interactive=True)
- reverb_room_size = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.15, label=translations["room_size"], info=translations["room_size_info"], interactive=True)
- reverb_damping = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.7, label=translations["damping"], info=translations["damping_info"], interactive=True)
- reverb_wet_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label=translations["wet_level"], info=translations["wet_level_info"], interactive=True)
- reverb_dry_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.8, label=translations["dry_level"], info=translations["dry_level_info"], interactive=True)
- reverb_width = gr.Slider(minimum=0, maximum=1, step=0.01, value=1, label=translations["width"], info=translations["width_info"], interactive=True)
- with gr.Row():
- with gr.Accordion(translations["chorus"], open=False, visible=chorus_check_box.value) as chorus_accordion:
- chorus_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_depth"], info=translations["chorus_depth_info"], interactive=True)
- chorus_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.5, label=translations["chorus_rate_hz"], info=translations["chorus_rate_hz_info"], interactive=True)
- chorus_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_mix"], info=translations["chorus_mix_info"], interactive=True)
- chorus_centre_delay_ms = gr.Slider(minimum=0, maximum=50, step=1, value=10, label=translations["chorus_centre_delay_ms"], info=translations["chorus_centre_delay_ms_info"], interactive=True)
- chorus_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["chorus_feedback"], info=translations["chorus_feedback_info"], interactive=True)
- with gr.Row():
- with gr.Accordion(translations["delay"], open=False, visible=delay_check_box.value) as delay_accordion:
- delay_second = gr.Slider(minimum=0, maximum=5, step=0.01, value=0.5, label=translations["delay_seconds"], info=translations["delay_seconds_info"], interactive=True)
- delay_feedback = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_feedback"], info=translations["delay_feedback_info"], interactive=True)
- delay_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_mix"], info=translations["delay_mix_info"], interactive=True)
- with gr.Column():
- with gr.Row():
- with gr.Accordion(translations["more_option"], open=False, visible=more_options.value) as more_accordion:
- with gr.Row():
- fade = gr.Checkbox(label=translations["fade"], value=False, interactive=True)
- bass_or_treble = gr.Checkbox(label=translations["bass_or_treble"], value=False, interactive=True)
- limiter = gr.Checkbox(label=translations["limiter"], value=False, interactive=True)
- resample_checkbox = gr.Checkbox(label=translations["resample"], value=False, interactive=True)
- with gr.Row():
- distortion_checkbox = gr.Checkbox(label=translations["distortion"], value=False, interactive=True)
- gain_checkbox = gr.Checkbox(label=translations["gain"], value=False, interactive=True)
- bitcrush_checkbox = gr.Checkbox(label=translations["bitcrush"], value=False, interactive=True)
- clipping_checkbox = gr.Checkbox(label=translations["clipping"], value=False, interactive=True)
- with gr.Accordion(translations["fade"], open=True, visible=fade.value) as fade_accordion:
- with gr.Row():
- fade_in = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_in"], info=translations["fade_in_info"], interactive=True)
- fade_out = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_out"], info=translations["fade_out_info"], interactive=True)
- with gr.Accordion(translations["bass_or_treble"], open=True, visible=bass_or_treble.value) as bass_treble_accordion:
- with gr.Row():
- bass_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["bass_boost"], info=translations["bass_boost_info"], interactive=True)
- bass_frequency = gr.Slider(minimum=20, maximum=200, step=10, value=100, label=translations["bass_frequency"], info=translations["bass_frequency_info"], interactive=True)
- with gr.Row():
- treble_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["treble_boost"], info=translations["treble_boost_info"], interactive=True)
- treble_frequency = gr.Slider(minimum=1000, maximum=10000, step=500, value=3000, label=translations["treble_frequency"], info=translations["treble_frequency_info"], interactive=True)
- with gr.Accordion(translations["limiter"], open=True, visible=limiter.value) as limiter_accordion:
- with gr.Row():
- limiter_threashold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["limiter_threashold_db"], info=translations["limiter_threashold_db_info"], interactive=True)
- limiter_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["limiter_release_ms"], info=translations["limiter_release_ms_info"], interactive=True)
- with gr.Column():
- pitch_shift_semitones = gr.Slider(minimum=-20, maximum=20, step=1, value=0, label=translations["pitch"], info=translations["pitch_info"], interactive=True)
- audio_effect_resample_sr = gr.Slider(minimum=0, maximum=96000, step=1, value=0, label=translations["resample"], info=translations["resample_info"], interactive=True, visible=resample_checkbox.value)
- distortion_drive_db = gr.Slider(minimum=0, maximum=50, step=1, value=20, label=translations["distortion"], info=translations["distortion_info"], interactive=True, visible=distortion_checkbox.value)
- gain_db = gr.Slider(minimum=-60, maximum=60, step=1, value=0, label=translations["gain"], info=translations["gain_info"], interactive=True, visible=gain_checkbox.value)
- clipping_threashold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["clipping_threashold_db"], info=translations["clipping_threashold_db_info"], interactive=True, visible=clipping_checkbox.value)
- bitcrush_bit_depth = gr.Slider(minimum=1, maximum=24, step=1, value=16, label=translations["bitcrush_bit_depth"], info=translations["bitcrush_bit_depth_info"], interactive=True, visible=bitcrush_checkbox.value)
- with gr.Row():
- with gr.Accordion(translations["phaser"], open=False, visible=phaser_check_box.value) as phaser_accordion:
- phaser_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_depth"], info=translations["phaser_depth_info"], interactive=True)
- phaser_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1, label=translations["phaser_rate_hz"], info=translations["phaser_rate_hz_info"], interactive=True)
- phaser_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_mix"], info=translations["phaser_mix_info"], interactive=True)
- phaser_centre_frequency_hz = gr.Slider(minimum=50, maximum=5000, step=10, value=1000, label=translations["phaser_centre_frequency_hz"], info=translations["phaser_centre_frequency_hz_info"], interactive=True)
- phaser_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["phaser_feedback"], info=translations["phaser_feedback_info"], interactive=True)
- with gr.Row():
- with gr.Accordion(translations["compressor"], open=False, visible=compressor_check_box.value) as compressor_accordion:
- compressor_threashold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-20, label=translations["compressor_threashold_db"], info=translations["compressor_threashold_db_info"], interactive=True)
- compressor_ratio = gr.Slider(minimum=1, maximum=20, step=0.1, value=1, label=translations["compressor_ratio"], info=translations["compressor_ratio_info"], interactive=True)
- compressor_attack_ms = gr.Slider(minimum=0.1, maximum=100, step=0.1, value=10, label=translations["compressor_attack_ms"], info=translations["compressor_attack_ms_info"], interactive=True)
- compressor_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["compressor_release_ms"], info=translations["compressor_release_ms_info"], interactive=True)
- with gr.Row():
- gr.Markdown(translations["output_audio"])
- with gr.Row():
- audio_play_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- audio_play_output = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
- with gr.Row():
- reverb_check_box.change(fn=visible, inputs=[reverb_check_box], outputs=[reverb_accordion])
- chorus_check_box.change(fn=visible, inputs=[chorus_check_box], outputs=[chorus_accordion])
- delay_check_box.change(fn=visible, inputs=[delay_check_box], outputs=[delay_accordion])
- with gr.Row():
- compressor_check_box.change(fn=visible, inputs=[compressor_check_box], outputs=[compressor_accordion])
- phaser_check_box.change(fn=visible, inputs=[phaser_check_box], outputs=[phaser_accordion])
- more_options.change(fn=visible, inputs=[more_options], outputs=[more_accordion])
- with gr.Row():
- fade.change(fn=visible, inputs=[fade], outputs=[fade_accordion])
- bass_or_treble.change(fn=visible, inputs=[bass_or_treble], outputs=[bass_treble_accordion])
- limiter.change(fn=visible, inputs=[limiter], outputs=[limiter_accordion])
- resample_checkbox.change(fn=visible, inputs=[resample_checkbox], outputs=[audio_effect_resample_sr])
- with gr.Row():
- distortion_checkbox.change(fn=visible, inputs=[distortion_checkbox], outputs=[distortion_drive_db])
- gain_checkbox.change(fn=visible, inputs=[gain_checkbox], outputs=[gain_db])
- clipping_checkbox.change(fn=visible, inputs=[clipping_checkbox], outputs=[clipping_threashold_db])
- bitcrush_checkbox.change(fn=visible, inputs=[bitcrush_checkbox], outputs=[bitcrush_bit_depth])
- with gr.Row():
- upload_audio.upload(fn=lambda audio_in: shutil.move(audio_in.name, "audios"), inputs=[upload_audio], outputs=[audio_in_path])
- audio_in_path.change(fn=lambda audio: audio if audio and os.path.isfile(audio) else None, inputs=[audio_in_path], outputs=[audio_play_input])
- audio_effects_refesh.click(fn=lambda a, b: [change_audios_choices(a), change_audios_choices(b)], inputs=[audio_in_path, audio_combination_input], outputs=[audio_in_path, audio_combination_input])
- with gr.Row():
- more_options.change(fn=lambda: [False]*8, inputs=[], outputs=[fade, bass_or_treble, limiter, resample_checkbox, distortion_checkbox, gain_checkbox, clipping_checkbox, bitcrush_checkbox])
- audio_combination.change(fn=visible, inputs=[audio_combination], outputs=[audio_combination_input])
- with gr.Row():
- apply_effects_button.click(
- fn=audio_effects,
- inputs=[
- audio_in_path,
- audio_out_path,
- resample_checkbox,
- audio_effect_resample_sr,
- chorus_depth,
- chorus_rate_hz,
- chorus_mix,
- chorus_centre_delay_ms,
- chorus_feedback,
- distortion_drive_db,
- reverb_room_size,
- reverb_damping,
- reverb_wet_level,
- reverb_dry_level,
- reverb_width,
- reverb_freeze_mode,
- pitch_shift_semitones,
- delay_second,
- delay_feedback,
- delay_mix,
- compressor_threashold_db,
- compressor_ratio,
- compressor_attack_ms,
- compressor_release_ms,
- limiter_threashold_db,
- limiter_release_ms,
- gain_db,
- bitcrush_bit_depth,
- clipping_threashold_db,
- phaser_rate_hz,
- phaser_depth,
- phaser_centre_frequency_hz,
- phaser_feedback,
- phaser_mix,
- bass_boost,
- bass_frequency,
- treble_boost,
- treble_frequency,
- fade_in,
- fade_out,
- audio_output_format,
- chorus_check_box,
- distortion_checkbox,
- reverb_check_box,
- delay_check_box,
- compressor_check_box,
- limiter,
- gain_checkbox,
- bitcrush_checkbox,
- clipping_checkbox,
- phaser_check_box,
- bass_or_treble,
- fade,
- audio_combination,
- audio_combination_input
- ],
- outputs=[audio_play_output],
- api_name="audio_effects"
- )
-
- with gr.TabItem(translations["f0_extractor_tab"], visible=configs.get("f0_extractor_tab", True)):
- gr.Markdown(translations["f0_extractor_markdown"])
- with gr.Row():
- gr.Markdown(translations["f0_extractor_markdown_2"])
- with gr.Row():
- extractor_button = gr.Button(translations["extract_button"].replace("2. ", ""), variant="primary")
- with gr.Row():
- with gr.Column():
- upload_audio_file = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
- audioplay = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Column():
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
- with gr.Accordion(translations["audio_path"], open=True):
- input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
- refesh_audio_button = gr.Button(translations["refesh"])
- with gr.Row():
- gr.Markdown("___")
- with gr.Row():
- file_output = gr.File(label="", file_types=[".txt"], interactive=False)
- image_output = gr.Image(label="", interactive=False, show_download_button=True)
- with gr.Row():
- upload_audio_file.upload(fn=lambda audio_in: shutil.move(audio_in.name, "audios"), inputs=[upload_audio_file], outputs=[input_audio_path])
- input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay])
- refesh_audio_button.click(fn=change_audios_choices, inputs=[input_audio_path], outputs=[input_audio_path])
- with gr.Row():
- extractor_button.click(
- fn=f0_extract,
- inputs=[
- input_audio_path,
- f0_method_extract,
- onnx_f0_mode3
- ],
- outputs=[file_output, image_output],
- api_name="f0_extract"
- )
\ No newline at end of file
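The rmvpe-style extractors behind f0_extract are not in this diff. As a self-contained stand-in showing what an f0 curve extraction produces, librosa's pyin yields one pitch value per frame:

    # Stand-in illustration only; the tab's method_f0 choices (rmvpe etc.) live elsewhere.
    import librosa

    y, sr = librosa.load("audios/input.wav", sr=None)
    f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=librosa.note_to_hz("C2"),
                                                fmax=librosa.note_to_hz("C7"), sr=sr)
    # f0 is per-frame pitch in Hz, NaN wherever the frame is judged unvoiced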
diff --git a/main/app/tensorboard.py b/main/app/tensorboard.py
deleted file mode 100644
index 5248a21c3eb3df542bc378aa4e8c7302daacfd6b..0000000000000000000000000000000000000000
--- a/main/app/tensorboard.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import os
-import sys
-import json
-import logging
-import webbrowser
-
-from tensorboard import program
-
-sys.path.append(os.getcwd())
-
-from main.configs.config import Config
-translations = Config().translations
-
-with open(os.path.join("main", "configs", "config.json"), "r") as f:
- configs = json.load(f)
-
-def launch_tensorboard():
- for l in ["root", "tensorboard"]:
- logging.getLogger(l).setLevel(logging.ERROR)
-
- tb = program.TensorBoard()
- tb.configure(argv=[None, "--logdir", "assets/logs", f"--port={configs['tensorboard_port']}"])
- url = tb.launch()
-
- print(f"{translations['tensorboard_url']}: {url}")
- if "--open" in sys.argv: webbrowser.open(url)
-
- return f"{translations['tensorboard_url']}: {url}"
-
-if __name__ == "__main__": launch_tensorboard()
\ No newline at end of file
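The script reads tensorboard_port from config.json (6870 below) and recognizes a single optional flag:

    python main/app/tensorboard.py          # prints the TensorBoard URL for assets/logs
    python main/app/tensorboard.py --open   # also opens that URL in the default browser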
diff --git a/main/configs/config.json b/main/configs/config.json
deleted file mode 100644
index 457b93ccb4b895251413f92612bdf59cf14d032f..0000000000000000000000000000000000000000
--- a/main/configs/config.json
+++ /dev/null
@@ -1,549 +0,0 @@
-{
- "language": "en-US",
- "support_language": [
- "en-US",
- "id_Id",
- "ja-JP",
- "vi-VN"
- ],
- "theme": "gradio/default",
- "themes": [
- "NoCrypt/miku",
- "gstaff/xkcd",
- "JohnSmith9982/small_and_pretty",
- "ParityError/Interstellar",
- "earneleh/paris",
- "shivi/calm_seafoam",
- "Hev832/Applio",
- "YTheme/Minecraft",
- "gstaff/sketch",
- "SebastianBravo/simci_css",
- "allenai/gradio-theme",
- "Nymbo/Nymbo_Theme_5",
- "lone17/kotaemon",
- "Zarkel/IBM_Carbon_Theme",
- "SherlockRamos/Feliz",
- "freddyaboulton/dracula_revamped",
- "freddyaboulton/bad-theme-space",
- "gradio/dracula_revamped",
- "abidlabs/dracula_revamped",
- "gradio/dracula_test",
- "gradio/seafoam",
- "gradio/glass",
- "gradio/monochrome",
- "gradio/soft",
- "gradio/default",
- "gradio/base",
- "abidlabs/pakistan",
- "dawood/microsoft_windows",
- "ysharma/steampunk",
- "ysharma/huggingface",
- "abidlabs/Lime",
- "freddyaboulton/this-theme-does-not-exist-2",
- "aliabid94/new-theme",
- "aliabid94/test2",
- "aliabid94/test3",
- "aliabid94/test4",
- "abidlabs/banana",
- "freddyaboulton/test-blue",
- "gstaff/whiteboard",
- "ysharma/llamas",
- "abidlabs/font-test",
- "YenLai/Superhuman",
- "bethecloud/storj_theme",
- "sudeepshouche/minimalist",
- "knotdgaf/gradiotest",
- "ParityError/Anime",
- "Ajaxon6255/Emerald_Isle",
- "ParityError/LimeFace",
- "finlaymacklon/smooth_slate",
- "finlaymacklon/boxy_violet",
- "derekzen/stardust",
- "EveryPizza/Cartoony-Gradio-Theme",
- "Ifeanyi/Cyanister",
- "Tshackelton/IBMPlex-DenseReadable",
- "snehilsanyal/scikit-learn",
- "Himhimhim/xkcd",
- "nota-ai/theme",
- "rawrsor1/Everforest",
- "rottenlittlecreature/Moon_Goblin",
- "abidlabs/test-yellow",
- "abidlabs/test-yellow3",
- "idspicQstitho/dracula_revamped",
- "kfahn/AnimalPose",
- "HaleyCH/HaleyCH_Theme",
- "simulKitke/dracula_test",
- "braintacles/CrimsonNight",
- "wentaohe/whiteboardv2",
- "reilnuud/polite",
- "remilia/Ghostly",
- "Franklisi/darkmode",
- "coding-alt/soft",
- "xiaobaiyuan/theme_land",
- "step-3-profit/Midnight-Deep",
- "xiaobaiyuan/theme_demo",
- "Taithrah/Minimal",
- "Insuz/SimpleIndigo",
- "zkunn/Alipay_Gradio_theme",
- "Insuz/Mocha",
- "xiaobaiyuan/theme_brief",
- "Ama434/434-base-Barlow",
- "Ama434/def_barlow",
- "Ama434/neutral-barlow",
- "dawood/dracula_test",
- "nuttea/Softblue",
- "BlueDancer/Alien_Diffusion",
- "naughtondale/monochrome",
- "Dagfinn1962/standard",
- "default"
- ],
- "mdx_model": [
- "Main_340",
- "Main_390",
- "Main_406",
- "Main_427",
- "Main_438",
- "Inst_full_292",
- "Inst_HQ_1",
- "Inst_HQ_2",
- "Inst_HQ_3",
- "Inst_HQ_4",
- "Inst_HQ_5",
- "Kim_Vocal_1",
- "Kim_Vocal_2",
- "Kim_Inst",
- "Inst_187_beta",
- "Inst_82_beta",
- "Inst_90_beta",
- "Voc_FT",
- "Crowd_HQ",
- "Inst_1",
- "Inst_2",
- "Inst_3",
- "MDXNET_1_9703",
- "MDXNET_2_9682",
- "MDXNET_3_9662",
- "Inst_Main",
- "MDXNET_Main",
- "MDXNET_9482"
- ],
- "demucs_model": [
- "HT-Normal",
- "HT-Tuned",
- "HD_MMI",
- "HT_6S"
- ],
- "edge_tts": [
- "af-ZA-AdriNeural",
- "af-ZA-WillemNeural",
- "sq-AL-AnilaNeural",
- "sq-AL-IlirNeural",
- "am-ET-AmehaNeural",
- "am-ET-MekdesNeural",
- "ar-DZ-AminaNeural",
- "ar-DZ-IsmaelNeural",
- "ar-BH-AliNeural",
- "ar-BH-LailaNeural",
- "ar-EG-SalmaNeural",
- "ar-EG-ShakirNeural",
- "ar-IQ-BasselNeural",
- "ar-IQ-RanaNeural",
- "ar-JO-SanaNeural",
- "ar-JO-TaimNeural",
- "ar-KW-FahedNeural",
- "ar-KW-NouraNeural",
- "ar-LB-LaylaNeural",
- "ar-LB-RamiNeural",
- "ar-LY-ImanNeural",
- "ar-LY-OmarNeural",
- "ar-MA-JamalNeural",
- "ar-MA-MounaNeural",
- "ar-OM-AbdullahNeural",
- "ar-OM-AyshaNeural",
- "ar-QA-AmalNeural",
- "ar-QA-MoazNeural",
- "ar-SA-HamedNeural",
- "ar-SA-ZariyahNeural",
- "ar-SY-AmanyNeural",
- "ar-SY-LaithNeural",
- "ar-TN-HediNeural",
- "ar-TN-ReemNeural",
- "ar-AE-FatimaNeural",
- "ar-AE-HamdanNeural",
- "ar-YE-MaryamNeural",
- "ar-YE-SalehNeural",
- "az-AZ-BabekNeural",
- "az-AZ-BanuNeural",
- "bn-BD-NabanitaNeural",
- "bn-BD-PradeepNeural",
- "bn-IN-BashkarNeural",
- "bn-IN-TanishaaNeural",
- "bs-BA-GoranNeural",
- "bs-BA-VesnaNeural",
- "bg-BG-BorislavNeural",
- "bg-BG-KalinaNeural",
- "my-MM-NilarNeural",
- "my-MM-ThihaNeural",
- "ca-ES-EnricNeural",
- "ca-ES-JoanaNeural",
- "zh-HK-HiuGaaiNeural",
- "zh-HK-HiuMaanNeural",
- "zh-HK-WanLungNeural",
- "zh-CN-XiaoxiaoNeural",
- "zh-CN-XiaoyiNeural",
- "zh-CN-YunjianNeural",
- "zh-CN-YunxiNeural",
- "zh-CN-YunxiaNeural",
- "zh-CN-YunyangNeural",
- "zh-CN-liaoning-XiaobeiNeural",
- "zh-TW-HsiaoChenNeural",
- "zh-TW-YunJheNeural",
- "zh-TW-HsiaoYuNeural",
- "zh-CN-shaanxi-XiaoniNeural",
- "hr-HR-GabrijelaNeural",
- "hr-HR-SreckoNeural",
- "cs-CZ-AntoninNeural",
- "cs-CZ-VlastaNeural",
- "da-DK-ChristelNeural",
- "da-DK-JeppeNeural",
- "nl-BE-ArnaudNeural",
- "nl-BE-DenaNeural",
- "nl-NL-ColetteNeural",
- "nl-NL-FennaNeural",
- "nl-NL-MaartenNeural",
- "en-AU-NatashaNeural",
- "en-AU-WilliamNeural",
- "en-CA-ClaraNeural",
- "en-CA-LiamNeural",
- "en-HK-SamNeural",
- "en-HK-YanNeural",
- "en-IN-NeerjaExpressiveNeural",
- "en-IN-NeerjaNeural",
- "en-IN-PrabhatNeural",
- "en-IE-ConnorNeural",
- "en-IE-EmilyNeural",
- "en-KE-AsiliaNeural",
- "en-KE-ChilembaNeural",
- "en-NZ-MitchellNeural",
- "en-NZ-MollyNeural",
- "en-NG-AbeoNeural",
- "en-NG-EzinneNeural",
- "en-PH-JamesNeural",
- "en-PH-RosaNeural",
- "en-SG-LunaNeural",
- "en-SG-WayneNeural",
- "en-ZA-LeahNeural",
- "en-ZA-LukeNeural",
- "en-TZ-ElimuNeural",
- "en-TZ-ImaniNeural",
- "en-GB-LibbyNeural",
- "en-GB-MaisieNeural",
- "en-GB-RyanNeural",
- "en-GB-SoniaNeural",
- "en-GB-ThomasNeural",
- "en-US-AvaMultilingualNeural",
- "en-US-AndrewMultilingualNeural",
- "en-US-EmmaMultilingualNeural",
- "en-US-BrianMultilingualNeural",
- "en-US-AvaNeural",
- "en-US-AndrewNeural",
- "en-US-EmmaNeural",
- "en-US-BrianNeural",
- "en-US-AnaNeural",
- "en-US-AriaNeural",
- "en-US-ChristopherNeural",
- "en-US-EricNeural",
- "en-US-GuyNeural",
- "en-US-JennyNeural",
- "en-US-MichelleNeural",
- "en-US-RogerNeural",
- "en-US-SteffanNeural",
- "et-EE-AnuNeural",
- "et-EE-KertNeural",
- "fil-PH-AngeloNeural",
- "fil-PH-BlessicaNeural",
- "fi-FI-HarriNeural",
- "fi-FI-NooraNeural",
- "fr-BE-CharlineNeural",
- "fr-BE-GerardNeural",
- "fr-CA-ThierryNeural",
- "fr-CA-AntoineNeural",
- "fr-CA-JeanNeural",
- "fr-CA-SylvieNeural",
- "fr-FR-VivienneMultilingualNeural",
- "fr-FR-RemyMultilingualNeural",
- "fr-FR-DeniseNeural",
- "fr-FR-EloiseNeural",
- "fr-FR-HenriNeural",
- "fr-CH-ArianeNeural",
- "fr-CH-FabriceNeural",
- "gl-ES-RoiNeural",
- "gl-ES-SabelaNeural",
- "ka-GE-EkaNeural",
- "ka-GE-GiorgiNeural",
- "de-AT-IngridNeural",
- "de-AT-JonasNeural",
- "de-DE-SeraphinaMultilingualNeural",
- "de-DE-FlorianMultilingualNeural",
- "de-DE-AmalaNeural",
- "de-DE-ConradNeural",
- "de-DE-KatjaNeural",
- "de-DE-KillianNeural",
- "de-CH-JanNeural",
- "de-CH-LeniNeural",
- "el-GR-AthinaNeural",
- "el-GR-NestorasNeural",
- "gu-IN-DhwaniNeural",
- "gu-IN-NiranjanNeural",
- "he-IL-AvriNeural",
- "he-IL-HilaNeural",
- "hi-IN-MadhurNeural",
- "hi-IN-SwaraNeural",
- "hu-HU-NoemiNeural",
- "hu-HU-TamasNeural",
- "is-IS-GudrunNeural",
- "is-IS-GunnarNeural",
- "id-ID-ArdiNeural",
- "id-ID-GadisNeural",
- "ga-IE-ColmNeural",
- "ga-IE-OrlaNeural",
- "it-IT-GiuseppeNeural",
- "it-IT-DiegoNeural",
- "it-IT-ElsaNeural",
- "it-IT-IsabellaNeural",
- "ja-JP-KeitaNeural",
- "ja-JP-NanamiNeural",
- "jv-ID-DimasNeural",
- "jv-ID-SitiNeural",
- "kn-IN-GaganNeural",
- "kn-IN-SapnaNeural",
- "kk-KZ-AigulNeural",
- "kk-KZ-DauletNeural",
- "km-KH-PisethNeural",
- "km-KH-SreymomNeural",
- "ko-KR-HyunsuNeural",
- "ko-KR-InJoonNeural",
- "ko-KR-SunHiNeural",
- "lo-LA-ChanthavongNeural",
- "lo-LA-KeomanyNeural",
- "lv-LV-EveritaNeural",
- "lv-LV-NilsNeural",
- "lt-LT-LeonasNeural",
- "lt-LT-OnaNeural",
- "mk-MK-AleksandarNeural",
- "mk-MK-MarijaNeural",
- "ms-MY-OsmanNeural",
- "ms-MY-YasminNeural",
- "ml-IN-MidhunNeural",
- "ml-IN-SobhanaNeural",
- "mt-MT-GraceNeural",
- "mt-MT-JosephNeural",
- "mr-IN-AarohiNeural",
- "mr-IN-ManoharNeural",
- "mn-MN-BataaNeural",
- "mn-MN-YesuiNeural",
- "ne-NP-HemkalaNeural",
- "ne-NP-SagarNeural",
- "nb-NO-FinnNeural",
- "nb-NO-PernilleNeural",
- "ps-AF-GulNawazNeural",
- "ps-AF-LatifaNeural",
- "fa-IR-DilaraNeural",
- "fa-IR-FaridNeural",
- "pl-PL-MarekNeural",
- "pl-PL-ZofiaNeural",
- "pt-BR-ThalitaNeural",
- "pt-BR-AntonioNeural",
- "pt-BR-FranciscaNeural",
- "pt-PT-DuarteNeural",
- "pt-PT-RaquelNeural",
- "ro-RO-AlinaNeural",
- "ro-RO-EmilNeural",
- "ru-RU-DmitryNeural",
- "ru-RU-SvetlanaNeural",
- "sr-RS-NicholasNeural",
- "sr-RS-SophieNeural",
- "si-LK-SameeraNeural",
- "si-LK-ThiliniNeural",
- "sk-SK-LukasNeural",
- "sk-SK-ViktoriaNeural",
- "sl-SI-PetraNeural",
- "sl-SI-RokNeural",
- "so-SO-MuuseNeural",
- "so-SO-UbaxNeural",
- "es-AR-ElenaNeural",
- "es-AR-TomasNeural",
- "es-BO-MarceloNeural",
- "es-BO-SofiaNeural",
- "es-CL-CatalinaNeural",
- "es-CL-LorenzoNeural",
- "es-ES-XimenaNeural",
- "es-CO-GonzaloNeural",
- "es-CO-SalomeNeural",
- "es-CR-JuanNeural",
- "es-CR-MariaNeural",
- "es-CU-BelkysNeural",
- "es-CU-ManuelNeural",
- "es-DO-EmilioNeural",
- "es-DO-RamonaNeural",
- "es-EC-AndreaNeural",
- "es-EC-LuisNeural",
- "es-SV-LorenaNeural",
- "es-SV-RodrigoNeural",
- "es-GQ-JavierNeural",
- "es-GQ-TeresaNeural",
- "es-GT-AndresNeural",
- "es-GT-MartaNeural",
- "es-HN-CarlosNeural",
- "es-HN-KarlaNeural",
- "es-MX-DaliaNeural",
- "es-MX-JorgeNeural",
- "es-NI-FedericoNeural",
- "es-NI-YolandaNeural",
- "es-PA-MargaritaNeural",
- "es-PA-RobertoNeural",
- "es-PY-MarioNeural",
- "es-PY-TaniaNeural",
- "es-PE-AlexNeural",
- "es-PE-CamilaNeural",
- "es-PR-KarinaNeural",
- "es-PR-VictorNeural",
- "es-ES-AlvaroNeural",
- "es-ES-ElviraNeural",
- "es-US-AlonsoNeural",
- "es-US-PalomaNeural",
- "es-UY-MateoNeural",
- "es-UY-ValentinaNeural",
- "es-VE-PaolaNeural",
- "es-VE-SebastianNeural",
- "su-ID-JajangNeural",
- "su-ID-TutiNeural",
- "sw-KE-RafikiNeural",
- "sw-KE-ZuriNeural",
- "sw-TZ-DaudiNeural",
- "sw-TZ-RehemaNeural",
- "sv-SE-MattiasNeural",
- "sv-SE-SofieNeural",
- "ta-IN-PallaviNeural",
- "ta-IN-ValluvarNeural",
- "ta-MY-KaniNeural",
- "ta-MY-SuryaNeural",
- "ta-SG-AnbuNeural",
- "ta-SG-VenbaNeural",
- "ta-LK-KumarNeural",
- "ta-LK-SaranyaNeural",
- "te-IN-MohanNeural",
- "te-IN-ShrutiNeural",
- "th-TH-NiwatNeural",
- "th-TH-PremwadeeNeural",
- "tr-TR-AhmetNeural",
- "tr-TR-EmelNeural",
- "uk-UA-OstapNeural",
- "uk-UA-PolinaNeural",
- "ur-IN-GulNeural",
- "ur-IN-SalmanNeural",
- "ur-PK-AsadNeural",
- "ur-PK-UzmaNeural",
- "uz-UZ-MadinaNeural",
- "uz-UZ-SardorNeural",
- "vi-VN-HoaiMyNeural",
- "vi-VN-NamMinhNeural",
- "cy-GB-AledNeural",
- "cy-GB-NiaNeural",
- "zu-ZA-ThandoNeural",
- "zu-ZA-ThembaNeural"
- ],
- "google_tts_voice": [
- "af",
- "am",
- "ar",
- "bg",
- "bn",
- "bs",
- "ca",
- "cs",
- "cy",
- "da",
- "de",
- "el",
- "en",
- "es",
- "et",
- "eu",
- "fi",
- "fr",
- "fr-CA",
- "gl",
- "gu",
- "ha",
- "hi",
- "hr",
- "hu",
- "id",
- "is",
- "it",
- "iw",
- "ja",
- "jw",
- "km",
- "kn",
- "ko",
- "la",
- "lt",
- "lv",
- "ml",
- "mr",
- "ms",
- "my",
- "ne",
- "nl",
- "no",
- "pa",
- "pl",
- "pt",
- "pt-PT",
- "ro",
- "ru",
- "si",
- "sk",
- "sq",
- "sr",
- "su",
- "sv",
- "sw",
- "ta",
- "te",
- "th",
- "tl",
- "tr",
- "uk",
- "ur",
- "vi",
- "yue",
- "zh-CN",
- "zh-TW",
- "zh"
- ],
- "fp16": false,
- "separator_tab": true,
- "convert_tab": true,
- "convert_with_whisper": true,
- "tts_tab": true,
- "audioldm2": true,
- "effects_tab": true,
- "create_dataset_tab": true,
- "training_tab": true,
- "fushion_tab": true,
- "read_tab": true,
- "onnx_tab": true,
- "downloads_tab": true,
- "f0_extractor_tab": true,
- "settings_tab": true,
- "report_bug_tab": true,
- "font": "",
- "app_port": 7860,
- "tensorboard_port": 6870,
- "num_of_restart": 5,
- "server_name": "0.0.0.0",
- "app_show_error": true
-}
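The boolean *_tab keys above gate tab visibility in the UI code, always read with a default of True so a missing key leaves the tab visible:

    # The consumption pattern used throughout the tab definitions above
    import json
    import os

    with open(os.path.join("main", "configs", "config.json"), "r") as f:
        configs = json.load(f)

    show_effects = configs.get("effects_tab", True)   # missing keys default to a visible tab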
diff --git a/main/configs/config.py b/main/configs/config.py
deleted file mode 100644
index 37475406a550b6af49401657a7dc4cba6b09bf9f..0000000000000000000000000000000000000000
--- a/main/configs/config.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import json
-import torch
-
-
-version_config_paths = [os.path.join(version, size) for version in ["v1", "v2"] for size in ["32000.json", "40000.json", "48000.json"]]
-
-def singleton(cls):
- instances = {}
-
- def get_instance(*args, **kwargs):
- if cls not in instances: instances[cls] = cls(*args, **kwargs)
- return instances[cls]
-
- return get_instance
-
-@singleton
-class Config:
- def __init__(self):
- self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
- self.configs = json.load(open(os.path.join("main", "configs", "config.json"), "r"))
- self.translations = self.multi_language()
- self.json_config = self.load_config_json()
- self.gpu_mem = None
- self.preprocess_per = 3.7  # must match the attribute name that is_fp16 and device_config overwrite
- self.is_half = self.is_fp16()
- self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
-
- def multi_language(self):
- try:
- lang = self.configs.get("language", "en-US")
- if len([l for l in os.listdir(os.path.join("assets", "languages")) if l.endswith(".json")]) < 1: raise FileNotFoundError("No language packs found")
-
- if not lang: lang = "en-US"
- if lang not in self.configs["support_language"]: raise ValueError("Language not supported....")
-
- lang_path = os.path.join("assets", "languages", f"{lang}.json")
- if not os.path.exists(lang_path): lang_path = os.path.join("assets", "languages", "en-US.json")
-
- with open(lang_path, encoding="utf-8") as f:
- translations = json.load(f)
- except json.JSONDecodeError:
-     # self.translations does not exist yet while __init__ is still building it,
-     # so fall back to a plain message and an empty translation table
-     print(f"Malformed language file: {lang_path}")
-     translations = {}
-
- return translations
-
- def is_fp16(self):
- fp16 = self.configs.get("fp16", False)
-
- if self.device in ["cpu", "mps"] and fp16:
- self.configs["fp16"] = False
- fp16 = False
-
- with open(os.path.join("main", "configs", "config.json"), "w") as f:
- json.dump(self.configs, f, indent=4)
-
- if not fp16: self.preprocess_per = 3.0
- return fp16
-
- def load_config_json(self):
- configs = {}
-
- for config_file in version_config_paths:
- try:
- with open(os.path.join("main", "configs", config_file), "r") as f:
- configs[config_file] = json.load(f)
- except json.JSONDecodeError:
- print(self.translations["empty_json"].format(file=config_file))
-
- return configs
-
- def device_config(self):
- if self.device.startswith("cuda"): self.set_cuda_config()
- elif self.has_mps(): self.device = "mps"
- else: self.device = "cpu"
-
- if self.gpu_mem is not None and self.gpu_mem <= 4:
- self.preprocess_per = 3.0
- return 1, 5, 30, 32
-
- return (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
-
- def set_cuda_config(self):
- i_device = int(self.device.split(":")[-1])
- self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (1024**3)
-
- def has_mps(self):
- return torch.backends.mps.is_available()
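Because of the @singleton decorator, every Config() call in the codebase returns the same cached instance, so config.json and the language pack are parsed once per process:

    from main.configs.config import Config

    a = Config()
    b = Config()
    assert a is b   # the decorator stores the first instance and keeps returning it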
diff --git a/main/configs/decrypt.bin b/main/configs/decrypt.bin
deleted file mode 100644
index 85da68557da0749d6532388eab083cdfea3de416..0000000000000000000000000000000000000000
--- a/main/configs/decrypt.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:330268cbf6b9317a76510b533e1640ef48ed074a07c013e5b1abc4d48cfd9dce
-size 32
diff --git a/main/configs/v1/32000.json b/main/configs/v1/32000.json
deleted file mode 100644
index 9a9230505fcae1ea713893e1bd3006bebf04e014..0000000000000000000000000000000000000000
--- a/main/configs/v1/32000.json
+++ /dev/null
@@ -1,46 +0,0 @@
-{
- "train": {
- "log_interval": 200,
- "seed": 1234,
- "epochs": 20000,
- "learning_rate": 0.0001,
- "betas": [0.8, 0.99],
- "eps": 1e-09,
- "batch_size": 4,
- "lr_decay": 0.999875,
- "segment_size": 12800,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "max_wav_value": 32768.0,
- "sample_rate": 32000,
- "filter_length": 1024,
- "hop_length": 320,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "text_enc_hidden_dim": 256,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0,
- "resblock": "1",
- "resblock_kernel_sizes": [3, 7, 11],
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
- "upsample_rates": [10, 4, 2, 2, 2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16, 16, 4, 4, 4],
- "use_spectral_norm": false,
- "gin_channels": 256,
- "spk_embed_dim": 109
- }
-}
\ No newline at end of file
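One invariant ties all six presets in this diff together: the generator's upsample_rates multiply out to exactly hop_length (10*4*2*2*2 = 320 here), so one hop of acoustic features decodes to one hop of waveform. A quick check:

    import math

    assert math.prod([10, 4, 2, 2, 2]) == 320   # v1/32000 (this file)
    assert math.prod([10, 10, 2, 2]) == 400     # v1/40000 and v2/40000
    assert math.prod([12, 10, 2, 2]) == 480     # v2/48000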
diff --git a/main/configs/v1/40000.json b/main/configs/v1/40000.json
deleted file mode 100644
index 2dae4ec3ede25702ac4b5b03cfad61862a818164..0000000000000000000000000000000000000000
--- a/main/configs/v1/40000.json
+++ /dev/null
@@ -1,46 +0,0 @@
-{
- "train": {
- "log_interval": 200,
- "seed": 1234,
- "epochs": 20000,
- "learning_rate": 0.0001,
- "betas": [0.8, 0.99],
- "eps": 1e-09,
- "batch_size": 4,
- "lr_decay": 0.999875,
- "segment_size": 12800,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "max_wav_value": 32768.0,
- "sample_rate": 40000,
- "filter_length": 2048,
- "hop_length": 400,
- "win_length": 2048,
- "n_mel_channels": 125,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "text_enc_hidden_dim": 256,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0,
- "resblock": "1",
- "resblock_kernel_sizes": [3, 7, 11],
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
- "upsample_rates": [10, 10, 2, 2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16, 16, 4, 4],
- "use_spectral_norm": false,
- "gin_channels": 256,
- "spk_embed_dim": 109
- }
-}
\ No newline at end of file
diff --git a/main/configs/v1/48000.json b/main/configs/v1/48000.json
deleted file mode 100644
index 7ce48e4107bf9237bb1ebf6276fd5906891566f0..0000000000000000000000000000000000000000
--- a/main/configs/v1/48000.json
+++ /dev/null
@@ -1,46 +0,0 @@
-{
- "train": {
- "log_interval": 200,
- "seed": 1234,
- "epochs": 20000,
- "learning_rate": 0.0001,
- "betas": [0.8, 0.99],
- "eps": 1e-09,
- "batch_size": 4,
- "lr_decay": 0.999875,
- "segment_size": 11520,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "max_wav_value": 32768.0,
- "sample_rate": 48000,
- "filter_length": 2048,
- "hop_length": 480,
- "win_length": 2048,
- "n_mel_channels": 128,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "text_enc_hidden_dim": 256,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0,
- "resblock": "1",
- "resblock_kernel_sizes": [3, 7, 11],
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
- "upsample_rates": [10, 6, 2, 2, 2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16, 16, 4, 4, 4],
- "use_spectral_norm": false,
- "gin_channels": 256,
- "spk_embed_dim": 109
- }
-}
\ No newline at end of file
diff --git a/main/configs/v2/32000.json b/main/configs/v2/32000.json
deleted file mode 100644
index bad38e5b2df278d08acd7be7f4f3c42fdda249e5..0000000000000000000000000000000000000000
--- a/main/configs/v2/32000.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
- "train": {
- "log_interval": 200,
- "seed": 1234,
- "learning_rate": 0.0001,
- "betas": [0.8, 0.99],
- "eps": 1e-09,
- "lr_decay": 0.999875,
- "segment_size": 12800,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "max_wav_value": 32768.0,
- "sample_rate": 32000,
- "filter_length": 1024,
- "hop_length": 320,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "text_enc_hidden_dim": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0,
- "resblock": "1",
- "resblock_kernel_sizes": [3, 7, 11],
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
- "upsample_rates": [10, 8, 2, 2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [20, 16, 4, 4],
- "use_spectral_norm": false,
- "gin_channels": 256,
- "spk_embed_dim": 109
- }
-}
\ No newline at end of file
diff --git a/main/configs/v2/40000.json b/main/configs/v2/40000.json
deleted file mode 100644
index b2c54f2d255f5ffd3756ee978dd15700b50b6f25..0000000000000000000000000000000000000000
--- a/main/configs/v2/40000.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
- "train": {
- "log_interval": 200,
- "seed": 1234,
- "learning_rate": 0.0001,
- "betas": [0.8, 0.99],
- "eps": 1e-09,
- "lr_decay": 0.999875,
- "segment_size": 12800,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "max_wav_value": 32768.0,
- "sample_rate": 40000,
- "filter_length": 2048,
- "hop_length": 400,
- "win_length": 2048,
- "n_mel_channels": 125,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "text_enc_hidden_dim": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0,
- "resblock": "1",
- "resblock_kernel_sizes": [3, 7, 11],
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
- "upsample_rates": [10, 10, 2, 2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16, 16, 4, 4],
- "use_spectral_norm": false,
- "gin_channels": 256,
- "spk_embed_dim": 109
- }
-}
\ No newline at end of file
diff --git a/main/configs/v2/48000.json b/main/configs/v2/48000.json
deleted file mode 100644
index 4c76c9c7f804d36504d4afbfdfd22fd0faee355b..0000000000000000000000000000000000000000
--- a/main/configs/v2/48000.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
- "train": {
- "log_interval": 200,
- "seed": 1234,
- "learning_rate": 0.0001,
- "betas": [0.8, 0.99],
- "eps": 1e-09,
- "lr_decay": 0.999875,
- "segment_size": 17280,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "max_wav_value": 32768.0,
- "sample_rate": 48000,
- "filter_length": 2048,
- "hop_length": 480,
- "win_length": 2048,
- "n_mel_channels": 128,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "text_enc_hidden_dim": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0,
- "resblock": "1",
- "resblock_kernel_sizes": [3, 7, 11],
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
- "upsample_rates": [12, 10, 2, 2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [24, 20, 4, 4],
- "use_spectral_norm": false,
- "gin_channels": 256,
- "spk_embed_dim": 109
- }
-}
\ No newline at end of file
diff --git a/main/inference/audio_effects.py b/main/inference/audio_effects.py
deleted file mode 100644
index f7c30f00bc94996cf72704fce2c53a9100bb2708..0000000000000000000000000000000000000000
--- a/main/inference/audio_effects.py
+++ /dev/null
@@ -1,180 +0,0 @@
-import os
-import sys
-import librosa
-import argparse
-
-import numpy as np
-import soundfile as sf
-
- # distutils (and strtobool with it) was removed in Python 3.12, so define the helper locally
- def strtobool(val):
-     val = val.lower()
-     if val in ("y", "yes", "t", "true", "on", "1"): return 1
-     if val in ("n", "no", "f", "false", "off", "0"): return 0
-     raise ValueError(f"invalid truth value {val!r}")
-from scipy.signal import butter, filtfilt
-from pedalboard import Pedalboard, Chorus, Distortion, Reverb, PitchShift, Delay, Limiter, Gain, Bitcrush, Clipping, Compressor, Phaser, HighpassFilter
-
-sys.path.append(os.getcwd())
-
-from main.configs.config import Config
-from main.library.utils import pydub_convert, pydub_load
-
-translations = Config().translations
-
-def parse_arguments():
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_path", type=str, required=True)
- parser.add_argument("--output_path", type=str, default="./audios/apply_effects.wav")
- parser.add_argument("--export_format", type=str, default="wav")
- parser.add_argument("--resample", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--resample_sr", type=int, default=0)
- parser.add_argument("--chorus", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--chorus_depth", type=float, default=0.5)
- parser.add_argument("--chorus_rate", type=float, default=1.5)
- parser.add_argument("--chorus_mix", type=float, default=0.5)
- parser.add_argument("--chorus_delay", type=int, default=10)
- parser.add_argument("--chorus_feedback", type=float, default=0)
- parser.add_argument("--distortion", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--drive_db", type=int, default=20)
- parser.add_argument("--reverb", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--reverb_room_size", type=float, default=0.5)
- parser.add_argument("--reverb_damping", type=float, default=0.5)
- parser.add_argument("--reverb_wet_level", type=float, default=0.33)
- parser.add_argument("--reverb_dry_level", type=float, default=0.67)
- parser.add_argument("--reverb_width", type=float, default=1)
- parser.add_argument("--reverb_freeze_mode", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--pitchshift", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--pitch_shift", type=int, default=0)
- parser.add_argument("--delay", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--delay_seconds", type=float, default=0.5)
- parser.add_argument("--delay_feedback", type=float, default=0.5)
- parser.add_argument("--delay_mix", type=float, default=0.5)
- parser.add_argument("--compressor", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--compressor_threshold", type=int, default=-20)
- parser.add_argument("--compressor_ratio", type=float, default=4)
- parser.add_argument("--compressor_attack_ms", type=float, default=10)
- parser.add_argument("--compressor_release_ms", type=int, default=200)
- parser.add_argument("--limiter", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--limiter_threshold", type=int, default=0)
- parser.add_argument("--limiter_release", type=int, default=100)
- parser.add_argument("--gain", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--gain_db", type=int, default=0)
- parser.add_argument("--bitcrush", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--bitcrush_bit_depth", type=int, default=16)
- parser.add_argument("--clipping", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--clipping_threshold", type=int, default=-10)
- parser.add_argument("--phaser", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--phaser_rate_hz", type=float, default=0.5)
- parser.add_argument("--phaser_depth", type=float, default=0.5)
- parser.add_argument("--phaser_centre_frequency_hz", type=int, default=1000)
- parser.add_argument("--phaser_feedback", type=float, default=0)
- parser.add_argument("--phaser_mix", type=float, default=0.5)
- parser.add_argument("--treble_bass_boost", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--bass_boost_db", type=int, default=0)
- parser.add_argument("--bass_boost_frequency", type=int, default=100)
- parser.add_argument("--treble_boost_db", type=int, default=0)
- parser.add_argument("--treble_boost_frequency", type=int, default=3000)
- parser.add_argument("--fade_in_out", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--fade_in_duration", type=float, default=2000)
- parser.add_argument("--fade_out_duration", type=float, default=2000)
- parser.add_argument("--audio_combination", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--audio_combination_input", type=str)
-
- return parser.parse_args()
-
-def process_audio(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, pitchshift, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input):
- def bass_boost(audio, gain_db, frequency, sample_rate):
-     if gain_db >= 1:
-         b, a = butter(4, frequency / (0.5 * sample_rate), btype='low')
-         # mix the boosted low band back into the signal; returning only the
-         # filtered band would throw away the rest of the spectrum
-         return audio + filtfilt(b, a, audio) * (10 ** (gain_db / 20) - 1)
-     return audio
-
- def treble_boost(audio, gain_db, frequency, sample_rate):
-     if gain_db >= 1:
-         b, a = butter(4, frequency / (0.5 * sample_rate), btype='high')
-         # same mix-back fix for the high band
-         return audio + filtfilt(b, a, audio) * (10 ** (gain_db / 20) - 1)
-     return audio
-
- def fade_out_effect(audio, sr, duration=3.0):
- length = int(duration * sr)
- end = audio.shape[0]
-
- if length > end: length = end
- start = end - length
-
- audio[start:end] = audio[start:end] * np.linspace(1.0, 0.0, length)
- return audio
-
- def fade_in_effect(audio, sr, duration=3.0):
- length = int(duration * sr)
- start = 0
-
- if length > audio.shape[0]: length = audio.shape[0]
- end = length
-
- audio[start:end] = audio[start:end] * np.linspace(0.0, 1.0, length)
- return audio
-
- if not input_path or not os.path.exists(input_path):
- print(translations["input_not_valid"])
- sys.exit(1)
-
- if not output_path:
- print(translations["output_not_valid"])
- sys.exit(1)
-
- if os.path.exists(output_path): os.remove(output_path)
-
- try:
- input_path = input_path.strip(' "\n')
-
- try:
- audio, sample_rate = sf.read(input_path, dtype=np.float32)
- except Exception:
- audio, sample_rate = librosa.load(input_path, sr=None)
- except Exception as e:
- raise RuntimeError(f"{translations['errors_loading_audio']}: {e}")
-
- audio = audio.flatten()
-
- try:
- board = Pedalboard([HighpassFilter()])
-
- if chorus: board.append(Chorus(depth=chorus_depth, rate_hz=chorus_rate, mix=chorus_mix, centre_delay_ms=chorus_delay, feedback=chorus_feedback))
- if distortion: board.append(Distortion(drive_db=distortion_drive))
- if reverb: board.append(Reverb(room_size=reverb_room_size, damping=reverb_damping, wet_level=reverb_wet_level, dry_level=reverb_dry_level, width=reverb_width, freeze_mode=1 if reverb_freeze_mode else 0))
- if pitchshift: board.append(PitchShift(semitones=pitch_shift))
- if delay: board.append(Delay(delay_seconds=delay_seconds, feedback=delay_feedback, mix=delay_mix))
- if compressor: board.append(Compressor(threshold_db=compressor_threshold, ratio=compressor_ratio, attack_ms=compressor_attack_ms, release_ms=compressor_release_ms))
- if limiter: board.append(Limiter(threshold_db=limiter_threshold, release_ms=limiter_release))
- if gain: board.append(Gain(gain_db=gain_db))
- if bitcrush: board.append(Bitcrush(bit_depth=bitcrush_bit_depth))
- if clipping: board.append(Clipping(threshold_db=clipping_threshold))
- if phaser: board.append(Phaser(rate_hz=phaser_rate_hz, depth=phaser_depth, centre_frequency_hz=phaser_centre_frequency_hz, feedback=phaser_feedback, mix=phaser_mix))
-
- processed_audio = board(audio, sample_rate)
-
- if treble_bass_boost:
- processed_audio = bass_boost(processed_audio, bass_boost_db, bass_boost_frequency, sample_rate)
- processed_audio = treble_boost(processed_audio, treble_boost_db, treble_boost_frequency, sample_rate)
-
- if fade_in_out:
- processed_audio = fade_in_effect(processed_audio, sample_rate, fade_in_duration)
- processed_audio = fade_out_effect(processed_audio, sample_rate, fade_out_duration)
-
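-        # Snap the requested rate to the nearest commonly supported sample rate before resampling.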
- if resample_sr != sample_rate and resample_sr > 0 and resample:
- target_sr = min([8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000], key=lambda x: abs(x - resample_sr))
- processed_audio = librosa.resample(processed_audio, orig_sr=sample_rate, target_sr=target_sr, res_type="soxr_vhq")
- sample_rate = target_sr
-
-        # Swap the extension explicitly; str.replace("wav", ...) would also rewrite "wav" elsewhere in the path.
-        output_path = os.path.splitext(output_path)[0] + f".{export_format}"
-        sf.write(output_path, processed_audio, sample_rate, format=export_format)
-
-        if audio_combination: pydub_convert(pydub_load(audio_combination_input)).overlay(pydub_convert(pydub_load(output_path))).export(output_path, format=export_format)
- except Exception as e:
- raise RuntimeError(translations["apply_error"].format(e=e))
-
- return output_path
-
-def main():
- args = parse_arguments()
- process_audio(input_path=args.input_path, output_path=args.output_path, resample=args.resample, resample_sr=args.resample_sr, chorus_depth=args.chorus_depth, chorus_rate=args.chorus_rate, chorus_mix=args.chorus_mix, chorus_delay=args.chorus_delay, chorus_feedback=args.chorus_feedback, distortion_drive=args.drive_db, reverb_room_size=args.reverb_room_size, reverb_damping=args.reverb_damping, reverb_wet_level=args.reverb_wet_level, reverb_dry_level=args.reverb_dry_level, reverb_width=args.reverb_width, reverb_freeze_mode=args.reverb_freeze_mode, pitch_shift=args.pitch_shift, delay_seconds=args.delay_seconds, delay_feedback=args.delay_feedback, delay_mix=args.delay_mix, compressor_threshold=args.compressor_threshold, compressor_ratio=args.compressor_ratio, compressor_attack_ms=args.compressor_attack_ms, compressor_release_ms=args.compressor_release_ms, limiter_threshold=args.limiter_threshold, limiter_release=args.limiter_release, gain_db=args.gain_db, bitcrush_bit_depth=args.bitcrush_bit_depth, clipping_threshold=args.clipping_threshold, phaser_rate_hz=args.phaser_rate_hz, phaser_depth=args.phaser_depth, phaser_centre_frequency_hz=args.phaser_centre_frequency_hz, phaser_feedback=args.phaser_feedback, phaser_mix=args.phaser_mix, bass_boost_db=args.bass_boost_db, bass_boost_frequency=args.bass_boost_frequency, treble_boost_db=args.treble_boost_db, treble_boost_frequency=args.treble_boost_frequency, fade_in_duration=args.fade_in_duration, fade_out_duration=args.fade_out_duration, export_format=args.export_format, chorus=args.chorus, distortion=args.distortion, reverb=args.reverb, pitchshift=args.pitchshift, delay=args.delay, compressor=args.compressor, limiter=args.limiter, gain=args.gain, bitcrush=args.bitcrush, clipping=args.clipping, phaser=args.phaser, treble_bass_boost=args.treble_bass_boost, fade_in_out=args.fade_in_out, audio_combination=args.audio_combination, audio_combination_input=args.audio_combination_input)
-
-if __name__ == "__main__": main()
\ No newline at end of file
diff --git a/main/inference/audioldm2.py b/main/inference/audioldm2.py
deleted file mode 100644
index 0e5417ac9c8ae2654e24f740f4d10bfdb56a1a6d..0000000000000000000000000000000000000000
--- a/main/inference/audioldm2.py
+++ /dev/null
@@ -1,210 +0,0 @@
-import os
-import sys
-import time
-import tqdm
-import torch
-import logging
-import librosa
-import argparse
-import scipy.signal
-import logging.handlers
-
-import numpy as np
-import soundfile as sf
-
-from torch import inference_mode
-from distutils.util import strtobool
-
-sys.path.append(os.getcwd())
-
-from main.configs.config import Config
-from main.library.audioldm2.utils import load_audio
-from main.library.audioldm2.models import load_model
-
-config = Config()
-translations = config.translations
-logger = logging.getLogger(__name__)
-logger.propagate = False
-
-for l in ["torch", "httpx", "httpcore", "diffusers", "transformers"]:
- logging.getLogger(l).setLevel(logging.ERROR)
-
-# Reset any stale handlers, then attach fresh console + rotating-file handlers.
-if logger.hasHandlers(): logger.handlers.clear()
-
-console_handler = logging.StreamHandler()
-console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-console_handler.setFormatter(console_formatter)
-console_handler.setLevel(logging.INFO)
-file_handler = logging.handlers.RotatingFileHandler(os.path.join("assets", "logs", "audioldm2.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
-file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-file_handler.setFormatter(file_formatter)
-file_handler.setLevel(logging.DEBUG)
-logger.addHandler(console_handler)
-logger.addHandler(file_handler)
-logger.setLevel(logging.DEBUG)
-
-def parse_arguments():
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_path", type=str, required=True)
- parser.add_argument("--output_path", type=str, default="./output.wav")
- parser.add_argument("--export_format", type=str, default="wav")
- parser.add_argument("--sample_rate", type=int, default=44100)
- parser.add_argument("--audioldm_model", type=str, default="audioldm2-music")
- parser.add_argument("--source_prompt", type=str, default="")
- parser.add_argument("--target_prompt", type=str, default="")
- parser.add_argument("--steps", type=int, default=200)
- parser.add_argument("--cfg_scale_src", type=float, default=3.5)
- parser.add_argument("--cfg_scale_tar", type=float, default=12)
- parser.add_argument("--t_start", type=int, default=45)
- parser.add_argument("--save_compute", type=lambda x: bool(strtobool(x)), default=False)
-
- return parser.parse_args()
-
-def main():
- args = parse_arguments()
- input_path, output_path, export_format, sample_rate, audioldm_model, source_prompt, target_prompt, steps, cfg_scale_src, cfg_scale_tar, t_start, save_compute = args.input_path, args.output_path, args.export_format, args.sample_rate, args.audioldm_model, args.source_prompt, args.target_prompt, args.steps, args.cfg_scale_src, args.cfg_scale_tar, args.t_start, args.save_compute
-
- log_data = {translations['audio_path']: input_path, translations['output_path']: output_path.replace('wav', export_format), translations['model_name']: audioldm_model, translations['export_format']: export_format, translations['sample_rate']: sample_rate, translations['steps']: steps, translations['source_prompt']: source_prompt, translations['target_prompt']: target_prompt, translations['cfg_scale_src']: cfg_scale_src, translations['cfg_scale_tar']: cfg_scale_tar, translations['t_start']: t_start, translations['save_compute']: save_compute}
-
- for key, value in log_data.items():
- logger.debug(f"{key}: {value}")
-
- start_time = time.time()
- logger.info(translations["start_edit"].format(input_path=input_path))
- pid_path = os.path.join("assets", "audioldm2_pid.txt")
- with open(pid_path, "w") as pid_file:
- pid_file.write(str(os.getpid()))
-
- try:
- edit(input_path, output_path, audioldm_model, source_prompt, target_prompt, steps, cfg_scale_src, cfg_scale_tar, t_start, save_compute, sample_rate, config.device, export_format=export_format)
- except Exception as e:
- logger.error(translations["error_edit"].format(e=e))
- import traceback
- logger.debug(traceback.format_exc())
-
- logger.info(translations["edit_success"].format(time=f"{(time.time() - start_time):.2f}", output_path=output_path.replace('wav', export_format)))
-
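-# Encode the source audio into the VAE latent space and run DDPM inversion conditioned on the source prompt.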
-def invert(ldm_stable, x0, prompt_src, num_diffusion_steps, cfg_scale_src, duration, save_compute):
- with inference_mode():
- w0 = ldm_stable.vae_encode(x0)
-
- _, zs, wts, extra_info = inversion_forward_process(ldm_stable, w0, etas=1, prompts=[prompt_src], cfg_scales=[cfg_scale_src], num_inference_steps=num_diffusion_steps, numerical_fix=True, duration=duration, save_compute=save_compute)
- return zs, wts, extra_info
-
-def low_pass_filter(audio, cutoff=7500, sr=16000):
- b, a = scipy.signal.butter(4, cutoff / (sr / 2), btype='low')
- return scipy.signal.filtfilt(b, a, audio)
-
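-# Reverse the recorded diffusion trajectory toward the target prompt, decode to audio, low-pass, and write a stereo file.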
-def sample(output_audio, sr, ldm_stable, zs, wts, extra_info, prompt_tar, tstart, cfg_scale_tar, duration, save_compute, export_format = "wav"):
- tstart = torch.tensor(tstart, dtype=torch.int32)
- w0, _ = inversion_reverse_process(ldm_stable, xT=wts, tstart=tstart, etas=1., prompts=[prompt_tar], neg_prompts=[""], cfg_scales=[cfg_scale_tar], zs=zs[:int(tstart)], duration=duration, extra_info=extra_info, save_compute=save_compute)
-
- with inference_mode():
- x0_dec = ldm_stable.vae_decode(w0.to(torch.float16 if config.is_half else torch.float32))
-
- if x0_dec.dim() < 4: x0_dec = x0_dec[None, :, :, :]
-
- with torch.no_grad():
- audio = ldm_stable.decode_to_mel(x0_dec.to(torch.float16 if config.is_half else torch.float32))
-
- audio = audio.float().squeeze().cpu().numpy()
- orig_sr = 16000
-
- if sr != 16000 and sr > 0:
- audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=sr, res_type="soxr_vhq")
- orig_sr = sr
-
- audio = low_pass_filter(audio, 7500, orig_sr)
-
- sf.write(output_audio, np.tile(audio, (2, 1)).T, orig_sr, format=export_format)
- return output_audio
-
-def edit(input_audio, output_audio, model_id, source_prompt = "", target_prompt = "", steps = 200, cfg_scale_src = 3.5, cfg_scale_tar = 12, t_start = 45, save_compute = True, sr = 44100, device = "cpu", export_format = "wav"):
- ldm_stable = load_model(model_id, device=device)
- ldm_stable.model.scheduler.set_timesteps(steps, device=device)
- x0, duration = load_audio(input_audio, ldm_stable.get_melspectrogram(), device=device)
- zs_tensor, wts_tensor, extra_info_list = invert(ldm_stable=ldm_stable, x0=x0, prompt_src=source_prompt, num_diffusion_steps=steps, cfg_scale_src=cfg_scale_src, duration=duration, save_compute=save_compute)
-
- return sample(output_audio, sr, ldm_stable, zs_tensor, wts_tensor, extra_info_list, prompt_tar=target_prompt, tstart=int(t_start / 100 * steps), cfg_scale_tar=cfg_scale_tar, duration=duration, save_compute=save_compute, export_format=export_format)
-
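-# Forward (inversion) pass: walk the scheduler timesteps from x0, storing the per-step noises zs and latents xts so the trajectory can be replayed during editing.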
-def inversion_forward_process(model, x0, etas = None, prompts = [""], cfg_scales = [3.5], num_inference_steps = 50, numerical_fix = False, duration = None, first_order = False, save_compute = True):
-    if len(prompts) > 1 or prompts[0] != "":
-        text_embeddings_hidden_states, text_embeddings_class_labels, text_embeddings_boolean_prompt_mask = model.encode_text(prompts)
-        uncond_embeddings_hidden_states, uncond_embeddings_class_labels, uncond_boolean_prompt_mask = model.encode_text([""], negative=True, save_compute=save_compute, cond_length=text_embeddings_class_labels.shape[1] if text_embeddings_class_labels is not None else None)
-    else:
-        uncond_embeddings_hidden_states, uncond_embeddings_class_labels, uncond_boolean_prompt_mask = model.encode_text([""], negative=True, save_compute=False)
-
- timesteps = model.model.scheduler.timesteps.to(model.device)
- variance_noise_shape = model.get_noise_shape(x0, num_inference_steps)
-
-    if isinstance(etas, (int, float)): etas = [etas] * model.model.scheduler.num_inference_steps
-
- xts = model.sample_xts_from_x0(x0, num_inference_steps=num_inference_steps)
- zs = torch.zeros(size=variance_noise_shape, device=model.device)
- extra_info = [None] * len(zs)
-
- if timesteps[0].dtype == torch.int64: t_to_idx = {int(v): k for k, v in enumerate(timesteps)}
- elif timesteps[0].dtype == torch.float32: t_to_idx = {float(v): k for k, v in enumerate(timesteps)}
-
- xt = x0
- model.setup_extra_inputs(xt, init_timestep=timesteps[0], audio_end_in_s=duration, save_compute=save_compute and prompts[0] != "")
-
- for t in tqdm.tqdm(timesteps, desc=translations["inverting"], ncols=100, unit="a"):
- idx = num_inference_steps - t_to_idx[int(t) if timesteps[0].dtype == torch.int64 else float(t)] - 1
- xt = xts[idx + 1][None]
- xt_inp = model.model.scheduler.scale_model_input(xt, t).to(torch.float16 if config.is_half else torch.float32)
-
- with torch.no_grad():
-            if save_compute and prompts[0] != "":
-                # Batch the unconditional and conditional passes into a single U-Net call.
-                comb_out, _, _ = model.unet_forward(
-                    xt_inp.expand(2, -1, -1, -1) if hasattr(model.model, 'unet') else xt_inp.expand(2, -1, -1),
-                    timestep=t,
-                    encoder_hidden_states=torch.cat([uncond_embeddings_hidden_states, text_embeddings_hidden_states], dim=0) if uncond_embeddings_hidden_states is not None else None,
-                    class_labels=torch.cat([uncond_embeddings_class_labels, text_embeddings_class_labels], dim=0) if uncond_embeddings_class_labels is not None else None,
-                    encoder_attention_mask=torch.cat([uncond_boolean_prompt_mask, text_embeddings_boolean_prompt_mask], dim=0) if uncond_boolean_prompt_mask is not None else None
-                )
-                out, cond_out = comb_out.sample.chunk(2, dim=0)
-            else:
-                out = model.unet_forward(xt_inp, timestep=t, encoder_hidden_states=uncond_embeddings_hidden_states, class_labels=uncond_embeddings_class_labels, encoder_attention_mask=uncond_boolean_prompt_mask)[0].sample
-                if len(prompts) > 1 or prompts[0] != "": cond_out = model.unet_forward(xt_inp, timestep=t, encoder_hidden_states=text_embeddings_hidden_states, class_labels=text_embeddings_class_labels, encoder_attention_mask=text_embeddings_boolean_prompt_mask)[0].sample
-
- if len(prompts) > 1 or prompts[0] != "": noise_pred = out + (cfg_scales[0] * (cond_out - out)).sum(axis=0).unsqueeze(0)
- else: noise_pred = out
-
- xtm1 = xts[idx][None]
- z, xtm1, extra = model.get_zs_from_xts(xt, xtm1, noise_pred, t, eta=etas[idx], numerical_fix=numerical_fix, first_order=first_order)
- zs[idx] = z
- xts[idx] = xtm1
- extra_info[idx] = extra
-
- if zs is not None: zs[0] = torch.zeros_like(zs[0])
- return xt, zs, xts, extra_info
-
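-# Reverse (editing) pass: replay the stored noises zs with classifier-free guidance toward the target prompt.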
-def inversion_reverse_process(model, xT, tstart, etas = 0, prompts = [""], neg_prompts = [""], cfg_scales = None, zs = None, duration = None, first_order = False, extra_info = None, save_compute = True):
- text_embeddings_hidden_states, text_embeddings_class_labels, text_embeddings_boolean_prompt_mask = model.encode_text(prompts)
-    uncond_embeddings_hidden_states, uncond_embeddings_class_labels, uncond_boolean_prompt_mask = model.encode_text(neg_prompts, negative=True, save_compute=save_compute, cond_length=text_embeddings_class_labels.shape[1] if text_embeddings_class_labels is not None else None)
- xt = xT[tstart.max()].unsqueeze(0)
-
- if etas is None: etas = 0
-    if isinstance(etas, (int, float)): etas = [etas] * model.model.scheduler.num_inference_steps
-
- assert len(etas) == model.model.scheduler.num_inference_steps
- timesteps = model.model.scheduler.timesteps.to(model.device)
-
- if timesteps[0].dtype == torch.int64: t_to_idx = {int(v): k for k, v in enumerate(timesteps[-zs.shape[0]:])}
- elif timesteps[0].dtype == torch.float32: t_to_idx = {float(v): k for k, v in enumerate(timesteps[-zs.shape[0]:])}
-
- model.setup_extra_inputs(xt, extra_info=extra_info, init_timestep=timesteps[-zs.shape[0]], audio_end_in_s=duration, save_compute=save_compute)
-
- for t in tqdm.tqdm(timesteps[-zs.shape[0]:], desc=translations["editing"], ncols=100, unit="a"):
- idx = model.model.scheduler.num_inference_steps - t_to_idx[int(t) if timesteps[0].dtype == torch.int64 else float(t)] - (model.model.scheduler.num_inference_steps - zs.shape[0] + 1)
- xt_inp = model.model.scheduler.scale_model_input(xt, t).to(torch.float16 if config.is_half else torch.float32)
-
- with torch.no_grad():
-            if save_compute:
-                comb_out, _, _ = model.unet_forward(
-                    xt_inp.expand(2, -1, -1, -1) if hasattr(model.model, 'unet') else xt_inp.expand(2, -1, -1),
-                    timestep=t,
-                    encoder_hidden_states=torch.cat([uncond_embeddings_hidden_states, text_embeddings_hidden_states], dim=0) if uncond_embeddings_hidden_states is not None else None,
-                    class_labels=torch.cat([uncond_embeddings_class_labels, text_embeddings_class_labels], dim=0) if uncond_embeddings_class_labels is not None else None,
-                    encoder_attention_mask=torch.cat([uncond_boolean_prompt_mask, text_embeddings_boolean_prompt_mask], dim=0) if uncond_boolean_prompt_mask is not None else None
-                )
-                uncond_out, cond_out = comb_out.sample.chunk(2, dim=0)
-            else:
-                uncond_out = model.unet_forward(xt_inp, timestep=t, encoder_hidden_states=uncond_embeddings_hidden_states, class_labels=uncond_embeddings_class_labels, encoder_attention_mask=uncond_boolean_prompt_mask)[0].sample
-                cond_out = model.unet_forward(xt_inp, timestep=t, encoder_hidden_states=text_embeddings_hidden_states, class_labels=text_embeddings_class_labels, encoder_attention_mask=text_embeddings_boolean_prompt_mask)[0].sample
-
- z = zs[idx] if zs is not None else None
- noise_pred = uncond_out + (cfg_scales[0] * (cond_out - uncond_out)).sum(axis=0).unsqueeze(0)
- xt = model.reverse_step_with_custom_noise(noise_pred, t, xt, variance_noise=z.unsqueeze(0), eta=etas[idx], first_order=first_order)
-
- return xt, zs
-
-if __name__ == "__main__": main()
\ No newline at end of file
diff --git a/main/inference/convert.py b/main/inference/convert.py
deleted file mode 100644
index e6636bde4600ce0461a0e7e8657f5c641815b093..0000000000000000000000000000000000000000
--- a/main/inference/convert.py
+++ /dev/null
@@ -1,590 +0,0 @@
-import re
-import os
-import gc
-import sys
-import time
-import faiss
-import torch
-import librosa
-import logging
-import argparse
-import warnings
-import onnxruntime
-import logging.handlers
-
-import numpy as np
-import soundfile as sf
-import torch.nn.functional as F
-
-from tqdm import tqdm
-from scipy import signal
-from distutils.util import strtobool
-
-warnings.filterwarnings("ignore")
-sys.path.append(os.getcwd())
-
-from main.configs.config import Config
-from main.library.algorithm.synthesizers import Synthesizer
-from main.library.utils import check_predictors, check_embedders, load_audio, load_embedders_model, cut, restore
-
-bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
-config = Config()
-translations = config.translations
-logger = logging.getLogger(__name__)
-logger.propagate = False
-
-for l in ["torch", "faiss", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "transformers", "matplotlib"]:
- logging.getLogger(l).setLevel(logging.ERROR)
-
-# Reset any stale handlers, then attach fresh console + rotating-file handlers.
-if logger.hasHandlers(): logger.handlers.clear()
-
-console_handler = logging.StreamHandler()
-console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-console_handler.setFormatter(console_formatter)
-console_handler.setLevel(logging.INFO)
-file_handler = logging.handlers.RotatingFileHandler(os.path.join("assets", "logs", "convert.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
-file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-file_handler.setFormatter(file_formatter)
-file_handler.setLevel(logging.DEBUG)
-logger.addHandler(console_handler)
-logger.addHandler(file_handler)
-logger.setLevel(logging.DEBUG)
-
-def parse_arguments():
- parser = argparse.ArgumentParser()
- parser.add_argument("--pitch", type=int, default=0)
- parser.add_argument("--filter_radius", type=int, default=3)
- parser.add_argument("--index_rate", type=float, default=0.5)
- parser.add_argument("--volume_envelope", type=float, default=1)
- parser.add_argument("--protect", type=float, default=0.33)
- parser.add_argument("--hop_length", type=int, default=64)
- parser.add_argument("--f0_method", type=str, default="rmvpe")
- parser.add_argument("--embedder_model", type=str, default="contentvec_base")
- parser.add_argument("--input_path", type=str, required=True)
- parser.add_argument("--output_path", type=str, default="./audios/output.wav")
- parser.add_argument("--export_format", type=str, default="wav")
- parser.add_argument("--pth_path", type=str, required=True)
- parser.add_argument("--index_path", type=str)
- parser.add_argument("--f0_autotune", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--f0_autotune_strength", type=float, default=1)
- parser.add_argument("--clean_audio", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--clean_strength", type=float, default=0.7)
- parser.add_argument("--resample_sr", type=int, default=0)
- parser.add_argument("--split_audio", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--checkpointing", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--f0_file", type=str, default="")
- parser.add_argument("--f0_onnx", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--embedders_mode", type=str, default="fairseq")
- parser.add_argument("--formant_shifting", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--formant_qfrency", type=float, default=0.8)
- parser.add_argument("--formant_timbre", type=float, default=0.8)
-
- return parser.parse_args()
-
-def main():
- args = parse_arguments()
- pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, f0_autotune_strength, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, checkpointing, f0_file, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre = args.pitch, args.filter_radius, args.index_rate, args.volume_envelope,args.protect, args.hop_length, args.f0_method, args.input_path, args.output_path, args.pth_path, args.index_path, args.f0_autotune, args.f0_autotune_strength, args.clean_audio, args.clean_strength, args.export_format, args.embedder_model, args.resample_sr, args.split_audio, args.checkpointing, args.f0_file, args.f0_onnx, args.embedders_mode, args.formant_shifting, args.formant_qfrency, args.formant_timbre
-
- log_data = {translations['pitch']: pitch, translations['filter_radius']: filter_radius, translations['index_strength']: index_rate, translations['volume_envelope']: volume_envelope, translations['protect']: protect, "Hop length": hop_length, translations['f0_method']: f0_method, translations['audio_path']: input_path, translations['output_path']: output_path.replace('wav', export_format), translations['model_path']: pth_path, translations['indexpath']: index_path, translations['autotune']: f0_autotune, translations['clear_audio']: clean_audio, translations['export_format']: export_format, translations['hubert_model']: embedder_model, translations['split_audio']: split_audio, translations['memory_efficient_training']: checkpointing, translations["f0_onnx_mode"]: f0_onnx, translations["embed_mode"]: embedders_mode}
-
- if clean_audio: log_data[translations['clean_strength']] = clean_strength
- if resample_sr != 0: log_data[translations['sample_rate']] = resample_sr
-
- if f0_autotune: log_data[translations['autotune_rate_info']] = f0_autotune_strength
- if os.path.isfile(f0_file): log_data[translations['f0_file']] = f0_file
-
- if formant_shifting:
- log_data[translations['formant_qfrency']] = formant_qfrency
- log_data[translations['formant_timbre']] = formant_timbre
-
- for key, value in log_data.items():
- logger.debug(f"{key}: {value}")
-
- run_convert_script(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, input_path=input_path, output_path=output_path, pth_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, split_audio=split_audio, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_mode=embedders_mode, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre)
-
-def run_convert_script(pitch=0, filter_radius=3, index_rate=0.5, volume_envelope=1, protect=0.5, hop_length=64, f0_method="rmvpe", input_path=None, output_path="./output.wav", pth_path=None, index_path=None, f0_autotune=False, f0_autotune_strength=1, clean_audio=False, clean_strength=0.7, export_format="wav", embedder_model="contentvec_base", resample_sr=0, split_audio=False, checkpointing=False, f0_file=None, f0_onnx=False, embedders_mode="fairseq", formant_shifting=False, formant_qfrency=0.8, formant_timbre=0.8):
- check_predictors(f0_method, f0_onnx); check_embedders(embedder_model, embedders_mode)
-
- if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith((".pth", ".onnx")):
- logger.warning(translations["provide_file"].format(filename=translations["model"]))
- sys.exit(1)
-
- cvt = VoiceConverter(pth_path, 0)
- start_time = time.time()
-
- pid_path = os.path.join("assets", "convert_pid.txt")
- with open(pid_path, "w") as pid_file:
- pid_file.write(str(os.getpid()))
-
- if os.path.isdir(input_path):
- logger.info(translations["convert_batch"])
- audio_files = [f for f in os.listdir(input_path) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]
-
- if not audio_files:
- logger.warning(translations["not_found_audio"])
- sys.exit(1)
-
- logger.info(translations["found_audio"].format(audio_files=len(audio_files)))
-
- for audio in audio_files:
- audio_path = os.path.join(input_path, audio)
- output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")
-
- logger.info(f"{translations['convert_audio']} '{audio_path}'...")
- if os.path.exists(output_audio): os.remove(output_audio)
- cvt.convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=audio_path, audio_output_path=output_audio, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_mode=embedders_mode, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre, split_audio=split_audio)
-
- logger.info(translations["convert_batch_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}", output_path=output_path.replace('wav', export_format)))
- else:
- if not os.path.exists(input_path):
- logger.warning(translations["not_found_audio"])
- sys.exit(1)
-
- logger.info(f"{translations['convert_audio']} '{input_path}'...")
- if os.path.exists(output_path): os.remove(output_path)
- cvt.convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=input_path, audio_output_path=output_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_mode=embedders_mode, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre, split_audio=split_audio)
-
- if os.path.exists(pid_path): os.remove(pid_path)
- logger.info(translations["convert_audio_success"].format(input_path=input_path, elapsed_time=f"{(time.time() - start_time):.2f}", output_path=output_path.replace('wav', export_format)))
-
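-# Blend the output's RMS envelope toward the input's: rate=1 leaves the target untouched, rate=0 fully matches the source loudness contour.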
-def change_rms(source_audio, source_rate, target_audio, target_rate, rate):
- rms2 = F.interpolate(torch.from_numpy(librosa.feature.rms(y=target_audio, frame_length=target_rate // 2 * 2, hop_length=target_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze()
- return (target_audio * (torch.pow(F.interpolate(torch.from_numpy(librosa.feature.rms(y=source_audio, frame_length=source_rate // 2 * 2, hop_length=source_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze(), 1 - rate) * torch.pow(torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6), rate - 1)).numpy())
-
-def clear_gpu_cache():
- gc.collect()
- if torch.cuda.is_available(): torch.cuda.empty_cache()
- elif torch.backends.mps.is_available(): torch.mps.empty_cache()
-
-def get_providers():
- ort_providers = onnxruntime.get_available_providers()
-
- if "CUDAExecutionProvider" in ort_providers: providers = ["CUDAExecutionProvider"]
- elif "CoreMLExecutionProvider" in ort_providers: providers = ["CoreMLExecutionProvider"]
- else: providers = ["CPUExecutionProvider"]
-
- return providers
-
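-# Pulls each F0 frame toward the nearest reference note frequency, scaled by f0_autotune_strength.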
-class Autotune:
- def __init__(self, ref_freqs):
- self.ref_freqs = ref_freqs
- self.note_dict = self.ref_freqs
-
- def autotune_f0(self, f0, f0_autotune_strength):
- autotuned_f0 = np.zeros_like(f0)
-
- for i, freq in enumerate(f0):
- autotuned_f0[i] = freq + (min(self.note_dict, key=lambda x: abs(x - freq)) - freq) * f0_autotune_strength
-
- return autotuned_f0
-
-class VC:
- def __init__(self, tgt_sr, config):
- self.x_pad = config.x_pad
- self.x_query = config.x_query
- self.x_center = config.x_center
- self.x_max = config.x_max
- self.sample_rate = 16000
- self.window = 160
- self.t_pad = self.sample_rate * self.x_pad
- self.t_pad_tgt = tgt_sr * self.x_pad
- self.t_pad2 = self.t_pad * 2
- self.t_query = self.sample_rate * self.x_query
- self.t_center = self.sample_rate * self.x_center
- self.t_max = self.sample_rate * self.x_max
- self.time_step = self.window / self.sample_rate * 1000
- self.f0_min = 50
- self.f0_max = 1100
- self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
- self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
- self.device = config.device
- self.is_half = config.is_half
- self.ref_freqs = [49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 69.30, 73.42, 77.78, 82.41, 87.31, 92.50, 98.00, 103.83, 110.00, 116.54, 123.47, 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, 185.00, 196.00, 207.65, 220.00, 233.08, 246.94, 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99, 392.00, 415.30, 440.00, 466.16, 493.88, 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, 739.99, 783.99, 830.61, 880.00, 932.33, 987.77, 1046.50]
- self.autotune = Autotune(self.ref_freqs)
- self.note_dict = self.autotune.note_dict
-
- def get_f0_pm(self, x, p_len):
- import parselmouth
-
-        f0 = (parselmouth.Sound(x, self.sample_rate).to_pitch_ac(time_step=self.window / self.sample_rate, voicing_threshold=0.6, pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array["frequency"])
- pad_size = (p_len - len(f0) + 1) // 2
-
- if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
- return f0
-
- def get_f0_mangio_crepe(self, x, p_len, hop_length, model="full", onnx=False):
- from main.library.predictors.CREPE import predict
-
- x = x.astype(np.float32)
- x /= np.quantile(np.abs(x), 0.999)
-
- audio = torch.unsqueeze(torch.from_numpy(x).to(self.device, copy=True), dim=0)
- if audio.ndim == 2 and audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True).detach()
-
- p_len = p_len or x.shape[0] // hop_length
- source = np.array(predict(audio.detach(), self.sample_rate, hop_length, self.f0_min, self.f0_max, model, batch_size=hop_length * 2, device=self.device, pad=True, providers=get_providers(), onnx=onnx).squeeze(0).cpu().float().numpy())
- source[source < 0.001] = np.nan
-
- return np.nan_to_num(np.interp(np.arange(0, len(source) * p_len, len(source)) / p_len, np.arange(0, len(source)), source))
-
- def get_f0_crepe(self, x, model="full", onnx=False):
- from main.library.predictors.CREPE import predict, mean, median
-
- f0, pd = predict(torch.tensor(np.copy(x))[None].float(), self.sample_rate, self.window, self.f0_min, self.f0_max, model, batch_size=512, device=self.device, return_periodicity=True, providers=get_providers(), onnx=onnx)
- f0, pd = mean(f0, 3), median(pd, 3)
- f0[pd < 0.1] = 0
-
- return f0[0].cpu().numpy()
-
- def get_f0_fcpe(self, x, p_len, hop_length, onnx=False, legacy=False):
- from main.library.predictors.FCPE import FCPE
-
- model_fcpe = FCPE(os.path.join("assets", "models", "predictors", ("fcpe_legacy" if legacy else "fcpe") + (".onnx" if onnx else ".pt")), hop_length=int(hop_length), f0_min=int(self.f0_min), f0_max=int(self.f0_max), dtype=torch.float32, device=self.device, sample_rate=self.sample_rate, threshold=0.03 if legacy else 0.006, providers=get_providers(), onnx=onnx, legacy=legacy)
- f0 = model_fcpe.compute_f0(x, p_len=p_len)
-
- del model_fcpe
- return f0
-
- def get_f0_rmvpe(self, x, legacy=False, onnx=False):
- from main.library.predictors.RMVPE import RMVPE
-
- rmvpe_model = RMVPE(os.path.join("assets", "models", "predictors", "rmvpe" + (".onnx" if onnx else ".pt")), is_half=self.is_half, device=self.device, onnx=onnx, providers=get_providers())
- f0 = rmvpe_model.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max) if legacy else rmvpe_model.infer_from_audio(x, thred=0.03)
-
- del rmvpe_model
- return f0
-
- def get_f0_pyworld(self, x, filter_radius, model="harvest"):
- from main.library.predictors.WORLD_WRAPPER import PYWORLD
-
- pw = PYWORLD()
- x = x.astype(np.double)
-
- if model == "harvest": f0, t = pw.harvest(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10)
- elif model == "dio": f0, t = pw.dio(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10)
- else: raise ValueError(translations["method_not_valid"])
-
- f0 = pw.stonemask(x, self.sample_rate, t, f0)
-
- if filter_radius > 2 or model == "dio": f0 = signal.medfilt(f0, filter_radius)
- return f0
-
- def get_f0_swipe(self, x):
- from main.library.predictors.SWIPE import swipe
-
- f0, _ = swipe(x.astype(np.float32), self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, frame_period=10)
- return f0
-
- def get_f0_yin(self, x, hop_length, p_len, mode="yin"):
- source = np.array(librosa.yin(x.astype(np.float32), sr=self.sample_rate, fmin=self.f0_min, fmax=self.f0_max, hop_length=hop_length) if mode == "yin" else librosa.pyin(x.astype(np.float32), fmin=self.f0_min, fmax=self.f0_max, sr=self.sample_rate, hop_length=hop_length)[0])
- source[source < 0.001] = np.nan
- return np.nan_to_num(np.interp(np.arange(0, len(source) * p_len, len(source)) / p_len, np.arange(0, len(source)), source))
-
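-    # Hybrid mode: run every extractor named in "hybrid[a+b+...]", resample each F0 track to p_len, and take the element-wise nan-median.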
- def get_f0_hybrid(self, methods_str, x, p_len, hop_length, filter_radius, onnx_mode):
-        match = re.search(r"hybrid\[(.+)\]", methods_str)
-        if not match: raise ValueError(translations["method_not_valid"])
-        methods = [method.strip() for method in match.group(1).split("+")]
-
- f0_computation_stack, resampled_stack = [], []
- logger.debug(translations["hybrid_methods"].format(methods=methods))
-
- x = x.astype(np.float32)
- x /= np.quantile(np.abs(x), 0.999)
-
- for method in methods:
- f0 = None
- f0_methods = {"pm": lambda: self.get_f0_pm(x, p_len), "dio": lambda: self.get_f0_pyworld(x, filter_radius, "dio"), "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "tiny", onnx=onnx_mode), "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "small", onnx=onnx_mode), "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "medium", onnx=onnx_mode), "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "large", onnx=onnx_mode), "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "full", onnx=onnx_mode), "crepe-tiny": lambda: self.get_f0_crepe(x, "tiny", onnx=onnx_mode), "crepe-small": lambda: self.get_f0_crepe(x, "small", onnx=onnx_mode), "crepe-medium": lambda: self.get_f0_crepe(x, "medium", onnx=onnx_mode), "crepe-large": lambda: self.get_f0_crepe(x, "large", onnx=onnx_mode), "crepe-full": lambda: self.get_f0_crepe(x, "full", onnx=onnx_mode), "fcpe": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), onnx=onnx_mode), "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), legacy=True, onnx=onnx_mode), "rmvpe": lambda: self.get_f0_rmvpe(x, onnx=onnx_mode), "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, legacy=True, onnx=onnx_mode), "harvest": lambda: self.get_f0_pyworld(x, filter_radius, "harvest"), "yin": lambda: self.get_f0_yin(x, int(hop_length), p_len, mode="yin"), "pyin": lambda: self.get_f0_yin(x, int(hop_length), p_len, mode="pyin"), "swipe": lambda: self.get_f0_swipe(x)}
-            f0_fn = f0_methods.get(method)
-            if f0_fn is None: raise ValueError(translations["method_not_valid"])
-            f0 = f0_fn()
- f0_computation_stack.append(f0)
-
- for f0 in f0_computation_stack:
- resampled_stack.append(np.interp(np.linspace(0, len(f0), p_len), np.arange(len(f0)), f0))
-
- return resampled_stack[0] if len(resampled_stack) == 1 else np.nanmedian(np.vstack(resampled_stack), axis=0)
-
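-    # Dispatch to the chosen F0 extractor, apply optional autotune / manual F0 file / semitone shift, then quantize F0 to 255 coarse mel bins.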
- def get_f0(self, x, p_len, pitch, f0_method, filter_radius, hop_length, f0_autotune, f0_autotune_strength, inp_f0=None, onnx_mode=False):
- f0_methods = {"pm": lambda: self.get_f0_pm(x, p_len), "dio": lambda: self.get_f0_pyworld(x, filter_radius, "dio"), "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "tiny", onnx=onnx_mode), "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "small", onnx=onnx_mode), "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "medium", onnx=onnx_mode), "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "large", onnx=onnx_mode), "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "full", onnx=onnx_mode), "crepe-tiny": lambda: self.get_f0_crepe(x, "tiny", onnx=onnx_mode), "crepe-small": lambda: self.get_f0_crepe(x, "small", onnx=onnx_mode), "crepe-medium": lambda: self.get_f0_crepe(x, "medium", onnx=onnx_mode), "crepe-large": lambda: self.get_f0_crepe(x, "large", onnx=onnx_mode), "crepe-full": lambda: self.get_f0_crepe(x, "full", onnx=onnx_mode), "fcpe": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), onnx=onnx_mode), "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), legacy=True, onnx=onnx_mode), "rmvpe": lambda: self.get_f0_rmvpe(x, onnx=onnx_mode), "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, legacy=True, onnx=onnx_mode), "harvest": lambda: self.get_f0_pyworld(x, filter_radius, "harvest"), "yin": lambda: self.get_f0_yin(x, int(hop_length), p_len, mode="yin"), "pyin": lambda: self.get_f0_yin(x, int(hop_length), p_len, mode="pyin"), "swipe": lambda: self.get_f0_swipe(x)}
-        if "hybrid" in f0_method:
-            f0 = self.get_f0_hybrid(f0_method, x, p_len, hop_length, filter_radius, onnx_mode)
-        else:
-            f0_fn = f0_methods.get(f0_method)
-            if f0_fn is None: raise ValueError(translations["method_not_valid"])
-            f0 = f0_fn()
-
-        if f0_autotune: f0 = self.autotune.autotune_f0(f0, f0_autotune_strength)
- if isinstance(f0, tuple): f0 = f0[0]
-
- f0 *= pow(2, pitch / 12)
- tf0 = self.sample_rate // self.window
-
- if inp_f0 is not None:
- replace_f0 = np.interp(list(range(np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1).astype(np.int16))), inp_f0[:, 0] * 100, inp_f0[:, 1])
- f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[:f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]]
-
- f0_mel = 1127 * np.log(1 + f0 / 700)
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1
- f0_mel[f0_mel <= 1] = 1
- f0_mel[f0_mel > 255] = 255
-
- return np.rint(f0_mel).astype(np.int32), f0.copy()
-
- def extract_features(self, model, feats, version):
- return torch.as_tensor(model.run([model.get_outputs()[0].name, model.get_outputs()[1].name], {"feats": feats.detach().cpu().numpy()})[0 if version == "v1" else 1], dtype=torch.float32, device=feats.device)
-
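-    # Core conversion step: extract embedder features, optionally blend them with FAISS index retrieval, then synthesize with the generator (.pth via PyTorch, .onnx via onnxruntime).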
- def voice_conversion(self, model, net_g, sid, audio0, pitch, pitchf, index, big_npy, index_rate, version, protect):
-        pitch_guidance = pitch is not None and pitchf is not None
- feats = (torch.from_numpy(audio0).half() if self.is_half else torch.from_numpy(audio0).float())
-
- if feats.dim() == 2: feats = feats.mean(-1)
- assert feats.dim() == 1, feats.dim()
- feats = feats.view(1, -1)
-
- with torch.no_grad():
- if self.embed_suffix == ".pt":
- padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
- logits = model.extract_features(**{"source": feats.to(self.device), "padding_mask": padding_mask, "output_layer": 9 if version == "v1" else 12})
- feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
- elif self.embed_suffix == ".onnx": feats = self.extract_features(model, feats.to(self.device), version).to(self.device)
- elif self.embed_suffix == ".safetensors":
- logits = model(feats.to(self.device))["last_hidden_state"]
- feats = (model.final_proj(logits[0]).unsqueeze(0) if version == "v1" else logits)
- else: raise ValueError(translations["option_not_valid"])
-
- if protect < 0.5 and pitch_guidance: feats0 = feats.clone()
-
- if (not isinstance(index, type(None)) and not isinstance(big_npy, type(None)) and index_rate != 0):
- npy = feats[0].cpu().numpy()
- if self.is_half: npy = npy.astype(np.float32)
-
- score, ix = index.search(npy, k=8)
- weight = np.square(1 / score)
-
- npy = np.sum(big_npy[ix] * np.expand_dims(weight / weight.sum(axis=1, keepdims=True), axis=2), axis=1)
- if self.is_half: npy = npy.astype(np.float16)
-
- feats = (torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats)
-
- feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
- if protect < 0.5 and pitch_guidance: feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-
- p_len = audio0.shape[0] // self.window
-
- if feats.shape[1] < p_len:
- p_len = feats.shape[1]
- if pitch_guidance:
- pitch = pitch[:, :p_len]
- pitchf = pitchf[:, :p_len]
-
- if protect < 0.5 and pitch_guidance:
- pitchff = pitchf.clone()
- pitchff[pitchf > 0] = 1
- pitchff[pitchf < 1] = protect
- pitchff = pitchff.unsqueeze(-1)
-
- feats = (feats * pitchff + feats0 * (1 - pitchff)).to(feats0.dtype)
-
- p_len = torch.tensor([p_len], device=self.device).long()
-        if self.suffix == ".pth":
-            audio1 = (net_g.infer(feats.half() if self.is_half else feats.float(), p_len, pitch if pitch_guidance else None, (pitchf.half() if self.is_half else pitchf.float()) if pitch_guidance else None, sid)[0][0, 0]).data.cpu().float().numpy()
-        else:
-            input_names = [inp.name for inp in net_g.get_inputs()]
-            onnx_inputs = {input_names[0]: feats.cpu().numpy().astype(np.float32), input_names[1]: p_len.cpu().numpy(), input_names[2]: np.array([sid.cpu().item()], dtype=np.int64), input_names[3]: np.random.randn(1, 192, p_len.item()).astype(np.float32)}
-            if pitch_guidance:
-                onnx_inputs[input_names[4]] = pitch.cpu().numpy().astype(np.int64)
-                onnx_inputs[input_names[5]] = pitchf.cpu().numpy().astype(np.float32)
-            audio1 = net_g.run([net_g.get_outputs()[0].name], onnx_inputs)[0][0, 0]
-
- if self.embed_suffix == ".pt": del padding_mask
- del feats, p_len, net_g
- clear_gpu_cache()
- return audio1
-
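-    # Full pipeline: high-pass the input, find low-energy split points for long audio, compute F0 when the model uses pitch guidance, and convert segment by segment.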
- def pipeline(self, model, net_g, sid, audio, pitch, f0_method, file_index, index_rate, pitch_guidance, filter_radius, volume_envelope, version, protect, hop_length, f0_autotune, f0_autotune_strength, suffix, embed_suffix, f0_file=None, f0_onnx=False, pbar=None):
- self.suffix = suffix
- self.embed_suffix = embed_suffix
-
- if file_index != "" and os.path.exists(file_index) and index_rate != 0:
- try:
- index = faiss.read_index(file_index)
- big_npy = index.reconstruct_n(0, index.ntotal)
- except Exception as e:
- logger.error(translations["read_faiss_index_error"].format(e=e))
- index = big_npy = None
- else: index = big_npy = None
-
- pbar.update(1)
- opt_ts, audio_opt = [], []
- audio = signal.filtfilt(bh, ah, audio)
- audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
-
- if audio_pad.shape[0] > self.t_max:
- audio_sum = np.zeros_like(audio)
- for i in range(self.window):
- audio_sum += audio_pad[i : i - self.window]
-
- for t in range(self.t_center, audio.shape[0], self.t_center):
- opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min())[0][0])
-
- s = 0
- t, inp_f0 = None, None
- audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
- sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
- p_len = audio_pad.shape[0] // self.window
-
- if hasattr(f0_file, "name"):
- try:
- with open(f0_file.name, "r") as f:
- raw_lines = f.read()
- if len(raw_lines) > 0:
- inp_f0 = []
- for line in raw_lines.strip("\n").split("\n"):
- inp_f0.append([float(i) for i in line.split(",")])
-
- inp_f0 = np.array(inp_f0, dtype=np.float32)
-            except Exception:
- logger.error(translations["error_readfile"])
- inp_f0 = None
-
- pbar.update(1)
- if pitch_guidance:
- pitch, pitchf = self.get_f0(audio_pad, p_len, pitch, f0_method, filter_radius, hop_length, f0_autotune, f0_autotune_strength, inp_f0, onnx_mode=f0_onnx)
- pitch, pitchf = pitch[:p_len], pitchf[:p_len]
- if self.device == "mps": pitchf = pitchf.astype(np.float32)
- pitch, pitchf = torch.tensor(pitch, device=self.device).unsqueeze(0).long(), torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
-
- pbar.update(1)
- for t in opt_ts:
- t = t // self.window * self.window
- audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[s : t + self.t_pad2 + self.window], pitch[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None, pitchf[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
- s = t
-
- audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[t:], (pitch[:, t // self.window :] if t is not None else pitch) if pitch_guidance else None, (pitchf[:, t // self.window :] if t is not None else pitchf) if pitch_guidance else None, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
- audio_opt = np.concatenate(audio_opt)
- if volume_envelope != 1: audio_opt = change_rms(audio, self.sample_rate, audio_opt, self.sample_rate, volume_envelope)
- audio_max = np.abs(audio_opt).max() / 0.99
- if audio_max > 1: audio_opt /= audio_max
-
- if pitch_guidance: del pitch, pitchf
- del sid
- clear_gpu_cache()
- pbar.update(1)
-
- return audio_opt
-
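-# Loads a .pth/.onnx voice model plus an embedder and drives VC.pipeline for each file or chunk.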
-class VoiceConverter:
- def __init__(self, model_path, sid = 0):
- self.config = config
- self.device = config.device
- self.hubert_model = None
- self.tgt_sr = None
- self.net_g = None
- self.vc = None
- self.cpt = None
- self.version = None
- self.n_spk = None
- self.use_f0 = None
- self.loaded_model = None
- self.vocoder = "Default"
- self.checkpointing = False
- self.sample_rate = 16000
- self.sid = sid
- self.get_vc(model_path, sid)
-
- def convert_audio(self, audio_input_path, audio_output_path, index_path, embedder_model, pitch, f0_method, index_rate, volume_envelope, protect, hop_length, f0_autotune, f0_autotune_strength, filter_radius, clean_audio, clean_strength, export_format, resample_sr = 0, checkpointing = False, f0_file = None, f0_onnx = False, embedders_mode = "fairseq", formant_shifting = False, formant_qfrency = 0.8, formant_timbre = 0.8, split_audio = False):
- try:
- with tqdm(total=10, desc=translations["convert_audio"], ncols=100, unit="a") as pbar:
- audio = load_audio(logger, audio_input_path, self.sample_rate, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre)
- self.checkpointing = checkpointing
- audio_max = np.abs(audio).max() / 0.95
- if audio_max > 1: audio /= audio_max
-
- pbar.update(1)
- if not self.hubert_model:
- models, _, embed_suffix = load_embedders_model(embedder_model, embedders_mode, providers=get_providers())
- self.hubert_model = (models.to(self.device).half() if self.config.is_half else models.to(self.device).float()).eval() if embed_suffix in [".pt", ".safetensors"] else models
- self.embed_suffix = embed_suffix
-
- pbar.update(1)
-                if resample_sr >= self.sample_rate and self.tgt_sr != resample_sr: self.tgt_sr = resample_sr
- target_sr = min([8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000], key=lambda x: abs(x - self.tgt_sr))
-
- if split_audio:
- chunks = cut(audio, self.sample_rate, db_thresh=-60, min_interval=500)
- pbar.total = len(chunks) * 4 + 6
- logger.info(f"{translations['split_total']}: {len(chunks)}")
- else: chunks = [(audio, 0, 0)]
-
- converted_chunks = []
- pbar.update(1)
-
- for waveform, start, end in chunks:
- converted_chunks.append((start, end, self.vc.pipeline(model=self.hubert_model, net_g=self.net_g, sid=self.sid, audio=waveform, pitch=pitch, f0_method=f0_method, file_index=(index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added")), index_rate=index_rate, pitch_guidance=self.use_f0, filter_radius=filter_radius, volume_envelope=volume_envelope, version=self.version, protect=protect, hop_length=hop_length, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, suffix=self.suffix, embed_suffix=self.embed_suffix, f0_file=f0_file, f0_onnx=f0_onnx, pbar=pbar)))
-
- pbar.update(1)
- audio_output = restore(converted_chunks, total_len=len(audio), dtype=converted_chunks[0][2].dtype) if split_audio else converted_chunks[0][2]
- if target_sr >= self.sample_rate and self.tgt_sr != target_sr: audio_output = librosa.resample(audio_output, orig_sr=self.tgt_sr, target_sr=target_sr, res_type="soxr_vhq")
-
- pbar.update(1)
- if clean_audio:
- from main.tools.noisereduce import reduce_noise
- audio_output = reduce_noise(y=audio_output, sr=target_sr, prop_decrease=clean_strength, device=self.device)
-
- sf.write(audio_output_path, audio_output, target_sr, format=export_format)
- pbar.update(1)
- except Exception as e:
- logger.error(translations["error_convert"].format(e=e))
- import traceback
- logger.debug(traceback.format_exc())
-
- def get_vc(self, weight_root, sid):
- if sid == "" or sid == []:
- self.cleanup()
- clear_gpu_cache()
-
- if not self.loaded_model or self.loaded_model != weight_root:
- self.loaded_model = weight_root
- self.load_model()
- if self.cpt is not None: self.setup()
-
- def cleanup(self):
- if self.hubert_model is not None:
- del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
- self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
- clear_gpu_cache()
-
- del self.net_g, self.cpt
- clear_gpu_cache()
- self.cpt = None
-
- def load_model(self):
- if os.path.isfile(self.loaded_model):
- if self.loaded_model.endswith(".pth"): self.cpt = torch.load(self.loaded_model, map_location="cpu")
- else:
- sess_options = onnxruntime.SessionOptions()
- sess_options.log_severity_level = 3
- self.cpt = onnxruntime.InferenceSession(self.loaded_model, sess_options=sess_options, providers=get_providers())
- else: self.cpt = None
-
- def setup(self):
- if self.cpt is not None:
- if self.loaded_model.endswith(".pth"):
- self.tgt_sr = self.cpt["config"][-1]
- self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
- self.use_f0 = self.cpt.get("f0", 1)
- self.version = self.cpt.get("version", "v1")
- self.vocoder = self.cpt.get("vocoder", "Default")
- if self.vocoder != "Default": self.config.is_half = False
-
- self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, checkpointing=self.checkpointing)
- del self.net_g.enc_q
-
- self.net_g.load_state_dict(self.cpt["weight"], strict=False)
- self.net_g.eval().to(self.device)
- self.net_g = (self.net_g.half() if self.config.is_half else self.net_g.float())
- self.n_spk = self.cpt["config"][-3]
- self.suffix = ".pth"
- else:
- import json
- import onnx
-
- metadata_dict = None
- for prop in onnx.load(self.loaded_model).metadata_props:
- if prop.key == "model_info":
- metadata_dict = json.loads(prop.value)
- break
-
- self.net_g = self.cpt
- self.tgt_sr = metadata_dict.get("sr", 32000)
- self.use_f0 = metadata_dict.get("f0", 1)
- self.version = metadata_dict.get("version", "v1")
- self.suffix = ".onnx"
-
- self.vc = VC(self.tgt_sr, self.config)
-
-if __name__ == "__main__": main()
\ No newline at end of file
diff --git a/main/inference/create_dataset.py b/main/inference/create_dataset.py
deleted file mode 100644
index 6d5f0a8f73dad06c7644cc7aa393ad84b9c01da5..0000000000000000000000000000000000000000
--- a/main/inference/create_dataset.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import os
-import sys
-import time
-import yt_dlp
-import shutil
-import librosa
-import logging
-import argparse
-import warnings
-import logging.handlers
-
-from soundfile import read, write
-from distutils.util import strtobool
-
-sys.path.append(os.getcwd())
-
-from main.configs.config import Config
-from main.library.algorithm.separator import Separator
-
-config = Config()
-translations = config.translations
-dataset_temp = os.path.join("dataset_temp")
-logger = logging.getLogger(__name__)
-
-# Reset any stale handlers, then attach fresh console + rotating-file handlers.
-if logger.hasHandlers(): logger.handlers.clear()
-
-console_handler = logging.StreamHandler()
-console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-console_handler.setFormatter(console_formatter)
-console_handler.setLevel(logging.INFO)
-file_handler = logging.handlers.RotatingFileHandler(os.path.join("assets", "logs", "create_dataset.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
-file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-file_handler.setFormatter(file_formatter)
-file_handler.setLevel(logging.DEBUG)
-logger.addHandler(console_handler)
-logger.addHandler(file_handler)
-logger.setLevel(logging.DEBUG)
-
-def parse_arguments():
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_audio", type=str, required=True)
- parser.add_argument("--output_dataset", type=str, default="./dataset")
- parser.add_argument("--sample_rate", type=int, default=44100)
- parser.add_argument("--clean_dataset", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--clean_strength", type=float, default=0.7)
- parser.add_argument("--separator_reverb", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--kim_vocal_version", type=int, default=2)
- parser.add_argument("--overlap", type=float, default=0.25)
- parser.add_argument("--segments_size", type=int, default=256)
- parser.add_argument("--mdx_hop_length", type=int, default=1024)
- parser.add_argument("--mdx_batch_size", type=int, default=1)
- parser.add_argument("--denoise_mdx", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--skip", type=lambda x: bool(strtobool(x)), default=False)
- parser.add_argument("--skip_start_audios", type=str, default="0")
- parser.add_argument("--skip_end_audios", type=str, default="0")
-
- return parser.parse_args()
-
-def main():
- pid_path = os.path.join("assets", "create_dataset_pid.txt")
- with open(pid_path, "w") as pid_file:
- pid_file.write(str(os.getpid()))
-
- args = parse_arguments()
- input_audio, output_dataset, sample_rate, clean_dataset, clean_strength, separator_reverb, kim_vocal_version, overlap, segments_size, hop_length, batch_size, denoise_mdx, skip, skip_start_audios, skip_end_audios = args.input_audio, args.output_dataset, args.sample_rate, args.clean_dataset, args.clean_strength, args.separator_reverb, args.kim_vocal_version, args.overlap, args.segments_size, args.mdx_hop_length, args.mdx_batch_size, args.denoise_mdx, args.skip, args.skip_start_audios, args.skip_end_audios
- log_data = {translations['audio_path']: input_audio, translations['output_path']: output_dataset, translations['sr']: sample_rate, translations['clear_dataset']: clean_dataset, translations['dereveb_audio']: separator_reverb, translations['segments_size']: segments_size, translations['overlap']: overlap, "Hop length": hop_length, translations['batch_size']: batch_size, translations['denoise_mdx']: denoise_mdx, translations['skip']: skip}
-
- if clean_dataset: log_data[translations['clean_strength']] = clean_strength
- if skip:
- log_data[translations['skip_start']] = skip_start_audios
- log_data[translations['skip_end']] = skip_end_audios
-
- for key, value in log_data.items():
- logger.debug(f"{key}: {value}")
-
- if kim_vocal_version not in [1, 2]: raise ValueError(translations["version_not_valid"])
- start_time = time.time()
-
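-    # Each comma-separated input is fetched into dataset_temp by downloader() before separation.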
- try:
- paths = []
-
- if not os.path.exists(dataset_temp): os.makedirs(dataset_temp, exist_ok=True)
- urls = input_audio.replace(", ", ",").split(",")
-
-        for i, url in enumerate(urls):
-            paths.append(downloader(url, i))
-
- if skip:
- skip_start_audios, skip_end_audios = skip_start_audios.replace(", ", ",").split(","), skip_end_audios.replace(", ", ",").split(",")
-
- if len(skip_start_audios) < len(paths) or len(skip_end_audios) < len(paths):
- logger.warning(translations["skip