diff --git a/main/app/app.py b/main/app/app.py
deleted file mode 100644
index b357f9b2acc4a5a8977537eeb13bdf05828398a1..0000000000000000000000000000000000000000
--- a/main/app/app.py
+++ /dev/null
@@ -1,524 +0,0 @@
-import os
-import io
-import ssl
-import sys
-import time
-import codecs
-import logging
-import warnings
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-start_time = time.time()
-
-from main.app.tabs.extra.extra import extra_tab
-from main.app.tabs.editing.editing import editing_tab
-from main.app.tabs.training.training import training_tab
-from main.app.tabs.downloads.downloads import download_tab
-from main.app.tabs.inference.inference import inference_tab
-from main.configs.rpc import connect_discord_ipc, send_discord_rpc
-from main.app.variables import logger, config, translations, theme, font, configs, language, allow_disk
-
# SECURITY NOTE(review): this swaps the ssl module's default HTTPS context factory
# for the unverified one, so code relying on that default skips certificate
# verification for the whole process. Kept as-is; confirm it is intentional.
ssl._create_default_https_context = ssl._create_unverified_context

# Hide every Python warning and keep chatty third-party loggers at ERROR level.
warnings.filterwarnings("ignore")
for noisy_logger in ("httpx", "gradio", "uvicorn", "httpcore", "urllib3"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)
-
-js_code = """
-() => {
- window._activeStream = null;
- window._audioCtx = null;
- window._workletNode = null;
- window._playbackNode = null;
- window._ws = null;
-
- function setStatus(msg, use_alert = true) {
- const realtimeStatus = document.querySelector("#realtime-status-info h2.output-class");
- if (use_alert) alert(msg);
-
- if (realtimeStatus) {
- realtimeStatus.innerText = msg;
- realtimeStatus.style.whiteSpace = "nowrap";
- realtimeStatus.style.textAlign = "center";
- }
- }
-
- async function addModuleFromString(ctx, codeStr) {
- const blob = new Blob([codeStr], {type: 'application/javascript'});
- const url = URL.createObjectURL(blob);
-
- await ctx.audioWorklet.addModule(url);
- URL.revokeObjectURL(url);
- };
-
- function createOutputRoute(audioCtx, playbackNode, sinkId, gainValue = 1.0) {
- const dest = audioCtx.createMediaStreamDestination();
- const gainNode = audioCtx.createGain();
- gainNode.gain.value = gainValue;
-
- playbackNode.connect(gainNode);
- gainNode.connect(dest);
-
- const el = document.createElement('audio');
- el.autoplay = true;
- el.srcObject = dest.stream;
- el.style.display = 'none';
- document.body.appendChild(el);
-
- if (el.setSinkId) el.setSinkId(sinkId).catch(err => console.error(err));
- return { dest, gainNode, el };
- }
-
- const inputWorkletSource = `
- class InputProcessor extends AudioWorkletProcessor {
- constructor() {
- super();
- this.buffer = new Float32Array(0);
- this.block_frame = 128;
- this.port.onmessage = (e) => {
- if (e.data && e.data.block_frame) this.block_frame = e.data.block_frame;
- };
- }
-
- process(inputs) {
- const input = inputs[0];
- if (!input || !input[0]) return true;
- const frame = input[0];
-
- const newBuf = new Float32Array(this.buffer.length + frame.length);
- newBuf.set(this.buffer, 0);
- newBuf.set(frame, this.buffer.length);
- this.buffer = newBuf;
-
- while (this.buffer.length >= this.block_frame) {
- const chunk = this.buffer.slice(0, this.block_frame);
-
- this.port.postMessage({chunk}, [chunk.buffer]);
- this.buffer = this.buffer.slice(this.block_frame);
- }
-
- return true;
- }
- }
- registerProcessor('input-processor', InputProcessor);
- `;
-
- const playbackWorkletSource = `
- class PlaybackProcessor extends AudioWorkletProcessor {
- constructor(options) {
- super(options);
- const bufferSize = options.processorOptions && options.processorOptions.bufferSize ? options.processorOptions.bufferSize: 98304;
- this.buffer = new Float32Array(bufferSize);
- this.bufferCapacity = bufferSize;
- this.writePointer = 0;
- this.readPointer = 0;
- this.availableSamples = 0;
- this.port.onmessage = (e) => {
- if (e.data && e.data.chunk) {
- const chunk = new Float32Array(e.data.chunk);
- const chunkSize = chunk.length;
-
- if (this.availableSamples + chunkSize > this.bufferCapacity) return;
-
- for (let i = 0; i < chunkSize; i++) {
- this.buffer[this.writePointer] = chunk[i];
- this.writePointer = (this.writePointer + 1) % this.bufferCapacity;
- }
-
- this.availableSamples += chunkSize;
- }
- };
- }
-
- process(inputs, outputs) {
- const output = outputs[0];
- if (!output || !output[0]) return true;
-
- const frame = output[0];
- const frameSize = frame.length;
-
- if (this.availableSamples >= frameSize) {
- for (let i = 0; i < frameSize; i++) {
- frame[i] = this.buffer[this.readPointer];
- this.readPointer = (this.readPointer + 1) % this.bufferCapacity;
- }
- this.availableSamples -= frameSize;
- } else {
- frame.fill(0);
- }
-
- if (output.length > 1) output[1].set(output[0]);
- return true;
- }
- }
- registerProcessor('playback-processor', PlaybackProcessor);
- `;
-
- window.getAudioDevices = async function() {
- if (!navigator.mediaDevices) {
- setStatus("__MEDIA_DEVICES__");
- return {"inputs": {}, "outputs": {}};
- }
-
- try {
- await navigator.mediaDevices.getUserMedia({ audio: true });
- } catch (err) {
- console.error(err);
- setStatus("__MIC_INACCESSIBLE__")
-
- return {"inputs": {}, "outputs": {}};
- }
-
- const devices = await navigator.mediaDevices.enumerateDevices();
- const inputs = {};
- const outputs = {};
-
- for (const device of devices) {
- if (device.kind === "audioinput") {
- inputs[device.label] = device.deviceId
- } else if (device.kind === "audiooutput") {
- outputs[device.label] = device.deviceId
- }
- }
-
- if (!Object.keys(inputs).length && !Object.keys(outputs).length) return {"inputs": {}, "outputs": {}};
- return {"inputs": inputs, "outputs": outputs};
- };
-
- window.StreamAudioRealtime = async function(
- monitor,
- vad_enabled,
- input_audio_device,
- output_audio_device,
- monitor_output_device,
- input_audio_gain,
- output_audio_gain,
- monitor_audio_gain,
- chunk_size,
- pitch,
- model_pth,
- model_index,
- index_strength,
- onnx_f0_mode,
- f0_method,
- hop_length,
- embed_mode,
- embedders,
- custom_embedders,
- f0_autotune,
- proposal_pitch,
- f0_autotune_strength,
- proposal_pitch_threshold,
- rms_mix_rate,
- protect,
- filter_radius,
- silent_threshold,
- extra_convert_size,
- cross_fade_overlap_size,
- vad_sensitivity,
- vad_frame_ms,
- clean_audio,
- clean_strength
- ) {
- const SampleRate = 48000;
- const ReadChunkSize = Math.round(chunk_size * SampleRate / 1000 / 128);
- const block_frame = parseInt(ReadChunkSize) * 128;
- const ButtonState = { start_button: true, stop_button: false };
- const devices = await window.getAudioDevices();
-
- input_audio_device = devices["inputs"][input_audio_device];
- output_audio_device = devices["outputs"][output_audio_device];
- if (monitor && devices["outputs"][monitor_output_device]) monitor_output_device = devices["outputs"][monitor_output_device];
-
- try {
- if (!input_audio_device || !output_audio_device) {
- setStatus("__PROVIDE_AUDIO_DEVICE__");
- return ButtonState;
- }
-
- if (monitor && !monitor_output_device) {
- setStatus("__PROVIDE_MONITOR_DEVICE__");
- return ButtonState;
- }
-
- if (!model_pth) {
- setStatus("__PROVIDE_MODEL__")
- return ButtonState;
- }
-
- setStatus("__START_REALTIME__", use_alert=false)
-
- const stream = await navigator.mediaDevices.getUserMedia({
- audio: {
- deviceId: { exact: input_audio_device },
- channelCount: 1,
- sampleRate: SampleRate,
- echoCancellation: false,
- noiseSuppression: false,
- autoGainControl: false
- }
- });
-
- window._activeStream = stream;
- window._audioCtx = new AudioContext({ sampleRate: SampleRate, latencyHint: "interactive" });
-
- await addModuleFromString(window._audioCtx, inputWorkletSource);
- await addModuleFromString(window._audioCtx, playbackWorkletSource);
-
- const src = window._audioCtx.createMediaStreamSource(stream);
- const inputNode = new AudioWorkletNode(window._audioCtx, 'input-processor');
- const playbackNode = new AudioWorkletNode(window._audioCtx, 'playback-processor', {
- processorOptions: {
- bufferSize: block_frame * 2
- }
- });
-
- inputNode.port.postMessage({ block_frame: block_frame });
- src.connect(inputNode);
-
- createOutputRoute(window._audioCtx, playbackNode, output_audio_device, output_audio_gain / 100);
- if (monitor && monitor_output_device) createOutputRoute(window._audioCtx, playbackNode, monitor_output_device, monitor_audio_gain / 100);
-
- const protocol = (location.protocol === "https:") ? "wss:" : "ws:";
- const wsUrl = protocol + '//' + location.hostname + `:${location.port}` + '/api/ws-audio';
- const ws = new WebSocket(wsUrl);
-
- ButtonState.start_button = false;
- ButtonState.stop_button = true;
-
- ws.binaryType = "arraybuffer";
- window._ws = ws;
-
- ws.onopen = () => {
- console.log("__WS_CONNECTED__")
-
- ws.send(
- JSON.stringify({
- type: 'init',
- chunk_size: ReadChunkSize,
- embedders: embedders,
- model_pth: model_pth,
- custom_embedders: custom_embedders,
- cross_fade_overlap_size: cross_fade_overlap_size,
- extra_convert_size: extra_convert_size,
- model_index: model_index,
- f0_method: f0_method,
- f0_onnx: onnx_f0_mode,
- embedders_mode: embed_mode,
- hop_length: hop_length,
- silent_threshold: silent_threshold,
- vad_enabled: vad_enabled,
- vad_sensitivity: vad_sensitivity,
- vad_frame_ms: vad_frame_ms,
- clean_audio: clean_audio,
- clean_strength: clean_strength,
- f0_up_key: pitch,
- index_rate: index_strength,
- protect: protect,
- filter_radius: filter_radius,
- rms_mix_rate: rms_mix_rate,
- f0_autotune: f0_autotune,
- f0_autotune_strength: f0_autotune_strength,
- proposal_pitch: proposal_pitch,
- proposal_pitch_threshold: proposal_pitch_threshold,
- input_audio_gain: input_audio_gain
- })
- );
- };
-
- inputNode.port.onmessage = (e) => {
- const chunk = e.data && e.data.chunk;
-
- if (!chunk) return;
- if (ws.readyState === WebSocket.OPEN) ws.send(chunk);
- };
-
- ws.onmessage = (ev) => {
- if (typeof ev.data === 'string') {
- const msg = JSON.parse(ev.data);
-
- if (msg.type === 'latency') setStatus(`__LATENCY__: ${msg.value.toFixed(1)} ms`, use_alert=false)
- if (msg.type === 'warnings') {
- setStatus(msg.value);
- StopAudioStream();
- }
-
- return;
- }
-
- const ab = ev.data;
- playbackNode.port.postMessage({ chunk: ab }, [ab]);
- };
-
- ws.onclose = () => console.log("__WS_CLOSED__");
- window._workletNode = inputNode;
- window._playbackNode = playbackNode;
-
- if (window._audioCtx.state === 'suspended') await window._audioCtx.resume();
-
- console.log("__REALTIME_STARTED__");
- return ButtonState;
- } catch (err) {
- console.error("__ERROR__", err);
- alert("__ERROR__" + err.message);
-
- return StopAudioStream();
- }
- };
-
- window.StopAudioStream = async function() {
- try {
- if (window._ws) {
- window._ws.close();
- window._ws = null;
- }
-
- if (window._activeStream) {
- window._activeStream.getTracks().forEach(t => t.stop());
- window._activeStream = null;
- }
-
- if (window._workletNode) {
- window._workletNode.disconnect();
- window._workletNode = null;
- }
-
- if (window._playbackNode) {
- window._playbackNode.disconnect();
- window._playbackNode = null;
- }
-
- if (window._audioCtx) {
- await window._audioCtx.close();
- window._audioCtx = null;
- }
-
- document.querySelectorAll('audio').forEach(a => a.remove());
- setStatus("__REALTIME_HAS_STOP__", use_alert=false);
-
- return {"start_button": true, "stop_button": false};
- } catch (e) {
- setStatus(`__ERROR__ ${e}`);
-
- return {"start_button": false, "stop_button": true}
- }
- };
-}
-""".replace(
- "__MEDIA_DEVICES__", translations["media_devices"]
-).replace(
- "__MIC_INACCESSIBLE__", translations["mic_inaccessible"]
-).replace(
- "__PROVIDE_AUDIO_DEVICE__", translations["provide_audio_device"]
-).replace(
- "__PROVIDE_MONITOR_DEVICE__", translations["provide_monitor_device"]
-).replace(
- "__START_REALTIME__", translations["start_realtime"]
-).replace(
- "__LATENCY__", translations['latency']
-).replace(
- "__WS_CONNECTED__", translations["ws_connected"]
-).replace(
- "__WS_CLOSED__", translations["ws_closed"]
-).replace(
- "__REALTIME_STARTED__", translations["realtime_is_ready"]
-).replace(
- "__ERROR__", translations["error_occurred"].format(e="")
-).replace(
- "__REALTIME_HAS_STOP__", translations["realtime_has_stop"]
-).replace(
- "__PROVIDE_MODEL__", translations["provide_file"].format(filename=translations["model"])
-)
-
-client_mode = True # "--client" in sys.argv
-
-with gr.Blocks(
- title="📱 Vietnamese-RVC GUI BY ANH",
- js=js_code if client_mode else None,
- theme=theme,
- css="".format(fonts=font or "https://fonts.googleapis.com/css2?family=Courgette&display=swap")
-) as app:
- gr.HTML("
🎵VIETNAMESE RVC BY ANH🎵
")
- gr.HTML(f"{translations['title']}
")
-
- with gr.Tabs():
- inference_tab()
- editing_tab()
-
- if client_mode:
- from main.app.tabs.realtime.realtime_client import realtime_client_tab
- realtime_client_tab()
- else:
- from main.app.tabs.realtime.realtime import realtime_tab
- realtime_tab()
-
- training_tab()
- download_tab()
- extra_tab(app)
-
- with gr.Row():
- gr.Markdown(translations["rick_roll"].format(rickroll=codecs.decode('uggcf://jjj.lbhghor.pbz/jngpu?i=qDj4j9JtKpD', 'rot13')))
-
- with gr.Row():
- gr.Markdown(translations["terms_of_use"])
-
- with gr.Row():
- gr.Markdown(translations["exemption"])
-
- if __name__ == "__main__":
- logger.info(config.device.replace("privateuseone", "dml"))
- logger.info(translations["start_app"])
- logger.info(translations["set_lang"].format(lang=language))
-
- port = configs.get("app_port", 7860)
- server_name = configs.get("server_name", "0.0.0.0")
- share = "--share" in sys.argv
-
- original_stdout = sys.stdout
- sys.stdout = io.StringIO()
-
- for i in range(configs.get("num_of_restart", 5)):
- try:
- gradio_app, _, share_url = app.queue().launch(
- favicon_path=configs["ico_path"],
- server_name=server_name,
- server_port=port,
- show_error=configs.get("app_show_error", False),
- inbrowser="--open" in sys.argv,
- share=share,
- allowed_paths=allow_disk,
- prevent_thread_lock=True,
- quiet=True
- )
- break
- except OSError:
- logger.debug(translations["port"].format(port=port))
- port -= 1
- except Exception as e:
- logger.error(translations["error_occurred"].format(e=e))
- sys.exit(1)
-
- if client_mode:
- from main.app.core.realtime_client import app as fastapi_app
- gradio_app.mount("/api", fastapi_app)
-
- sys.stdout = original_stdout
-
- if configs.get("discord_presence", True):
- pipe = connect_discord_ipc()
- if pipe:
- try:
- logger.info(translations["start_rpc"])
- send_discord_rpc(pipe)
- except KeyboardInterrupt:
- logger.info(translations["stop_rpc"])
- pipe.close()
-
- logger.info(f"{translations['running_local_url']}: {server_name}:{port}")
- if share: logger.info(f"{translations['running_share_url']}: {share_url}")
- logger.info(f"{translations['gradio_start']}: {(time.time() - start_time):.2f}s")
-
- while 1:
- time.sleep(5)
\ No newline at end of file
diff --git a/main/app/core/csrt.py b/main/app/core/csrt.py
deleted file mode 100644
index f99f0878ee52a4d1ba26dac59339042cff38b356..0000000000000000000000000000000000000000
--- a/main/app/core/csrt.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from main.app.core.inference import whisper_process
-from main.library.utils import check_spk_diarization
-from main.app.core.ui import gr_info, gr_warning, process_output
-from main.app.variables import config, translations, configs, logger
-
def create_srt(model_size, input_audio, output_file, word_timestamps):
    """Transcribe ``input_audio`` in a worker process and write the result as an SRT file.

    Args:
        model_size: Forwarded to ``check_spk_diarization`` and ``whisper_process``.
        input_audio: Path to an existing audio file (not a directory).
        output_file: Destination path; ``.srt`` is appended when it is missing.
        word_timestamps: Forwarded unchanged to ``whisper_process``.

    Returns:
        ``[None, None]`` when an argument is rejected, otherwise a two-item list:
        an update dict pointing at the written file, and the SRT text itself.
    """
    import multiprocessing as mp

    if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
        gr_warning(translations["input_not_valid"])
        return [None]*2

    # Validate before appending the extension: otherwise None crashes on
    # .endswith and "" silently turns into ".srt", making this check unreachable.
    if not output_file:
        gr_warning(translations["output_not_valid"])
        return [None]*2

    if not output_file.endswith(".srt"): output_file += ".srt"

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_file)
    if output_dir: os.makedirs(output_dir, exist_ok=True)

    output_file = process_output(output_file)

    check_spk_diarization(model_size, speechbrain=False)
    gr_info(translations["csrt"])

    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        # The start method was already fixed earlier in this process.
        pass

    whisper_queue = mp.Queue()
    whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, word_timestamps))
    whisperprocess.start()

    # Blocks until the worker puts its result on the queue.
    segments = whisper_queue.get()

    entries = []
    with open(output_file, "w", encoding="utf-8") as f:
        for number, segment in enumerate(segments, start=1):
            text = segment["text"].strip()
            entry = f"{number}\n{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n{text}\n\n"

            f.write(entry)
            entries.append(entry)

    # Join once instead of growing a string per segment, and log the full text once.
    info = "".join(entries)
    logger.info(info)

    gr_info(translations["success"])

    return [{"value": output_file, "visible": True, "__type__": "update"}, info]
-
def format_timestamp(seconds):
    """Format an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds first (rounded to the nearest
    millisecond, negative values clamped to zero) and then split into fields,
    so the fractional part is kept and carries correctly into seconds/minutes.

    Args:
        seconds: Offset in seconds as an int or float.

    Returns:
        The zero-padded timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))

    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1000)

    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"
\ No newline at end of file
diff --git a/main/app/core/downloads.py b/main/app/core/downloads.py
deleted file mode 100644
index 715d7313e50a84fe92a863cbc7f1b9786474d6f9..0000000000000000000000000000000000000000
--- a/main/app/core/downloads.py
+++ /dev/null
@@ -1,208 +0,0 @@
-import os
-import re
-import sys
-import json
-import codecs
-import shutil
-import yt_dlp
-import warnings
-import requests
-
-from bs4 import BeautifulSoup
-
-sys.path.append(os.getcwd())
-
-from main.tools import huggingface, gdown, meganz, mediafire, pixeldrain
-from main.app.variables import logger, translations, model_options, configs
-from main.app.core.process import move_files_from_directory, fetch_pretrained_data, extract_name_model
-from main.app.core.ui import gr_info, gr_warning, gr_error, process_output, replace_url, replace_modelname
-
def download_url(url):
    """Download the audio track of ``url`` with yt-dlp and convert it to WAV.

    The file is stored under ``configs["audios_path"]`` and named after the
    media title, with unsupported characters removed and whitespace runs
    replaced by ``-``.

    Returns:
        ``[None, None, None]`` when no URL is given, otherwise
        ``[audio_path, audio_path, success_message]``.
    """
    if not url:
        gr_warning(translations["provide_url"])
        return [None]*3

    audios_dir = configs["audios_path"]
    if not os.path.exists(audios_dir): os.makedirs(audios_dir, exist_ok=True)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        ydl_opts = {
            "format": "bestaudio/best",
            "postprocessors": [{
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192"
            }],
            "quiet": True,
            "no_warnings": True,
            "noplaylist": True,
            "verbose": False
        }

        gr_info(translations["start"].format(start=translations["download_music"]))

        # First pass: metadata only, to derive the output name from the title.
        with yt_dlp.YoutubeDL(ydl_opts) as probe:
            title = probe.extract_info(url, download=False).get('title', 'video')
            cleaned = re.sub(r'[^\w\s\u4e00-\u9fff\uac00-\ud7af\u0400-\u04FF\u1100-\u11FF]', '', title).strip()
            audio_output = os.path.join(audios_dir, re.sub(r'\s+', '-', cleaned))

            if os.path.exists(audio_output): shutil.rmtree(audio_output, ignore_errors=True)

        ydl_opts['outtmpl'] = audio_output

        # Second pass: the actual download, now that the output template is set.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            audio_output = process_output(audio_output + ".wav")

            ydl.download([url])

    gr_info(translations["success"])
    return [audio_output, audio_output, translations["success"]]
-
def move_file(file, download_dir, model):
    """Unpack ``file`` when it is a zip, then sort ``download_dir`` into the model folders.

    The weights and logs directories come from ``configs`` and are created
    when missing; the sorting itself is done by ``move_files_from_directory``.
    """
    weights_dir, logs_dir = configs["weights_path"], configs["logs_path"]

    for folder in (weights_dir, logs_dir):
        if not os.path.exists(folder): os.makedirs(folder, exist_ok=True)

    if file.endswith(".zip"):
        shutil.unpack_archive(file, download_dir)

    move_files_from_directory(download_dir, weights_dir, logs_dir, model)
-
def download_model(url=None, model=None):
    """Download a model from a supported host and move it into the model folders.

    Args:
        url: Link to the model; the host is chosen by substring match.
        model: Optional model name; derived from the downloaded file name when empty.

    Returns:
        The result of ``gr_warning`` when no URL is given, otherwise a
        translated status message (success, unsupported URL, or error text).
    """
    if not url: return gr_warning(translations["provide_url"])

    url = replace_url(url)
    download_dir = "download_model"

    os.makedirs(download_dir, exist_ok=True)

    try:
        gr_info(translations["start"].format(start=translations["download"]))

        # Host markers are probed in this order; each fetcher is resolved lazily.
        fetchers = (
            ("huggingface.co", lambda: huggingface.HF_download_file(url, download_dir)),
            ("google.com", lambda: gdown.gdown_download(url, download_dir)),
            ("mediafire.com", lambda: mediafire.Mediafire_Download(url, download_dir)),
            ("pixeldrain.com", lambda: pixeldrain.pixeldrain(url, download_dir)),
            ("mega.nz", lambda: meganz.mega_download_url(url, download_dir)),
        )
        fetch = next((run for marker, run in fetchers if marker in url), None)

        if fetch is None:
            gr_warning(translations["not_support_url"])
            return translations["not_support_url"]

        file = fetch()

        if not model:
            modelname = os.path.basename(file)
            stem = os.path.splitext(modelname)[0]

            model = extract_name_model(modelname) if modelname.endswith(".index") else stem
            if model is None: model = stem

        model = replace_modelname(model)

        move_file(file, download_dir, model)
        gr_info(translations["success"])

        return translations["success"]
    except Exception as e:
        message = translations["error_occurred"].format(e=e)
        gr_error(message=message)
        return translations["error_occurred"].format(e=e)
    finally:
        # The scratch directory is always removed, whatever the outcome.
        shutil.rmtree(download_dir, ignore_errors=True)
-
def download_pretrained_model(choices, model, sample_rate):
    """Download pretrained files into ``configs["pretrained_custom_path"]``.

    Two modes are selected by ``choices``:

    * ``translations["list_model"]``: ``model`` and ``sample_rate`` are keys into
      the table returned by ``fetch_pretrained_data()``.
    * ``translations["download_url"]``: ``model`` and ``sample_rate`` are used as
      two download URLs (the warnings label them "D" and "G"). One URL may be
      empty when the other points to a zip.

    Returns:
        A status string in list mode; in URL mode ``[None, None]`` on a missing
        URL or a ``(status, status)`` tuple. ``None`` for any other ``choices``.
    """
    pretraineds_custom_path = configs["pretrained_custom_path"]

    if choices == translations["list_model"]:
        paths = fetch_pretrained_data()[model][sample_rate]

        os.makedirs(pretraineds_custom_path, exist_ok=True)
        url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_phfgbz/", "rot13") + paths

        gr_info(translations["download_pretrain"])
        file = huggingface.HF_download_file(replace_url(url), os.path.join(pretraineds_custom_path, paths))

        if file.endswith(".zip"):
            shutil.unpack_archive(file, pretraineds_custom_path)
            os.remove(file)

        gr_info(translations["success"])
        return translations["success"]
    elif choices == translations["download_url"]:
        # Normalise empty inputs first: the suffix test used to run on the raw
        # values and raised AttributeError when a field was None.
        model = model or ""
        sample_rate = sample_rate or ""

        zip_suffixes = (".zip", ".zip?download=true")
        pretrain_is_zip = model.endswith(zip_suffixes) or sample_rate.endswith(zip_suffixes)

        if not model and not pretrain_is_zip:
            gr_warning(translations["provide_pretrain"].format(dg="D"))
            return [None]*2

        if not sample_rate and not pretrain_is_zip:
            gr_warning(translations["provide_pretrain"].format(dg="G"))
            return [None]*2

        gr_info(translations["download_pretrain"])

        urls = [link for link in (model, sample_rate) if link]

        for url in urls:
            url = replace_url(url)

            if "huggingface.co" in url: file = huggingface.HF_download_file(url, pretraineds_custom_path)
            elif "google.com" in url: file = gdown.gdown_download(url, pretraineds_custom_path)
            elif "mediafire.com" in url: file = mediafire.Mediafire_Download(url, pretraineds_custom_path)
            elif "pixeldrain.com" in url: file = pixeldrain.pixeldrain(url, pretraineds_custom_path)
            elif "mega.nz" in url: file = meganz.mega_download_url(url, pretraineds_custom_path)
            else:
                gr_warning(translations["not_support_url"])
                return translations["not_support_url"], translations["not_support_url"]

            if file.endswith(".zip"):
                shutil.unpack_archive(file, pretraineds_custom_path)
                if os.path.exists(file): os.remove(file)

        gr_info(translations["success"])
        return translations["success"], translations["success"]
-
def fetch_models_data(search, timeout=30):
    """Collect every result page of the model search endpoint for ``search``.

    Pages are requested one after another, starting at 1, until a page has an
    empty ``"table"`` field, the server answers with a non-200 status, the body
    is not valid JSON, or the request fails.

    Args:
        search: Search text sent to the endpoint.
        timeout: Seconds to wait for each request. Without it a stalled
            connection would block this loop indefinitely.

    Returns:
        A list with the ``"table"`` string of each page, in page order.
    """
    endpoint = codecs.decode("uggcf://ibvpr-zbqryf.pbz/srgpu_qngn.cuc", "rot13")
    all_table_data = []
    page = 1

    while 1:
        try:
            response = requests.post(url=endpoint, data={"page": page, "search": search}, timeout=timeout)

            if response.status_code != 200:
                logger.debug(f"{translations['code_error']} {response.status_code}")
                break

            table_data = response.json().get("table", "")
            if not table_data.strip(): break

            all_table_data.append(table_data)
            page += 1
        except json.JSONDecodeError:
            logger.debug(translations["json_error"])
            break
        except requests.RequestException as e:
            # Timeouts are RequestException subclasses, so they end the loop here too.
            logger.debug(translations["requests_error"].format(e=e))
            break

    return all_table_data
-
def search_models(name):
    """Search the model index for ``name`` and refill ``model_options`` with the hits.

    Only rows that have both a name link and a download link whose URL contains
    ``"huggingface"`` are kept.

    Returns:
        ``[None, None]`` when ``name`` is empty or nothing is found, otherwise
        two update dicts: one for the result selector, one for the download button.
    """
    if not name:
        gr_warning(translations["provide_name"])
        return [None]*2

    gr_info(translations["start"].format(start=translations["search"]))

    tables = fetch_models_data(name)

    if len(tables) == 0:
        gr_info(translations["not_found"].format(name=name))
        return [None]*2

    model_options.clear()

    for table in tables:
        for row in BeautifulSoup(table, "html.parser").select("tr"):
            name_tag = row.find("a", {"class": "fs-5"})
            url_tag = row.find("a", {"class": "btn btn-sm fw-bold btn-light ms-0 p-1 ps-2 pe-2"})

            # find() returns None for rows without these links; the href was
            # previously read before this check and raised TypeError.
            if name_tag is None or url_tag is None: continue

            url = url_tag.get("href", "").replace("https://easyaivoice.com/run?url=", "")
            if "huggingface" in url: model_options[replace_modelname(name_tag.text)] = url

    gr_info(translations["found"].format(results=len(model_options)))
    return [{"value": "", "choices": model_options, "interactive": True, "visible": True, "__type__": "update"}, {"value": translations["downloads"], "visible": True, "__type__": "update"}]
\ No newline at end of file
diff --git a/main/app/core/editing.py b/main/app/core/editing.py
deleted file mode 100644
index d30e43638d0689a9fe5475646ed4ad158e769863..0000000000000000000000000000000000000000
--- a/main/app/core/editing.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import os
-import sys
-import random
-import subprocess
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import python, translations, configs
-from main.app.core.ui import gr_info, gr_warning, process_output, replace_export_format
-
def audio_effects(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input, main_vol, combine_vol):
    """Run the audio-effects script on ``input_path`` in a child Python process.

    Every setting is forwarded as a command-line option of the script at
    ``configs["audio_effects_path"]``. The ``--pitchshift`` switch is derived
    from ``pitch_shift != 0``.

    Returns:
        ``None`` when the input or output path is rejected, otherwise the output
        path passed through ``replace_export_format``.
    """
    input_ok = bool(input_path) and os.path.exists(input_path) and not os.path.isdir(input_path)
    if not input_ok:
        gr_warning(translations["input_not_valid"])
        return None

    if not output_path:
        gr_warning(translations["output_not_valid"])
        return None

    # A directory target gets a default file name inside it.
    if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_effects.{export_format}")
    output_dir = os.path.dirname(output_path) or output_path

    if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
    output_path = process_output(output_path)

    gr_info(translations["start"].format(start=translations["apply_effect"]))

    # (option, value) pairs in the exact order they are handed to the script.
    options = [
        ("--input_path", input_path),
        ("--output_path", output_path),
        ("--resample", str(resample)),
        ("--resample_sr", str(resample_sr)),
        ("--chorus_depth", str(chorus_depth)),
        ("--chorus_rate", str(chorus_rate)),
        ("--chorus_mix", str(chorus_mix)),
        ("--chorus_delay", str(chorus_delay)),
        ("--chorus_feedback", str(chorus_feedback)),
        ("--drive_db", str(distortion_drive)),
        ("--reverb_room_size", str(reverb_room_size)),
        ("--reverb_damping", str(reverb_damping)),
        ("--reverb_wet_level", str(reverb_wet_level)),
        ("--reverb_dry_level", str(reverb_dry_level)),
        ("--reverb_width", str(reverb_width)),
        ("--reverb_freeze_mode", str(reverb_freeze_mode)),
        ("--pitch_shift", str(pitch_shift)),
        ("--delay_seconds", str(delay_seconds)),
        ("--delay_feedback", str(delay_feedback)),
        ("--delay_mix", str(delay_mix)),
        ("--compressor_threshold", str(compressor_threshold)),
        ("--compressor_ratio", str(compressor_ratio)),
        ("--compressor_attack_ms", str(compressor_attack_ms)),
        ("--compressor_release_ms", str(compressor_release_ms)),
        ("--limiter_threshold", str(limiter_threshold)),
        ("--limiter_release", str(limiter_release)),
        ("--gain_db", str(gain_db)),
        ("--bitcrush_bit_depth", str(bitcrush_bit_depth)),
        ("--clipping_threshold", str(clipping_threshold)),
        ("--phaser_rate_hz", str(phaser_rate_hz)),
        ("--phaser_depth", str(phaser_depth)),
        ("--phaser_centre_frequency_hz", str(phaser_centre_frequency_hz)),
        ("--phaser_feedback", str(phaser_feedback)),
        ("--phaser_mix", str(phaser_mix)),
        ("--bass_boost_db", str(bass_boost_db)),
        ("--bass_boost_frequency", str(bass_boost_frequency)),
        ("--treble_boost_db", str(treble_boost_db)),
        ("--treble_boost_frequency", str(treble_boost_frequency)),
        ("--fade_in_duration", str(fade_in_duration)),
        ("--fade_out_duration", str(fade_out_duration)),
        ("--export_format", export_format),
        ("--chorus", str(chorus)),
        ("--distortion", str(distortion)),
        ("--reverb", str(reverb)),
        ("--pitchshift", str(pitch_shift != 0)),
        ("--delay", str(delay)),
        ("--compressor", str(compressor)),
        ("--limiter", str(limiter)),
        ("--gain", str(gain)),
        ("--bitcrush", str(bitcrush)),
        ("--clipping", str(clipping)),
        ("--phaser", str(phaser)),
        ("--treble_bass_boost", str(treble_bass_boost)),
        ("--fade_in_out", str(fade_in_out)),
        ("--audio_combination", str(audio_combination)),
        ("--audio_combination_input", audio_combination_input),
        ("--main_volume", str(main_vol)),
        ("--combination_volume", str(combine_vol)),
    ]

    command = [python, configs["audio_effects_path"]]
    for option, value in options:
        command.append(option)
        command.append(value)

    subprocess.run(command)

    gr_info(translations["success"])
    return replace_export_format(output_path, export_format)
-
def apply_voice_quirk(audio_path, mode, output_path, export_format):
    """Apply one of sixteen numbered voice effects to ``audio_path`` and save the result.

    ``mode`` is a key of ``translations["quirk_choice"]``, which maps it to an
    effect number; number 0 picks a random effect from 1 to 16.

    Returns:
        ``None`` when the input or output path is rejected, otherwise the path
        of the written file.
    """
    input_ok = bool(audio_path) and os.path.exists(audio_path) and not os.path.isdir(audio_path)
    if not input_ok:
        gr_warning(translations["input_not_valid"])
        return None

    if not output_path:
        gr_warning(translations["output_not_valid"])
        return None

    # A directory target gets a default file name inside it.
    if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_quirk.{export_format}")
    output_dir = os.path.dirname(output_path) or output_path

    if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
    output_path = process_output(output_path)

    gr_info(translations["start"].format(start=translations["apply_effect"]))

    import librosa
    import numpy as np
    import soundfile as sf

    def vibrato(y, sr, freq=5, depth=0.003):
        # Re-read each sample from a sinusoidally shifted position, clamped to the signal.
        positions = np.arange(len(y))
        shift = (depth * np.sin(2 * np.pi * freq * (positions / sr))) * sr
        shifted = (positions + shift).astype(int)
        return y[np.clip(shifted, 0, len(y) - 1)]

    y, sr = librosa.load(audio_path, sr=None)
    output_path = replace_export_format(output_path, export_format)

    effect = translations["quirk_choice"][mode]
    if effect == 0: effect = random.randint(1, 16)

    # In-place operators (*=, +=) are kept on purpose so the array keeps its dtype.
    if effect == 1:
        y *= np.random.uniform(0.5, 0.8, size=len(y))
    elif effect == 2:
        noisy = y + np.random.normal(0, 0.01, y.shape)
        y = librosa.effects.pitch_shift(y=noisy, sr=sr, n_steps=np.random.uniform(-1.5, -3.5))
    elif effect == 3:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=3), rate=1.2)
    elif effect == 4:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=8), rate=1.3)
    elif effect == 5:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-3), rate=0.75)
    elif effect == 6:
        y *= np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.5 + 0.5
    elif effect == 7:
        lowered = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-4)
        y = librosa.effects.time_stretch(vibrato(lowered, sr, freq=3, depth=0.004), rate=0.85)
    elif effect == 8:
        # Copy of the signal delayed by half a second, cut back to the original length.
        delayed = np.pad(y, (sr // 2, 0), mode='constant')[:len(y)]
        y *= 0.6 + delayed * 0.4
    elif effect == 9:
        y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=2) + np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.02
    elif effect == 10:
        y = vibrato(y, sr, freq=8, depth=0.005)
    elif effect == 11:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=4), rate=1.25)
    elif effect == 12:
        frames = librosa.util.frame(y, frame_length=2048, hop_length=512).T
        y = np.hstack([np.pad(f, (0, int(len(f)*0.3)), mode='edge') for f in frames])
    elif effect == 13:
        tail = np.sin(2 * np.pi * np.linspace(0, 1, int(0.05 * sr))) * 0.02
        y = np.concatenate([y, tail])
    elif effect == 14:
        y += np.random.normal(0, 0.005, len(y))
    elif effect == 15:
        # Shuffle 0.2-second pieces.
        frame = int(sr * 0.2)
        chunks = [y[i:i + frame] for i in range(0, len(y), frame)]

        np.random.shuffle(chunks)
        y = np.concatenate(chunks)
    elif effect == 16:
        # Reverse every other 0.3-second piece in place.
        frame = int(sr * 0.3)

        for start in range(0, len(y), frame * 2):
            y[start:start+frame] = y[start:start+frame][::-1]

    sf.write(output_path, y, sr, format=export_format)
    gr_info(translations["success"])

    return output_path
\ No newline at end of file
diff --git a/main/app/core/f0_extract.py b/main/app/core/f0_extract.py
deleted file mode 100644
index 2c91d46a763f55f3905044d79a853c8148972b30..0000000000000000000000000000000000000000
--- a/main/app/core/f0_extract.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import gr_info, gr_warning
-from main.app.variables import config, translations, configs
-
-def f0_extract(audio, f0_method, f0_onnx):
- if not audio or not os.path.exists(audio) or os.path.isdir(audio):
- gr_warning(translations["input_not_valid"])
- return [None]*2
-
- import librosa
- import numpy as np
- import matplotlib.pyplot as plt
-
- from main.library.utils import check_assets, load_audio
- from main.library.predictors.Generator import Generator
-
- check_assets(f0_method, "", f0_onnx, "")
-
- f0_path = os.path.join(configs["f0_path"], os.path.splitext(os.path.basename(audio))[0])
- image_path = os.path.join(f0_path, "f0.png")
- txt_path = os.path.join(f0_path, "f0.txt")
-
- gr_info(translations["start_extract"])
-
- if not os.path.exists(f0_path): os.makedirs(f0_path, exist_ok=True)
-
- y = load_audio(audio, sample_rate=16000)
- f0_generator = Generator(16000, 160, 50, 1100, 0.5, is_half=config.is_half, device=config.device, f0_onnx_mode=f0_onnx, del_onnx_model=f0_onnx)
- _, pitchf = f0_generator.calculator(config.x_pad, f0_method, y, 0, None, 3, False, 0, None, False)
-
- F_temp = np.array(pitchf, dtype=np.float32)
- F_temp[F_temp == 0] = np.nan
-
- f0 = 1200 * np.log2(F_temp / librosa.midi_to_hz(0))
-
- plt.figure(figsize=(10, 4))
- plt.plot(f0)
- plt.title(f0_method)
- plt.xlabel(translations["time_frames"])
- plt.ylabel(translations["Frequency"])
- plt.savefig(image_path)
- plt.close()
-
- with open(txt_path, "w") as f:
- for i, f0_value in enumerate(f0):
- f.write(f"{i * 100.0},{f0_value}\n")
-
- gr_info(translations["extract_done"])
-
- return [txt_path, image_path]
\ No newline at end of file
diff --git a/main/app/core/inference.py b/main/app/core/inference.py
deleted file mode 100644
index 2690dac3839d6749b0bf59196bf25fb0d5a6f7bd..0000000000000000000000000000000000000000
--- a/main/app/core/inference.py
+++ /dev/null
@@ -1,441 +0,0 @@
-import os
-import re
-import gc
-import sys
-import shutil
-import datetime
-import subprocess
-
-import numpy as np
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import logger, config, configs, translations, python
-from main.app.core.ui import gr_info, gr_warning, gr_error, process_output, replace_export_format
-
-def convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
- subprocess.run([
- python,
- configs["convert_path"],
- "--pitch", str(pitch),
- "--filter_radius", str(filter_radius),
- "--index_rate", str(index_rate),
- "--rms_mix_rate", str(rms_mix_rate),
- "--protect", str(protect),
- "--hop_length", str(hop_length),
- "--f0_method", f0_method,
- "--input_path", input_path,
- "--output_path", output_path,
- "--pth_path", pth_path,
- "--index_path", index_path,
- "--f0_autotune", str(f0_autotune),
- "--clean_audio", str(clean_audio),
- "--clean_strength", str(clean_strength),
- "--export_format", export_format,
- "--embedder_model", embedder_model,
- "--resample_sr", str(resample_sr),
- "--split_audio", str(split_audio),
- "--f0_autotune_strength", str(f0_autotune_strength),
- "--checkpointing", str(checkpointing),
- "--f0_onnx", str(f0_onnx),
- "--embedders_mode", embedders_mode,
- "--formant_shifting", str(formant_shifting),
- "--formant_qfrency", str(formant_qfrency),
- "--formant_timbre", str(formant_timbre),
- "--f0_file", f0_file,
- "--proposal_pitch", str(proposal_pitch),
- "--proposal_pitch_threshold", str(proposal_pitch_threshold),
- "--audio_processing", str(audio_processing),
- "--alpha", str(alpha)
- ])
-
-def convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, input_audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
- model_path = os.path.join(configs["weights_path"], model) if not os.path.exists(model) else model
-
- return_none = [None]*6
- return_none[5] = {"visible": True, "__type__": "update"}
-
- if not use_audio:
- if merge_instrument or not_merge_backing or convert_backing or use_original:
- gr_warning(translations["turn_on_use_audio"])
- return return_none
-
- if use_original:
- if convert_backing:
- gr_warning(translations["turn_off_convert_backup"])
- return return_none
- elif not_merge_backing:
- gr_warning(translations["turn_off_merge_backup"])
- return return_none
-
- if not model or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- return return_none
-
- f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
-
- if use_audio:
- output_audio = os.path.join(configs["audios_path"], input_audio_name)
-
- from main.library.utils import pydub_load
-
- def get_audio_file(label):
- matching_files = [f for f in os.listdir(output_audio) if label in f]
-
- if not matching_files: return translations["notfound"]
- return os.path.join(output_audio, matching_files[0])
-
- output_path = os.path.join(output_audio, f"Convert_Vocals.{format}")
- output_backing = os.path.join(output_audio, f"Convert_Backing.{format}")
- output_merge_backup = os.path.join(output_audio, f"Vocals+Backing.{format}")
- output_merge_instrument = os.path.join(output_audio, f"Vocals+Instruments.{format}")
-
- if os.path.exists(output_audio): os.makedirs(output_audio, exist_ok=True)
- output_path = process_output(output_path)
-
- if use_original:
- original_vocal = get_audio_file('Original_Vocals_No_Reverb.')
-
- if original_vocal == translations["notfound"]: original_vocal = get_audio_file('Original_Vocals.')
-
- if original_vocal == translations["notfound"]:
- gr_warning(translations["not_found_original_vocal"])
- return return_none
-
- input_path = original_vocal
- else:
- main_vocal = get_audio_file('Main_Vocals_No_Reverb.')
- backing_vocal = get_audio_file('Backing_Vocals.')
-
- if main_vocal == translations["notfound"]: main_vocal = get_audio_file('Main_Vocals.')
- if main_vocal == translations["notfound"]:
- gr_warning(translations["not_found_main_vocal"])
- return return_none
-
- if not not_merge_backing and backing_vocal == translations["notfound"]:
- gr_warning(translations["not_found_backing_vocal"])
- return return_none
-
- input_path = main_vocal
- backing_path = backing_vocal
-
- gr_info(translations["convert_vocal"])
-
- convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input_path, output_path, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- gr_info(translations["convert_success"])
-
- if convert_backing:
- output_backing = process_output(output_backing)
-
- gr_info(translations["convert_backup"])
-
- convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, backing_path, output_backing, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- gr_info(translations["convert_backup_success"])
-
- try:
- if not not_merge_backing and not use_original:
- backing_source = output_backing if convert_backing else backing_vocal
-
- output_merge_backup = process_output(output_merge_backup)
-
- gr_info(translations["merge_backup"])
-
- pydub_load(output_path, volume=-4).overlay(pydub_load(backing_source, volume=-6)).export(output_merge_backup, format=format)
-
- gr_info(translations["merge_success"])
-
- if merge_instrument:
- vocals = output_merge_backup if not not_merge_backing and not use_original else output_path
-
- output_merge_instrument = process_output(output_merge_instrument)
-
- gr_info(translations["merge_instruments_process"])
-
- instruments = get_audio_file('Instruments.')
-
- if instruments == translations["notfound"]:
- gr_warning(translations["not_found_instruments"])
- output_merge_instrument = None
- else: pydub_load(instruments, volume=-7).overlay(pydub_load(vocals, volume=-4 if use_original else None)).export(output_merge_instrument, format=format)
-
- gr_info(translations["merge_success"])
- except:
- return return_none
-
- return [(None if use_original else output_path), output_backing, (None if not_merge_backing and use_original else output_merge_backup), (output_path if use_original else None), (output_merge_instrument if merge_instrument else None), {"visible": True, "__type__": "update"}]
- else:
- if not input or not os.path.exists(input):
- gr_warning(translations["input_not_valid"])
- return return_none
-
- if not output:
- gr_warning(translations["output_not_valid"])
- return return_none
-
- output = replace_export_format(output, format)
-
- if os.path.isdir(input):
- gr_info(translations["is_folder"])
-
- if not [f for f in os.listdir(input) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]:
- gr_warning(translations["not_found_in_folder"])
- return return_none
-
- gr_info(translations["batch_convert"])
-
- output_dir = os.path.dirname(output) or output
- convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output_dir, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- gr_info(translations["batch_convert_success"])
-
- return return_none
- else:
- output_dir = os.path.dirname(output) or output
-
- if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
- output = process_output(output)
-
- gr_info(translations["convert_vocal"])
-
- convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- gr_info(translations["convert_success"])
-
- return_none[0] = output
- return return_none
-
-def convert_selection(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
- if use_audio:
- gr_info(translations["search_separate"])
- choice = [f for f in os.listdir(configs["audios_path"]) if os.path.isdir(os.path.join(configs["audios_path"], f))] if config.debug_mode else [f for f in os.listdir(configs["audios_path"]) if os.path.isdir(os.path.join(configs["audios_path"], f)) and any(file.lower().endswith((".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")) for file in os.listdir(os.path.join(configs["audios_path"], f)))]
-
- gr_info(translations["found_choice"].format(choice=len(choice)))
-
- if len(choice) == 0:
- gr_warning(translations["separator==0"])
-
- return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, None, None, None, None, None, {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]
- elif len(choice) == 1:
- convert_output = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, None, None, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, choice[0], checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, convert_output[0], convert_output[1], convert_output[2], convert_output[3], convert_output[4], {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]
- else: return [{"choices": choice, "value": choice[0], "interactive": True, "visible": True, "__type__": "update"}, None, None, None, None, None, {"visible": False, "__type__": "update"}, {"visible": True, "__type__": "update"}]
- else:
- main_convert = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, None, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, main_convert[0], None, None, None, None, {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]
-
-def whisper_process(model_size, input_audio, configs, device, out_queue, word_timestamps=True):
- from main.library.speaker_diarization.whisper import load_model
-
- try:
- segments = load_model(model_size, device=device).transcribe(input_audio, fp16=configs.get("fp16", False), word_timestamps=word_timestamps)
- out_queue.put(segments["segments"])
- except Exception as e:
- out_queue.put(e)
- finally:
- del segments
- gc.collect()
-
-def convert_with_whisper(num_spk, model_size, cleaner, clean_strength, autotune, f0_autotune_strength, checkpointing, model_1, model_2, model_index_1, model_index_2, pitch_1, pitch_2, index_strength_1, index_strength_2, export_format, input_audio, output_audio, onnx_f0_mode, method, hybrid_method, hop_length, embed_mode, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, formant_shifting, formant_qfrency_1, formant_timbre_1, formant_qfrency_2, formant_timbre_2, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
- import librosa
- import multiprocessing as mp
-
- from pydub import AudioSegment
- from sklearn.cluster import AgglomerativeClustering
-
- from main.library.utils import clear_gpu_cache
- from main.library.speaker_diarization.audio import Audio
- from main.library.speaker_diarization.segment import Segment
- from main.library.utils import check_spk_diarization, pydub_load
- from main.library.speaker_diarization.embedding import SpeechBrainPretrainedSpeakerEmbedding
-
- check_spk_diarization(model_size)
- model_pth_1, model_pth_2 = os.path.join(configs["weights_path"], model_1) if not os.path.exists(model_1) else model_1, os.path.join(configs["weights_path"], model_2) if not os.path.exists(model_2) else model_2
-
- if (not model_1 or not os.path.exists(model_pth_1) or os.path.isdir(model_pth_1) or not model_pth_1.endswith((".pth", ".onnx"))) and (not model_2 or not os.path.exists(model_pth_2) or os.path.isdir(model_pth_2) or not model_pth_2.endswith((".pth", ".onnx"))):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- return None
-
- if not model_1: model_pth_1 = model_pth_2
- if not model_2: model_pth_2 = model_pth_1
-
- if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
- gr_warning(translations["input_not_valid"])
- return None
-
- if not output_audio:
- gr_warning(translations["output_not_valid"])
- return None
-
- output_audio = process_output(output_audio)
- gr_info(translations["start_whisper"])
-
- try:
- try:
- mp.set_start_method("spawn")
- except:
- pass
-
- whisper_queue = mp.Queue()
- whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, True))
- whisperprocess.start()
-
- segments = whisper_queue.get()
- audio = Audio()
-
- embedding_model = SpeechBrainPretrainedSpeakerEmbedding(embedding=os.path.join(configs["speaker_diarization_path"], "models", "speechbrain"), device=config.device)
- y, sr = librosa.load(input_audio, sr=None)
- duration = len(y) / sr
-
- def segment_embedding(segment):
- waveform, _ = audio.crop(input_audio, Segment(segment["start"], min(duration, segment["end"])))
- return embedding_model(waveform.mean(dim=0, keepdim=True)[None] if waveform.shape[0] == 2 else waveform[None])
-
- def time(secs):
- return datetime.timedelta(seconds=round(secs))
-
- def merge_audio(files_list, time_stamps, original_file_path, output_path, format):
- def extract_number(filename):
- match = re.search(r'_(\d+)', filename)
- return int(match.group(1)) if match else 0
-
- total_duration = len(pydub_load(original_file_path))
- combined = AudioSegment.empty()
- current_position = 0
-
- for file, (start_i, end_i) in zip(sorted(files_list, key=extract_number), time_stamps):
- if start_i > current_position: combined += AudioSegment.silent(duration=start_i - current_position)
-
- combined += pydub_load(file)
- current_position = end_i
-
- if current_position < total_duration: combined += AudioSegment.silent(duration=total_duration - current_position)
- combined.export(output_path, format=format)
-
- return output_path
-
- embeddings = np.zeros(shape=(len(segments), 192))
- for i, segment in enumerate(segments):
- embeddings[i] = segment_embedding(segment)
-
- labels = AgglomerativeClustering(num_spk).fit(np.nan_to_num(embeddings)).labels_
- for i in range(len(segments)):
- segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
- merged_segments, current_text = [], []
- current_speaker, current_start = None, None
-
- for i, segment in enumerate(segments):
- speaker = segment["speaker"]
- start_time = segment["start"]
- text = segment["text"][1:]
-
- if speaker == current_speaker:
- current_text.append(text)
- end_time = segment["end"]
- else:
- if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})
-
- current_speaker = speaker
- current_start = start_time
- current_text = [text]
- end_time = segment["end"]
-
- if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})
-
- gr_info(translations["whisper_done"])
-
- x = ""
- for segment in merged_segments:
- x += f"\n{segment['speaker']} {str(time(segment['start']))} - {str(time(segment['end']))}\n"
- x += segment["text"] + "\n"
-
- logger.info(x)
-
- del audio, embedding_model, segments, labels
- clear_gpu_cache()
- gc.collect()
-
- gr_info(translations["process_audio"])
-
- audio = pydub_load(input_audio)
- output_folder = "audios_temp"
-
- if os.path.exists(output_folder): shutil.rmtree(output_folder, ignore_errors=True)
- for f in [output_folder, os.path.join(output_folder, "1"), os.path.join(output_folder, "2")]:
- os.makedirs(f, exist_ok=True)
-
- time_stamps, processed_segments = [], []
- for i, segment in enumerate(merged_segments):
- start_ms = int(segment["start"] * 1000)
- end_ms = int(segment["end"] * 1000)
-
- index = i + 1
-
- segment_filename = os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}.wav")
- audio[start_ms:end_ms].export(segment_filename, format="wav")
-
- processed_segments.append(os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}_output.wav"))
- time_stamps.append((start_ms, end_ms))
-
- f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
-
- gr_info(translations["process_done_start_convert"])
-
- convert(pitch_1, filter_radius, index_strength_1, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "1"), output_folder, model_pth_1, model_index_1, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_1, formant_timbre_1, "", proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
- convert(pitch_2, filter_radius, index_strength_2, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "2"), output_folder, model_pth_2, model_index_2, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_2, formant_timbre_2, "", proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- gr_info(translations["convert_success"])
- return merge_audio(processed_segments, time_stamps, input_audio, replace_export_format(output_audio, export_format), export_format)
- except Exception as e:
- gr_error(translations["error_occurred"].format(e=e))
- import traceback
- logger.debug(traceback.format_exc())
- return None
- finally:
- if os.path.exists("audios_temp"): shutil.rmtree("audios_temp", ignore_errors=True)
-
-def convert_tts(clean, autotune, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
- model_path = os.path.join(configs["weights_path"], model) if not os.path.exists(model) else model
-
- if not model_path or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- return None
-
- if not input or not os.path.exists(input):
- gr_warning(translations["input_not_valid"])
- return None
-
- if os.path.isdir(input):
- input_audio = [f for f in os.listdir(input) if "tts" in f and f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]
-
- if not input_audio:
- gr_warning(translations["not_found_in_folder"])
- return None
-
- input = os.path.join(input, input_audio[0])
-
- if not output:
- gr_warning(translations["output_not_valid"])
- return None
-
- output = replace_export_format(output, format)
- if os.path.isdir(output): output = os.path.join(output, f"tts.{format}")
-
- output_dir = os.path.dirname(output)
- if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-
- output = process_output(output)
-
- f0method = method if method != "hybrid" else hybrid_method
- embedder_model = embedders if embedders != "custom" else custom_embedders
-
- gr_info(translations["convert_vocal"])
-
- convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
-
- gr_info(translations["convert_success"])
- return output
\ No newline at end of file
diff --git a/main/app/core/model_utils.py b/main/app/core/model_utils.py
deleted file mode 100644
index f897182f442434a7426b0ffd71107b0a95d14904..0000000000000000000000000000000000000000
--- a/main/app/core/model_utils.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import os
-import sys
-import json
-import torch
-import datetime
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import gr_info, gr_warning, gr_error
-from main.app.variables import config, logger, translations, configs
-
-def fushion_model_pth(name, pth_1, pth_2, ratio):
- if not name.endswith(".pth"): name = name + ".pth"
-
- if not pth_1 or not os.path.exists(pth_1) or not pth_1.endswith(".pth"):
- gr_warning(translations["provide_file"].format(filename=translations["model"] + " 1"))
- return [translations["provide_file"].format(filename=translations["model"] + " 1"), None]
-
- if not pth_2 or not os.path.exists(pth_2) or not pth_2.endswith(".pth"):
- gr_warning(translations["provide_file"].format(filename=translations["model"] + " 2"))
- return [translations["provide_file"].format(filename=translations["model"] + " 2"), None]
-
- from collections import OrderedDict
-
- def extract(ckpt):
- a = ckpt["model"]
- opt = OrderedDict()
- opt["weight"] = {}
-
- for key in a.keys():
- if "enc_q" in key: continue
-
- opt["weight"][key] = a[key]
-
- return opt
-
- try:
- ckpt1 = torch.load(pth_1, map_location="cpu", weights_only=True)
- ckpt2 = torch.load(pth_2, map_location="cpu", weights_only=True)
-
- if ckpt1["sr"] != ckpt2["sr"]:
- gr_warning(translations["sr_not_same"])
- return [translations["sr_not_same"], None]
-
- cfg = ckpt1["config"]
- cfg_f0 = ckpt1["f0"]
- cfg_version = ckpt1["version"]
- cfg_sr = ckpt1["sr"]
-
- vocoder = ckpt1.get("vocoder", "Default")
- rms_extract = ckpt1.get("energy", False)
-
- ckpt1 = extract(ckpt1) if "model" in ckpt1 else ckpt1["weight"]
- ckpt2 = extract(ckpt2) if "model" in ckpt2 else ckpt2["weight"]
-
- if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
- gr_warning(translations["architectures_not_same"])
- return [translations["architectures_not_same"], None]
-
- gr_info(translations["start"].format(start=translations["fushion_model"]))
-
- opt = OrderedDict()
- opt["weight"] = {}
-
- for key in ckpt1.keys():
- if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
- min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
- opt["weight"][key] = (ratio * (ckpt1[key][:min_shape0].float()) + (1 - ratio) * (ckpt2[key][:min_shape0].float())).half()
- else: opt["weight"][key] = (ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())).half()
-
- opt["config"] = cfg
- opt["sr"] = cfg_sr
- opt["f0"] = cfg_f0
- opt["version"] = cfg_version
- opt["infos"] = translations["model_fushion_info"].format(name=name, pth_1=pth_1, pth_2=pth_2, ratio=ratio)
- opt["vocoder"] = vocoder
- opt["energy"] = rms_extract
-
- output_model = configs["weights_path"]
- if not os.path.exists(output_model): os.makedirs(output_model, exist_ok=True)
-
- torch.save(opt, os.path.join(output_model, name))
-
- gr_info(translations["success"])
- return [translations["success"], os.path.join(output_model, name)]
- except Exception as e:
- gr_error(message=translations["error_occurred"].format(e=e))
- return [e, None]
-
-def fushion_model(name, path_1, path_2, ratio):
- if not name:
- gr_warning(translations["provide_name_is_save"])
- return [translations["provide_name_is_save"], None]
-
- if path_1.endswith(".pth") and path_2.endswith(".pth"): return fushion_model_pth(name, path_1, path_2, ratio)
- else:
- gr_warning(translations["format_not_valid"])
- return [None, None]
-
-def onnx_export(model_path):
- if not model_path.endswith(".pth"): model_path += ".pth"
- if not model_path or not os.path.exists(model_path) or not model_path.endswith(".pth"): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
- try:
- gr_info(translations["start_onnx_export"])
-
- from main.library.onnx.onnx_export import onnx_exporter
- output = onnx_exporter(model_path, model_path.replace(".pth", ".onnx"), is_half=config.is_half, device=config.device)
-
- gr_info(translations["success"])
- return output
- except Exception as e:
- return gr_error(e)
-
-def model_info(path):
- if not path or not os.path.exists(path) or os.path.isdir(path) or not path.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
- def prettify_date(date_str):
- if date_str == translations["not_found_create_time"]: return None
-
- try:
- return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f").strftime("%Y-%m-%d %H:%M:%S")
- except ValueError as e:
- logger.debug(e)
- return translations["format_not_valid"]
-
- if path.endswith(".pth"): model_data = torch.load(path, map_location="cpu")
- else:
- import onnx
-
- model = onnx.load(path)
- model_data = None
-
- for prop in model.metadata_props:
- if prop.key == "model_info":
- model_data = json.loads(prop.value)
- break
-
- gr_info(translations["read_info"])
-
- epochs = model_data.get("epoch", None)
- if epochs is None:
- epochs = model_data.get("info", None)
- try:
- epoch = epochs.replace("epoch", "").replace("e", "").isdigit()
- if epoch and epochs is None: epochs = translations["not_found"].format(name=translations["epoch"])
- except:
- pass
-
- steps = model_data.get("step", translations["not_found"].format(name=translations["step"]))
- sr = model_data.get("sr", translations["not_found"].format(name=translations["sr"]))
- f0 = model_data.get("f0", translations["not_found"].format(name=translations["f0"]))
- version = model_data.get("version", translations["not_found"].format(name=translations["version"]))
- creation_date = model_data.get("creation_date", translations["not_found_create_time"])
- model_hash = model_data.get("model_hash", translations["not_found"].format(name="model_hash"))
- pitch_guidance = translations["trained_f0"] if f0 else translations["not_f0"]
- creation_date_str = prettify_date(creation_date) if creation_date else translations["not_found_create_time"]
- model_name = model_data.get("model_name", translations["unregistered"])
- model_author = model_data.get("author", translations["not_author"])
- vocoder = model_data.get("vocoder", "Default")
- rms_extract = model_data.get("energy", False)
-
- gr_info(translations["success"])
- return translations["model_info"].format(model_name=model_name, model_author=model_author, epochs=epochs, steps=steps, version=version, sr=sr, pitch_guidance=pitch_guidance, model_hash=model_hash, creation_date_str=creation_date_str, vocoder=vocoder, rms_extract=rms_extract)
\ No newline at end of file
diff --git a/main/app/core/presets.py b/main/app/core/presets.py
deleted file mode 100644
index 058d0086309908c4c99e9bc01b88a325c230b279..0000000000000000000000000000000000000000
--- a/main/app/core/presets.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import os
-import sys
-import json
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations, configs
-from main.app.core.ui import gr_info, gr_warning, change_preset_choices, change_effect_preset_choices
-
-def load_presets(presets, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold):
- if not presets: gr_warning(translations["provide_file_settings"])
-
- file = {}
- if presets:
- with open(os.path.join(configs["presets_path"], presets)) as f:
- file = json.load(f)
-
- gr_info(translations["load_presets"].format(presets=presets))
-
- return [file.get("cleaner", cleaner), file.get("autotune", autotune), file.get("pitch", pitch), file.get("clean_strength", clean_strength), file.get("index_strength", index_strength), file.get("resample_sr", resample_sr), file.get("filter_radius", filter_radius), file.get("rms_mix_rate", rms_mix_rate), file.get("protect", protect), file.get("split_audio", split_audio), file.get("f0_autotune_strength", f0_autotune_strength), file.get("formant_shifting", formant_shifting), file.get("formant_qfrency", formant_qfrency), file.get("formant_timbre", formant_timbre), file.get("proposal_pitch", proposal_pitch), file.get("proposal_pitch_threshold", proposal_pitch_threshold)]
-
-def save_presets(name, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, rms_mix_rate_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold):
- if not name: return gr_warning(translations["provide_filename_settings"])
- if not any([cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, rms_mix_rate_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox]): return gr_warning(translations["choose1"])
-
- settings = {}
-
- for checkbox, data in [(cleaner_chbox, {"cleaner": cleaner, "clean_strength": clean_strength}), (autotune_chbox, {"autotune": autotune, "f0_autotune_strength": f0_autotune_strength}), (pitch_chbox, {"pitch": pitch}), (index_strength_chbox, {"index_strength": index_strength}), (resample_sr_chbox, {"resample_sr": resample_sr}), (filter_radius_chbox, {"filter_radius": filter_radius}), (rms_mix_rate_chbox, {"rms_mix_rate": rms_mix_rate}), (protect_chbox, {"protect": protect}), (split_audio_chbox, {"split_audio": split_audio}), (formant_shifting_chbox, {"formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre}), (proposal_pitch, {"proposal_pitch": proposal_pitch, "proposal_pitch_threshold": proposal_pitch_threshold})]:
- if checkbox: settings.update(data)
-
- with open(os.path.join(configs["presets_path"], name + ".conversion.json"), "w") as f:
- json.dump(settings, f, indent=4)
-
- gr_info(translations["export_settings"].format(name=name))
- return change_preset_choices()
-
-def audio_effect_load_presets(presets, resample_checkbox, audio_effect_resample_sr, chorus_depth, chorus_rate_hz, chorus_mix, chorus_centre_delay_ms, chorus_feedback, distortion_drive_db, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift_semitones, delay_second, delay_feedback, delay_mix, compressor_threshold_db, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold_db, limiter_release_ms, gain_db, bitcrush_bit_depth, clipping_threshold_db, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost, bass_frequency, treble_boost, treble_frequency, fade_in, fade_out, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade):
- if not presets: gr_warning(translations["provide_file_settings"])
-
- file = {}
- if presets:
- with open(os.path.join(configs["presets_path"], presets)) as f:
- file = json.load(f)
-
- gr_info(translations["load_presets"].format(presets=presets))
- return [
- file.get("resample_checkbox", resample_checkbox), file.get("audio_effect_resample_sr", audio_effect_resample_sr),
- file.get("chorus_depth", chorus_depth), file.get("chorus_rate_hz", chorus_rate_hz),
- file.get("chorus_mix", chorus_mix), file.get("chorus_centre_delay_ms", chorus_centre_delay_ms),
- file.get("chorus_feedback", chorus_feedback), file.get("distortion_drive_db", distortion_drive_db),
- file.get("reverb_room_size", reverb_room_size), file.get("reverb_damping", reverb_damping),
- file.get("reverb_wet_level", reverb_wet_level), file.get("reverb_dry_level", reverb_dry_level),
- file.get("reverb_width", reverb_width), file.get("reverb_freeze_mode", reverb_freeze_mode),
- file.get("pitch_shift_semitones", pitch_shift_semitones), file.get("delay_second", delay_second),
- file.get("delay_feedback", delay_feedback), file.get("delay_mix", delay_mix),
- file.get("compressor_threshold_db", compressor_threshold_db), file.get("compressor_ratio", compressor_ratio),
- file.get("compressor_attack_ms", compressor_attack_ms), file.get("compressor_release_ms", compressor_release_ms),
- file.get("limiter_threshold_db", limiter_threshold_db), file.get("limiter_release_ms", limiter_release_ms),
- file.get("gain_db", gain_db), file.get("bitcrush_bit_depth", bitcrush_bit_depth),
- file.get("clipping_threshold_db", clipping_threshold_db), file.get("phaser_rate_hz", phaser_rate_hz),
- file.get("phaser_depth", phaser_depth), file.get("phaser_centre_frequency_hz", phaser_centre_frequency_hz),
- file.get("phaser_feedback", phaser_feedback), file.get("phaser_mix", phaser_mix),
- file.get("bass_boost", bass_boost), file.get("bass_frequency", bass_frequency),
- file.get("treble_boost", treble_boost), file.get("treble_frequency", treble_frequency),
- file.get("fade_in", fade_in), file.get("fade_out", fade_out),
- file.get("chorus_check_box", chorus_check_box), file.get("distortion_checkbox", distortion_checkbox),
- file.get("reverb_check_box", reverb_check_box), file.get("delay_check_box", delay_check_box),
- file.get("compressor_check_box", compressor_check_box), file.get("limiter", limiter),
- file.get("gain_checkbox", gain_checkbox), file.get("bitcrush_checkbox", bitcrush_checkbox),
- file.get("clipping_checkbox", clipping_checkbox), file.get("phaser_check_box", phaser_check_box),
- file.get("bass_or_treble", bass_or_treble), file.get("fade", fade)
- ]
-
-def audio_effect_save_presets(name, resample_checkbox, audio_effect_resample_sr, chorus_depth, chorus_rate_hz, chorus_mix, chorus_centre_delay_ms, chorus_feedback, distortion_drive_db, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift_semitones, delay_second, delay_feedback, delay_mix, compressor_threshold_db, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold_db, limiter_release_ms, gain_db, bitcrush_bit_depth, clipping_threshold_db, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost, bass_frequency, treble_boost, treble_frequency, fade_in, fade_out, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade):
- if not name: return gr_warning(translations["provide_filename_settings"])
- if not any([resample_checkbox, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade, pitch_shift_semitones != 0]): return gr_warning(translations["choose1"])
-
- settings = {}
-
- for checkbox, data in [
- (resample_checkbox, {
- "resample_checkbox": resample_checkbox,
- "audio_effect_resample_sr": audio_effect_resample_sr
- }),
- (chorus_check_box, {
- "chorus_check_box": chorus_check_box,
- "chorus_depth": chorus_depth,
- "chorus_rate_hz": chorus_rate_hz,
- "chorus_mix": chorus_mix,
- "chorus_centre_delay_ms": chorus_centre_delay_ms,
- "chorus_feedback": chorus_feedback
- }),
- (distortion_checkbox, {
- "distortion_checkbox": distortion_checkbox,
- "distortion_drive_db": distortion_drive_db
- }),
- (reverb_check_box, {
- "reverb_check_box": reverb_check_box,
- "reverb_room_size": reverb_room_size,
- "reverb_damping": reverb_damping,
- "reverb_wet_level": reverb_wet_level,
- "reverb_dry_level": reverb_dry_level,
- "reverb_width": reverb_width,
- "reverb_freeze_mode": reverb_freeze_mode
- }),
- (pitch_shift_semitones != 0, {
- "pitch_shift_semitones": pitch_shift_semitones
- }),
- (delay_check_box, {
- "delay_check_box": delay_check_box,
- "delay_second": delay_second,
- "delay_feedback": delay_feedback,
- "delay_mix": delay_mix
- }),
- (compressor_check_box, {
- "compressor_check_box": compressor_check_box,
- "compressor_threshold_db": compressor_threshold_db,
- "compressor_ratio": compressor_ratio,
- "compressor_attack_ms": compressor_attack_ms,
- "compressor_release_ms": compressor_release_ms
- }),
- (limiter, {
- "limiter": limiter,
- "limiter_threshold_db": limiter_threshold_db,
- "limiter_release_ms": limiter_release_ms
- }),
- (gain_checkbox, {
- "gain_checkbox": gain_checkbox,
- "gain_db": gain_db
- }),
- (bitcrush_checkbox, {
- "bitcrush_checkbox": bitcrush_checkbox,
- "bitcrush_bit_depth": bitcrush_bit_depth
- }),
- (clipping_checkbox, {
- "clipping_checkbox": clipping_checkbox,
- "clipping_threshold_db": clipping_threshold_db
- }),
- (phaser_check_box, {
- "phaser_check_box": phaser_check_box,
- "phaser_rate_hz": phaser_rate_hz,
- "phaser_depth": phaser_depth,
- "phaser_centre_frequency_hz": phaser_centre_frequency_hz,
- "phaser_feedback": phaser_feedback,
- "phaser_mix": phaser_mix
- }),
- (bass_or_treble, {
- "bass_or_treble": bass_or_treble,
- "bass_boost": bass_boost,
- "bass_frequency": bass_frequency,
- "treble_boost": treble_boost,
- "treble_frequency": treble_frequency
- }),
- (fade, {
- "fade": fade,
- "fade_in": fade_in,
- "fade_out": fade_out
- })
- ]:
- if checkbox: settings.update(data)
-
- with open(os.path.join(configs["presets_path"], name + ".effect.json"), "w") as f:
- json.dump(settings, f, indent=4)
-
- gr_info(translations["export_settings"].format(name=name))
- return change_effect_preset_choices()
\ No newline at end of file
diff --git a/main/app/core/process.py b/main/app/core/process.py
deleted file mode 100644
index 8be9c6677978f698a6a6af661c40dc98e1281e51..0000000000000000000000000000000000000000
--- a/main/app/core/process.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import os
-import re
-import sys
-import shutil
-import codecs
-import zipfile
-import requests
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import logger, translations, configs
-from main.app.core.ui import gr_info, gr_warning, gr_error, process_output, replace_punctuation
-
-def read_docx_text(path):
- import xml.etree.ElementTree
-
- with zipfile.ZipFile(path) as docx:
- with docx.open("word/document.xml") as document_xml:
- xml_content = document_xml.read()
-
- WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
-
- paragraphs = []
- for paragraph in xml.etree.ElementTree.XML(xml_content).iter(WORD_NAMESPACE + 'p'):
- texts = [node.text for node in paragraph.iter(WORD_NAMESPACE + 't') if node.text]
- if texts: paragraphs.append(''.join(texts))
-
- return '\n'.join(paragraphs)
-
-def process_input(file_path):
- if file_path.endswith(".srt"): file_contents = ""
- elif file_path.endswith(".docx"): file_contents = read_docx_text(file_path)
- else:
- try:
- with open(file_path, "r", encoding="utf-8") as file:
- file_contents = file.read()
- except Exception as e:
- gr_warning(translations["read_error"])
- logger.debug(e)
- file_contents = ""
-
- gr_info(translations["upload_success"].format(name=translations["text"]))
- return file_contents
-
-def move_files_from_directory(src_dir, dest_weights, dest_logs, model_name):
- for root, _, files in os.walk(src_dir):
- for file in files:
- file_path = os.path.join(root, file)
- if file.endswith(".index"):
- model_log_dir = os.path.join(dest_logs, model_name)
- os.makedirs(model_log_dir, exist_ok=True)
-
- filepath = process_output(os.path.join(model_log_dir, replace_punctuation(file)))
-
- shutil.move(file_path, filepath)
- elif file.endswith(".pth") and not file.startswith("D_") and not file.startswith("G_"):
- pth_path = process_output(os.path.join(dest_weights, model_name + ".pth"))
-
- shutil.move(file_path, pth_path)
- elif file.endswith(".onnx") and not file.startswith("D_") and not file.startswith("G_"):
- pth_path = process_output(os.path.join(dest_weights, model_name + ".onnx"))
-
- shutil.move(file_path, pth_path)
-
-def extract_name_model(filename):
- match = re.search(r"_([A-Za-z0-9]+)(?=_v\d*)", replace_punctuation(filename))
- return match.group(1) if match else None
-
-def save_drop_model(dropboxs):
- weight_folder = configs["weights_path"]
- logs_folder = configs["logs_path"]
- save_model_temp = "save_model_temp"
-
- if not os.path.exists(weight_folder): os.makedirs(weight_folder, exist_ok=True)
- if not os.path.exists(logs_folder): os.makedirs(logs_folder, exist_ok=True)
- if not os.path.exists(save_model_temp): os.makedirs(save_model_temp, exist_ok=True)
-
- try:
- for dropbox in dropboxs:
- shutil.move(dropbox, save_model_temp)
- file_name = os.path.basename(dropbox)
-
- if file_name.endswith(".zip"):
- shutil.unpack_archive(os.path.join(save_model_temp, file_name), save_model_temp)
- move_files_from_directory(save_model_temp, weight_folder, logs_folder, file_name.replace(".zip", ""))
- elif file_name.endswith((".pth", ".onnx")):
- output_file = process_output(os.path.join(weight_folder, file_name))
-
- shutil.move(os.path.join(save_model_temp, file_name), output_file)
- elif file_name.endswith(".index"):
- modelname = extract_name_model(file_name)
- if modelname is None: modelname = os.path.splitext(os.path.basename(file_name))[0]
-
- model_logs = os.path.join(logs_folder, modelname)
- if not os.path.exists(model_logs): os.makedirs(model_logs, exist_ok=True)
-
- shutil.move(os.path.join(save_model_temp, file_name), model_logs)
- else:
- gr_warning(translations["unable_analyze_model"])
- return None
-
- gr_info(translations["upload_success"].format(name=translations["model"]))
- return None
- except Exception as e:
- gr_error(message=translations["error_occurred"].format(e=e))
- return None
- finally:
- shutil.rmtree(save_model_temp, ignore_errors=True)
-
-def zip_file(name, pth, index):
- pth_path = os.path.join(configs["weights_path"], pth)
- if not pth or not os.path.exists(pth_path) or not pth.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
- zip_file_path = os.path.join(configs["logs_path"], name, name + ".zip")
- gr_info(translations["start"].format(start=translations["zip"]))
-
- with zipfile.ZipFile(zip_file_path, 'w') as zipf:
- zipf.write(pth_path, os.path.basename(pth_path))
- if index: zipf.write(index, os.path.basename(index))
-
- gr_info(translations["success"])
- return {"visible": True, "value": zip_file_path, "__type__": "update"}
-
-def fetch_pretrained_data():
- try:
- response = requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/wfba/phfgbz_cergenvarq.wfba", "rot13"))
- response.raise_for_status()
-
- return response.json()
- except:
- return {}
-
-def update_sample_rate_dropdown(model):
- data = fetch_pretrained_data()
- if model != translations["success"]: return {"choices": list(data[model].keys()), "value": list(data[model].keys())[0], "__type__": "update"}
\ No newline at end of file
diff --git a/main/app/core/realtime.py b/main/app/core/realtime.py
deleted file mode 100644
index 54831e2a7f009792e830bb2f0dd8161e067be19c..0000000000000000000000000000000000000000
--- a/main/app/core/realtime.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import os
-import sys
-import time
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations, configs
-from main.app.core.ui import gr_info, gr_warning, audio_device
-
-running, callbacks, audio_manager = False, None, None
-
-PIPELINE_SAMPLE_RATE = 16000
-DEVICE_SAMPLE_RATE = 48000
-
-interactive_true = {"interactive": True, "__type__": "update"}
-interactive_false = {"interactive": False, "__type__": "update"}
-
-def realtime_start(
- monitor,
- exclusive_mode,
- vad_enabled,
- input_audio_device,
- output_audio_device,
- monitor_output_device,
- input_audio_gain,
- output_audio_gain,
- monitor_audio_gain,
- input_asio_channels,
- output_asio_channels,
- monitor_asio_channels,
- chunk_size,
- pitch,
- model_pth,
- model_index,
- index_strength,
- onnx_f0_mode,
- f0_method,
- hop_length,
- embed_mode,
- embedders,
- custom_embedders,
- f0_autotune,
- proposal_pitch,
- f0_autotune_strength,
- proposal_pitch_threshold,
- rms_mix_rate,
- protect,
- filter_radius,
- silent_threshold,
- extra_convert_size,
- cross_fade_overlap_size,
- vad_sensitivity,
- vad_frame_ms,
- clean_audio,
- clean_strength
-):
- global running, callbacks, audio_manager
- running = True
-
- gr_info(translations["start_realtime"])
- yield translations["start_realtime"], interactive_false, interactive_true
-
- if not input_audio_device or not output_audio_device:
- gr_warning(translations["provide_audio_device"])
- yield translations["provide_audio_device"], interactive_true, interactive_false
- return
-
- if monitor and not monitor_output_device:
- gr_warning(translations["provide_monitor_device"])
- yield translations["provide_monitor_device"], interactive_true, interactive_false
- return
-
- model_pth = os.path.join(configs["weights_path"], model_pth) if not os.path.exists(model_pth) else model_pth
- embedder_model = (embedders if embedders != "custom" else custom_embedders)
-
- if not model_pth or not os.path.exists(model_pth) or os.path.isdir(model_pth) or not model_pth.endswith((".pth", ".onnx")):
- gr_warning(translations["provide_file"].format(filename=translations["model"]))
- yield translations["provide_file"].format(filename=translations["model"]), interactive_true, interactive_false
- return
-
- input_devices, output_devices = audio_device()
- input_device_id = input_devices[input_audio_device][0]
- output_device_id = output_devices[output_audio_device][0]
- output_monitor_id = output_devices[monitor_output_device][0] if monitor else None
-
- input_audio_gain /= 100.0
- output_audio_gain /= 100.0
- monitor_audio_gain /= 100.0
-
- chunk_size = int(chunk_size * DEVICE_SAMPLE_RATE / 1000 / 128)
-
- from main.inference.realtime.callbacks import AudioCallbacks
-
- callbacks = AudioCallbacks(
- pass_through=False,
- read_chunk_size=chunk_size,
- cross_fade_overlap_size=cross_fade_overlap_size,
- input_sample_rate=DEVICE_SAMPLE_RATE,
- output_sample_rate=DEVICE_SAMPLE_RATE,
- extra_convert_size=extra_convert_size,
- model_path=model_pth,
- index_path=model_index,
- f0_method=f0_method,
- f0_onnx=onnx_f0_mode,
- embedder_model=embedder_model,
- embedders_mode=embed_mode,
- sample_rate=PIPELINE_SAMPLE_RATE,
- hop_length=hop_length,
- silent_threshold=silent_threshold,
- f0_up_key=pitch,
- index_rate=index_strength,
- protect=protect,
- filter_radius=filter_radius,
- rms_mix_rate=rms_mix_rate,
- f0_autotune=f0_autotune,
- f0_autotune_strength=f0_autotune_strength,
- proposal_pitch=proposal_pitch,
- proposal_pitch_threshold=proposal_pitch_threshold,
- input_audio_gain=input_audio_gain,
- output_audio_gain=output_audio_gain,
- monitor_audio_gain=monitor_audio_gain,
- monitor=monitor,
- vad_enabled=vad_enabled,
- vad_sensitivity=vad_sensitivity,
- vad_frame_ms=vad_frame_ms,
- clean_audio=clean_audio,
- clean_strength=clean_strength
- )
-
- audio_manager = callbacks.audio
- audio_manager.start(
- input_device_id=input_device_id,
- output_device_id=output_device_id,
- output_monitor_id=output_monitor_id,
- exclusive_mode=exclusive_mode,
- asio_input_channel=input_asio_channels,
- asio_output_channel=output_asio_channels,
- asio_output_monitor_channel=monitor_asio_channels,
- read_chunk_size=chunk_size,
- input_audio_sample_rate=DEVICE_SAMPLE_RATE,
- output_monitor_sample_rate=DEVICE_SAMPLE_RATE
- )
-
- gr_info(translations["realtime_is_ready"])
-
- while running and callbacks is not None and audio_manager is not None:
- time.sleep(0.1)
- if hasattr(callbacks, "latency"): yield f"{translations['latency']}: {callbacks.latency:.2f} ms", interactive_false, interactive_true
-
- return translations["realtime_has_stop"], interactive_true, interactive_false
-
-def realtime_stop():
- global running, callbacks, audio_manager
-
- if running and audio_manager is not None and callbacks is not None:
- gr_info(translations["stop_realtime"])
-
- audio_manager.stop()
- running = False
-
- if hasattr(callbacks, "latency"): del callbacks.latency
- del audio_manager, callbacks
-
- audio_manager = callbacks = None
- gr_info(translations["realtime_has_stop"])
-
- from main.library.utils import clear_gpu_cache
- clear_gpu_cache()
-
- return translations["realtime_has_stop"], interactive_true, interactive_false
- else:
- gr_warning(translations["realtime_not_found"])
-
- return translations["realtime_not_found"], interactive_true, interactive_false
\ No newline at end of file
diff --git a/main/app/core/realtime_client.py b/main/app/core/realtime_client.py
deleted file mode 100644
index e9f39d924c15e4a7386cb46680cf9a6c33666b8e..0000000000000000000000000000000000000000
--- a/main/app/core/realtime_client.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import os
-import sys
-import json
-
-import numpy as np
-
-from fastapi import FastAPI, WebSocketDisconnect, WebSocket
-
-sys.path.append(os.getcwd())
-
-from main.library.utils import clear_gpu_cache
-from main.app.variables import configs, translations, logger
-from main.inference.realtime.realtime import VoiceChanger, RVC_Realtime
-
-app = FastAPI()
-vc_instance = None
-
-PIPELINE_SAMPLE_RATE = 16000
-DEVICE_SAMPLE_RATE = 48000
-
-@app.websocket("/ws-audio")
-async def websocket_audio(ws: WebSocket):
- global vc_instance
- await ws.accept()
-
- logger.info(translations["ws_connected"])
-
- try:
- text = await ws.receive_text()
- params = json.loads(text)
-
- read_chunk_size = int(params["chunk_size"])
- block_frame = read_chunk_size * 128
- embedders = params["embedders"]
-
- model_pth = params["model_pth"]
- model_pth = os.path.join(configs["weights_path"], model_pth) if not os.path.exists(model_pth) else model_pth
-
- if not model_pth or not os.path.exists(model_pth) or os.path.isdir(model_pth) or not model_pth.endswith((".pth", ".onnx")):
- logger.warning(translations["provide_file"].format(filename=translations["model"]))
- await ws.send_text(json.dumps({"type": "warnings", "value": translations["provide_file"].format(filename=translations["model"])}))
- return
-
- logger.info(translations["start_realtime"])
-
- if vc_instance is None:
- vc_instance = VoiceChanger(
- read_chunk_size=read_chunk_size,
- cross_fade_overlap_size=params["cross_fade_overlap_size"],
- input_sample_rate=DEVICE_SAMPLE_RATE,
- extra_convert_size=params["extra_convert_size"]
- )
- vc_instance.initialize(vc_model=RVC_Realtime(
- model_path=model_pth,
- index_path=params["model_index"],
- f0_method=params["f0_method"],
- f0_onnx=params["f0_onnx"],
- embedder_model=(embedders if embedders != "custom" else params["custom_embedders"]),
- embedders_mode=params["embedders_mode"],
- sample_rate=PIPELINE_SAMPLE_RATE,
- hop_length=params["hop_length"],
- silent_threshold=params["silent_threshold"],
- input_sample_rate=DEVICE_SAMPLE_RATE,
- output_sample_rate=DEVICE_SAMPLE_RATE,
- vad_enabled=params["vad_enabled"],
- vad_sensitivity=params["vad_sensitivity"],
- vad_frame_ms=params["vad_frame_ms"],
- clean_audio=params["clean_audio"],
- clean_strength=params["clean_strength"]
- ))
-
- logger.info(translations["realtime_is_ready"])
-
- while 1:
- audio = await ws.receive_bytes()
- arr = np.frombuffer(audio, dtype=np.float32)
-
- if arr.size != block_frame:
- arr = np.pad(arr, (0, block_frame - arr.size)).astype(np.float32) if arr.size < block_frame else arr[:block_frame].astype(np.float32)
-
- audio_output, _, perf = vc_instance.on_request(
- arr * (params["input_audio_gain"] / 100.0),
- f0_up_key=params["f0_up_key"],
- index_rate=params["index_rate"],
- protect=params["protect"],
- filter_radius=params["filter_radius"],
- rms_mix_rate=params["rms_mix_rate"],
- f0_autotune=params["f0_autotune"],
- f0_autotune_strength=params["f0_autotune_strength"],
- proposal_pitch=params["proposal_pitch"],
- proposal_pitch_threshold=params["proposal_pitch_threshold"]
- )
-
- await ws.send_text(json.dumps({"type": "latency", "value": perf[1]}))
- await ws.send_bytes(audio_output.tobytes())
- except WebSocketDisconnect:
- logger.info(translations["ws_disconnected"])
- except Exception as e:
- import traceback
- logger.debug(traceback.format_exc())
- logger.info(translations["error_occurred"].format(e=e))
- finally:
- if vc_instance is not None:
- del vc_instance
- vc_instance = None
-
- clear_gpu_cache()
-
- try:
- await ws.close()
- except:
- pass
-
- logger.info(translations["ws_closed"])
\ No newline at end of file
diff --git a/main/app/core/restart.py b/main/app/core/restart.py
deleted file mode 100644
index d4029c84ff41511517b45eb6afa15fefd87dc997..0000000000000000000000000000000000000000
--- a/main/app/core/restart.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-import sys
-import json
-import platform
-import subprocess
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import gr_info
-from main.app.variables import python, translations, configs_json
-
-def restart_app(app):
- gr_info(translations["30s"])
- os.system("cls" if platform.system() == "Windows" else "clear")
-
- app.close()
- subprocess.run([python, os.path.join("main", "app", "app.py")] + [arg for arg in sys.argv[1:] if arg != "--open"])
-
-def change_language(lang, app):
- configs = json.load(open(configs_json, "r"))
-
- if lang != configs["language"]:
- configs["language"] = lang
-
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
- restart_app(app)
-
-def change_theme(theme, app):
- configs = json.load(open(configs_json, "r"))
-
- if theme != configs["theme"]:
- configs["theme"] = theme
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
- restart_app(app)
-
-def change_font(font, app):
- configs = json.load(open(configs_json, "r"))
-
- if font != configs["font"]:
- configs["font"] = font
- with open(configs_json, "w") as f:
- json.dump(configs, f, indent=4)
-
- restart_app(app)
\ No newline at end of file
diff --git a/main/app/core/separate.py b/main/app/core/separate.py
deleted file mode 100644
index b27e4858412221b170599a70614d646cd0592454..0000000000000000000000000000000000000000
--- a/main/app/core/separate.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import os
-import sys
-import subprocess
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import gr_info, gr_warning
-from main.app.variables import python, translations, configs
-
-def separate_music(
- input_path,
- output_dirs,
- export_format,
- model_name,
- karaoke_model,
- reverb_model,
- denoise_model,
- sample_rate,
- shifts,
- batch_size,
- overlap,
- aggression,
- hop_length,
- window_size,
- segments_size,
- post_process_threshold,
- enable_tta,
- enable_denoise,
- high_end_process,
- enable_post_process,
- separate_backing,
- separate_reverb
-):
- output_dirs = os.path.dirname(output_dirs) or output_dirs
-
- if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path):
- gr_warning(translations["input_not_valid"])
- return [None]*4
-
- if not os.path.exists(output_dirs):
- gr_warning(translations["output_not_valid"])
- return [None]*4
-
- if not os.path.exists(output_dirs): os.makedirs(output_dirs)
- gr_info(translations["start"].format(start=translations["separator_music"]))
-
- subprocess.run([
- python, configs["separate_path"],
- "--input_path", input_path,
- "--output_dirs", output_dirs,
- "--export_format", export_format,
- "--model_name", model_name,
- "--karaoke_model", karaoke_model,
- "--reverb_model", reverb_model,
- "--denoise_model", denoise_model,
- "--sample_rate", str(sample_rate),
- "--shifts", str(shifts),
- "--batch_size", str(batch_size),
- "--overlap", str(overlap),
- "--aggression", str(aggression),
- "--hop_length", str(hop_length),
- "--window_size", str(window_size),
- "--segments_size", str(segments_size),
- "--post_process_threshold", str(post_process_threshold),
- "--enable_tta", str(enable_tta),
- "--enable_denoise", str(enable_denoise),
- "--high_end_process", str(high_end_process),
- "--enable_post_process", str(enable_post_process),
- "--separate_backing", str(separate_backing),
- "--separate_reverb", str(separate_reverb),
- ])
-
- gr_info(translations["success"])
-
- filename, _ = os.path.splitext(os.path.basename(input_path))
- output_dirs = os.path.join(output_dirs, filename)
-
- return [
- os.path.join(
- output_dirs,
- f"Original_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Original_Vocals.{export_format}"
- ),
- os.path.join(
- output_dirs,
- f"Instruments.{export_format}"
- ),
- os.path.join(
- output_dirs,
- f"Main_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Main_Vocals.{export_format}"
- ) if separate_backing else None,
- os.path.join(
- output_dirs,
- f"Backing_Vocals.{export_format}"
- ) if separate_backing else None
- ] if os.path.isfile(input_path) else [None]*4
\ No newline at end of file
diff --git a/main/app/core/training.py b/main/app/core/training.py
deleted file mode 100644
index 3a276f0af33a5aabe2c4831578a82f6368e7e024..0000000000000000000000000000000000000000
--- a/main/app/core/training.py
+++ /dev/null
@@ -1,265 +0,0 @@
-import os
-import sys
-import time
-import shutil
-import codecs
-import threading
-import subprocess
-
-sys.path.append(os.getcwd())
-
-from main.tools import huggingface
-from main.app.core.ui import gr_info, gr_warning
-from main.app.variables import python, translations, configs
-
-def if_done(done, p):
-    """Block until process ``p`` has exited, then set ``done[0]`` to True.
-
-    ``done`` is a one-element list used as a mutable flag shared with the
-    caller; ``p`` only needs a ``poll()`` method.
-    """
-    # poll() keeps returning None for as long as the process is running.
-    while p.poll() is None:
-        time.sleep(0.5)
-
-    done[0] = True
-
-def log_read(done, name):
-    """Yield filtered snapshots of ``app.log`` until ``done[0]`` becomes true.
-
-    The log file under ``configs["logs_path"]`` is emptied first. While the
-    job runs, a snapshot is yielded about once per second containing only the
-    non-blank, non-DEBUG lines that mention ``name``. After the flag is set a
-    final snapshot is yielded that drops the ``name`` filter.
-    """
-    log_file = os.path.join(configs["logs_path"], "app.log")
-
-    # Opening in "w" mode truncates (or creates) the file.
-    with open(log_file, "w", encoding="utf-8"):
-        pass
-
-    while True:
-        with open(log_file, "r", encoding="utf-8") as f:
-            yield "".join(line for line in f if "DEBUG" not in line and name in line and line.strip())
-
-        time.sleep(1)
-        if done[0]:
-            break
-
-    with open(log_file, "r", encoding="utf-8") as f:
-        full_log = "".join(line for line in f if "DEBUG" not in line and line.strip())
-
-    yield full_log
-
-def create_dataset(
-    input_data,
-    output_dirs,
-    skip_seconds,
-    skip_start_audios,
-    skip_end_audios,
-    separate,
-    model_name,
-    reverb_model,
-    denoise_model,
-    sample_rate,
-    shifts,
-    batch_size,
-    overlap,
-    aggression,
-    hop_length,
-    window_size,
-    segments_size,
-    post_process_threshold,
-    enable_tta,
-    enable_denoise,
-    high_end_process,
-    enable_post_process,
-    separate_reverb,
-    clean_dataset,
-    clean_strength
-):
-    """Launch the dataset-creation script and stream its log output.
-
-    Every argument is forwarded to the script at
-    ``configs["create_dataset_path"]`` as ``--<name> <value>``. The command is
-    handed to ``subprocess.Popen`` as an argument list (no shell), so values
-    containing spaces, quotes or shell metacharacters reach the script
-    verbatim instead of being interpreted by a shell.
-
-    Yields:
-        Log snapshots from ``log_read`` until the child process exits.
-    """
-    gr_info(translations["start"].format(start=translations["create"]))
-
-    options = {
-        "input_data": input_data,
-        "output_dirs": output_dirs,
-        "skip_seconds": skip_seconds,
-        "skip_start_audios": skip_start_audios,
-        "skip_end_audios": skip_end_audios,
-        "separate": separate,
-        "model_name": model_name,
-        "reverb_model": reverb_model,
-        "denoise_model": denoise_model,
-        "sample_rate": sample_rate,
-        "shifts": shifts,
-        "batch_size": batch_size,
-        "overlap": overlap,
-        "aggression": aggression,
-        "hop_length": hop_length,
-        "window_size": window_size,
-        "segments_size": segments_size,
-        "post_process_threshold": post_process_threshold,
-        "enable_tta": enable_tta,
-        "enable_denoise": enable_denoise,
-        "high_end_process": high_end_process,
-        "enable_post_process": enable_post_process,
-        "separate_reverb": separate_reverb,
-        "clean_dataset": clean_dataset,
-        "clean_strength": clean_strength,
-    }
-
-    command = [python, configs["create_dataset_path"]]
-    for flag, value in options.items():
-        command += [f"--{flag}", str(value)]
-
-    p = subprocess.Popen(command)
-    done = [False]
-
-    # Watcher thread flips done[0] once the child exits, which ends log_read.
-    threading.Thread(target=if_done, args=(done, p)).start()
-
-    for log in log_read(done, "create_dataset"):
-        yield log
-
-def create_reference(audio_path, reference_name, pitch_guidance, use_energy, version, embedder_model, embedders_mode, f0_method, f0_onnx, f0_up_key, filter_radius, f0_autotune, f0_autotune_strength, proposal_pitch, proposal_pitch_threshold, alpha=0.5):
-    """Launch the reference-creation script and stream its log output.
-
-    Every argument is forwarded to the script at
-    ``configs["create_reference_path"]`` as ``--<name> <value>``. The command
-    is passed as an argument list (no shell) so paths and names containing
-    spaces, quotes or shell metacharacters are delivered verbatim.
-
-    Yields:
-        Log snapshots from ``log_read`` until the child process exits.
-    """
-    gr_info(translations["start"].format(start=translations["create_reference"]))
-
-    options = {
-        "audio_path": audio_path,
-        "reference_name": reference_name,
-        "pitch_guidance": pitch_guidance,
-        "use_energy": use_energy,
-        "version": version,
-        "embedder_model": embedder_model,
-        "embedders_mode": embedders_mode,
-        "f0_method": f0_method,
-        "f0_onnx": f0_onnx,
-        "f0_up_key": f0_up_key,
-        "filter_radius": filter_radius,
-        "f0_autotune": f0_autotune,
-        "f0_autotune_strength": f0_autotune_strength,
-        "proposal_pitch": proposal_pitch,
-        "proposal_pitch_threshold": proposal_pitch_threshold,
-        "alpha": alpha,
-    }
-
-    command = [python, configs["create_reference_path"]]
-    for flag, value in options.items():
-        command += [f"--{flag}", str(value)]
-
-    p = subprocess.Popen(command)
-    done = [False]
-
-    # Watcher thread flips done[0] once the child exits, which ends log_read.
-    threading.Thread(target=if_done, args=(done, p)).start()
-
-    for log in log_read(done, "create_reference"):
-        yield log
-
-def preprocess(model_name, sample_rate, cpu_core, cut_preprocess, process_effects, dataset, clean_dataset, clean_strength, chunk_len=3.0, overlap_len=0.3, normalization_mode="none"):
-    """Launch the preprocessing script for ``model_name`` and stream its log.
-
-    ``sample_rate`` is a string such as ``"40k"`` and is converted to Hz. Any
-    existing ``<logs_path>/<model_name>`` directory is removed before the
-    script starts. The command is passed as an argument list (no shell) so
-    the model name and dataset path are delivered verbatim.
-
-    Yields:
-        Log snapshots from ``log_read`` until the child process exits.
-        Nothing is yielded when validation fails; a warning is shown instead.
-    """
-    sr = int(float(sample_rate.rstrip("k")) * 1000)
-
-    if not model_name:
-        return gr_warning(translations["provide_name"])
-
-    audio_suffixes = ("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")
-    if not os.path.exists(dataset) or not any(
-        f.lower().endswith(audio_suffixes)
-        for f in os.listdir(dataset)
-        if os.path.isfile(os.path.join(dataset, f))
-    ):
-        return gr_warning(translations["not_found_data"])
-
-    model_dir = os.path.join(configs["logs_path"], model_name)
-    if os.path.exists(model_dir):
-        shutil.rmtree(model_dir, ignore_errors=True)
-
-    options = {
-        "model_name": model_name,
-        "dataset_path": dataset,
-        "sample_rate": sr,
-        "cpu_cores": cpu_core,
-        "cut_preprocess": cut_preprocess,
-        "process_effects": process_effects,
-        "clean_dataset": clean_dataset,
-        "clean_strength": clean_strength,
-        "chunk_len": chunk_len,
-        "overlap_len": overlap_len,
-        "normalization_mode": normalization_mode,
-    }
-
-    command = [python, configs["preprocess_path"]]
-    for flag, value in options.items():
-        command += [f"--{flag}", str(value)]
-
-    p = subprocess.Popen(command)
-    done = [False]
-
-    # Watcher thread flips done[0] once the child exits, which ends log_read.
-    threading.Thread(target=if_done, args=(done, p)).start()
-    os.makedirs(model_dir, exist_ok=True)
-
-    for log in log_read(done, "preprocess"):
-        yield log
-
-def extract(model_name, version, method, pitch_guidance, hop_length, cpu_cores, gpu, sample_rate, embedders, custom_embedders, onnx_f0_mode, embedders_mode, f0_autotune, f0_autotune_strength, hybrid_method, rms_extract, alpha=0.5):
-    """Launch the feature-extraction script for ``model_name`` and stream its log.
-
-    ``hybrid_method`` replaces ``method`` when ``method == "hybrid"`` and
-    ``custom_embedders`` replaces ``embedders`` when ``embedders == "custom"``.
-    Both ``sliced_audios`` and ``sliced_audios_16k`` under the model directory
-    must contain at least one file. The command is passed as an argument list
-    (no shell) so values are delivered verbatim.
-
-    Yields:
-        Log snapshots from ``log_read`` until the child process exits.
-        Nothing is yielded when validation fails; a warning is shown instead.
-    """
-    f0method = method if method != "hybrid" else hybrid_method
-    embedder_model = embedders if embedders != "custom" else custom_embedders
-    sr = int(float(sample_rate.rstrip("k")) * 1000)
-
-    if not model_name:
-        return gr_warning(translations["provide_name"])
-    model_dir = os.path.join(configs["logs_path"], model_name)
-
-    def has_files(folder):
-        return any(os.path.isfile(os.path.join(folder, f)) for f in os.listdir(folder))
-
-    try:
-        ready = has_files(os.path.join(model_dir, "sliced_audios")) and has_files(os.path.join(model_dir, "sliced_audios_16k"))
-    except OSError:
-        # A missing or unreadable folder means preprocessing has not been run.
-        ready = False
-
-    if not ready:
-        return gr_warning(translations["not_found_data_preprocess"])
-
-    options = {
-        "model_name": model_name,
-        "rvc_version": version,
-        "f0_method": f0method,
-        "pitch_guidance": pitch_guidance,
-        "hop_length": hop_length,
-        "cpu_cores": cpu_cores,
-        "gpu": gpu,
-        "sample_rate": sr,
-        "embedder_model": embedder_model,
-        "f0_onnx": onnx_f0_mode,
-        "embedders_mode": embedders_mode,
-        "f0_autotune": f0_autotune,
-        "f0_autotune_strength": f0_autotune_strength,
-        "rms_extract": rms_extract,
-        "alpha": alpha,
-    }
-
-    command = [python, configs["extract_path"]]
-    for flag, value in options.items():
-        command += [f"--{flag}", str(value)]
-
-    p = subprocess.Popen(command)
-    done = [False]
-
-    # Watcher thread flips done[0] once the child exits, which ends log_read.
-    threading.Thread(target=if_done, args=(done, p)).start()
-    os.makedirs(model_dir, exist_ok=True)
-
-    for log in log_read(done, "extract"):
-        yield log
-
-def create_index(model_name, rvc_version, index_algorithm):
-    """Launch the index-creation script for ``model_name`` and stream its log.
-
-    The ``<rvc_version>_extracted`` folder under the model directory must
-    contain at least one file. The command is passed as an argument list (no
-    shell) so the model name is delivered verbatim.
-
-    Yields:
-        Log snapshots from ``log_read`` until the child process exits.
-        Nothing is yielded when validation fails; a warning is shown instead.
-    """
-    if not model_name:
-        return gr_warning(translations["provide_name"])
-    model_dir = os.path.join(configs["logs_path"], model_name)
-    extracted_dir = os.path.join(model_dir, f"{rvc_version}_extracted")
-
-    try:
-        has_features = any(os.path.isfile(os.path.join(extracted_dir, f)) for f in os.listdir(extracted_dir))
-    except OSError:
-        # A missing or unreadable folder means extraction has not been run.
-        has_features = False
-
-    if not has_features:
-        return gr_warning(translations["not_found_data_extract"])
-
-    command = [
-        python, configs["create_index_path"],
-        "--model_name", str(model_name),
-        "--rvc_version", str(rvc_version),
-        "--index_algorithm", str(index_algorithm),
-    ]
-
-    p = subprocess.Popen(command)
-    done = [False]
-
-    # Watcher thread flips done[0] once the child exits, which ends log_read.
-    threading.Thread(target=if_done, args=(done, p)).start()
-    os.makedirs(model_dir, exist_ok=True)
-
-    for log in log_read(done, "create_index"):
-        yield log
-
-def training(model_name, rvc_version, save_every_epoch, save_only_latest, save_every_weights, total_epoch, sample_rate, batch_size, gpu, pitch_guidance, not_pretrain, custom_pretrained, pretrain_g, pretrain_d, detector, threshold, clean_up, cache, model_author, vocoder, checkpointing, deterministic, benchmark, optimizer, energy_use, custom_reference=False, reference_name="", multiscale_mel_loss=False):
-    """Resolve pretrained weights, launch the training script and stream its log.
-
-    Pretrained generator/discriminator files are chosen in one of three ways:
-    none (``not_pretrain``), user supplied (``custom_pretrained`` with
-    ``pretrain_g`` / ``pretrain_d``), or the stock files for the selected
-    version, sample rate, pitch-guidance, energy and vocoder settings, which
-    are downloaded when missing. The child PID is written to
-    ``train_pid.txt`` in the model directory. The command is passed as an
-    argument list (no shell), so the recorded PID is that of the launched
-    interpreter and argument values are delivered verbatim.
-
-    Yields:
-        The last 50 lines of each log snapshot from ``log_read``. Nothing is
-        yielded when validation fails; a warning is shown instead.
-    """
-    sr = int(float(sample_rate.rstrip("k")) * 1000)
-    if not model_name:
-        return gr_warning(translations["provide_name"])
-
-    model_dir = os.path.join(configs["logs_path"], model_name)
-    pid_path = os.path.join(model_dir, "train_pid.txt")
-    if os.path.exists(pid_path):
-        os.remove(pid_path)
-
-    extracted_dir = os.path.join(model_dir, f"{rvc_version}_extracted")
-    try:
-        has_features = any(os.path.isfile(os.path.join(extracted_dir, f)) for f in os.listdir(extracted_dir))
-    except OSError:
-        # A missing or unreadable folder means extraction has not been run.
-        has_features = False
-
-    if not has_features:
-        return gr_warning(translations["not_found_data_extract"])
-
-    if not_pretrain:
-        pretrained_G = pretrained_D = None
-        gr_warning(translations["not_use_pretrain"])
-    elif custom_pretrained:
-        if not pretrain_g:
-            return gr_warning(translations["provide_pretrained"].format(dg="G"))
-        if not pretrain_d:
-            return gr_warning(translations["provide_pretrained"].format(dg="D"))
-
-        # A value that is not an existing path is looked up in the custom folder.
-        pretrained_G = pretrain_g if os.path.exists(pretrain_g) else os.path.join(configs["pretrained_custom_path"], pretrain_g)
-        pretrained_D = pretrain_d if os.path.exists(pretrain_d) else os.path.join(configs["pretrained_custom_path"], pretrain_d)
-
-        if not os.path.exists(pretrained_G):
-            return gr_warning(translations["not_found_pretrain"].format(dg="G"))
-        if not os.path.exists(pretrained_D):
-            return gr_warning(translations["not_found_pretrain"].format(dg="D"))
-    else:
-        pretrain_dir = configs["pretrained_v2_path"] if rvc_version == 'v2' else configs["pretrained_v1_path"]
-        download_version = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_", "rot13") + f"{rvc_version}/"
-
-        # Stock file names keyed by pitch guidance, then by sample rate in Hz.
-        pretrained_selector = {
-            True: {
-                32000: ("f0G32k.pth", "f0D32k.pth"),
-                40000: ("f0G40k.pth", "f0D40k.pth"),
-                48000: ("f0G48k.pth", "f0D48k.pth")
-            },
-            False: {
-                32000: ("G32k.pth", "D32k.pth"),
-                40000: ("G40k.pth", "D40k.pth"),
-                48000: ("G48k.pth", "D48k.pth")
-            }
-        }
-        pg, pd = pretrained_selector[pitch_guidance][sr]
-
-        prefix = ""
-        if energy_use:
-            prefix += "ENERGY_"
-        if vocoder != 'Default':
-            prefix += vocoder + "_"
-
-        pg2, pd2 = prefix + pg, prefix + pd
-        pretrained_G = os.path.join(pretrain_dir, pg2)
-        pretrained_D = os.path.join(pretrain_dir, pd2)
-
-        try:
-            for label, filename, target in (("G", pg2, pretrained_G), ("D", pd2, pretrained_D)):
-                if not os.path.exists(target):
-                    gr_info(translations["download_pretrained"].format(dg=label, rvc_version=rvc_version))
-                    huggingface.HF_download_file(download_version + filename, target)
-        except Exception:
-            # Best effort: any download failure falls back to training without pretrained weights.
-            gr_warning(translations["not_use_pretrain_error_download"])
-            pretrained_G = pretrained_D = None
-
-    reference_path = None
-    if custom_reference:
-        reference_path = os.path.join(configs["reference_path"], reference_name)
-
-        if not os.path.exists(reference_path):
-            gr_warning(translations["not_found_reference"])
-
-            custom_reference = False
-            reference_path = None
-
-    gr_info(translations["start"].format(start=translations["training"]))
-
-    # None values are passed as the literal text "None", as before.
-    options = {
-        "model_name": model_name,
-        "rvc_version": rvc_version,
-        "save_every_epoch": save_every_epoch,
-        "save_only_latest": save_only_latest,
-        "save_every_weights": save_every_weights,
-        "total_epoch": total_epoch,
-        "batch_size": batch_size,
-        "gpu": gpu,
-        "pitch_guidance": pitch_guidance,
-        "overtraining_detector": detector,
-        "overtraining_threshold": threshold,
-        "cleanup": clean_up,
-        "cache_data_in_gpu": cache,
-        "g_pretrained_path": pretrained_G,
-        "d_pretrained_path": pretrained_D,
-        "model_author": model_author,
-        "vocoder": vocoder,
-        "checkpointing": checkpointing,
-        "deterministic": deterministic,
-        "benchmark": benchmark,
-        "optimizer": optimizer,
-        "energy_use": energy_use,
-        "use_custom_reference": custom_reference,
-        "reference_path": reference_path,
-        "multiscale_mel_loss": multiscale_mel_loss,
-    }
-
-    command = [python, configs["train_path"]]
-    for flag, value in options.items():
-        command += [f"--{flag}", str(value)]
-
-    p = subprocess.Popen(command)
-    done = [False]
-
-    with open(pid_path, "w") as pid_file:
-        pid_file.write(str(p.pid))
-
-    # Watcher thread flips done[0] once the child exits, which ends log_read.
-    threading.Thread(target=if_done, args=(done, p)).start()
-
-    for log in log_read(done, "train"):
-        lines = log.splitlines()
-        if len(lines) > 50:
-            log = "\n".join(lines[-50:])
-        yield log
\ No newline at end of file
diff --git a/main/app/core/tts.py b/main/app/core/tts.py
deleted file mode 100644
index 327fd849371c19fedf405e27bf426d6bfc57c903..0000000000000000000000000000000000000000
--- a/main/app/core/tts.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import os
-import sys
-import pysrt
-import codecs
-import librosa
-import asyncio
-import requests
-import tempfile
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations
-from main.app.core.ui import gr_info, gr_warning, gr_error
-
-def synthesize_tts(prompt, voice, speed, output, pitch, google):
-    """Synthesize ``prompt`` to the audio file ``output``.
-
-    With ``google`` false the text is rendered through ``edge_tts`` with
-    ``speed`` as a signed percentage and ``pitch`` as a signed Hz offset.
-    With ``google`` true the audio is fetched over HTTP and, if ``pitch`` or
-    ``speed`` is non-zero, post-processed with librosa and rewritten.
-    The HTTP request carries a timeout so a stalled connection cannot block
-    the caller indefinitely.
-    """
-    if not google:
-        from edge_tts import Communicate
-
-        rate_arg = f"+{speed}%" if speed >= 0 else f"{speed}%"
-        pitch_arg = f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz"
-        asyncio.run(Communicate(text=prompt, voice=voice, rate=rate_arg, pitch=pitch_arg).save(output))
-        return
-
-    response = requests.get(
-        codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"),
-        params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"},
-        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"},
-        timeout=30,
-    )
-
-    if response.status_code != 200:
-        gr_error(f"{response.status_code}, {response.text}")
-        return
-
-    with open(output, "wb") as f:
-        f.write(response.content)
-
-    if pitch != 0 or speed != 0:
-        y, sr = librosa.load(output, sr=None)
-
-        if pitch != 0:
-            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
-        # NOTE(review): ``speed`` is used directly as the stretch rate here;
-        # confirm callers only pass positive values on the Google path.
-        if speed != 0:
-            y = librosa.effects.time_stretch(y, rate=speed)
-
-        import soundfile as sf
-        sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''))
-
-def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False):
-    """Render every subtitle of ``srt_file`` to speech and mix into ``out_file``.
-
-    Each subtitle is synthesized to a temporary file, resampled to ``sr`` if
-    needed, stretched or trimmed to the subtitle's duration and added into an
-    output buffer at the subtitle's start time. The buffer length is taken
-    from the end time of the last subtitle. Subtitles with a non-positive
-    duration, or that start at or beyond the end of the buffer, are skipped.
-
-    Raises:
-        ValueError: if the subtitle file contains no entries.
-    """
-    import numpy as np
-    import soundfile as sf
-
-    def time_stretch(y, sr, target_duration):
-        # Fit ``y`` to exactly ``target_duration`` seconds (stretch, then pad or trim).
-        rate = (len(y) / sr) / target_duration
-        # An empty clip would give rate 0, which cannot be stretched; it is padded to silence below.
-        if len(y) > 0 and rate != 1.0:
-            y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate)
-
-        n_target = int(round(target_duration * sr))
-        return np.pad(y, (0, n_target - len(y))) if len(y) < n_target else y[:n_target]
-
-    def pysrttime_to_seconds(t):
-        return (t.hours * 60 + t.minutes) * 60 + t.seconds + t.milliseconds / 1000
-
-    subs = pysrt.open(srt_file)
-    if not subs:
-        raise ValueError(translations["srt"])
-
-    final_audio = np.zeros(int(round(pysrttime_to_seconds(subs[-1].end) * sr)), dtype=np.float32)
-    total_samples = final_audio.shape[0]
-
-    with tempfile.TemporaryDirectory() as tempdir:
-        for idx, seg in enumerate(subs):
-            duration = pysrttime_to_seconds(seg.duration)
-            start_sample = int(round(pysrttime_to_seconds(seg.start) * sr))
-
-            # Nothing can be placed for an empty cue or one outside the buffer.
-            if duration <= 0 or start_sample >= total_samples:
-                continue
-
-            wav_path = os.path.join(tempdir, f"seg_{idx}.wav")
-            synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, wav_path, rate, google)
-
-            audio, file_sr = sf.read(wav_path, dtype=np.float32)
-            if file_sr != sr:
-                audio = np.interp(np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)), np.arange(len(audio)), audio)
-            adjusted = time_stretch(audio, sr, duration)
-
-            end_sample = start_sample + adjusted.shape[0]
-
-            # Clip a segment that would run past the end of the buffer.
-            if end_sample > total_samples:
-                adjusted = adjusted[: total_samples - start_sample]
-                end_sample = total_samples
-
-            final_audio[start_sample:end_sample] += adjusted
-
-    sf.write(out_file, final_audio, sr)
-
-def TTS(prompt, voice, speed, output, pitch, google, srt_input):
-    """Validate the inputs and synthesize speech to ``output``.
-
-    When ``srt_input`` ends with ``.srt`` the subtitle file is rendered,
-    otherwise ``prompt`` is synthesized. If ``output`` is an existing
-    directory the file is written there as ``tts.wav``.
-
-    Returns:
-        The output path, or ``None`` when a required input is missing.
-    """
-    if not srt_input:
-        srt_input = ""
-    use_srt = srt_input.endswith(".srt")
-
-    if not prompt and not use_srt:
-        gr_warning(translations["enter_the_text"])
-        return None
-
-    if not voice:
-        gr_warning(translations["choose_voice"])
-        return None
-
-    if not output:
-        gr_warning(translations["output_not_valid"])
-        return None
-
-    if os.path.isdir(output):
-        output = os.path.join(output, "tts.wav")
-    gr_info(translations["convert"].format(name=translations["text"]))
-
-    # A bare file name has no directory part; it must not be created as a
-    # directory itself, otherwise writing the audio file would fail.
-    output_dir = os.path.dirname(output)
-    if output_dir:
-        os.makedirs(output_dir, exist_ok=True)
-
-    if use_srt:
-        srt_tts(srt_input, output, voice, 0, 24000, google)
-    else:
-        synthesize_tts(prompt, voice, speed, output, pitch, google)
-
-    gr_info(translations["success"])
-    return output
\ No newline at end of file
diff --git a/main/app/core/ui.py b/main/app/core/ui.py
deleted file mode 100644
index 87362b30005132bfc722bc606b81a9f3ce4ad973..0000000000000000000000000000000000000000
--- a/main/app/core/ui.py
+++ /dev/null
@@ -1,362 +0,0 @@
-import os
-import re
-import sys
-import json
-import torch
-import shutil
-
-import gradio as gr
-import sounddevice as sd
-
-sys.path.append(os.getcwd())
-
-from main.library.backends import directml, opencl
-from main.inference.realtime.audio import list_audio_device
-from main.app.variables import config, configs, configs_json, logger, translations, edgetts, google_tts_voice, method_f0, method_f0_full, vr_models, mdx_models, demucs_models, embedders_model, spin_model, whisper_model
-
-def gr_info(message):
- """Show ``message`` as a 2-second Gradio info notice and log it at INFO level."""
- gr.Info(message, duration=2)
- logger.info(message)
-
-def gr_warning(message):
- """Show ``message`` as a 2-second Gradio warning notice and log it at WARNING level."""
- gr.Warning(message, duration=2)
- logger.warning(message)
-
-def gr_error(message):
- """Build a Gradio error for ``message`` (6-second duration) and log it at ERROR level."""
- # NOTE(review): the gr.Error object is constructed but not raised here, so
- # callers continue running afterwards - confirm this is intended.
- gr.Error(message=message, duration=6)
- logger.error(message)
-
-def get_gpu_info():
-    """Return a newline-separated list of detected GPUs for display.
-
-    CUDA devices are listed first (with rounded memory in GB). If none are
-    found, the DirectML backend and then the OpenCL backend are consulted.
-    Returns ``translations["no_support_gpu"]`` when nothing is found or when
-    ``config.cpu_mode`` is set.
-    """
-    gpu_infos = []
-    ngpu = torch.cuda.device_count()
-
-    for i in range(ngpu):
-        if torch.cuda.is_available() or ngpu != 0:
-            memory_gb = int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)
-            gpu_infos.append(f"{i}: {torch.cuda.get_device_name(i)} ({memory_gb} GB)")
-
-    if len(gpu_infos) == 0:
-        backend = None
-        if directml.torch_available:
-            backend = directml
-        elif opencl.torch_available:
-            backend = opencl
-
-        if backend is not None:
-            ngpu = backend.device_count()
-            gpu_infos = [f"{i}: {backend.device_name(i)}" for i in range(ngpu) if backend.is_available() or ngpu != 0]
-
-    if len(gpu_infos) > 0 and not config.cpu_mode:
-        return "\n".join(gpu_infos)
-
-    return translations["no_support_gpu"]
-
-def gpu_number_str():
-    """Return the device indices joined by ``-`` (e.g. ``"0-1"``), or ``"-"``.
-
-    ``"-"`` is returned in CPU mode or when none of the CUDA, DirectML or
-    OpenCL backends reports itself as available.
-    """
-    if config.cpu_mode:
-        return "-"
-
-    ngpu = torch.cuda.device_count()
-    if ngpu == 0:
-        if directml.torch_available:
-            ngpu = directml.device_count()
-        else:
-            ngpu = opencl.device_count()
-
-    if torch.cuda.is_available() or directml.is_available() or opencl.is_available():
-        return str("-".join(str(i) for i in range(ngpu)))
-
-    return str("-")
-
-def change_f0_choices():
-    """Return a dropdown update listing every ``.txt`` file under ``configs["f0_path"]``.
-
-    Paths are absolute and sorted; the first one (or ``""``) is selected.
-    """
-    f0_files = []
-    for root, _, files in os.walk(configs["f0_path"]):
-        for f in files:
-            if f.endswith(".txt"):
-                f0_files.append(os.path.abspath(os.path.join(root, f)))
-    f0_files.sort()
-
-    selected = f0_files[0] if f0_files else ""
-    return {"value": selected, "choices": f0_files, "__type__": "update"}
-
-def change_audios_choices(input_audio):
-    """Return a dropdown update listing the audio files under ``configs["audios_path"]``.
-
-    ``input_audio`` stays selected unless it is ``""``, in which case the
-    first file found (or ``""``) is selected.
-    """
-    audio_exts = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")
-
-    audios = []
-    for root, _, files in os.walk(configs["audios_path"]):
-        for f in files:
-            if os.path.splitext(f)[1].lower() in audio_exts:
-                audios.append(os.path.abspath(os.path.join(root, f)))
-    audios.sort()
-
-    if input_audio != "":
-        selected = input_audio
-    else:
-        selected = audios[0] if audios else ""
-
-    return {"value": selected, "choices": audios, "__type__": "update"}
-
-def change_reference_choices():
-    """Return a dropdown update listing the reference sets in ``configs["reference_path"]``.
-
-    Only sub-directories are considered; the trailing
-    ``_v<N>_<name>_<bool>_<bool>`` part of each directory name is removed.
-    """
-    base = configs["reference_path"]
-
-    reference = []
-    for name in os.listdir(base):
-        if os.path.isdir(os.path.join(base, name)):
-            reference.append(re.sub(r'_v\d+_(?:[A-Za-z0-9_]+?)_(True|False)_(True|False)$', '', name))
-    reference.sort()
-
-    selected = reference[0] if reference else ""
-    return {"value": selected, "choices": reference, "__type__": "update"}
-
-def change_models_choices():
-    """Return dropdown updates for the model list and the index-file list.
-
-    Models are the ``.pth`` / ``.onnx`` files in ``configs["weights_path"]``
-    whose names do not start with ``G_`` or ``D_``. Index files are the
-    ``.index`` files under ``configs["logs_path"]`` without ``trained`` in
-    their name.
-    """
-    model = sorted(
-        name for name in os.listdir(configs["weights_path"])
-        if name.endswith((".pth", ".onnx")) and not name.startswith(("G_", "D_"))
-    )
-
-    index = []
-    for root, _, files in os.walk(configs["logs_path"], topdown=False):
-        for name in files:
-            if name.endswith(".index") and "trained" not in name:
-                index.append(os.path.join(root, name))
-    index.sort()
-
-    return [
-        {"value": model[0] if model else "", "choices": model, "__type__": "update"},
-        {"value": index[0] if index else "", "choices": index, "__type__": "update"},
-    ]
-
-def change_pretrained_choices():
-    """Return dropdown updates for custom pretrained D and G files, in that order.
-
-    A ``.pth`` file in ``configs["pretrained_custom_path"]`` is listed under
-    D if its name contains ``D`` and under G if it contains ``G``.
-    """
-    def dropdown(letter):
-        found = sorted(
-            name for name in os.listdir(configs["pretrained_custom_path"])
-            if name.endswith(".pth") and letter in name
-        )
-        return {"choices": found, "value": found[0] if found else "", "__type__": "update"}
-
-    return [dropdown("D"), dropdown("G")]
-
-def change_choices_del():
-    """Return dropdown updates listing deletable model files and log folders.
-
-    Models are the ``.pth`` files in ``configs["weights_path"]`` not starting
-    with ``G_`` or ``D_``; folders are the sub-directories of
-    ``configs["logs_path"]`` other than ``mute`` and ``reference``.
-    """
-    weights = sorted(
-        name for name in os.listdir(configs["weights_path"])
-        if name.endswith(".pth") and not name.startswith(("G_", "D_"))
-    )
-
-    logs_root = configs["logs_path"]
-    folders = sorted(
-        os.path.join(logs_root, f) for f in os.listdir(logs_root)
-        if f not in ["mute", "reference"] and os.path.isdir(os.path.join(logs_root, f))
-    )
-
-    return [
-        {"choices": weights, "__type__": "update"},
-        {"choices": folders, "__type__": "update"},
-    ]
-
-def change_preset_choices():
-    """Return a dropdown update listing the ``.conversion.json`` presets, with nothing selected."""
-    presets = [f for f in os.listdir(configs["presets_path"]) if f.endswith(".conversion.json")]
-    presets.sort()
-
-    return {"value": "", "choices": presets, "__type__": "update"}
-
-def change_effect_preset_choices():
-    """Return a dropdown update listing the ``.effect.json`` presets, with nothing selected."""
-    presets = [f for f in os.listdir(configs["presets_path"]) if f.endswith(".effect.json")]
-    presets.sort()
-
-    return {"value": "", "choices": presets, "__type__": "update"}
-
-def change_tts_voice_choices(google):
-    """Return a dropdown update with the Google or Edge voice list, first voice selected."""
-    voices = google_tts_voice if google else edgetts
-
-    return {"choices": voices, "value": voices[0], "__type__": "update"}
-
-def change_backing_choices(backing, merge):
-    """Return the update for a checkbox that conflicts with ``backing`` / ``merge``.
-
-    If either option is enabled the checkbox is cleared and locked;
-    otherwise it is made interactive again.
-    """
-    if backing or merge:
-        return {"value": False, "interactive": False, "__type__": "update"}
-
-    # Both flags are falsy here, so no further case remains to be handled.
-    return {"interactive": True, "__type__": "update"}
-
-def change_download_choices(select):
-    """Return ten visibility updates for the model-download panel.
-
-    ``select`` is compared with the translated option labels to decide which
-    component positions are shown; an unknown label shows a warning and hides
-    everything.
-    """
-    if select == translations["download_url"]:
-        shown = (0, 1, 2)
-    elif select == translations["download_from_csv"]:
-        shown = (3, 4)
-    elif select == translations["search_models"]:
-        shown = (5, 6)
-    elif select == translations["upload"]:
-        shown = (9,)
-    else:
-        gr_warning(translations["option_not_valid"])
-        shown = ()
-
-    return [{"visible": position in shown, "__type__": "update"} for position in range(10)]
-
-def change_download_pretrained_choices(select):
-    """Return seven visibility updates for the pretrained-download panel.
-
-    ``select`` is compared with the translated option labels to decide which
-    component positions are shown; an unknown label shows a warning and hides
-    everything.
-    """
-    if select == translations["download_url"]:
-        shown = (0, 1, 2)
-    elif select == translations["list_model"]:
-        shown = (3, 4, 5)
-    elif select == translations["upload"]:
-        shown = (6,)
-    else:
-        gr_warning(translations["option_not_valid"])
-        shown = ()
-
-    return [{"visible": position in shown, "__type__": "update"} for position in range(7)]
-
-def get_index(model):
-    """Return a dropdown update selecting the index file that matches ``model``.
-
-    The key is the model's base name up to the first ``_`` and then up to the
-    first ``.``. The first ``.index`` file under ``configs["logs_path"]``
-    (without ``trained`` in its name) whose path contains that key is
-    selected, or ``""`` when none matches.
-
-    Returns:
-        The update dict, or ``None`` when ``model`` is empty/``None`` or its
-        base name is empty before the first ``_``.
-    """
-    # Guard before touching os.path: basename(None) would raise TypeError.
-    if not model:
-        return None
-
-    model = os.path.basename(model).split("_")[0]
-    if not model:
-        return None
-
-    key = model.split(".")[0]
-    index_files = [
-        os.path.join(root, name)
-        for root, _, files in os.walk(configs["logs_path"], topdown=False)
-        for name in files
-        if name.endswith(".index") and "trained" not in name
-    ]
-
-    return {"value": next((f for f in index_files if key in f), ""), "__type__": "update"}
-
-def index_strength_show(index):
-    """Return an update that shows the index-strength control only for an existing index file.
-
-    The control's value is reset to ``0.5`` in every case.
-    """
-    if index is None or index == "":
-        has_index = False
-    else:
-        has_index = os.path.isfile(index)
-
-    return {"visible": has_index, "value": 0.5, "__type__": "update"}
-
-def hoplength_show(method, hybrid_method=None):
-    """Return an update that shows the hop-length control for methods that use it.
-
-    The control is visible when ``method`` - or ``hybrid_method``, if given -
-    contains one of the listed method names as a substring.
-    """
-    hop_methods = ["mangio-crepe", "fcpe", "yin", "piptrack", "mangio-penn"]
-
-    show = any(
-        m in method or (hybrid_method is not None and m in hybrid_method)
-        for m in hop_methods
-    )
-
-    return {"visible": show, "__type__": "update"}
-
-def visible(value):
-    """Return an update dict that sets a component's ``visible`` flag to ``value``."""
-    return dict(visible=value, __type__="update")
-
-def valueFalse_interactive(value):
-    """Return an update that clears a component's value and sets ``interactive`` to ``value``."""
-    return dict(value=False, interactive=value, __type__="update")
-
-def valueEmpty_visible1(value):
-    """Return an update that empties a component's value and sets ``visible`` to ``value``."""
-    return dict(value="", visible=value, __type__="update")
-
-def pitch_guidance_lock(vocoders):
-    """Return an update that turns pitch guidance on and locks it unless the vocoder is ``"Default"``."""
-    editable = vocoders == "Default"
-
-    return {"value": True, "interactive": editable, "__type__": "update"}
-
-def vocoders_lock(pitch, vocoders):
-    """Return an update that keeps the vocoder when ``pitch`` is truthy, else forces ``"Default"`` and locks it."""
-    if pitch:
-        chosen = vocoders
-    else:
-        chosen = "Default"
-
-    return {"value": chosen, "interactive": pitch, "__type__": "update"}
-
-def unlock_f0(value):
-    """Return an update offering the full or the short F0 method list, with ``"rmvpe"`` selected."""
-    methods = method_f0_full if value else method_f0
-
-    return {"choices": methods, "value": "rmvpe", "__type__": "update"}
-
-def unlock_vocoder(value, vocoder):
-    """Return an update that keeps the vocoder editable only when the version ``value`` is ``"v2"``.
-
-    For any other version the vocoder is forced to ``"Default"`` and locked.
-    """
-    is_v2 = value == "v2"
-
-    return {"value": vocoder if is_v2 else "Default", "interactive": is_v2, "__type__": "update"}
-
-def unlock_ver(value, vocoder):
-    """Return an update for the version selector based on the chosen vocoder.
-
-    With the ``"Default"`` vocoder the version is set to ``"v2"`` and stays
-    editable; otherwise the current ``value`` is kept and the selector locked.
-    """
-    default_vocoder = vocoder == "Default"
-
-    return {"value": "v2" if default_vocoder else value, "interactive": default_vocoder, "__type__": "update"}
-
-def change_embedders_mode(value):
-    """Return a dropdown update with the embedder models for mode ``value``.
-
-    ``"spin"`` and ``"whisper"`` have their own lists; any other mode uses the
-    general embedder list. The first entry is selected.
-    """
-    if value == "spin":
-        models = spin_model
-    elif value == "whisper":
-        models = whisper_model
-    else:
-        models = embedders_model
-
-    return {"value": models[0], "choices": models, "__type__": "update"}
-
-def change_fp(fp):
-    """Switch the precision setting and persist it to the JSON config file.
-
-    ``"fp16"`` is refused (with a warning) when ``config.device`` is
-    ``"cpu"``, ``"mps"`` or ``"ocl:0"``. Otherwise the ``fp16`` key in the
-    file at ``configs_json`` and ``config.is_half`` are both updated.
-
-    Returns:
-        ``"fp16"`` or ``"fp32"`` - the precision that is now in effect.
-    """
-    fp16 = fp == "fp16"
-
-    if fp16 and config.device in ["cpu", "mps", "ocl:0"]:
-        gr_warning(translations["fp16_not_support"])
-        return "fp32"
-
-    gr_info(translations["start_update_precision"])
-
-    # Read through a context manager so the handle is closed before rewriting.
-    with open(configs_json, "r") as f:
-        settings = json.load(f)
-
-    settings["fp16"] = config.is_half = fp16
-
-    with open(configs_json, "w") as f:
-        json.dump(settings, f, indent=4)
-
-    gr_info(translations["success"])
-    return "fp16" if fp16 else "fp32"
-
-def process_output(file_path):
-    """Return a path that is safe to write to, per the ``delete_exists_file`` setting.
-
-    When the setting is on (the default) an existing file at ``file_path`` is
-    deleted and the same path is returned. When it is off, ``_1``, ``_2``, ...
-    is appended to the file stem until an unused path is found.
-    """
-    if config.configs.get("delete_exists_file", True):
-        if os.path.isfile(file_path):
-            os.remove(file_path)
-        return file_path
-
-    if not os.path.exists(file_path):
-        return file_path
-
-    folder = os.path.dirname(file_path)
-    stem, ext = os.path.splitext(os.path.basename(file_path))
-
-    index = 1
-    candidate = os.path.join(folder, f"{stem}_{index}{ext}")
-    while os.path.exists(candidate):
-        index += 1
-        candidate = os.path.join(folder, f"{stem}_{index}{ext}")
-
-    return candidate
-
-def shutil_move(input_path, output_path):
-    """Move ``input_path`` to ``output_path``, resolving clashes via ``process_output``.
-
-    If ``output_path`` is a directory the file keeps its base name inside it.
-    Returns whatever ``shutil.move`` returns.
-    """
-    if os.path.isdir(output_path):
-        output_path = os.path.join(output_path, os.path.basename(input_path))
-
-    if os.path.exists(output_path):
-        output_path = process_output(output_path)
-
-    return shutil.move(input_path, output_path)
-
-def separate_change(model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise):
-    """Return the 14 component updates for the separation panel.
-
-    The main, karaoke and reverb models are each classified as ``vr``,
-    ``mdx`` or ``demucs``; the set of families in use decides which option
-    controls are visible or interactive.
-    """
-    if model_name in list(vr_models.keys()):
-        model_type = "vr"
-    elif model_name in list(mdx_models.keys()):
-        model_type = "mdx"
-    elif model_name in list(demucs_models.keys()):
-        model_type = "demucs"
-    else:
-        model_type = ""
-
-    karaoke_type = None
-    if separate_backing:
-        karaoke_type = "vr" if karaoke_model.startswith("VR") else "mdx"
-
-    reverb_type = None
-    if separate_reverb:
-        reverb_type = "mdx" if reverb_model.startswith("MDX") else "vr"
-
-    all_types = {model_type, karaoke_type, reverb_type}
-    is_vr, is_mdx, is_demucs = "vr" in all_types, "mdx" in all_types, "demucs" in all_types
-
-    updates = [visible(separate_backing), visible(separate_reverb)]
-    updates += [visible(is_mdx or is_demucs), visible(is_mdx or is_demucs)]
-    updates += [visible(is_mdx), visible(is_mdx or is_vr), visible(is_demucs)]
-    updates += [visible(is_vr), visible(is_vr)]
-    updates += [visible(is_vr and enable_post_process), visible(is_vr and enable_denoise)]
-    updates += [valueFalse_interactive(is_vr) for _ in range(3)]
-
-    return updates
-
-def create_dataset_change(model_name, reverb_model, enable_post_process, separate_reverb, enable_denoise):
-    """Return the 13 component updates for the dataset-creation separation options.
-
-    The main and reverb models are each classified as ``vr``, ``mdx`` or
-    ``demucs``; the set of families in use decides which option controls are
-    visible or interactive.
-    """
-    if model_name in list(vr_models.keys()):
-        model_type = "vr"
-    elif model_name in list(mdx_models.keys()):
-        model_type = "mdx"
-    elif model_name in list(demucs_models.keys()):
-        model_type = "demucs"
-    else:
-        model_type = ""
-
-    reverb_type = None
-    if separate_reverb:
-        reverb_type = "mdx" if reverb_model.startswith("MDX") else "vr"
-
-    all_types = {model_type, reverb_type}
-    is_vr, is_mdx, is_demucs = "vr" in all_types, "mdx" in all_types, "demucs" in all_types
-
-    updates = [visible(separate_reverb)]
-    updates += [visible(is_mdx or is_demucs), visible(is_mdx or is_demucs)]
-    updates += [visible(is_mdx), visible(is_mdx or is_vr), visible(is_demucs)]
-    updates += [visible(is_vr), visible(is_vr)]
-    updates += [visible(is_vr and enable_post_process), visible(is_vr and enable_denoise)]
-    updates += [valueFalse_interactive(is_vr) for _ in range(3)]
-
-    return updates
-
-def audio_device():
-    """Return ``(input_devices, output_devices)`` label maps for the UI.
-
-    Each map goes from a label ``"<rank>: <name> (<host_api>)"`` to
-    ``[device index, max channel count]``. Output devices are ranked with
-    names containing "virtual" first, then "vb", then the rest; input devices
-    use the reverse ranking.
-
-    Returns:
-        Two dicts. Both are empty when the devices cannot be enumerated, so
-        callers can always use ``.get()`` / ``.keys()`` on the result.
-    """
-    def priority(name):
-        n = name.lower()
-        if "virtual" in n:
-            return 0
-        if "vb" in n:
-            return 1
-        return 2
-
-    try:
-        input_devices, output_devices = list_audio_device()
-
-        output_sorted = sorted(output_devices, key=lambda d: priority(d.name))
-        input_sorted = sorted(input_devices, key=lambda d: priority(d.name), reverse=True)
-
-        # enumerate() provides the 1-based rank directly instead of searching the list per device.
-        input_device_list = {
-            f"{rank}: {d.name} ({d.host_api})": [d.index, d.max_input_channels]
-            for rank, d in enumerate(input_sorted, start=1)
-        }
-        output_device_list = {
-            f"{rank}: {d.name} ({d.host_api})": [d.index, d.max_output_channels]
-            for rank, d in enumerate(output_sorted, start=1)
-        }
-
-        return input_device_list, output_device_list
-    except Exception:
-        # Best effort: report "no devices" rather than failing the UI.
-        return {}, {}
-
-def update_audio_device(input_device, output_device, monitor_device, monitor):
-    """Return the seven component updates that follow an audio-device selection.
-
-    Monitor controls follow ``monitor``; ASIO-specific controls are shown for
-    devices whose label contains ``"ASIO"``, with their maximum set to the
-    device's channel count. If any channel count cannot be looked up, all
-    three maxima are set to ``-1``.
-    """
-    input_channels_map, output_channels_map = audio_device()
-
-    input_is_asio = "ASIO" in input_device if input_device else False
-    output_is_asio = "ASIO" in output_device if output_device else False
-    monitor_is_asio = "ASIO" in monitor_device if monitor_device else False
-
-    try:
-        # Each entry is [device index, max channels]; an unknown label yields [] and fails the [1] lookup.
-        input_max_ch = input_channels_map.get(input_device, [])[1]
-        output_max_ch = output_channels_map.get(output_device, [])[1]
-        monitor_max_ch = output_channels_map.get(monitor_device, [])[1] if monitor else 128
-    except Exception:
-        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate.
-        input_max_ch = output_max_ch = monitor_max_ch = -1
-
-    return [
-        visible(monitor),
-        visible(monitor),
-        visible(monitor_is_asio),
-        visible(input_is_asio or output_is_asio or monitor_is_asio),
-        gr.update(visible=input_is_asio, maximum=input_max_ch),
-        gr.update(visible=output_is_asio, maximum=output_max_ch),
-        gr.update(visible=monitor_is_asio, maximum=monitor_max_ch)
-    ]
-
-def change_audio_device_choices():
-    """Re-initialise ``sounddevice`` and return refreshed device dropdown updates.
-
-    Three updates are returned: one with the input labels and two with the
-    output labels. Each selects its first entry, or ``""`` if the list is empty.
-    """
-    sd._terminate()
-    sd._initialize()
-
-    input_map, output_map = audio_device()
-    input_labels = list(input_map.keys())
-    output_labels = list(output_map.keys())
-
-    def dropdown(labels):
-        return {"value": labels[0] if len(labels) >= 1 else "", "choices": labels, "__type__": "update"}
-
-    return [dropdown(input_labels), dropdown(output_labels), dropdown(output_labels)]
-
-def replace_punctuation(filename):
-    """Return ``filename`` with spaces and punctuation normalised for file names.
-
-    Spaces and ``|`` become ``_``; ``- ( ) [ ] , " ' { }`` are removed; each
-    run of three underscores is then replaced by one and surrounding
-    whitespace is stripped.
-    """
-    # Single-character substitutions, applied in one pass.
-    table = str.maketrans({" ": "_", "|": "_", **{ch: "" for ch in "-()[],\"'{}"}})
-    cleaned = filename.translate(table)
-
-    return cleaned.replace("___", "_").strip()
-
-def replace_url(url):
-    """Return ``url`` rewritten from a ``/blob/`` link to a ``/resolve/`` link.
-
-    The ``?download=true`` query and surrounding whitespace are removed.
-    """
-    resolved = url.replace("/blob/", "/resolve/")
-    resolved = resolved.replace("?download=true", "")
-
-    return resolved.strip()
-
-def replace_modelname(modelname):
-    """Return ``modelname`` without model file extensions, cleaned by ``replace_punctuation``.
-
-    Every occurrence of ``.onnx``, ``.pth``, ``.index`` and ``.zip`` is removed.
-    """
-    for suffix in (".onnx", ".pth", ".index", ".zip"):
-        modelname = modelname.replace(suffix, "")
-
-    return replace_punctuation(modelname)
-
-def replace_export_format(audio_path, export_format = "wav"):
-    """Return ``audio_path`` with its file extension changed to ``export_format``.
-
-    Only the extension of the final path component is replaced, so directory
-    names that happen to contain the same extension are left untouched. A
-    path that already ends with the requested extension, or that has no
-    extension at all, is returned unchanged.
-    """
-    export_format = f".{export_format}"
-
-    if audio_path.endswith(export_format):
-        return audio_path
-
-    root, ext = os.path.splitext(audio_path)
-    return root + export_format if ext else audio_path
-
-def update_dropdowns_from_json(data):
-    """Return three dropdown updates built from the ``inputs`` / ``outputs`` keys of ``data``.
-
-    The first update lists the input keys and the other two list the output
-    keys; each selects its first entry, or ``None`` when empty. Falsy
-    ``data`` produces three empty dropdowns.
-    """
-    def dropdown(options):
-        return gr.update(choices=options, value=options[0] if len(options) > 0 else None)
-
-    if not data:
-        return [dropdown([]), dropdown([]), dropdown([])]
-
-    inputs = list(data.get("inputs", {}).keys())
-    outputs = list(data.get("outputs", {}).keys())
-
-    return [dropdown(inputs), dropdown(outputs), dropdown(outputs)]
-
-def update_button_from_json(data):
-    """Return interactivity updates for the start and stop buttons, in that order.
-
-    The flags come from the ``start_button`` / ``stop_button`` keys of
-    ``data``; missing keys or falsy ``data`` enable start and disable stop.
-    """
-    start_enabled, stop_enabled = True, False
-
-    if data:
-        start_enabled = data.get("start_button", True)
-        stop_enabled = data.get("stop_button", False)
-
-    return [gr.update(interactive=start_enabled), gr.update(interactive=stop_enabled)]
\ No newline at end of file
diff --git a/main/app/core/utils.py b/main/app/core/utils.py
deleted file mode 100644
index 4f42c26d4a9c502c27a07a5dea4d3f65b7dc3757..0000000000000000000000000000000000000000
--- a/main/app/core/utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import os
-import sys
-import json
-import codecs
-import requests
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import gr_info, gr_warning
-from main.app.variables import translations, configs
-
-def stop_pid(pid_file, model_name=None, train=False):
-    """Kill the processes listed in a PID file, then delete that file.
-
-    The PID file is ``assets/<pid_file>.txt`` when ``model_name`` is ``None``,
-    otherwise ``<configs["logs_path"]>/<model_name>/<pid_file>.txt``. With
-    ``train=True`` and a model name, the PIDs stored under ``process_pids`` in
-    that model's ``config.json`` are killed as well and the key is removed
-    from the file.
-
-    Changes from the previous version:
-      * the ``config.json`` path is only built when ``model_name`` is given;
-        before, ``os.path.join`` raised ``TypeError`` for ``None`` and the bare
-        ``except`` swallowed it, so the success notice was never shown;
-      * a PID that can no longer be signalled, or a blank line in the PID
-        file, does not abort the rest of the work;
-      * the ``pid_file`` argument is no longer shadowed by the file handle;
-      * only ``Exception`` is caught, so ``KeyboardInterrupt``/``SystemExit``
-        propagate.
-
-    Args:
-        pid_file: Base name (without ``.txt``) of the PID file.
-        model_name: Model folder inside the logs path, or ``None``.
-        train: Also stop the PIDs recorded in the model's ``config.json``.
-
-    Returns:
-        The result of ``gr_warning`` when the PID file does not exist,
-        otherwise ``None``.
-    """
-    import logging
-
-    def kill_all(pids):
-        for pid in pids:
-            try:
-                os.kill(pid, 9)
-            except OSError:
-                # Usually the process is already gone; continue with the remaining PIDs.
-                pass
-
-    try:
-        pid_file_path = os.path.join("assets", f"{pid_file}.txt") if model_name is None else os.path.join(configs["logs_path"], model_name, f"{pid_file}.txt")
-
-        if not os.path.exists(pid_file_path): return gr_warning(translations["not_found_pid"])
-
-        with open(pid_file_path, "r") as f:
-            pids = [int(line) for line in f if line.strip()]
-
-        kill_all(pids)
-
-        if os.path.exists(pid_file_path): os.remove(pid_file_path)
-
-        if train and model_name is not None:
-            config_path = os.path.join(configs["logs_path"], model_name, "config.json")
-
-            if os.path.exists(config_path):
-                with open(config_path, "r") as f:
-                    pid_data = json.load(f)
-
-                pids = pid_data.pop("process_pids", [])
-
-                with open(config_path, "w") as f:
-                    json.dump(pid_data, f, indent=4)
-
-                kill_all(pids)
-
-        gr_info(translations["end_pid"])
-    except Exception:
-        # Best effort, as before: a failed stop must not crash the caller.
-        logging.getLogger(__name__).debug("stop_pid failed", exc_info=True)
-
-def google_translate(text, source='auto', target='vi'):
-    """Translate ``text`` through the Google Translate web endpoint.
-
-    The text is split on whitespace into chunks of at most 5000 characters,
-    each chunk is requested separately and the results are joined with a
-    single space. The endpoint URL is stored rot13-encoded and decoded at
-    call time.
-
-    Changes from the previous version:
-      * chunks are joined with ``" "``; ``textwrap.wrap`` drops the whitespace
-        it breaks on, so joining with ``""`` fused the words at each boundary;
-      * every request has a timeout, so a stalled connection cannot block the
-        caller indefinitely;
-      * only ``Exception`` is caught instead of a bare ``except``.
-
-    Args:
-        text: Text to translate.
-        source: Source language code, ``"auto"`` by default.
-        target: Target language code.
-
-    Returns:
-        The translated text; the original ``text`` if anything fails; or the
-        result of ``gr_warning`` when ``text`` is empty.
-    """
-    if not text: return gr_warning(translations["prompt_warning"])
-
-    try:
-        import textwrap
-
-        url = codecs.decode("uggcf://genafyngr.tbbtyrncvf.pbz/genafyngr_n/fvatyr", "rot13")
-
-        def translate_chunk(chunk):
-            response = requests.get(url, params={'client': 'gtx', 'sl': source, 'tl': target, 'dt': 't', 'q': chunk}, timeout=30)
-            if response.status_code != 200: return chunk  # keep the untranslated chunk on an HTTP error
-
-            # Segments whose first element is empty are skipped so the join cannot fail on None.
-            return ''.join([segment[0] for segment in response.json()[0] if segment[0]])
-
-        chunks = textwrap.wrap(text, 5000, break_long_words=False, break_on_hyphens=False)
-
-        return ' '.join(translate_chunk(chunk) for chunk in chunks)
-    except Exception:
-        # Network or parsing failure: fall back to the untranslated input.
-        return text
\ No newline at end of file
diff --git a/main/app/parser.py b/main/app/parser.py
deleted file mode 100644
index 4dd900ce8326407a65800ee9c0cae25e53e11f8f..0000000000000000000000000000000000000000
--- a/main/app/parser.py
+++ /dev/null
@@ -1,369 +0,0 @@
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-# The command is the first CLI argument; None when the script is run without arguments.
-try:
-    argv = sys.argv[1]
-except IndexError:
-    argv = None
-
-# Accepted commands. The branches below refer to entries by position, so new entries must be appended.
-# The last entry used to be "help_create_reference" (missing "--"), which made the documented
-# `--help_create_reference` flag fail the validation below.
-argv_is_allows = ["--audio_effects", "--convert", "--create_dataset", "--create_index", "--extract", "--preprocess", "--separator_music", "--train", "--help_audio_effects", "--help_convert", "--help_create_dataset", "--help_create_index", "--help_extract", "--help_preprocess", "--help_separate_music", "--help_train", "--help", "--create_reference", "--help_create_reference"]
-
-if argv not in argv_is_allows:
-    print("Cú pháp không hợp lệ! Sử dụng --help để biết thêm")
-    quit()
-
-# Bind `main` to the entry point of the selected command. Each condition is a substring test of one
-# allowed flag (looked up by its position in `argv_is_allows`) against the first CLI argument.
-if argv_is_allows[0] in argv: from main.inference.audio_effects import main
-elif argv_is_allows[1] in argv: from main.inference.conversion.convert import main
-elif argv_is_allows[2] in argv: from main.inference.create_dataset import main
-elif argv_is_allows[3] in argv: from main.inference.create_index import main
-elif argv_is_allows[4] in argv: from main.inference.extracting.extract import main
-elif argv_is_allows[5] in argv: from main.inference.preprocess.preprocess import main
-elif argv_is_allows[6] in argv: from main.inference.separate_music import main
-elif argv_is_allows[7] in argv: from main.inference.training.train import main
-elif argv_is_allows[17] in argv: from main.inference.create_reference import main
-elif argv_is_allows[8] in argv:
- print("""Các tham số của `--audio_effects`:
- 1. Đường dẫn tệp:
- - `--input_path` (bắt buộc): Đường dẫn đến tệp âm thanh đầu vào.
- - `--output_path` (mặc định: `./audios/apply_effects.wav`): Đường dẫn lưu tệp đầu ra.
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`, ...).
-
- 2. Lấy mẫu lại:
- - `--resample` (mặc định: `False`): Có lấy mẫu lại hay không.
- - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (Hz).
-
- 3. Hiệu ứng chorus:
- - `--chorus`: Bật/tắt chorus.
- - `--chorus_depth`, `--chorus_rate`, `--chorus_mix`, `--chorus_delay`, `--chorus_feedback`: Các thông số điều chỉnh chorus.
-
- 4. Hiệu ứng distortion:
- - `--distortion`: Bật/tắt distortion.
- - `--drive_db`: Mức độ méo âm thanh.
-
- 5. Hiệu ứng reverb:
- - `--reverb`: Bật/tắt hồi âm.
- - `--reverb_room_size`, `--reverb_damping`, `--reverb_wet_level`, `--reverb_dry_level`, `--reverb_width`, `--reverb_freeze_mode`: Điều chỉnh hồi âm.
-
- 6. Hiệu ứng pitch shift:
- - `--pitchshift`: Bật/tắt thay đổi cao độ.
- - `--pitch_shift`: Giá trị dịch cao độ.
-
- 7. Hiệu ứng delay:
- - `--delay`: Bật/tắt delay.
- - `--delay_seconds`, `--delay_feedback`, `--delay_mix`: Điều chỉnh thời gian trễ, phản hồi và hòa trộn.
-
- 8. Compressor:
- - `--compressor`: Bật/tắt compressor.
- - `--compressor_threshold`, `--compressor_ratio`, `--compressor_attack_ms`, `--compressor_release_ms`: Các thông số nén.
-
- 9. Limiter:
- - `--limiter`: Bật/tắt giới hạn mức âm thanh.
- - `--limiter_threshold`, `--limiter_release`: Ngưỡng giới hạn và thời gian nhả.
-
- 10. Gain (Khuếch đại):
- - `--gain`: Bật/tắt gain.
- - `--gain_db`: Mức gain (dB).
-
- 11. Bitcrush:
- - `--bitcrush`: Bật/tắt hiệu ứng giảm độ phân giải.
- - `--bitcrush_bit_depth`: Số bit của bitcrush.
-
- 12. Clipping:
- - `--clipping`: Bật/tắt cắt âm thanh.
- - `--clipping_threshold`: Ngưỡng clipping.
-
- 13. Phaser:
- - `--phaser`: Bật/tắt hiệu ứng phaser.
- - `--phaser_rate_hz`, `--phaser_depth`, `--phaser_centre_frequency_hz`, `--phaser_feedback`, `--phaser_mix`: Điều chỉnh hiệu ứng phaser.
-
- 14. Boost bass & treble:
- - `--treble_bass_boost`: Bật/tắt tăng cường âm bass và treble.
- - `--bass_boost_db`, `--bass_boost_frequency`, `--treble_boost_db`, `--treble_boost_frequency`: Các thông số tăng bass và treble.
-
- 15. Fade in & fade out:
- - `--fade_in_out`: Bật/tắt hiệu ứng fade.
- - `--fade_in_duration`, `--fade_out_duration`: Thời gian fade vào/ra.
-
- 16. Kết hợp âm thanh:
- - `--audio_combination`: Bật/tắt ghép nhiều tệp âm thanh.
- - `--audio_combination_input`: Đường dẫn tệp âm thanh bổ sung.
- - `--main_volume`: Âm lượng của âm thanh chính.
- - `--combination_volume`:: Âm lượng của âm thanh cần kết hợp.
- """)
- quit()
-elif argv_is_allows[9] in argv:
- print("""Các tham số của --convert:
- 1. Cấu hình xử lí giọng nói:
- - `--pitch` (mặc định: `0`): Điều chỉnh cao độ.
- - `--filter_radius` (mặc định: `3`): Độ mượt của đường F0.
- - `--index_rate` (mặc định: `0.5`): Tỷ lệ sử dụng chỉ mục giọng nói.
- - `--rms_mix_rate` (mặc định: `1`): Hệ số điều chỉnh biên độ âm lượng.
- - `--protect` (mặc định: `0.33`): Bảo vệ phụ âm.
- - `--hop_length` (mặc định: `64`): Bước nhảy khi xử lí âm thanh.
-
- 2. Cấu hình F0:
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
- - `--f0_file` (mặc định: ``): Đường dẫn tệp F0 có sẵn.
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
- - `--proposal_pitch` (mặc định: `False`): Đề xuất cao độ thay vì điều chỉnh thủ công.
- - `--proposal_pitch_threshold` (mặc định: `0.0`): Ngưỡng tần số ước tính cao độ.
- - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid.
-
- 3. Mô hình nhúng:
- - `--embedder_model` (mặc định: `hubert_base`): Mô hình nhúng sử dụng.
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`).
-
- 4. Đường dẫn tệp:
- - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
- - `--output_path` (mặc định: `./audios/output.wav`): Đường dẫn lưu tệp đầu ra.
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp.
- - `--pth_path` (bắt buộc): Đường dẫn đến tệp mô hình `.pth`.
- - `--index_path` (mặc định: `None`): Đường dẫn tệp chỉ mục (nếu có).
-
- 5. Làm sạch âm thanh:
- - `--clean_audio` (mặc định: `False`): Có áp dụng làm sạch âm thanh không.
- - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch.
-
- 6. Resampling & chia nhỏ âm thanh:
- - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (0 nghĩa là giữ nguyên).
- - `--split_audio` (mặc định: `False`): Có chia nhỏ audio trước khi xử lí không.
-
- 7. Kiểm tra & tối ưu hóa:
- - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM.
-
- 8. Dịch formant:
- - `--formant_shifting` (mặc định: `False`): Có bật hiệu ứng dịch formant không.
- - `--formant_qfrency` (mặc định: `0.8`): Hệ số dịch formant theo tần số.
- - `--formant_timbre` (mặc định: `0.8`): Hệ số thay đổi màu sắc giọng.
- """)
- quit()
-elif argv_is_allows[10] in argv:
- print("""Các tham số của --create_dataset:
- 1. Đường dẫn & cấu hình dataset:
- - `--input_data` (bắt buộc): Đường dẫn liên kết đến âm thanh (Liên kết Youtube, có thể dùng dấu `,` để dùng nhiều liên kết).
- - `--output_dirs` (mặc định: `./dataset`): Thư mục xuất dữ liệu đầu ra.
- - `--sample_rate` (mặc định: `48000`): Tần số lấy mẫu cho âm thanh.
-
- 2. Làm sạch dữ liệu:
- - `--clean_dataset` (mặc định: `False`): Có áp dụng làm sạch dữ liệu hay không.
- - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch dữ liệu.
-
- 3. Tách giọng & hiệu ứng:
- - `--separate` (mặc định: `True`): có tách nhạc hay không.
- - `--separator_reverb` (mặc định: `False`): Có tách vang giọng không.
- - `--model_name` (mặc định: `MDXNET_Main`): Mô hình tách nhạc ('Main_340', 'Main_390', 'Main_406', 'Main_427', 'Main_438', 'Inst_full_292', 'Inst_HQ_1', 'Inst_HQ_2', 'Inst_HQ_3', 'Inst_HQ_4', 'Inst_HQ_5', 'Kim_Vocal_1', 'Kim_Vocal_2', 'Kim_Inst', 'Inst_187_beta', 'Inst_82_beta', 'Inst_90_beta', 'Voc_FT', 'Crowd_HQ', 'MDXNET_9482', 'Inst_1', 'Inst_2', 'Inst_3', 'MDXNET_1_9703', 'MDXNET_2_9682', 'MDXNET_3_9662', 'Inst_Main', 'MDXNET_Main', 'HT-Tuned', 'HT-Normal', 'HD_MMI', 'HT_6S', 'HP-1', 'HP-2', 'HP-Vocal-1', 'HP-Vocal-2', 'HP2-1', 'HP2-2', 'HP2-3', 'SP-2B-1', 'SP-2B-2', 'SP-3B-1', 'SP-4B-1', 'SP-4B-2', 'SP-MID-1', 'SP-MID-2').
- - `--reverb_model` (mặc định: `MDX-Reverb`): Mô hình tách nhạc ("MDX-Reverb", 'VR-Reverb', 'Echo-Aggressive', 'Echo-Normal').
- - `--denoise_model` (mặc định: `Normal`): Mô hình tách nhạc ('Lite', 'Normal').
-
- 4. Cấu hình xử lí âm thanh:
- - `--shifts` (mặc định: `2`): Số lượng dự đoán.
- - `--batch_size` (mặc định: `1`): Kích thước lô.
- - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn.
- - `--aggression` (mặc định: `5`): Cường độ chiết xuất thân chính.
- - `--hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lí.
- - `--window_size` (mặc định: `512`): Kích thước cửa sổ.
- - `--segments_size` (mặc định: `256`): Kích thước phân đoạn âm thanh.
- - `--post_process_threshold` (mặc định: `0.2`): Mức độ xử lí hậu kỳ sau khi tách nhạc.
-
- 5. Cấu hình xử lí âm thanh khác:
- - `--enable_tta` (mặc định: `False`): Tăng cường suy luận.
- - `--enable_denoise` (mặc định: `False`): Khữ tách nhạc.
- - `--high_end_process` (mặc định: `False`): Xử lí dải cao.
- - `--enable_post_process` (mặc định: `False`): Hậu xử lí.
-
- 6. Bỏ qua phần âm thanh:
- - `--skip_seconds` (mặc định: `False`): Có bỏ qua giây âm thanh nào không.
- - `--skip_start_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở đầu audio.
- - `--skip_end_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở cuối audio.
- """)
- quit()
-elif argv_is_allows[11] in argv:
- print("""Các tham số của --create_index:
- 1. Thông tin mô hình:
- - `--model_name` (bắt buộc): Tên mô hình.
- - `--rvc_version` (mặc định: `v2`): Phiên bản (`v1`, `v2`).
- - `--index_algorithm` (mặc định: `Auto`): Thuật toán index sử dụng (`Auto`, `Faiss`, `KMeans`).
- """)
- quit()
-elif argv_is_allows[12] in argv:
- print("""Các tham số của --extract:
- 1. Thông tin mô hình:
- - `--model_name` (bắt buộc): Tên mô hình.
- - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
-
- 2. Cấu hình F0:
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
- - `--pitch_guidance` (mặc định: `True`): Có sử dụng hướng dẫn cao độ hay không.
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
- - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid.
-
- 3. Cấu hình xử lí:
- - `--hop_length` (mặc định: `128`): Độ dài bước nhảy trong quá trình xử lí.
- - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng.
- - `--gpu` (mặc định: `-`): Chỉ định GPU sử dụng (ví dụ: `0` cho GPU đầu tiên, `-` để tắt GPU).
- - `--sample_rate` (bắt buộc): Tần số lấy mẫu của âm thanh đầu vào.
-
- 4. Cấu hình nhúng:
- - `--embedder_model` (mặc định: `hubert_base`): Tên mô hình nhúng.
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`).
-
- 4. RMS:
- - `--rms_extract` (mặc định: False): Trích xuất thêm năng lượng rms.
- """)
- quit()
-elif argv_is_allows[13] in argv:
- print("""Các tham số của --preprocess:
- 1. Thông tin mô hình:
- - `--model_name` (bắt buộc): Tên mô hình.
-
- 2. Cấu hình dữ liệu:
- - `--dataset_path` (mặc định: `./dataset`): Đường dẫn thư mục chứa tệp dữ liệu.
- - `--sample_rate` (bắt buộc): Tần số lấy mẫu của dữ liệu âm thanh.
-
- 3. Cấu hình xử lí:
- - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng.
- - `--cut_preprocess` (mặc định: `Automatic`): Cách cắt dữ liệu tiền xử lí (`Automatic`, `Simple`, `Skip`).
- - `--process_effects` (mặc định: `False`): Có áp dụng tiền xử lí hay không.
- - `--clean_dataset` (mặc định: `False`): Có làm sạch tệp dữ liệu hay không.
- - `--clean_strength` (mặc định: `0.7`): Độ mạnh của quá trình làm sạch dữ liệu.
-
- 4. Cấu hình khác:
- - `--chunk_len` (mặc định: `3.0`): Độ dài của đoạn âm thanh cho phương pháp 'Simple'.
- - `--overlap_len` (mặc định: `0.3`): Độ dài của phần chồng chéo giữa các lát cắt đối với phương pháp 'Simple'.
- - `--normalization_mode` (mặc định: `none`): Có xử lí chuẩn hóa âm thanh không (`none`, `pre`, `post`)
- """)
- quit()
-# Help for the music separation command. The header now names `--separator_music`, the flag the
-# parser actually accepts; it used to print `--separate_music`, which is rejected as invalid syntax.
-elif argv_is_allows[14] in argv:
- print("""Các tham số của --separator_music:
- 1. Cấu hình đầu vào, đầu ra:
- - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
- - `--output_dirs` (mặc định: `./audios`): Thư mục lưu tệp đầu ra.
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`,...).
- - `--sample_rate` (mặc định: `44100`): Tần số lấy mẫu của âm thanh đầu ra.
-
- 2. Cấu hình mô hình:
- - `--model_name` (mặc định: `MDXNET_Main`): Mô hình tách nhạc ('Main_340', 'Main_390', 'Main_406', 'Main_427', 'Main_438', 'Inst_full_292', 'Inst_HQ_1', 'Inst_HQ_2', 'Inst_HQ_3', 'Inst_HQ_4', 'Inst_HQ_5', 'Kim_Vocal_1', 'Kim_Vocal_2', 'Kim_Inst', 'Inst_187_beta', 'Inst_82_beta', 'Inst_90_beta', 'Voc_FT', 'Crowd_HQ', 'MDXNET_9482', 'Inst_1', 'Inst_2', 'Inst_3', 'MDXNET_1_9703', 'MDXNET_2_9682', 'MDXNET_3_9662', 'Inst_Main', 'MDXNET_Main', 'HT-Tuned', 'HT-Normal', 'HD_MMI', 'HT_6S', 'HP-1', 'HP-2', 'HP-Vocal-1', 'HP-Vocal-2', 'HP2-1', 'HP2-2', 'HP2-3', 'SP-2B-1', 'SP-2B-2', 'SP-3B-1', 'SP-4B-1', 'SP-4B-2', 'SP-MID-1', 'SP-MID-2').
- - `--karaoke_model` (mặc định: `MDX-Version-1`): Mô hình tách nhạc ('MDX-Version-1', 'MDX-Version-2', 'VR-Version-1', 'VR-Version-2').
- - `--reverb_model` (mặc định: `MDX-Reverb`): Mô hình tách nhạc ("MDX-Reverb", 'VR-Reverb', 'Echo-Aggressive', 'Echo-Normal').
- - `--denoise_model` (mặc định: `Normal`): Mô hình tách nhạc ('Lite', 'Normal').
-
- 3. Cấu hình xử lí âm thanh:
- - `--shifts` (mặc định: `2`): Số lượng dự đoán.
- - `--batch_size` (mặc định: `1`): Kích thước lô.
- - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn.
- - `--aggression` (mặc định: `5`): Cường độ chiết xuất thân chính.
- - `--hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lí.
- - `--window_size` (mặc định: `512`): Kích thước cửa sổ.
- - `--segments_size` (mặc định: `256`): Kích thước phân đoạn âm thanh.
- - `--post_process_threshold` (mặc định: `0.2`): Mức độ xử lí hậu kỳ sau khi tách nhạc.
-
- 4. Cấu hình xử lí âm thanh khác:
- - `--enable_tta` (mặc định: `False`): Tăng cường suy luận.
- - `--enable_denoise` (mặc định: `False`): Khữ tách nhạc.
- - `--high_end_process` (mặc định: `False`): Xử lí dải cao.
- - `--enable_post_process` (mặc định: `False`): Hậu xử lí.
- - `--separate_backing` (mặc định: `False`): Tách bè giọng.
- - `--separate_reverb` (mặc định: `False`): Tách vang giọng.
- """)
- quit()
-elif argv_is_allows[15] in argv:
- print("""Các tham số của --train:
- 1. Cấu hình mô hình:
- - `--model_name` (bắt buộc): Tên mô hình.
- - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
- - `--model_author` (tùy chọn): Tác giả của mô hình.
-
- 2. Cấu hình lưu:
- - `--save_every_epoch` (bắt buộc): Số kỷ nguyên giữa mỗi lần lưu.
- - `--save_only_latest` (mặc định: `True`): Chỉ lưu điểm mới nhất.
- - `--save_every_weights` (mặc định: `True`): Lưu tất cả trọng số của mô hình.
-
- 3. Cấu hình huấn luyện:
- - `--total_epoch` (mặc định: `300`): Tổng số kỷ nguyên huấn luyện.
- - `--batch_size` (mặc định: `8`): Kích thước lô trong quá trình huấn luyện.
-
- 4. Cấu hình thiết bị:
- - `--gpu` (mặc định: `0`): Chỉ định GPU để sử dụng (số hoặc `-` nếu không dùng GPU).
- - `--cache_data_in_gpu` (mặc định: `False`): Lưu dữ liệu vào GPU để tăng tốc.
-
- 5. Cấu hình huấn luyện nâng cao:
- - `--pitch_guidance` (mặc định: `True`): Sử dụng hướng dẫn cao độ.
- - `--g_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số G đã huấn luyện trước.
- - `--d_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số D đã huấn luyện trước.
- - `--vocoder` (mặc định: `Default`): Bộ mã hóa được sử dụng (`Default`, `MRF-HiFi-GAN`, `RefineGAN`).
- - `--energy_use` (mặc định: `False`): Sử dụng năng lượng rms.
-
- 6. Phát hiện huấn luyện quá mức:
- - `--overtraining_detector` (mặc định: `False`): Bật/tắt chế độ phát hiện huấn luyện quá mức.
- - `--overtraining_threshold` (mặc định: `50`): Ngưỡng để xác định huấn luyện quá mức.
-
- 7. Xử lí dữ liệu:
- - `--cleanup` (mặc định: `False`): Dọn dẹp tệp huấn luyện cũ để tiến hành huấn luyện lại từ đầu.
-
- 8. Tối ưu:
- - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM.
- - `--deterministic` (mặc định: `False`): Khi bật sẽ sử dụng các thuật toán có tính xác định cao, đảm bảo rằng mỗi lần chạy cùng một dữ liệu đầu vào sẽ cho kết quả giống nhau.
- - `--benchmark` (mặc định: `False`): Khi bật sẽ thử nghiệm và chọn thuật toán tối ưu nhất cho phần cứng và kích thước cụ thể.
- - `--optimizer` (mặc định: `AdamW`): Trình tối ưu hóa được sử dụng (`AdamW`, `RAdam`, `AnyPrecisionAdamW`).
- - `--multiscale_mel_loss` (mặc định: `False`): So sánh phổ Mel của âm thanh thật và âm thanh giả ở nhiều thang độ khác nhau. Giúp mô hình học được chi tiết âm sắc, độ sáng và cấu trúc tần số tốt hơn, từ đó cải thiện chất lượng và độ tự nhiên của giọng nói đầu ra.
-
- 9. Bộ tham chiếu:
- - `--use_custom_reference` (mặc định: `False`): Có tùy chỉnh bộ tham chiếu hay không.
- - `--reference_path` (mặc định: `False`): Đường dẫn đến bộ tham chiếu.
- """)
- quit()
-elif argv_is_allows[18] in argv:
- print("""Các tham số của --create_reference:
- 1. Đường dẫn tệp:
- - `--audio_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
- - `--reference_name` (mặc định: `reference`): Đường dẫn lưu bộ tham chiếu đầu ra.
-
- 2. Cấu hình bộ tham chiếu:
- - `--pitch_guidance` (mặc định: `True`): Sử dụng hướng dẫn cao độ.
- - `--energy_use` (mặc định: `False`): Sử dụng năng lượng rms.
- - `--version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
-
- 3. Cấu hình nhúng:
- - `--embedder_model` (mặc định: `hubert_base`): Tên mô hình nhúng.
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`).
-
- 4. Cấu hình F0:
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
- - `--f0_up_key` (mặc định: `0`): Điều chỉnh cao độ.
- - `--filter_radius` (mặc định: `3`): Độ mượt của đường F0.
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
- - `--f0_file` (mặc định: ``): Đường dẫn tệp F0 có sẵn.
- - `--proposal_pitch` (mặc định: `False`): Đề xuất cao độ thay vì điều chỉnh thủ công.
- - `--proposal_pitch_threshold` (mặc định: `0.0`): Ngưỡng tần số ước tính cao độ.
- - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid.
- """)
- quit()
-elif argv_is_allows[16] in argv:
- print("""Sử dụng:
- 1. `--help_audio_effects`: Trợ giúp về phần thêm hiệu ứng âm thanh.
- 2. `--help_convert`: Trợ giúp về chuyển đổi âm thanh.
- 3. `--help_create_dataset`: Trợ giúp về tạo dữ liệu huấn luyện.
- 4. `--help_create_index`: Trợ giúp về tạo chỉ mục.
- 5. `--help_extract`: Trợ giúp về trích xuất dữ liệu huấn luyện.
- 6. `--help_preprocess`: Trợ giúp về xử lí trước dữ liệu.
- 7. `--help_separate_music`: Trợ giúp về tách nhạc.
- 8. `--help_train`: Trợ giúp về huấn luyện mô hình.
- 9. `--help_create_reference`: Trợ giúp về tạo bộ tham chiếu.
- """)
- quit()
-
-if __name__ == "__main__":
-    import torch.multiprocessing as mp
-
-    # Training sets the "spawn" start method without forcing it; preprocess/extract force it.
-    if "--train" in argv:
-        mp.set_start_method("spawn")
-
-    if any(flag in argv for flag in ("--preprocess", "--extract")):
-        mp.set_start_method("spawn", force=True)
-
-    main()
\ No newline at end of file
diff --git a/main/app/run_tensorboard.py b/main/app/run_tensorboard.py
deleted file mode 100644
index 56fb927d6744eeb276902267e4297695dfb7acbd..0000000000000000000000000000000000000000
--- a/main/app/run_tensorboard.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-import sys
-import time
-import logging
-import warnings
-import webbrowser
-
-from tensorboard import program
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import config, translations, logger
-
-def launch_tensorboard():
-    """Start TensorBoard on the configured logs path and port and return a message with its URL.
-
-    Warnings are suppressed and the "root" and "tensorboard" loggers are set
-    to ERROR before launching. The URL is logged, and opened in a web browser
-    when ``--open`` is among the command-line arguments.
-    """
-    warnings.filterwarnings("ignore")
-
-    for noisy_logger in ("root", "tensorboard"):
-        logging.getLogger(noisy_logger).setLevel(logging.ERROR)
-
-    board = program.TensorBoard()
-    board.configure(argv=[None, "--logdir", config.configs["logs_path"], f"--port={config.configs['tensorboard_port']}"])
-    url = board.launch()
-
-    message = f"{translations['tensorboard_url']}: {url}"
-    logger.info(message)
-
-    if "--open" in sys.argv:
-        webbrowser.open(url)
-
-    return message
-
-if __name__ == "__main__":
-    launch_tensorboard()
-
-    # Sleep forever after launching - presumably so the server started above keeps running.
-    while True:
-        time.sleep(5)
\ No newline at end of file
diff --git a/main/app/tabs/downloads/downloads.py b/main/app/tabs/downloads/downloads.py
deleted file mode 100644
index eaaf06b1f31d831369a3b9452feb5b0d00b6cd71..0000000000000000000000000000000000000000
--- a/main/app/tabs/downloads/downloads.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations, configs, models, model_options
-from main.app.core.downloads import download_model, search_models, download_pretrained_model
-from main.app.core.ui import change_download_choices, change_download_pretrained_choices, shutil_move
-from main.app.core.process import fetch_pretrained_data, save_drop_model, update_sample_rate_dropdown
-
-def download_tab():
- """Build the downloads tab: model download controls, pretrained download controls and their events.
-
- The tab's visibility follows ``configs["downloads_tab"]`` (default ``True``).
- """
- with gr.TabItem(translations["downloads"], visible=configs.get("downloads_tab", True)):
- gr.Markdown(translations["download_markdown"])
- with gr.Row():
- gr.Markdown(translations["download_markdown_2"])
- with gr.Row():
- # Model download: by URL, from the `models` mapping, by search, or by file upload.
- # Only the URL controls start visible; the other groups are created with visible=False.
- with gr.Accordion(translations["model_download"], open=True):
- with gr.Row():
- downloadmodel = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["download_from_csv"], translations["search_models"], translations["upload"]], interactive=True, value=translations["download_url"])
- with gr.Row():
- gr.Markdown("___")
- with gr.Column():
- with gr.Row():
- url_input = gr.Textbox(label=translations["model_url"], value="", placeholder="https://...", scale=6)
- download_model_name = gr.Textbox(label=translations["modelname"], value="", placeholder=translations["modelname"], scale=2)
- url_download = gr.Button(value=translations["downloads"], scale=2)
- with gr.Column():
- model_browser = gr.Dropdown(choices=models.keys(), label=translations["model_warehouse"], scale=8, allow_custom_value=True, visible=False)
- download_from_browser = gr.Button(value=translations["get_model"], scale=2, variant="primary", visible=False)
- with gr.Column():
- search_name = gr.Textbox(label=translations["name_to_search"], placeholder=translations["modelname"], interactive=True, scale=8, visible=False)
- search = gr.Button(translations["search_2"], scale=2, visible=False)
- search_dropdown = gr.Dropdown(label=translations["select_download_model"], value="", choices=[], allow_custom_value=True, interactive=False, visible=False)
- download = gr.Button(translations["downloads"], variant="primary", visible=False)
- with gr.Column():
- model_upload = gr.Files(label=translations["drop_model"], file_types=[".pth", ".onnx", ".index", ".zip"], visible=False)
- with gr.Row():
- # Pretrained download: by a D/G URL pair, from the list returned by fetch_pretrained_data(), or by upload.
- with gr.Accordion(translations["download_pretrained_2"], open=False):
- with gr.Row():
- pretrain_download_choices = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["list_model"], translations["upload"]], value=translations["download_url"], interactive=True)
- with gr.Row():
- gr.Markdown("___")
- with gr.Column():
- with gr.Row():
- pretrainD = gr.Textbox(label=translations["pretrained_url"].format(dg="D"), value="", placeholder="https://...", interactive=True, scale=4)
- pretrainG = gr.Textbox(label=translations["pretrained_url"].format(dg="G"), value="", placeholder="https://...", interactive=True, scale=4)
- download_pretrain_button = gr.Button(translations["downloads"], scale=2)
- with gr.Column():
- with gr.Row():
- pretrain_choices = gr.Dropdown(label=translations["select_pretrain"], info=translations["select_pretrain_info"], choices=list(fetch_pretrained_data().keys()), value="Titan_Medium", allow_custom_value=True, interactive=True, scale=6, visible=False)
- sample_rate_pretrain = gr.Dropdown(label=translations["pretrain_sr"], info=translations["pretrain_sr"], choices=["48k", "40k", "32k"], value="48k", interactive=True, visible=False)
- download_pretrain_choices_button = gr.Button(translations["downloads"], scale=2, variant="primary", visible=False)
- with gr.Row():
- pretrain_upload = gr.Files(label=translations["drop_pretrain"].format(dg="G, D"), file_types=[".pth"], visible=False)
- # Event wiring for the model download controls.
- with gr.Row():
- url_download.click(
- fn=download_model,
- inputs=[
- url_input,
- download_model_name
- ],
- outputs=[url_input],
- api_name="download_model"
- )
- # The selected name is looked up in `models`; the looked-up value and the name go to download_model.
- download_from_browser.click(
- fn=lambda model: download_model(models[model], model),
- inputs=[model_browser],
- outputs=[model_browser],
- api_name="download_browser"
- )
- with gr.Row():
- # Changing the radio updates all ten model-download components through change_download_choices.
- downloadmodel.change(fn=change_download_choices, inputs=[downloadmodel], outputs=[url_input, download_model_name, url_download, model_browser, download_from_browser, search_name, search, search_dropdown, download, model_upload])
- search.click(fn=search_models, inputs=[search_name], outputs=[search_dropdown, download])
- model_upload.upload(fn=save_drop_model, inputs=[model_upload], outputs=[model_upload])
- # Same pattern as above, but the lookup table is `model_options`.
- download.click(
- fn=lambda model: download_model(model_options[model], model),
- inputs=[search_dropdown],
- outputs=[search_dropdown],
- api_name="search_models"
- )
- # Event wiring for the pretrained download controls.
- with gr.Row():
- pretrain_download_choices.change(fn=change_download_pretrained_choices, inputs=[pretrain_download_choices], outputs=[pretrainD, pretrainG, download_pretrain_button, pretrain_choices, sample_rate_pretrain, download_pretrain_choices_button, pretrain_upload])
- pretrain_choices.change(fn=update_sample_rate_dropdown, inputs=[pretrain_choices], outputs=[sample_rate_pretrain])
- with gr.Row():
- # Both buttons call download_pretrained_model with the radio value first; the remaining
- # two inputs are either the D/G URLs or the chosen list entry and sample rate.
- download_pretrain_button.click(
- fn=download_pretrained_model,
- inputs=[
- pretrain_download_choices,
- pretrainD,
- pretrainG
- ],
- outputs=[pretrainD, pretrainG],
- api_name="download_pretrain_link"
- )
- download_pretrain_choices_button.click(
- fn=download_pretrained_model,
- inputs=[
- pretrain_download_choices,
- pretrain_choices,
- sample_rate_pretrain
- ],
- outputs=[pretrain_choices],
- api_name="download_pretrain_choices"
- )
- # Each uploaded file's path is passed to shutil_move together with configs["pretrained_custom_path"].
- pretrain_upload.upload(
- fn=lambda pretrain_upload: [shutil_move(pretrain.name, configs["pretrained_custom_path"]) for pretrain in pretrain_upload],
- inputs=[pretrain_upload],
- outputs=[],
- api_name="upload_pretrain"
- )
\ No newline at end of file
diff --git a/main/app/tabs/editing/child/audio_effects.py b/main/app/tabs/editing/child/audio_effects.py
deleted file mode 100644
index 370d117cdac347912c6a29a62d1a7587dd8050eb..0000000000000000000000000000000000000000
--- a/main/app/tabs/editing/child/audio_effects.py
+++ /dev/null
@@ -1,393 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.editing import audio_effects
-from main.app.core.presets import audio_effect_load_presets, audio_effect_save_presets
-from main.app.core.ui import visible, change_audios_choices, change_effect_preset_choices, shutil_move
-from main.app.variables import translations, paths_for_files, sample_rate_choice, audio_effect_presets_file, configs, file_types, export_format_choices
-
-def audio_effects_tab():
- with gr.Row():
- gr.Markdown(translations["audio_effects_edit"])
- with gr.Row():
- with gr.Column():
- with gr.Row():
- reverb_check_box = gr.Checkbox(label=translations["reverb"], value=False, interactive=True)
- chorus_check_box = gr.Checkbox(label=translations["chorus"], value=False, interactive=True)
- delay_check_box = gr.Checkbox(label=translations["delay"], value=False, interactive=True)
- phaser_check_box = gr.Checkbox(label=translations["phaser"], value=False, interactive=True)
- compressor_check_box = gr.Checkbox(label=translations["compressor"], value=False, interactive=True)
- more_options = gr.Checkbox(label=translations["more_option"], value=False, interactive=True)
- with gr.Row():
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Row():
- upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
- with gr.Row():
- audio_in_path = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True)
- audio_out_path = gr.Textbox(label=translations["output_audio"], value="audios/audio_effects.wav", placeholder="audios/audio_effects.wav", info=translations["provide_output"], interactive=True)
- with gr.Row():
- with gr.Column():
- audio_combination = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True)
- audio_combination_input = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True, visible=audio_combination.value)
- with gr.Row():
- main_vol = gr.Slider(minimum=-80, maximum=80, label=translations["main_volume"], info=translations["main_volume_info"], value=-4, step=1, interactive=True, visible=audio_combination.value)
- combine_vol = gr.Slider(minimum=-80, maximum=80, label=translations["combination_volume"], info=translations["combination_volume_info"], value=-7, step=1, interactive=True, visible=audio_combination.value)
- with gr.Row():
- audio_effects_refresh = gr.Button(translations["refresh"])
- with gr.Row():
- audio_output_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
- with gr.Row():
- with gr.Accordion(translations["use_presets"], open=False):
- with gr.Row():
- presets_name = gr.Dropdown(label=translations["file_preset"], choices=audio_effect_presets_file, value=audio_effect_presets_file[0] if len(audio_effect_presets_file) > 0 else '', interactive=True, allow_custom_value=True)
- with gr.Row():
- load_click = gr.Button(translations["load_file"], variant="primary")
- refresh_click = gr.Button(translations["refresh"])
- with gr.Accordion(translations["export_file"], open=False):
- with gr.Row():
- with gr.Column():
- name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
- save_file_button = gr.Button(translations["export_file"])
- with gr.Row():
- upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".effect.json"])
- with gr.Row():
- apply_effects_button = gr.Button(translations["apply"], variant="primary", scale=2)
- with gr.Row():
- with gr.Column():
- with gr.Row():
- with gr.Accordion(translations["reverb"], open=False, visible=reverb_check_box.value) as reverb_accordion:
- reverb_freeze_mode = gr.Checkbox(label=translations["reverb_freeze"], info=translations["reverb_freeze_info"], value=False, interactive=True)
- reverb_room_size = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.15, label=translations["room_size"], info=translations["room_size_info"], interactive=True)
- reverb_damping = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.7, label=translations["damping"], info=translations["damping_info"], interactive=True)
- reverb_wet_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label=translations["wet_level"], info=translations["wet_level_info"], interactive=True)
- reverb_dry_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.8, label=translations["dry_level"], info=translations["dry_level_info"], interactive=True)
- reverb_width = gr.Slider(minimum=0, maximum=1, step=0.01, value=1, label=translations["width"], info=translations["width_info"], interactive=True)
- with gr.Row():
- with gr.Accordion(translations["chorus"], open=False, visible=chorus_check_box.value) as chorus_accordion:
- chorus_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_depth"], info=translations["chorus_depth_info"], interactive=True)
- chorus_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.5, label=translations["chorus_rate_hz"], info=translations["chorus_rate_hz_info"], interactive=True)
- chorus_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_mix"], info=translations["chorus_mix_info"], interactive=True)
- chorus_centre_delay_ms = gr.Slider(minimum=0, maximum=50, step=1, value=10, label=translations["chorus_centre_delay_ms"], info=translations["chorus_centre_delay_ms_info"], interactive=True)
- chorus_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["chorus_feedback"], info=translations["chorus_feedback_info"], interactive=True)
- with gr.Row():
- with gr.Accordion(translations["delay"], open=False, visible=delay_check_box.value) as delay_accordion:
- delay_second = gr.Slider(minimum=0, maximum=5, step=0.01, value=0.5, label=translations["delay_seconds"], info=translations["delay_seconds_info"], interactive=True)
- delay_feedback = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_feedback"], info=translations["delay_feedback_info"], interactive=True)
- delay_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_mix"], info=translations["delay_mix_info"], interactive=True)
- with gr.Column():
- with gr.Row():
- with gr.Accordion(translations["more_option"], open=False, visible=more_options.value) as more_accordion:
- with gr.Row():
- fade = gr.Checkbox(label=translations["fade"], value=False, interactive=True)
- bass_or_treble = gr.Checkbox(label=translations["bass_or_treble"], value=False, interactive=True)
- limiter = gr.Checkbox(label=translations["limiter"], value=False, interactive=True)
- resample_checkbox = gr.Checkbox(label=translations["resample"], value=False, interactive=True)
- with gr.Row():
- distortion_checkbox = gr.Checkbox(label=translations["distortion"], value=False, interactive=True)
- gain_checkbox = gr.Checkbox(label=translations["gain"], value=False, interactive=True)
- bitcrush_checkbox = gr.Checkbox(label=translations["bitcrush"], value=False, interactive=True)
- clipping_checkbox = gr.Checkbox(label=translations["clipping"], value=False, interactive=True)
- with gr.Accordion(translations["fade"], open=True, visible=fade.value) as fade_accordion:
- with gr.Row():
- fade_in = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_in"], info=translations["fade_in_info"], interactive=True)
- fade_out = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_out"], info=translations["fade_out_info"], interactive=True)
- with gr.Accordion(translations["bass_or_treble"], open=True, visible=bass_or_treble.value) as bass_treble_accordion:
- with gr.Row():
- bass_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["bass_boost"], info=translations["bass_boost_info"], interactive=True)
- bass_frequency = gr.Slider(minimum=20, maximum=200, step=10, value=100, label=translations["bass_frequency"], info=translations["bass_frequency_info"], interactive=True)
- with gr.Row():
- treble_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["treble_boost"], info=translations["treble_boost_info"], interactive=True)
- treble_frequency = gr.Slider(minimum=1000, maximum=10000, step=500, value=3000, label=translations["treble_frequency"], info=translations["treble_frequency_info"], interactive=True)
- with gr.Accordion(translations["limiter"], open=True, visible=limiter.value) as limiter_accordion:
- with gr.Row():
- limiter_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["limiter_threshold_db"], info=translations["limiter_threshold_db_info"], interactive=True)
- limiter_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["limiter_release_ms"], info=translations["limiter_release_ms_info"], interactive=True)
- with gr.Column():
- pitch_shift_semitones = gr.Slider(minimum=-20, maximum=20, step=1, value=0, label=translations["pitch"], info=translations["pitch_info"], interactive=True)
- audio_effect_resample_sr = gr.Radio(choices=[0]+sample_rate_choice, value=0, label=translations["resample"], info=translations["resample_info"], interactive=True, visible=resample_checkbox.value)
- distortion_drive_db = gr.Slider(minimum=0, maximum=50, step=1, value=20, label=translations["distortion"], info=translations["distortion_info"], interactive=True, visible=distortion_checkbox.value)
- gain_db = gr.Slider(minimum=-60, maximum=60, step=1, value=0, label=translations["gain"], info=translations["gain_info"], interactive=True, visible=gain_checkbox.value)
- clipping_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["clipping_threshold_db"], info=translations["clipping_threshold_db_info"], interactive=True, visible=clipping_checkbox.value)
- bitcrush_bit_depth = gr.Slider(minimum=1, maximum=24, step=1, value=16, label=translations["bitcrush_bit_depth"], info=translations["bitcrush_bit_depth_info"], interactive=True, visible=bitcrush_checkbox.value)
- with gr.Row():
- with gr.Accordion(translations["phaser"], open=False, visible=phaser_check_box.value) as phaser_accordion:
- phaser_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_depth"], info=translations["phaser_depth_info"], interactive=True)
- phaser_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1, label=translations["phaser_rate_hz"], info=translations["phaser_rate_hz_info"], interactive=True)
- phaser_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_mix"], info=translations["phaser_mix_info"], interactive=True)
- phaser_centre_frequency_hz = gr.Slider(minimum=50, maximum=5000, step=10, value=1000, label=translations["phaser_centre_frequency_hz"], info=translations["phaser_centre_frequency_hz_info"], interactive=True)
- phaser_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["phaser_feedback"], info=translations["phaser_feedback_info"], interactive=True)
- with gr.Row():
- with gr.Accordion(translations["compressor"], open=False, visible=compressor_check_box.value) as compressor_accordion:
- compressor_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-20, label=translations["compressor_threshold_db"], info=translations["compressor_threshold_db_info"], interactive=True)
- compressor_ratio = gr.Slider(minimum=1, maximum=20, step=0.1, value=1, label=translations["compressor_ratio"], info=translations["compressor_ratio_info"], interactive=True)
- compressor_attack_ms = gr.Slider(minimum=0.1, maximum=100, step=0.1, value=10, label=translations["compressor_attack_ms"], info=translations["compressor_attack_ms_info"], interactive=True)
- compressor_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["compressor_release_ms"], info=translations["compressor_release_ms_info"], interactive=True)
- with gr.Row():
- gr.Markdown(translations["output_audio"])
- with gr.Row():
- audio_play_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- audio_play_output = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
- with gr.Row():
- reverb_check_box.change(fn=visible, inputs=[reverb_check_box], outputs=[reverb_accordion])
- chorus_check_box.change(fn=visible, inputs=[chorus_check_box], outputs=[chorus_accordion])
- delay_check_box.change(fn=visible, inputs=[delay_check_box], outputs=[delay_accordion])
- with gr.Row():
- compressor_check_box.change(fn=visible, inputs=[compressor_check_box], outputs=[compressor_accordion])
- phaser_check_box.change(fn=visible, inputs=[phaser_check_box], outputs=[phaser_accordion])
- more_options.change(fn=visible, inputs=[more_options], outputs=[more_accordion])
- with gr.Row():
- fade.change(fn=visible, inputs=[fade], outputs=[fade_accordion])
- bass_or_treble.change(fn=visible, inputs=[bass_or_treble], outputs=[bass_treble_accordion])
- limiter.change(fn=visible, inputs=[limiter], outputs=[limiter_accordion])
- resample_checkbox.change(fn=visible, inputs=[resample_checkbox], outputs=[audio_effect_resample_sr])
- with gr.Row():
- distortion_checkbox.change(fn=visible, inputs=[distortion_checkbox], outputs=[distortion_drive_db])
- gain_checkbox.change(fn=visible, inputs=[gain_checkbox], outputs=[gain_db])
- clipping_checkbox.change(fn=visible, inputs=[clipping_checkbox], outputs=[clipping_threshold_db])
- bitcrush_checkbox.change(fn=visible, inputs=[bitcrush_checkbox], outputs=[bitcrush_bit_depth])
- with gr.Row():
- upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio], outputs=[audio_in_path])
- audio_in_path.change(fn=lambda audio: audio if audio else None, inputs=[audio_in_path], outputs=[audio_play_input])
- audio_effects_refresh.click(fn=lambda a, b: [change_audios_choices(a), change_audios_choices(b)], inputs=[audio_in_path, audio_combination_input], outputs=[audio_in_path, audio_combination_input])
- with gr.Row():
- more_options.change(fn=lambda: [False]*8, inputs=[], outputs=[fade, bass_or_treble, limiter, resample_checkbox, distortion_checkbox, gain_checkbox, clipping_checkbox, bitcrush_checkbox])
- audio_combination.change(fn=visible, inputs=[audio_combination], outputs=[audio_combination_input])
- audio_combination.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[audio_combination], outputs=[main_vol, combine_vol])
- with gr.Row():
- upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name])
- refresh_click.click(fn=change_effect_preset_choices, inputs=[], outputs=[presets_name])
- with gr.Row():
- load_click.click(
- fn=audio_effect_load_presets,
- inputs=[
- presets_name,
- resample_checkbox,
- audio_effect_resample_sr,
- chorus_depth,
- chorus_rate_hz,
- chorus_mix,
- chorus_centre_delay_ms,
- chorus_feedback,
- distortion_drive_db,
- reverb_room_size,
- reverb_damping,
- reverb_wet_level,
- reverb_dry_level,
- reverb_width,
- reverb_freeze_mode,
- pitch_shift_semitones,
- delay_second,
- delay_feedback,
- delay_mix,
- compressor_threshold_db,
- compressor_ratio,
- compressor_attack_ms,
- compressor_release_ms,
- limiter_threshold_db,
- limiter_release_ms,
- gain_db,
- bitcrush_bit_depth,
- clipping_threshold_db,
- phaser_rate_hz,
- phaser_depth,
- phaser_centre_frequency_hz,
- phaser_feedback,
- phaser_mix,
- bass_boost,
- bass_frequency,
- treble_boost,
- treble_frequency,
- fade_in,
- fade_out,
- chorus_check_box,
- distortion_checkbox,
- reverb_check_box,
- delay_check_box,
- compressor_check_box,
- limiter,
- gain_checkbox,
- bitcrush_checkbox,
- clipping_checkbox,
- phaser_check_box,
- bass_or_treble,
- fade
- ],
- outputs=[
- resample_checkbox,
- audio_effect_resample_sr,
- chorus_depth,
- chorus_rate_hz,
- chorus_mix,
- chorus_centre_delay_ms,
- chorus_feedback,
- distortion_drive_db,
- reverb_room_size,
- reverb_damping,
- reverb_wet_level,
- reverb_dry_level,
- reverb_width,
- reverb_freeze_mode,
- pitch_shift_semitones,
- delay_second,
- delay_feedback,
- delay_mix,
- compressor_threshold_db,
- compressor_ratio,
- compressor_attack_ms,
- compressor_release_ms,
- limiter_threshold_db,
- limiter_release_ms,
- gain_db,
- bitcrush_bit_depth,
- clipping_threshold_db,
- phaser_rate_hz,
- phaser_depth,
- phaser_centre_frequency_hz,
- phaser_feedback,
- phaser_mix,
- bass_boost,
- bass_frequency,
- treble_boost,
- treble_frequency,
- fade_in,
- fade_out,
- chorus_check_box,
- distortion_checkbox,
- reverb_check_box,
- delay_check_box,
- compressor_check_box,
- limiter,
- gain_checkbox,
- bitcrush_checkbox,
- clipping_checkbox,
- phaser_check_box,
- bass_or_treble,
- fade
- ],
- )
- save_file_button.click(
- fn=audio_effect_save_presets,
- inputs=[
- name_to_save_file,
- resample_checkbox,
- audio_effect_resample_sr,
- chorus_depth,
- chorus_rate_hz,
- chorus_mix,
- chorus_centre_delay_ms,
- chorus_feedback,
- distortion_drive_db,
- reverb_room_size,
- reverb_damping,
- reverb_wet_level,
- reverb_dry_level,
- reverb_width,
- reverb_freeze_mode,
- pitch_shift_semitones,
- delay_second,
- delay_feedback,
- delay_mix,
- compressor_threshold_db,
- compressor_ratio,
- compressor_attack_ms,
- compressor_release_ms,
- limiter_threshold_db,
- limiter_release_ms,
- gain_db,
- bitcrush_bit_depth,
- clipping_threshold_db,
- phaser_rate_hz,
- phaser_depth,
- phaser_centre_frequency_hz,
- phaser_feedback,
- phaser_mix,
- bass_boost,
- bass_frequency,
- treble_boost,
- treble_frequency,
- fade_in,
- fade_out,
- chorus_check_box,
- distortion_checkbox,
- reverb_check_box,
- delay_check_box,
- compressor_check_box,
- limiter,
- gain_checkbox,
- bitcrush_checkbox,
- clipping_checkbox,
- phaser_check_box,
- bass_or_treble,
- fade
- ],
- outputs=[presets_name]
- )
- with gr.Row():
- apply_effects_button.click(
- fn=audio_effects,
- inputs=[
- audio_in_path,
- audio_out_path,
- resample_checkbox,
- audio_effect_resample_sr,
- chorus_depth,
- chorus_rate_hz,
- chorus_mix,
- chorus_centre_delay_ms,
- chorus_feedback,
- distortion_drive_db,
- reverb_room_size,
- reverb_damping,
- reverb_wet_level,
- reverb_dry_level,
- reverb_width,
- reverb_freeze_mode,
- pitch_shift_semitones,
- delay_second,
- delay_feedback,
- delay_mix,
- compressor_threshold_db,
- compressor_ratio,
- compressor_attack_ms,
- compressor_release_ms,
- limiter_threshold_db,
- limiter_release_ms,
- gain_db,
- bitcrush_bit_depth,
- clipping_threshold_db,
- phaser_rate_hz,
- phaser_depth,
- phaser_centre_frequency_hz,
- phaser_feedback,
- phaser_mix,
- bass_boost,
- bass_frequency,
- treble_boost,
- treble_frequency,
- fade_in,
- fade_out,
- audio_output_format,
- chorus_check_box,
- distortion_checkbox,
- reverb_check_box,
- delay_check_box,
- compressor_check_box,
- limiter,
- gain_checkbox,
- bitcrush_checkbox,
- clipping_checkbox,
- phaser_check_box,
- bass_or_treble,
- fade,
- audio_combination,
- audio_combination_input,
- main_vol,
- combine_vol
- ],
- outputs=[audio_play_output],
- api_name="audio_effects"
- )
\ No newline at end of file
diff --git a/main/app/tabs/editing/child/quirk.py b/main/app/tabs/editing/child/quirk.py
deleted file mode 100644
index f723b1cf61dd7a05d2e630a47ed7130671660bd9..0000000000000000000000000000000000000000
--- a/main/app/tabs/editing/child/quirk.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.editing import apply_voice_quirk
-from main.app.core.ui import change_audios_choices, shutil_move
-from main.app.variables import translations, paths_for_files, configs, file_types, export_format_choices
-
-def quirk_tab():
- with gr.Row():
- gr.Markdown(translations["quirk_markdown"])
- with gr.Row():
- input_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Row():
- quirk_choice = gr.Radio(label=translations["quirk_label"], info=translations["quirk_label_info"], choices=list(translations["quirk_choice"].keys()), interactive=True, value=list(translations["quirk_choice"].keys())[0])
- with gr.Row():
- apply_quirk_button = gr.Button(translations["apply"], variant="primary")
- with gr.Row():
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Row():
- quirk_upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
- with gr.Column():
- quirk_export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
- quirk_input_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- quirk_output_path = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
- with gr.Column():
- quirk_refresh = gr.Button(translations["refresh"])
- with gr.Row():
- output_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
- with gr.Row():
- quirk_upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[quirk_upload_audio], outputs=[quirk_input_path])
- quirk_input_path.change(fn=lambda audio: audio if audio else None, inputs=[quirk_input_path], outputs=[input_audio_play])
- quirk_refresh.click(fn=change_audios_choices, inputs=[quirk_input_path], outputs=[quirk_input_path])
- with gr.Row():
- apply_quirk_button.click(
- fn=apply_voice_quirk,
- inputs=[
- quirk_input_path,
- quirk_choice,
- quirk_output_path,
- quirk_export_format
- ],
- outputs=[output_audio_play],
- api_name="quirk"
- )
\ No newline at end of file
diff --git a/main/app/tabs/editing/editing.py b/main/app/tabs/editing/editing.py
deleted file mode 100644
index 10964204b1e39de7c2d239fdfe959eb6900f6ae9..0000000000000000000000000000000000000000
--- a/main/app/tabs/editing/editing.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import configs, translations
-from main.app.tabs.editing.child.quirk import quirk_tab
-from main.app.tabs.editing.child.audio_effects import audio_effects_tab
-
-def editing_tab():
- with gr.TabItem(translations["editing"], visible=configs.get("editing_tab", True)):
- with gr.TabItem(translations["audio_effects"], visible=configs.get("effects_tab", True)):
- gr.Markdown(translations["apply_audio_effects"])
- audio_effects_tab()
-
- with gr.TabItem(translations["quirk"], visible=configs.get("quirk", True)):
- gr.Markdown(translations["quirk_info"])
- quirk_tab()
\ No newline at end of file
diff --git a/main/app/tabs/extra/child/convert_model.py b/main/app/tabs/extra/child/convert_model.py
deleted file mode 100644
index 410ffd88ab46a829484266b7fd2bf6d6e18743f6..0000000000000000000000000000000000000000
--- a/main/app/tabs/extra/child/convert_model.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import visible, shutil_move
-from main.app.core.model_utils import onnx_export
-from main.app.variables import translations, configs
-
-def convert_model_tab():
- with gr.Row():
- gr.Markdown(translations["pytorch2onnx_markdown"])
- with gr.Row():
- model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"])
- with gr.Row():
- convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2)
- with gr.Row():
- model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True)
- with gr.Row():
- output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False)
- with gr.Row():
- model_pth_upload.upload(fn=lambda model_pth_upload: shutil_move(model_pth_upload.name, configs["weights_path"]), inputs=[model_pth_upload], outputs=[model_pth_path])
- convert_onnx.click(
- fn=onnx_export,
- inputs=[model_pth_path],
- outputs=[output_model2],
- api_name="model_onnx_export"
- )
- convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2])
\ No newline at end of file
diff --git a/main/app/tabs/extra/child/create_srt.py b/main/app/tabs/extra/child/create_srt.py
deleted file mode 100644
index 1fd8c70cc41cc01bdeb7ee9aae5e3c4e54538a71..0000000000000000000000000000000000000000
--- a/main/app/tabs/extra/child/create_srt.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.csrt import create_srt
-from main.app.core.ui import shutil_move, change_audios_choices
-from main.app.variables import translations, file_types, configs, paths_for_files
-
-def create_srt_tab():
- with gr.Row():
- gr.Markdown(translations["create_srt_markdown_2"])
- with gr.Row():
- with gr.Column():
- srt_content = gr.Textbox(label=translations["srt_content"], value="", lines=9, max_lines=9, interactive=False)
- with gr.Column():
- word_timestamps = gr.Checkbox(label=translations["word_timestamps"], info=translations["word_timestamps_info"], value=False, interactive=True)
- model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True)
- with gr.Row():
- convert_button = gr.Button(translations["convert_audio"], variant="primary")
- with gr.Row():
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Column():
- input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- output_file = gr.Textbox(label=translations["srt_output_file"], value="srt/output.srt", placeholder="srt/output.srt", interactive=True)
- with gr.Column():
- refresh = gr.Button(translations["refresh"])
- with gr.Row():
- input_file = gr.Files(label=translations["drop_audio"], file_types=file_types)
- with gr.Row():
- play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Row():
- output_srt = gr.File(label=translations["srt_output_file"], file_types=[".srt"], interactive=False, visible=False)
- with gr.Row():
- input_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input_file], outputs=[input_audio])
- input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[play_audio])
- refresh.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
- with gr.Row():
- convert_button.click(
- fn=create_srt,
- inputs=[
- model_size,
- input_audio,
- output_file,
- word_timestamps
- ],
- outputs=[
- output_srt,
- srt_content
- ],
- api_name="create_srt"
- )
-
-
diff --git a/main/app/tabs/extra/child/f0_extract.py b/main/app/tabs/extra/child/f0_extract.py
deleted file mode 100644
index 3062ee728faa655a89d0866a4a1ed02a0a6547bc..0000000000000000000000000000000000000000
--- a/main/app/tabs/extra/child/f0_extract.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.f0_extract import f0_extract
-from main.app.core.ui import change_audios_choices, unlock_f0, shutil_move
-from main.app.variables import translations, paths_for_files, method_f0, configs, file_types
-
-def f0_extract_tab():
- with gr.Row():
- gr.Markdown(translations["f0_extractor_markdown_2"])
- with gr.Row():
- extractor_button = gr.Button(translations["extract_button"].replace("2. ", ""), variant="primary")
- with gr.Row():
- with gr.Column():
- upload_audio_file = gr.Files(label=translations["drop_audio"], file_types=file_types)
- audioplay = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Column():
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)
- with gr.Accordion(translations["audio_path"], open=True):
- input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
- refresh_audio_button = gr.Button(translations["refresh"])
- with gr.Row():
- gr.Markdown("___")
- with gr.Row():
- file_output = gr.File(label="", file_types=[".txt"], interactive=False)
- image_output = gr.Image(label="", interactive=False, show_download_button=True)
- with gr.Row():
- upload_audio_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio_file], outputs=[input_audio_path])
- input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay])
- refresh_audio_button.click(fn=change_audios_choices, inputs=[input_audio_path], outputs=[input_audio_path])
- with gr.Row():
- unlock_full_method.change(fn=lambda method: {"choices": [m for m in unlock_f0(method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"}, inputs=[unlock_full_method], outputs=[f0_method_extract])
- extractor_button.click(
- fn=f0_extract,
- inputs=[
- input_audio_path,
- f0_method_extract,
- onnx_f0_mode3
- ],
- outputs=[file_output, image_output],
- api_name="f0_extract"
- )
\ No newline at end of file
diff --git a/main/app/tabs/extra/child/fushion.py b/main/app/tabs/extra/child/fushion.py
deleted file mode 100644
index 0064ef81ec702236ded2833a65d1d394d552e312..0000000000000000000000000000000000000000
--- a/main/app/tabs/extra/child/fushion.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import visible, shutil_move
-from main.app.core.model_utils import fushion_model
-from main.app.variables import translations, configs
-
-def fushion_tab():
- with gr.Row():
- gr.Markdown(translations["fushion_markdown_2"])
- with gr.Row():
- name_to_save = gr.Textbox(label=translations["modelname"], placeholder="Model.pth", value="", max_lines=1, interactive=True)
- with gr.Row():
- fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4)
- with gr.Column():
- with gr.Row():
- model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"])
- model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"])
- with gr.Row():
- model_path_a = gr.Textbox(label=f"{translations['model_path']} 1", value="", placeholder="assets/weights/Model_1.pth")
- model_path_b = gr.Textbox(label=f"{translations['model_path']} 2", value="", placeholder="assets/weights/Model_2.pth")
- with gr.Row():
- ratio = gr.Slider(minimum=0, maximum=1, label=translations["model_ratio"], info=translations["model_ratio_info"], value=0.5, interactive=True)
- with gr.Row():
- output_model = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False)
- with gr.Row():
- model_a.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_a], outputs=[model_path_a])
- model_b.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_b], outputs=[model_path_b])
- with gr.Row():
- fushion_button.click(
- fn=fushion_model,
- inputs=[
- name_to_save,
- model_path_a,
- model_path_b,
- ratio
- ],
- outputs=[name_to_save, output_model],
- api_name="fushion_model"
- )
- fushion_button.click(fn=lambda: visible(True), inputs=[], outputs=[output_model])
\ No newline at end of file
diff --git a/main/app/tabs/extra/child/read_model.py b/main/app/tabs/extra/child/read_model.py
deleted file mode 100644
index 4ca25625fd48dbff9e64bbb388851fc35883a450..0000000000000000000000000000000000000000
--- a/main/app/tabs/extra/child/read_model.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import shutil_move
-from main.app.core.model_utils import model_info
-from main.app.variables import translations, configs
-
-def read_model_tab():  # Declares the "read model" UI: a model file drop, a path textbox and a button wired to model_info.
- with gr.Row():
- gr.Markdown(translations["read_model_markdown_2"])
- with gr.Row():
- model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"])
- with gr.Row():
- read_button = gr.Button(translations["readmodel"], variant="primary", scale=2)
- with gr.Column():
- model_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True)
- output_info = gr.Textbox(label=translations["modelinfo"], value="", interactive=False, scale=6)  # read-only; receives the result of model_info
- with gr.Row():
- model.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model], outputs=[model_path])  # shutil_move's return value is written into model_path
- read_button.click(
- fn=model_info,
- inputs=[model_path],
- outputs=[output_info],
- api_name="read_model"
- )
\ No newline at end of file
diff --git a/main/app/tabs/extra/child/settings.py b/main/app/tabs/extra/child/settings.py
deleted file mode 100644
index fd839e3bdb93314a7e89b861712980e1637df767..0000000000000000000000000000000000000000
--- a/main/app/tabs/extra/child/settings.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import change_fp
-from main.app.core.utils import stop_pid
-from main.app.core.restart import change_font, change_language, change_theme
-from main.app.variables import translations, theme, font, configs, language, config
-
-def settings_tab(app):  # Declares the settings UI (language, theme, precision, font, stop buttons); `app` is forwarded to the change_* restart helpers.
- with gr.Row():
- gr.Markdown(translations["settings_markdown_2"])
- with gr.Row():
- toggle_button = gr.Button(translations["change_light_dark"], variant="secondary", scale=2)
- with gr.Row():
- with gr.Column():
- language_dropdown = gr.Dropdown(label=translations["lang"], interactive=True, info=translations["lang_restart"], choices=configs.get("support_language", ["vi-VN"]), value=language)  # fallback must be a list: a bare string would be iterated into one choice per character
- change_lang = gr.Button(translations["change_lang"], variant="primary", scale=2)
- with gr.Column():
- theme_dropdown = gr.Dropdown(label=translations["theme"], interactive=True, info=translations["theme_restart"], choices=configs.get("themes", [theme]), value=theme, allow_custom_value=True)  # fallback wrapped in a list for the same reason
- changetheme = gr.Button(translations["theme_button"], variant="primary", scale=2)
- with gr.Row():
- with gr.Column():
- fp_choice = gr.Radio(choices=["fp16","fp32"], value="fp16" if configs.get("fp16", False) else "fp32", label=translations["precision"], info=translations["precision_info"], interactive=config.device not in ["cpu", "mps", "ocl:0"])  # read-only when config.device is cpu, mps or ocl:0
- fp_button = gr.Button(translations["update_precision"], variant="secondary", scale=2)
- with gr.Column():
- font_choice = gr.Textbox(label=translations["font"], info=translations["font_info"], value=font, interactive=True)
- font_button = gr.Button(translations["change_font"])
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["stop"], open=False, visible=True):
- separate_stop = gr.Button(translations["stop_separate"])
- convert_stop = gr.Button(translations["stop_convert"])
- create_dataset_stop = gr.Button(translations["stop_create_dataset"])
- with gr.Accordion(translations["stop_training"], open=False):
- model_name_stop = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
- preprocess_stop = gr.Button(translations["stop_preprocess"])
- extract_stop = gr.Button(translations["stop_extract"])
- train_stop = gr.Button(translations["stop_training"])
- with gr.Row():
- toggle_button.click(fn=None, js="() => {document.body.classList.toggle('dark')}")  # client-side only: toggles the 'dark' class on <body>
- fp_button.click(fn=change_fp, inputs=[fp_choice], outputs=[fp_choice])
- with gr.Row():
- change_lang.click(fn=lambda a: change_language(a, app), inputs=[language_dropdown], outputs=[])
- changetheme.click(fn=lambda a: change_theme(a, app) , inputs=[theme_dropdown], outputs=[])
- font_button.click(fn=lambda a: change_font(a, app), inputs=[font_choice], outputs=[])
- with gr.Row():
- change_lang.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])  # browser reloads the page 30000 ms after the click
- changetheme.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
- font_button.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
- with gr.Row():
- separate_stop.click(fn=lambda: stop_pid("separate_pid", None, False), inputs=[], outputs=[])  # these three pass None where the training buttons below pass the model name
- convert_stop.click(fn=lambda: stop_pid("convert_pid", None, False), inputs=[], outputs=[])
- create_dataset_stop.click(fn=lambda: stop_pid("create_dataset_pid", None, False), inputs=[], outputs=[])
- with gr.Row():
- preprocess_stop.click(fn=lambda model_name_stop: stop_pid("preprocess_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
- extract_stop.click(fn=lambda model_name_stop: stop_pid("extract_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
- train_stop.click(fn=lambda model_name_stop: stop_pid("train_pid", model_name_stop, True), inputs=[model_name_stop], outputs=[])  # only the training stop passes True as the third argument
\ No newline at end of file
diff --git a/main/app/tabs/extra/extra.py b/main/app/tabs/extra/extra.py
deleted file mode 100644
index f2938e7341fc5187eb5a8c9af54a4320e5725e04..0000000000000000000000000000000000000000
--- a/main/app/tabs/extra/extra.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations, configs
-from main.app.tabs.extra.child.fushion import fushion_tab
-from main.app.tabs.extra.child.settings import settings_tab
-from main.app.tabs.extra.child.read_model import read_model_tab
-from main.app.tabs.extra.child.f0_extract import f0_extract_tab
-from main.app.tabs.extra.child.create_srt import create_srt_tab
-from main.app.tabs.extra.child.convert_model import convert_model_tab
-
-def extra_tab(app):  # Declares the "extra" tab and its six child tabs; `app` is only forwarded to settings_tab.
- with gr.TabItem(translations["extra"], visible=configs.get("extra_tab", True)):  # every tab below is shown unless its configs flag is set to a falsy value
- with gr.TabItem(translations["fushion"], visible=configs.get("fushion_tab", True)):
- gr.Markdown(translations["fushion_markdown"])
- fushion_tab()
-
- with gr.TabItem(translations["read_model"], visible=configs.get("read_tab", True)):
- gr.Markdown(translations["read_model_markdown"])
- read_model_tab()
-
- with gr.TabItem(translations["convert_model"], visible=configs.get("onnx_tab", True)):  # note: the flag for this tab is "onnx_tab"
- gr.Markdown(translations["pytorch2onnx"])
- convert_model_tab()
-
- with gr.TabItem(translations["f0_extractor_tab"], visible=configs.get("f0_extractor_tab", True)):
- gr.Markdown(translations["f0_extractor_markdown"])
- f0_extract_tab()
-
- with gr.TabItem(translations["create_srt_tab"], visible=configs.get("create_srt_tab", True)):
- gr.Markdown(translations["create_srt_markdown"])
- create_srt_tab()
-
- with gr.TabItem(translations["settings"], visible=configs.get("settings_tab", True)):
- gr.Markdown(translations["settings_markdown"])
- settings_tab(app)
\ No newline at end of file
diff --git a/main/app/tabs/inference/child/convert.py b/main/app/tabs/inference/child/convert.py
deleted file mode 100644
index 2d09f3f4ed1817a8fa17d1d9875de32d2431cafc..0000000000000000000000000000000000000000
--- a/main/app/tabs/inference/child/convert.py
+++ /dev/null
@@ -1,328 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.presets import load_presets, save_presets
-from main.app.core.inference import convert_audio, convert_selection
-from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, presets_file, configs, file_types, export_format_choices, hybrid_f0_method
-from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, change_f0_choices, unlock_f0, change_preset_choices, change_backing_choices, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, shutil_move
-
-def convert_tab():  # Declares the voice-conversion UI and wires its controls to convert_selection / convert_audio and the preset helpers.
- with gr.Row():
- gr.Markdown(translations["convert_info"])
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- cleaner0 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- use_audio = gr.Checkbox(label=translations["use_audio"], value=False, interactive=True)
- checkpointing = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- with gr.Row():
- use_original = gr.Checkbox(label=translations["convert_original"], value=False, interactive=True, visible=use_audio.value)  # these four start hidden because use_audio defaults to False
- convert_backing = gr.Checkbox(label=translations["convert_backing"], value=False, interactive=True, visible=use_audio.value)
- not_merge_backing = gr.Checkbox(label=translations["not_merge_backing"], value=False, interactive=True, visible=use_audio.value)
- merge_instrument = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True, visible=use_audio.value)
- with gr.Row():
- pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- clean_strength0 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner0.value)
- with gr.Row():
- with gr.Column():
- audio_select = gr.Dropdown(label=translations["select_separate"], choices=[], value="", interactive=True, allow_custom_value=True, visible=False)  # hidden at start; an output of convert_selection below
- convert_button_2 = gr.Button(translations["convert_audio"], visible=False)  # hidden at start; an output of convert_selection below
- with gr.Row():
- with gr.Column():
- convert_button = gr.Button(translations["convert_audio"], variant="primary")
- with gr.Row():
- with gr.Column():
- input0 = gr.Files(label=translations["drop_audio"], file_types=file_types)
- play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Column():
- with gr.Accordion(translations["model_accordion"], open=True):
- with gr.Row():
- model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- with gr.Row():
- refresh = gr.Button(translations["refresh"])
- with gr.Row():
- index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")  # shown only when an index file is preselected
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Column():
- export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
- input_audio0 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- output_audio = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
- with gr.Column():
- refresh0 = gr.Button(translations["refresh"])
- with gr.Accordion(translations["setting"], open=False):
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
- hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method.value == "hybrid")
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
- with gr.Accordion(translations["f0_file"], open=False):
- upload_f0_file = gr.File(label=translations["upload_f0"], file_types=[".txt"])
- f0_file_dropdown = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
- refresh_f0_file = gr.Button(translations["refresh"])
- with gr.Accordion(translations["hubert_model"], open=False):
- embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
- with gr.Accordion(translations["use_presets"], open=False):
- with gr.Row():
- presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True)
- with gr.Row():
- load_click = gr.Button(translations["load_file"], variant="primary")
- refresh_click = gr.Button(translations["refresh"])
- with gr.Accordion(translations["export_file"], open=False):
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True)
- autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True)
- pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True)
- index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True)
- resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True)
- filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True)
- rms_mix_rate_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True)
- protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True)
- split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True)
- formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True)
- with gr.Row():
- with gr.Column():
- name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
- save_file_button = gr.Button(translations["export_file"])
- with gr.Row():
- upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".conversion.json"])
- with gr.Column():
- with gr.Group():
- with gr.Row():
- split_audio = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
- formant_shifting = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
- with gr.Row():
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
- audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True)
- resample_sr = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
- protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- formant_qfrency = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- gr.Markdown(translations["output_convert"])
- with gr.Row():
- main_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["main_convert"])
- backing_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_backing"], visible=convert_backing.value)
- main_backing = gr.Audio(show_download_button=True, interactive=False, label=translations["main_or_backing"], visible=convert_backing.value)
- with gr.Row():
- original_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_original"], visible=use_original.value)
- vocal_instrument = gr.Audio(show_download_button=True, interactive=False, label=translations["voice_or_instruments"], visible=merge_instrument.value)
- with gr.Row():
- upload_f0_file.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file], outputs=[f0_file_dropdown])
- refresh_f0_file.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown])
- unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[method])
- with gr.Row():
- load_click.click(
- fn=load_presets,
- inputs=[
- presets_name,
- cleaner0,
- autotune,
- pitch,
- clean_strength0,
- index_strength,
- resample_sr,
- filter_radius,
- rms_mix_rate,
- protect,
- split_audio,
- f0_autotune_strength,
- formant_shifting,
- formant_qfrency,
- formant_timbre,
- proposal_pitch,
- proposal_pitch_threshold
- ],
- outputs=[
- cleaner0,
- autotune,
- pitch,
- clean_strength0,
- index_strength,
- resample_sr,
- filter_radius,
- rms_mix_rate,
- protect,
- split_audio,
- f0_autotune_strength,
- formant_shifting,
- formant_qfrency,
- formant_timbre,
- proposal_pitch,
- proposal_pitch_threshold
- ]
- )
- refresh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
- save_file_button.click(
- fn=save_presets,
- inputs=[
- name_to_save_file,
- cleaner0,
- autotune,
- pitch,
- clean_strength0,
- index_strength,
- resample_sr,
- filter_radius,
- rms_mix_rate,
- protect,
- split_audio,
- f0_autotune_strength,
- cleaner_chbox,
- autotune_chbox,
- pitch_chbox,
- index_strength_chbox,
- resample_sr_chbox,
- filter_radius_chbox,
- rms_mix_rate_chbox,
- protect_chbox,
- split_audio_chbox,
- formant_shifting_chbox,
- formant_shifting,
- formant_qfrency,
- formant_timbre,
- proposal_pitch,
- proposal_pitch_threshold
- ],
- outputs=[presets_name]
- )
- with gr.Row():
- upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name])  # every file goes through shutil_move; only the first result is shown in presets_name
- autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
- use_audio.change(fn=lambda a: [visible(a), visible(a), visible(a), visible(a), visible(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), visible(not a), visible(not a), visible(not a), visible(not a)], inputs=[use_audio], outputs=[main_backing, use_original, convert_backing, not_merge_backing, merge_instrument, use_original, convert_backing, not_merge_backing, merge_instrument, input_audio0, output_audio, input0, play_audio])  # one result per output, in order; the four option checkboxes are listed twice (visible, then valueFalse_interactive)
- with gr.Row():
- convert_backing.change(fn=lambda a,b: [change_backing_choices(a, b), visible(a)], inputs=[convert_backing, not_merge_backing], outputs=[use_original, backing_convert])
- use_original.change(fn=lambda audio, original: [visible(original), visible(not original), visible(audio and not original), valueFalse_interactive(not original), valueFalse_interactive(not original)], inputs=[use_audio, use_original], outputs=[original_convert, main_convert, main_backing, convert_backing, not_merge_backing])
- cleaner0.change(fn=visible, inputs=[cleaner0], outputs=[clean_strength0])
- with gr.Row():
- merge_instrument.change(fn=visible, inputs=[merge_instrument], outputs=[vocal_instrument])
- not_merge_backing.change(fn=lambda audio, merge, cvb: [visible(audio and not merge), change_backing_choices(cvb, merge)], inputs=[use_audio, not_merge_backing, convert_backing], outputs=[main_backing, use_original])
- method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method, hybrid_method], outputs=[hybrid_method, alpha, hop_length])
- with gr.Row():
- hybrid_method.change(fn=hoplength_show, inputs=[method, hybrid_method], outputs=[hop_length])
- refresh.click(fn=change_models_choices, inputs=[], outputs=[model_pth, model_index])
- model_pth.change(fn=get_index, inputs=[model_pth], outputs=[model_index])
- with gr.Row():
- input0.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input0], outputs=[input_audio0])  # every file goes through shutil_move; only the first result is shown in input_audio0
- input_audio0.change(fn=lambda audio: audio if audio and os.path.isfile(audio) else None, inputs=[input_audio0], outputs=[play_audio])  # truthiness guard first: os.path.isfile(None) raises TypeError when the dropdown has no value
- formant_shifting.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[formant_shifting], outputs=[formant_qfrency, formant_timbre])
- with gr.Row():
- embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[custom_embedders])
- refresh0.click(fn=change_audios_choices, inputs=[input_audio0], outputs=[input_audio0])
- model_index.change(fn=index_strength_show, inputs=[model_index], outputs=[index_strength])
- with gr.Row():
- convert_button.click(fn=lambda: visible(False), inputs=[], outputs=[convert_button])  # extra listener: the button targets itself with visible(False)
- convert_button_2.click(fn=lambda: [visible(False), visible(False)], inputs=[], outputs=[audio_select, convert_button_2])
- with gr.Row():
- proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
- embed_mode.change(fn=change_embedders_mode, inputs=[embed_mode], outputs=[embedders])
- with gr.Row():
- convert_button.click(  # main entry point; its outputs include both convert buttons and audio_select
- fn=convert_selection,
- inputs=[
- cleaner0,
- autotune,
- use_audio,
- use_original,
- convert_backing,
- not_merge_backing,
- merge_instrument,
- pitch,
- clean_strength0,
- model_pth,
- model_index,
- index_strength,
- input_audio0,
- output_audio,
- export_format,
- method,
- hybrid_method,
- hop_length,
- embedders,
- custom_embedders,
- resample_sr,
- filter_radius,
- rms_mix_rate,
- protect,
- split_audio,
- f0_autotune_strength,
- checkpointing,
- onnx_f0_mode,
- formant_shifting,
- formant_qfrency,
- formant_timbre,
- f0_file_dropdown,
- embed_mode,
- proposal_pitch,
- proposal_pitch_threshold,
- audio_processing,
- alpha
- ],
- outputs=[audio_select, main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button, convert_button_2],
- api_name="convert_selection"
- )
- convert_button_2.click(  # same inputs as above plus audio_select (inserted after f0_autotune_strength)
- fn=convert_audio,
- inputs=[
- cleaner0,
- autotune,
- use_audio,
- use_original,
- convert_backing,
- not_merge_backing,
- merge_instrument,
- pitch,
- clean_strength0,
- model_pth,
- model_index,
- index_strength,
- input_audio0,
- output_audio,
- export_format,
- method,
- hybrid_method,
- hop_length,
- embedders,
- custom_embedders,
- resample_sr,
- filter_radius,
- rms_mix_rate,
- protect,
- split_audio,
- f0_autotune_strength,
- audio_select,
- checkpointing,
- onnx_f0_mode,
- formant_shifting,
- formant_qfrency,
- formant_timbre,
- f0_file_dropdown,
- embed_mode,
- proposal_pitch,
- proposal_pitch_threshold,
- audio_processing,
- alpha
- ],
- outputs=[main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button],
- api_name="convert_audio"
- )
\ No newline at end of file
diff --git a/main/app/tabs/inference/child/convert_tts.py b/main/app/tabs/inference/child/convert_tts.py
deleted file mode 100644
index 48de23f74d25053f0afcb179650267d1b21e9f6f..0000000000000000000000000000000000000000
--- a/main/app/tabs/inference/child/convert_tts.py
+++ /dev/null
@@ -1,280 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.tts import TTS
-from main.app.core.process import process_input
-from main.app.core.inference import convert_tts
-from main.app.core.utils import google_translate
-from main.app.core.presets import save_presets, load_presets
-from main.app.core.ui import visible, change_f0_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, change_tts_voice_choices, shutil_move, change_preset_choices
-from main.app.variables import translations, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, edgetts, google_tts_voice, configs, presets_file, export_format_choices, hybrid_f0_method
-
-def convert_tts_tab():
- with gr.Row():
- gr.Markdown(translations["convert_text_markdown_2"])
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- use_txt = gr.Checkbox(label=translations["input_txt"], value=False, interactive=True)
- google_tts_check_box = gr.Checkbox(label=translations["googletts"], value=False, interactive=True)
- prompt = gr.Textbox(label=translations["text_to_speech"], value="", placeholder="Hello Words", lines=3)
- with gr.Column():
- speed = gr.Slider(label=translations["voice_speed"], info=translations["voice_speed_info"], minimum=-100, maximum=100, value=0, step=1)
- pitch0 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- with gr.Row():
- tts_button = gr.Button(translations["tts_1"], variant="primary", scale=2)
- convert_button0 = gr.Button(translations["tts_2"], variant="secondary", scale=2)
- with gr.Row():
- with gr.Column():
- txt_input = gr.File(label=translations["drop_text"], file_types=[".txt", ".srt", ".docx"], visible=use_txt.value)
- tts_voice = gr.Dropdown(label=translations["voice"], choices=edgetts, interactive=True, value="vi-VN-NamMinhNeural")
- tts_pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info_2"], label=translations["pitch"], value=0, interactive=True)
- with gr.Accordion(translations["translate"], open=False):
- with gr.Row():
- source_lang = gr.Dropdown(label=translations["source_lang"], choices=["auto"]+google_tts_voice, interactive=True, value="auto")
- target_lang = gr.Dropdown(label=translations["target_lang"], choices=google_tts_voice, interactive=True, value="en")
- translate_button = gr.Button(translations["translate"])
- with gr.Column():
- with gr.Accordion(translations["model_accordion"], open=True):
- with gr.Row():
- model_pth0 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index0 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- with gr.Row():
- refresh1 = gr.Button(translations["refresh"])
- with gr.Row():
- index_strength0 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index0.value != "")
- with gr.Accordion(translations["output_path"], open=False):
- export_format0 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
- output_audio0 = gr.Textbox(label=translations["output_tts"], value="audios/tts.wav", placeholder="audios/tts.wav", info=translations["tts_output"], interactive=True)
- output_audio1 = gr.Textbox(label=translations["output_tts_convert"], value="audios/tts-convert.wav", placeholder="audios/tts-convert.wav", info=translations["tts_output"], interactive=True)
- with gr.Accordion(translations["setting"], open=False):
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode1 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method3 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- method0 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
- hybrid_method0 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method0.value == "hybrid")
- hop_length0 = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
- with gr.Accordion(translations["f0_file"], open=False):
- upload_f0_file0 = gr.File(label=translations["upload_f0"], file_types=[".txt"])
- f0_file_dropdown0 = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
- refresh_f0_file0 = gr.Button(translations["refresh"])
- with gr.Accordion(translations["hubert_model"], open=False):
- embed_mode1 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders0 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders0 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders0.value == "custom")
- with gr.Accordion(translations["use_presets"], open=False):
- with gr.Row():
- presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True)
- with gr.Row():
- load_click = gr.Button(translations["load_file"], variant="primary")
- refresh_click = gr.Button(translations["refresh"])
- with gr.Accordion(translations["export_file"], open=False):
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True)
- autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True)
- pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True)
- index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True)
- resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True)
- filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True)
- rms_mix_rate_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True)
- protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True)
- split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True)
- formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True)
- with gr.Row():
- with gr.Column():
- name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
- save_file_button = gr.Button(translations["export_file"])
- with gr.Row():
- upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".conversion.json"])
- with gr.Group():
- with gr.Row():
- audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True)
- with gr.Row():
- formant_shifting1 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
- split_audio0 = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
- cleaner1 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- with gr.Row():
- autotune3 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- checkpointing0 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
- with gr.Column():
- resample_sr0 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
- f0_autotune_strength0 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune3.value)
- clean_strength1 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner1.value)
- filter_radius0 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
- protect0 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- formant_qfrency1 = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre1 = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- gr.Markdown(translations["output_tts_markdown"])
- with gr.Row():
- tts_voice_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["output_text_to_speech"])
- tts_voice_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
- with gr.Row():
- load_click.click(
- fn=load_presets,
- inputs=[
- presets_name,
- cleaner1,
- autotune3,
- pitch0,
- clean_strength1,
- index_strength0,
- resample_sr0,
- filter_radius0,
- rms_mix_rate0,
- protect0,
- split_audio0,
- f0_autotune_strength0,
- formant_shifting1,
- formant_qfrency1,
- formant_timbre1,
- proposal_pitch,
- proposal_pitch_threshold
- ],
- outputs=[
- cleaner1,
- autotune3,
- pitch0,
- clean_strength1,
- index_strength0,
- resample_sr0,
- filter_radius0,
- rms_mix_rate0,
- protect0,
- split_audio0,
- f0_autotune_strength0,
- formant_shifting1,
- formant_qfrency1,
- formant_timbre1,
- proposal_pitch,
- proposal_pitch_threshold
- ]
- )
- refresh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
- save_file_button.click(
- fn=save_presets,
- inputs=[
- name_to_save_file,
- cleaner1,
- autotune3,
- pitch0,
- clean_strength1,
- index_strength0,
- resample_sr0,
- filter_radius0,
- rms_mix_rate0,
- protect0,
- split_audio0,
- f0_autotune_strength0,
- cleaner_chbox,
- autotune_chbox,
- pitch_chbox,
- index_strength_chbox,
- resample_sr_chbox,
- filter_radius_chbox,
- rms_mix_rate_chbox,
- protect_chbox,
- split_audio_chbox,
- formant_shifting_chbox,
- formant_shifting1,
- formant_qfrency1,
- formant_timbre1,
- proposal_pitch,
- proposal_pitch_threshold
- ],
- outputs=[presets_name]
- )
- with gr.Row():
- proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
- upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name])
- translate_button.click(fn=google_translate, inputs=[prompt, source_lang, target_lang], outputs=[prompt], api_name="google_translate")
- with gr.Row():
- unlock_full_method3.change(fn=unlock_f0, inputs=[unlock_full_method3], outputs=[method0])
- upload_f0_file0.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file0], outputs=[f0_file_dropdown0])
- refresh_f0_file0.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown0])
- with gr.Row():
- embed_mode1.change(fn=change_embedders_mode, inputs=[embed_mode1], outputs=[embedders0])
- autotune3.change(fn=visible, inputs=[autotune3], outputs=[f0_autotune_strength0])
- model_pth0.change(fn=get_index, inputs=[model_pth0], outputs=[model_index0])
- with gr.Row():
- cleaner1.change(fn=visible, inputs=[cleaner1], outputs=[clean_strength1])
- method0.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method0, hybrid_method0], outputs=[hybrid_method0, alpha, hop_length0])
- hybrid_method0.change(fn=hoplength_show, inputs=[method0, hybrid_method0], outputs=[hop_length0])
- with gr.Row():
- refresh1.click(fn=change_models_choices, inputs=[], outputs=[model_pth0, model_index0])
- embedders0.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders0], outputs=[custom_embedders0])
- formant_shifting1.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[formant_shifting1], outputs=[formant_qfrency1, formant_timbre1])
- with gr.Row():
- model_index0.change(fn=index_strength_show, inputs=[model_index0], outputs=[index_strength0])
- txt_input.upload(fn=process_input, inputs=[txt_input], outputs=[prompt])
- use_txt.change(fn=visible, inputs=[use_txt], outputs=[txt_input])
- with gr.Row():
- google_tts_check_box.change(fn=change_tts_voice_choices, inputs=[google_tts_check_box], outputs=[tts_voice])
- tts_button.click(
- fn=TTS,
- inputs=[
- prompt,
- tts_voice,
- speed,
- output_audio0,
- tts_pitch,
- google_tts_check_box,
- txt_input
- ],
- outputs=[tts_voice_audio],
- api_name="text-to-speech"
- )
- convert_button0.click(
- fn=convert_tts,
- inputs=[
- cleaner1,
- autotune3,
- pitch0,
- clean_strength1,
- model_pth0,
- model_index0,
- index_strength0,
- output_audio0,
- output_audio1,
- export_format0,
- method0,
- hybrid_method0,
- hop_length0,
- embedders0,
- custom_embedders0,
- resample_sr0,
- filter_radius0,
- rms_mix_rate0,
- protect0,
- split_audio0,
- f0_autotune_strength0,
- checkpointing0,
- onnx_f0_mode1,
- formant_shifting1,
- formant_qfrency1,
- formant_timbre1,
- f0_file_dropdown0,
- embed_mode1,
- proposal_pitch,
- proposal_pitch_threshold,
- audio_processing,
- alpha
- ],
- outputs=[tts_voice_convert],
- api_name="convert_tts"
- )
\ No newline at end of file
diff --git a/main/app/tabs/inference/child/convert_with_whisper.py b/main/app/tabs/inference/child/convert_with_whisper.py
deleted file mode 100644
index 2c023ee67772475b39136760014de9e7cad0bc8f..0000000000000000000000000000000000000000
--- a/main/app/tabs/inference/child/convert_with_whisper.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.inference import convert_with_whisper
-from main.app.core.ui import visible, change_audios_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, shutil_move
-from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, embedders_mode, embedders_model, configs, file_types, export_format_choices, whisper_model, hybrid_f0_method
-
-def convert_with_whisper_tab():
- with gr.Row():
- gr.Markdown(translations["convert_with_whisper_info"])
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- cleaner2 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- autotune2 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- checkpointing2 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- formant_shifting2 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
- audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True)
- with gr.Row():
- num_spk = gr.Slider(minimum=2, maximum=8, step=1, info=translations["num_spk_info"], label=translations["num_spk"], value=2, interactive=True)
- with gr.Row():
- with gr.Column():
- convert_button3 = gr.Button(translations["convert_audio"], variant="primary")
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["model_accordion"] + " 1", open=True):
- with gr.Row():
- model_pth2 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index2 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- with gr.Row():
- refresh2 = gr.Button(translations["refresh"])
- with gr.Row():
- pitch3 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- index_strength2 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index2.value != "")
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Column():
- export_format2 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
- input_audio1 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- output_audio2 = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
- with gr.Column():
- refresh4 = gr.Button(translations["refresh"])
- with gr.Row():
- input2 = gr.Files(label=translations["drop_audio"], file_types=file_types)
- with gr.Column():
- with gr.Accordion(translations["model_accordion"] + " 2", open=True):
- with gr.Row():
- model_pth3 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index3 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- with gr.Row():
- refresh3 = gr.Button(translations["refresh"])
- with gr.Row():
- pitch4 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- index_strength3 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index3.value != "")
- with gr.Accordion(translations["setting"], open=False):
- with gr.Row():
- model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=whisper_model, value="medium", interactive=True)
- with gr.Accordion(translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode4 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method2 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- method3 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
- hybrid_method3 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method3.value == "hybrid")
- hop_length3 = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
- with gr.Accordion(translations["hubert_model"], open=False):
- embed_mode3 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders3 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders3 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders3.value == "custom")
- with gr.Column():
- resample_sr3 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
- clean_strength3 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner2.value)
- f0_autotune_strength3 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune2.value)
- filter_radius3 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- rms_mix_rate3 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
- protect3 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- formant_qfrency3 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 1", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre3 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 1", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- formant_qfrency4 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 2", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- formant_timbre4 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 2", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
- with gr.Row():
- gr.Markdown(translations["input_output"])
- with gr.Row():
- play_audio2 = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- play_audio3 = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
- with gr.Row():
- autotune2.change(fn=visible, inputs=[autotune2], outputs=[f0_autotune_strength3])
- cleaner2.change(fn=visible, inputs=[cleaner2], outputs=[clean_strength3])
- method3.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method3, hybrid_method3], outputs=[hybrid_method3, alpha, hop_length3])
- with gr.Row():
- hybrid_method3.change(fn=hoplength_show, inputs=[method3, hybrid_method3], outputs=[hop_length3])
- refresh2.click(fn=change_models_choices, inputs=[], outputs=[model_pth2, model_index2])
- model_pth2.change(fn=get_index, inputs=[model_pth2], outputs=[model_index2])
- with gr.Row():
- refresh3.click(fn=change_models_choices, inputs=[], outputs=[model_pth3, model_index3])
- model_pth3.change(fn=get_index, inputs=[model_pth3], outputs=[model_index3])
- input2.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input2], outputs=[input_audio1])
- with gr.Row():
- input_audio1.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio1], outputs=[play_audio2])
- formant_shifting2.change(fn=lambda a: [visible(a) for _ in range(4)], inputs=[formant_shifting2], outputs=[formant_qfrency3, formant_timbre3, formant_qfrency4, formant_timbre4])
- embedders3.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders3], outputs=[custom_embedders3])
- with gr.Row():
- refresh4.click(fn=change_audios_choices, inputs=[input_audio1], outputs=[input_audio1])
- model_index2.change(fn=index_strength_show, inputs=[model_index2], outputs=[index_strength2])
- model_index3.change(fn=index_strength_show, inputs=[model_index3], outputs=[index_strength3])
- with gr.Row():
- unlock_full_method2.change(fn=unlock_f0, inputs=[unlock_full_method2], outputs=[method3])
- embed_mode3.change(fn=change_embedders_mode, inputs=[embed_mode3], outputs=[embedders3])
- proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
- with gr.Row():
- convert_button3.click(
- fn=convert_with_whisper,
- inputs=[
- num_spk,
- model_size,
- cleaner2,
- clean_strength3,
- autotune2,
- f0_autotune_strength3,
- checkpointing2,
- model_pth2,
- model_pth3,
- model_index2,
- model_index3,
- pitch3,
- pitch4,
- index_strength2,
- index_strength3,
- export_format2,
- input_audio1,
- output_audio2,
- onnx_f0_mode4,
- method3,
- hybrid_method3,
- hop_length3,
- embed_mode3,
- embedders3,
- custom_embedders3,
- resample_sr3,
- filter_radius3,
- rms_mix_rate3,
- protect3,
- formant_shifting2,
- formant_qfrency3,
- formant_timbre3,
- formant_qfrency4,
- formant_timbre4,
- proposal_pitch,
- proposal_pitch_threshold,
- audio_processing,
- alpha
- ],
- outputs=[play_audio3],
- api_name="convert_with_whisper"
- )
\ No newline at end of file
diff --git a/main/app/tabs/inference/child/separate.py b/main/app/tabs/inference/child/separate.py
deleted file mode 100644
index c43102281fd41b9604f185ec186f4c638aca2414..0000000000000000000000000000000000000000
--- a/main/app/tabs/inference/child/separate.py
+++ /dev/null
@@ -1,263 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.downloads import download_url
-from main.app.core.separate import separate_music
-from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, shutil_move, separate_change
-from main.app.variables import translations, uvr_model, karaoke_models, reverb_models, vr_models, denoise_models, mdx_models, paths_for_files, sample_rate_choice, configs, file_types, export_format_choices
-
-def separate_tab():  # Builds the audio-separation UI (option checkboxes, model dropdowns, tuning sliders, audio players) and registers its Gradio event handlers; has no return statement.
- with gr.Row():
- gr.Markdown(translations["4_part"])
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- enable_denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False)  # created non-interactive; the .change handlers further down write to it via valueFalse_interactive
- separate_backing = gr.Checkbox(label=translations["separator_backing"], value=False, interactive=True)
- separate_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True)
- enable_tta = gr.Checkbox(label=translations["enable_tta"], value=False, interactive=False)  # non-interactive at creation; listed among the separate_change outputs below
- high_end_process = gr.Checkbox(label=translations["high_end_process"], value=False, interactive=False)  # non-interactive at creation; listed among the separate_change outputs below
- enable_post_process = gr.Checkbox(label=translations["enable_post_process"], value=False, interactive=False)  # non-interactive at creation; both an input and an output of separate_change
- with gr.Row():
- model_name = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True)  # default is the first entry of uvr_model (IndexError if that list is empty)
- karaoke_model = gr.Dropdown(label=translations["separator_backing_model"], value=list(karaoke_models.keys())[0], choices=list(karaoke_models.keys()), interactive=True, visible=separate_backing.value)  # initial visibility mirrors separate_backing's initial value (False)
- reverb_model = gr.Dropdown(label=translations["dereveb_model"], value=list(reverb_models.keys())[0], choices=list(reverb_models.keys()), interactive=True, visible=separate_reverb.value)  # initial visibility mirrors separate_reverb's initial value (False)
- denoise_model = gr.Dropdown(label=translations["denoise_model"], value=list(denoise_models.keys())[0], choices=list(denoise_models.keys()), interactive=True, visible=enable_denoise.value and model_name.value in list(vr_models.keys()))  # starts hidden: enable_denoise is False, so the `and` short-circuits
- with gr.Row():
- with gr.Column():
- separate_button = gr.Button(translations["separator_tab"], variant="primary")
- with gr.Row():
- with gr.Column():
- with gr.Group():
- with gr.Row():
- shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True)
- batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=False)  # hidden at creation; an output of separate_change, which presumably toggles it per model - confirm
- with gr.Row():
- segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
- aggression = gr.Slider(label=translations['aggression'], info=translations["aggression_info"], minimum=1, maximum=50, value=5, step=1, interactive=True, visible=False)  # hidden at creation; an output of separate_change
- drop_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
- with gr.Accordion(translations["use_url"], open=False):
- url = gr.Textbox(label=translations["url_audio"], value="", placeholder="https://www.youtube.com/...", scale=6)
- download_button = gr.Button(translations["downloads"])
- with gr.Column():
- with gr.Group():
- with gr.Row():
- overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)  # choices and value are strings, so separate_music receives a string here
- with gr.Row():
- window_size = gr.Slider(label=translations["window_size"], info=translations["window_size_info"], minimum=320, maximum=1024, value=512, step=32, interactive=True, visible=False)  # hidden at creation; an output of separate_change
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=8192, value=1024, step=1, interactive=True, visible=False)  # hidden at creation; an output of separate_change
- post_process_threshold = gr.Slider(label=translations['post_process_threshold'], info=translations["post_process_threshold_info"], minimum=0.1, maximum=0.3, value=0.2, step=0.1, interactive=True, visible=False)  # hidden at creation; an output of separate_change
- sample_rate = gr.Radio(choices=sample_rate_choice, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True)  # assumes 44100 is one of sample_rate_choice - TODO confirm
- with gr.Accordion(translations["input_output"], open=False):
- export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
- input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)  # allow_custom_value lets the user type a path that is not in paths_for_files
- refresh_audio = gr.Button(translations["refresh"])
- output_dirs = gr.Textbox(label=translations["output_folder"], value="audios", placeholder="audios", info=translations["output_folder_info"], interactive=True)
- audio_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])  # read-only preview; filled by input_audio.change and download_button.click below
- with gr.Row():
- gr.Markdown(translations["output_separator"])
- with gr.Row():
- instruments_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["instruments"])
- original_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["original_vocal"])
- main_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["main_vocal"], visible=separate_backing.value)  # starts hidden; toggled by separate_backing.change below
- backing_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["backing_vocal"], visible=separate_backing.value)  # starts hidden; toggled by separate_backing.change below
- with gr.Row():
- model_name.change(fn=lambda a: valueFalse_interactive(a in list(mdx_models.keys()) + list(vr_models.keys())), inputs=[model_name], outputs=[enable_denoise])  # NOTE(review): this handler and the next two all write enable_denoise with different conditions - confirm the intended precedence
- separate_backing.change(fn=lambda a, b: valueFalse_interactive(a or b), inputs=[separate_backing, separate_reverb], outputs=[enable_denoise])
- separate_reverb.change(fn=lambda a, b: valueFalse_interactive(a or b), inputs=[separate_backing, separate_reverb], outputs=[enable_denoise])
- with gr.Row():
- input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[audio_input])  # previews the path only if it is an existing file, else clears the player. NOTE(review): os.path.isfile(None) may raise TypeError - confirm the dropdown never emits None
- drop_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[drop_audio], outputs=[input_audio])  # calls shutil_move for every uploaded file and selects the first result. NOTE(review): [0] raises IndexError on an empty upload list
- refresh_audio.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
- with gr.Row():
- separate_backing.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[separate_backing], outputs=[main_vocals, backing_vocals])  # one visible() update per output component
- download_button.click(
- fn=download_url,
- inputs=[url],
- outputs=[input_audio, audio_input, url],  # download_url must return three values, in this order
- api_name='download_url'
- )
- with gr.Row():
- model_name.change(  # first of seven registrations that differ only in the triggering component; each calls separate_change with the same inputs and outputs
- fn=separate_change,
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
- outputs=[  # NOTE(review): karaoke_model, reverb_model and enable_post_process are triggers as well as outputs - confirm separate_change cannot cause a .change feedback loop
- karaoke_model,
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- with gr.Row():
- karaoke_model.change(  # same separate_change wiring, triggered by the backing-model dropdown
- fn=separate_change,
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
- outputs=[
- karaoke_model,
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- separate_backing.change(  # same separate_change wiring, triggered by the backing-separation checkbox
- fn=separate_change,
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
- outputs=[
- karaoke_model,
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- with gr.Row():
- reverb_model.change(  # same separate_change wiring, triggered by the reverb-model dropdown
- fn=separate_change,
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
- outputs=[
- karaoke_model,
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- separate_reverb.change(  # same separate_change wiring, triggered by the de-reverb checkbox
- fn=separate_change,
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
- outputs=[
- karaoke_model,
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- with gr.Row():
- enable_denoise.change(  # same separate_change wiring, triggered by the denoise checkbox
- fn=separate_change,
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
- outputs=[
- karaoke_model,
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- enable_post_process.change(  # same separate_change wiring, triggered by the post-process checkbox
- fn=separate_change,
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
- outputs=[
- karaoke_model,
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- with gr.Row():
- separate_button.click(  # runs the separation itself
- fn=separate_music,
- inputs=[  # Gradio passes these positionally, so the order has to match separate_music's parameter order
- input_audio,
- output_dirs,
- export_format,
- model_name,
- karaoke_model,
- reverb_model,
- denoise_model,
- sample_rate,
- shifts,
- batch_size,
- overlap,
- aggression,
- hop_length,
- window_size,
- segments_size,
- post_process_threshold,
- enable_tta,
- enable_denoise,
- high_end_process,
- enable_post_process,
- separate_backing,
- separate_reverb
- ],
- outputs=[  # separate_music must return four values, in this order
- original_vocals,
- instruments_audio,
- main_vocals,
- backing_vocals
- ],
- api_name="separate_music"
- )
\ No newline at end of file
diff --git a/main/app/tabs/inference/inference.py b/main/app/tabs/inference/inference.py
deleted file mode 100644
index 437ba78589fc35337e8bd1fdf9145b83f96301e8..0000000000000000000000000000000000000000
--- a/main/app/tabs/inference/inference.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations, configs
-from main.app.tabs.inference.child.convert import convert_tab
-from main.app.tabs.inference.child.separate import separate_tab
-from main.app.tabs.inference.child.convert_tts import convert_tts_tab
-from main.app.tabs.inference.child.convert_with_whisper import convert_with_whisper_tab
-
-def inference_tab():  # Builds the "inference" tab and its four child tabs; each tab's visibility comes from a configs flag that defaults to True.
- with gr.TabItem(translations["inference"], visible=configs.get("inference_tab", True)):
- with gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)):
- gr.Markdown(f"## {translations['separator_tab']}")  # tab title rendered as a level-2 Markdown heading
- separate_tab()
-
- with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)):
- gr.Markdown(f"## {translations['convert_audio']}")
- convert_tab()
-
- with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)):  # NOTE(review): this config key has no "_tab" suffix, unlike the other tab flags - confirm it matches the config file
- gr.Markdown(f"## {translations['convert_with_whisper']}")
- convert_with_whisper_tab()
-
- with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)):
- gr.Markdown(translations["convert_text_markdown"])  # uses a dedicated translation string as-is instead of the "## <title>" pattern used by the other tabs
- convert_tts_tab()
diff --git a/main/app/tabs/realtime/realtime.py b/main/app/tabs/realtime/realtime.py
deleted file mode 100644
index 0937ed12e76842287ca4790216b9f3c7719bf284..0000000000000000000000000000000000000000
--- a/main/app/tabs/realtime/realtime.py
+++ /dev/null
@@ -1,226 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.realtime import realtime_start, realtime_stop
-from main.app.variables import translations, configs, model_name, index_path, method_f0, embedders_mode, embedders_model
-from main.app.core.ui import change_models_choices, get_index, index_strength_show, unlock_f0, hoplength_show, change_embedders_mode, visible, audio_device, change_audio_device_choices, update_audio_device
-
-input_channels_map, output_channels_map = audio_device()  # evaluated once at import time; the keys of both maps seed the device dropdowns in realtime_tab
-
-def realtime_tab():  # Builds the realtime-conversion tab (audio devices, model, F0/embedder and processing settings) and registers its Gradio event handlers; has no return statement.
- with gr.TabItem(translations["realtime"], visible=configs.get("realtime_tab", True)):
- gr.Markdown(translations["realtime_markdown"])
- with gr.Row():
- gr.Markdown(translations["realtime_markdown_2"])
- with gr.Row():
- status = gr.Label(label=translations["realtime_latency"], value=translations["realtime_not_startup"])  # first output of both realtime_start and realtime_stop
- with gr.Row():
- monitor = gr.Checkbox(label=translations["monitor"], value=False, interactive=True)
- exclusive_mode = gr.Checkbox(label=translations["exclusive_mode"], value=False, interactive=True)
- vad_enabled = gr.Checkbox(label=translations["vad_enabled"], value=False, interactive=True)
- clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- with gr.Row():
- with gr.Accordion(translations["audio_device"], open=True):
- with gr.Row():
- input_audio_device = gr.Dropdown(label=translations["input_audio_device_label"], info=translations["input_audio_device_info"], choices=list(input_channels_map.keys()), value=list(input_channels_map.keys())[0] if len(list(input_channels_map.keys())) >= 1 else "", interactive=True)  # default is the first key of the map, or "" when the map is empty
- output_audio_device = gr.Dropdown(label=translations["output_audio_device_label"], info=translations["output_audio_device_info"], choices=list(output_channels_map.keys()), value=list(output_channels_map.keys())[0] if len(list(output_channels_map.keys())) >= 1 else "", interactive=True)  # default is the first key of the map, or "" when the map is empty
- monitor_output_device = gr.Dropdown(label=translations["monitor_output_device_label"], info=translations["monitor_output_device_info"], choices=list(output_channels_map.keys()), value=list(output_channels_map.keys())[0] if len(list(output_channels_map.keys())) >= 1 else "", interactive=True, visible=False)  # hidden at creation; an output of update_audio_device
- with gr.Row():
- input_audio_gain = gr.Slider(minimum=0, maximum=2500, label=translations["input_audio_gain_label"], info=translations["input_audio_gain_info"], value=100, step=1, interactive=True)
- output_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["output_audio_gain_label"], info=translations["output_audio_gain_info"], value=100, step=1, interactive=True)
- monitor_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["monitor_audio_gain_label"], info=translations["monitor_audio_gain_info"], value=100, step=1, interactive=True, visible=False)  # hidden at creation; an output of update_audio_device
- with gr.Row(visible=False) as asio_row:  # the row itself is an output of update_audio_device
- input_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["input_asio_channels_label"], info=translations["input_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
- output_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["output_asio_channels_label"], info=translations["output_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
- monitor_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["monitor_asio_channels_label"], info=translations["monitor_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
- with gr.Row():
- refresh_audio_device = gr.Button(value=translations["refresh_audio_device"], variant="secondary")
- with gr.Row():
- start_realtime = gr.Button(value=translations["start_realtime_button"], variant="primary", interactive=True)
- stop_realtime = gr.Button(value=translations["stop_realtime_button"], variant="stop", interactive=False)  # starts disabled; both buttons are outputs of realtime_start and realtime_stop
- with gr.Row():
- chunk_size = gr.Slider(minimum=2.7, maximum=2730.7, step=0.1, label=translations["chunk_size"], info=translations["chunk_size_info"], value=1024, interactive=True)
- pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["model_accordion"], open=True):
- with gr.Row():
- model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)  # default is the first known model, or "" when none exist
- model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)  # default is the first known index, or "" when none exist
- with gr.Row():
- model_refresh = gr.Button(translations["refresh"])
- with gr.Row():
- index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")  # initially shown only when an index is preselected
- with gr.Column():
- with gr.Accordion(translations["f0_method"], open=True):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)  # "hybrid" is filtered out of the choices for this tab
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)  # hidden at creation; updated by f0_method.change via hoplength_show
- with gr.Column():
- with gr.Accordion(translations["hubert_model"], open=True):
- embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")  # starts hidden because embedders defaults to "hubert_base"
- with gr.Row():
- with gr.Accordion(translations["setting"], open=True):
- with gr.Row():
- f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
- with gr.Group():
- with gr.Row():
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value)  # starts hidden (f0_autotune is False)
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)  # starts hidden (proposal_pitch is False)
- with gr.Row():
- rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
- protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)  # hidden at creation; toggled by clean_audio.change below
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- with gr.Column():
- silent_threshold = gr.Slider(minimum=-90, maximum=-60, label=translations["silent_threshold_label"], info=translations["silent_threshold_info"], value=-90, step=1, interactive=True)
- extra_convert_size = gr.Slider(minimum=0.1, maximum=5, label=translations["extra_convert_size_label"], info=translations["extra_convert_size_info"], value=0.5, step=0.1, interactive=True)
- cross_fade_overlap_size = gr.Slider(minimum=0.05, maximum=0.2, label=translations["cross_fade_overlap_size_label"], info=translations["cross_fade_overlap_size_info"], value=0.1, step=0.01, interactive=True)
- with gr.Row():
- vad_sensitivity = gr.Slider(minimum=0, maximum=3, label=translations["vad_sensitivity_label"], info=translations["vad_sensitivity_info"], value=3, step=1, interactive=True, visible=vad_enabled.value)  # starts hidden (vad_enabled is False)
- vad_frame_ms = gr.Slider(minimum=10, maximum=30, label=translations["vad_frame_ms_label"], info=translations["vad_frame_ms_info"], value=30, step=10, interactive=True, visible=vad_enabled.value)  # starts hidden (vad_enabled is False)
- with gr.Row():
- model_pth.change(  # selecting a model updates the index dropdown via get_index
- fn=get_index,
- inputs=[model_pth],
- outputs=[model_index]
- )
- model_index.change(
- fn=index_strength_show,
- inputs=[model_index],
- outputs=[index_strength]
- )
- model_refresh.click(
- fn=change_models_choices,
- inputs=[],
- outputs=[model_pth, model_index]
- )
- with gr.Row():
- unlock_full_method.change(
- fn=lambda f0_method: {"choices": [m for m in unlock_f0(f0_method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"},  # the parameter is the unlock checkbox value despite its name; rebuilds the choices without "hybrid" and resets the selection to "rmvpe"
- inputs=[unlock_full_method],
- outputs=[f0_method]
- )
- f0_method.change(
- fn=lambda f0_method: hoplength_show(f0_method, None),  # second argument is always None here
- inputs=[f0_method],
- outputs=[hop_length]
- )
- embed_mode.change(
- fn=change_embedders_mode,
- inputs=[embed_mode],
- outputs=[embedders]
- )
- with gr.Row():
- embedders.change(
- fn=lambda embedders: visible(embedders == "custom"),  # show the custom-name textbox only for the "custom" choice
- inputs=[embedders],
- outputs=[custom_embedders]
- )
- input_audio_device.change(  # first of four registrations (input, output, monitor device, monitor checkbox) that call update_audio_device with identical inputs and outputs
- fn=update_audio_device,
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]  # NOTE(review): monitor_asio_channels is listed twice (3rd and 7th); confirm update_audio_device returns seven values in this order
- )
- output_audio_device.change(
- fn=update_audio_device,
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]
- )
- with gr.Row():
- monitor_output_device.change(
- fn=update_audio_device,
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]
- )
- monitor.change(
- fn=update_audio_device,
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]
- )
- f0_autotune.change(
- fn=visible,
- inputs=[f0_autotune],
- outputs=[f0_autotune_strength]
- )
- with gr.Row():
- proposal_pitch.change(
- fn=visible,
- inputs=[proposal_pitch],
- outputs=[proposal_pitch_threshold]
- )
- vad_enabled.change(
- fn=lambda a: [visible(a) for _ in range(2)],  # one visible() update per output component
- inputs=[vad_enabled],
- outputs=[vad_sensitivity, vad_frame_ms]
- )
- refresh_audio_device.click(
- fn=change_audio_device_choices,
- inputs=[],
- outputs=[input_audio_device, output_audio_device, monitor_output_device]
- )
- with gr.Row():
- clean_audio.change(
- fn=visible,
- inputs=[clean_audio],
- outputs=[clean_strength]
- )
- start_realtime.click(
- fn=realtime_start,
- inputs=[  # Gradio passes these positionally, so the order has to match realtime_start's parameter order
- monitor,
- exclusive_mode,
- vad_enabled,
- input_audio_device,
- output_audio_device,
- monitor_output_device,
- input_audio_gain,
- output_audio_gain,
- monitor_audio_gain,
- input_asio_channels,
- output_asio_channels,
- monitor_asio_channels,
- chunk_size,
- pitch,
- model_pth,
- model_index,
- index_strength,
- onnx_f0_mode,
- f0_method,
- hop_length,
- embed_mode,
- embedders,
- custom_embedders,
- f0_autotune,
- proposal_pitch,
- f0_autotune_strength,
- proposal_pitch_threshold,
- rms_mix_rate,
- protect,
- filter_radius,
- silent_threshold,
- extra_convert_size,
- cross_fade_overlap_size,
- vad_sensitivity,
- vad_frame_ms,
- clean_audio,
- clean_strength
- ],
- outputs=[status, start_realtime, stop_realtime]  # realtime_start updates the status label and both buttons
- )
- stop_realtime.click(
- fn=realtime_stop,
- inputs=[],
- outputs=[status, start_realtime, stop_realtime]  # same three outputs as realtime_start
- )
\ No newline at end of file
diff --git a/main/app/tabs/realtime/realtime_client.py b/main/app/tabs/realtime/realtime_client.py
deleted file mode 100644
index 578045970f8b00cd2e4de2aaa6a3359d1283525e..0000000000000000000000000000000000000000
--- a/main/app/tabs/realtime/realtime_client.py
+++ /dev/null
@@ -1,210 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations, configs, model_name, index_path, method_f0, embedders_mode, embedders_model
-from main.app.core.ui import change_models_choices, get_index, index_strength_show, unlock_f0, hoplength_show, change_embedders_mode, visible, update_dropdowns_from_json, update_button_from_json
-
-def realtime_client_tab():
- with gr.TabItem(translations["realtime_client"], visible=configs.get("realtime_client_tab", True)):
- gr.Markdown(translations["realtime_markdown"])
- with gr.Row():
- gr.Markdown(translations["realtime_markdown_2"])
- with gr.Row():
- gr.Label(label=translations["realtime_latency"], value=translations["realtime_not_startup"], elem_id="realtime-status-info")
- with gr.Row():
- monitor = gr.Checkbox(label=translations["monitor"], value=False, interactive=True)
- vad_enabled = gr.Checkbox(label=translations["vad_enabled"], value=False, interactive=True)
- clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- with gr.Row():
- with gr.Accordion(translations["audio_device"], open=True):
- with gr.Row():
- input_audio_device = gr.Dropdown(label=translations["input_audio_device_label"], info=translations["input_audio_device_info"], choices=[], value=None, interactive=True)
- output_audio_device = gr.Dropdown(label=translations["output_audio_device_label"], info=translations["output_audio_device_info"], choices=[], value=None, interactive=True)
- monitor_output_device = gr.Dropdown(label=translations["monitor_output_device_label"], info=translations["monitor_output_device_info"], choices=[], value=None, interactive=True, visible=False)
- with gr.Row():
- input_audio_gain = gr.Slider(minimum=0, maximum=2500, label=translations["input_audio_gain_label"], info=translations["input_audio_gain_info"], value=100, step=1, interactive=True)
- output_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["output_audio_gain_label"], info=translations["output_audio_gain_info"], value=100, step=1, interactive=True)
- monitor_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["monitor_audio_gain_label"], info=translations["monitor_audio_gain_info"], value=100, step=1, interactive=True, visible=False)
- with gr.Row():
- refresh_audio_device = gr.Button(value=translations["refresh_audio_device"], variant="secondary")
- with gr.Row():
- start_realtime = gr.Button(value=translations["start_realtime_button"], variant="primary", interactive=True)
- stop_realtime = gr.Button(value=translations["stop_realtime_button"], variant="stop", interactive=False)
- with gr.Row():
- chunk_size = gr.Slider(minimum=2.7, maximum=2730.7, step=0.1, label=translations["chunk_size"], info=translations["chunk_size_info"], value=1024, interactive=True)
- pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["model_accordion"], open=True):
- with gr.Row():
- model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- with gr.Row():
- model_refresh = gr.Button(translations["refresh"])
- with gr.Row():
- index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
- with gr.Column():
- with gr.Accordion(translations["f0_method"], open=True):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
- f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
- with gr.Column():
- with gr.Accordion(translations["hubert_model"], open=True):
- embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
- with gr.Row():
- with gr.Accordion(translations["setting"], open=True):
- with gr.Row():
- f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
- with gr.Group():
- with gr.Row():
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value)
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
- with gr.Row():
- rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
- protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
- with gr.Row():
- clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- with gr.Column():
- silent_threshold = gr.Slider(minimum=-90, maximum=-60, label=translations["silent_threshold_label"], info=translations["silent_threshold_info"], value=-90, step=1, interactive=True)
- extra_convert_size = gr.Slider(minimum=0.1, maximum=5, label=translations["extra_convert_size_label"], info=translations["extra_convert_size_info"], value=0.5, step=0.1, interactive=True)
- cross_fade_overlap_size = gr.Slider(minimum=0.05, maximum=0.2, label=translations["cross_fade_overlap_size_label"], info=translations["cross_fade_overlap_size_info"], value=0.1, step=0.01, interactive=True)
- with gr.Row():
- vad_sensitivity = gr.Slider(minimum=0, maximum=3, label=translations["vad_sensitivity_label"], info=translations["vad_sensitivity_info"], value=3, step=1, interactive=True, visible=vad_enabled.value)
- vad_frame_ms = gr.Slider(minimum=10, maximum=30, label=translations["vad_frame_ms_label"], info=translations["vad_frame_ms_info"], value=30, step=10, interactive=True, visible=vad_enabled.value)
- with gr.Row():
- json_audio_hidden = gr.JSON(visible=False)
- json_button_hidden = gr.JSON(visible=False)
- with gr.Row():
- model_pth.change(
- fn=get_index,
- inputs=[model_pth],
- outputs=[model_index]
- )
- model_index.change(
- fn=index_strength_show,
- inputs=[model_index],
- outputs=[index_strength]
- )
- model_refresh.click(
- fn=change_models_choices,
- inputs=[],
- outputs=[model_pth, model_index]
- )
- with gr.Row():
- unlock_full_method.change(
- fn=lambda f0_method: {"choices": [m for m in unlock_f0(f0_method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"},
- inputs=[unlock_full_method],
- outputs=[f0_method]
- )
- f0_method.change(
- fn=lambda f0_method: hoplength_show(f0_method, None),
- inputs=[f0_method],
- outputs=[hop_length]
- )
- embed_mode.change(
- fn=change_embedders_mode,
- inputs=[embed_mode],
- outputs=[embedders]
- )
- with gr.Row():
- embedders.change(
- fn=lambda embedders: visible(embedders == "custom"),
- inputs=[embedders],
- outputs=[custom_embedders]
- )
- f0_autotune.change(
- fn=visible,
- inputs=[f0_autotune],
- outputs=[f0_autotune_strength]
- )
- clean_audio.change(
- fn=visible,
- inputs=[clean_audio],
- outputs=[clean_strength]
- )
- with gr.Row():
- proposal_pitch.change(
- fn=visible,
- inputs=[proposal_pitch],
- outputs=[proposal_pitch_threshold]
- )
- vad_enabled.change(
- fn=lambda a: [visible(a) for _ in range(2)],
- inputs=[vad_enabled],
- outputs=[vad_sensitivity, vad_frame_ms]
- )
- refresh_audio_device.click(
- fn=None,
- js="getAudioDevices",
- inputs=[],
- outputs=json_audio_hidden
- )
- with gr.Row():
- json_audio_hidden.change(
- fn=update_dropdowns_from_json,
- inputs=[json_audio_hidden],
- outputs=[input_audio_device, output_audio_device, monitor_output_device]
- )
- json_button_hidden.change(
- fn=update_button_from_json,
- inputs=[json_button_hidden],
- outputs=[start_realtime, stop_realtime]
- )
- with gr.Row():
- start_realtime.click(
- fn=None,
- js="StreamAudioRealtime",
- inputs=[
- monitor,
- vad_enabled,
- input_audio_device,
- output_audio_device,
- monitor_output_device,
- input_audio_gain,
- output_audio_gain,
- monitor_audio_gain,
- chunk_size,
- pitch,
- model_pth,
- model_index,
- index_strength,
- onnx_f0_mode,
- f0_method,
- hop_length,
- embed_mode,
- embedders,
- custom_embedders,
- f0_autotune,
- proposal_pitch,
- f0_autotune_strength,
- proposal_pitch_threshold,
- rms_mix_rate,
- protect,
- filter_radius,
- silent_threshold,
- extra_convert_size,
- cross_fade_overlap_size,
- vad_sensitivity,
- vad_frame_ms,
- clean_audio,
- clean_strength
- ],
- outputs=[json_button_hidden]
- )
- stop_realtime.click(
- fn=None,
- js="StopAudioStream",
- inputs=[],
- outputs=[json_button_hidden]
- )
\ No newline at end of file
diff --git a/main/app/tabs/training/child/create_dataset.py b/main/app/tabs/training/child/create_dataset.py
deleted file mode 100644
index e306c9bf2c16de041facac8ff41558653d3ed186..0000000000000000000000000000000000000000
--- a/main/app/tabs/training/child/create_dataset.py
+++ /dev/null
@@ -1,282 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.training import create_dataset
-from main.app.core.ui import visible, valueFalse_interactive, create_dataset_change
-from main.app.variables import translations, sample_rate_choice, uvr_model, reverb_models, denoise_models, vr_models, mdx_models
-
-def create_dataset_tab():
- with gr.Row():
- gr.Markdown(translations["create_dataset_markdown_2"])
- with gr.Group():
- with gr.Row():
- separate = gr.Checkbox(label=translations["separator_tab"], value=False, interactive=True)
- clean_dataset = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
- skip_seconds = gr.Checkbox(label=translations["skip"], value=False, interactive=True)
- separate_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=False)
- with gr.Row(visible=False) as row:
- enable_tta = gr.Checkbox(label=translations["enable_tta"], value=False, interactive=False)
- high_end_process = gr.Checkbox(label=translations["high_end_process"], value=False, interactive=False)
- enable_post_process = gr.Checkbox(label=translations["enable_post_process"], value=False, interactive=False)
- enable_denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False)
- with gr.Row():
- dataset_url = gr.Textbox(label=translations["url_audio"], info=translations["create_dataset_url"], value="", placeholder="https://www.youtube.com/...", interactive=True, scale=5)
- output_dataset = gr.Textbox(label=translations["output_data"], info=translations["output_data_info"], value="dataset", placeholder="dataset", interactive=True)
- with gr.Row():
- create_dataset_button = gr.Button(translations["createdataset"], variant="primary", scale=2, min_width=4000)
- with gr.Row(visible=False) as row_2:
- model_name = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True)
- reverb_model = gr.Dropdown(label=translations["dereveb_model"], value=list(reverb_models.keys())[0], choices=list(reverb_models.keys()), interactive=True)
- denoise_model = gr.Dropdown(label=translations["denoise_model"], value=list(denoise_models.keys())[0], choices=list(denoise_models.keys()), interactive=True, visible=False)
- with gr.Row():
- with gr.Column(visible=False) as row_3:
- with gr.Group():
- with gr.Row():
- overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
- with gr.Row():
- window_size = gr.Slider(label=translations["window_size"], info=translations["window_size_info"], minimum=320, maximum=1024, value=512, step=32, interactive=True, visible=False)
- with gr.Row():
- shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True)
- segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
- with gr.Row():
- batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=False)
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=8192, value=1024, step=1, interactive=True, visible=False)
- with gr.Row():
- post_process_threshold = gr.Slider(label=translations['post_process_threshold'], info=translations["post_process_threshold_info"], minimum=0.1, maximum=0.3, value=0.2, step=0.1, interactive=True, visible=False)
- aggression = gr.Slider(label=translations['aggression'], info=translations["aggression_info"], minimum=1, maximum=50, value=5, step=1, interactive=True, visible=False)
- with gr.Column():
- sample_rate = gr.Radio(choices=sample_rate_choice, value=48000, label=translations["sr"], info=translations["sr_info"], interactive=True)
- clean_strength = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.5, label=translations["clean_strength"], info=translations["clean_strength_info"], interactive=True, visible=False)
- with gr.Row():
- skip_start = gr.Textbox(label=translations["skip_start"], info=translations["skip_start_info"], value="", placeholder="0,...", interactive=True, visible=skip_seconds.value)
- skip_end = gr.Textbox(label=translations["skip_end"], info=translations["skip_end_info"], value="", placeholder="0,...", interactive=True, visible=skip_seconds.value)
- create_dataset_info = gr.Textbox(label=translations["create_dataset_info"], value="", interactive=False, lines=2)
- with gr.Row():
- separate.change(
- fn=lambda a: [visible(a) for _ in range(3)],
- inputs=[separate],
- outputs=[
- row,
- row_2,
- row_3
- ]
- )
- separate.change(
- fn=valueFalse_interactive,
- inputs=[separate],
- outputs=[separate_reverb]
- )
- separate.change(
- fn=create_dataset_change,
- inputs=[
- model_name,
- reverb_model,
- enable_post_process,
- separate_reverb,
- enable_denoise
- ],
- outputs=[
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- with gr.Row():
- model_name.change(
- fn=create_dataset_change,
- inputs=[
- model_name,
- reverb_model,
- enable_post_process,
- separate_reverb,
- enable_denoise
- ],
- outputs=[
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- reverb_model.change(
- fn=create_dataset_change,
- inputs=[
- model_name,
- reverb_model,
- enable_post_process,
- separate_reverb,
- enable_denoise
- ],
- outputs=[
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- denoise_model.change(
- fn=create_dataset_change,
- inputs=[
- model_name,
- reverb_model,
- enable_post_process,
- separate_reverb,
- enable_denoise
- ],
- outputs=[
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- with gr.Row():
- separate_reverb.change(
- fn=create_dataset_change,
- inputs=[
- model_name,
- reverb_model,
- enable_post_process,
- separate_reverb,
- enable_denoise
- ],
- outputs=[
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- enable_denoise.change(
- fn=create_dataset_change,
- inputs=[
- model_name,
- reverb_model,
- enable_post_process,
- separate_reverb,
- enable_denoise
- ],
- outputs=[
- reverb_model,
- overlap,
- segments_size,
- hop_length,
- batch_size,
- shifts,
- window_size,
- aggression,
- post_process_threshold,
- denoise_model,
- enable_tta,
- high_end_process,
- enable_post_process,
- ]
- )
- with gr.Row():
- skip_seconds.change(
- fn=lambda a: [visible(a) for _ in range(2)],
- inputs=[skip_seconds],
- outputs=[
- skip_start,
- skip_end
- ]
- )
- clean_dataset.change(
- fn=visible,
- inputs=[clean_dataset],
- outputs=[clean_strength]
- )
- with gr.Row():
- model_name.change(
- fn=lambda a: valueFalse_interactive(a in list(mdx_models.keys()) + list(vr_models.keys())),
- inputs=[model_name],
- outputs=[enable_denoise]
- )
- separate_reverb.change(
- fn=valueFalse_interactive,
- inputs=[separate_reverb],
- outputs=[enable_denoise]
- )
- with gr.Row():
- create_dataset_button.click(
- fn=create_dataset,
- inputs=[
- dataset_url,
- output_dataset,
- skip_seconds,
- skip_start,
- skip_end,
- separate,
- model_name,
- reverb_model,
- denoise_model,
- sample_rate,
- shifts,
- batch_size,
- overlap,
- aggression,
- hop_length,
- window_size,
- segments_size,
- post_process_threshold,
- enable_tta,
- enable_denoise,
- high_end_process,
- enable_post_process,
- separate_reverb,
- clean_dataset,
- clean_strength
- ],
- outputs=[create_dataset_info],
- api_name="create_dataset"
- )
\ No newline at end of file
diff --git a/main/app/tabs/training/child/create_reference.py b/main/app/tabs/training/child/create_reference.py
deleted file mode 100644
index c79fc9b6888c03c7345cb5c8329e5f33c8edeb0a..0000000000000000000000000000000000000000
--- a/main/app/tabs/training/child/create_reference.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.training import create_reference
-from main.app.core.ui import visible, change_audios_choices, unlock_f0, shutil_move, change_embedders_mode
-from main.app.variables import translations, paths_for_files, method_f0, hybrid_f0_method, file_types, configs, embedders_model, embedders_mode
-
-def create_reference_tab():
- with gr.Row():
- gr.Markdown(translations["create_reference_markdown_2"])
- with gr.Row():
- pitch_guidance = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True)
- use_energy = gr.Checkbox(label=translations["train&energy"], value=False, interactive=True)
- f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
- with gr.Row():
- create_reference_button = gr.Button(translations["create_reference"], variant="primary")
- with gr.Row():
- f0_up_key = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
- with gr.Row():
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value)
- with gr.Row():
- with gr.Column():
- with gr.Accordion(translations["input_output"], open=False):
- with gr.Column():
- input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
- reference_name = gr.Textbox(label=translations["reference_name"], value="reference", placeholder="reference", info=translations["reference_name_info"], interactive=True)
- with gr.Column():
- refresh_audio = gr.Button(translations["refresh"])
- with gr.Column():
- upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
- with gr.Column():
- play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
- with gr.Column() as f0_method_column:
- with gr.Accordion(label=translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0 = gr.Checkbox(label=translations["f0_onnx_mode"], value=False, interactive=True)
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], value=False, interactive=True)
- f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
- f0_hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=f0_method.value == "hybrid")
- with gr.Row():
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
- with gr.Column():
- with gr.Accordion(label=translations["hubert_model"], open=False):
- with gr.Row():
- version = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True)
- with gr.Group():
- embedder_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- with gr.Row():
- embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
- with gr.Row():
- create_reference_info = gr.Textbox(label=translations["reference_info"], value="", interactive=False, lines=2)
- with gr.Row():
- f0_autotune.change(fn=visible, inputs=[f0_autotune], outputs=[f0_autotune_strength])
- proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
- unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[f0_method])
- with gr.Row():
- input_audio.change(fn=lambda audio: audio, inputs=[input_audio], outputs=[play_audio])
- refresh_audio.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
- f0_method.change(fn=lambda method: [visible(method == "hybrid") for _ in range(2)], inputs=[f0_method], outputs=[f0_hybrid_method, alpha])
- with gr.Row():
- upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio], outputs=[input_audio])
- embedder_mode.change(fn=change_embedders_mode, inputs=[embedder_mode], outputs=[embedders])
- embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[embedders_custom])
- with gr.Row():
- pitch_guidance.change(fn=visible, inputs=[pitch_guidance], outputs=[f0_method_column])
- create_reference_button.click(
- fn=create_reference,
- inputs=[
- input_audio,
- reference_name,
- pitch_guidance,
- use_energy,
- version,
- embedders,
- embedder_mode,
- f0_method,
- onnx_f0,
- f0_up_key,
- filter_radius,
- f0_autotune,
- f0_autotune_strength,
- proposal_pitch,
- proposal_pitch_threshold,
- alpha
- ],
- outputs=[create_reference_info],
- api_name="create_reference"
- )
\ No newline at end of file
diff --git a/main/app/tabs/training/child/training.py b/main/app/tabs/training/child/training.py
deleted file mode 100644
index 3eb94e244f39919fefbc1363f738651d85b434f4..0000000000000000000000000000000000000000
--- a/main/app/tabs/training/child/training.py
+++ /dev/null
@@ -1,259 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.core.process import zip_file
-from main.app.core.training import preprocess, extract, create_index, training
-from main.app.variables import translations, model_name, index_path, method_f0, embedders_mode, embedders_model, pretrainedD, pretrainedG, config, file_types, hybrid_f0_method, reference_list
-from main.app.core.ui import gr_warning, visible, unlock_f0, hoplength_show, change_models_choices, get_gpu_info, change_embedders_mode, pitch_guidance_lock, vocoders_lock, unlock_ver, unlock_vocoder, change_pretrained_choices, gpu_number_str, shutil_move, change_reference_choices
-
-def training_model_tab():
- """Declare the widgets of the model-training tab and wire their event handlers.
-
- The widgets feed the callbacks imported at the top of this module:
- preprocess, extract, create_index and training, plus zip_file for export.
- Nothing is returned; the function is called for its side effects.
-
- NOTE(review): indentation is collapsed in this view, so the nesting of the
- Row/Column/Accordion containers below cannot be read off the columns.
- """
- with gr.Row():
- gr.Markdown(translations["training_markdown"])
- with gr.Row():
- with gr.Column():
- with gr.Row():
- with gr.Column():
- # Model identity and preprocessing options (inputs of the preprocess handler below).
- training_name = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
- training_sr = gr.Radio(label=translations["sample_rate"], info=translations["sample_rate_info"], choices=["32k", "40k", "48k"], value="48k", interactive=True)
- training_ver = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True)
- with gr.Row():
- clean_dataset = gr.Checkbox(label=translations["clear_dataset"], value=False, interactive=True)
- process_effects = gr.Checkbox(label=translations["preprocess_effect"], value=False, interactive=True)
- training_f0 = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True)
- custom_reference = gr.Checkbox(label=translations["custom_reference"], value=False, interactive=True)
- checkpointing1 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
- upload = gr.Checkbox(label=translations["upload_dataset"], value=False, interactive=True)
- with gr.Row():
- preprocess_split_audio_mode = gr.Radio(label=translations["split_audio_mode"], info=translations["split_audio_mode_info"], value="Automatic", choices=["Automatic", "Simple", "Skip"], interactive=True)
- preprocess_normalization_mode = gr.Radio(label=translations["normalization_mode"], info=translations["normalization_mode_info"], value="none", choices=["none", "pre", "post"], interactive=True)
- # Rows whose initial visibility mirrors the initial value of their controlling checkbox.
- with gr.Row(visible=custom_reference.value) as custom_reference_row:
- with gr.Accordion(translations["custom_reference"], open=True):
- reference_name = gr.Dropdown(label=translations["reference_name"], info=translations["reference_name_info"], choices=reference_list, value=reference_list[0] if len(reference_list) >= 1 else "", allow_custom_value=True, interactive=True)
- reference_refresh = gr.Button(translations["refresh"], scale=2)
- with gr.Row(visible=clean_dataset.value) as clean_dataset_row:
- clean_dataset_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.7, step=0.1, interactive=True)
- with gr.Column():
- preprocess_button = gr.Button(translations["preprocess_button"], scale=2)
- upload_dataset = gr.Files(label=translations["drop_audio"], file_types=file_types, visible=upload.value)
- preprocess_info = gr.Textbox(label=translations["preprocess_info"], value="", interactive=False, container=True, lines=2)
- with gr.Column():
- with gr.Row():
- with gr.Column():
- # F0 extraction settings (inputs of the extract handler below).
- with gr.Accordion(label=translations["f0_method"], open=False):
- with gr.Group():
- with gr.Row():
- onnx_f0_mode2 = gr.Checkbox(label=translations["f0_onnx_mode"], value=False, interactive=True)
- unlock_full_method4 = gr.Checkbox(label=translations["f0_unlock"], value=False, interactive=True)
- autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
- extract_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
- extract_hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=extract_method.value == "hybrid")
- extract_hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
- # Embedder mode/model selection; the custom-name textbox is shown only for "custom".
- with gr.Accordion(label=translations["hubert_model"], open=False):
- with gr.Group():
- embed_mode2 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
- extract_embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
- with gr.Row():
- extract_embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=extract_embedders.value == "custom")
- with gr.Column():
- extract_button = gr.Button(translations["extract_button"], scale=2)
- extract_info = gr.Textbox(label=translations["extract_info"], value="", interactive=False, lines=2)
- with gr.Column():
- with gr.Row():
- with gr.Column():
- # Epoch counts plus the index-creation and training buttons.
- total_epochs = gr.Slider(label=translations["total_epoch"], info=translations["total_epoch_info"], minimum=1, maximum=10000, value=300, step=1, interactive=True)
- save_epochs = gr.Slider(label=translations["save_epoch"], info=translations["save_epoch_info"], minimum=1, maximum=10000, value=50, step=1, interactive=True)
- with gr.Column():
- index_button = gr.Button(f"3. {translations['create_index']}", variant="primary", scale=2)
- training_button = gr.Button(f"4. {translations['training_model']}", variant="primary", scale=2)
- with gr.Row():
- # Collapsed settings accordion: index algorithm, training flags, dataset path, hardware.
- with gr.Accordion(label=translations["setting"], open=False):
- with gr.Row():
- index_algorithm = gr.Radio(label=translations["index_algorithm"], info=translations["index_algorithm_info"], choices=["Auto", "Faiss", "KMeans"], value="Auto", interactive=True)
- with gr.Row():
- cache_in_gpu = gr.Checkbox(label=translations["cache_in_gpu"], info=translations["cache_in_gpu_info"], value=True, interactive=True)
- rms_extract = gr.Checkbox(label=translations["train&energy"], info=translations["train&energy_info"], value=False, interactive=True)
- overtraining_detector = gr.Checkbox(label=translations["overtraining_detector"], info=translations["overtraining_detector_info"], value=False, interactive=True)
- with gr.Row():
- custom_dataset = gr.Checkbox(label=translations["custom_dataset"], info=translations["custom_dataset_info"], value=False, interactive=True)
- save_only_latest = gr.Checkbox(label=translations["save_only_latest"], info=translations["save_only_latest_info"], value=True, interactive=True)
- save_every_weights = gr.Checkbox(label=translations["save_every_weights"], info=translations["save_every_weights_info"], value=True, interactive=True)
- with gr.Row():
- clean_up = gr.Checkbox(label=translations["cleanup_training"], info=translations["cleanup_training_info"], value=False, interactive=True)
- not_use_pretrain = gr.Checkbox(label=translations["not_use_pretrain_2"], info=translations["not_use_pretrain_info"], value=False, interactive=True)
- custom_pretrain = gr.Checkbox(label=translations["custom_pretrain"], info=translations["custom_pretrain_info"], value=False, interactive=True)
- with gr.Column():
- dataset_path = gr.Textbox(label=translations["dataset_folder"], value="dataset", interactive=True, visible=custom_dataset.value)
- with gr.Column():
- with gr.Row(visible=False) as simple_option:
- chunk_len = gr.Slider(minimum=0.5, maximum=5.0, value=3.0, step=0.1, label=translations["chunk_length"], info=translations["chunk_length_info"], interactive=True)
- overlap_len = gr.Slider(minimum=0.0, maximum=0.4, value=0.3, step=0.1, label=translations["overlap_length"], info=translations["overlap_length_info"], interactive=True)
- threshold = gr.Slider(minimum=1, maximum=100, value=50, step=1, label=translations["threshold"], interactive=True, visible=overtraining_detector.value)
- with gr.Accordion(translations["setting_cpu_gpu"], open=False):
- with gr.Column():
- gpu_number = gr.Textbox(label=translations["gpu_number"], value=gpu_number_str(), info=translations["gpu_number_info"], interactive=True)
- gpu_info = gr.Textbox(label=translations["gpu_info"], value=get_gpu_info(), info=translations["gpu_info_2"], interactive=False)
- # NOTE(review): os.cpu_count() is documented as possibly returning None; maximum/value would then be None - confirm this is acceptable.
- cpu_core = gr.Slider(label=translations["cpu_core"], info=translations["cpu_core_info"], minimum=1, maximum=os.cpu_count(), value=os.cpu_count(), step=1, interactive=True)
- train_batch_size = gr.Slider(label=translations["batch_size"], info=translations["batch_size_info"], minimum=1, maximum=64, value=8, step=1, interactive=True)
- with gr.Group():
- multiscale_mel_loss = gr.Checkbox(label=translations["multiscale_mel_loss"], info=translations["multiscale_mel_loss_info"], value=False, interactive=True)
- vocoders = gr.Radio(label=translations["vocoder"], info=translations["vocoder_info"], choices=["Default", "MRF-HiFi-GAN", "RefineGAN"], value="Default", interactive=True)
- with gr.Row():
- # These two checkboxes are editable only when the configured device string starts with "cuda".
- deterministic = gr.Checkbox(label=translations["deterministic"], info=translations["deterministic_info"], value=False, interactive=config.device.startswith("cuda"))
- benchmark = gr.Checkbox(label=translations["benchmark"], info=translations["benchmark_info"], value=False, interactive=config.device.startswith("cuda"))
- with gr.Row():
- optimizer = gr.Radio(label=translations["optimizer"], info=translations["optimizer_info"], value="AdamW", choices=["AdamW", "RAdam", "AnyPrecisionAdamW"], interactive=True)
- with gr.Row():
- model_author = gr.Textbox(label=translations["training_author"], info=translations["training_author_info"], value="", placeholder=translations["training_author"], interactive=True)
- with gr.Row():
- with gr.Column():
- # Custom pretrained D/G pickers; initially shown only if custom_pretrain is set and not_use_pretrain is not.
- with gr.Accordion(translations["custom_pretrain_info"], open=False, visible=custom_pretrain.value and not not_use_pretrain.value) as pretrain_setting:
- pretrained_D = gr.Dropdown(label=translations["pretrain_file"].format(dg="D"), choices=pretrainedD, value=pretrainedD[0] if len(pretrainedD) > 0 else '', interactive=True, allow_custom_value=True)
- pretrained_G = gr.Dropdown(label=translations["pretrain_file"].format(dg="G"), choices=pretrainedG, value=pretrainedG[0] if len(pretrainedG) > 0 else '', interactive=True, allow_custom_value=True)
- refresh_pretrain = gr.Button(translations["refresh"], scale=2)
- with gr.Row():
- training_info = gr.Textbox(label=translations["train_info"], value="", interactive=False, lines=3)
- with gr.Row():
- with gr.Column():
- # Export section: choose a model and index file and zip them.
- with gr.Accordion(translations["export_model"], open=False):
- with gr.Row():
- model_file = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
- index_file = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
- with gr.Row():
- refresh_file = gr.Button(f"1. {translations['refresh']}", scale=2)
- zip_model = gr.Button(translations["zip_model"], variant="primary", scale=2)
- with gr.Row():
- zip_output = gr.File(label=translations["output_zip"], file_types=[".zip"], interactive=False, visible=False)
- # ---- Event wiring: everything below only registers handlers on the widgets declared above. ----
- with gr.Row():
- vocoders.change(fn=pitch_guidance_lock, inputs=[vocoders], outputs=[training_f0])
- training_f0.change(fn=vocoders_lock, inputs=[training_f0, vocoders], outputs=[vocoders])
- unlock_full_method4.change(fn=unlock_f0, inputs=[unlock_full_method4], outputs=[extract_method])
- with gr.Row():
- refresh_file.click(fn=change_models_choices, inputs=[], outputs=[model_file, index_file])
- zip_model.click(fn=zip_file, inputs=[training_name, model_file, index_file], outputs=[zip_output])
- # NOTE(review): this fires on every change of the textbox, and os.makedirs("") raises FileNotFoundError when the box is emptied - confirm intended.
- dataset_path.change(fn=lambda folder: os.makedirs(folder, exist_ok=True), inputs=[dataset_path], outputs=[])
- with gr.Row():
- upload.change(fn=visible, inputs=[upload], outputs=[upload_dataset])
- overtraining_detector.change(fn=visible, inputs=[overtraining_detector], outputs=[threshold])
- clean_dataset.change(fn=visible, inputs=[clean_dataset], outputs=[clean_dataset_row])
- with gr.Row():
- # dataset_path is listed twice on purpose: the lambda returns two values for it, the result of visible(...) and the literal "dataset".
- custom_dataset.change(fn=lambda custom_dataset: [visible(custom_dataset), "dataset"],inputs=[custom_dataset], outputs=[dataset_path, dataset_path])
- training_ver.change(fn=unlock_vocoder, inputs=[training_ver, vocoders], outputs=[vocoders])
- # Second handler on vocoders.change (the first one, above, targets training_f0).
- vocoders.change(fn=unlock_ver, inputs=[training_ver, vocoders], outputs=[training_ver])
- with gr.Row():
- custom_reference.change(fn=visible, inputs=[custom_reference], outputs=[custom_reference_row])
- extract_method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[extract_method, extract_hybrid_method], outputs=[extract_hybrid_method, alpha, extract_hop_length])
- extract_hybrid_method.change(fn=hoplength_show, inputs=[extract_method, extract_hybrid_method], outputs=[extract_hop_length])
- with gr.Row():
- autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
- preprocess_split_audio_mode.change(fn=lambda a: visible(a == "Simple"), inputs=[preprocess_split_audio_mode], outputs=[simple_option])
- # Move each uploaded file into the dataset folder under its own base name; with an empty folder string, call gr_warning instead.
- upload_dataset.upload(
- fn=lambda files, folder: [shutil_move(f.name, os.path.join(folder, os.path.split(f.name)[1])) for f in files] if folder != "" else gr_warning(translations["dataset_folder1"]),
- inputs=[upload_dataset, dataset_path],
- outputs=[],
- api_name="upload_dataset"
- )
- with gr.Row():
- # Both handlers take (custom_pretrain, not_use_pretrain) in that order, so a = custom_pretrain and b = not_use_pretrain.
- not_use_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
- custom_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
- refresh_pretrain.click(fn=change_pretrained_choices, inputs=[], outputs=[pretrained_D, pretrained_G])
- with gr.Row():
- # Preprocess button: the callback's return value is written to preprocess_info.
- preprocess_button.click(
- fn=preprocess,
- inputs=[
- training_name,
- training_sr,
- cpu_core,
- preprocess_split_audio_mode,
- process_effects,
- dataset_path,
- clean_dataset,
- clean_dataset_strength,
- chunk_len,
- overlap_len,
- preprocess_normalization_mode
- ],
- outputs=[preprocess_info],
- api_name="preprocess"
- )
- with gr.Row():
- embed_mode2.change(fn=change_embedders_mode, inputs=[embed_mode2], outputs=[extract_embedders])
- extract_embedders.change(fn=lambda extract_embedders: visible(extract_embedders == "custom"), inputs=[extract_embedders], outputs=[extract_embedders_custom])
- reference_refresh.click(fn=change_reference_choices, inputs=[], outputs=[reference_name])
- with gr.Row():
- # Extract button: the callback's return value is written to extract_info.
- extract_button.click(
- fn=extract,
- inputs=[
- training_name,
- training_ver,
- extract_method,
- training_f0,
- extract_hop_length,
- cpu_core,
- gpu_number,
- training_sr,
- extract_embedders,
- extract_embedders_custom,
- onnx_f0_mode2,
- embed_mode2,
- autotune,
- f0_autotune_strength,
- extract_hybrid_method,
- rms_extract,
- alpha
- ],
- outputs=[extract_info],
- api_name="extract"
- )
- with gr.Row():
- # Index button: shares the training_info textbox with the training button below.
- index_button.click(
- fn=create_index,
- inputs=[
- training_name,
- training_ver,
- index_algorithm
- ],
- outputs=[training_info],
- api_name="create_index"
- )
- with gr.Row():
- # Training button: input order must match the positional parameters of training() - presumably; verify against its definition.
- training_button.click(
- fn=training,
- inputs=[
- training_name,
- training_ver,
- save_epochs,
- save_only_latest,
- save_every_weights,
- total_epochs,
- training_sr,
- train_batch_size,
- gpu_number,
- training_f0,
- not_use_pretrain,
- custom_pretrain,
- pretrained_G,
- pretrained_D,
- overtraining_detector,
- threshold,
- clean_up,
- cache_in_gpu,
- model_author,
- vocoders,
- checkpointing1,
- deterministic,
- benchmark,
- optimizer,
- rms_extract,
- custom_reference,
- reference_name,
- multiscale_mel_loss
- ],
- outputs=[training_info],
- api_name="training_model"
- )
\ No newline at end of file
diff --git a/main/app/tabs/training/training.py b/main/app/tabs/training/training.py
deleted file mode 100644
index e4cea85bc853f52130996e8f41c4b3b8c9bf090c..0000000000000000000000000000000000000000
--- a/main/app/tabs/training/training.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import os
-import sys
-
-import gradio as gr
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations, configs
-from main.app.tabs.training.child.training import training_model_tab
-from main.app.tabs.training.child.create_dataset import create_dataset_tab
-from main.app.tabs.training.child.create_reference import create_reference_tab
-
-def training_tab():
-    """Render the training tab together with its three sub-tabs.
-
-    Every tab reads its visibility flag from ``configs`` and is shown when the
-    flag is absent. The sub-tabs host, in order, dataset creation, reference
-    creation and model training; each one gets a Markdown heading followed by
-    the widgets built by its dedicated function.
-    """
-    # NOTE(review): nesting reconstructed from a whitespace-collapsed source
-    # (three sub-tabs inside the outer tab) - confirm against the real file.
-    show_parent = configs.get("create_and_training_tab", True)
-    with gr.TabItem(translations["training_model"], visible=show_parent):
-        show_dataset = configs.get("create_dataset_tab", True)
-        with gr.TabItem(translations["createdataset"], visible=show_dataset):
-            gr.Markdown(translations["create_dataset_markdown"])
-            create_dataset_tab()
-
-        show_reference = configs.get("create_reference_tab", True)
-        with gr.TabItem(translations["create_reference"], visible=show_reference):
-            gr.Markdown(translations["create_reference_markdown"])
-            create_reference_tab()
-
-        show_training = configs.get("training_tab", True)
-        with gr.TabItem(translations["training_model"], visible=show_training):
-            heading = f"## {translations['training_model']}"
-            gr.Markdown(heading)
-            training_model_tab()
\ No newline at end of file
diff --git a/main/app/variables.py b/main/app/variables.py
deleted file mode 100644
index df1ae15415817a1c5e55ead9c908c6d1b2382a0d..0000000000000000000000000000000000000000
--- a/main/app/variables.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import os
-import sys
-import csv
-import json
-import codecs
-import logging
-import urllib.request
-import logging.handlers
-
-sys.path.append(os.getcwd())
-
-from main.configs.config import Config
-
-# Shared application state, built once at import time: logger, parsed config,
-# static choice lists, directory scans and the downloadable-model table.
-# NOTE(review): block nesting was reconstructed from a whitespace-collapsed
-# source - confirm against the real file before applying.
-
-# Propagation is off, so records reach only the handlers attached below.
-logger = logging.getLogger(__name__)
-logger.propagate = False
-
-config = Config()
-python = sys.executable
-translations = config.translations
-configs_json = os.path.join("main", "configs", "config.json")
-
-# Read the JSON config through a context manager so the handle is closed.
-with open(configs_json, "r", encoding="utf-8") as f:
-    configs = json.load(f)
-
-# Attach a console handler and a rotating file handler, once per logger.
-if not logger.hasHandlers():
-    console_handler = logging.StreamHandler()
-    console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-    console_handler.setFormatter(console_formatter)
-    console_handler.setLevel(logging.DEBUG if config.debug_mode else logging.INFO)
-    file_handler = logging.handlers.RotatingFileHandler(os.path.join(configs["logs_path"], "app.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
-    file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-    file_handler.setFormatter(file_formatter)
-    file_handler.setLevel(logging.DEBUG)
-    logger.addHandler(console_handler)
-    logger.addHandler(file_handler)
-    logger.setLevel(logging.DEBUG)
-
-# On these devices fp16 is switched off and the change is written back to disk.
-if config.device in ["cpu", "mps", "ocl:0"] and configs.get("fp16", False):
-    logger.warning(translations["fp16_not_support"])
-    configs["fp16"] = config.is_half = False
-
-    with open(configs_json, "w", encoding="utf-8") as f:
-        json.dump(configs, f, indent=4)
-
-# Filled at the bottom of the module: model file name -> download URL.
-models = {}
-model_options = {}
-
-method_f0 = ["mangio-crepe-full", "crepe-full", "fcpe", "rmvpe", "harvest", "pyin", "hybrid"]
-method_f0_full = ["pm-ac", "pm-cc", "pm-shs", "dio", "mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "crepe-tiny", "crepe-small", "crepe-medium", "crepe-large", "crepe-full", "fcpe", "fcpe-legacy", "fcpe-previous", "rmvpe", "rmvpe-clipping", "rmvpe-medfilt", "rmvpe-clipping-medfilt", "harvest", "yin", "pyin", "swipe", "piptrack", "penn", "mangio-penn", "djcm", "djcm-clipping", "djcm-medfilt", "djcm-clipping-medfilt", "swift", "pesto", "hybrid"]
-hybrid_f0_method = ["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"]
-
-embedders_mode = ["fairseq", "onnx", "transformers", "spin", "whisper"]
-embedders_model = ["contentvec_base", "hubert_base", "vietnamese_hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "custom"]
-spin_model = ["spin-v1", "spin-v2"]
-whisper_model = ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"]
-
-# Defined ahead of the directory scans so the audio scan reuses this single
-# extension list instead of repeating it.
-file_types = [".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"]
-export_format_choices = ["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"]
-
-# Directory scans; each raises if the configured directory does not exist.
-paths_for_files = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["audios_path"]) for f in files if os.path.splitext(f)[1].lower() in file_types])
-# os.path.isdir() is already False for a missing path, so no separate exists() check.
-reference_list = sorted(name for name in os.listdir(configs["reference_path"]) if os.path.isdir(os.path.join(configs["reference_path"], name)))
-model_name = sorted(model for model in os.listdir(configs["weights_path"]) if model.endswith((".pth", ".onnx")) and not model.startswith(("G_", "D_")))
-index_path = sorted([os.path.join(root, name) for root, _, files in os.walk(configs["logs_path"], topdown=False) for name in files if name.endswith(".index") and "trained" not in name])
-
-pretrainedD = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "D" in model]
-pretrainedG = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "G" in model]
-
-presets_file = sorted(f for f in os.listdir(configs["presets_path"]) if f.endswith(".conversion.json"))
-audio_effect_presets_file = sorted(f for f in os.listdir(configs["presets_path"]) if f.endswith(".effect.json"))
-f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["f0_path"]) for f in files if f.endswith(".txt")])
-
-language = configs.get("language", "vi-VN")
-theme = configs.get("theme", "NoCrypt/miku")
-
-edgetts = configs.get("edge_tts", ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"])
-google_tts_voice = configs.get("google_tts_voice", ["vi", "en"])
-
-# config.json stores these as JSON objects, so the fallback is an empty dict:
-# the old "" fallback made the .keys() calls below raise AttributeError
-# whenever a key was missing.
-vr_models = configs.get("vr_models", {})
-demucs_models = configs.get("demucs_models", {})
-mdx_models = configs.get("mdx_models", {})
-karaoke_models = configs.get("karaoke_models", {})
-reverb_models = configs.get("reverb_models", {})
-denoise_models = configs.get("denoise_models", {})
-uvr_model = list(demucs_models.keys()) + list(vr_models.keys()) + list(mdx_models.keys())
-
-font = configs.get("font", "https://fonts.googleapis.com/css2?family=Courgette&display=swap")
-sample_rate_choice = [8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000]
-csv_path = configs["csv_path"]
-
-# With --allow_all_disk on Windows, collect every logical drive; pywin32 is
-# installed on demand when its import fails.
-if "--allow_all_disk" in sys.argv and sys.platform == "win32":
-    try:
-        import win32api
-    except ImportError:
-        os.system(f"{python} -m pip install pywin32")
-        import win32api
-
-    allow_disk = win32api.GetLogicalDriveStrings().split('\x00')[:-1]
-else:
-    allow_disk = []
-
-# Best-effort load of the model table: use the cached CSV when it exists,
-# otherwise download it and cache it. A failure leaves `models` as filled so far.
-try:
-    if os.path.exists(csv_path):
-        with open(csv_path, newline='', encoding='utf-8') as csv_file:
-            reader = list(csv.DictReader(csv_file))
-    else:
-        # The rot13 string decodes to a Google Sheets CSV-export URL.
-        with urllib.request.urlopen(codecs.decode("uggcf://qbpf.tbbtyr.pbz/fcernqfurrgf/q/1gNHnDeRULtEfz1Yieaw14USUQjWJy0Oq9k0DrCrjApb/rkcbeg?sbezng=pfi&tvq=1977693859", "rot13")) as response:
-            reader = list(csv.DictReader([line.decode('utf-8') for line in response.readlines()]))
-
-        # Cache only a non-empty table; the with-block closes the file so the
-        # rows are flushed immediately rather than at interpreter exit.
-        if reader:
-            with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
-                writer = csv.DictWriter(csv_file, fieldnames=reader[0].keys())
-                writer.writeheader()
-                writer.writerows(reader)
-
-    for row in reader:
-        filename = row['Filename']
-        # The first cell mentioning "huggingface" is taken as the download URL.
-        url = next((value for value in row.values() if isinstance(value, str) and "huggingface" in value), None)
-
-        if url:
-            models[filename] = url
-except Exception as error:
-    # Still best-effort, but record the reason and let KeyboardInterrupt and
-    # SystemExit propagate (the old bare except swallowed them).
-    logger.debug("Could not load the model list from %s: %s", csv_path, error)
\ No newline at end of file
diff --git a/main/configs/config.json b/main/configs/config.json
deleted file mode 100644
index 1bd722d36a05023f7c46033d5f2bbc23416154c0..0000000000000000000000000000000000000000
--- a/main/configs/config.json
+++ /dev/null
@@ -1,622 +0,0 @@
-{
- "language": "vi-VN",
- "support_language": [
- "en-US",
- "vi-VN"
- ],
- "theme": "NoCrypt/miku",
- "themes": [
- "NoCrypt/miku",
- "gstaff/xkcd",
- "JohnSmith9982/small_and_pretty",
- "ParityError/Interstellar",
- "earneleh/paris",
- "shivi/calm_seafoam",
- "Hev832/Applio",
- "YTheme/Minecraft",
- "gstaff/sketch",
- "SebastianBravo/simci_css",
- "allenai/gradio-theme",
- "Nymbo/Nymbo_Theme_5",
- "lone17/kotaemon",
- "Zarkel/IBM_Carbon_Theme",
- "SherlockRamos/Feliz",
- "freddyaboulton/dracula_revamped",
- "freddyaboulton/bad-theme-space",
- "gradio/dracula_revamped",
- "abidlabs/dracula_revamped",
- "gradio/dracula_test",
- "gradio/seafoam",
- "gradio/glass",
- "gradio/monochrome",
- "gradio/soft",
- "gradio/default",
- "gradio/base",
- "abidlabs/pakistan",
- "dawood/microsoft_windows",
- "ysharma/steampunk",
- "ysharma/huggingface",
- "abidlabs/Lime",
- "freddyaboulton/this-theme-does-not-exist-2",
- "aliabid94/new-theme",
- "aliabid94/test2",
- "aliabid94/test3",
- "aliabid94/test4",
- "abidlabs/banana",
- "freddyaboulton/test-blue",
- "gstaff/whiteboard",
- "ysharma/llamas",
- "abidlabs/font-test",
- "YenLai/Superhuman",
- "bethecloud/storj_theme",
- "sudeepshouche/minimalist",
- "knotdgaf/gradiotest",
- "ParityError/Anime",
- "Ajaxon6255/Emerald_Isle",
- "ParityError/LimeFace",
- "finlaymacklon/smooth_slate",
- "finlaymacklon/boxy_violet",
- "derekzen/stardust",
- "EveryPizza/Cartoony-Gradio-Theme",
- "Ifeanyi/Cyanister",
- "Tshackelton/IBMPlex-DenseReadable",
- "snehilsanyal/scikit-learn",
- "Himhimhim/xkcd",
- "nota-ai/theme",
- "rawrsor1/Everforest",
- "rottenlittlecreature/Moon_Goblin",
- "abidlabs/test-yellow",
- "abidlabs/test-yellow3",
- "idspicQstitho/dracula_revamped",
- "kfahn/AnimalPose",
- "HaleyCH/HaleyCH_Theme",
- "simulKitke/dracula_test",
- "braintacles/CrimsonNight",
- "wentaohe/whiteboardv2",
- "reilnuud/polite",
- "remilia/Ghostly",
- "Franklisi/darkmode",
- "coding-alt/soft",
- "xiaobaiyuan/theme_land",
- "step-3-profit/Midnight-Deep",
- "xiaobaiyuan/theme_demo",
- "Taithrah/Minimal",
- "Insuz/SimpleIndigo",
- "zkunn/Alipay_Gradio_theme",
- "Insuz/Mocha",
- "xiaobaiyuan/theme_brief",
- "Ama434/434-base-Barlow",
- "Ama434/def_barlow",
- "Ama434/neutral-barlow",
- "dawood/dracula_test",
- "nuttea/Softblue",
- "BlueDancer/Alien_Diffusion",
- "naughtondale/monochrome",
- "Dagfinn1962/standard",
- "default"
- ],
- "mdx_models": {
- "Main_340": "UVR-MDX-NET_Main_340.onnx",
- "Main_390": "UVR-MDX-NET_Main_390.onnx",
- "Main_406": "UVR-MDX-NET_Main_406.onnx",
- "Main_427": "UVR-MDX-NET_Main_427.onnx",
- "Main_438": "UVR-MDX-NET_Main_438.onnx",
- "Inst_full_292": "UVR-MDX-NET-Inst_full_292.onnx",
- "Inst_HQ_1": "UVR-MDX-NET-Inst_HQ_1.onnx",
- "Inst_HQ_2": "UVR-MDX-NET-Inst_HQ_2.onnx",
- "Inst_HQ_3": "UVR-MDX-NET-Inst_HQ_3.onnx",
- "Inst_HQ_4": "UVR-MDX-NET-Inst_HQ_4.onnx",
- "Inst_HQ_5": "UVR-MDX-NET-Inst_HQ_5.onnx",
- "Kim_Vocal_1": "Kim_Vocal_1.onnx",
- "Kim_Vocal_2": "Kim_Vocal_2.onnx",
- "Kim_Inst": "Kim_Inst.onnx",
- "Inst_187_beta": "UVR-MDX-NET_Inst_187_beta.onnx",
- "Inst_82_beta": "UVR-MDX-NET_Inst_82_beta.onnx",
- "Inst_90_beta": "UVR-MDX-NET_Inst_90_beta.onnx",
- "Voc_FT": "UVR-MDX-NET-Voc_FT.onnx",
- "Crowd_HQ": "UVR-MDX-NET_Crowd_HQ_1.onnx",
- "MDXNET_9482": "UVR_MDXNET_9482.onnx",
- "Inst_1": "UVR-MDX-NET-Inst_1.onnx",
- "Inst_2": "UVR-MDX-NET-Inst_2.onnx",
- "Inst_3": "UVR-MDX-NET-Inst_3.onnx",
- "MDXNET_1_9703": "UVR_MDXNET_1_9703.onnx",
- "MDXNET_2_9682": "UVR_MDXNET_2_9682.onnx",
- "MDXNET_3_9662": "UVR_MDXNET_3_9662.onnx",
- "Inst_Main": "UVR-MDX-NET-Inst_Main.onnx",
- "MDXNET_Main": "UVR_MDXNET_Main.onnx"
- },
- "demucs_models": {
- "HT-Tuned": "htdemucs_ft.yaml",
- "HT-Normal": "htdemucs.yaml",
- "HD_MMI": "hdemucs_mmi.yaml",
- "HT_6S": "htdemucs_6s.yaml"
- },
- "vr_models": {
- "HP-1": "1_HP-UVR.pth",
- "HP-2": "2_HP-UVR.pth",
- "HP-Vocal-1": "3_HP-Vocal-UVR.pth",
- "HP-Vocal-2": "4_HP-Vocal-UVR.pth",
- "HP2-1": "7_HP2-UVR.pth",
- "HP2-2": "8_HP2-UVR.pth",
- "HP2-3": "9_HP2-UVR.pth",
- "SP-2B-1": "10_SP-UVR-2B-32000-1.pth",
- "SP-2B-2": "11_SP-UVR-2B-32000-2.pth",
- "SP-3B-1": "12_SP-UVR-3B-44100.pth",
- "SP-4B-1": "13_SP-UVR-4B-44100-1.pth",
- "SP-4B-2": "14_SP-UVR-4B-44100-2.pth",
- "SP-MID-1": "15_SP-UVR-MID-44100-1.pth",
- "SP-MID-2": "16_SP-UVR-MID-44100-2.pth"
- },
- "karaoke_models": {
- "MDX-Version-1": "UVR_MDXNET_KARA.onnx",
- "MDX-Version-2": "UVR_MDXNET_KARA_2.onnx",
- "VR-Version-1": "5_HP-Karaoke-UVR.pth",
- "VR-Version-2": "6_HP-Karaoke-UVR.pth"
- },
- "reverb_models": {
- "MDX-Reverb": "Reverb_HQ_By_FoxJoy.onnx",
- "VR-Reverb": "UVR-DeEcho-DeReverb.pth",
- "Echo-Aggressive": "UVR-De-Echo-Aggressive.pth",
- "Echo-Normal": "UVR-De-Echo-Normal.pth"
- },
- "denoise_models": {
- "Lite": "UVR-DeNoise-Lite.pth",
- "Normal": "UVR-DeNoise.pth"
- },
- "edge_tts": [
- "af-ZA-AdriNeural",
- "af-ZA-WillemNeural",
- "sq-AL-AnilaNeural",
- "sq-AL-IlirNeural",
- "am-ET-AmehaNeural",
- "am-ET-MekdesNeural",
- "ar-DZ-AminaNeural",
- "ar-DZ-IsmaelNeural",
- "ar-BH-AliNeural",
- "ar-BH-LailaNeural",
- "ar-EG-SalmaNeural",
- "ar-EG-ShakirNeural",
- "ar-IQ-BasselNeural",
- "ar-IQ-RanaNeural",
- "ar-JO-SanaNeural",
- "ar-JO-TaimNeural",
- "ar-KW-FahedNeural",
- "ar-KW-NouraNeural",
- "ar-LB-LaylaNeural",
- "ar-LB-RamiNeural",
- "ar-LY-ImanNeural",
- "ar-LY-OmarNeural",
- "ar-MA-JamalNeural",
- "ar-MA-MounaNeural",
- "ar-OM-AbdullahNeural",
- "ar-OM-AyshaNeural",
- "ar-QA-AmalNeural",
- "ar-QA-MoazNeural",
- "ar-SA-HamedNeural",
- "ar-SA-ZariyahNeural",
- "ar-SY-AmanyNeural",
- "ar-SY-LaithNeural",
- "ar-TN-HediNeural",
- "ar-TN-ReemNeural",
- "ar-AE-FatimaNeural",
- "ar-AE-HamdanNeural",
- "ar-YE-MaryamNeural",
- "ar-YE-SalehNeural",
- "az-AZ-BabekNeural",
- "az-AZ-BanuNeural",
- "bn-BD-NabanitaNeural",
- "bn-BD-PradeepNeural",
- "bn-IN-BashkarNeural",
- "bn-IN-TanishaaNeural",
- "bs-BA-GoranNeural",
- "bs-BA-VesnaNeural",
- "bg-BG-BorislavNeural",
- "bg-BG-KalinaNeural",
- "my-MM-NilarNeural",
- "my-MM-ThihaNeural",
- "ca-ES-EnricNeural",
- "ca-ES-JoanaNeural",
- "zh-HK-HiuGaaiNeural",
- "zh-HK-HiuMaanNeural",
- "zh-HK-WanLungNeural",
- "zh-CN-XiaoxiaoNeural",
- "zh-CN-XiaoyiNeural",
- "zh-CN-YunjianNeural",
- "zh-CN-YunxiNeural",
- "zh-CN-YunxiaNeural",
- "zh-CN-YunyangNeural",
- "zh-CN-liaoning-XiaobeiNeural",
- "zh-TW-HsiaoChenNeural",
- "zh-TW-YunJheNeural",
- "zh-TW-HsiaoYuNeural",
- "zh-CN-shaanxi-XiaoniNeural",
- "hr-HR-GabrijelaNeural",
- "hr-HR-SreckoNeural",
- "cs-CZ-AntoninNeural",
- "cs-CZ-VlastaNeural",
- "da-DK-ChristelNeural",
- "da-DK-JeppeNeural",
- "nl-BE-ArnaudNeural",
- "nl-BE-DenaNeural",
- "nl-NL-ColetteNeural",
- "nl-NL-FennaNeural",
- "nl-NL-MaartenNeural",
- "en-AU-NatashaNeural",
- "en-AU-WilliamNeural",
- "en-CA-ClaraNeural",
- "en-CA-LiamNeural",
- "en-HK-SamNeural",
- "en-HK-YanNeural",
- "en-IN-NeerjaExpressiveNeural",
- "en-IN-NeerjaNeural",
- "en-IN-PrabhatNeural",
- "en-IE-ConnorNeural",
- "en-IE-EmilyNeural",
- "en-KE-AsiliaNeural",
- "en-KE-ChilembaNeural",
- "en-NZ-MitchellNeural",
- "en-NZ-MollyNeural",
- "en-NG-AbeoNeural",
- "en-NG-EzinneNeural",
- "en-PH-JamesNeural",
- "en-PH-RosaNeural",
- "en-SG-LunaNeural",
- "en-SG-WayneNeural",
- "en-ZA-LeahNeural",
- "en-ZA-LukeNeural",
- "en-TZ-ElimuNeural",
- "en-TZ-ImaniNeural",
- "en-GB-LibbyNeural",
- "en-GB-MaisieNeural",
- "en-GB-RyanNeural",
- "en-GB-SoniaNeural",
- "en-GB-ThomasNeural",
- "en-US-AvaMultilingualNeural",
- "en-US-AndrewMultilingualNeural",
- "en-US-EmmaMultilingualNeural",
- "en-US-BrianMultilingualNeural",
- "en-US-AvaNeural",
- "en-US-AndrewNeural",
- "en-US-EmmaNeural",
- "en-US-BrianNeural",
- "en-US-AnaNeural",
- "en-US-AriaNeural",
- "en-US-ChristopherNeural",
- "en-US-EricNeural",
- "en-US-GuyNeural",
- "en-US-JennyNeural",
- "en-US-MichelleNeural",
- "en-US-RogerNeural",
- "en-US-SteffanNeural",
- "et-EE-AnuNeural",
- "et-EE-KertNeural",
- "fil-PH-AngeloNeural",
- "fil-PH-BlessicaNeural",
- "fi-FI-HarriNeural",
- "fi-FI-NooraNeural",
- "fr-BE-CharlineNeural",
- "fr-BE-GerardNeural",
- "fr-CA-ThierryNeural",
- "fr-CA-AntoineNeural",
- "fr-CA-JeanNeural",
- "fr-CA-SylvieNeural",
- "fr-FR-VivienneMultilingualNeural",
- "fr-FR-RemyMultilingualNeural",
- "fr-FR-DeniseNeural",
- "fr-FR-EloiseNeural",
- "fr-FR-HenriNeural",
- "fr-CH-ArianeNeural",
- "fr-CH-FabriceNeural",
- "gl-ES-RoiNeural",
- "gl-ES-SabelaNeural",
- "ka-GE-EkaNeural",
- "ka-GE-GiorgiNeural",
- "de-AT-IngridNeural",
- "de-AT-JonasNeural",
- "de-DE-SeraphinaMultilingualNeural",
- "de-DE-FlorianMultilingualNeural",
- "de-DE-AmalaNeural",
- "de-DE-ConradNeural",
- "de-DE-KatjaNeural",
- "de-DE-KillianNeural",
- "de-CH-JanNeural",
- "de-CH-LeniNeural",
- "el-GR-AthinaNeural",
- "el-GR-NestorasNeural",
- "gu-IN-DhwaniNeural",
- "gu-IN-NiranjanNeural",
- "he-IL-AvriNeural",
- "he-IL-HilaNeural",
- "hi-IN-MadhurNeural",
- "hi-IN-SwaraNeural",
- "hu-HU-NoemiNeural",
- "hu-HU-TamasNeural",
- "is-IS-GudrunNeural",
- "is-IS-GunnarNeural",
- "id-ID-ArdiNeural",
- "id-ID-GadisNeural",
- "ga-IE-ColmNeural",
- "ga-IE-OrlaNeural",
- "it-IT-GiuseppeNeural",
- "it-IT-DiegoNeural",
- "it-IT-ElsaNeural",
- "it-IT-IsabellaNeural",
- "ja-JP-KeitaNeural",
- "ja-JP-NanamiNeural",
- "jv-ID-DimasNeural",
- "jv-ID-SitiNeural",
- "kn-IN-GaganNeural",
- "kn-IN-SapnaNeural",
- "kk-KZ-AigulNeural",
- "kk-KZ-DauletNeural",
- "km-KH-PisethNeural",
- "km-KH-SreymomNeural",
- "ko-KR-HyunsuNeural",
- "ko-KR-InJoonNeural",
- "ko-KR-SunHiNeural",
- "lo-LA-ChanthavongNeural",
- "lo-LA-KeomanyNeural",
- "lv-LV-EveritaNeural",
- "lv-LV-NilsNeural",
- "lt-LT-LeonasNeural",
- "lt-LT-OnaNeural",
- "mk-MK-AleksandarNeural",
- "mk-MK-MarijaNeural",
- "ms-MY-OsmanNeural",
- "ms-MY-YasminNeural",
- "ml-IN-MidhunNeural",
- "ml-IN-SobhanaNeural",
- "mt-MT-GraceNeural",
- "mt-MT-JosephNeural",
- "mr-IN-AarohiNeural",
- "mr-IN-ManoharNeural",
- "mn-MN-BataaNeural",
- "mn-MN-YesuiNeural",
- "ne-NP-HemkalaNeural",
- "ne-NP-SagarNeural",
- "nb-NO-FinnNeural",
- "nb-NO-PernilleNeural",
- "ps-AF-GulNawazNeural",
- "ps-AF-LatifaNeural",
- "fa-IR-DilaraNeural",
- "fa-IR-FaridNeural",
- "pl-PL-MarekNeural",
- "pl-PL-ZofiaNeural",
- "pt-BR-ThalitaNeural",
- "pt-BR-AntonioNeural",
- "pt-BR-FranciscaNeural",
- "pt-PT-DuarteNeural",
- "pt-PT-RaquelNeural",
- "ro-RO-AlinaNeural",
- "ro-RO-EmilNeural",
- "ru-RU-DmitryNeural",
- "ru-RU-SvetlanaNeural",
- "sr-RS-NicholasNeural",
- "sr-RS-SophieNeural",
- "si-LK-SameeraNeural",
- "si-LK-ThiliniNeural",
- "sk-SK-LukasNeural",
- "sk-SK-ViktoriaNeural",
- "sl-SI-PetraNeural",
- "sl-SI-RokNeural",
- "so-SO-MuuseNeural",
- "so-SO-UbaxNeural",
- "es-AR-ElenaNeural",
- "es-AR-TomasNeural",
- "es-BO-MarceloNeural",
- "es-BO-SofiaNeural",
- "es-CL-CatalinaNeural",
- "es-CL-LorenzoNeural",
- "es-ES-XimenaNeural",
- "es-CO-GonzaloNeural",
- "es-CO-SalomeNeural",
- "es-CR-JuanNeural",
- "es-CR-MariaNeural",
- "es-CU-BelkysNeural",
- "es-CU-ManuelNeural",
- "es-DO-EmilioNeural",
- "es-DO-RamonaNeural",
- "es-EC-AndreaNeural",
- "es-EC-LuisNeural",
- "es-SV-LorenaNeural",
- "es-SV-RodrigoNeural",
- "es-GQ-JavierNeural",
- "es-GQ-TeresaNeural",
- "es-GT-AndresNeural",
- "es-GT-MartaNeural",
- "es-HN-CarlosNeural",
- "es-HN-KarlaNeural",
- "es-MX-DaliaNeural",
- "es-MX-JorgeNeural",
- "es-NI-FedericoNeural",
- "es-NI-YolandaNeural",
- "es-PA-MargaritaNeural",
- "es-PA-RobertoNeural",
- "es-PY-MarioNeural",
- "es-PY-TaniaNeural",
- "es-PE-AlexNeural",
- "es-PE-CamilaNeural",
- "es-PR-KarinaNeural",
- "es-PR-VictorNeural",
- "es-ES-AlvaroNeural",
- "es-ES-ElviraNeural",
- "es-US-AlonsoNeural",
- "es-US-PalomaNeural",
- "es-UY-MateoNeural",
- "es-UY-ValentinaNeural",
- "es-VE-PaolaNeural",
- "es-VE-SebastianNeural",
- "su-ID-JajangNeural",
- "su-ID-TutiNeural",
- "sw-KE-RafikiNeural",
- "sw-KE-ZuriNeural",
- "sw-TZ-DaudiNeural",
- "sw-TZ-RehemaNeural",
- "sv-SE-MattiasNeural",
- "sv-SE-SofieNeural",
- "ta-IN-PallaviNeural",
- "ta-IN-ValluvarNeural",
- "ta-MY-KaniNeural",
- "ta-MY-SuryaNeural",
- "ta-SG-AnbuNeural",
- "ta-SG-VenbaNeural",
- "ta-LK-KumarNeural",
- "ta-LK-SaranyaNeural",
- "te-IN-MohanNeural",
- "te-IN-ShrutiNeural",
- "th-TH-NiwatNeural",
- "th-TH-PremwadeeNeural",
- "tr-TR-AhmetNeural",
- "tr-TR-EmelNeural",
- "uk-UA-OstapNeural",
- "uk-UA-PolinaNeural",
- "ur-IN-GulNeural",
- "ur-IN-SalmanNeural",
- "ur-PK-AsadNeural",
- "ur-PK-UzmaNeural",
- "uz-UZ-MadinaNeural",
- "uz-UZ-SardorNeural",
- "vi-VN-HoaiMyNeural",
- "vi-VN-NamMinhNeural",
- "cy-GB-AledNeural",
- "cy-GB-NiaNeural",
- "zu-ZA-ThandoNeural",
- "zu-ZA-ThembaNeural"
- ],
- "google_tts_voice": [
- "af",
- "am",
- "ar",
- "bg",
- "bn",
- "bs",
- "ca",
- "cs",
- "cy",
- "da",
- "de",
- "el",
- "en",
- "es",
- "et",
- "eu",
- "fi",
- "fr",
- "fr-CA",
- "gl",
- "gu",
- "ha",
- "hi",
- "hr",
- "hu",
- "id",
- "is",
- "it",
- "iw",
- "ja",
- "jw",
- "km",
- "kn",
- "ko",
- "la",
- "lt",
- "lv",
- "ml",
- "mr",
- "ms",
- "my",
- "ne",
- "nl",
- "no",
- "pa",
- "pl",
- "pt",
- "pt-PT",
- "ro",
- "ru",
- "si",
- "sk",
- "sq",
- "sr",
- "su",
- "sv",
- "sw",
- "ta",
- "te",
- "th",
- "tl",
- "tr",
- "uk",
- "ur",
- "vi",
- "yue",
- "zh-CN",
- "zh-TW",
- "zh"
- ],
- "fp16": false,
- "editing_tab": true,
- "inference_tab": true,
- "create_and_training_tab": true,
- "extra_tab": true,
- "separator_tab": true,
- "convert_tab": true,
- "convert_with_whisper": true,
- "tts_tab": true,
- "effects_tab": true,
- "quirk": true,
- "create_dataset_tab": true,
- "training_tab": true,
- "fushion_tab": true,
- "read_tab": true,
- "onnx_tab": true,
- "downloads_tab": true,
- "f0_extractor_tab": true,
- "settings_tab": true,
- "create_srt_tab": true,
- "realtime_tab": true,
- "realtime_client_tab": true,
- "create_reference_tab": true,
- "font": "https://fonts.googleapis.com/css2?family=Roboto&display=swap",
- "app_port": 7860,
- "tensorboard_port": 6870,
- "num_of_restart": 5,
- "server_name": "0.0.0.0",
- "app_show_error": true,
- "delete_exists_file": false,
- "audio_effects_path": "main/inference/audio_effects.py",
- "convert_path": "main/inference/conversion/convert.py",
- "separate_path": "main/inference/separate_music.py",
- "create_dataset_path": "main/inference/create_dataset.py",
- "preprocess_path": "main/inference/preprocess/preprocess.py",
- "extract_path": "main/inference/extracting/extract.py",
- "create_index_path": "main/inference/create_index.py",
- "train_path": "main/inference/training/train.py",
- "create_reference_path": "main/inference/create_reference.py",
- "ico_path": "assets/ico.png",
- "csv_path": "assets/spreadsheet.csv",
- "weights_path": "assets/weights",
- "logs_path": "assets/logs",
- "binary_path": "assets/binary",
- "f0_path": "assets/f0",
- "language_path": "assets/languages",
- "presets_path": "assets/presets",
- "embedders_path": "assets/models/embedders",
- "predictors_path": "assets/models/predictors",
- "pretrained_custom_path": "assets/models/pretrained_custom",
- "pretrained_v1_path": "assets/models/pretrained_v1",
- "pretrained_v2_path": "assets/models/pretrained_v2",
- "speaker_diarization_path": "assets/models/speaker_diarization",
- "uvr5_path": "assets/models/uvr5",
- "audios_path": "audios",
- "reference_path": "assets/logs/reference",
- "demucs_segments_enable": true,
- "demucs_cpu_mode": false,
- "limit_f0": 8,
- "debug_mode": false,
- "pretrain_verify_shape": true,
- "pretrain_strict": true,
- "cpu_mode": false,
- "brain": false,
- "discord_presence": true
-}
\ No newline at end of file
diff --git a/main/configs/config.py b/main/configs/config.py
deleted file mode 100644
index 7f260693c90ee6eeca6cf740a2fc2ad9f1d03476..0000000000000000000000000000000000000000
--- a/main/configs/config.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import os
-import sys
-import json
-import torch
-import onnxruntime
-
-sys.path.append(os.getcwd())
-
-from main.library.backends import directml, opencl, zluda
-
-version_config_paths = [os.path.join(version, size) for version in ["v1", "v2"] for size in ["32000.json", "40000.json", "48000.json"]]
-
-def singleton(cls):
- instances = {}
-
- def get_instance(*args, **kwargs):
- if cls not in instances: instances[cls] = cls(*args, **kwargs)
- return instances[cls]
-
- return get_instance
-
-@singleton
-class Config:
- def __init__(self):
- self.configs_path = os.path.join("main", "configs", "config.json")
- self.configs = json.load(open(self.configs_path, "r"))
-
- self.cpu_mode = self.configs.get("cpu_mode", False)
- self.brain = self.configs.get("brain", False)
- self.debug_mode = self.configs.get("debug_mode", False)
-
- self.json_config = self.load_config_json()
- self.translations = self.multi_language()
-
- self.gpu_mem = None
- self.per_preprocess = 3.7
- self.device = self.get_default_device()
- self.providers = self.get_providers()
- self.is_half = self.is_fp16()
- self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
-
- def multi_language(self):
- try:
- lang = self.configs.get("language", "vi-VN")
- if len([l for l in os.listdir(self.configs["language_path"]) if l.endswith(".json")]) < 1: raise FileNotFoundError("Không tìm thấy bất cứ gói ngôn ngữ nào(No package languages found)")
-
- if not lang: lang = "vi-VN"
- if lang not in self.configs["support_language"]: raise ValueError("Ngôn ngữ không được hỗ trợ(Language not supported)")
-
- lang_path = os.path.join(self.configs["language_path"], f"{lang}.json")
- if not os.path.exists(lang_path): lang_path = os.path.join(self.configs["language_path"], "vi-VN.json")
-
- with open(lang_path, encoding="utf-8") as f:
- translations = json.load(f)
- except json.JSONDecodeError:
- print(self.translations["empty_json"].format(file=lang))
- pass
-
- return translations
-
- def is_fp16(self):
- fp16 = self.configs.get("fp16", False)
-
- if self.device in ["cpu", "mps"] and fp16:
- self.configs["fp16"] = False
- fp16 = False
-
- with open(self.configs_path, "w") as f:
- json.dump(self.configs, f, indent=4)
-
- if not fp16: self.per_preprocess = 3.0
- return fp16
-
- def load_config_json(self):
- configs = {}
-
- for config_file in version_config_paths:
- try:
- with open(os.path.join("main", "configs", config_file), "r") as f:
- configs[config_file] = json.load(f)
- except json.JSONDecodeError:
- print(self.translations["empty_json"].format(file=config_file))
- pass
-
- return configs
-
- def device_config(self):
- if self.gpu_mem is not None and self.gpu_mem <= 4:
- self.per_preprocess = 3.0
- return 1, 5, 30, 32
-
- return (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
-
- def get_default_device(self):
- if not self.cpu_mode:
- if torch.cuda.is_available():
- device = "cuda:0"
- self.gpu_mem = torch.cuda.get_device_properties(int(device.split(":")[-1])).total_memory // (1024**3)
- elif directml.is_available():
- device = "privateuseone:0"
- elif opencl.is_available():
- device = "ocl:0"
- elif torch.backends.mps.is_available():
- device = "mps"
- else:
- device = "cpu"
- else:
- torch.cuda.is_available = lambda : False
- directml.is_available = lambda : False
- opencl.is_available = lambda : False
- torch.backends.mps.is_available = lambda : False
-
- device = "cpu"
-
- return device
-
- def get_providers(self):
- ort_providers = onnxruntime.get_available_providers()
-
- if "CUDAExecutionProvider" in ort_providers and self.device.startswith("cuda"):
- providers = ["CUDAExecutionProvider"]
- elif "ROCMExecutionProvider" in ort_providers and self.device.startswith("cuda"):
- providers = ["ROCMExecutionProvider"]
- elif "DmlExecutionProvider" in ort_providers and self.device.startswith(("ocl", "privateuseone")):
- providers = ["DmlExecutionProvider"]
- elif "CoreMLExecutionProvider" in ort_providers and self.device.startswith("mps"):
- providers = ["CoreMLExecutionProvider"]
- else:
- providers = ["CPUExecutionProvider"]
-
- return providers
\ No newline at end of file
diff --git a/main/configs/rpc.py b/main/configs/rpc.py
deleted file mode 100644
index f3bded39740d264f2a4731b247009d096400a7ee..0000000000000000000000000000000000000000
--- a/main/configs/rpc.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import os
-import sys
-import json
-import time
-import struct
-import codecs
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations
-
-CLIENT_ID = "1392816674159202396"
-
-def create_payload(opcode, payload):
- data = json.dumps(payload).encode("utf-8")
-
- return struct.pack(
- "= 1:
- b, a = butter(4, frequency / (0.5 * sample_rate), btype='low')
- boosted = _filtfilt(b, a, audio)
- return boosted * (10 ** (gain_db / 20))
- return audio
-
- def treble_boost(audio, gain_db, frequency, sample_rate):
- if gain_db >= 1:
- b, a = butter(4, frequency / (0.5 * sample_rate), btype='high')
- boosted = _filtfilt(b, a, audio)
- return boosted * (10 ** (gain_db / 20))
- return audio
-
- def fade_out_effect(audio, sr, duration=3.0):
- length = int(duration * sr)
- end = audio.shape[0]
- if length > end: length = end
- start = end - length
- audio[start:end] = audio[start:end] * np.linspace(1.0, 0.0, length)
- return audio
-
- def fade_in_effect(audio, sr, duration=3.0):
- length = int(duration * sr)
- start = 0
- if length > audio.shape[0]: length = audio.shape[0]
- end = length
- audio[start:end] = audio[start:end] * np.linspace(0.0, 1.0, length)
- return audio
-
- if not input_path or not os.path.exists(input_path):
- logger.warning(translations["input_not_valid"])
- sys.exit(1)
-
- if not output_path:
- logger.warning(translations["output_not_valid"])
- sys.exit(1)
-
- if os.path.exists(output_path): os.remove(output_path)
-
- try:
- input_path = input_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
- try:
- audio, sample_rate = sf.read(input_path, dtype=np.float32)
- except:
- audio, sample_rate = librosa.load(input_path, sr=None)
- except Exception as e:
- logger.debug(f"{translations['errors_loading_audio']}: {e}")
- raise RuntimeError(f"{translations['errors_loading_audio']}: {e}")
-
- try:
- board = Pedalboard([HighpassFilter()])
-
- if chorus: board.append(Chorus(depth=chorus_depth, rate_hz=chorus_rate, mix=chorus_mix, centre_delay_ms=chorus_delay, feedback=chorus_feedback))
- if distortion: board.append(Distortion(drive_db=distortion_drive))
- if reverb: board.append(Reverb(room_size=reverb_room_size, damping=reverb_damping, wet_level=reverb_wet_level, dry_level=reverb_dry_level, width=reverb_width, freeze_mode=int(reverb_freeze_mode)))
- if pitchshift: board.append(PitchShift(semitones=pitch_shift))
- if delay: board.append(Delay(delay_seconds=delay_seconds, feedback=delay_feedback, mix=delay_mix))
- if compressor: board.append(Compressor(threshold_db=compressor_threshold, ratio=compressor_ratio, attack_ms=compressor_attack_ms, release_ms=compressor_release_ms))
- if limiter: board.append(Limiter(threshold_db=limiter_threshold, release_ms=limiter_release))
- if gain: board.append(Gain(gain_db=gain_db))
- if bitcrush: board.append(Bitcrush(bit_depth=bitcrush_bit_depth))
- if clipping: board.append(Clipping(threshold_db=clipping_threshold))
- if phaser: board.append(Phaser(rate_hz=phaser_rate_hz, depth=phaser_depth, centre_frequency_hz=phaser_centre_frequency_hz, feedback=phaser_feedback, mix=phaser_mix))
-
- processed_audio = board(audio, sample_rate)
-
- if treble_bass_boost:
- processed_audio = bass_boost(processed_audio, bass_boost_db, bass_boost_frequency, sample_rate)
- processed_audio = treble_boost(processed_audio, treble_boost_db, treble_boost_frequency, sample_rate)
-
- if fade_in_out:
- processed_audio = fade_in_effect(processed_audio, sample_rate, fade_in_duration)
- processed_audio = fade_out_effect(processed_audio, sample_rate, fade_out_duration)
-
- if resample and resample_sr != sample_rate and resample_sr > 0:
- processed_audio = librosa.resample(processed_audio, orig_sr=sample_rate, target_sr=resample_sr, res_type="soxr_vhq")
- sample_rate = resample_sr
-
- sf.write(replace_export_format(output_path, export_format), processed_audio, sample_rate, format=export_format)
- if audio_combination: pydub_load(audio_combination_input, combination_volume).overlay(pydub_load(replace_export_format(output_path, export_format), main_volume)).export(replace_export_format(output_path, export_format), format=export_format)
- except Exception as e:
- import traceback
- logger.debug(traceback.format_exc())
- raise RuntimeError(translations["apply_error"].format(e=e))
- return output_path
-
-def main():
- args = parse_arguments()
- process_audio(input_path=args.input_path, output_path=args.output_path, resample=args.resample, resample_sr=args.resample_sr, chorus_depth=args.chorus_depth, chorus_rate=args.chorus_rate, chorus_mix=args.chorus_mix, chorus_delay=args.chorus_delay, chorus_feedback=args.chorus_feedback, distortion_drive=args.drive_db, reverb_room_size=args.reverb_room_size, reverb_damping=args.reverb_damping, reverb_wet_level=args.reverb_wet_level, reverb_dry_level=args.reverb_dry_level, reverb_width=args.reverb_width, reverb_freeze_mode=args.reverb_freeze_mode, pitch_shift=args.pitch_shift, delay_seconds=args.delay_seconds, delay_feedback=args.delay_feedback, delay_mix=args.delay_mix, compressor_threshold=args.compressor_threshold, compressor_ratio=args.compressor_ratio, compressor_attack_ms=args.compressor_attack_ms, compressor_release_ms=args.compressor_release_ms, limiter_threshold=args.limiter_threshold, limiter_release=args.limiter_release, gain_db=args.gain_db, bitcrush_bit_depth=args.bitcrush_bit_depth, clipping_threshold=args.clipping_threshold, phaser_rate_hz=args.phaser_rate_hz, phaser_depth=args.phaser_depth, phaser_centre_frequency_hz=args.phaser_centre_frequency_hz, phaser_feedback=args.phaser_feedback, phaser_mix=args.phaser_mix, bass_boost_db=args.bass_boost_db, bass_boost_frequency=args.bass_boost_frequency, treble_boost_db=args.treble_boost_db, treble_boost_frequency=args.treble_boost_frequency, fade_in_duration=args.fade_in_duration, fade_out_duration=args.fade_out_duration, export_format=args.export_format, chorus=args.chorus, distortion=args.distortion, reverb=args.reverb, pitchshift=args.pitchshift, delay=args.delay, compressor=args.compressor, limiter=args.limiter, gain=args.gain, bitcrush=args.bitcrush, clipping=args.clipping, phaser=args.phaser, treble_bass_boost=args.treble_bass_boost, fade_in_out=args.fade_in_out, audio_combination=args.audio_combination, audio_combination_input=args.audio_combination_input, main_volume=args.main_volume, 
combination_volume=args.combination_volume)
-
-if __name__ == "__main__": main()
\ No newline at end of file
diff --git a/main/inference/conversion/audio_processing.py b/main/inference/conversion/audio_processing.py
deleted file mode 100644
index 54e75126d5244edbc6402338a21dda96446f8e4d..0000000000000000000000000000000000000000
--- a/main/inference/conversion/audio_processing.py
+++ /dev/null
@@ -1,281 +0,0 @@
-import os
-import sys
-import torch
-import librosa
-
-import numpy as np
-import scipy.signal as signal
-
-sys.path.append(os.getcwd())
-
-stft = None
-
-def rms(x, eps=1e-9):
- return np.sqrt(np.mean(x ** 2) + eps)
-
-def soft_limiter(x, threshold=0.98):
- return np.tanh(x / threshold) * threshold
-
-def normalize_audio(x, target_rms=0.1):
- cur = rms(x)
-
- if cur <= 0: return x
- return x * (target_rms / cur)
-
-def compute_mfcc(x, sr, n_mfcc=20, n_fft=1024, hop_length=160):
- mfcc = librosa.feature.mfcc(
- y=x,
- sr=sr,
- n_mfcc=n_mfcc,
- n_fft=n_fft,
- hop_length=hop_length
- )
-
- return mfcc
-
-def mix_mfcc_exciter(audio, sr, strength=0.08, n_mfcc=20, n_mels=128):
- mfcc = compute_mfcc(audio, sr, n_mfcc)
-
- try:
- exc = librosa.feature.inverse.mfcc_to_audio(mfcc, sr=sr, n_mels=n_mels)
- except Exception:
- mel_spec = librosa.feature.inverse.mfcc_to_mel(mfcc)
- exc = librosa.feature.inverse.mel_to_audio(mel_spec, sr=sr)
-
- if exc.shape[0] < audio.shape[0]:
- exc = np.pad(exc, (0, audio.shape[0] - exc.shape[0]))
- else:
- exc = exc[: audio.shape[0]]
-
- b, a = signal.butter(2, 300 / (sr / 2), btype="high")
- exc = signal.lfilter(b, a, exc)
-
- exc = exc / (rms(exc) + 1e-9) * (rms(audio) + 1e-9)
- return audio + strength * exc
-
-def automatic_multiband_eq(audio, sr, n_bands=6, target_slope=0.0, n_fft=1024, hop_length=160):
- S = np.abs(librosa.stft(audio.astype(np.float32), n_fft=n_fft, hop_length=hop_length))
- mean_spec = np.mean(S, axis=1)
- freqs = np.linspace(0, sr // 2, mean_spec.shape[0])
-
- band_edges = np.geomspace(100, sr / 2, n_bands + 1)
- gains_db = np.zeros(n_bands)
-
- for i in range(n_bands):
- idx = np.where((freqs >= band_edges[i]) & (freqs < band_edges[i + 1]))[0]
- if idx.size == 0:
- gains_db[i] = 0.0
- continue
-
- band_power_db = 20 * np.log10(np.mean(mean_spec[idx]) + 1e-9)
- median_db = np.median(20 * np.log10(mean_spec + 1e-9))
- gains_db[i] = median_db - band_power_db
-
- gains_db = signal.medfilt(gains_db, kernel_size=3)
- gains_db = gains_db + np.linspace(-target_slope, target_slope, n_bands)
- gains = 10 ** (gains_db / 20.0)
-
- out = np.zeros_like(audio)
- for i in range(n_bands):
- low = band_edges[i]
- high = band_edges[i + 1]
-
- if low <= 0:
- b, a = signal.butter(2, high / (sr / 2), btype="low")
- elif high >= sr / 2:
- b, a = signal.butter(2, low / (sr / 2), btype="high")
- else:
- b, a = signal.butter(2, [low / (sr / 2), high / (sr / 2)], btype="band")
-
- band = signal.lfilter(b, a, audio)
- out += gains[i] * band
-
- out = out / (rms(out) + 1e-9) * (rms(audio) + 1e-9)
- return 0.85 * audio + 0.15 * out
-
-def apply_multiband_eq(audio, sr, bands):
- out = np.zeros_like(audio)
-
- for low, high, gain_db in bands:
- gain = 10 ** (gain_db / 20.0)
-
- if low <= 0: b, a = signal.butter(2, high / (sr / 2), btype="low")
- elif high >= sr / 2: b, a = signal.butter(2, low / (sr / 2), btype="high")
- else: b, a = signal.butter(2, [low / (sr / 2), high / (sr / 2)], btype="band")
-
- band = signal.lfilter(b, a, audio)
- out += gain * band
-
- return out
-
-def best_multiband_eq(audio, sr, original_audio=None, sr_ref=16000, n_bands=6, target_slope=0.0, n_fft=1024, hop_length=160, strength=0.15):
- if original_audio is not None:
- mf_out = compute_mfcc(audio, sr)
- mf_ref = compute_mfcc(original_audio.astype(np.float32), sr_ref)
-
- out_mean = np.mean(mf_out, axis=1)
- ref_mean = np.mean(mf_ref, axis=1)
- diff = ref_mean - out_mean
-
- low_val = diff[:3].mean()
- mid_val = diff[3:6].mean()
- upper_val = diff[6:9].mean()
- high_val = diff[9:13].mean()
-
- bands = [
- (0, 300, np.clip(low_val * 0.6, -6.0, 6.0)),
- (300, 800, np.clip(mid_val * 0.5, -6.0, 6.0)),
- (800, 2000, np.clip(upper_val * 0.6, -6.0, 6.0)),
- (2000, int(sr / 2 - 1000), np.clip(high_val * 0.6, -6.0, 6.0)),
- ]
- eq_audio = apply_multiband_eq(audio, sr, bands)
- else:
- fft = np.abs(librosa.stft(audio.astype(np.float32), n_fft=n_fft, hop_length=hop_length))
- mean_spec = np.mean(fft, axis=1)
- freqs = np.linspace(0, sr // 2, mean_spec.shape[0])
-
- band_edges = np.geomspace(100, sr / 2, n_bands + 1)
- gains_db = np.zeros(n_bands)
-
- for i in range(n_bands):
- idx = np.where((freqs >= band_edges[i]) & (freqs < band_edges[i + 1]))[0]
- if idx.size == 0: continue
-
- band_power_db = 20 * np.log10(np.mean(mean_spec[idx]) + 1e-9)
- median_db = np.median(20 * np.log10(mean_spec + 1e-9))
- gains_db[i] = median_db - band_power_db
-
- gains_db = signal.medfilt(gains_db, kernel_size=3)
- gains_db += np.linspace(-target_slope, target_slope, n_bands)
- gains_db = np.clip(gains_db, -6.0, 6.0)
-
- bands = [(band_edges[i], band_edges[i+1], gains_db[i]) for i in range(n_bands)]
- eq_audio = apply_multiband_eq(audio, sr, bands)
-
- out = (1 - strength) * audio + strength * eq_audio
- out = out / (rms(out) + 1e-9) * (rms(audio) + 1e-9)
-
- mx = np.max(np.abs(out)) + 1e-9
- if mx > 0.99: out /= mx * 0.99
-
- return out
-
-def spectral_subtract_denoise(audio, sr, noise_seconds=0.4, alpha=1.0, n_fft=1024, hop_length=160, device="cpu"):
- global stft
-
- if stft is None and device.startswith(("ocl", "privateuseone")):
- from main.library.backends.utils import STFT
- stft = STFT(filter_length=n_fft, hop_length=hop_length, win_length=None, window="hann").to(device)
- else: stft = None
-
- x = torch.from_numpy(audio.astype(np.float32)).float().unsqueeze(0).to(device)
- window = torch.hann_window(n_fft).to(device)
-
- if stft is None:
- fft = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=n_fft, window=window, return_complex=True)
- mag, phase = (fft.real.pow(2) + fft.imag.pow(2)).sqrt(), fft.imag.data.atan2(fft.real.data)
- else:
- mag, phase = stft.transform(x, eps=1e-9, return_phase=True)
-
- noise_mag = mag[:, :, :max(1, min(int((noise_seconds * sr - n_fft) // hop_length) + 1, mag.shape[-1]))].mean(dim=-1, keepdim=True)
- clean_mag = (mag - alpha * noise_mag).maximum((noise_mag * 1.0) * 0.1)
-
- xrec = torch.istft(clean_mag * (1j * phase).exp(), n_fft=n_fft, hop_length=hop_length, win_length=n_fft, window=window, length=x.shape[0]) if stft is None else stft.inverse(clean_mag, phase)
- return xrec.squeeze(0).cpu().numpy()
-
-def repair_bad_frames(audio, sr, frame_ms=20, energy_thresh=0.02):
- frame_len = int(sr * frame_ms / 1000)
- hop = frame_len // 2
-
- n_frames = 1 + max(0, (len(audio) - frame_len) // hop)
- frames = np.stack([audio[i * hop : i * hop + frame_len] for i in range(n_frames)])
-
- energies = np.sqrt(np.mean(frames ** 2, axis=1))
- median_e = np.median(energies)
- bad = energies < (energy_thresh * median_e)
-
- if not np.any(bad): return audio
- out = audio.copy()
-
- for i, is_bad in enumerate(bad):
- if not is_bad: continue
-
- start = i * hop
- end = start + frame_len
-
- left = out[max(0, start - frame_len) : start]
- right = out[end : min(len(out), end + frame_len)]
-
- if left.size > 0 and right.size > 0: out[start:end] = 0.5 * (np.mean(left) + np.mean(right))
- elif left.size > 0: out[start:end] = left[-1]
- elif right.size > 0: out[start:end] = right[0]
- else: out[start:end] = 0.0
-
- return out
-
-def harmonic_enrich_and_compress(audio, drive=0.02, comp_ratio=3.0, frame_length=1024, hop_length=160):
- exc = np.abs(audio)
- exc -= np.mean(exc)
- audio2 = audio + drive * exc
-
- env_rms = librosa.feature.rms(y=audio2.astype(np.float32), frame_length=frame_length, hop_length=hop_length)[0]
- frame_times = np.linspace(0, len(audio2), num=len(env_rms))
- env_s = np.interp(np.arange(len(audio2)), frame_times, env_rms)
-
- threshold = np.median(env_s) * 1.2
- gain = 1.0 / (1.0 + ((env_s / (threshold + 1e-9)) ** (comp_ratio - 1)))
- out = audio2 * gain
-
- return out
-
-def fade_in_out(audio, sr, fade_ms=10):
- n = len(audio)
-
- fade_len = int(sr * fade_ms / 1000)
- if fade_len <= 0: return audio
-
- win = np.ones(n)
- fade_in = np.linspace(0.0, 1.0, fade_len)
- fade_out = np.linspace(1.0, 0.0, fade_len)
-
- win[:fade_len] = fade_in
- win[-fade_len:] = fade_out
-
- return audio * win
-
-def preprocess(audio, sr=16000, target_rms=0.8, device="cpu"):
- x = normalize_audio(audio.astype(np.float32), target_rms=target_rms)
- x -= np.mean(x)
-
- x = spectral_subtract_denoise(x, sr, device=device)
- x = repair_bad_frames(x, sr)
-
- x = automatic_multiband_eq(x, sr)
- x = mix_mfcc_exciter(x, sr, strength=0.06)
-
- x = harmonic_enrich_and_compress(x, drive=0.015, comp_ratio=2.5)
- x = soft_limiter(x, threshold=0.98)
-
- x = fade_in_out(x, sr, fade_ms=8)
- x /= (np.max(np.abs(x)) + 1e-9) * 0.99
-
- return x.astype(np.float32)
-
-def postprocess(audio, sr=48000, original_audio=None, sr_ref=16000, device="cpu"):
- x = audio.astype(np.float32)
- x = x - np.mean(x)
-
- x = fade_in_out(x, sr, fade_ms=6)
- x = spectral_subtract_denoise(x, sr, noise_seconds=0.25, device=device)
-
- x = best_multiband_eq(x, sr, original_audio=original_audio, sr_ref=sr_ref, n_bands=6, target_slope=0.02, strength=0.15)
- x = soft_limiter(x, threshold=0.995)
-
- cutoff = min(20000, sr / 2 - 100)
- Wn = cutoff / (sr / 2)
-
- b, a = signal.butter(2, Wn, btype="low")
- x = signal.filtfilt(b, a, x)
-
- x /= (np.max(np.abs(x)) + 1e-9) * 0.99
- return x.astype(np.float32)
\ No newline at end of file
diff --git a/main/inference/conversion/convert.py b/main/inference/conversion/convert.py
deleted file mode 100644
index 786e9c9e876ce007177046fe03963583560d48ff..0000000000000000000000000000000000000000
--- a/main/inference/conversion/convert.py
+++ /dev/null
@@ -1,377 +0,0 @@
-import os
-import sys
-import time
-import torch
-import librosa
-import logging
-import argparse
-import warnings
-
-import numpy as np
-import soundfile as sf
-
-from tqdm import tqdm
-from distutils.util import strtobool
-
-warnings.filterwarnings("ignore")
-sys.path.append(os.getcwd())
-
-from main.app.core.ui import replace_export_format
-from main.inference.conversion.pipeline import Pipeline
-from main.app.variables import config, logger, translations
-from main.inference.conversion.audio_processing import preprocess, postprocess
-from main.library.utils import check_assets, load_audio, load_embedders_model, cut, restore, clear_gpu_cache, load_model
-
# Only errors from these third-party loggers are allowed through.
for l in ("torch", "faiss", "omegaconf", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "transformers", "matplotlib"):
    logging.getLogger(l).setLevel(logging.ERROR)
-
def parse_arguments():
    """Build the command-line parser for the conversion script and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding every conversion option.
    """
    # Boolean options are passed as text and converted through strtobool.
    as_bool = lambda x: bool(strtobool(x))

    # (flag, add_argument keyword arguments), in registration order.
    options = (
        ("--pitch", dict(type=int, default=0)),
        ("--filter_radius", dict(type=int, default=3)),
        ("--index_rate", dict(type=float, default=0.5)),
        ("--rms_mix_rate", dict(type=float, default=1)),
        ("--protect", dict(type=float, default=0.33)),
        ("--hop_length", dict(type=int, default=64)),
        ("--f0_method", dict(type=str, default="rmvpe")),
        ("--embedder_model", dict(type=str, default="hubert_base")),
        ("--input_path", dict(type=str, required=True)),
        ("--output_path", dict(type=str, default="./audios/output.wav")),
        ("--export_format", dict(type=str, default="wav")),
        ("--pth_path", dict(type=str, required=True)),
        ("--index_path", dict(type=str, default="")),
        ("--f0_autotune", dict(type=as_bool, default=False)),
        ("--f0_autotune_strength", dict(type=float, default=1)),
        ("--clean_audio", dict(type=as_bool, default=False)),
        ("--clean_strength", dict(type=float, default=0.7)),
        ("--resample_sr", dict(type=int, default=0)),
        ("--split_audio", dict(type=as_bool, default=False)),
        ("--checkpointing", dict(type=as_bool, default=False)),
        ("--f0_file", dict(type=str, default="")),
        ("--f0_onnx", dict(type=as_bool, default=False)),
        ("--embedders_mode", dict(type=str, default="fairseq")),
        ("--formant_shifting", dict(type=as_bool, default=False)),
        ("--formant_qfrency", dict(type=float, default=0.8)),
        ("--formant_timbre", dict(type=float, default=0.8)),
        ("--proposal_pitch", dict(type=as_bool, default=False)),
        ("--proposal_pitch_threshold", dict(type=float, default=255.0)),
        ("--audio_processing", dict(type=as_bool, default=False)),
        ("--alpha", dict(type=float, default=0.5)),
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--convert", action='store_true')
    for flag, spec in options:
        parser.add_argument(flag, **spec)

    return parser.parse_args()
-
def main():
    """Command-line entry point: parse the options and run the conversion."""
    options = dict(vars(parse_arguments()))

    # Every parsed option maps onto a run_convert_script keyword of the same
    # name, except the --convert switch, which it does not take.
    options.pop("convert", None)

    run_convert_script(**options)
-
def run_convert_script(
    pitch=0,
    filter_radius=3,
    index_rate=0.5,
    rms_mix_rate=1,
    protect=0.5,
    hop_length=64,
    f0_method="rmvpe",
    input_path=None,
    output_path="./output.wav",
    pth_path=None,
    index_path=None,
    f0_autotune=False,
    f0_autotune_strength=1,
    clean_audio=False,
    clean_strength=0.7,
    export_format="wav",
    embedder_model="hubert_base",
    resample_sr=0,
    split_audio=False,
    checkpointing=False,
    f0_file=None,
    f0_onnx=False,
    embedders_mode="fairseq",
    formant_shifting=False,
    formant_qfrency=0.8,
    formant_timbre=0.8,
    proposal_pitch=False,
    proposal_pitch_threshold=255.0,
    audio_processing=False,
    alpha=0.5
):
    """Convert a single audio file, or every audio file in a directory.

    When ``input_path`` is a directory, each audio file inside it is converted
    to ``<name>_output.<export_format>`` in that same directory; otherwise the
    single file is converted to ``output_path``. The process id is written to
    ``assets/convert_pid.txt`` for the duration of the run.

    All keyword arguments other than ``input_path``, ``output_path`` and
    ``pth_path`` are forwarded unchanged to ``VoiceConverter.convert_audio``.

    Exits the process with status 1 when the model file is missing or invalid,
    or when no input audio can be found.
    """
    check_assets(f0_method, embedder_model, f0_onnx=f0_onnx, embedders_mode=embedders_mode)
    log_data = {
        translations['pitch']: pitch,
        translations['filter_radius']: filter_radius,
        translations['index_strength']: index_rate,
        translations['rms_mix_rate']: rms_mix_rate,
        translations['protect']: protect,
        translations['hop_length']: hop_length,
        translations['f0_method']: f0_method,
        translations['audio_path']: input_path,
        translations['output_path']: replace_export_format(output_path, export_format),
        translations['model_path']: pth_path,
        translations['indexpath']: index_path,
        translations['autotune']: f0_autotune,
        translations['clear_audio']: clean_audio,
        translations['export_format']: export_format,
        translations['hubert_model']: embedder_model,
        translations['split_audio']: split_audio,
        translations['memory_efficient_training']: checkpointing,
        translations["f0_onnx_mode"]: f0_onnx,
        translations["embed_mode"]: embedders_mode,
        translations["proposal_pitch"]: proposal_pitch,
        translations["audio_processing"]: audio_processing,
        translations["alpha_label"]: alpha
    }

    if clean_audio: log_data[translations['clean_strength']] = clean_strength
    if resample_sr != 0: log_data[translations['sample_rate']] = resample_sr
    if f0_autotune: log_data[translations['autotune_rate_info']] = f0_autotune_strength
    # f0_file defaults to None, which os.path.isfile() rejects with TypeError.
    if f0_file and os.path.isfile(f0_file): log_data[translations['f0_file']] = f0_file
    if proposal_pitch: log_data[translations["proposal_pitch_threshold"]] = proposal_pitch_threshold
    if formant_shifting:
        log_data[translations['formant_qfrency']] = formant_qfrency
        log_data[translations['formant_timbre']] = formant_timbre

    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith((".pth", ".onnx")):
        logger.warning(translations["provide_file"].format(filename=translations["model"]))
        sys.exit(1)

    cvt = VoiceConverter(pth_path, 0)
    start_time = time.time()

    pid_path = os.path.join("assets", "convert_pid.txt")
    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    def convert_audio(audio_path, output_audio):
        # Forward every option of this run; only the paths differ per file.
        cvt.convert_audio(
            pitch=pitch,
            filter_radius=filter_radius,
            index_rate=index_rate,
            rms_mix_rate=rms_mix_rate,
            protect=protect,
            hop_length=hop_length,
            f0_method=f0_method,
            audio_input_path=audio_path,
            audio_output_path=output_audio,
            index_path=index_path,
            f0_autotune=f0_autotune,
            f0_autotune_strength=f0_autotune_strength,
            clean_audio=clean_audio,
            clean_strength=clean_strength,
            export_format=export_format,
            embedder_model=embedder_model,
            resample_sr=resample_sr,
            checkpointing=checkpointing,
            f0_file=f0_file, f0_onnx=f0_onnx,
            embedders_mode=embedders_mode,
            formant_shifting=formant_shifting,
            formant_qfrency=formant_qfrency,
            formant_timbre=formant_timbre,
            split_audio=split_audio,
            proposal_pitch=proposal_pitch,
            proposal_pitch_threshold=proposal_pitch_threshold,
            audio_processing=audio_processing,
            alpha=alpha
        )

    try:
        if input_path and os.path.isdir(input_path):
            logger.info(translations["convert_batch"])

            # Match real extensions only: without the dot, any file whose name
            # merely ends in e.g. "wav" would be picked up.
            audio_exts = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")
            audio_files = [f for f in os.listdir(input_path) if f.lower().endswith(audio_exts)]

            if not audio_files:
                logger.warning(translations["not_found_audio"])
                sys.exit(1)

            logger.info(translations["found_audio"].format(audio_files=len(audio_files)))

            for audio in audio_files:
                audio_path = os.path.join(input_path, audio)
                output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")

                logger.info(f"{translations['convert_audio']} '{audio_path}'...")
                if os.path.exists(output_audio): os.remove(output_audio)

                convert_audio(audio_path, output_audio)

            logger.info(translations["convert_batch_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}", output_path=replace_export_format(output_path, export_format)))
        else:
            # input_path defaults to None; treat that like a missing file
            # instead of letting os.path raise TypeError.
            if not input_path or not os.path.exists(input_path):
                logger.warning(translations["not_found_audio"])
                sys.exit(1)

            logger.info(f"{translations['convert_audio']} '{input_path}'...")
            if os.path.exists(output_path): os.remove(output_path)

            convert_audio(input_path, output_path)
            logger.info(translations["convert_audio_success"].format(input_path=input_path, elapsed_time=f"{(time.time() - start_time):.2f}", output_path=replace_export_format(output_path, export_format)))
    finally:
        # Runs on sys.exit() and on errors too, so no stale PID file is left.
        if os.path.exists(pid_path): os.remove(pid_path)
-
class VoiceConverter:
    """Loads a voice model once and converts audio files with it.

    The model checkpoint is loaded in ``__init__`` (through ``get_vc``); the
    embedder model and the optional noise gate are created lazily on the first
    ``convert_audio`` call and reused afterwards.
    """

    def __init__(self, model_path, sid = 0):
        self.config = config
        self.device = config.device
        self.hubert_model = None
        self.tgt_sr = None
        self.net_g = None
        self.vc = None
        self.cpt = None
        self.version = None
        self.n_spk = None
        self.use_f0 = None
        self.loaded_model = None
        self.vocoder = "Default"
        self.checkpointing = False
        # Set for real in setup(); defined here so convert_audio() can read it
        # even when the checkpoint failed to load.
        self.energy = False
        self.tg = None
        self.sample_rate = 16000
        self.sid = sid
        self.get_vc(model_path, sid)

    def convert_audio(self, audio_input_path, audio_output_path, index_path, embedder_model, pitch, f0_method, index_rate, rms_mix_rate, protect, hop_length, f0_autotune, f0_autotune_strength, filter_radius, clean_audio, clean_strength, export_format, resample_sr = 0, checkpointing = False, f0_file = None, f0_onnx = False, embedders_mode = "fairseq", formant_shifting = False, formant_qfrency = 0.8, formant_timbre = 0.8, split_audio = False, proposal_pitch = False, proposal_pitch_threshold = 0, audio_processing = False, alpha = 0.5):
        """Convert one audio file and write the result to ``audio_output_path``.

        Steps: load (and optionally pre-process) the input at 16 kHz, run the
        conversion pipeline on the whole signal or on silence-split chunks,
        optionally post-process, resample and noise-gate the output, pad it to
        the input duration and write it with ``soundfile``.

        Errors are logged, not raised. If the input cannot be peak-normalised
        (e.g. it is empty), the input file is copied to the output path as is.
        """
        self.checkpointing = checkpointing

        try:
            with tqdm(total=10, desc=translations["convert_audio"], ncols=100, unit="a", leave=not split_audio) as pbar:
                audio = load_audio(audio_input_path, self.sample_rate, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre)
                if audio_processing: audio = preprocess(audio, self.sample_rate, device=self.device)

                try:
                    audio_max = np.abs(audio).max() / 0.95
                    if audio_max > 1: audio /= audio_max
                except Exception:
                    # Unusable input: hand the original file through untouched.
                    import shutil
                    shutil.copy(audio_input_path, audio_output_path)
                    return

                if self.hubert_model is None:
                    models = load_embedders_model(embedder_model, embedders_mode)
                    if isinstance(models, torch.nn.Module): models = models.to(torch.float16 if self.config.is_half else torch.float32).eval().to(self.device)
                    self.hubert_model = models

                pbar.update(1)
                if split_audio:
                    pbar.close()
                    chunks = cut(audio, self.sample_rate, db_thresh=-60, min_interval=500)

                    logger.info(f"{translations['split_total']}: {len(chunks)}")
                    pbar = tqdm(total=len(chunks) * 5 + 4, desc=translations["convert_audio"], ncols=100, unit="a", leave=True)
                else: chunks = [(audio, 0, 0)]

                # index_path may be None when called programmatically.
                file_index = (index_path or "").strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added")

                pbar.update(1)
                converted_chunks = [(
                    start,
                    end,
                    self.vc.pipeline(
                        logger=logger,
                        model=self.hubert_model,
                        net_g=self.net_g,
                        sid=self.sid,
                        audio=waveform,
                        f0_up_key=pitch,
                        f0_method=f0_method,
                        file_index=file_index,
                        index_rate=index_rate,
                        pitch_guidance=self.use_f0,
                        filter_radius=filter_radius,
                        rms_mix_rate=rms_mix_rate,
                        version=self.version,
                        protect=protect,
                        hop_length=hop_length,
                        f0_autotune=f0_autotune,
                        f0_autotune_strength=f0_autotune_strength,
                        f0_file=f0_file,
                        f0_onnx=f0_onnx,
                        pbar=pbar,
                        proposal_pitch=proposal_pitch,
                        proposal_pitch_threshold=proposal_pitch_threshold,
                        energy_use=self.energy,
                        del_onnx=not split_audio,
                        alpha=alpha
                    )
                ) for waveform, start, end in chunks]

                pbar.update(1)
                audio_output = restore(converted_chunks, total_len=len(audio), dtype=converted_chunks[0][2].dtype) if split_audio else converted_chunks[0][2]

                # Track the output rate locally. Overwriting self.tgt_sr with
                # resample_sr made every later call skip resampling while the
                # pipeline kept producing audio at the model's own rate.
                out_sr = self.tgt_sr

                if audio_processing: audio_output = postprocess(audio_output, out_sr, audio, self.sample_rate, device=self.device)
                if out_sr != resample_sr and resample_sr > 0:
                    audio_output = librosa.resample(audio_output, orig_sr=out_sr, target_sr=resample_sr, res_type="soxr_vhq")
                    out_sr = resample_sr

                pbar.update(1)
                if clean_audio:
                    from main.tools.noisereduce import TorchGate
                    if self.tg is None: self.tg = TorchGate(out_sr, prop_decrease=clean_strength).to(self.device)
                    audio_output = self.tg(torch.from_numpy(audio_output).unsqueeze(0).to(self.device).float()).squeeze(0).cpu().detach().numpy()

                # Zero-pad so the output is never shorter (in seconds) than the input.
                if len(audio) / self.sample_rate > len(audio_output) / out_sr:
                    padding = np.zeros(int(np.round(len(audio) / self.sample_rate * out_sr) - len(audio_output)), dtype=audio_output.dtype)
                    audio_output = np.concatenate([audio_output, padding])

                try:
                    sf.write(audio_output_path, audio_output, out_sr, format=export_format)
                except Exception:
                    # Retry at 48 kHz when the write at out_sr fails.
                    sf.write(audio_output_path, librosa.resample(audio_output, orig_sr=out_sr, target_sr=48000, res_type="soxr_vhq"), 48000, format=export_format)

                pbar.update(1)
                # The replacement bar created for split audio is not managed by
                # the ``with`` statement, so close it explicitly.
                if split_audio: pbar.close()
        except Exception as e:
            import traceback
            logger.debug(traceback.format_exc())
            logger.error(translations["error_convert"].format(e=e))

    def get_vc(self, weight_root, sid):
        """Load the checkpoint at ``weight_root`` unless it is already loaded.

        An empty ``sid`` (``""`` or ``[]``) first releases the current model.
        """
        if sid == "" or sid == []:
            self.cleanup()
            clear_gpu_cache()

        if not self.loaded_model or self.loaded_model != weight_root:
            self.loaded_model = weight_root
            self.cpt = load_model(weight_root)
            if self.cpt is not None: self.setup()

    def cleanup(self):
        """Drop the references to all loaded models and free the GPU cache."""
        # Attributes are reset to None instead of deleted, so later reads see
        # None and do not raise AttributeError.
        if self.hubert_model is not None:
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            clear_gpu_cache()

        self.net_g = self.cpt = None
        clear_gpu_cache()

    def setup(self):
        """Build the synthesizer and the pipeline from the loaded checkpoint."""
        if self.cpt is not None:
            if self.loaded_model.endswith(".pth"):
                from main.library.algorithm.synthesizers import Synthesizer

                self.tgt_sr = self.cpt["config"][-1]
                # Take the speaker count from the embedding table in the weights.
                self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]

                self.use_f0 = self.cpt.get("f0", 1)
                self.version = self.cpt.get("version", "v1")
                self.vocoder = self.cpt.get("vocoder", "Default")
                self.energy = self.cpt.get("energy", False)

                # Non-default vocoders are always run in float32.
                if self.vocoder != "Default": self.config.is_half = False
                # NOTE(review): setup() runs from __init__, before convert_audio()
                # assigns self.checkpointing, so this always receives False --
                # confirm whether that is intended.
                self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, checkpointing=self.checkpointing, energy=self.energy)
                del self.net_g.enc_q

                self.net_g.load_state_dict(self.cpt["weight"], strict=False)
                self.net_g.eval().to(self.device)
                self.net_g = self.net_g.to(torch.float16 if self.config.is_half else torch.float32)
                self.n_spk = self.cpt["config"][-3]
            else:
                # Non-.pth models: the loaded object is used directly and
                # carries its metadata in a ``cpt`` mapping.
                self.net_g = self.cpt.to(config.device)
                self.tgt_sr = self.cpt.cpt.get("tgt_sr", 32000)
                self.use_f0 = self.cpt.cpt.get("f0", 1)
                self.version = self.cpt.cpt.get("version", "v1")
                self.energy = self.cpt.cpt.get("energy", False)

            self.vc = Pipeline(self.tgt_sr, self.config)
-
-if __name__ == "__main__": main()
\ No newline at end of file
diff --git a/main/inference/conversion/pipeline.py b/main/inference/conversion/pipeline.py
deleted file mode 100644
index 9bbe8d1e5732cf0471512417231536b812c1a123..0000000000000000000000000000000000000000
--- a/main/inference/conversion/pipeline.py
+++ /dev/null
@@ -1,207 +0,0 @@
-import os
-import sys
-import torch
-
-import numpy as np
-import torch.nn.functional as F
-
-from scipy import signal
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import translations
-from main.library.utils import extract_features, change_rms, clear_gpu_cache, load_faiss_index
-
# 5th-order 48 Hz high-pass (designed for 16 kHz audio) applied to every input.
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

class Pipeline:
    """Chunked voice-conversion pipeline operating on 16 kHz input audio."""

    def __init__(self, tgt_sr, config):
        """Derive all padding/window lengths (in samples) from ``config``.

        Args:
            tgt_sr: Output sample rate of the synthesizer.
            config: Object providing ``x_pad``, ``x_query``, ``x_center``,
                ``x_max`` (in seconds), ``device`` and ``is_half``.
        """
        self.x_pad = config.x_pad
        self.x_query = config.x_query
        self.x_center = config.x_center
        self.x_max = config.x_max
        self.sample_rate = 16000
        self.window = 160
        self.t_pad = self.sample_rate * self.x_pad
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sample_rate * self.x_query
        self.t_center = self.sample_rate * self.x_center
        self.t_max = self.sample_rate * self.x_max
        self.f0_min = 50
        self.f0_max = 1100
        self.device = config.device
        self.is_half = config.is_half
        self.tgt_sr = tgt_sr

    def voice_conversion(self, model, net_g, sid, audio0, pitch, pitchf, index, big_npy, index_rate, version, protect, energy):
        """Convert one padded audio segment and return it as a float numpy array.

        ``pitch``/``pitchf`` and ``energy`` are optional: passing ``None``
        disables pitch guidance or energy conditioning respectively. When
        ``index`` and ``big_npy`` are given and ``index_rate`` is non-zero, the
        extracted features are blended with their nearest index neighbours.
        """
        pitch_guidance = pitch is not None and pitchf is not None
        energy_use = energy is not None
        dtype = torch.float16 if self.is_half else torch.float32

        feats = torch.from_numpy(audio0).to(self.device).to(dtype)
        feats = feats.mean(-1) if feats.dim() == 2 else feats
        assert feats.dim() == 1, feats.dim()

        with torch.no_grad():
            feats = extract_features(model, feats.view(1, -1), version, self.device)
            # Keep the raw features so unvoiced frames can be protected below.
            feats0 = feats.clone() if protect < 0.5 and pitch_guidance else None

            if index is not None and big_npy is not None and index_rate != 0:
                npy = feats[0].cpu().numpy()
                if self.is_half: npy = npy.astype(np.float32)

                # Inverse-square-distance weighting of the 8 nearest neighbours.
                score, ix = index.search(npy, k=8)
                weight = np.square(1 / score)

                npy = np.sum(big_npy[ix] * np.expand_dims(weight / weight.sum(axis=1, keepdims=True), axis=2), axis=1)
                if self.is_half: npy = npy.astype(np.float16)

                feats = (torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats)

            feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
            p_len = min(audio0.shape[0] // self.window, feats.shape[1])

            if pitch_guidance: pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
            if energy_use: energy = energy[:p_len].unsqueeze(0)

            if feats0 is not None:
                # Voiced frames (f0 > 0) keep the new features; unvoiced frames
                # are mixed back toward the raw features by ``protect``.
                pitchff = pitchf.clone()
                pitchff[pitchf > 0] = 1
                pitchff[pitchf < 1] = protect
                pitchff = pitchff.unsqueeze(-1)

                feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
                feats = (feats * pitchff + feats0 * (1 - pitchff)).to(feats0.dtype)

            p_len = torch.tensor([p_len], device=self.device).long()
            feats = feats.to(dtype)

            audio1 = (
                (
                    net_g.infer(
                        feats,
                        p_len,
                        pitch if pitch_guidance else None,
                        pitchf.to(dtype) if pitch_guidance else None,
                        sid,
                        energy.to(dtype) if energy_use else None
                    )[0][0, 0]
                ).data.cpu().float().numpy()
            )

        del feats, feats0, p_len

        clear_gpu_cache()
        return audio1

    def pipeline(self, logger, model, net_g, sid, audio, f0_up_key, f0_method, file_index, index_rate, pitch_guidance, filter_radius, rms_mix_rate, version, protect, hop_length, f0_autotune, f0_autotune_strength, f0_file=None, f0_onnx=False, pbar=None, proposal_pitch=False, proposal_pitch_threshold=255.0, energy_use=False, del_onnx=True, alpha = 0.5):
        """Convert a full 16 kHz signal and return the synthesized audio.

        Long inputs (longer than ``x_max`` seconds) are split near the quietest
        sample around every ``x_center`` seconds; each piece is converted with
        ``voice_conversion`` and the results are concatenated.

        ``f0_file`` may be an object with a ``name`` attribute or a plain path
        to a text file of comma-separated float rows (a manual F0 curve).
        ``pbar``, when given, is advanced five times.
        """
        # The parentheses matter: without them the conditional expression binds
        # tighter than the comma, so ``big_npy`` was always None and the index
        # was never applied. Assumes load_faiss_index returns (index, big_npy),
        # as the two-name unpacking implies.
        index, big_npy = load_faiss_index(file_index) if index_rate != 0 else (None, None)
        if pbar: pbar.update(1)

        opt_ts, audio_opt = [], []
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")

        if audio_pad.shape[0] > self.t_max:
            # Sliding-window sum over ``window`` samples, used to find quiet
            # split points.
            audio_sum = np.zeros_like(audio)

            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]

            for t in range(self.t_center, audio.shape[0], self.t_center):
                # First position of minimum |sum| within +/- t_query of t.
                opt_ts.append(t - self.t_query + np.argmin(np.abs(audio_sum[t - self.t_query : t + self.t_query])))

        s = 0
        t, inp_f0 = None, None
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        p_len = audio_pad.shape[0] // self.window

        # Accept both file-like objects (``.name``) and plain string paths;
        # previously a path given as a string was silently ignored.
        f0_path = getattr(f0_file, "name", f0_file)
        if isinstance(f0_path, str) and f0_path.strip():
            try:
                with open(f0_path, "r") as f:
                    raw_lines = f.read()

                if len(raw_lines) > 0:
                    inp_f0 = np.array([[float(i) for i in line.split(",")] for line in raw_lines.strip("\n").split("\n")], dtype=np.float32)
            except Exception:
                logger.error(translations["error_readfile"])
                inp_f0 = None

        if pbar: pbar.update(1)

        if pitch_guidance:
            # NOTE(review): the generator is created once per Pipeline, so
            # hop_length/alpha/f0_onnx from later calls are not applied.
            if not hasattr(self, "f0_generator"):
                from main.library.predictors.Generator import Generator
                self.f0_generator = Generator(self.sample_rate, hop_length, self.f0_min, self.f0_max, alpha, self.is_half, self.device, f0_onnx, del_onnx)

            pitch, pitchf = self.f0_generator.calculator(self.x_pad, f0_method, audio_pad, f0_up_key, p_len, filter_radius, f0_autotune, f0_autotune_strength, manual_f0=inp_f0, proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold)
            if self.device == "mps": pitchf = pitchf.astype(np.float32)
            pitch, pitchf = torch.tensor(pitch[:p_len], device=self.device).unsqueeze(0).long(), torch.tensor(pitchf[:p_len], device=self.device).unsqueeze(0).float()

        if pbar: pbar.update(1)

        if energy_use:
            if not hasattr(self, "rms_extract"):
                from main.inference.extracting.rms import RMSEnergyExtractor
                self.rms_extract = RMSEnergyExtractor(frame_length=2048, hop_length=self.window, center=True, pad_mode = "reflect").to(self.device).eval()

            energy = self.rms_extract(torch.from_numpy(audio_pad).to(self.device).unsqueeze(0))[:p_len].to(self.device).float()

        if pbar: pbar.update(1)

        for t in opt_ts:
            # Snap the split point to a frame boundary.
            t = t // self.window * self.window
            audio_opt.append(
                self.voice_conversion(
                    model,
                    net_g,
                    sid,
                    audio_pad[s : t + self.t_pad2 + self.window],
                    pitch[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None,
                    pitchf[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                    energy[:, s // self.window : (t + self.t_pad2) // self.window] if energy_use else None
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
            s = t

        # Remainder after the last split point (``t`` is None when the input
        # was not split, which selects the whole signal).
        audio_opt.append(
            self.voice_conversion(
                model,
                net_g,
                sid,
                audio_pad[t:],
                (pitch[:, t // self.window :] if t is not None else pitch) if pitch_guidance else None,
                (pitchf[:, t // self.window :] if t is not None else pitchf) if pitch_guidance else None,
                index,
                big_npy,
                index_rate,
                version,
                protect,
                (energy[:, t // self.window :] if t is not None else energy) if energy_use else None
            )[self.t_pad_tgt : -self.t_pad_tgt]
        )

        if pbar: pbar.update(1)

        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1: audio_opt = change_rms(audio, self.sample_rate, audio_opt, self.tgt_sr, rms_mix_rate)

        # Scale down only when the peak would exceed 0.99 of full scale.
        audio_max = np.abs(audio_opt).max() / 0.99
        if audio_max > 1: audio_opt /= audio_max

        if pitch_guidance: del pitch, pitchf
        del sid

        clear_gpu_cache()
        return audio_opt
\ No newline at end of file
diff --git a/main/inference/create_dataset.py b/main/inference/create_dataset.py
deleted file mode 100644
index 7c86ff0537a34921c1ab4ff0fa12d2a399ffb11b..0000000000000000000000000000000000000000
--- a/main/inference/create_dataset.py
+++ /dev/null
@@ -1,380 +0,0 @@
-import os
-import sys
-import time
-import torch
-import yt_dlp
-import shutil
-import librosa
-import argparse
-import warnings
-
-import numpy as np
-import soundfile as sf
-
-from urllib.parse import urlparse
-from distutils.util import strtobool
-
-sys.path.append(os.getcwd())
-
-from main.app.variables import config, logger, translations
-from main.inference.separate_music import _separate, vr_models
-
-dataset_temp = "dataset_temp"
-
def parse_arguments():
    """Build the command-line parser for dataset creation and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding every dataset-creation option.
    """
    # Boolean options are passed as text and converted through strtobool.
    as_bool = lambda x: bool(strtobool(x))

    # (flag, add_argument keyword arguments), in registration order.
    options = (
        ("--input_data", dict(type=str, required=True)),
        ("--output_dirs", dict(type=str, default="./dataset")),
        ("--skip_seconds", dict(type=as_bool, default=False)),
        ("--skip_start_audios", dict(type=str, default="0")),
        ("--skip_end_audios", dict(type=str, default="0")),
        ("--separate", dict(type=as_bool, default=True)),
        ("--model_name", dict(type=str, default="MDXNET_Main")),
        ("--reverb_model", dict(type=str, default="MDX-Reverb")),
        ("--denoise_model", dict(type=str, default="Normal")),
        ("--sample_rate", dict(type=int, default=48000)),
        ("--shifts", dict(type=int, default=2)),
        ("--batch_size", dict(type=int, default=1)),
        ("--overlap", dict(type=float, default=0.25)),
        ("--aggression", dict(type=int, default=5)),
        ("--hop_length", dict(type=int, default=1024)),
        ("--window_size", dict(type=int, default=512)),
        ("--segments_size", dict(type=int, default=256)),
        ("--post_process_threshold", dict(type=float, default=0.2)),
        ("--enable_tta", dict(type=as_bool, default=False)),
        ("--enable_denoise", dict(type=as_bool, default=False)),
        ("--high_end_process", dict(type=as_bool, default=False)),
        ("--enable_post_process", dict(type=as_bool, default=False)),
        ("--separate_reverb", dict(type=as_bool, default=False)),
        ("--clean_dataset", dict(type=as_bool, default=False)),
        ("--clean_strength", dict(type=float, default=0.7)),
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--create_dataset", action='store_true')
    for flag, spec in options:
        parser.add_argument(flag, **spec)

    return parser.parse_args()
-
def main():
    """Command-line entry point: parse the options and build the dataset."""
    options = dict(vars(parse_arguments()))

    # Every parsed option shares its name with a create_dataset parameter,
    # except the --create_dataset switch, which it does not take.
    options.pop("create_dataset", None)

    create_dataset(**options)
-
-def create_dataset(
- input_data,
- output_dirs,
- skip_seconds,
- skip_start_audios,
- skip_end_audios,
- separate,
- model_name,
- reverb_model="MDX-Reverb",
- denoise_model="Normal",
- sample_rate=48000,
- shifts=2,
- batch_size=1,
- overlap=0.25,
- aggression=5,
- hop_length=1024,
- window_size=512,
- segments_size=256,
- post_process_threshold=0.2,
- enable_tta=False,
- enable_denoise=False,
- high_end_process=False,
- enable_post_process=False,
- separate_reverb=False,
- clean_dataset=False,
- clean_strength=0.7
-):
- log_data = {
- translations['audio_path']: input_data,
- translations['output_path']: output_dirs,
- translations['skip']: skip_seconds,
- translations['separator_tab']: separate,
- translations['modelname']: model_name,
- translations['dereveb_audio']: separate_reverb,
- translations['sr']: sample_rate,
- translations['shift']: shifts,
- translations['batch_size']: batch_size,
- translations['overlap']: overlap,
- translations['aggression']: aggression,
- translations['hop_length']: hop_length,
- translations['window_size']: window_size,
- translations['segments_size']: segments_size,
- translations['post_process_threshold']: post_process_threshold,
- translations['enable_tta']: enable_tta,
- translations['denoise_mdx']: enable_denoise,
- translations['high_end_process']: high_end_process,
- translations['enable_post_process']: enable_post_process,
- translations['clear_dataset']: clean_dataset
- }
-
- if clean_dataset: log_data[translations['clean_strength']] = clean_strength
- if separate_reverb: log_data[translations['dereveb_model']] = reverb_model
- if enable_denoise and model_name in list(vr_models.keys()): log_data["Denoise Model"] = denoise_model
- if skip_seconds:
- log_data[translations['skip_start']] = skip_start_audios
- log_data[translations['skip_end']] = skip_end_audios
-
- for key, value in log_data.items():
- logger.debug(f"{key}: {value}")
-
- start_time = time.time()
- inputs_data = input_data.replace(", ", ",").split(",")
-
- pid_path = os.path.join("assets", "create_dataset_pid.txt")
- with open(pid_path, "w") as pid_file:
- pid_file.write(str(os.getpid()))
-
- try:
- if os.path.exists(dataset_temp): shutil.rmtree(dataset_temp, ignore_errors=True)
- else: os.makedirs(dataset_temp, exist_ok=True)
-
- audio_path = [
- downloader(
- url,
- f"audio_{str(inputs_data.index(url))}"
- ) if is_url(url) else url
- for url in inputs_data
- ]
-
- if skip_seconds:
- skip_start_audios, skip_end_audios = skip_start_audios.replace(", ", ",").split(","), skip_end_audios.replace(", ", ",").split(",")
-
- if len(skip_start_audios) < len(audio_path) or len(skip_end_audios) < len(audio_path):
- logger.warning(translations["skip