diff --git a/main/app/app.py b/main/app/app.py deleted file mode 100644 index b357f9b2acc4a5a8977537eeb13bdf05828398a1..0000000000000000000000000000000000000000 --- a/main/app/app.py +++ /dev/null @@ -1,524 +0,0 @@ -import os -import io -import ssl -import sys -import time -import codecs -import logging -import warnings - -import gradio as gr - -sys.path.append(os.getcwd()) -start_time = time.time() - -from main.app.tabs.extra.extra import extra_tab -from main.app.tabs.editing.editing import editing_tab -from main.app.tabs.training.training import training_tab -from main.app.tabs.downloads.downloads import download_tab -from main.app.tabs.inference.inference import inference_tab -from main.configs.rpc import connect_discord_ipc, send_discord_rpc -from main.app.variables import logger, config, translations, theme, font, configs, language, allow_disk - -ssl._create_default_https_context = ssl._create_unverified_context - -warnings.filterwarnings("ignore") -for l in ["httpx", "gradio", "uvicorn", "httpcore", "urllib3"]: - logging.getLogger(l).setLevel(logging.ERROR) - -js_code = """ -() => { - window._activeStream = null; - window._audioCtx = null; - window._workletNode = null; - window._playbackNode = null; - window._ws = null; - - function setStatus(msg, use_alert = true) { - const realtimeStatus = document.querySelector("#realtime-status-info h2.output-class"); - if (use_alert) alert(msg); - - if (realtimeStatus) { - realtimeStatus.innerText = msg; - realtimeStatus.style.whiteSpace = "nowrap"; - realtimeStatus.style.textAlign = "center"; - } - } - - async function addModuleFromString(ctx, codeStr) { - const blob = new Blob([codeStr], {type: 'application/javascript'}); - const url = URL.createObjectURL(blob); - - await ctx.audioWorklet.addModule(url); - URL.revokeObjectURL(url); - }; - - function createOutputRoute(audioCtx, playbackNode, sinkId, gainValue = 1.0) { - const dest = audioCtx.createMediaStreamDestination(); - const gainNode = audioCtx.createGain(); - 
gainNode.gain.value = gainValue; - - playbackNode.connect(gainNode); - gainNode.connect(dest); - - const el = document.createElement('audio'); - el.autoplay = true; - el.srcObject = dest.stream; - el.style.display = 'none'; - document.body.appendChild(el); - - if (el.setSinkId) el.setSinkId(sinkId).catch(err => console.error(err)); - return { dest, gainNode, el }; - } - - const inputWorkletSource = ` - class InputProcessor extends AudioWorkletProcessor { - constructor() { - super(); - this.buffer = new Float32Array(0); - this.block_frame = 128; - this.port.onmessage = (e) => { - if (e.data && e.data.block_frame) this.block_frame = e.data.block_frame; - }; - } - - process(inputs) { - const input = inputs[0]; - if (!input || !input[0]) return true; - const frame = input[0]; - - const newBuf = new Float32Array(this.buffer.length + frame.length); - newBuf.set(this.buffer, 0); - newBuf.set(frame, this.buffer.length); - this.buffer = newBuf; - - while (this.buffer.length >= this.block_frame) { - const chunk = this.buffer.slice(0, this.block_frame); - - this.port.postMessage({chunk}, [chunk.buffer]); - this.buffer = this.buffer.slice(this.block_frame); - } - - return true; - } - } - registerProcessor('input-processor', InputProcessor); - `; - - const playbackWorkletSource = ` - class PlaybackProcessor extends AudioWorkletProcessor { - constructor(options) { - super(options); - const bufferSize = options.processorOptions && options.processorOptions.bufferSize ? 
options.processorOptions.bufferSize: 98304; - this.buffer = new Float32Array(bufferSize); - this.bufferCapacity = bufferSize; - this.writePointer = 0; - this.readPointer = 0; - this.availableSamples = 0; - this.port.onmessage = (e) => { - if (e.data && e.data.chunk) { - const chunk = new Float32Array(e.data.chunk); - const chunkSize = chunk.length; - - if (this.availableSamples + chunkSize > this.bufferCapacity) return; - - for (let i = 0; i < chunkSize; i++) { - this.buffer[this.writePointer] = chunk[i]; - this.writePointer = (this.writePointer + 1) % this.bufferCapacity; - } - - this.availableSamples += chunkSize; - } - }; - } - - process(inputs, outputs) { - const output = outputs[0]; - if (!output || !output[0]) return true; - - const frame = output[0]; - const frameSize = frame.length; - - if (this.availableSamples >= frameSize) { - for (let i = 0; i < frameSize; i++) { - frame[i] = this.buffer[this.readPointer]; - this.readPointer = (this.readPointer + 1) % this.bufferCapacity; - } - this.availableSamples -= frameSize; - } else { - frame.fill(0); - } - - if (output.length > 1) output[1].set(output[0]); - return true; - } - } - registerProcessor('playback-processor', PlaybackProcessor); - `; - - window.getAudioDevices = async function() { - if (!navigator.mediaDevices) { - setStatus("__MEDIA_DEVICES__"); - return {"inputs": {}, "outputs": {}}; - } - - try { - await navigator.mediaDevices.getUserMedia({ audio: true }); - } catch (err) { - console.error(err); - setStatus("__MIC_INACCESSIBLE__") - - return {"inputs": {}, "outputs": {}}; - } - - const devices = await navigator.mediaDevices.enumerateDevices(); - const inputs = {}; - const outputs = {}; - - for (const device of devices) { - if (device.kind === "audioinput") { - inputs[device.label] = device.deviceId - } else if (device.kind === "audiooutput") { - outputs[device.label] = device.deviceId - } - } - - if (!Object.keys(inputs).length && !Object.keys(outputs).length) return {"inputs": {}, "outputs": {}}; 
- return {"inputs": inputs, "outputs": outputs}; - }; - - window.StreamAudioRealtime = async function( - monitor, - vad_enabled, - input_audio_device, - output_audio_device, - monitor_output_device, - input_audio_gain, - output_audio_gain, - monitor_audio_gain, - chunk_size, - pitch, - model_pth, - model_index, - index_strength, - onnx_f0_mode, - f0_method, - hop_length, - embed_mode, - embedders, - custom_embedders, - f0_autotune, - proposal_pitch, - f0_autotune_strength, - proposal_pitch_threshold, - rms_mix_rate, - protect, - filter_radius, - silent_threshold, - extra_convert_size, - cross_fade_overlap_size, - vad_sensitivity, - vad_frame_ms, - clean_audio, - clean_strength - ) { - const SampleRate = 48000; - const ReadChunkSize = Math.round(chunk_size * SampleRate / 1000 / 128); - const block_frame = parseInt(ReadChunkSize) * 128; - const ButtonState = { start_button: true, stop_button: false }; - const devices = await window.getAudioDevices(); - - input_audio_device = devices["inputs"][input_audio_device]; - output_audio_device = devices["outputs"][output_audio_device]; - if (monitor && devices["outputs"][monitor_output_device]) monitor_output_device = devices["outputs"][monitor_output_device]; - - try { - if (!input_audio_device || !output_audio_device) { - setStatus("__PROVIDE_AUDIO_DEVICE__"); - return ButtonState; - } - - if (monitor && !monitor_output_device) { - setStatus("__PROVIDE_MONITOR_DEVICE__"); - return ButtonState; - } - - if (!model_pth) { - setStatus("__PROVIDE_MODEL__") - return ButtonState; - } - - setStatus("__START_REALTIME__", use_alert=false) - - const stream = await navigator.mediaDevices.getUserMedia({ - audio: { - deviceId: { exact: input_audio_device }, - channelCount: 1, - sampleRate: SampleRate, - echoCancellation: false, - noiseSuppression: false, - autoGainControl: false - } - }); - - window._activeStream = stream; - window._audioCtx = new AudioContext({ sampleRate: SampleRate, latencyHint: "interactive" }); - - await 
addModuleFromString(window._audioCtx, inputWorkletSource); - await addModuleFromString(window._audioCtx, playbackWorkletSource); - - const src = window._audioCtx.createMediaStreamSource(stream); - const inputNode = new AudioWorkletNode(window._audioCtx, 'input-processor'); - const playbackNode = new AudioWorkletNode(window._audioCtx, 'playback-processor', { - processorOptions: { - bufferSize: block_frame * 2 - } - }); - - inputNode.port.postMessage({ block_frame: block_frame }); - src.connect(inputNode); - - createOutputRoute(window._audioCtx, playbackNode, output_audio_device, output_audio_gain / 100); - if (monitor && monitor_output_device) createOutputRoute(window._audioCtx, playbackNode, monitor_output_device, monitor_audio_gain / 100); - - const protocol = (location.protocol === "https:") ? "wss:" : "ws:"; - const wsUrl = protocol + '//' + location.hostname + `:${location.port}` + '/api/ws-audio'; - const ws = new WebSocket(wsUrl); - - ButtonState.start_button = false; - ButtonState.stop_button = true; - - ws.binaryType = "arraybuffer"; - window._ws = ws; - - ws.onopen = () => { - console.log("__WS_CONNECTED__") - - ws.send( - JSON.stringify({ - type: 'init', - chunk_size: ReadChunkSize, - embedders: embedders, - model_pth: model_pth, - custom_embedders: custom_embedders, - cross_fade_overlap_size: cross_fade_overlap_size, - extra_convert_size: extra_convert_size, - model_index: model_index, - f0_method: f0_method, - f0_onnx: onnx_f0_mode, - embedders_mode: embed_mode, - hop_length: hop_length, - silent_threshold: silent_threshold, - vad_enabled: vad_enabled, - vad_sensitivity: vad_sensitivity, - vad_frame_ms: vad_frame_ms, - clean_audio: clean_audio, - clean_strength: clean_strength, - f0_up_key: pitch, - index_rate: index_strength, - protect: protect, - filter_radius: filter_radius, - rms_mix_rate: rms_mix_rate, - f0_autotune: f0_autotune, - f0_autotune_strength: f0_autotune_strength, - proposal_pitch: proposal_pitch, - proposal_pitch_threshold: 
proposal_pitch_threshold, - input_audio_gain: input_audio_gain - }) - ); - }; - - inputNode.port.onmessage = (e) => { - const chunk = e.data && e.data.chunk; - - if (!chunk) return; - if (ws.readyState === WebSocket.OPEN) ws.send(chunk); - }; - - ws.onmessage = (ev) => { - if (typeof ev.data === 'string') { - const msg = JSON.parse(ev.data); - - if (msg.type === 'latency') setStatus(`__LATENCY__: ${msg.value.toFixed(1)} ms`, use_alert=false) - if (msg.type === 'warnings') { - setStatus(msg.value); - StopAudioStream(); - } - - return; - } - - const ab = ev.data; - playbackNode.port.postMessage({ chunk: ab }, [ab]); - }; - - ws.onclose = () => console.log("__WS_CLOSED__"); - window._workletNode = inputNode; - window._playbackNode = playbackNode; - - if (window._audioCtx.state === 'suspended') await window._audioCtx.resume(); - - console.log("__REALTIME_STARTED__"); - return ButtonState; - } catch (err) { - console.error("__ERROR__", err); - alert("__ERROR__" + err.message); - - return StopAudioStream(); - } - }; - - window.StopAudioStream = async function() { - try { - if (window._ws) { - window._ws.close(); - window._ws = null; - } - - if (window._activeStream) { - window._activeStream.getTracks().forEach(t => t.stop()); - window._activeStream = null; - } - - if (window._workletNode) { - window._workletNode.disconnect(); - window._workletNode = null; - } - - if (window._playbackNode) { - window._playbackNode.disconnect(); - window._playbackNode = null; - } - - if (window._audioCtx) { - await window._audioCtx.close(); - window._audioCtx = null; - } - - document.querySelectorAll('audio').forEach(a => a.remove()); - setStatus("__REALTIME_HAS_STOP__", use_alert=false); - - return {"start_button": true, "stop_button": false}; - } catch (e) { - setStatus(`__ERROR__ ${e}`); - - return {"start_button": false, "stop_button": true} - } - }; -} -""".replace( - "__MEDIA_DEVICES__", translations["media_devices"] -).replace( - "__MIC_INACCESSIBLE__", 
translations["mic_inaccessible"] -).replace( - "__PROVIDE_AUDIO_DEVICE__", translations["provide_audio_device"] -).replace( - "__PROVIDE_MONITOR_DEVICE__", translations["provide_monitor_device"] -).replace( - "__START_REALTIME__", translations["start_realtime"] -).replace( - "__LATENCY__", translations['latency'] -).replace( - "__WS_CONNECTED__", translations["ws_connected"] -).replace( - "__WS_CLOSED__", translations["ws_closed"] -).replace( - "__REALTIME_STARTED__", translations["realtime_is_ready"] -).replace( - "__ERROR__", translations["error_occurred"].format(e="") -).replace( - "__REALTIME_HAS_STOP__", translations["realtime_has_stop"] -).replace( - "__PROVIDE_MODEL__", translations["provide_file"].format(filename=translations["model"]) -) - -client_mode = True # "--client" in sys.argv - -with gr.Blocks( - title="📱 Vietnamese-RVC GUI BY ANH", - js=js_code if client_mode else None, - theme=theme, - css="".format(fonts=font or "https://fonts.googleapis.com/css2?family=Courgette&display=swap") -) as app: - gr.HTML("

🎵VIETNAMESE RVC BY ANH🎵

") - gr.HTML(f"

{translations['title']}

") - - with gr.Tabs(): - inference_tab() - editing_tab() - - if client_mode: - from main.app.tabs.realtime.realtime_client import realtime_client_tab - realtime_client_tab() - else: - from main.app.tabs.realtime.realtime import realtime_tab - realtime_tab() - - training_tab() - download_tab() - extra_tab(app) - - with gr.Row(): - gr.Markdown(translations["rick_roll"].format(rickroll=codecs.decode('uggcf://jjj.lbhghor.pbz/jngpu?i=qDj4j9JtKpD', 'rot13'))) - - with gr.Row(): - gr.Markdown(translations["terms_of_use"]) - - with gr.Row(): - gr.Markdown(translations["exemption"]) - - if __name__ == "__main__": - logger.info(config.device.replace("privateuseone", "dml")) - logger.info(translations["start_app"]) - logger.info(translations["set_lang"].format(lang=language)) - - port = configs.get("app_port", 7860) - server_name = configs.get("server_name", "0.0.0.0") - share = "--share" in sys.argv - - original_stdout = sys.stdout - sys.stdout = io.StringIO() - - for i in range(configs.get("num_of_restart", 5)): - try: - gradio_app, _, share_url = app.queue().launch( - favicon_path=configs["ico_path"], - server_name=server_name, - server_port=port, - show_error=configs.get("app_show_error", False), - inbrowser="--open" in sys.argv, - share=share, - allowed_paths=allow_disk, - prevent_thread_lock=True, - quiet=True - ) - break - except OSError: - logger.debug(translations["port"].format(port=port)) - port -= 1 - except Exception as e: - logger.error(translations["error_occurred"].format(e=e)) - sys.exit(1) - - if client_mode: - from main.app.core.realtime_client import app as fastapi_app - gradio_app.mount("/api", fastapi_app) - - sys.stdout = original_stdout - - if configs.get("discord_presence", True): - pipe = connect_discord_ipc() - if pipe: - try: - logger.info(translations["start_rpc"]) - send_discord_rpc(pipe) - except KeyboardInterrupt: - logger.info(translations["stop_rpc"]) - pipe.close() - - logger.info(f"{translations['running_local_url']}: {server_name}:{port}") 
def create_srt(model_size, input_audio, output_file, word_timestamps):
    """Transcribe an audio file in a worker process and write the result as SRT.

    Args:
        model_size: Whisper model identifier, forwarded to
            ``check_spk_diarization`` and ``whisper_process``.
        input_audio: Path of an existing audio file (not a directory).
        output_file: Destination path; ``.srt`` is appended when missing.
        word_timestamps: Forwarded unchanged to ``whisper_process``.

    Returns:
        ``[None, None]`` when either path is invalid; otherwise a two-item
        list holding a UI update dict (``value`` = written file path) and the
        full SRT text.
    """
    import multiprocessing as mp

    if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
        gr_warning(translations["input_not_valid"])
        return [None]*2

    # Validate BEFORE appending the extension: the original appended ".srt"
    # first, which made this check unreachable and crashed on None.
    if not output_file:
        gr_warning(translations["output_not_valid"])
        return [None]*2

    if not output_file.endswith(".srt"): output_file += ".srt"

    # A bare filename has an empty dirname; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)

    output_file = process_output(output_file)

    check_spk_diarization(model_size, speechbrain=False)
    gr_info(translations["csrt"])

    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        # The start method was already fixed earlier in this interpreter.
        pass

    whisper_queue = mp.Queue()
    whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, word_timestamps))
    whisperprocess.start()

    # Blocks until the worker publishes its result.
    # NOTE(review): presumably a single list of dicts with "start", "end" and
    # "text" keys (that is how it is consumed below) - confirm in whisper_process.
    segments = whisper_queue.get()

    entries = []

    with open(output_file, "w", encoding="utf-8") as f:
        for number, segment in enumerate(segments, start=1):
            start = format_timestamp(segment["start"])
            end = format_timestamp(segment["end"])
            text = segment["text"].strip()

            entry = f"{number}\n{start} --> {end}\n{text}\n\n"

            f.write(entry)
            entries.append(entry)

    # Join once instead of growing a string per segment, and log the finished
    # transcript a single time.
    info = "".join(entries)
    logger.info(info)

    gr_info(translations["success"])

    return [{"value": output_file, "visible": True, "__type__": "update"}, info]

def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The original reassigned ``seconds`` to its integer part before deriving
    the milliseconds, so the fractional field was always ``000``. Here the
    value is converted to whole milliseconds once (rounded, negatives clamped
    to zero) and then split, so rounding carries correctly into the seconds,
    minutes and hours fields.
    """
    total_ms = max(0, int(round(seconds * 1000)))

    total_seconds, miliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02}:{minutes:02}:{secs:02},{miliseconds:03}"
def move_file(file, download_dir, model):
    """Unpack a downloaded archive if needed and hand the files to the sorter.

    Ensures the configured weights and logs directories exist, extracts
    ``file`` into ``download_dir`` when its name ends in ``.zip``, then calls
    ``move_files_from_directory`` on ``download_dir``.
    """
    weights_dir, logs_dir = configs["weights_path"], configs["logs_path"]

    for folder in (weights_dir, logs_dir):
        if not os.path.exists(folder): os.makedirs(folder, exist_ok=True)

    is_archive = file.endswith(".zip")
    if is_archive: shutil.unpack_archive(file, download_dir)

    move_files_from_directory(download_dir, weights_dir, logs_dir, model)

def download_model(url=None, model=None):
    """Download a model from a supported host and install it.

    Args:
        url: Link to the model; the host is chosen by substring match.
        model: Optional model name. When empty it is derived from the
            downloaded file name.

    Returns:
        The result of ``gr_warning`` when no URL is given; otherwise a
        translated status string (success, unsupported URL, or the formatted
        error). The temporary ``download_model`` directory is always removed.
    """
    if not url: return gr_warning(translations["provide_url"])

    url = replace_url(url)
    download_dir = "download_model"

    os.makedirs(download_dir, exist_ok=True)

    # Checked in order; the first host fragment found in the URL wins.
    # Lambdas keep the downloader lookup lazy, as in an if/elif chain.
    providers = (
        ("huggingface.co", lambda: huggingface.HF_download_file(url, download_dir)),
        ("google.com", lambda: gdown.gdown_download(url, download_dir)),
        ("mediafire.com", lambda: mediafire.Mediafire_Download(url, download_dir)),
        ("pixeldrain.com", lambda: pixeldrain.pixeldrain(url, download_dir)),
        ("mega.nz", lambda: meganz.mega_download_url(url, download_dir)),
    )

    try:
        gr_info(translations["start"].format(start=translations["download"]))

        for host, fetch in providers:
            if host in url:
                file = fetch()
                break
        else:
            gr_warning(translations["not_support_url"])
            return translations["not_support_url"]

        if not model:
            filename = os.path.basename(file)
            stem = os.path.splitext(filename)[0]

            # Index files carry the model name inside their file name.
            model = extract_name_model(filename) if filename.endswith(".index") else stem
            if model is None: model = stem

        model = replace_modelname(model)

        move_file(file, download_dir, model)
        gr_info(translations["success"])

        return translations["success"]
    except Exception as e:
        message = translations["error_occurred"].format(e=e)

        gr_error(message=message)
        return message
    finally:
        shutil.rmtree(download_dir, ignore_errors=True)
def fetch_models_data(search):
    """Collect the raw HTML table fragments for every result page of a search.

    Pages are requested one after another, starting at 1, until the service
    returns an empty table, a non-200 status, invalid JSON, or the request
    fails. Failures are logged at debug level and end the loop; whatever was
    collected so far is returned.

    Args:
        search: Search term sent to the model index.

    Returns:
        A list of HTML strings, one per non-empty page (possibly empty).
    """
    all_table_data = []
    page = 1
    endpoint = codecs.decode("uggcf://ibvpr-zbqryf.pbz/srgpu_qngn.cuc", "rot13")

    while 1:
        try:
            # Without a timeout a stalled server would block the UI callback
            # forever; a timeout surfaces as RequestException, handled below.
            response = requests.post(url=endpoint, data={"page": page, "search": search}, timeout=30)

            if response.status_code != 200:
                logger.debug(f"{translations['code_error']} {response.status_code}")
                break

            # `or ""` also covers an explicit null "table" field.
            table_data = response.json().get("table") or ""
            if not table_data.strip(): break

            all_table_data.append(table_data)
            page += 1
        except json.JSONDecodeError:
            logger.debug(translations["json_error"])
            break
        except requests.RequestException as e:
            logger.debug(translations["requests_error"].format(e=e))
            break

    return all_table_data

def search_models(name):
    """Search the online model index and refresh ``model_options``.

    Args:
        name: Search term typed by the user.

    Returns:
        ``[None, None]`` when no name is given or nothing is found; otherwise
        two UI update dicts: the dropdown filled with ``model_options`` and
        the download button label.
    """
    if not name:
        gr_warning(translations["provide_name"])
        return [None]*2

    gr_info(translations["start"].format(start=translations["search"]))

    tables = fetch_models_data(name)

    if len(tables) == 0:
        gr_info(translations["not_found"].format(name=name))
        return [None]*2

    model_options.clear()

    for table in tables:
        for row in BeautifulSoup(table, "html.parser").select("tr"):
            name_tag = row.find("a", {"class": "fs-5"})
            url_tag = row.find("a", {"class": "btn btn-sm fw-bold btn-light ms-0 p-1 ps-2 pe-2"})

            # find() returns None for rows (e.g. headers) lacking either
            # link; the original indexed url_tag before checking it and
            # raised TypeError on such rows.
            if name_tag is None or url_tag is None: continue

            href = url_tag.get("href")
            if not href: continue

            url = href.replace("https://easyaivoice.com/run?url=", "")
            if "huggingface" in url: model_options[replace_modelname(name_tag.text)] = url

    gr_info(translations["found"].format(results=len(model_options)))
    return [{"value": "", "choices": model_options, "interactive": True, "visible": True, "__type__": "update"}, {"value": translations["downloads"], "visible": True, "__type__": "update"}]
os.path.join(output_path, f"audio_effects.{export_format}") - output_dir = os.path.dirname(output_path) or output_path - - if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) - output_path = process_output(output_path) - - gr_info(translations["start"].format(start=translations["apply_effect"])) - - subprocess.run([python, configs["audio_effects_path"], "--input_path", input_path, "--output_path", output_path, "--resample", str(resample), "--resample_sr", str(resample_sr), "--chorus_depth", str(chorus_depth), "--chorus_rate", str(chorus_rate), "--chorus_mix", str(chorus_mix), "--chorus_delay", str(chorus_delay), "--chorus_feedback", str(chorus_feedback), "--drive_db", str(distortion_drive), "--reverb_room_size", str(reverb_room_size), "--reverb_damping", str(reverb_damping), "--reverb_wet_level", str(reverb_wet_level), "--reverb_dry_level", str(reverb_dry_level), "--reverb_width", str(reverb_width), "--reverb_freeze_mode", str(reverb_freeze_mode), "--pitch_shift", str(pitch_shift), "--delay_seconds", str(delay_seconds), "--delay_feedback", str(delay_feedback), "--delay_mix", str(delay_mix), "--compressor_threshold", str(compressor_threshold), "--compressor_ratio", str(compressor_ratio), "--compressor_attack_ms", str(compressor_attack_ms), "--compressor_release_ms", str(compressor_release_ms), "--limiter_threshold", str(limiter_threshold), "--limiter_release", str(limiter_release), "--gain_db", str(gain_db), "--bitcrush_bit_depth", str(bitcrush_bit_depth), "--clipping_threshold", str(clipping_threshold), "--phaser_rate_hz", str(phaser_rate_hz), "--phaser_depth", str(phaser_depth), "--phaser_centre_frequency_hz", str(phaser_centre_frequency_hz), "--phaser_feedback", str(phaser_feedback), "--phaser_mix", str(phaser_mix), "--bass_boost_db", str(bass_boost_db), "--bass_boost_frequency", str(bass_boost_frequency), "--treble_boost_db", str(treble_boost_db), "--treble_boost_frequency", str(treble_boost_frequency), "--fade_in_duration", 
def apply_voice_quirk(audio_path, mode, output_path, export_format):
    """Apply one novelty voice effect to an audio file and save the result.

    Args:
        audio_path: Path of an existing audio file (not a directory).
        mode: Key into ``translations["quirk_choice"]``; the mapped value
            selects effect 1-16, and 0 picks one of them at random.
        output_path: Destination file, or an existing directory in which
            ``audio_quirk.<export_format>`` is created.
        export_format: Audio format passed to ``soundfile.write`` and used for
            the output extension.

    Returns:
        The written file path, or ``None`` when a path argument is invalid.
    """
    if not audio_path or not os.path.exists(audio_path) or os.path.isdir(audio_path):
        gr_warning(translations["input_not_valid"])
        return None

    if not output_path:
        gr_warning(translations["output_not_valid"])
        return None

    if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_quirk.{export_format}")

    # Create only the parent directory. The original fell back to
    # `output_path` itself when dirname was empty, so a bare filename such as
    # "out.wav" was created as a *directory* of that name.
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)

    output_path = process_output(output_path)

    gr_info(translations["start"].format(start=translations["apply_effect"]))

    import librosa
    import numpy as np
    import soundfile as sf

    def vibrato(y, sr, freq=5, depth=0.003):
        # Re-read the signal at sinusoidally displaced sample positions,
        # clamped to the valid index range.
        positions = np.arange(len(y))
        offsets = depth * np.sin(2 * np.pi * freq * (positions / sr)) * sr

        return y[np.clip((positions + offsets).astype(int), 0, len(y) - 1)]

    y, sr = librosa.load(audio_path, sr=None)
    output_path = replace_export_format(output_path, export_format)

    mode = translations["quirk_choice"][mode]
    if mode == 0: mode = random.randint(1, 16)

    if mode == 1:
        y *= np.random.uniform(0.5, 0.8, size=len(y))
    elif mode == 2:
        # Bounds given as (low, high): NumPy documents the result as
        # undefined when high < low, which is how the original passed them.
        y = librosa.effects.pitch_shift(y=y + np.random.normal(0, 0.01, y.shape), sr=sr, n_steps=np.random.uniform(-3.5, -1.5))
    elif mode == 3:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=3), rate=1.2)
    elif mode == 4:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=8), rate=1.3)
    elif mode == 5:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-3), rate=0.75)
    elif mode == 6:
        y *= np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.5 + 0.5
    elif mode == 7:
        y = librosa.effects.time_stretch(vibrato(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-4), sr, freq=3, depth=0.004), rate=0.85)
    elif mode == 8:
        # Modulate the signal by a half-second-delayed copy of itself.
        y *= 0.6 + np.pad(y, (sr // 2, 0), mode='constant')[:len(y)] * 0.4
    elif mode == 9:
        y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=2) + np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.02
    elif mode == 10:
        y = vibrato(y, sr, freq=8, depth=0.005)
    elif mode == 11:
        y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=4), rate=1.25)
    elif mode == 12:
        # Each 2048-sample frame is extended by 30% by repeating its last value.
        y = np.hstack([np.pad(f, (0, int(len(f)*0.3)), mode='edge') for f in librosa.util.frame(y, frame_length=2048, hop_length=512).T])
    elif mode == 13:
        y = np.concatenate([y, np.sin(2 * np.pi * np.linspace(0, 1, int(0.05 * sr))) * 0.02])
    elif mode == 14:
        y += np.random.normal(0, 0.005, len(y))
    elif mode == 15:
        # Shuffle 0.2 s chunks.
        frame = int(sr * 0.2)
        chunks = [y[i:i + frame] for i in range(0, len(y), frame)]

        np.random.shuffle(chunks)
        y = np.concatenate(chunks)
    elif mode == 16:
        # Reverse every other 0.3 s chunk in place.
        frame = int(sr * 0.3)

        for i in range(0, len(y), frame * 2):
            y[i:i+frame] = y[i:i+frame][::-1]

    sf.write(output_path, y, sr, format=export_format)
    gr_info(translations["success"])

    return output_path
def f0_extract(audio, f0_method, f0_onnx):
    """Extract the pitch curve of an audio file and save it as text and as a plot.

    Args:
        audio: Path of the audio file to analyse.
        f0_method: Name of the pitch extraction method, also used as the plot title.
        f0_onnx: Forwarded to ``check_assets`` and to the generator's ONNX options.

    Returns:
        ``[txt_path, image_path]`` on success, ``[None, None]`` when ``audio``
        is missing or is a directory.
    """
    audio_is_file = bool(audio) and os.path.exists(audio) and not os.path.isdir(audio)
    if not audio_is_file:
        gr_warning(translations["input_not_valid"])
        return [None, None]

    import librosa
    import numpy as np
    import matplotlib.pyplot as plt

    from main.library.utils import check_assets, load_audio
    from main.library.predictors.Generator import Generator

    check_assets(f0_method, "", f0_onnx, "")

    # Results go into a folder named after the audio file (without extension).
    stem, _ = os.path.splitext(os.path.basename(audio))
    f0_path = os.path.join(configs["f0_path"], stem)
    image_path = os.path.join(f0_path, "f0.png")
    txt_path = os.path.join(f0_path, "f0.txt")

    gr_info(translations["start_extract"])

    if not os.path.exists(f0_path):
        os.makedirs(f0_path, exist_ok=True)

    samples = load_audio(audio, sample_rate=16000)
    generator = Generator(
        16000,
        160,
        50,
        1100,
        0.5,
        is_half=config.is_half,
        device=config.device,
        f0_onnx_mode=f0_onnx,
        del_onnx_model=f0_onnx,
    )
    _, pitchf = generator.calculator(config.x_pad, f0_method, samples, 0, None, 3, False, 0, None, False)

    # Zero entries are replaced by NaN before the logarithm is taken.
    F_temp = np.array(pitchf, dtype=np.float32)
    F_temp[F_temp == 0] = np.nan

    # Express the pitch in cents relative to MIDI note 0.
    f0 = 1200 * np.log2(F_temp / librosa.midi_to_hz(0))

    plt.figure(figsize=(10, 4))
    plt.plot(f0)
    plt.title(f0_method)
    plt.xlabel(translations["time_frames"])
    plt.ylabel(translations["Frequency"])
    plt.savefig(image_path)
    plt.close()

    with open(txt_path, "w") as handle:
        handle.writelines(f"{index * 100.0},{value}\n" for index, value in enumerate(f0))

    gr_info(translations["extract_done"])

    return [txt_path, image_path]
def convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
    """Run the conversion script in a subprocess, one CLI flag per setting.

    Numeric and boolean settings are stringified here; path and name settings
    are forwarded exactly as received. Nothing is returned and the exit status
    of the subprocess is not inspected.
    """
    command = [python, configs["convert_path"]]

    # (flag, value) pairs in the order the script receives them.
    options = (
        ("--pitch", str(pitch)),
        ("--filter_radius", str(filter_radius)),
        ("--index_rate", str(index_rate)),
        ("--rms_mix_rate", str(rms_mix_rate)),
        ("--protect", str(protect)),
        ("--hop_length", str(hop_length)),
        ("--f0_method", f0_method),
        ("--input_path", input_path),
        ("--output_path", output_path),
        ("--pth_path", pth_path),
        ("--index_path", index_path),
        ("--f0_autotune", str(f0_autotune)),
        ("--clean_audio", str(clean_audio)),
        ("--clean_strength", str(clean_strength)),
        ("--export_format", export_format),
        ("--embedder_model", embedder_model),
        ("--resample_sr", str(resample_sr)),
        ("--split_audio", str(split_audio)),
        ("--f0_autotune_strength", str(f0_autotune_strength)),
        ("--checkpointing", str(checkpointing)),
        ("--f0_onnx", str(f0_onnx)),
        ("--embedders_mode", embedders_mode),
        ("--formant_shifting", str(formant_shifting)),
        ("--formant_qfrency", str(formant_qfrency)),
        ("--formant_timbre", str(formant_timbre)),
        ("--f0_file", f0_file),
        ("--proposal_pitch", str(proposal_pitch)),
        ("--proposal_pitch_threshold", str(proposal_pitch_threshold)),
        ("--audio_processing", str(audio_processing)),
        ("--alpha", str(alpha)),
    )

    for flag, value in options:
        command.append(flag)
        command.append(value)

    subprocess.run(command)
convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, input_audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5): - model_path = os.path.join(configs["weights_path"], model) if not os.path.exists(model) else model - - return_none = [None]*6 - return_none[5] = {"visible": True, "__type__": "update"} - - if not use_audio: - if merge_instrument or not_merge_backing or convert_backing or use_original: - gr_warning(translations["turn_on_use_audio"]) - return return_none - - if use_original: - if convert_backing: - gr_warning(translations["turn_off_convert_backup"]) - return return_none - elif not_merge_backing: - gr_warning(translations["turn_off_merge_backup"]) - return return_none - - if not model or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")): - gr_warning(translations["provide_file"].format(filename=translations["model"])) - return return_none - - f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders) - - if use_audio: - output_audio = os.path.join(configs["audios_path"], input_audio_name) - - from main.library.utils import pydub_load - - def get_audio_file(label): - matching_files = [f for f in os.listdir(output_audio) if label in f] - - if not matching_files: return translations["notfound"] - return os.path.join(output_audio, matching_files[0]) - - output_path = os.path.join(output_audio, f"Convert_Vocals.{format}") - output_backing = os.path.join(output_audio, f"Convert_Backing.{format}") - output_merge_backup = 
os.path.join(output_audio, f"Vocals+Backing.{format}") - output_merge_instrument = os.path.join(output_audio, f"Vocals+Instruments.{format}") - - if os.path.exists(output_audio): os.makedirs(output_audio, exist_ok=True) - output_path = process_output(output_path) - - if use_original: - original_vocal = get_audio_file('Original_Vocals_No_Reverb.') - - if original_vocal == translations["notfound"]: original_vocal = get_audio_file('Original_Vocals.') - - if original_vocal == translations["notfound"]: - gr_warning(translations["not_found_original_vocal"]) - return return_none - - input_path = original_vocal - else: - main_vocal = get_audio_file('Main_Vocals_No_Reverb.') - backing_vocal = get_audio_file('Backing_Vocals.') - - if main_vocal == translations["notfound"]: main_vocal = get_audio_file('Main_Vocals.') - if main_vocal == translations["notfound"]: - gr_warning(translations["not_found_main_vocal"]) - return return_none - - if not not_merge_backing and backing_vocal == translations["notfound"]: - gr_warning(translations["not_found_backing_vocal"]) - return return_none - - input_path = main_vocal - backing_path = backing_vocal - - gr_info(translations["convert_vocal"]) - - convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input_path, output_path, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha) - - gr_info(translations["convert_success"]) - - if convert_backing: - output_backing = process_output(output_backing) - - gr_info(translations["convert_backup"]) - - convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, backing_path, output_backing, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, 
f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha) - - gr_info(translations["convert_backup_success"]) - - try: - if not not_merge_backing and not use_original: - backing_source = output_backing if convert_backing else backing_vocal - - output_merge_backup = process_output(output_merge_backup) - - gr_info(translations["merge_backup"]) - - pydub_load(output_path, volume=-4).overlay(pydub_load(backing_source, volume=-6)).export(output_merge_backup, format=format) - - gr_info(translations["merge_success"]) - - if merge_instrument: - vocals = output_merge_backup if not not_merge_backing and not use_original else output_path - - output_merge_instrument = process_output(output_merge_instrument) - - gr_info(translations["merge_instruments_process"]) - - instruments = get_audio_file('Instruments.') - - if instruments == translations["notfound"]: - gr_warning(translations["not_found_instruments"]) - output_merge_instrument = None - else: pydub_load(instruments, volume=-7).overlay(pydub_load(vocals, volume=-4 if use_original else None)).export(output_merge_instrument, format=format) - - gr_info(translations["merge_success"]) - except: - return return_none - - return [(None if use_original else output_path), output_backing, (None if not_merge_backing and use_original else output_merge_backup), (output_path if use_original else None), (output_merge_instrument if merge_instrument else None), {"visible": True, "__type__": "update"}] - else: - if not input or not os.path.exists(input): - gr_warning(translations["input_not_valid"]) - return return_none - - if not output: - gr_warning(translations["output_not_valid"]) - return return_none - - output = replace_export_format(output, format) - - if os.path.isdir(input): - gr_info(translations["is_folder"]) - - if not [f for f in os.listdir(input) if f.lower().endswith(("wav", "mp3", "flac", 
"ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]: - gr_warning(translations["not_found_in_folder"]) - return return_none - - gr_info(translations["batch_convert"]) - - output_dir = os.path.dirname(output) or output - convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output_dir, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha) - - gr_info(translations["batch_convert_success"]) - - return return_none - else: - output_dir = os.path.dirname(output) or output - - if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) - output = process_output(output) - - gr_info(translations["convert_vocal"]) - - convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha) - - gr_info(translations["convert_success"]) - - return_none[0] = output - return return_none - -def convert_selection(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5): - if use_audio: - 
gr_info(translations["search_separate"]) - choice = [f for f in os.listdir(configs["audios_path"]) if os.path.isdir(os.path.join(configs["audios_path"], f))] if config.debug_mode else [f for f in os.listdir(configs["audios_path"]) if os.path.isdir(os.path.join(configs["audios_path"], f)) and any(file.lower().endswith((".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")) for file in os.listdir(os.path.join(configs["audios_path"], f)))] - - gr_info(translations["found_choice"].format(choice=len(choice))) - - if len(choice) == 0: - gr_warning(translations["separator==0"]) - - return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, None, None, None, None, None, {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}] - elif len(choice) == 1: - convert_output = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, None, None, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, choice[0], checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha) - - return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, convert_output[0], convert_output[1], convert_output[2], convert_output[3], convert_output[4], {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}] - else: return [{"choices": choice, "value": choice[0], "interactive": True, "visible": True, "__type__": "update"}, None, None, None, None, None, {"visible": False, "__type__": "update"}, {"visible": True, "__type__": "update"}] - else: - main_convert = convert_audio(clean, autotune, use_audio, use_original, 
def whisper_process(model_size, input_audio, configs, device, out_queue, word_timestamps=True):
    """Transcribe ``input_audio`` with Whisper and report the result through a queue.

    Exactly one item is put on ``out_queue``: the list stored under the
    ``"segments"`` key of the transcription result on success, or the raised
    exception on failure.

    Args:
        model_size: Model identifier passed to ``load_model``.
        input_audio: Path of the audio file to transcribe.
        configs: Mapping read for the optional ``"fp16"`` flag (defaults to False).
        device: Device passed to ``load_model``.
        out_queue: Queue-like object with a ``put`` method that receives the result.
        word_timestamps: Forwarded to ``transcribe``.
    """
    # Bound up front so the cleanup in ``finally`` cannot fail with
    # UnboundLocalError when loading or transcribing raises.
    segments = None

    try:
        # Imported inside the try block so an import failure is also reported
        # through the queue instead of leaving the consumer waiting forever.
        from main.library.speaker_diarization.whisper import load_model

        segments = load_model(model_size, device=device).transcribe(input_audio, fp16=configs.get("fp16", False), word_timestamps=word_timestamps)
        out_queue.put(segments["segments"])
    except Exception as e:
        out_queue.put(e)
    finally:
        del segments
        gc.collect()
Segment - from main.library.utils import check_spk_diarization, pydub_load - from main.library.speaker_diarization.embedding import SpeechBrainPretrainedSpeakerEmbedding - - check_spk_diarization(model_size) - model_pth_1, model_pth_2 = os.path.join(configs["weights_path"], model_1) if not os.path.exists(model_1) else model_1, os.path.join(configs["weights_path"], model_2) if not os.path.exists(model_2) else model_2 - - if (not model_1 or not os.path.exists(model_pth_1) or os.path.isdir(model_pth_1) or not model_pth_1.endswith((".pth", ".onnx"))) and (not model_2 or not os.path.exists(model_pth_2) or os.path.isdir(model_pth_2) or not model_pth_2.endswith((".pth", ".onnx"))): - gr_warning(translations["provide_file"].format(filename=translations["model"])) - return None - - if not model_1: model_pth_1 = model_pth_2 - if not model_2: model_pth_2 = model_pth_1 - - if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio): - gr_warning(translations["input_not_valid"]) - return None - - if not output_audio: - gr_warning(translations["output_not_valid"]) - return None - - output_audio = process_output(output_audio) - gr_info(translations["start_whisper"]) - - try: - try: - mp.set_start_method("spawn") - except: - pass - - whisper_queue = mp.Queue() - whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, True)) - whisperprocess.start() - - segments = whisper_queue.get() - audio = Audio() - - embedding_model = SpeechBrainPretrainedSpeakerEmbedding(embedding=os.path.join(configs["speaker_diarization_path"], "models", "speechbrain"), device=config.device) - y, sr = librosa.load(input_audio, sr=None) - duration = len(y) / sr - - def segment_embedding(segment): - waveform, _ = audio.crop(input_audio, Segment(segment["start"], min(duration, segment["end"]))) - return embedding_model(waveform.mean(dim=0, keepdim=True)[None] if waveform.shape[0] == 2 else waveform[None]) - - def 
time(secs): - return datetime.timedelta(seconds=round(secs)) - - def merge_audio(files_list, time_stamps, original_file_path, output_path, format): - def extract_number(filename): - match = re.search(r'_(\d+)', filename) - return int(match.group(1)) if match else 0 - - total_duration = len(pydub_load(original_file_path)) - combined = AudioSegment.empty() - current_position = 0 - - for file, (start_i, end_i) in zip(sorted(files_list, key=extract_number), time_stamps): - if start_i > current_position: combined += AudioSegment.silent(duration=start_i - current_position) - - combined += pydub_load(file) - current_position = end_i - - if current_position < total_duration: combined += AudioSegment.silent(duration=total_duration - current_position) - combined.export(output_path, format=format) - - return output_path - - embeddings = np.zeros(shape=(len(segments), 192)) - for i, segment in enumerate(segments): - embeddings[i] = segment_embedding(segment) - - labels = AgglomerativeClustering(num_spk).fit(np.nan_to_num(embeddings)).labels_ - for i in range(len(segments)): - segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1) - - merged_segments, current_text = [], [] - current_speaker, current_start = None, None - - for i, segment in enumerate(segments): - speaker = segment["speaker"] - start_time = segment["start"] - text = segment["text"][1:] - - if speaker == current_speaker: - current_text.append(text) - end_time = segment["end"] - else: - if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)}) - - current_speaker = speaker - current_start = start_time - current_text = [text] - end_time = segment["end"] - - if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)}) - - gr_info(translations["whisper_done"]) - - x = "" - for segment in merged_segments: - x += 
f"\n{segment['speaker']} {str(time(segment['start']))} - {str(time(segment['end']))}\n" - x += segment["text"] + "\n" - - logger.info(x) - - del audio, embedding_model, segments, labels - clear_gpu_cache() - gc.collect() - - gr_info(translations["process_audio"]) - - audio = pydub_load(input_audio) - output_folder = "audios_temp" - - if os.path.exists(output_folder): shutil.rmtree(output_folder, ignore_errors=True) - for f in [output_folder, os.path.join(output_folder, "1"), os.path.join(output_folder, "2")]: - os.makedirs(f, exist_ok=True) - - time_stamps, processed_segments = [], [] - for i, segment in enumerate(merged_segments): - start_ms = int(segment["start"] * 1000) - end_ms = int(segment["end"] * 1000) - - index = i + 1 - - segment_filename = os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}.wav") - audio[start_ms:end_ms].export(segment_filename, format="wav") - - processed_segments.append(os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}_output.wav")) - time_stamps.append((start_ms, end_ms)) - - f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders) - - gr_info(translations["process_done_start_convert"]) - - convert(pitch_1, filter_radius, index_strength_1, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "1"), output_folder, model_pth_1, model_index_1, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_1, formant_timbre_1, "", proposal_pitch, proposal_pitch_threshold, audio_processing, alpha) - convert(pitch_2, filter_radius, index_strength_2, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "2"), output_folder, model_pth_2, model_index_2, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, 
def convert_tts(clean, autotune, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
    """Convert a text-to-speech audio file with the selected voice model.

    ``model`` is used as given when it exists as a path, otherwise it is looked
    up inside ``configs["weights_path"]``. ``input`` may be a file or a folder;
    for a folder, the first listed audio file whose name contains ``"tts"`` is
    used. ``output`` may be a file path or an existing folder, in which case
    ``tts.<format>`` is created inside it.

    Returns:
        The output path handed to the conversion script, or ``None`` when the
        model, input or output is rejected (a warning is shown in that case).
    """
    audio_extensions = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")

    # Resolve the model path only when a model was given; os.path.exists and
    # os.path.join raise TypeError on None, which would bypass the warning.
    model_path = None
    if model:
        model_path = model if os.path.exists(model) else os.path.join(configs["weights_path"], model)

    if not model_path or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
        gr_warning(translations["provide_file"].format(filename=translations["model"]))
        return None

    if not input or not os.path.exists(input):
        gr_warning(translations["input_not_valid"])
        return None

    if os.path.isdir(input):
        input_audio = [f for f in os.listdir(input) if "tts" in f and f.lower().endswith(audio_extensions)]

        if not input_audio:
            gr_warning(translations["not_found_in_folder"])
            return None

        input = os.path.join(input, input_audio[0])

    if not output:
        gr_warning(translations["output_not_valid"])
        return None

    output = replace_export_format(output, format)
    if os.path.isdir(output):
        output = os.path.join(output, f"tts.{format}")

    # A bare file name has an empty dirname, and os.makedirs("") raises.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output = process_output(output)

    f0method = method if method != "hybrid" else hybrid_method
    embedder_model = embedders if embedders != "custom" else custom_embedders

    gr_info(translations["convert_vocal"])

    convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)

    gr_info(translations["convert_success"])
    return output
extract(ckpt): - a = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - - for key in a.keys(): - if "enc_q" in key: continue - - opt["weight"][key] = a[key] - - return opt - - try: - ckpt1 = torch.load(pth_1, map_location="cpu", weights_only=True) - ckpt2 = torch.load(pth_2, map_location="cpu", weights_only=True) - - if ckpt1["sr"] != ckpt2["sr"]: - gr_warning(translations["sr_not_same"]) - return [translations["sr_not_same"], None] - - cfg = ckpt1["config"] - cfg_f0 = ckpt1["f0"] - cfg_version = ckpt1["version"] - cfg_sr = ckpt1["sr"] - - vocoder = ckpt1.get("vocoder", "Default") - rms_extract = ckpt1.get("energy", False) - - ckpt1 = extract(ckpt1) if "model" in ckpt1 else ckpt1["weight"] - ckpt2 = extract(ckpt2) if "model" in ckpt2 else ckpt2["weight"] - - if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): - gr_warning(translations["architectures_not_same"]) - return [translations["architectures_not_same"], None] - - gr_info(translations["start"].format(start=translations["fushion_model"])) - - opt = OrderedDict() - opt["weight"] = {} - - for key in ckpt1.keys(): - if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: - min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) - opt["weight"][key] = (ratio * (ckpt1[key][:min_shape0].float()) + (1 - ratio) * (ckpt2[key][:min_shape0].float())).half() - else: opt["weight"][key] = (ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())).half() - - opt["config"] = cfg - opt["sr"] = cfg_sr - opt["f0"] = cfg_f0 - opt["version"] = cfg_version - opt["infos"] = translations["model_fushion_info"].format(name=name, pth_1=pth_1, pth_2=pth_2, ratio=ratio) - opt["vocoder"] = vocoder - opt["energy"] = rms_extract - - output_model = configs["weights_path"] - if not os.path.exists(output_model): os.makedirs(output_model, exist_ok=True) - - torch.save(opt, os.path.join(output_model, name)) - - gr_info(translations["success"]) - return [translations["success"], os.path.join(output_model, 
def model_info(path):
    """Read the metadata of a ``.pth`` or ``.onnx`` model and format it for display.

    For ``.pth`` files the checkpoint dictionary itself is read. For ``.onnx``
    files the JSON stored under the ``model_info`` metadata property is read;
    when that property is absent every field falls back to its "not found" text.

    Args:
        path: Path of the model file.

    Returns:
        The formatted ``translations["model_info"]`` text, or the result of
        ``gr_warning`` when ``path`` is not an existing ``.pth``/``.onnx`` file.
    """
    if not path or not os.path.exists(path) or os.path.isdir(path) or not path.endswith((".pth", ".onnx")):
        return gr_warning(translations["provide_file"].format(filename=translations["model"]))

    def prettify_date(date_str):
        # The placeholder text is not a date; report it as "no date".
        if date_str == translations["not_found_create_time"]:
            return None

        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f").strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError) as e:
            logger.debug(e)
            return translations["format_not_valid"]

    if path.endswith(".pth"):
        # NOTE(review): torch.load unpickles the file, which can execute code
        # embedded in an untrusted checkpoint. Consider weights_only=True once
        # all supported checkpoints are confirmed to load with it.
        model_data = torch.load(path, map_location="cpu")
    else:
        import onnx

        model = onnx.load(path)
        # Stay a dict when the metadata property is missing so the lookups
        # below fall back to their defaults instead of failing on None.
        model_data = {}

        for prop in model.metadata_props:
            if prop.key == "model_info":
                model_data = json.loads(prop.value)
                break

    gr_info(translations["read_info"])

    # Fall back to the "info" entry when "epoch" is absent; if both are
    # missing, show the "not found" text instead of the literal "None".
    epochs = model_data.get("epoch", None)
    if epochs is None:
        epochs = model_data.get("info", None)
    if epochs is None:
        epochs = translations["not_found"].format(name=translations["epoch"])

    steps = model_data.get("step", translations["not_found"].format(name=translations["step"]))
    sr = model_data.get("sr", translations["not_found"].format(name=translations["sr"]))
    f0 = model_data.get("f0", translations["not_found"].format(name=translations["f0"]))
    version = model_data.get("version", translations["not_found"].format(name=translations["version"]))
    creation_date = model_data.get("creation_date", translations["not_found_create_time"])
    model_hash = model_data.get("model_hash", translations["not_found"].format(name="model_hash"))
    pitch_guidance = translations["trained_f0"] if f0 else translations["not_f0"]

    # prettify_date returns None for the placeholder; show the placeholder
    # text in that case rather than "None".
    creation_date_str = (prettify_date(creation_date) if creation_date else None) or translations["not_found_create_time"]

    model_name = model_data.get("model_name", translations["unregistered"])
    model_author = model_data.get("author", translations["not_author"])
    vocoder = model_data.get("vocoder", "Default")
    rms_extract = model_data.get("energy", False)

    gr_info(translations["success"])
    return translations["model_info"].format(model_name=model_name, model_author=model_author, epochs=epochs, steps=steps, version=version, sr=sr, pitch_guidance=pitch_guidance, model_hash=model_hash, creation_date_str=creation_date_str, vocoder=vocoder, rms_extract=rms_extract)
def save_presets(name, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, rms_mix_rate_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold):
    """Write the selected conversion settings to ``<name>.conversion.json``.

    Each ``*_chbox`` flag decides whether its group of settings is stored. The
    proposal-pitch group is stored whenever ``proposal_pitch`` itself is truthy.

    Returns:
        The result of ``change_preset_choices()`` after saving, or the result
        of ``gr_warning`` when no name is given or no checkbox is ticked.
    """
    if not name:
        return gr_warning(translations["provide_filename_settings"])

    checkboxes = (
        cleaner_chbox,
        autotune_chbox,
        pitch_chbox,
        index_strength_chbox,
        resample_sr_chbox,
        filter_radius_chbox,
        rms_mix_rate_chbox,
        protect_chbox,
        split_audio_chbox,
        formant_shifting_chbox,
    )
    if not any(checkboxes):
        return gr_warning(translations["choose1"])

    # (enabled, settings) groups in the order their keys appear in the file.
    groups = (
        (cleaner_chbox, {"cleaner": cleaner, "clean_strength": clean_strength}),
        (autotune_chbox, {"autotune": autotune, "f0_autotune_strength": f0_autotune_strength}),
        (pitch_chbox, {"pitch": pitch}),
        (index_strength_chbox, {"index_strength": index_strength}),
        (resample_sr_chbox, {"resample_sr": resample_sr}),
        (filter_radius_chbox, {"filter_radius": filter_radius}),
        (rms_mix_rate_chbox, {"rms_mix_rate": rms_mix_rate}),
        (protect_chbox, {"protect": protect}),
        (split_audio_chbox, {"split_audio": split_audio}),
        (formant_shifting_chbox, {"formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre}),
        (proposal_pitch, {"proposal_pitch": proposal_pitch, "proposal_pitch_threshold": proposal_pitch_threshold}),
    )
    settings = {
        key: value
        for enabled, values in groups
        if enabled
        for key, value in values.items()
    }

    preset_file = os.path.join(configs["presets_path"], name + ".conversion.json")
    with open(preset_file, "w") as handle:
        json.dump(settings, handle, indent=4)

    gr_info(translations["export_settings"].format(name=name))
    return change_preset_choices()
- if presets: - with open(os.path.join(configs["presets_path"], presets)) as f: - file = json.load(f) - - gr_info(translations["load_presets"].format(presets=presets)) - return [ - file.get("resample_checkbox", resample_checkbox), file.get("audio_effect_resample_sr", audio_effect_resample_sr), - file.get("chorus_depth", chorus_depth), file.get("chorus_rate_hz", chorus_rate_hz), - file.get("chorus_mix", chorus_mix), file.get("chorus_centre_delay_ms", chorus_centre_delay_ms), - file.get("chorus_feedback", chorus_feedback), file.get("distortion_drive_db", distortion_drive_db), - file.get("reverb_room_size", reverb_room_size), file.get("reverb_damping", reverb_damping), - file.get("reverb_wet_level", reverb_wet_level), file.get("reverb_dry_level", reverb_dry_level), - file.get("reverb_width", reverb_width), file.get("reverb_freeze_mode", reverb_freeze_mode), - file.get("pitch_shift_semitones", pitch_shift_semitones), file.get("delay_second", delay_second), - file.get("delay_feedback", delay_feedback), file.get("delay_mix", delay_mix), - file.get("compressor_threshold_db", compressor_threshold_db), file.get("compressor_ratio", compressor_ratio), - file.get("compressor_attack_ms", compressor_attack_ms), file.get("compressor_release_ms", compressor_release_ms), - file.get("limiter_threshold_db", limiter_threshold_db), file.get("limiter_release_ms", limiter_release_ms), - file.get("gain_db", gain_db), file.get("bitcrush_bit_depth", bitcrush_bit_depth), - file.get("clipping_threshold_db", clipping_threshold_db), file.get("phaser_rate_hz", phaser_rate_hz), - file.get("phaser_depth", phaser_depth), file.get("phaser_centre_frequency_hz", phaser_centre_frequency_hz), - file.get("phaser_feedback", phaser_feedback), file.get("phaser_mix", phaser_mix), - file.get("bass_boost", bass_boost), file.get("bass_frequency", bass_frequency), - file.get("treble_boost", treble_boost), file.get("treble_frequency", treble_frequency), - file.get("fade_in", fade_in), file.get("fade_out", 
fade_out), - file.get("chorus_check_box", chorus_check_box), file.get("distortion_checkbox", distortion_checkbox), - file.get("reverb_check_box", reverb_check_box), file.get("delay_check_box", delay_check_box), - file.get("compressor_check_box", compressor_check_box), file.get("limiter", limiter), - file.get("gain_checkbox", gain_checkbox), file.get("bitcrush_checkbox", bitcrush_checkbox), - file.get("clipping_checkbox", clipping_checkbox), file.get("phaser_check_box", phaser_check_box), - file.get("bass_or_treble", bass_or_treble), file.get("fade", fade) - ] - -def audio_effect_save_presets(name, resample_checkbox, audio_effect_resample_sr, chorus_depth, chorus_rate_hz, chorus_mix, chorus_centre_delay_ms, chorus_feedback, distortion_drive_db, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift_semitones, delay_second, delay_feedback, delay_mix, compressor_threshold_db, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold_db, limiter_release_ms, gain_db, bitcrush_bit_depth, clipping_threshold_db, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost, bass_frequency, treble_boost, treble_frequency, fade_in, fade_out, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade): - if not name: return gr_warning(translations["provide_filename_settings"]) - if not any([resample_checkbox, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade, pitch_shift_semitones != 0]): return gr_warning(translations["choose1"]) - - settings = {} - - for checkbox, data in [ - (resample_checkbox, { - "resample_checkbox": resample_checkbox, - "audio_effect_resample_sr": 
audio_effect_resample_sr - }), - (chorus_check_box, { - "chorus_check_box": chorus_check_box, - "chorus_depth": chorus_depth, - "chorus_rate_hz": chorus_rate_hz, - "chorus_mix": chorus_mix, - "chorus_centre_delay_ms": chorus_centre_delay_ms, - "chorus_feedback": chorus_feedback - }), - (distortion_checkbox, { - "distortion_checkbox": distortion_checkbox, - "distortion_drive_db": distortion_drive_db - }), - (reverb_check_box, { - "reverb_check_box": reverb_check_box, - "reverb_room_size": reverb_room_size, - "reverb_damping": reverb_damping, - "reverb_wet_level": reverb_wet_level, - "reverb_dry_level": reverb_dry_level, - "reverb_width": reverb_width, - "reverb_freeze_mode": reverb_freeze_mode - }), - (pitch_shift_semitones != 0, { - "pitch_shift_semitones": pitch_shift_semitones - }), - (delay_check_box, { - "delay_check_box": delay_check_box, - "delay_second": delay_second, - "delay_feedback": delay_feedback, - "delay_mix": delay_mix - }), - (compressor_check_box, { - "compressor_check_box": compressor_check_box, - "compressor_threshold_db": compressor_threshold_db, - "compressor_ratio": compressor_ratio, - "compressor_attack_ms": compressor_attack_ms, - "compressor_release_ms": compressor_release_ms - }), - (limiter, { - "limiter": limiter, - "limiter_threshold_db": limiter_threshold_db, - "limiter_release_ms": limiter_release_ms - }), - (gain_checkbox, { - "gain_checkbox": gain_checkbox, - "gain_db": gain_db - }), - (bitcrush_checkbox, { - "bitcrush_checkbox": bitcrush_checkbox, - "bitcrush_bit_depth": bitcrush_bit_depth - }), - (clipping_checkbox, { - "clipping_checkbox": clipping_checkbox, - "clipping_threshold_db": clipping_threshold_db - }), - (phaser_check_box, { - "phaser_check_box": phaser_check_box, - "phaser_rate_hz": phaser_rate_hz, - "phaser_depth": phaser_depth, - "phaser_centre_frequency_hz": phaser_centre_frequency_hz, - "phaser_feedback": phaser_feedback, - "phaser_mix": phaser_mix - }), - (bass_or_treble, { - "bass_or_treble": bass_or_treble, - 
def read_docx_text(path):
    """Return the text of a .docx file, one paragraph per line.

    Reads ``word/document.xml`` from the archive at ``path``, joins the
    non-empty ``w:t`` text nodes of every ``w:p`` element and drops
    paragraphs that contain no text.
    """
    from xml.etree import ElementTree

    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

    with zipfile.ZipFile(path) as archive:
        root = ElementTree.XML(archive.read("word/document.xml"))

    lines = []
    for para in root.iter(namespace + 'p'):
        text = ''.join(run.text for run in para.iter(namespace + 't') if run.text)
        if text: lines.append(text)

    return '\n'.join(lines)
def move_files_from_directory(src_dir, dest_weights, dest_logs, model_name):
    """Sort the files found under ``src_dir`` into the weights and logs folders.

    ``.index`` files are moved to ``dest_logs/model_name`` under their
    ``replace_punctuation``-cleaned name. ``.pth`` and ``.onnx`` files whose
    name does not start with ``D_`` or ``G_`` are moved to ``dest_weights``
    and renamed to ``model_name`` plus the original extension. Every target
    path goes through ``process_output`` first; other files are left alone.
    """
    for folder, _, names in os.walk(src_dir):
        for name in names:
            source = os.path.join(folder, name)

            if name.endswith(".index"):
                index_dir = os.path.join(dest_logs, model_name)
                os.makedirs(index_dir, exist_ok=True)

                shutil.move(source, process_output(os.path.join(index_dir, replace_punctuation(name))))
                continue

            # D_/G_ prefixed weight files are never installed.
            if name.startswith(("D_", "G_")): continue

            for extension in (".pth", ".onnx"):
                if name.endswith(extension):
                    shutil.move(source, process_output(os.path.join(dest_weights, model_name + extension)))
                    break
def update_sample_rate_dropdown(model):
    """Build the dropdown update listing the sample rates of ``model``.

    Returns ``None`` when ``model`` equals ``translations["success"]``
    (same fall-through as before). Otherwise returns an update dict whose
    choices are the keys stored for ``model`` in the fetched pretrained data,
    with the first key selected.

    ``fetch_pretrained_data`` returns ``{}`` when the download fails, so a
    missing or empty entry yields an empty dropdown instead of raising
    ``KeyError``/``IndexError``.
    """
    # Check the placeholder first so no network request is made for it.
    if model == translations["success"]: return None

    rates = list((fetch_pretrained_data().get(model) or {}).keys())
    if not rates: return {"choices": [], "value": None, "__type__": "update"}

    return {"choices": rates, "value": rates[0], "__type__": "update"}
def realtime_start(
    monitor,
    exclusive_mode,
    vad_enabled,
    input_audio_device,
    output_audio_device,
    monitor_output_device,
    input_audio_gain,
    output_audio_gain,
    monitor_audio_gain,
    input_asio_channels,
    output_asio_channels,
    monitor_asio_channels,
    chunk_size,
    pitch,
    model_pth,
    model_index,
    index_strength,
    onnx_f0_mode,
    f0_method,
    hop_length,
    embed_mode,
    embedders,
    custom_embedders,
    f0_autotune,
    proposal_pitch,
    f0_autotune_strength,
    proposal_pitch_threshold,
    rms_mix_rate,
    protect,
    filter_radius,
    silent_threshold,
    extra_convert_size,
    cross_fade_overlap_size,
    vad_sensitivity,
    vad_frame_ms,
    clean_audio,
    clean_strength
):
    """Start a realtime conversion session and stream status updates.

    This is a generator: every yielded item is a triple of a status string
    and two ``interactive`` update dicts (presumably for the start and stop
    controls -- confirm against the UI wiring).

    The function validates the devices and the model file, builds
    ``AudioCallbacks``, starts its audio manager, then reports
    ``callbacks.latency`` roughly every 0.1 s until ``running`` is cleared
    or the session globals are reset (see ``realtime_stop``).
    """
    global running, callbacks, audio_manager
    running = True

    gr_info(translations["start_realtime"])
    yield translations["start_realtime"], interactive_false, interactive_true

    if not input_audio_device or not output_audio_device:
        gr_warning(translations["provide_audio_device"])
        yield translations["provide_audio_device"], interactive_true, interactive_false
        return

    if monitor and not monitor_output_device:
        gr_warning(translations["provide_monitor_device"])
        yield translations["provide_monitor_device"], interactive_true, interactive_false
        return

    # Only resolve against the weights folder when a name was given:
    # os.path.exists(None) raises TypeError before the check below could warn.
    if model_pth and not os.path.exists(model_pth): model_pth = os.path.join(configs["weights_path"], model_pth)
    embedder_model = (embedders if embedders != "custom" else custom_embedders)

    if not model_pth or not os.path.exists(model_pth) or os.path.isdir(model_pth) or not model_pth.endswith((".pth", ".onnx")):
        gr_warning(translations["provide_file"].format(filename=translations["model"]))
        yield translations["provide_file"].format(filename=translations["model"]), interactive_true, interactive_false
        return

    input_devices, output_devices = audio_device()
    input_device_id = input_devices[input_audio_device][0]
    output_device_id = output_devices[output_audio_device][0]
    output_monitor_id = output_devices[monitor_output_device][0] if monitor else None

    # Gains are divided by 100 before being handed to the callbacks.
    input_audio_gain /= 100.0
    output_audio_gain /= 100.0
    monitor_audio_gain /= 100.0

    # Milliseconds -> number of 128-sample blocks at the device sample rate.
    chunk_size = int(chunk_size * DEVICE_SAMPLE_RATE / 1000 / 128)

    from main.inference.realtime.callbacks import AudioCallbacks

    callbacks = AudioCallbacks(
        pass_through=False,
        read_chunk_size=chunk_size,
        cross_fade_overlap_size=cross_fade_overlap_size,
        input_sample_rate=DEVICE_SAMPLE_RATE,
        output_sample_rate=DEVICE_SAMPLE_RATE,
        extra_convert_size=extra_convert_size,
        model_path=model_pth,
        index_path=model_index,
        f0_method=f0_method,
        f0_onnx=onnx_f0_mode,
        embedder_model=embedder_model,
        embedders_mode=embed_mode,
        sample_rate=PIPELINE_SAMPLE_RATE,
        hop_length=hop_length,
        silent_threshold=silent_threshold,
        f0_up_key=pitch,
        index_rate=index_strength,
        protect=protect,
        filter_radius=filter_radius,
        rms_mix_rate=rms_mix_rate,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        proposal_pitch=proposal_pitch,
        proposal_pitch_threshold=proposal_pitch_threshold,
        input_audio_gain=input_audio_gain,
        output_audio_gain=output_audio_gain,
        monitor_audio_gain=monitor_audio_gain,
        monitor=monitor,
        vad_enabled=vad_enabled,
        vad_sensitivity=vad_sensitivity,
        vad_frame_ms=vad_frame_ms,
        clean_audio=clean_audio,
        clean_strength=clean_strength
    )

    audio_manager = callbacks.audio
    audio_manager.start(
        input_device_id=input_device_id,
        output_device_id=output_device_id,
        output_monitor_id=output_monitor_id,
        exclusive_mode=exclusive_mode,
        asio_input_channel=input_asio_channels,
        asio_output_channel=output_asio_channels,
        asio_output_monitor_channel=monitor_asio_channels,
        read_chunk_size=chunk_size,
        input_audio_sample_rate=DEVICE_SAMPLE_RATE,
        output_monitor_sample_rate=DEVICE_SAMPLE_RATE
    )

    gr_info(translations["realtime_is_ready"])

    while running and callbacks is not None and audio_manager is not None:
        time.sleep(0.1)
        if hasattr(callbacks, "latency"): yield f"{translations['latency']}: {callbacks.latency:.2f} ms", interactive_false, interactive_true

    # A generator's ``return value`` is only attached to StopIteration and is
    # never seen by a consumer iterating the yields, so the final status has
    # to be yielded.
    yield translations["realtime_has_stop"], interactive_true, interactive_false
@app.websocket("/ws-audio")
async def websocket_audio(ws: WebSocket):
    """Serve one realtime voice-conversion session over a WebSocket.

    Protocol as implemented here: the first text frame is a JSON object with
    the session parameters; every following binary frame is a float32 audio
    block. For each block the server sends a JSON ``latency`` text frame and
    then the converted audio as bytes. An invalid model path is reported with
    a ``warnings`` text frame and the connection is closed.

    The shared ``vc_instance`` is dropped and the GPU cache cleared when the
    connection ends, whatever the reason.
    """
    global vc_instance
    await ws.accept()

    logger.info(translations["ws_connected"])

    try:
        params = json.loads(await ws.receive_text())

        read_chunk_size = int(params["chunk_size"])
        block_frame = read_chunk_size * 128
        embedders = params["embedders"]

        # Resolve bare file names against the weights folder; skip the lookup
        # for empty values so they reach the warning below instead of raising.
        model_pth = params["model_pth"]
        if model_pth and not os.path.exists(model_pth): model_pth = os.path.join(configs["weights_path"], model_pth)

        if not model_pth or not os.path.exists(model_pth) or os.path.isdir(model_pth) or not model_pth.endswith((".pth", ".onnx")):
            logger.warning(translations["provide_file"].format(filename=translations["model"]))
            await ws.send_text(json.dumps({"type": "warnings", "value": translations["provide_file"].format(filename=translations["model"])}))
            return

        logger.info(translations["start_realtime"])

        if vc_instance is None:
            vc_instance = VoiceChanger(
                read_chunk_size=read_chunk_size,
                cross_fade_overlap_size=params["cross_fade_overlap_size"],
                input_sample_rate=DEVICE_SAMPLE_RATE,
                extra_convert_size=params["extra_convert_size"]
            )
            vc_instance.initialize(vc_model=RVC_Realtime(
                model_path=model_pth,
                index_path=params["model_index"],
                f0_method=params["f0_method"],
                f0_onnx=params["f0_onnx"],
                embedder_model=(embedders if embedders != "custom" else params["custom_embedders"]),
                embedders_mode=params["embedders_mode"],
                sample_rate=PIPELINE_SAMPLE_RATE,
                hop_length=params["hop_length"],
                silent_threshold=params["silent_threshold"],
                input_sample_rate=DEVICE_SAMPLE_RATE,
                output_sample_rate=DEVICE_SAMPLE_RATE,
                vad_enabled=params["vad_enabled"],
                vad_sensitivity=params["vad_sensitivity"],
                vad_frame_ms=params["vad_frame_ms"],
                clean_audio=params["clean_audio"],
                clean_strength=params["clean_strength"]
            ))

        logger.info(translations["realtime_is_ready"])

        # The parameters are fixed for the whole session: look them up once
        # instead of on every audio block.
        input_gain = params["input_audio_gain"] / 100.0
        convert_options = {
            key: params[key]
            for key in (
                "f0_up_key",
                "index_rate",
                "protect",
                "filter_radius",
                "rms_mix_rate",
                "f0_autotune",
                "f0_autotune_strength",
                "proposal_pitch",
                "proposal_pitch_threshold",
            )
        }

        while 1:
            audio = await ws.receive_bytes()
            arr = np.frombuffer(audio, dtype=np.float32)

            # Force every block to exactly ``block_frame`` samples:
            # zero-pad short blocks, truncate long ones.
            if arr.size != block_frame:
                arr = np.pad(arr, (0, block_frame - arr.size)).astype(np.float32) if arr.size < block_frame else arr[:block_frame].astype(np.float32)

            audio_output, _, perf = vc_instance.on_request(arr * input_gain, **convert_options)

            await ws.send_text(json.dumps({"type": "latency", "value": perf[1]}))
            await ws.send_bytes(audio_output.tobytes())
    except WebSocketDisconnect:
        logger.info(translations["ws_disconnected"])
    except Exception as e:
        import traceback
        logger.debug(traceback.format_exc())
        logger.error(translations["error_occurred"].format(e=e))
    finally:
        vc_instance = None
        clear_gpu_cache()

        # Best effort: the socket is usually already closed at this point.
        try:
            await ws.close()
        except Exception as e:
            logger.debug(e)

        logger.info(translations["ws_closed"])
def separate_music(
    input_path,
    output_dirs,
    export_format,
    model_name,
    karaoke_model,
    reverb_model,
    denoise_model,
    sample_rate,
    shifts,
    batch_size,
    overlap,
    aggression,
    hop_length,
    window_size,
    segments_size,
    post_process_threshold,
    enable_tta,
    enable_denoise,
    high_end_process,
    enable_post_process,
    separate_backing,
    separate_reverb
):
    """Run the separation script on ``input_path`` and return the stem paths.

    Returns a list of four entries: original vocals, instruments, main
    vocals and backing vocals. The last two are ``None`` unless
    ``separate_backing`` is set. ``[None]*4`` is returned when the input
    file or the output folder is not usable.

    The paths are built from the expected file names under
    ``<output_dirs>/<input file stem>``; the exit status of the script is not
    inspected, so the files are not guaranteed to exist.
    """
    if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path):
        gr_warning(translations["input_not_valid"])
        return [None]*4

    if not output_dirs:
        gr_warning(translations["output_not_valid"])
        return [None]*4

    output_dirs = os.path.dirname(output_dirs) or output_dirs

    # The folder used to be rejected when missing, which made the creation
    # step that followed unreachable; create it on demand instead.
    os.makedirs(output_dirs, exist_ok=True)
    gr_info(translations["start"].format(start=translations["separator_music"]))

    subprocess.run([
        python, configs["separate_path"],
        "--input_path", input_path,
        "--output_dirs", output_dirs,
        "--export_format", export_format,
        "--model_name", model_name,
        "--karaoke_model", karaoke_model,
        "--reverb_model", reverb_model,
        "--denoise_model", denoise_model,
        "--sample_rate", str(sample_rate),
        "--shifts", str(shifts),
        "--batch_size", str(batch_size),
        "--overlap", str(overlap),
        "--aggression", str(aggression),
        "--hop_length", str(hop_length),
        "--window_size", str(window_size),
        "--segments_size", str(segments_size),
        "--post_process_threshold", str(post_process_threshold),
        "--enable_tta", str(enable_tta),
        "--enable_denoise", str(enable_denoise),
        "--high_end_process", str(high_end_process),
        "--enable_post_process", str(enable_post_process),
        "--separate_backing", str(separate_backing),
        "--separate_reverb", str(separate_reverb),
    ])

    gr_info(translations["success"])

    filename, _ = os.path.splitext(os.path.basename(input_path))
    output_dirs = os.path.join(output_dirs, filename)

    return [
        os.path.join(
            output_dirs,
            f"Original_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Original_Vocals.{export_format}"
        ),
        os.path.join(
            output_dirs,
            f"Instruments.{export_format}"
        ),
        os.path.join(
            output_dirs,
            f"Main_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Main_Vocals.{export_format}"
        ) if separate_backing else None,
        os.path.join(
            output_dirs,
            f"Backing_Vocals.{export_format}"
        ) if separate_backing else None
    ] if os.path.isfile(input_path) else [None]*4
@@ -1,265 +0,0 @@ -import os -import sys -import time -import shutil -import codecs -import threading -import subprocess - -sys.path.append(os.getcwd()) - -from main.tools import huggingface -from main.app.core.ui import gr_info, gr_warning -from main.app.variables import python, translations, configs - -def if_done(done, p): - while 1: - if p.poll() is None: time.sleep(0.5) - else: break - - done[0] = True - -def log_read(done, name): - log_file = os.path.join(configs["logs_path"], "app.log") - - f = open(log_file, "w", encoding="utf-8") - f.close() - - while 1: - with open(log_file, "r", encoding="utf-8") as f: - yield "".join(line for line in f.readlines() if "DEBUG" not in line and name in line and line.strip() != "") - - time.sleep(1) - if done[0]: break - - with open(log_file, "r", encoding="utf-8") as f: - log = "".join(line for line in f.readlines() if "DEBUG" not in line and line.strip() != "") - - yield log - -def create_dataset( - input_data, - output_dirs, - skip_seconds, - skip_start_audios, - skip_end_audios, - separate, - model_name, - reverb_model, - denoise_model, - sample_rate, - shifts, - batch_size, - overlap, - aggression, - hop_length, - window_size, - segments_size, - post_process_threshold, - enable_tta, - enable_denoise, - high_end_process, - enable_post_process, - separate_reverb, - clean_dataset, - clean_strength -): - gr_info(translations["start"].format(start=translations["create"])) - - p = subprocess.Popen(f'{python} {configs["create_dataset_path"]} --input_data "{input_data}" --output_dirs "{output_dirs}" --skip_seconds {skip_seconds} --skip_start_audios "{skip_start_audios}" --skip_end_audios "{skip_end_audios}" --separate {separate} --model_name "{model_name}" --reverb_model "{reverb_model}" --denoise_model "{denoise_model}" --sample_rate {sample_rate} --shifts {shifts} --batch_size {batch_size} --overlap {overlap} --aggression {aggression} --hop_length {hop_length} --window_size {window_size} --segments_size {segments_size} 
--post_process_threshold {post_process_threshold} --enable_tta {enable_tta} --enable_denoise {enable_denoise} --high_end_process {high_end_process} --enable_post_process {enable_post_process} --separate_reverb {separate_reverb} --clean_dataset {clean_dataset} --clean_strength {clean_strength}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - - for log in log_read(done, "create_dataset"): - yield log - -def create_reference(audio_path, reference_name, pitch_guidance, use_energy, version, embedder_model, embedders_mode, f0_method, f0_onnx, f0_up_key, filter_radius, f0_autotune, f0_autotune_strength, proposal_pitch, proposal_pitch_threshold, alpha=0.5): - gr_info(translations["start"].format(start=translations["create_reference"])) - - p = subprocess.Popen(f'{python} {configs["create_reference_path"]} --audio_path "{audio_path}" --reference_name "{reference_name}" --pitch_guidance {pitch_guidance} --use_energy {use_energy} --version {version} --embedder_model {embedder_model} --embedders_mode {embedders_mode} --f0_method {f0_method} --f0_onnx {f0_onnx} --f0_up_key {f0_up_key} --filter_radius {filter_radius} --f0_autotune {f0_autotune} --f0_autotune_strength {f0_autotune_strength} --proposal_pitch {proposal_pitch} --proposal_pitch_threshold {proposal_pitch_threshold} --alpha {alpha}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - - for log in log_read(done, "create_reference"): - yield log - -def preprocess(model_name, sample_rate, cpu_core, cut_preprocess, process_effects, dataset, clean_dataset, clean_strength, chunk_len=3.0, overlap_len=0.3, normalization_mode="none"): - sr = int(float(sample_rate.rstrip("k")) * 1000) - - if not model_name: return gr_warning(translations["provide_name"]) - if not os.path.exists(dataset) or not any(f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")) for f in os.listdir(dataset) if 
os.path.isfile(os.path.join(dataset, f))): return gr_warning(translations["not_found_data"]) - - model_dir = os.path.join(configs["logs_path"], model_name) - if os.path.exists(model_dir): shutil.rmtree(model_dir, ignore_errors=True) - - p = subprocess.Popen(f'{python} {configs["preprocess_path"]} --model_name "{model_name}" --dataset_path "{dataset}" --sample_rate {sr} --cpu_cores {cpu_core} --cut_preprocess {cut_preprocess} --process_effects {process_effects} --clean_dataset {clean_dataset} --clean_strength {clean_strength} --chunk_len {chunk_len} --overlap_len {overlap_len} --normalization_mode {normalization_mode}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - os.makedirs(model_dir, exist_ok=True) - - for log in log_read(done, "preprocess"): - yield log - -def extract(model_name, version, method, pitch_guidance, hop_length, cpu_cores, gpu, sample_rate, embedders, custom_embedders, onnx_f0_mode, embedders_mode, f0_autotune, f0_autotune_strength, hybrid_method, rms_extract, alpha=0.5): - f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders) - sr = int(float(sample_rate.rstrip("k")) * 1000) - - if not model_name: return gr_warning(translations["provide_name"]) - model_dir = os.path.join(configs["logs_path"], model_name) - - try: - if not any(os.path.isfile(os.path.join(model_dir, "sliced_audios", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios"))) or not any(os.path.isfile(os.path.join(model_dir, "sliced_audios_16k", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios_16k"))): return gr_warning(translations["not_found_data_preprocess"]) - except: - return gr_warning(translations["not_found_data_preprocess"]) - - p = subprocess.Popen(f'{python} {configs["extract_path"]} --model_name "{model_name}" --rvc_version {version} --f0_method {f0method} --pitch_guidance {pitch_guidance} --hop_length {hop_length} 
--cpu_cores {cpu_cores} --gpu {gpu} --sample_rate {sr} --embedder_model {embedder_model} --f0_onnx {onnx_f0_mode} --embedders_mode {embedders_mode} --f0_autotune {f0_autotune} --f0_autotune_strength {f0_autotune_strength} --rms_extract {rms_extract} --alpha {alpha}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - os.makedirs(model_dir, exist_ok=True) - - for log in log_read(done, "extract"): - yield log - -def create_index(model_name, rvc_version, index_algorithm): - if not model_name: return gr_warning(translations["provide_name"]) - model_dir = os.path.join(configs["logs_path"], model_name) - - try: - if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"]) - except: - return gr_warning(translations["not_found_data_extract"]) - - p = subprocess.Popen(f'{python} {configs["create_index_path"]} --model_name "{model_name}" --rvc_version {rvc_version} --index_algorithm {index_algorithm}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - os.makedirs(model_dir, exist_ok=True) - - for log in log_read(done, "create_index"): - yield log - -def training(model_name, rvc_version, save_every_epoch, save_only_latest, save_every_weights, total_epoch, sample_rate, batch_size, gpu, pitch_guidance, not_pretrain, custom_pretrained, pretrain_g, pretrain_d, detector, threshold, clean_up, cache, model_author, vocoder, checkpointing, deterministic, benchmark, optimizer, energy_use, custom_reference=False, reference_name="", multiscale_mel_loss=False): - sr = int(float(sample_rate.rstrip("k")) * 1000) - if not model_name: return gr_warning(translations["provide_name"]) - - model_dir = os.path.join(configs["logs_path"], model_name) - if os.path.exists(os.path.join(model_dir, "train_pid.txt")): os.remove(os.path.join(model_dir, "train_pid.txt")) - 
- try: - if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"]) - except: - return gr_warning(translations["not_found_data_extract"]) - - if not not_pretrain: - if not custom_pretrained: - pretrain_dir = configs["pretrained_v2_path"] if rvc_version == 'v2' else configs["pretrained_v1_path"] - download_version = codecs.decode(f"uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_", "rot13") + f"{rvc_version}/" - - pretrained_selector = { - True: { - 32000: ("f0G32k.pth", "f0D32k.pth"), - 40000: ("f0G40k.pth", "f0D40k.pth"), - 48000: ("f0G48k.pth", "f0D48k.pth") - }, - False: { - 32000: ("G32k.pth", "D32k.pth"), - 40000: ("G40k.pth", "D40k.pth"), - 48000: ("G48k.pth", "D48k.pth") - } - } - - pg2, pd2 = "", "" - pg, pd = pretrained_selector[pitch_guidance][sr] - - if energy_use: pg2, pd2 = pg2 + "ENERGY_", pd2 + "ENERGY_" - if vocoder != 'Default': pg2, pd2 = pg2 + vocoder + "_", pd2 + vocoder + "_" - - pg2, pd2 = pg2 + pg, pd2 + pd - pretrained_G, pretrained_D = ( - os.path.join( - pretrain_dir, - pg2 - ), - os.path.join( - pretrain_dir, - pd2 - ) - ) - - try: - if not os.path.exists(pretrained_G): - gr_info(translations["download_pretrained"].format(dg="G", rvc_version=rvc_version)) - huggingface.HF_download_file( - "".join( - [ - download_version, - pg2 - ] - ), - os.path.join( - pretrain_dir, - pg2 - ) - ) - - if not os.path.exists(pretrained_D): - gr_info(translations["download_pretrained"].format(dg="D", rvc_version=rvc_version)) - huggingface.HF_download_file( - "".join( - [ - download_version, - pd2 - ] - ), - os.path.join( - pretrain_dir, - pd2 - ) - ) - except: - gr_warning(translations["not_use_pretrain_error_download"]) - pretrained_G = pretrained_D = None - else: - if not pretrain_g: return gr_warning(translations["provide_pretrained"].format(dg="G")) - if not pretrain_d: 
return gr_warning(translations["provide_pretrained"].format(dg="D")) - - pg2, pd2 = pretrain_g, pretrain_d - pretrained_G, pretrained_D = ( - (os.path.join(configs["pretrained_custom_path"], pg2) if not os.path.exists(pg2) else pg2), - (os.path.join(configs["pretrained_custom_path"], pd2) if not os.path.exists(pd2) else pd2) - ) - - if not os.path.exists(pretrained_G): return gr_warning(translations["not_found_pretrain"].format(dg="G")) - if not os.path.exists(pretrained_D): return gr_warning(translations["not_found_pretrain"].format(dg="D")) - else: - pretrained_G = pretrained_D = None - gr_warning(translations["not_use_pretrain"]) - - if custom_reference: - reference_path = os.path.join(configs["reference_path"], reference_name) - - if not os.path.exists(reference_path): - gr_warning(translations["not_found_reference"]) - - custom_reference = False - reference_path = None - else: reference_path = None - - gr_info(translations["start"].format(start=translations["training"])) - - p = subprocess.Popen(f'{python} {configs["train_path"]} --model_name "{model_name}" --rvc_version {rvc_version} --save_every_epoch {save_every_epoch} --save_only_latest {save_only_latest} --save_every_weights {save_every_weights} --total_epoch {total_epoch} --batch_size {batch_size} --gpu {gpu} --pitch_guidance {pitch_guidance} --overtraining_detector {detector} --overtraining_threshold {threshold} --cleanup {clean_up} --cache_data_in_gpu {cache} --g_pretrained_path "{pretrained_G}" --d_pretrained_path "{pretrained_D}" --model_author "{model_author}" --vocoder "{vocoder}" --checkpointing {checkpointing} --deterministic {deterministic} --benchmark {benchmark} --optimizer {optimizer} --energy_use {energy_use} --use_custom_reference {custom_reference} --reference_path {reference_path} --multiscale_mel_loss {multiscale_mel_loss}', shell=True) - done = [False] - - with open(os.path.join(model_dir, "train_pid.txt"), "w") as pid_file: - pid_file.write(str(p.pid)) - - 
threading.Thread(target=if_done, args=(done, p)).start() - - for log in log_read(done, "train"): - lines = log.splitlines() - if len(lines) > 50: log = "\n".join(lines[-50:]) - yield log \ No newline at end of file diff --git a/main/app/core/tts.py b/main/app/core/tts.py deleted file mode 100644 index 327fd849371c19fedf405e27bf426d6bfc57c903..0000000000000000000000000000000000000000 --- a/main/app/core/tts.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -import sys -import pysrt -import codecs -import librosa -import asyncio -import requests -import tempfile - -sys.path.append(os.getcwd()) - -from main.app.variables import translations -from main.app.core.ui import gr_info, gr_warning, gr_error - -def synthesize_tts(prompt, voice, speed, output, pitch, google): - if not google: - from edge_tts import Communicate - asyncio.run(Communicate(text=prompt, voice=voice, rate=f"+{speed}%" if speed >= 0 else f"{speed}%", pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz").save(output)) - else: - response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"), params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"}, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}) - - if response.status_code == 200: - with open(output, "wb") as f: - f.write(response.content) - - if pitch != 0 or speed != 0: - y, sr = librosa.load(output, sr=None) - - if pitch != 0: y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch) - if speed != 0: y = librosa.effects.time_stretch(y, rate=speed) - - import soundfile as sf - sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', '')) - else: gr_error(f"{response.status_code}, {response.text}") - -def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False): - import numpy as np - import soundfile as sf - - def time_stretch(y, 
sr, target_duration): - rate = (len(y) / sr) / target_duration - if rate != 1.0: y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate) - - n_target = int(round(target_duration * sr)) - return np.pad(y, (0, n_target - len(y))) if len(y) < n_target else y[:n_target] - - def pysrttime_to_seconds(t): - return (t.hours * 60 + t.minutes) * 60 + t.seconds + t.milliseconds / 1000 - - subs = pysrt.open(srt_file) - if not subs: raise ValueError(translations["srt"]) - - final_audio = np.zeros(int(round(pysrttime_to_seconds(subs[-1].end) * sr)), dtype=np.float32) - - with tempfile.TemporaryDirectory() as tempdir: - for idx, seg in enumerate(subs): - wav_path = os.path.join(tempdir, f"seg_{idx}.wav") - synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, wav_path, rate, google) - - audio, file_sr = sf.read(wav_path, dtype=np.float32) - if file_sr != sr: audio = np.interp(np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)), np.arange(len(audio)), audio) - adjusted = time_stretch(audio, sr, pysrttime_to_seconds(seg.duration)) - - start_sample = int(round(pysrttime_to_seconds(seg.start) * sr)) - end_sample = start_sample + adjusted.shape[0] - - if end_sample > final_audio.shape[0]: - adjusted = adjusted[: final_audio.shape[0] - start_sample] - end_sample = final_audio.shape[0] - - final_audio[start_sample:end_sample] += adjusted - - sf.write(out_file, final_audio, sr) - -def TTS(prompt, voice, speed, output, pitch, google, srt_input): - if not srt_input: srt_input = "" - - if not prompt and not srt_input.endswith(".srt"): - gr_warning(translations["enter_the_text"]) - return None - - if not voice: - gr_warning(translations["choose_voice"]) - return None - - if not output: - gr_warning(translations["output_not_valid"]) - return None - - if os.path.isdir(output): output = os.path.join(output, f"tts.wav") - gr_info(translations["convert"].format(name=translations["text"])) - - output_dir = os.path.dirname(output) or output - if not 
os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) - - if srt_input.endswith(".srt"): srt_tts(srt_input, output, voice, 0, 24000, google) - else: synthesize_tts(prompt, voice, speed, output, pitch, google) - - gr_info(translations["success"]) - return output \ No newline at end of file diff --git a/main/app/core/ui.py b/main/app/core/ui.py deleted file mode 100644 index 87362b30005132bfc722bc606b81a9f3ce4ad973..0000000000000000000000000000000000000000 --- a/main/app/core/ui.py +++ /dev/null @@ -1,362 +0,0 @@ -import os -import re -import sys -import json -import torch -import shutil - -import gradio as gr -import sounddevice as sd - -sys.path.append(os.getcwd()) - -from main.library.backends import directml, opencl -from main.inference.realtime.audio import list_audio_device -from main.app.variables import config, configs, configs_json, logger, translations, edgetts, google_tts_voice, method_f0, method_f0_full, vr_models, mdx_models, demucs_models, embedders_model, spin_model, whisper_model - -def gr_info(message): - gr.Info(message, duration=2) - logger.info(message) - -def gr_warning(message): - gr.Warning(message, duration=2) - logger.warning(message) - -def gr_error(message): - gr.Error(message=message, duration=6) - logger.error(message) - -def get_gpu_info(): - ngpu = torch.cuda.device_count() - gpu_infos = [ - f"{i}: {torch.cuda.get_device_name(i)} ({int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)} GB)" - for i in range(ngpu) - if torch.cuda.is_available() or ngpu != 0 - ] - - if len(gpu_infos) == 0: - if directml.torch_available: - ngpu = directml.device_count() - gpu_infos = [f"{i}: {directml.device_name(i)}" for i in range(ngpu) if directml.is_available() or ngpu != 0] - elif opencl.torch_available: - ngpu = opencl.device_count() - gpu_infos = [f"{i}: {opencl.device_name(i)}" for i in range(ngpu) if opencl.is_available() or ngpu != 0] - else: - ngpu = 0 - gpu_infos = [] - - return "\n".join(gpu_infos) if 
len(gpu_infos) > 0 and not config.cpu_mode else translations["no_support_gpu"] - -def gpu_number_str(): - if config.cpu_mode: return "-" - - ngpu = torch.cuda.device_count() - if ngpu == 0: ngpu = directml.device_count() if directml.torch_available else opencl.device_count() - - return str("-".join(map(str, range(ngpu))) if torch.cuda.is_available() or directml.is_available() or opencl.is_available() else "-") - -def change_f0_choices(): - f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["f0_path"]) for f in files if f.endswith(".txt")]) - return {"value": f0_file[0] if len(f0_file) >= 1 else "", "choices": f0_file, "__type__": "update"} - -def change_audios_choices(input_audio): - audios = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["audios_path"]) for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")]) - return {"value": input_audio if input_audio != "" else (audios[0] if len(audios) >= 1 else ""), "choices": audios, "__type__": "update"} - -def change_reference_choices(): - reference = sorted([re.sub(r'_v\d+_(?:[A-Za-z0-9_]+?)_(True|False)_(True|False)$', '', name) for name in os.listdir(configs["reference_path"]) if os.path.exists(os.path.join(configs["reference_path"], name)) and os.path.isdir(os.path.join(configs["reference_path"], name))]) - return {"value": reference[0] if len(reference) >= 1 else "", "choices": reference, "__type__": "update"} - -def change_models_choices(): - model, index = sorted(list(model for model in os.listdir(configs["weights_path"]) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_"))), sorted([os.path.join(root, name) for root, _, files in os.walk(configs["logs_path"], topdown=False) for name in files if name.endswith(".index") and "trained" not in name]) - return [{"value": model[0] if 
len(model) >= 1 else "", "choices": model, "__type__": "update"}, {"value": index[0] if len(index) >= 1 else "", "choices": index, "__type__": "update"}] - -def change_pretrained_choices(): - pretrainD = sorted([model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "D" in model]) - pretrainG = sorted([model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "G" in model]) - - return [{"choices": pretrainD, "value": pretrainD[0] if len(pretrainD) >= 1 else "", "__type__": "update"}, {"choices": pretrainG, "value": pretrainG[0] if len(pretrainG) >= 1 else "", "__type__": "update"}] - -def change_choices_del(): - return [{"choices": sorted(list(model for model in os.listdir(configs["weights_path"]) if model.endswith(".pth") and not model.startswith("G_") and not model.startswith("D_"))), "__type__": "update"}, {"choices": sorted([os.path.join(configs["logs_path"], f) for f in os.listdir(configs["logs_path"]) if f not in ["mute", "reference"] and os.path.isdir(os.path.join(configs["logs_path"], f))]), "__type__": "update"}] - -def change_preset_choices(): - return {"value": "", "choices": sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".conversion.json"))), "__type__": "update"} - -def change_effect_preset_choices(): - return {"value": "", "choices": sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".effect.json"))), "__type__": "update"} - -def change_tts_voice_choices(google): - return {"choices": google_tts_voice if google else edgetts, "value": google_tts_voice[0] if google else edgetts[0], "__type__": "update"} - -def change_backing_choices(backing, merge): - if backing or merge: return {"value": False, "interactive": False, "__type__": "update"} - elif not backing or not merge: return {"interactive": True, "__type__": "update"} - else: gr_warning(translations["option_not_valid"]) - -def change_download_choices(select): - selects = 
[False]*10 - - if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True - elif select == translations["download_from_csv"]: selects[3] = selects[4] = True - elif select == translations["search_models"]: selects[5] = selects[6] = True - elif select == translations["upload"]: selects[9] = True - else: gr_warning(translations["option_not_valid"]) - - return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))] - -def change_download_pretrained_choices(select): - selects = [False]*7 - - if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True - elif select == translations["list_model"]: selects[3] = selects[4] = selects[5] = True - elif select == translations["upload"]: selects[6] = True - else: gr_warning(translations["option_not_valid"]) - - return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))] - -def get_index(model): - model = os.path.basename(model).split("_")[0] - return {"value": next((f for f in [os.path.join(root, name) for root, _, files in os.walk(configs["logs_path"], topdown=False) for name in files if name.endswith(".index") and "trained" not in name] if model.split(".")[0] in f), ""), "__type__": "update"} if model else None - -def index_strength_show(index): - return {"visible": index != "" and index != None and os.path.exists(index) and os.path.isfile(index), "value": 0.5, "__type__": "update"} - -def hoplength_show(method, hybrid_method=None): - visible = False - - for m in ["mangio-crepe", "fcpe", "yin", "piptrack", "mangio-penn"]: - if m in method: visible = True - if hybrid_method is not None and m in hybrid_method: visible = True - - if visible: break - else: visible = False - - return {"visible": visible, "__type__": "update"} - -def visible(value): - return {"visible": value, "__type__": "update"} - -def valueFalse_interactive(value): - return {"value": False, "interactive": value, "__type__": "update"} - -def 
valueEmpty_visible1(value): - return {"value": "", "visible": value, "__type__": "update"} - -def pitch_guidance_lock(vocoders): - return {"value": True, "interactive": vocoders == "Default", "__type__": "update"} - -def vocoders_lock(pitch, vocoders): - return {"value": vocoders if pitch else "Default", "interactive": pitch, "__type__": "update"} - -def unlock_f0(value): - return {"choices": method_f0_full if value else method_f0, "value": "rmvpe", "__type__": "update"} - -def unlock_vocoder(value, vocoder): - return {"value": vocoder if value == "v2" else "Default", "interactive": value == "v2", "__type__": "update"} - -def unlock_ver(value, vocoder): - return {"value": "v2" if vocoder == "Default" else value, "interactive": vocoder == "Default", "__type__": "update"} - -def change_embedders_mode(value): - if value == "spin": - return {"value": spin_model[0], "choices": spin_model, "__type__": "update"} - elif value == "whisper": - return {"value": whisper_model[0], "choices": whisper_model, "__type__": "update"} - else: - return {"value": embedders_model[0], "choices": embedders_model, "__type__": "update"} - -def change_fp(fp): - fp16 = fp == "fp16" - - if fp16 and config.device in ["cpu", "mps", "ocl:0"]: - gr_warning(translations["fp16_not_support"]) - return "fp32" - else: - gr_info(translations["start_update_precision"]) - - configs = json.load(open(configs_json, "r")) - configs["fp16"] = config.is_half = fp16 - - with open(configs_json, "w") as f: - json.dump(configs, f, indent=4) - - gr_info(translations["success"]) - return "fp16" if fp16 else "fp32" - -def process_output(file_path): - if config.configs.get("delete_exists_file", True): - if os.path.exists(file_path) and os.path.isfile(file_path): os.remove(file_path) - return file_path - else: - if not os.path.exists(file_path): return file_path - file = os.path.splitext(os.path.basename(file_path)) - - index = 1 - while 1: - file_path = os.path.join(os.path.dirname(file_path), 
f"{file[0]}_{index}{file[1]}") - if not os.path.exists(file_path): return file_path - index += 1 - -def shutil_move(input_path, output_path): - output_path = os.path.join(output_path, os.path.basename(input_path)) if os.path.isdir(output_path) else output_path - - return shutil.move(input_path, process_output(output_path)) if os.path.exists(output_path) else shutil.move(input_path, output_path) - -def separate_change(model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise): - model_type = "vr" if model_name in list(vr_models.keys()) else "mdx" if model_name in list(mdx_models.keys()) else "demucs" if model_name in list(demucs_models.keys()) else "" - karaoke_type = ("vr" if karaoke_model.startswith("VR") else "mdx") if separate_backing else None - reverb_type = ("vr" if not reverb_model.startswith("MDX") else "mdx") if separate_reverb else None - - all_types = {model_type, karaoke_type, reverb_type} - - is_vr = "vr" in all_types - is_mdx = "mdx" in all_types - is_demucs = "demucs" in all_types - - return [ - visible(separate_backing), - visible(separate_reverb), - visible(is_mdx or is_demucs), - visible(is_mdx or is_demucs), - visible(is_mdx), - visible(is_mdx or is_vr), - visible(is_demucs), - visible(is_vr), - visible(is_vr), - visible(is_vr and enable_post_process), - visible(is_vr and enable_denoise), - valueFalse_interactive(is_vr), - valueFalse_interactive(is_vr), - valueFalse_interactive(is_vr) - ] - -def create_dataset_change(model_name, reverb_model, enable_post_process, separate_reverb, enable_denoise): - model_type = "vr" if model_name in list(vr_models.keys()) else "mdx" if model_name in list(mdx_models.keys()) else "demucs" if model_name in list(demucs_models.keys()) else "" - reverb_type = ("vr" if not reverb_model.startswith("MDX") else "mdx") if separate_reverb else None - all_types = {model_type, reverb_type} - - is_vr = "vr" in all_types - is_mdx = "mdx" in all_types - is_demucs = "demucs" in 
all_types - - return [ - visible(separate_reverb), - visible(is_mdx or is_demucs), - visible(is_mdx or is_demucs), - visible(is_mdx), - visible(is_mdx or is_vr), - visible(is_demucs), - visible(is_vr), - visible(is_vr), - visible(is_vr and enable_post_process), - visible(is_vr and enable_denoise), - valueFalse_interactive(is_vr), - valueFalse_interactive(is_vr), - valueFalse_interactive(is_vr) - ] - -def audio_device(): - try: - input_devices, output_devices = list_audio_device() - - def priority(name): - n = name.lower() - if "virtual" in n: - return 0 - if "vb" in n: - return 1 - return 2 - - output_sorted = sorted(output_devices, key=lambda d: priority(d.name)) - input_sorted = sorted( - input_devices, key=lambda d: priority(d.name), reverse=True - ) - - input_device_list = { - f"{input_sorted.index(d)+1}: {d.name} ({d.host_api})": [d.index, d.max_input_channels] for d in input_sorted - } - output_device_list = { - f"{output_sorted.index(d)+1}: {d.name} ({d.host_api})": [d.index, d.max_output_channels] for d in output_sorted - } - - return input_device_list, output_device_list - except Exception: - return [], [] - -def update_audio_device(input_device, output_device, monitor_device, monitor): - input_channels_map, output_channels_map = audio_device() - - input_is_asio = "ASIO" in input_device if input_device else False - output_is_asio = "ASIO" in output_device if output_device else False - monitor_is_asio = "ASIO" in monitor_device if monitor_device else False - - try: - input_max_ch = input_channels_map.get(input_device, [])[1] - output_max_ch = output_channels_map.get(output_device, [])[1] - monitor_max_ch = output_channels_map.get(monitor_device, [])[1] if monitor else 128 - except: - input_max_ch = output_max_ch = monitor_max_ch = -1 - - return [ - visible(monitor), - visible(monitor), - visible(monitor_is_asio), - visible(input_is_asio or output_is_asio or monitor_is_asio), - gr.update(visible=input_is_asio, maximum=input_max_ch), - 
gr.update(visible=output_is_asio, maximum=output_max_ch), - gr.update(visible=monitor_is_asio, maximum=monitor_max_ch) - ] - -def change_audio_device_choices(): - sd._terminate() - sd._initialize() - - input_channels_map, output_channels_map = audio_device() - input_channels_map, output_channels_map = list(input_channels_map.keys()), list(output_channels_map.keys()) - - return [ - {"value": input_channels_map[0] if len(input_channels_map) >= 1 else "", "choices": input_channels_map, "__type__": "update"}, - {"value": output_channels_map[0] if len(output_channels_map) >= 1 else "", "choices": output_channels_map, "__type__": "update"}, - {"value": output_channels_map[0] if len(output_channels_map) >= 1 else "", "choices": output_channels_map, "__type__": "update"} - ] - -def replace_punctuation(filename): - return filename.replace(" ", "_").replace("-", "").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "_").replace("{", "").replace("}", "").replace("-_-", "_").replace("_-_", "_").replace("-", "_").replace("---", "_").replace("___", "_").strip() - -def replace_url(url): - return url.replace("/blob/", "/resolve/").replace("?download=true", "").strip() - -def replace_modelname(modelname): - return replace_punctuation(modelname.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", "")) - -def replace_export_format(audio_path, export_format = "wav"): - export_format = f".{export_format}" - - return audio_path if audio_path.endswith(export_format) else audio_path.replace(f".{os.path.basename(audio_path).split('.')[-1]}", export_format) - -def update_dropdowns_from_json(data): - if not data: - return [ - gr.update(choices=[], value=None), - gr.update(choices=[], value=None), - gr.update(choices=[], value=None) - ] - - inputs = list(data.get("inputs", {}).keys()) - outputs = list(data.get("outputs", {}).keys()) - - return [ - gr.update(choices=inputs, 
def _kill_pids(pids):
    """Send signal 9 to every PID in *pids*, skipping the ones that cannot be signalled."""
    for pid in pids:
        try:
            # Numeric 9 rather than signal.SIGKILL: that constant does not
            # exist on Windows, where os.kill terminates the process instead.
            os.kill(int(pid), 9)
        except (OSError, ValueError, TypeError, OverflowError):
            # Already exited, not ours, or not a usable PID: keep going so
            # one stale entry does not leave the other processes running.
            continue


def stop_pid(pid_file, model_name=None, train=False):
    """Kill the processes recorded in a PID file and report the result.

    Args:
        pid_file: Base name (without ``.txt``) of the PID file.
        model_name: When given, the PID file is looked up in that model's
            folder under ``configs["logs_path"]``; otherwise in ``assets``.
        train: When true and *model_name* is given, also kill the PIDs stored
            under ``process_pids`` in the model's ``config.json`` and remove
            that key from the file.

    Returns:
        The result of ``gr_warning`` when the PID file does not exist,
        otherwise ``None``.

    The function stays best-effort: unexpected errors are logged and
    swallowed so the caller never sees an exception.
    """
    import logging

    try:
        if model_name is None:
            pid_file_path = os.path.join("assets", f"{pid_file}.txt")
        else:
            pid_file_path = os.path.join(configs["logs_path"], model_name, f"{pid_file}.txt")

        if not os.path.exists(pid_file_path): return gr_warning(translations["not_found_pid"])

        with open(pid_file_path, "r") as handle:
            # Blank lines (e.g. a trailing newline) are not PIDs.
            pids = [int(line) for line in handle if line.strip()]

        _kill_pids(pids)

        if os.path.exists(pid_file_path): os.remove(pid_file_path)

        # config.json only exists per model; without a model name there is
        # nothing to look up (joining None into the path used to raise a
        # TypeError that silently skipped the final message).
        if train and model_name is not None:
            config_path = os.path.join(configs["logs_path"], model_name, "config.json")

            if os.path.exists(config_path):
                with open(config_path, "r") as handle:
                    pid_data = json.load(handle)

                pids = pid_data.pop("process_pids", None) or []

                with open(config_path, "w") as handle:
                    json.dump(pid_data, handle, indent=4)

                _kill_pids(pids)

        gr_info(translations["end_pid"])
    except Exception:
        logging.getLogger(__name__).exception("stop_pid failed for %r", pid_file)


def google_translate(text, source='auto', target='vi'):
    """Translate *text* with the public Google Translate endpoint.

    Args:
        text: Text to translate.
        source: Source language code (``'auto'`` lets the service detect it).
        target: Target language code.

    Returns:
        The translated text. An empty *text* returns the result of
        ``gr_warning``. If the request or the response parsing fails, the
        original *text* is returned unchanged.
    """
    if text == "": return gr_warning(translations["prompt_warning"])

    import textwrap

    def translate_chunk(chunk):
        response = requests.get(
            codecs.decode("uggcf://genafyngr.tbbtyrncvf.pbz/genafyngr_n/fvatyr", "rot13"),
            params={'client': 'gtx', 'sl': source, 'tl': target, 'dt': 't', 'q': chunk},
            timeout=15,  # never block the UI indefinitely on a stalled connection
        )

        # A non-200 answer leaves this chunk untranslated.
        if response.status_code != 200: return chunk

        return ''.join([i[0] for i in response.json()[0]])

    try:
        chunks = textwrap.wrap(text, 5000, break_long_words=False, break_on_hyphens=False)

        # textwrap drops the whitespace it breaks on, so the chunks are
        # re-joined with a space to avoid fusing the words at each boundary.
        return ' '.join(translate_chunk(chunk) for chunk in chunks)
    except (requests.RequestException, ValueError, TypeError, IndexError, KeyError, AttributeError):
        return text
Sử dụng --help để biết thêm") - quit() - -if argv_is_allows[0] in argv: from main.inference.audio_effects import main -elif argv_is_allows[1] in argv: from main.inference.conversion.convert import main -elif argv_is_allows[2] in argv: from main.inference.create_dataset import main -elif argv_is_allows[3] in argv: from main.inference.create_index import main -elif argv_is_allows[4] in argv: from main.inference.extracting.extract import main -elif argv_is_allows[5] in argv: from main.inference.preprocess.preprocess import main -elif argv_is_allows[6] in argv: from main.inference.separate_music import main -elif argv_is_allows[7] in argv: from main.inference.training.train import main -elif argv_is_allows[17] in argv: from main.inference.create_reference import main -elif argv_is_allows[8] in argv: - print("""Các tham số của `--audio_effects`: - 1. Đường dẫn tệp: - - `--input_path` (bắt buộc): Đường dẫn đến tệp âm thanh đầu vào. - - `--output_path` (mặc định: `./audios/apply_effects.wav`): Đường dẫn lưu tệp đầu ra. - - `--export_format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`, ...). - - 2. Lấy mẫu lại: - - `--resample` (mặc định: `False`): Có lấy mẫu lại hay không. - - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (Hz). - - 3. Hiệu ứng chorus: - - `--chorus`: Bật/tắt chorus. - - `--chorus_depth`, `--chorus_rate`, `--chorus_mix`, `--chorus_delay`, `--chorus_feedback`: Các thông số điều chỉnh chorus. - - 4. Hiệu ứng distortion: - - `--distortion`: Bật/tắt distortion. - - `--drive_db`: Mức độ méo âm thanh. - - 5. Hiệu ứng reverb: - - `--reverb`: Bật/tắt hồi âm. - - `--reverb_room_size`, `--reverb_damping`, `--reverb_wet_level`, `--reverb_dry_level`, `--reverb_width`, `--reverb_freeze_mode`: Điều chỉnh hồi âm. - - 6. Hiệu ứng pitch shift: - - `--pitchshift`: Bật/tắt thay đổi cao độ. - - `--pitch_shift`: Giá trị dịch cao độ. - - 7. Hiệu ứng delay: - - `--delay`: Bật/tắt delay. 
- - `--delay_seconds`, `--delay_feedback`, `--delay_mix`: Điều chỉnh thời gian trễ, phản hồi và hòa trộn. - - 8. Compressor: - - `--compressor`: Bật/tắt compressor. - - `--compressor_threshold`, `--compressor_ratio`, `--compressor_attack_ms`, `--compressor_release_ms`: Các thông số nén. - - 9. Limiter: - - `--limiter`: Bật/tắt giới hạn mức âm thanh. - - `--limiter_threshold`, `--limiter_release`: Ngưỡng giới hạn và thời gian nhả. - - 10. Gain (Khuếch đại): - - `--gain`: Bật/tắt gain. - - `--gain_db`: Mức gain (dB). - - 11. Bitcrush: - - `--bitcrush`: Bật/tắt hiệu ứng giảm độ phân giải. - - `--bitcrush_bit_depth`: Số bit của bitcrush. - - 12. Clipping: - - `--clipping`: Bật/tắt cắt âm thanh. - - `--clipping_threshold`: Ngưỡng clipping. - - 13. Phaser: - - `--phaser`: Bật/tắt hiệu ứng phaser. - - `--phaser_rate_hz`, `--phaser_depth`, `--phaser_centre_frequency_hz`, `--phaser_feedback`, `--phaser_mix`: Điều chỉnh hiệu ứng phaser. - - 14. Boost bass & treble: - - `--treble_bass_boost`: Bật/tắt tăng cường âm bass và treble. - - `--bass_boost_db`, `--bass_boost_frequency`, `--treble_boost_db`, `--treble_boost_frequency`: Các thông số tăng bass và treble. - - 15. Fade in & fade out: - - `--fade_in_out`: Bật/tắt hiệu ứng fade. - - `--fade_in_duration`, `--fade_out_duration`: Thời gian fade vào/ra. - - 16. Kết hợp âm thanh: - - `--audio_combination`: Bật/tắt ghép nhiều tệp âm thanh. - - `--audio_combination_input`: Đường dẫn tệp âm thanh bổ sung. - - `--main_volume`: Âm lượng của âm thanh chính. - - `--combination_volume`:: Âm lượng của âm thanh cần kết hợp. - """) - quit() -elif argv_is_allows[9] in argv: - print("""Các tham số của --convert: - 1. Cấu hình xử lí giọng nói: - - `--pitch` (mặc định: `0`): Điều chỉnh cao độ. - - `--filter_radius` (mặc định: `3`): Độ mượt của đường F0. - - `--index_rate` (mặc định: `0.5`): Tỷ lệ sử dụng chỉ mục giọng nói. - - `--rms_mix_rate` (mặc định: `1`): Hệ số điều chỉnh biên độ âm lượng. - - `--protect` (mặc định: `0.33`): Bảo vệ phụ âm. 
- - `--hop_length` (mặc định: `64`): Bước nhảy khi xử lí âm thanh. - - 2. Cấu hình F0: - - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`). - - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không. - - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0. - - `--f0_file` (mặc định: ``): Đường dẫn tệp F0 có sẵn. - - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không. - - `--proposal_pitch` (mặc định: `False`): Đề xuất cao độ thay vì điều chỉnh thủ công. - - `--proposal_pitch_threshold` (mặc định: `0.0`): Ngưỡng tần số ước tính cao độ. - - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid. - - 3. Mô hình nhúng: - - `--embedder_model` (mặc định: `hubert_base`): Mô hình nhúng sử dụng. - - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`). - - 4. Đường dẫn tệp: - - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào. - - `--output_path` (mặc định: `./audios/output.wav`): Đường dẫn lưu tệp đầu ra. - - `--export_format` (mặc định: `wav`): Định dạng xuất tệp. - - `--pth_path` (bắt buộc): Đường dẫn đến tệp mô hình `.pth`. - - `--index_path` (mặc định: `None`): Đường dẫn tệp chỉ mục (nếu có). - - 5. Làm sạch âm thanh: - - `--clean_audio` (mặc định: `False`): Có áp dụng làm sạch âm thanh không. - - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch. - - 6. Resampling & chia nhỏ âm thanh: - - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (0 nghĩa là giữ nguyên). - - `--split_audio` (mặc định: `False`): Có chia nhỏ audio trước khi xử lí không. - - 7. 
Kiểm tra & tối ưu hóa: - - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM. - - 8. Dịch formant: - - `--formant_shifting` (mặc định: `False`): Có bật hiệu ứng dịch formant không. - - `--formant_qfrency` (mặc định: `0.8`): Hệ số dịch formant theo tần số. - - `--formant_timbre` (mặc định: `0.8`): Hệ số thay đổi màu sắc giọng. - """) - quit() -elif argv_is_allows[10] in argv: - print("""Các tham số của --create_dataset: - 1. Đường dẫn & cấu hình dataset: - - `--input_data` (bắt buộc): Đường dẫn liên kết đến âm thanh (Liên kết Youtube, có thể dùng dấu `,` để dùng nhiều liên kết). - - `--output_dirs` (mặc định: `./dataset`): Thư mục xuất dữ liệu đầu ra. - - `--sample_rate` (mặc định: `48000`): Tần số lấy mẫu cho âm thanh. - - 2. Làm sạch dữ liệu: - - `--clean_dataset` (mặc định: `False`): Có áp dụng làm sạch dữ liệu hay không. - - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch dữ liệu. - - 3. Tách giọng & hiệu ứng: - - `--separate` (mặc định: `True`): có tách nhạc hay không. - - `--separator_reverb` (mặc định: `False`): Có tách vang giọng không. - - `--model_name` (mặc định: `MDXNET_Main`): Mô hình tách nhạc ('Main_340', 'Main_390', 'Main_406', 'Main_427', 'Main_438', 'Inst_full_292', 'Inst_HQ_1', 'Inst_HQ_2', 'Inst_HQ_3', 'Inst_HQ_4', 'Inst_HQ_5', 'Kim_Vocal_1', 'Kim_Vocal_2', 'Kim_Inst', 'Inst_187_beta', 'Inst_82_beta', 'Inst_90_beta', 'Voc_FT', 'Crowd_HQ', 'MDXNET_9482', 'Inst_1', 'Inst_2', 'Inst_3', 'MDXNET_1_9703', 'MDXNET_2_9682', 'MDXNET_3_9662', 'Inst_Main', 'MDXNET_Main', 'HT-Tuned', 'HT-Normal', 'HD_MMI', 'HT_6S', 'HP-1', 'HP-2', 'HP-Vocal-1', 'HP-Vocal-2', 'HP2-1', 'HP2-2', 'HP2-3', 'SP-2B-1', 'SP-2B-2', 'SP-3B-1', 'SP-4B-1', 'SP-4B-2', 'SP-MID-1', 'SP-MID-2'). - - `--reverb_model` (mặc định: `MDX-Reverb`): Mô hình tách nhạc ("MDX-Reverb", 'VR-Reverb', 'Echo-Aggressive', 'Echo-Normal'). - - `--denoise_model` (mặc định: `Normal`): Mô hình tách nhạc ('Lite', 'Normal'). - - 4. 
Cấu hình xử lí âm thanh: - - `--shifts` (mặc định: `2`): Số lượng dự đoán. - - `--batch_size` (mặc định: `1`): Kích thước lô. - - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn. - - `--aggression` (mặc định: `5`): Cường độ chiết xuất thân chính. - - `--hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lí. - - `--window_size` (mặc định: `512`): Kích thước cửa sổ. - - `--segments_size` (mặc định: `256`): Kích thước phân đoạn âm thanh. - - `--post_process_threshold` (mặc định: `0.2`): Mức độ xử lí hậu kỳ sau khi tách nhạc. - - 5. Cấu hình xử lí âm thanh khác: - - `--enable_tta` (mặc định: `False`): Tăng cường suy luận. - - `--enable_denoise` (mặc định: `False`): Khữ tách nhạc. - - `--high_end_process` (mặc định: `False`): Xử lí dải cao. - - `--enable_post_process` (mặc định: `False`): Hậu xử lí. - - 6. Bỏ qua phần âm thanh: - - `--skip_seconds` (mặc định: `False`): Có bỏ qua giây âm thanh nào không. - - `--skip_start_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở đầu audio. - - `--skip_end_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở cuối audio. - """) - quit() -elif argv_is_allows[11] in argv: - print("""Các tham số của --create_index: - 1. Thông tin mô hình: - - `--model_name` (bắt buộc): Tên mô hình. - - `--rvc_version` (mặc định: `v2`): Phiên bản (`v1`, `v2`). - - `--index_algorithm` (mặc định: `Auto`): Thuật toán index sử dụng (`Auto`, `Faiss`, `KMeans`). - """) - quit() -elif argv_is_allows[12] in argv: - print("""Các tham số của --extract: - 1. Thông tin mô hình: - - `--model_name` (bắt buộc): Tên mô hình. - - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`). - - 2. 
Cấu hình F0: - - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`). - - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không. - - `--pitch_guidance` (mặc định: `True`): Có sử dụng hướng dẫn cao độ hay không. - - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không. - - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0. - - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid. - - 3. Cấu hình xử lí: - - `--hop_length` (mặc định: `128`): Độ dài bước nhảy trong quá trình xử lí. - - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng. - - `--gpu` (mặc định: `-`): Chỉ định GPU sử dụng (ví dụ: `0` cho GPU đầu tiên, `-` để tắt GPU). - - `--sample_rate` (bắt buộc): Tần số lấy mẫu của âm thanh đầu vào. - - 4. Cấu hình nhúng: - - `--embedder_model` (mặc định: `hubert_base`): Tên mô hình nhúng. - - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`). - - 4. RMS: - - `--rms_extract` (mặc định: False): Trích xuất thêm năng lượng rms. - """) - quit() -elif argv_is_allows[13] in argv: - print("""Các tham số của --preprocess: - 1. Thông tin mô hình: - - `--model_name` (bắt buộc): Tên mô hình. - - 2. Cấu hình dữ liệu: - - `--dataset_path` (mặc định: `./dataset`): Đường dẫn thư mục chứa tệp dữ liệu. - - `--sample_rate` (bắt buộc): Tần số lấy mẫu của dữ liệu âm thanh. - - 3. Cấu hình xử lí: - - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng. - - `--cut_preprocess` (mặc định: `Automatic`): Cách cắt dữ liệu tiền xử lí (`Automatic`, `Simple`, `Skip`). - - `--process_effects` (mặc định: `False`): Có áp dụng tiền xử lí hay không. 
- - `--clean_dataset` (mặc định: `False`): Có làm sạch tệp dữ liệu hay không. - - `--clean_strength` (mặc định: `0.7`): Độ mạnh của quá trình làm sạch dữ liệu. - - 4. Cấu hình khác: - - `--chunk_len` (mặc định: `3.0`): Độ dài của đoạn âm thanh cho phương pháp 'Simple'. - - `--overlap_len` (mặc định: `0.3`): Độ dài của phần chồng chéo giữa các lát cắt đối với phương pháp 'Simple'. - - `--normalization_mode` (mặc định: `none`): Có xử lí chuẩn hóa âm thanh không (`none`, `pre`, `post`) - """) - quit() -elif argv_is_allows[14] in argv: - print("""Các tham số của --separate_music: - 1. Cấu hình đầu vào, đầu ra: - - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào. - - `--output_dirs` (mặc định: `./audios`): Thư mục lưu tệp đầu ra. - - `--export_format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`,...). - - `--sample_rate` (mặc định: `44100`): Tần số lấy mẫu của âm thanh đầu ra. - - 2. Cấu hình mô hình: - - `--model_name` (mặc định: `MDXNET_Main`): Mô hình tách nhạc ('Main_340', 'Main_390', 'Main_406', 'Main_427', 'Main_438', 'Inst_full_292', 'Inst_HQ_1', 'Inst_HQ_2', 'Inst_HQ_3', 'Inst_HQ_4', 'Inst_HQ_5', 'Kim_Vocal_1', 'Kim_Vocal_2', 'Kim_Inst', 'Inst_187_beta', 'Inst_82_beta', 'Inst_90_beta', 'Voc_FT', 'Crowd_HQ', 'MDXNET_9482', 'Inst_1', 'Inst_2', 'Inst_3', 'MDXNET_1_9703', 'MDXNET_2_9682', 'MDXNET_3_9662', 'Inst_Main', 'MDXNET_Main', 'HT-Tuned', 'HT-Normal', 'HD_MMI', 'HT_6S', 'HP-1', 'HP-2', 'HP-Vocal-1', 'HP-Vocal-2', 'HP2-1', 'HP2-2', 'HP2-3', 'SP-2B-1', 'SP-2B-2', 'SP-3B-1', 'SP-4B-1', 'SP-4B-2', 'SP-MID-1', 'SP-MID-2'). - - `--karaoke_model` (mặc định: `MDX-Version-1`): Mô hình tách nhạc ('MDX-Version-1', 'MDX-Version-2', 'VR-Version-1', 'VR-Version-2'). - - `--reverb_model` (mặc định: `MDX-Reverb`): Mô hình tách nhạc ("MDX-Reverb", 'VR-Reverb', 'Echo-Aggressive', 'Echo-Normal'). - - `--denoise_model` (mặc định: `Normal`): Mô hình tách nhạc ('Lite', 'Normal'). - - 3. Cấu hình xử lí âm thanh: - - `--shifts` (mặc định: `2`): Số lượng dự đoán. 
- - `--batch_size` (mặc định: `1`): Kích thước lô. - - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn. - - `--aggression` (mặc định: `5`): Cường độ chiết xuất thân chính. - - `--hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lí. - - `--window_size` (mặc định: `512`): Kích thước cửa sổ. - - `--segments_size` (mặc định: `256`): Kích thước phân đoạn âm thanh. - - `--post_process_threshold` (mặc định: `0.2`): Mức độ xử lí hậu kỳ sau khi tách nhạc. - - 4. Cấu hình xử lí âm thanh khác: - - `--enable_tta` (mặc định: `False`): Tăng cường suy luận. - - `--enable_denoise` (mặc định: `False`): Khữ tách nhạc. - - `--high_end_process` (mặc định: `False`): Xử lí dải cao. - - `--enable_post_process` (mặc định: `False`): Hậu xử lí. - - `--separate_backing` (mặc định: `False`): Tách bè giọng. - - `--separate_reverb` (mặc định: `False`): Tách vang giọng. - """) - quit() -elif argv_is_allows[15] in argv: - print("""Các tham số của --train: - 1. Cấu hình mô hình: - - `--model_name` (bắt buộc): Tên mô hình. - - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`). - - `--model_author` (tùy chọn): Tác giả của mô hình. - - 2. Cấu hình lưu: - - `--save_every_epoch` (bắt buộc): Số kỷ nguyên giữa mỗi lần lưu. - - `--save_only_latest` (mặc định: `True`): Chỉ lưu điểm mới nhất. - - `--save_every_weights` (mặc định: `True`): Lưu tất cả trọng số của mô hình. - - 3. Cấu hình huấn luyện: - - `--total_epoch` (mặc định: `300`): Tổng số kỷ nguyên huấn luyện. - - `--batch_size` (mặc định: `8`): Kích thước lô trong quá trình huấn luyện. - - 4. Cấu hình thiết bị: - - `--gpu` (mặc định: `0`): Chỉ định GPU để sử dụng (số hoặc `-` nếu không dùng GPU). - - `--cache_data_in_gpu` (mặc định: `False`): Lưu dữ liệu vào GPU để tăng tốc. - - 5. Cấu hình huấn luyện nâng cao: - - `--pitch_guidance` (mặc định: `True`): Sử dụng hướng dẫn cao độ. - - `--g_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số G đã huấn luyện trước. 
- - `--d_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số D đã huấn luyện trước. - - `--vocoder` (mặc định: `Default`): Bộ mã hóa được sử dụng (`Default`, `MRF-HiFi-GAN`, `RefineGAN`). - - `--energy_use` (mặc định: `False`): Sử dụng năng lượng rms. - - 6. Phát hiện huấn luyện quá mức: - - `--overtraining_detector` (mặc định: `False`): Bật/tắt chế độ phát hiện huấn luyện quá mức. - - `--overtraining_threshold` (mặc định: `50`): Ngưỡng để xác định huấn luyện quá mức. - - 7. Xử lí dữ liệu: - - `--cleanup` (mặc định: `False`): Dọn dẹp tệp huấn luyện cũ để tiến hành huấn luyện lại từ đầu. - - 8. Tối ưu: - - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM. - - `--deterministic` (mặc định: `False`): Khi bật sẽ sử dụng các thuật toán có tính xác định cao, đảm bảo rằng mỗi lần chạy cùng một dữ liệu đầu vào sẽ cho kết quả giống nhau. - - `--benchmark` (mặc định: `False`): Khi bật sẽ thử nghiệm và chọn thuật toán tối ưu nhất cho phần cứng và kích thước cụ thể. - - `--optimizer` (mặc định: `AdamW`): Trình tối ưu hóa được sử dụng (`AdamW`, `RAdam`, `AnyPrecisionAdamW`). - - `--multiscale_mel_loss` (mặc định: `False`): So sánh phổ Mel của âm thanh thật và âm thanh giả ở nhiều thang độ khác nhau. Giúp mô hình học được chi tiết âm sắc, độ sáng và cấu trúc tần số tốt hơn, từ đó cải thiện chất lượng và độ tự nhiên của giọng nói đầu ra. - - 9. Bộ tham chiếu: - - `--use_custom_reference` (mặc định: `False`): Có tùy chỉnh bộ tham chiếu hay không. - - `--reference_path` (mặc định: `False`): Đường dẫn đến bộ tham chiếu. - """) - quit() -elif argv_is_allows[18] in argv: - print("""Các tham số của --create_reference: - 1. Đường dẫn tệp: - - `--audio_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào. - - `--reference_name` (mặc định: `reference`): Đường dẫn lưu bộ tham chiếu đầu ra. - - 2. Cấu hình bộ tham chiếu: - - `--pitch_guidance` (mặc định: `True`): Sử dụng hướng dẫn cao độ. - - `--energy_use` (mặc định: `False`): Sử dụng năng lượng rms. 
- - `--version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`). - - 3. Cấu hình nhúng: - - `--embedder_model` (mặc định: `hubert_base`): Tên mô hình nhúng. - - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`). - - 4. Cấu hình F0: - - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`). - - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không. - - `--f0_up_key` (mặc định: `0`): Điều chỉnh cao độ. - - `--filter_radius` (mặc định: `3`): Độ mượt của đường F0. - - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không. - - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0. - - `--f0_file` (mặc định: ``): Đường dẫn tệp F0 có sẵn. - - `--proposal_pitch` (mặc định: `False`): Đề xuất cao độ thay vì điều chỉnh thủ công. - - `--proposal_pitch_threshold` (mặc định: `0.0`): Ngưỡng tần số ước tính cao độ. - - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid. - """) - quit() -elif argv_is_allows[16] in argv: - print("""Sử dụng: - 1. `--help_audio_effects`: Trợ giúp về phần thêm hiệu ứng âm thanh. - 2. `--help_convert`: Trợ giúp về chuyển đổi âm thanh. - 3. `--help_create_dataset`: Trợ giúp về tạo dữ liệu huấn luyện. - 4. `--help_create_index`: Trợ giúp về tạo chỉ mục. - 5. `--help_extract`: Trợ giúp về trích xuất dữ liệu huấn luyện. - 6. `--help_preprocess`: Trợ giúp về xử lí trước dữ liệu. - 7. `--help_separate_music`: Trợ giúp về tách nhạc. - 8. `--help_train`: Trợ giúp về huấn luyện mô hình. - 9. `--help_create_reference`: Trợ giúp về tạo bộ tham chiếu. 
def launch_tensorboard():
    """Start TensorBoard on the configured logs directory.

    Silences warnings and the ``root``/``tensorboard`` loggers, launches
    TensorBoard on ``config.configs["logs_path"]`` using the configured
    port, logs the resulting URL and opens it in a browser when ``--open``
    is on the command line.

    Returns:
        The "<label>: <url>" message that was logged.
    """
    warnings.filterwarnings("ignore")

    for noisy_logger in ("root", "tensorboard"):
        logging.getLogger(noisy_logger).setLevel(logging.ERROR)

    board = program.TensorBoard()
    launch_args = [
        None,
        "--logdir",
        config.configs["logs_path"],
        f"--port={config.configs['tensorboard_port']}",
    ]
    board.configure(argv=launch_args)
    url = board.launch()

    message = f"{translations['tensorboard_url']}: {url}"
    logger.info(message)

    if "--open" in sys.argv:
        webbrowser.open(url)

    return message
fetch_pretrained_data, save_drop_model, update_sample_rate_dropdown - -def download_tab(): - with gr.TabItem(translations["downloads"], visible=configs.get("downloads_tab", True)): - gr.Markdown(translations["download_markdown"]) - with gr.Row(): - gr.Markdown(translations["download_markdown_2"]) - with gr.Row(): - with gr.Accordion(translations["model_download"], open=True): - with gr.Row(): - downloadmodel = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["download_from_csv"], translations["search_models"], translations["upload"]], interactive=True, value=translations["download_url"]) - with gr.Row(): - gr.Markdown("___") - with gr.Column(): - with gr.Row(): - url_input = gr.Textbox(label=translations["model_url"], value="", placeholder="https://...", scale=6) - download_model_name = gr.Textbox(label=translations["modelname"], value="", placeholder=translations["modelname"], scale=2) - url_download = gr.Button(value=translations["downloads"], scale=2) - with gr.Column(): - model_browser = gr.Dropdown(choices=models.keys(), label=translations["model_warehouse"], scale=8, allow_custom_value=True, visible=False) - download_from_browser = gr.Button(value=translations["get_model"], scale=2, variant="primary", visible=False) - with gr.Column(): - search_name = gr.Textbox(label=translations["name_to_search"], placeholder=translations["modelname"], interactive=True, scale=8, visible=False) - search = gr.Button(translations["search_2"], scale=2, visible=False) - search_dropdown = gr.Dropdown(label=translations["select_download_model"], value="", choices=[], allow_custom_value=True, interactive=False, visible=False) - download = gr.Button(translations["downloads"], variant="primary", visible=False) - with gr.Column(): - model_upload = gr.Files(label=translations["drop_model"], file_types=[".pth", ".onnx", ".index", ".zip"], visible=False) - with gr.Row(): - with gr.Accordion(translations["download_pretrained_2"], 
open=False): - with gr.Row(): - pretrain_download_choices = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["list_model"], translations["upload"]], value=translations["download_url"], interactive=True) - with gr.Row(): - gr.Markdown("___") - with gr.Column(): - with gr.Row(): - pretrainD = gr.Textbox(label=translations["pretrained_url"].format(dg="D"), value="", placeholder="https://...", interactive=True, scale=4) - pretrainG = gr.Textbox(label=translations["pretrained_url"].format(dg="G"), value="", placeholder="https://...", interactive=True, scale=4) - download_pretrain_button = gr.Button(translations["downloads"], scale=2) - with gr.Column(): - with gr.Row(): - pretrain_choices = gr.Dropdown(label=translations["select_pretrain"], info=translations["select_pretrain_info"], choices=list(fetch_pretrained_data().keys()), value="Titan_Medium", allow_custom_value=True, interactive=True, scale=6, visible=False) - sample_rate_pretrain = gr.Dropdown(label=translations["pretrain_sr"], info=translations["pretrain_sr"], choices=["48k", "40k", "32k"], value="48k", interactive=True, visible=False) - download_pretrain_choices_button = gr.Button(translations["downloads"], scale=2, variant="primary", visible=False) - with gr.Row(): - pretrain_upload = gr.Files(label=translations["drop_pretrain"].format(dg="G, D"), file_types=[".pth"], visible=False) - with gr.Row(): - url_download.click( - fn=download_model, - inputs=[ - url_input, - download_model_name - ], - outputs=[url_input], - api_name="download_model" - ) - download_from_browser.click( - fn=lambda model: download_model(models[model], model), - inputs=[model_browser], - outputs=[model_browser], - api_name="download_browser" - ) - with gr.Row(): - downloadmodel.change(fn=change_download_choices, inputs=[downloadmodel], outputs=[url_input, download_model_name, url_download, model_browser, download_from_browser, search_name, search, search_dropdown, download, 
model_upload]) - search.click(fn=search_models, inputs=[search_name], outputs=[search_dropdown, download]) - model_upload.upload(fn=save_drop_model, inputs=[model_upload], outputs=[model_upload]) - download.click( - fn=lambda model: download_model(model_options[model], model), - inputs=[search_dropdown], - outputs=[search_dropdown], - api_name="search_models" - ) - with gr.Row(): - pretrain_download_choices.change(fn=change_download_pretrained_choices, inputs=[pretrain_download_choices], outputs=[pretrainD, pretrainG, download_pretrain_button, pretrain_choices, sample_rate_pretrain, download_pretrain_choices_button, pretrain_upload]) - pretrain_choices.change(fn=update_sample_rate_dropdown, inputs=[pretrain_choices], outputs=[sample_rate_pretrain]) - with gr.Row(): - download_pretrain_button.click( - fn=download_pretrained_model, - inputs=[ - pretrain_download_choices, - pretrainD, - pretrainG - ], - outputs=[pretrainD, pretrainG], - api_name="download_pretrain_link" - ) - download_pretrain_choices_button.click( - fn=download_pretrained_model, - inputs=[ - pretrain_download_choices, - pretrain_choices, - sample_rate_pretrain - ], - outputs=[pretrain_choices], - api_name="download_pretrain_choices" - ) - pretrain_upload.upload( - fn=lambda pretrain_upload: [shutil_move(pretrain.name, configs["pretrained_custom_path"]) for pretrain in pretrain_upload], - inputs=[pretrain_upload], - outputs=[], - api_name="upload_pretrain" - ) \ No newline at end of file diff --git a/main/app/tabs/editing/child/audio_effects.py b/main/app/tabs/editing/child/audio_effects.py deleted file mode 100644 index 370d117cdac347912c6a29a62d1a7587dd8050eb..0000000000000000000000000000000000000000 --- a/main/app/tabs/editing/child/audio_effects.py +++ /dev/null @@ -1,393 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.editing import audio_effects -from main.app.core.presets import audio_effect_load_presets, audio_effect_save_presets 
-from main.app.core.ui import visible, change_audios_choices, change_effect_preset_choices, shutil_move -from main.app.variables import translations, paths_for_files, sample_rate_choice, audio_effect_presets_file, configs, file_types, export_format_choices - -def audio_effects_tab(): - with gr.Row(): - gr.Markdown(translations["audio_effects_edit"]) - with gr.Row(): - with gr.Column(): - with gr.Row(): - reverb_check_box = gr.Checkbox(label=translations["reverb"], value=False, interactive=True) - chorus_check_box = gr.Checkbox(label=translations["chorus"], value=False, interactive=True) - delay_check_box = gr.Checkbox(label=translations["delay"], value=False, interactive=True) - phaser_check_box = gr.Checkbox(label=translations["phaser"], value=False, interactive=True) - compressor_check_box = gr.Checkbox(label=translations["compressor"], value=False, interactive=True) - more_options = gr.Checkbox(label=translations["more_option"], value=False, interactive=True) - with gr.Row(): - with gr.Accordion(translations["input_output"], open=False): - with gr.Row(): - upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types) - with gr.Row(): - audio_in_path = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True) - audio_out_path = gr.Textbox(label=translations["output_audio"], value="audios/audio_effects.wav", placeholder="audios/audio_effects.wav", info=translations["provide_output"], interactive=True) - with gr.Row(): - with gr.Column(): - audio_combination = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True) - audio_combination_input = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True, visible=audio_combination.value) - with gr.Row(): - main_vol = gr.Slider(minimum=-80, maximum=80, 
label=translations["main_volume"], info=translations["main_volume_info"], value=-4, step=1, interactive=True, visible=audio_combination.value) - combine_vol = gr.Slider(minimum=-80, maximum=80, label=translations["combination_volume"], info=translations["combination_volume_info"], value=-7, step=1, interactive=True, visible=audio_combination.value) - with gr.Row(): - audio_effects_refresh = gr.Button(translations["refresh"]) - with gr.Row(): - audio_output_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True) - with gr.Row(): - with gr.Accordion(translations["use_presets"], open=False): - with gr.Row(): - presets_name = gr.Dropdown(label=translations["file_preset"], choices=audio_effect_presets_file, value=audio_effect_presets_file[0] if len(audio_effect_presets_file) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Row(): - load_click = gr.Button(translations["load_file"], variant="primary") - refresh_click = gr.Button(translations["refresh"]) - with gr.Accordion(translations["export_file"], open=False): - with gr.Row(): - with gr.Column(): - name_to_save_file = gr.Textbox(label=translations["filename_to_save"]) - save_file_button = gr.Button(translations["export_file"]) - with gr.Row(): - upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".effect.json"]) - with gr.Row(): - apply_effects_button = gr.Button(translations["apply"], variant="primary", scale=2) - with gr.Row(): - with gr.Column(): - with gr.Row(): - with gr.Accordion(translations["reverb"], open=False, visible=reverb_check_box.value) as reverb_accordion: - reverb_freeze_mode = gr.Checkbox(label=translations["reverb_freeze"], info=translations["reverb_freeze_info"], value=False, interactive=True) - reverb_room_size = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.15, label=translations["room_size"], info=translations["room_size_info"], interactive=True) - 
reverb_damping = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.7, label=translations["damping"], info=translations["damping_info"], interactive=True) - reverb_wet_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label=translations["wet_level"], info=translations["wet_level_info"], interactive=True) - reverb_dry_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.8, label=translations["dry_level"], info=translations["dry_level_info"], interactive=True) - reverb_width = gr.Slider(minimum=0, maximum=1, step=0.01, value=1, label=translations["width"], info=translations["width_info"], interactive=True) - with gr.Row(): - with gr.Accordion(translations["chorus"], open=False, visible=chorus_check_box.value) as chorus_accordion: - chorus_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_depth"], info=translations["chorus_depth_info"], interactive=True) - chorus_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.5, label=translations["chorus_rate_hz"], info=translations["chorus_rate_hz_info"], interactive=True) - chorus_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_mix"], info=translations["chorus_mix_info"], interactive=True) - chorus_centre_delay_ms = gr.Slider(minimum=0, maximum=50, step=1, value=10, label=translations["chorus_centre_delay_ms"], info=translations["chorus_centre_delay_ms_info"], interactive=True) - chorus_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["chorus_feedback"], info=translations["chorus_feedback_info"], interactive=True) - with gr.Row(): - with gr.Accordion(translations["delay"], open=False, visible=delay_check_box.value) as delay_accordion: - delay_second = gr.Slider(minimum=0, maximum=5, step=0.01, value=0.5, label=translations["delay_seconds"], info=translations["delay_seconds_info"], interactive=True) - delay_feedback = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, 
label=translations["delay_feedback"], info=translations["delay_feedback_info"], interactive=True) - delay_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_mix"], info=translations["delay_mix_info"], interactive=True) - with gr.Column(): - with gr.Row(): - with gr.Accordion(translations["more_option"], open=False, visible=more_options.value) as more_accordion: - with gr.Row(): - fade = gr.Checkbox(label=translations["fade"], value=False, interactive=True) - bass_or_treble = gr.Checkbox(label=translations["bass_or_treble"], value=False, interactive=True) - limiter = gr.Checkbox(label=translations["limiter"], value=False, interactive=True) - resample_checkbox = gr.Checkbox(label=translations["resample"], value=False, interactive=True) - with gr.Row(): - distortion_checkbox = gr.Checkbox(label=translations["distortion"], value=False, interactive=True) - gain_checkbox = gr.Checkbox(label=translations["gain"], value=False, interactive=True) - bitcrush_checkbox = gr.Checkbox(label=translations["bitcrush"], value=False, interactive=True) - clipping_checkbox = gr.Checkbox(label=translations["clipping"], value=False, interactive=True) - with gr.Accordion(translations["fade"], open=True, visible=fade.value) as fade_accordion: - with gr.Row(): - fade_in = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_in"], info=translations["fade_in_info"], interactive=True) - fade_out = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_out"], info=translations["fade_out_info"], interactive=True) - with gr.Accordion(translations["bass_or_treble"], open=True, visible=bass_or_treble.value) as bass_treble_accordion: - with gr.Row(): - bass_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["bass_boost"], info=translations["bass_boost_info"], interactive=True) - bass_frequency = gr.Slider(minimum=20, maximum=200, step=10, value=100, label=translations["bass_frequency"], 
info=translations["bass_frequency_info"], interactive=True) - with gr.Row(): - treble_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["treble_boost"], info=translations["treble_boost_info"], interactive=True) - treble_frequency = gr.Slider(minimum=1000, maximum=10000, step=500, value=3000, label=translations["treble_frequency"], info=translations["treble_frequency_info"], interactive=True) - with gr.Accordion(translations["limiter"], open=True, visible=limiter.value) as limiter_accordion: - with gr.Row(): - limiter_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["limiter_threshold_db"], info=translations["limiter_threshold_db_info"], interactive=True) - limiter_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["limiter_release_ms"], info=translations["limiter_release_ms_info"], interactive=True) - with gr.Column(): - pitch_shift_semitones = gr.Slider(minimum=-20, maximum=20, step=1, value=0, label=translations["pitch"], info=translations["pitch_info"], interactive=True) - audio_effect_resample_sr = gr.Radio(choices=[0]+sample_rate_choice, value=0, label=translations["resample"], info=translations["resample_info"], interactive=True, visible=resample_checkbox.value) - distortion_drive_db = gr.Slider(minimum=0, maximum=50, step=1, value=20, label=translations["distortion"], info=translations["distortion_info"], interactive=True, visible=distortion_checkbox.value) - gain_db = gr.Slider(minimum=-60, maximum=60, step=1, value=0, label=translations["gain"], info=translations["gain_info"], interactive=True, visible=gain_checkbox.value) - clipping_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["clipping_threshold_db"], info=translations["clipping_threshold_db_info"], interactive=True, visible=clipping_checkbox.value) - bitcrush_bit_depth = gr.Slider(minimum=1, maximum=24, step=1, value=16, label=translations["bitcrush_bit_depth"], 
info=translations["bitcrush_bit_depth_info"], interactive=True, visible=bitcrush_checkbox.value) - with gr.Row(): - with gr.Accordion(translations["phaser"], open=False, visible=phaser_check_box.value) as phaser_accordion: - phaser_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_depth"], info=translations["phaser_depth_info"], interactive=True) - phaser_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1, label=translations["phaser_rate_hz"], info=translations["phaser_rate_hz_info"], interactive=True) - phaser_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_mix"], info=translations["phaser_mix_info"], interactive=True) - phaser_centre_frequency_hz = gr.Slider(minimum=50, maximum=5000, step=10, value=1000, label=translations["phaser_centre_frequency_hz"], info=translations["phaser_centre_frequency_hz_info"], interactive=True) - phaser_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["phaser_feedback"], info=translations["phaser_feedback_info"], interactive=True) - with gr.Row(): - with gr.Accordion(translations["compressor"], open=False, visible=compressor_check_box.value) as compressor_accordion: - compressor_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-20, label=translations["compressor_threshold_db"], info=translations["compressor_threshold_db_info"], interactive=True) - compressor_ratio = gr.Slider(minimum=1, maximum=20, step=0.1, value=1, label=translations["compressor_ratio"], info=translations["compressor_ratio_info"], interactive=True) - compressor_attack_ms = gr.Slider(minimum=0.1, maximum=100, step=0.1, value=10, label=translations["compressor_attack_ms"], info=translations["compressor_attack_ms_info"], interactive=True) - compressor_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["compressor_release_ms"], info=translations["compressor_release_ms_info"], interactive=True) - 
with gr.Row(): - gr.Markdown(translations["output_audio"]) - with gr.Row(): - audio_play_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - audio_play_output = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"]) - with gr.Row(): - reverb_check_box.change(fn=visible, inputs=[reverb_check_box], outputs=[reverb_accordion]) - chorus_check_box.change(fn=visible, inputs=[chorus_check_box], outputs=[chorus_accordion]) - delay_check_box.change(fn=visible, inputs=[delay_check_box], outputs=[delay_accordion]) - with gr.Row(): - compressor_check_box.change(fn=visible, inputs=[compressor_check_box], outputs=[compressor_accordion]) - phaser_check_box.change(fn=visible, inputs=[phaser_check_box], outputs=[phaser_accordion]) - more_options.change(fn=visible, inputs=[more_options], outputs=[more_accordion]) - with gr.Row(): - fade.change(fn=visible, inputs=[fade], outputs=[fade_accordion]) - bass_or_treble.change(fn=visible, inputs=[bass_or_treble], outputs=[bass_treble_accordion]) - limiter.change(fn=visible, inputs=[limiter], outputs=[limiter_accordion]) - resample_checkbox.change(fn=visible, inputs=[resample_checkbox], outputs=[audio_effect_resample_sr]) - with gr.Row(): - distortion_checkbox.change(fn=visible, inputs=[distortion_checkbox], outputs=[distortion_drive_db]) - gain_checkbox.change(fn=visible, inputs=[gain_checkbox], outputs=[gain_db]) - clipping_checkbox.change(fn=visible, inputs=[clipping_checkbox], outputs=[clipping_threshold_db]) - bitcrush_checkbox.change(fn=visible, inputs=[bitcrush_checkbox], outputs=[bitcrush_bit_depth]) - with gr.Row(): - upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio], outputs=[audio_in_path]) - audio_in_path.change(fn=lambda audio: audio if audio else None, inputs=[audio_in_path], outputs=[audio_play_input]) - audio_effects_refresh.click(fn=lambda a, b: 
[change_audios_choices(a), change_audios_choices(b)], inputs=[audio_in_path, audio_combination_input], outputs=[audio_in_path, audio_combination_input]) - with gr.Row(): - more_options.change(fn=lambda: [False]*8, inputs=[], outputs=[fade, bass_or_treble, limiter, resample_checkbox, distortion_checkbox, gain_checkbox, clipping_checkbox, bitcrush_checkbox]) - audio_combination.change(fn=visible, inputs=[audio_combination], outputs=[audio_combination_input]) - audio_combination.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[audio_combination], outputs=[main_vol, combine_vol]) - with gr.Row(): - upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name]) - refresh_click.click(fn=change_effect_preset_choices, inputs=[], outputs=[presets_name]) - with gr.Row(): - load_click.click( - fn=audio_effect_load_presets, - inputs=[ - presets_name, - resample_checkbox, - audio_effect_resample_sr, - chorus_depth, - chorus_rate_hz, - chorus_mix, - chorus_centre_delay_ms, - chorus_feedback, - distortion_drive_db, - reverb_room_size, - reverb_damping, - reverb_wet_level, - reverb_dry_level, - reverb_width, - reverb_freeze_mode, - pitch_shift_semitones, - delay_second, - delay_feedback, - delay_mix, - compressor_threshold_db, - compressor_ratio, - compressor_attack_ms, - compressor_release_ms, - limiter_threshold_db, - limiter_release_ms, - gain_db, - bitcrush_bit_depth, - clipping_threshold_db, - phaser_rate_hz, - phaser_depth, - phaser_centre_frequency_hz, - phaser_feedback, - phaser_mix, - bass_boost, - bass_frequency, - treble_boost, - treble_frequency, - fade_in, - fade_out, - chorus_check_box, - distortion_checkbox, - reverb_check_box, - delay_check_box, - compressor_check_box, - limiter, - gain_checkbox, - bitcrush_checkbox, - clipping_checkbox, - phaser_check_box, - bass_or_treble, - fade - ], - outputs=[ - resample_checkbox, - 
audio_effect_resample_sr, - chorus_depth, - chorus_rate_hz, - chorus_mix, - chorus_centre_delay_ms, - chorus_feedback, - distortion_drive_db, - reverb_room_size, - reverb_damping, - reverb_wet_level, - reverb_dry_level, - reverb_width, - reverb_freeze_mode, - pitch_shift_semitones, - delay_second, - delay_feedback, - delay_mix, - compressor_threshold_db, - compressor_ratio, - compressor_attack_ms, - compressor_release_ms, - limiter_threshold_db, - limiter_release_ms, - gain_db, - bitcrush_bit_depth, - clipping_threshold_db, - phaser_rate_hz, - phaser_depth, - phaser_centre_frequency_hz, - phaser_feedback, - phaser_mix, - bass_boost, - bass_frequency, - treble_boost, - treble_frequency, - fade_in, - fade_out, - chorus_check_box, - distortion_checkbox, - reverb_check_box, - delay_check_box, - compressor_check_box, - limiter, - gain_checkbox, - bitcrush_checkbox, - clipping_checkbox, - phaser_check_box, - bass_or_treble, - fade - ], - ) - save_file_button.click( - fn=audio_effect_save_presets, - inputs=[ - name_to_save_file, - resample_checkbox, - audio_effect_resample_sr, - chorus_depth, - chorus_rate_hz, - chorus_mix, - chorus_centre_delay_ms, - chorus_feedback, - distortion_drive_db, - reverb_room_size, - reverb_damping, - reverb_wet_level, - reverb_dry_level, - reverb_width, - reverb_freeze_mode, - pitch_shift_semitones, - delay_second, - delay_feedback, - delay_mix, - compressor_threshold_db, - compressor_ratio, - compressor_attack_ms, - compressor_release_ms, - limiter_threshold_db, - limiter_release_ms, - gain_db, - bitcrush_bit_depth, - clipping_threshold_db, - phaser_rate_hz, - phaser_depth, - phaser_centre_frequency_hz, - phaser_feedback, - phaser_mix, - bass_boost, - bass_frequency, - treble_boost, - treble_frequency, - fade_in, - fade_out, - chorus_check_box, - distortion_checkbox, - reverb_check_box, - delay_check_box, - compressor_check_box, - limiter, - gain_checkbox, - bitcrush_checkbox, - clipping_checkbox, - phaser_check_box, - bass_or_treble, - fade 
- ], - outputs=[presets_name] - ) - with gr.Row(): - apply_effects_button.click( - fn=audio_effects, - inputs=[ - audio_in_path, - audio_out_path, - resample_checkbox, - audio_effect_resample_sr, - chorus_depth, - chorus_rate_hz, - chorus_mix, - chorus_centre_delay_ms, - chorus_feedback, - distortion_drive_db, - reverb_room_size, - reverb_damping, - reverb_wet_level, - reverb_dry_level, - reverb_width, - reverb_freeze_mode, - pitch_shift_semitones, - delay_second, - delay_feedback, - delay_mix, - compressor_threshold_db, - compressor_ratio, - compressor_attack_ms, - compressor_release_ms, - limiter_threshold_db, - limiter_release_ms, - gain_db, - bitcrush_bit_depth, - clipping_threshold_db, - phaser_rate_hz, - phaser_depth, - phaser_centre_frequency_hz, - phaser_feedback, - phaser_mix, - bass_boost, - bass_frequency, - treble_boost, - treble_frequency, - fade_in, - fade_out, - audio_output_format, - chorus_check_box, - distortion_checkbox, - reverb_check_box, - delay_check_box, - compressor_check_box, - limiter, - gain_checkbox, - bitcrush_checkbox, - clipping_checkbox, - phaser_check_box, - bass_or_treble, - fade, - audio_combination, - audio_combination_input, - main_vol, - combine_vol - ], - outputs=[audio_play_output], - api_name="audio_effects" - ) \ No newline at end of file diff --git a/main/app/tabs/editing/child/quirk.py b/main/app/tabs/editing/child/quirk.py deleted file mode 100644 index f723b1cf61dd7a05d2e630a47ed7130671660bd9..0000000000000000000000000000000000000000 --- a/main/app/tabs/editing/child/quirk.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.editing import apply_voice_quirk -from main.app.core.ui import change_audios_choices, shutil_move -from main.app.variables import translations, paths_for_files, configs, file_types, export_format_choices - -def quirk_tab(): - with gr.Row(): - gr.Markdown(translations["quirk_markdown"]) - with gr.Row(): - 
input_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Row(): - quirk_choice = gr.Radio(label=translations["quirk_label"], info=translations["quirk_label_info"], choices=list(translations["quirk_choice"].keys()), interactive=True, value=list(translations["quirk_choice"].keys())[0]) - with gr.Row(): - apply_quirk_button = gr.Button(translations["apply"], variant="primary") - with gr.Row(): - with gr.Accordion(translations["input_output"], open=False): - with gr.Row(): - quirk_upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types) - with gr.Column(): - quirk_export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True) - quirk_input_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) - quirk_output_path = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True) - with gr.Column(): - quirk_refresh = gr.Button(translations["refresh"]) - with gr.Row(): - output_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"]) - with gr.Row(): - quirk_upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[quirk_upload_audio], outputs=[quirk_input_path]) - quirk_input_path.change(fn=lambda audio: audio if audio else None, inputs=[quirk_input_path], outputs=[input_audio_play]) - quirk_refresh.click(fn=change_audios_choices, inputs=[quirk_input_path], outputs=[quirk_input_path]) - with gr.Row(): - apply_quirk_button.click( - fn=apply_voice_quirk, - inputs=[ - quirk_input_path, - quirk_choice, - quirk_output_path, - quirk_export_format - ], - outputs=[output_audio_play], 
- api_name="quirk" - ) \ No newline at end of file diff --git a/main/app/tabs/editing/editing.py b/main/app/tabs/editing/editing.py deleted file mode 100644 index 10964204b1e39de7c2d239fdfe959eb6900f6ae9..0000000000000000000000000000000000000000 --- a/main/app/tabs/editing/editing.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.variables import configs, translations -from main.app.tabs.editing.child.quirk import quirk_tab -from main.app.tabs.editing.child.audio_effects import audio_effects_tab - -def editing_tab(): - with gr.TabItem(translations["editing"], visible=configs.get("editing_tab", True)): - with gr.TabItem(translations["audio_effects"], visible=configs.get("effects_tab", True)): - gr.Markdown(translations["apply_audio_effects"]) - audio_effects_tab() - - with gr.TabItem(translations["quirk"], visible=configs.get("quirk", True)): - gr.Markdown(translations["quirk_info"]) - quirk_tab() \ No newline at end of file diff --git a/main/app/tabs/extra/child/convert_model.py b/main/app/tabs/extra/child/convert_model.py deleted file mode 100644 index 410ffd88ab46a829484266b7fd2bf6d6e18743f6..0000000000000000000000000000000000000000 --- a/main/app/tabs/extra/child/convert_model.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.ui import visible, shutil_move -from main.app.core.model_utils import onnx_export -from main.app.variables import translations, configs - -def convert_model_tab(): - with gr.Row(): - gr.Markdown(translations["pytorch2onnx_markdown"]) - with gr.Row(): - model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"]) - with gr.Row(): - convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2) - with gr.Row(): - model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", 
info=translations["model_path_info"], interactive=True) - with gr.Row(): - output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False) - with gr.Row(): - model_pth_upload.upload(fn=lambda model_pth_upload: shutil_move(model_pth_upload.name, configs["weights_path"]), inputs=[model_pth_upload], outputs=[model_pth_path]) - convert_onnx.click( - fn=onnx_export, - inputs=[model_pth_path], - outputs=[output_model2], - api_name="model_onnx_export" - ) - convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2]) \ No newline at end of file diff --git a/main/app/tabs/extra/child/create_srt.py b/main/app/tabs/extra/child/create_srt.py deleted file mode 100644 index 1fd8c70cc41cc01bdeb7ee9aae5e3c4e54538a71..0000000000000000000000000000000000000000 --- a/main/app/tabs/extra/child/create_srt.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.csrt import create_srt -from main.app.core.ui import shutil_move, change_audios_choices -from main.app.variables import translations, file_types, configs, paths_for_files - -def create_srt_tab(): - with gr.Row(): - gr.Markdown(translations["create_srt_markdown_2"]) - with gr.Row(): - with gr.Column(): - srt_content = gr.Textbox(label=translations["srt_content"], value="", lines=9, max_lines=9, interactive=False) - with gr.Column(): - word_timestamps = gr.Checkbox(label=translations["word_timestamps"], info=translations["word_timestamps_info"], value=False, interactive=True) - model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True) - with gr.Row(): - convert_button = gr.Button(translations["convert_audio"], variant="primary") - with gr.Row(): - with 
gr.Accordion(translations["input_output"], open=False): - with gr.Column(): - input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) - output_file = gr.Textbox(label=translations["srt_output_file"], value="srt/output.srt", placeholder="srt/output.srt", interactive=True) - with gr.Column(): - refresh = gr.Button(translations["refresh"]) - with gr.Row(): - input_file = gr.Files(label=translations["drop_audio"], file_types=file_types) - with gr.Row(): - play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Row(): - output_srt = gr.File(label=translations["srt_output_file"], file_types=[".srt"], interactive=False, visible=False) - with gr.Row(): - input_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input_file], outputs=[input_audio]) - input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[play_audio]) - refresh.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio]) - with gr.Row(): - convert_button.click( - fn=create_srt, - inputs=[ - model_size, - input_audio, - output_file, - word_timestamps - ], - outputs=[ - output_srt, - srt_content - ], - api_name="create_srt" - ) - - diff --git a/main/app/tabs/extra/child/f0_extract.py b/main/app/tabs/extra/child/f0_extract.py deleted file mode 100644 index 3062ee728faa655a89d0866a4a1ed02a0a6547bc..0000000000000000000000000000000000000000 --- a/main/app/tabs/extra/child/f0_extract.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.f0_extract import f0_extract -from main.app.core.ui import change_audios_choices, unlock_f0, shutil_move -from main.app.variables import translations, paths_for_files, method_f0, configs, file_types - 
-def f0_extract_tab(): - with gr.Row(): - gr.Markdown(translations["f0_extractor_markdown_2"]) - with gr.Row(): - extractor_button = gr.Button(translations["extract_button"].replace("2. ", ""), variant="primary") - with gr.Row(): - with gr.Column(): - upload_audio_file = gr.Files(label=translations["drop_audio"], file_types=file_types) - audioplay = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Column(): - with gr.Accordion(translations["f0_method"], open=False): - with gr.Group(): - with gr.Row(): - onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True) - f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True) - with gr.Accordion(translations["audio_path"], open=True): - input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True) - refresh_audio_button = gr.Button(translations["refresh"]) - with gr.Row(): - gr.Markdown("___") - with gr.Row(): - file_output = gr.File(label="", file_types=[".txt"], interactive=False) - image_output = gr.Image(label="", interactive=False, show_download_button=True) - with gr.Row(): - upload_audio_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio_file], outputs=[input_audio_path]) - input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay]) - refresh_audio_button.click(fn=change_audios_choices, inputs=[input_audio_path], outputs=[input_audio_path]) - with gr.Row(): - unlock_full_method.change(fn=lambda method: 
{"choices": [m for m in unlock_f0(method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"}, inputs=[unlock_full_method], outputs=[f0_method_extract]) - extractor_button.click( - fn=f0_extract, - inputs=[ - input_audio_path, - f0_method_extract, - onnx_f0_mode3 - ], - outputs=[file_output, image_output], - api_name="f0_extract" - ) \ No newline at end of file diff --git a/main/app/tabs/extra/child/fushion.py b/main/app/tabs/extra/child/fushion.py deleted file mode 100644 index 0064ef81ec702236ded2833a65d1d394d552e312..0000000000000000000000000000000000000000 --- a/main/app/tabs/extra/child/fushion.py +++ /dev/null @@ -1,45 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.ui import visible, shutil_move -from main.app.core.model_utils import fushion_model -from main.app.variables import translations, configs - -def fushion_tab(): - with gr.Row(): - gr.Markdown(translations["fushion_markdown_2"]) - with gr.Row(): - name_to_save = gr.Textbox(label=translations["modelname"], placeholder="Model.pth", value="", max_lines=1, interactive=True) - with gr.Row(): - fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4) - with gr.Column(): - with gr.Row(): - model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"]) - model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"]) - with gr.Row(): - model_path_a = gr.Textbox(label=f"{translations['model_path']} 1", value="", placeholder="assets/weights/Model_1.pth") - model_path_b = gr.Textbox(label=f"{translations['model_path']} 2", value="", placeholder="assets/weights/Model_2.pth") - with gr.Row(): - ratio = gr.Slider(minimum=0, maximum=1, label=translations["model_ratio"], info=translations["model_ratio_info"], value=0.5, interactive=True) - with gr.Row(): - output_model = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], 
interactive=False, visible=False) - with gr.Row(): - model_a.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_a], outputs=[model_path_a]) - model_b.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_b], outputs=[model_path_b]) - with gr.Row(): - fushion_button.click( - fn=fushion_model, - inputs=[ - name_to_save, - model_path_a, - model_path_b, - ratio - ], - outputs=[name_to_save, output_model], - api_name="fushion_model" - ) - fushion_button.click(fn=lambda: visible(True), inputs=[], outputs=[output_model]) \ No newline at end of file diff --git a/main/app/tabs/extra/child/read_model.py b/main/app/tabs/extra/child/read_model.py deleted file mode 100644 index 4ca25625fd48dbff9e64bbb388851fc35883a450..0000000000000000000000000000000000000000 --- a/main/app/tabs/extra/child/read_model.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.ui import shutil_move -from main.app.core.model_utils import model_info -from main.app.variables import translations, configs - -def read_model_tab(): - with gr.Row(): - gr.Markdown(translations["read_model_markdown_2"]) - with gr.Row(): - model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"]) - with gr.Row(): - read_button = gr.Button(translations["readmodel"], variant="primary", scale=2) - with gr.Column(): - model_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True) - output_info = gr.Textbox(label=translations["modelinfo"], value="", interactive=False, scale=6) - with gr.Row(): - model.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model], outputs=[model_path]) - read_button.click( - fn=model_info, - inputs=[model_path], - outputs=[output_info], - api_name="read_model" - ) \ No newline at end of file diff --git 
a/main/app/tabs/extra/child/settings.py b/main/app/tabs/extra/child/settings.py deleted file mode 100644 index fd839e3bdb93314a7e89b861712980e1637df767..0000000000000000000000000000000000000000 --- a/main/app/tabs/extra/child/settings.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.ui import change_fp -from main.app.core.utils import stop_pid -from main.app.core.restart import change_font, change_language, change_theme -from main.app.variables import translations, theme, font, configs, language, config - -def settings_tab(app): - with gr.Row(): - gr.Markdown(translations["settings_markdown_2"]) - with gr.Row(): - toggle_button = gr.Button(translations["change_light_dark"], variant="secondary", scale=2) - with gr.Row(): - with gr.Column(): - language_dropdown = gr.Dropdown(label=translations["lang"], interactive=True, info=translations["lang_restart"], choices=configs.get("support_language", "vi-VN"), value=language) - change_lang = gr.Button(translations["change_lang"], variant="primary", scale=2) - with gr.Column(): - theme_dropdown = gr.Dropdown(label=translations["theme"], interactive=True, info=translations["theme_restart"], choices=configs.get("themes", theme), value=theme, allow_custom_value=True) - changetheme = gr.Button(translations["theme_button"], variant="primary", scale=2) - with gr.Row(): - with gr.Column(): - fp_choice = gr.Radio(choices=["fp16","fp32"], value="fp16" if configs.get("fp16", False) else "fp32", label=translations["precision"], info=translations["precision_info"], interactive=config.device not in ["cpu", "mps", "ocl:0"]) - fp_button = gr.Button(translations["update_precision"], variant="secondary", scale=2) - with gr.Column(): - font_choice = gr.Textbox(label=translations["font"], info=translations["font_info"], value=font, interactive=True) - font_button = gr.Button(translations["change_font"]) - with gr.Row(): - with gr.Column(): - with 
gr.Accordion(translations["stop"], open=False, visible=True): - separate_stop = gr.Button(translations["stop_separate"]) - convert_stop = gr.Button(translations["stop_convert"]) - create_dataset_stop = gr.Button(translations["stop_create_dataset"]) - with gr.Accordion(translations["stop_training"], open=False): - model_name_stop = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True) - preprocess_stop = gr.Button(translations["stop_preprocess"]) - extract_stop = gr.Button(translations["stop_extract"]) - train_stop = gr.Button(translations["stop_training"]) - with gr.Row(): - toggle_button.click(fn=None, js="() => {document.body.classList.toggle('dark')}") - fp_button.click(fn=change_fp, inputs=[fp_choice], outputs=[fp_choice]) - with gr.Row(): - change_lang.click(fn=lambda a: change_language(a, app), inputs=[language_dropdown], outputs=[]) - changetheme.click(fn=lambda a: change_theme(a, app) , inputs=[theme_dropdown], outputs=[]) - font_button.click(fn=lambda a: change_font(a, app), inputs=[font_choice], outputs=[]) - with gr.Row(): - change_lang.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[]) - changetheme.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[]) - font_button.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[]) - with gr.Row(): - separate_stop.click(fn=lambda: stop_pid("separate_pid", None, False), inputs=[], outputs=[]) - convert_stop.click(fn=lambda: stop_pid("convert_pid", None, False), inputs=[], outputs=[]) - create_dataset_stop.click(fn=lambda: stop_pid("create_dataset_pid", None, False), inputs=[], outputs=[]) - with gr.Row(): - preprocess_stop.click(fn=lambda model_name_stop: stop_pid("preprocess_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[]) - extract_stop.click(fn=lambda model_name_stop: 
stop_pid("extract_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[]) - train_stop.click(fn=lambda model_name_stop: stop_pid("train_pid", model_name_stop, True), inputs=[model_name_stop], outputs=[]) \ No newline at end of file diff --git a/main/app/tabs/extra/extra.py b/main/app/tabs/extra/extra.py deleted file mode 100644 index f2938e7341fc5187eb5a8c9af54a4320e5725e04..0000000000000000000000000000000000000000 --- a/main/app/tabs/extra/extra.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.variables import translations, configs -from main.app.tabs.extra.child.fushion import fushion_tab -from main.app.tabs.extra.child.settings import settings_tab -from main.app.tabs.extra.child.read_model import read_model_tab -from main.app.tabs.extra.child.f0_extract import f0_extract_tab -from main.app.tabs.extra.child.create_srt import create_srt_tab -from main.app.tabs.extra.child.convert_model import convert_model_tab - -def extra_tab(app): - with gr.TabItem(translations["extra"], visible=configs.get("extra_tab", True)): - with gr.TabItem(translations["fushion"], visible=configs.get("fushion_tab", True)): - gr.Markdown(translations["fushion_markdown"]) - fushion_tab() - - with gr.TabItem(translations["read_model"], visible=configs.get("read_tab", True)): - gr.Markdown(translations["read_model_markdown"]) - read_model_tab() - - with gr.TabItem(translations["convert_model"], visible=configs.get("onnx_tab", True)): - gr.Markdown(translations["pytorch2onnx"]) - convert_model_tab() - - with gr.TabItem(translations["f0_extractor_tab"], visible=configs.get("f0_extractor_tab", True)): - gr.Markdown(translations["f0_extractor_markdown"]) - f0_extract_tab() - - with gr.TabItem(translations["create_srt_tab"], visible=configs.get("create_srt_tab", True)): - gr.Markdown(translations["create_srt_markdown"]) - create_srt_tab() - - with gr.TabItem(translations["settings"], 
visible=configs.get("settings_tab", True)): - gr.Markdown(translations["settings_markdown"]) - settings_tab(app) \ No newline at end of file diff --git a/main/app/tabs/inference/child/convert.py b/main/app/tabs/inference/child/convert.py deleted file mode 100644 index 2d09f3f4ed1817a8fa17d1d9875de32d2431cafc..0000000000000000000000000000000000000000 --- a/main/app/tabs/inference/child/convert.py +++ /dev/null @@ -1,328 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.presets import load_presets, save_presets -from main.app.core.inference import convert_audio, convert_selection -from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, presets_file, configs, file_types, export_format_choices, hybrid_f0_method -from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, change_f0_choices, unlock_f0, change_preset_choices, change_backing_choices, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, shutil_move - -def convert_tab(): - with gr.Row(): - gr.Markdown(translations["convert_info"]) - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - cleaner0 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True) - autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True) - use_audio = gr.Checkbox(label=translations["use_audio"], value=False, interactive=True) - checkpointing = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True) - with gr.Row(): - use_original = gr.Checkbox(label=translations["convert_original"], value=False, interactive=True, visible=use_audio.value) - convert_backing = gr.Checkbox(label=translations["convert_backing"], value=False, interactive=True, visible=use_audio.value) - not_merge_backing = 
gr.Checkbox(label=translations["not_merge_backing"], value=False, interactive=True, visible=use_audio.value) - merge_instrument = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True, visible=use_audio.value) - with gr.Row(): - pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True) - clean_strength0 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner0.value) - with gr.Row(): - with gr.Column(): - audio_select = gr.Dropdown(label=translations["select_separate"], choices=[], value="", interactive=True, allow_custom_value=True, visible=False) - convert_button_2 = gr.Button(translations["convert_audio"], visible=False) - with gr.Row(): - with gr.Column(): - convert_button = gr.Button(translations["convert_audio"], variant="primary") - with gr.Row(): - with gr.Column(): - input0 = gr.Files(label=translations["drop_audio"], file_types=file_types) - play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Column(): - with gr.Accordion(translations["model_accordion"], open=True): - with gr.Row(): - model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True) - model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True) - with gr.Row(): - refresh = gr.Button(translations["refresh"]) - with gr.Row(): - index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "") - with gr.Accordion(translations["input_output"], open=False): - 
with gr.Column(): - export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True) - input_audio0 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) - output_audio = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True) - with gr.Column(): - refresh0 = gr.Button(translations["refresh"]) - with gr.Accordion(translations["setting"], open=False): - with gr.Accordion(translations["f0_method"], open=False): - with gr.Group(): - with gr.Row(): - onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True) - method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True) - hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method.value == "hybrid") - hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False) - alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False) - with gr.Accordion(translations["f0_file"], open=False): - upload_f0_file = gr.File(label=translations["upload_f0"], file_types=[".txt"]) - f0_file_dropdown = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, 
allow_custom_value=True, interactive=True) - refresh_f0_file = gr.Button(translations["refresh"]) - with gr.Accordion(translations["hubert_model"], open=False): - embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True) - embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True) - custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom") - with gr.Accordion(translations["use_presets"], open=False): - with gr.Row(): - presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Row(): - load_click = gr.Button(translations["load_file"], variant="primary") - refresh_click = gr.Button(translations["refresh"]) - with gr.Accordion(translations["export_file"], open=False): - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True) - autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True) - pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True) - index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True) - resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True) - filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True) - rms_mix_rate_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True) - protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, 
interactive=True) - split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True) - formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True) - with gr.Row(): - with gr.Column(): - name_to_save_file = gr.Textbox(label=translations["filename_to_save"]) - save_file_button = gr.Button(translations["export_file"]) - with gr.Row(): - upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".conversion.json"]) - with gr.Column(): - with gr.Group(): - with gr.Row(): - split_audio = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True) - formant_shifting = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True) - with gr.Row(): - proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True) - audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True) - resample_sr = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True) - proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value) - f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value) - filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True) - rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True) - protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], 
info=translations["protect_info"], value=0.5, step=0.01, interactive=True) - with gr.Row(): - formant_qfrency = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False) - formant_timbre = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False) - with gr.Row(): - gr.Markdown(translations["output_convert"]) - with gr.Row(): - main_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["main_convert"]) - backing_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_backing"], visible=convert_backing.value) - main_backing = gr.Audio(show_download_button=True, interactive=False, label=translations["main_or_backing"], visible=convert_backing.value) - with gr.Row(): - original_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_original"], visible=use_original.value) - vocal_instrument = gr.Audio(show_download_button=True, interactive=False, label=translations["voice_or_instruments"], visible=merge_instrument.value) - with gr.Row(): - upload_f0_file.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file], outputs=[f0_file_dropdown]) - refresh_f0_file.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown]) - unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[method]) - with gr.Row(): - load_click.click( - fn=load_presets, - inputs=[ - presets_name, - cleaner0, - autotune, - pitch, - clean_strength0, - index_strength, - resample_sr, - filter_radius, - rms_mix_rate, - protect, - split_audio, - f0_autotune_strength, - formant_shifting, - formant_qfrency, - formant_timbre, - proposal_pitch, - proposal_pitch_threshold - ], - outputs=[ - cleaner0, - autotune, - pitch, - 
clean_strength0, - index_strength, - resample_sr, - filter_radius, - rms_mix_rate, - protect, - split_audio, - f0_autotune_strength, - formant_shifting, - formant_qfrency, - formant_timbre, - proposal_pitch, - proposal_pitch_threshold - ] - ) - refresh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name]) - save_file_button.click( - fn=save_presets, - inputs=[ - name_to_save_file, - cleaner0, - autotune, - pitch, - clean_strength0, - index_strength, - resample_sr, - filter_radius, - rms_mix_rate, - protect, - split_audio, - f0_autotune_strength, - cleaner_chbox, - autotune_chbox, - pitch_chbox, - index_strength_chbox, - resample_sr_chbox, - filter_radius_chbox, - rms_mix_rate_chbox, - protect_chbox, - split_audio_chbox, - formant_shifting_chbox, - formant_shifting, - formant_qfrency, - formant_timbre, - proposal_pitch, - proposal_pitch_threshold - ], - outputs=[presets_name] - ) - with gr.Row(): - upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name]) - autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength]) - use_audio.change(fn=lambda a: [visible(a), visible(a), visible(a), visible(a), visible(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), visible(not a), visible(not a), visible(not a), visible(not a)], inputs=[use_audio], outputs=[main_backing, use_original, convert_backing, not_merge_backing, merge_instrument, use_original, convert_backing, not_merge_backing, merge_instrument, input_audio0, output_audio, input0, play_audio]) - with gr.Row(): - convert_backing.change(fn=lambda a,b: [change_backing_choices(a, b), visible(a)], inputs=[convert_backing, not_merge_backing], outputs=[use_original, backing_convert]) - use_original.change(fn=lambda audio, original: [visible(original), visible(not original), visible(audio and not original), 
valueFalse_interactive(not original), valueFalse_interactive(not original)], inputs=[use_audio, use_original], outputs=[original_convert, main_convert, main_backing, convert_backing, not_merge_backing]) - cleaner0.change(fn=visible, inputs=[cleaner0], outputs=[clean_strength0]) - with gr.Row(): - merge_instrument.change(fn=visible, inputs=[merge_instrument], outputs=[vocal_instrument]) - not_merge_backing.change(fn=lambda audio, merge, cvb: [visible(audio and not merge), change_backing_choices(cvb, merge)], inputs=[use_audio, not_merge_backing, convert_backing], outputs=[main_backing, use_original]) - method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method, hybrid_method], outputs=[hybrid_method, alpha, hop_length]) - with gr.Row(): - hybrid_method.change(fn=hoplength_show, inputs=[method, hybrid_method], outputs=[hop_length]) - refresh.click(fn=change_models_choices, inputs=[], outputs=[model_pth, model_index]) - model_pth.change(fn=get_index, inputs=[model_pth], outputs=[model_index]) - with gr.Row(): - input0.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input0], outputs=[input_audio0]) - input_audio0.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio0], outputs=[play_audio]) - formant_shifting.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[formant_shifting], outputs=[formant_qfrency, formant_timbre]) - with gr.Row(): - embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[custom_embedders]) - refresh0.click(fn=change_audios_choices, inputs=[input_audio0], outputs=[input_audio0]) - model_index.change(fn=index_strength_show, inputs=[model_index], outputs=[index_strength]) - with gr.Row(): - convert_button.click(fn=lambda: visible(False), inputs=[], outputs=[convert_button]) - convert_button_2.click(fn=lambda: 
[visible(False), visible(False)], inputs=[], outputs=[audio_select, convert_button_2]) - with gr.Row(): - proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold]) - embed_mode.change(fn=change_embedders_mode, inputs=[embed_mode], outputs=[embedders]) - with gr.Row(): - convert_button.click( - fn=convert_selection, - inputs=[ - cleaner0, - autotune, - use_audio, - use_original, - convert_backing, - not_merge_backing, - merge_instrument, - pitch, - clean_strength0, - model_pth, - model_index, - index_strength, - input_audio0, - output_audio, - export_format, - method, - hybrid_method, - hop_length, - embedders, - custom_embedders, - resample_sr, - filter_radius, - rms_mix_rate, - protect, - split_audio, - f0_autotune_strength, - checkpointing, - onnx_f0_mode, - formant_shifting, - formant_qfrency, - formant_timbre, - f0_file_dropdown, - embed_mode, - proposal_pitch, - proposal_pitch_threshold, - audio_processing, - alpha - ], - outputs=[audio_select, main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button, convert_button_2], - api_name="convert_selection" - ) - convert_button_2.click( - fn=convert_audio, - inputs=[ - cleaner0, - autotune, - use_audio, - use_original, - convert_backing, - not_merge_backing, - merge_instrument, - pitch, - clean_strength0, - model_pth, - model_index, - index_strength, - input_audio0, - output_audio, - export_format, - method, - hybrid_method, - hop_length, - embedders, - custom_embedders, - resample_sr, - filter_radius, - rms_mix_rate, - protect, - split_audio, - f0_autotune_strength, - audio_select, - checkpointing, - onnx_f0_mode, - formant_shifting, - formant_qfrency, - formant_timbre, - f0_file_dropdown, - embed_mode, - proposal_pitch, - proposal_pitch_threshold, - audio_processing, - alpha - ], - outputs=[main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button], - api_name="convert_audio" - ) \ No newline at end of 
file diff --git a/main/app/tabs/inference/child/convert_tts.py b/main/app/tabs/inference/child/convert_tts.py deleted file mode 100644 index 48de23f74d25053f0afcb179650267d1b21e9f6f..0000000000000000000000000000000000000000 --- a/main/app/tabs/inference/child/convert_tts.py +++ /dev/null @@ -1,280 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.tts import TTS -from main.app.core.process import process_input -from main.app.core.inference import convert_tts -from main.app.core.utils import google_translate -from main.app.core.presets import save_presets, load_presets -from main.app.core.ui import visible, change_f0_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, change_tts_voice_choices, shutil_move, change_preset_choices -from main.app.variables import translations, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, edgetts, google_tts_voice, configs, presets_file, export_format_choices, hybrid_f0_method - -def convert_tts_tab(): - with gr.Row(): - gr.Markdown(translations["convert_text_markdown_2"]) - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - use_txt = gr.Checkbox(label=translations["input_txt"], value=False, interactive=True) - google_tts_check_box = gr.Checkbox(label=translations["googletts"], value=False, interactive=True) - prompt = gr.Textbox(label=translations["text_to_speech"], value="", placeholder="Hello Words", lines=3) - with gr.Column(): - speed = gr.Slider(label=translations["voice_speed"], info=translations["voice_speed_info"], minimum=-100, maximum=100, value=0, step=1) - pitch0 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True) - with gr.Row(): - tts_button = gr.Button(translations["tts_1"], variant="primary", scale=2) - convert_button0 = 
gr.Button(translations["tts_2"], variant="secondary", scale=2) - with gr.Row(): - with gr.Column(): - txt_input = gr.File(label=translations["drop_text"], file_types=[".txt", ".srt", ".docx"], visible=use_txt.value) - tts_voice = gr.Dropdown(label=translations["voice"], choices=edgetts, interactive=True, value="vi-VN-NamMinhNeural") - tts_pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info_2"], label=translations["pitch"], value=0, interactive=True) - with gr.Accordion(translations["translate"], open=False): - with gr.Row(): - source_lang = gr.Dropdown(label=translations["source_lang"], choices=["auto"]+google_tts_voice, interactive=True, value="auto") - target_lang = gr.Dropdown(label=translations["target_lang"], choices=google_tts_voice, interactive=True, value="en") - translate_button = gr.Button(translations["translate"]) - with gr.Column(): - with gr.Accordion(translations["model_accordion"], open=True): - with gr.Row(): - model_pth0 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True) - model_index0 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True) - with gr.Row(): - refresh1 = gr.Button(translations["refresh"]) - with gr.Row(): - index_strength0 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index0.value != "") - with gr.Accordion(translations["output_path"], open=False): - export_format0 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True) - output_audio0 = gr.Textbox(label=translations["output_tts"], value="audios/tts.wav", placeholder="audios/tts.wav", info=translations["tts_output"], 
interactive=True) - output_audio1 = gr.Textbox(label=translations["output_tts_convert"], value="audios/tts-convert.wav", placeholder="audios/tts-convert.wav", info=translations["tts_output"], interactive=True) - with gr.Accordion(translations["setting"], open=False): - with gr.Accordion(translations["f0_method"], open=False): - with gr.Group(): - with gr.Row(): - onnx_f0_mode1 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - unlock_full_method3 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True) - method0 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True) - hybrid_method0 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method0.value == "hybrid") - hop_length0 = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False) - alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False) - with gr.Accordion(translations["f0_file"], open=False): - upload_f0_file0 = gr.File(label=translations["upload_f0"], file_types=[".txt"]) - f0_file_dropdown0 = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True) - refresh_f0_file0 = gr.Button(translations["refresh"]) - with gr.Accordion(translations["hubert_model"], open=False): - embed_mode1 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True) - embedders0 = gr.Radio(label=translations["hubert_model"], 
info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True) - custom_embedders0 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders0.value == "custom") - with gr.Accordion(translations["use_presets"], open=False): - with gr.Row(): - presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Row(): - load_click = gr.Button(translations["load_file"], variant="primary") - refresh_click = gr.Button(translations["refresh"]) - with gr.Accordion(translations["export_file"], open=False): - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True) - autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True) - pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True) - index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True) - resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True) - filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True) - rms_mix_rate_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True) - protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True) - split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True) - formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True) - with gr.Row(): - with gr.Column(): - name_to_save_file = gr.Textbox(label=translations["filename_to_save"]) - save_file_button = gr.Button(translations["export_file"]) - 
with gr.Row(): - upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".conversion.json"]) - with gr.Group(): - with gr.Row(): - audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True) - with gr.Row(): - formant_shifting1 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True) - split_audio0 = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True) - cleaner1 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True) - with gr.Row(): - autotune3 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True) - checkpointing0 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True) - proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True) - with gr.Column(): - resample_sr0 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True) - proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value) - f0_autotune_strength0 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune3.value) - clean_strength1 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner1.value) - filter_radius0 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True) - rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, 
interactive=True) - protect0 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True) - with gr.Row(): - formant_qfrency1 = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False) - formant_timbre1 = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False) - with gr.Row(): - gr.Markdown(translations["output_tts_markdown"]) - with gr.Row(): - tts_voice_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["output_text_to_speech"]) - tts_voice_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"]) - with gr.Row(): - load_click.click( - fn=load_presets, - inputs=[ - presets_name, - cleaner1, - autotune3, - pitch0, - clean_strength1, - index_strength0, - resample_sr0, - filter_radius0, - rms_mix_rate0, - protect0, - split_audio0, - f0_autotune_strength0, - formant_shifting1, - formant_qfrency1, - formant_timbre1, - proposal_pitch, - proposal_pitch_threshold - ], - outputs=[ - cleaner1, - autotune3, - pitch0, - clean_strength1, - index_strength0, - resample_sr0, - filter_radius0, - rms_mix_rate0, - protect0, - split_audio0, - f0_autotune_strength0, - formant_shifting1, - formant_qfrency1, - formant_timbre1, - proposal_pitch, - proposal_pitch_threshold - ] - ) - refresh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name]) - save_file_button.click( - fn=save_presets, - inputs=[ - name_to_save_file, - cleaner1, - autotune3, - pitch0, - clean_strength1, - index_strength0, - resample_sr0, - filter_radius0, - rms_mix_rate0, - protect0, - split_audio0, - f0_autotune_strength0, - cleaner_chbox, - autotune_chbox, - pitch_chbox, - index_strength_chbox, - resample_sr_chbox, - 
filter_radius_chbox, - rms_mix_rate_chbox, - protect_chbox, - split_audio_chbox, - formant_shifting_chbox, - formant_shifting1, - formant_qfrency1, - formant_timbre1, - proposal_pitch, - proposal_pitch_threshold - ], - outputs=[presets_name] - ) - with gr.Row(): - proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold]) - upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name]) - translate_button.click(fn=google_translate, inputs=[prompt, source_lang, target_lang], outputs=[prompt], api_name="google_translate") - with gr.Row(): - unlock_full_method3.change(fn=unlock_f0, inputs=[unlock_full_method3], outputs=[method0]) - upload_f0_file0.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file0], outputs=[f0_file_dropdown0]) - refresh_f0_file0.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown0]) - with gr.Row(): - embed_mode1.change(fn=change_embedders_mode, inputs=[embed_mode1], outputs=[embedders0]) - autotune3.change(fn=visible, inputs=[autotune3], outputs=[f0_autotune_strength0]) - model_pth0.change(fn=get_index, inputs=[model_pth0], outputs=[model_index0]) - with gr.Row(): - cleaner1.change(fn=visible, inputs=[cleaner1], outputs=[clean_strength1]) - method0.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method0, hybrid_method0], outputs=[hybrid_method0, alpha, hop_length0]) - hybrid_method0.change(fn=hoplength_show, inputs=[method0, hybrid_method0], outputs=[hop_length0]) - with gr.Row(): - refresh1.click(fn=change_models_choices, inputs=[], outputs=[model_pth0, model_index0]) - embedders0.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders0], outputs=[custom_embedders0]) - formant_shifting1.change(fn=lambda a: [visible(a) for _ in range(2)], 
inputs=[formant_shifting1], outputs=[formant_qfrency1, formant_timbre1]) - with gr.Row(): - model_index0.change(fn=index_strength_show, inputs=[model_index0], outputs=[index_strength0]) - txt_input.upload(fn=process_input, inputs=[txt_input], outputs=[prompt]) - use_txt.change(fn=visible, inputs=[use_txt], outputs=[txt_input]) - with gr.Row(): - google_tts_check_box.change(fn=change_tts_voice_choices, inputs=[google_tts_check_box], outputs=[tts_voice]) - tts_button.click( - fn=TTS, - inputs=[ - prompt, - tts_voice, - speed, - output_audio0, - tts_pitch, - google_tts_check_box, - txt_input - ], - outputs=[tts_voice_audio], - api_name="text-to-speech" - ) - convert_button0.click( - fn=convert_tts, - inputs=[ - cleaner1, - autotune3, - pitch0, - clean_strength1, - model_pth0, - model_index0, - index_strength0, - output_audio0, - output_audio1, - export_format0, - method0, - hybrid_method0, - hop_length0, - embedders0, - custom_embedders0, - resample_sr0, - filter_radius0, - rms_mix_rate0, - protect0, - split_audio0, - f0_autotune_strength0, - checkpointing0, - onnx_f0_mode1, - formant_shifting1, - formant_qfrency1, - formant_timbre1, - f0_file_dropdown0, - embed_mode1, - proposal_pitch, - proposal_pitch_threshold, - audio_processing, - alpha - ], - outputs=[tts_voice_convert], - api_name="convert_tts" - ) \ No newline at end of file diff --git a/main/app/tabs/inference/child/convert_with_whisper.py b/main/app/tabs/inference/child/convert_with_whisper.py deleted file mode 100644 index 2c023ee67772475b39136760014de9e7cad0bc8f..0000000000000000000000000000000000000000 --- a/main/app/tabs/inference/child/convert_with_whisper.py +++ /dev/null @@ -1,164 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.inference import convert_with_whisper -from main.app.core.ui import visible, change_audios_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, shutil_move 
from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, embedders_mode, embedders_model, configs, file_types, export_format_choices, whisper_model, hybrid_f0_method


def convert_with_whisper_tab():
    """Build the "convert with Whisper" tab and wire its events.

    Creates the option checkboxes, two model accordions (model/index/pitch per
    voice), the input/output accordion, the settings accordion, and the two
    audio players, then registers the listeners. The convert button calls
    ``convert_with_whisper`` and writes its result to ``play_audio3``.

    Must be called inside an active ``gr.Blocks`` context. Returns ``None``.
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation; the container nesting below is reconstructed from the
    # statement order and should be confirmed against the rendered layout.
    with gr.Row():
        gr.Markdown(translations["convert_with_whisper_info"])
    with gr.Row():
        with gr.Column():
            with gr.Group():
                with gr.Row():
                    cleaner2 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
                    autotune2 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
                    checkpointing2 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
                    formant_shifting2 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
                    proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
                    audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True)
                with gr.Row():
                    num_spk = gr.Slider(minimum=2, maximum=8, step=1, info=translations["num_spk_info"], label=translations["num_spk"], value=2, interactive=True)
    with gr.Row():
        with gr.Column():
            convert_button3 = gr.Button(translations["convert_audio"], variant="primary")
    with gr.Row():
        with gr.Column():
            with gr.Accordion(translations["model_accordion"] + " 1", open=True):
                with gr.Row():
                    model_pth2 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
                    model_index2 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
                with gr.Row():
                    refresh2 = gr.Button(translations["refresh"])
                with gr.Row():
                    pitch3 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
                    # Hidden until an index file is selected.
                    index_strength2 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index2.value != "")
            with gr.Accordion(translations["input_output"], open=False):
                with gr.Column():
                    export_format2 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
                    input_audio1 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
                    output_audio2 = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
                with gr.Column():
                    refresh4 = gr.Button(translations["refresh"])
                with gr.Row():
                    input2 = gr.Files(label=translations["drop_audio"], file_types=file_types)
        with gr.Column():
            with gr.Accordion(translations["model_accordion"] + " 2", open=True):
                with gr.Row():
                    model_pth3 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
                    model_index3 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
                with gr.Row():
                    refresh3 = gr.Button(translations["refresh"])
                with gr.Row():
                    pitch4 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
                    index_strength3 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index3.value != "")
            with gr.Accordion(translations["setting"], open=False):
                with gr.Row():
                    model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=whisper_model, value="medium", interactive=True)
                with gr.Accordion(translations["f0_method"], open=False):
                    with gr.Group():
                        with gr.Row():
                            onnx_f0_mode4 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
                            unlock_full_method2 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
                        method3 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
                        hybrid_method3 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method3.value == "hybrid")
                    hop_length3 = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
                    alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
                with gr.Accordion(translations["hubert_model"], open=False):
                    embed_mode3 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
                    embedders3 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
                    custom_embedders3 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders3.value == "custom")
                with gr.Column():
                    resample_sr3 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
                    proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
                    clean_strength3 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner2.value)
                    f0_autotune_strength3 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune2.value)
                    filter_radius3 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
                    rms_mix_rate3 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
                    protect3 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
                # Formant sliders for voice 1 and voice 2; shown together by
                # the formant_shifting2 listener below.
                with gr.Row():
                    formant_qfrency3 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 1", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
                    formant_timbre3 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 1", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
                with gr.Row():
                    formant_qfrency4 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 2", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
                    formant_timbre4 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 2", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
    with gr.Row():
        gr.Markdown(translations["input_output"])
    with gr.Row():
        play_audio2 = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
        play_audio3 = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])

    # Event wiring. The gr.Row() wrappers hold no components; they are kept so
    # the component tree stays the same as before.
    with gr.Row():
        autotune2.change(fn=visible, inputs=[autotune2], outputs=[f0_autotune_strength3])
        cleaner2.change(fn=visible, inputs=[cleaner2], outputs=[clean_strength3])
        method3.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method3, hybrid_method3], outputs=[hybrid_method3, alpha, hop_length3])
    with gr.Row():
        hybrid_method3.change(fn=hoplength_show, inputs=[method3, hybrid_method3], outputs=[hop_length3])
        refresh2.click(fn=change_models_choices, inputs=[], outputs=[model_pth2, model_index2])
        model_pth2.change(fn=get_index, inputs=[model_pth2], outputs=[model_index2])
    with gr.Row():
        refresh3.click(fn=change_models_choices, inputs=[], outputs=[model_pth3, model_index3])
        model_pth3.change(fn=get_index, inputs=[model_pth3], outputs=[model_index3])
        # Every uploaded file is moved; the result of the first move is what
        # gets written back to the dropdown.
        input2.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input2], outputs=[input_audio1])
    with gr.Row():
        # Guard against None: os.path.isfile(None) raises TypeError, which
        # would surface as a UI error when the dropdown is cleared.
        input_audio1.change(fn=lambda audio: audio if audio and os.path.isfile(audio) else None, inputs=[input_audio1], outputs=[play_audio2])
        formant_shifting2.change(fn=lambda a: [visible(a) for _ in range(4)], inputs=[formant_shifting2], outputs=[formant_qfrency3, formant_timbre3, formant_qfrency4, formant_timbre4])
        embedders3.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders3], outputs=[custom_embedders3])
    with gr.Row():
        refresh4.click(fn=change_audios_choices, inputs=[input_audio1], outputs=[input_audio1])
        model_index2.change(fn=index_strength_show, inputs=[model_index2], outputs=[index_strength2])
        model_index3.change(fn=index_strength_show, inputs=[model_index3], outputs=[index_strength3])
    with gr.Row():
        unlock_full_method2.change(fn=unlock_f0, inputs=[unlock_full_method2], outputs=[method3])
        embed_mode3.change(fn=change_embedders_mode, inputs=[embed_mode3], outputs=[embedders3])
        proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
    with gr.Row():
        # The order of `inputs` is the positional argument order passed to
        # convert_with_whisper; do not reorder.
        convert_button3.click(
            fn=convert_with_whisper,
            inputs=[
                num_spk,
                model_size,
                cleaner2,
                clean_strength3,
                autotune2,
                f0_autotune_strength3,
                checkpointing2,
                model_pth2,
                model_pth3,
                model_index2,
                model_index3,
                pitch3,
                pitch4,
                index_strength2,
                index_strength3,
                export_format2,
                input_audio1,
                output_audio2,
                onnx_f0_mode4,
                method3,
                hybrid_method3,
                hop_length3,
                embed_mode3,
                embedders3,
                custom_embedders3,
                resample_sr3,
                filter_radius3,
                rms_mix_rate3,
                protect3,
                formant_shifting2,
                formant_qfrency3,
                formant_timbre3,
                formant_qfrency4,
                formant_timbre4,
                proposal_pitch,
                proposal_pitch_threshold,
                audio_processing,
                alpha
            ],
            outputs=[play_audio3],
            api_name="convert_with_whisper"
        )
# \ No newline at end of file
# diff --git a/main/app/tabs/inference/child/separate.py b/main/app/tabs/inference/child/separate.py
# deleted file mode 100644
# index c43102281fd41b9604f185ec186f4c638aca2414..0000000000000000000000000000000000000000
# --- a/main/app/tabs/inference/child/separate.py
# +++ /dev/null
# @@ -1,263 +0,0 @@
import os
import sys

import gradio as gr

sys.path.append(os.getcwd())

from main.app.core.downloads import download_url
from main.app.core.separate import separate_music
from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, shutil_move, separate_change
from main.app.variables import translations, uvr_model, karaoke_models, reverb_models, vr_models, denoise_models, mdx_models, paths_for_files, sample_rate_choice, configs, file_types, export_format_choices


def separate_tab():
    """Build the music-separation tab and wire its events.

    Creates the option checkboxes, model dropdowns, tuning sliders, the
    input/output accordion, and four output players, then registers the
    listeners. The separate button calls ``separate_music`` and fills the
    four output players.

    Must be called inside an active ``gr.Blocks`` context. Returns ``None``.
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation; the container nesting below is reconstructed from the
    # statement order and should be confirmed against the rendered layout.
    with gr.Row():
        gr.Markdown(translations["4_part"])
    with gr.Row():
        with gr.Column():
            with gr.Group():
                with gr.Row():
                    enable_denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False)
                    separate_backing = gr.Checkbox(label=translations["separator_backing"], value=False, interactive=True)
                    separate_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True)
                    enable_tta = gr.Checkbox(label=translations["enable_tta"], value=False, interactive=False)
                    high_end_process = gr.Checkbox(label=translations["high_end_process"], value=False, interactive=False)
                    enable_post_process = gr.Checkbox(label=translations["enable_post_process"], value=False, interactive=False)
                with gr.Row():
                    model_name = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True)
                    karaoke_model = gr.Dropdown(label=translations["separator_backing_model"], value=list(karaoke_models.keys())[0], choices=list(karaoke_models.keys()), interactive=True, visible=separate_backing.value)
                    reverb_model = gr.Dropdown(label=translations["dereveb_model"], value=list(reverb_models.keys())[0], choices=list(reverb_models.keys()), interactive=True, visible=separate_reverb.value)
                    denoise_model = gr.Dropdown(label=translations["denoise_model"], value=list(denoise_models.keys())[0], choices=list(denoise_models.keys()), interactive=True, visible=enable_denoise.value and model_name.value in list(vr_models.keys()))
    with gr.Row():
        with gr.Column():
            separate_button = gr.Button(translations["separator_tab"], variant="primary")
    with gr.Row():
        with gr.Column():
            with gr.Group():
                with gr.Row():
                    shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True)
                    batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=False)
                with gr.Row():
                    segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
                    aggression = gr.Slider(label=translations['aggression'], info=translations["aggression_info"], minimum=1, maximum=50, value=5, step=1, interactive=True, visible=False)
            drop_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
            with gr.Accordion(translations["use_url"], open=False):
                url = gr.Textbox(label=translations["url_audio"], value="", placeholder="https://www.youtube.com/...", scale=6)
                download_button = gr.Button(translations["downloads"])
        with gr.Column():
            with gr.Group():
                with gr.Row():
                    overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
                with gr.Row():
                    window_size = gr.Slider(label=translations["window_size"], info=translations["window_size_info"], minimum=320, maximum=1024, value=512, step=32, interactive=True, visible=False)
                    hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=8192, value=1024, step=1, interactive=True, visible=False)
                    post_process_threshold = gr.Slider(label=translations['post_process_threshold'], info=translations["post_process_threshold_info"], minimum=0.1, maximum=0.3, value=0.2, step=0.1, interactive=True, visible=False)
            sample_rate = gr.Radio(choices=sample_rate_choice, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True)
            with gr.Accordion(translations["input_output"], open=False):
                export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
                input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
                refresh_audio = gr.Button(translations["refresh"])
                output_dirs = gr.Textbox(label=translations["output_folder"], value="audios", placeholder="audios", info=translations["output_folder_info"], interactive=True)
                audio_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
    with gr.Row():
        gr.Markdown(translations["output_separator"])
    with gr.Row():
        instruments_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["instruments"])
        original_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["original_vocal"])
        main_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["main_vocal"], visible=separate_backing.value)
        backing_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["backing_vocal"], visible=separate_backing.value)

    # Event wiring. The gr.Row() wrappers hold no components; they are kept so
    # the component tree stays the same as before.
    with gr.Row():
        model_name.change(fn=lambda a: valueFalse_interactive(a in list(mdx_models.keys()) + list(vr_models.keys())), inputs=[model_name], outputs=[enable_denoise])
        separate_backing.change(fn=lambda a, b: valueFalse_interactive(a or b), inputs=[separate_backing, separate_reverb], outputs=[enable_denoise])
        separate_reverb.change(fn=lambda a, b: valueFalse_interactive(a or b), inputs=[separate_backing, separate_reverb], outputs=[enable_denoise])
    with gr.Row():
        # Guard against None: os.path.isfile(None) raises TypeError, which
        # would surface as a UI error when the dropdown is cleared.
        input_audio.change(fn=lambda audio: audio if audio and os.path.isfile(audio) else None, inputs=[input_audio], outputs=[audio_input])
        # Every uploaded file is moved; the result of the first move is what
        # gets written back to the dropdown.
        drop_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[drop_audio], outputs=[input_audio])
        refresh_audio.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
    with gr.Row():
        separate_backing.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[separate_backing], outputs=[main_vocals, backing_vocals])
        download_button.click(
            fn=download_url,
            inputs=[url],
            outputs=[input_audio, audio_input, url],
            api_name='download_url'
        )

    # separate_change is registered on seven components with identical inputs
    # and outputs, so both lists are declared once. The order of each list is
    # the positional contract with separate_change; do not reorder.
    separate_change_inputs = [model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise]
    separate_change_outputs = [
        karaoke_model,
        reverb_model,
        overlap,
        segments_size,
        hop_length,
        batch_size,
        shifts,
        window_size,
        aggression,
        post_process_threshold,
        denoise_model,
        enable_tta,
        high_end_process,
        enable_post_process,
    ]
    # One tuple per gr.Row(); registration order matches the original code.
    separate_change_triggers = (
        (model_name,),
        (karaoke_model, separate_backing),
        (reverb_model, separate_reverb),
        (enable_denoise, enable_post_process),
    )
    for row_triggers in separate_change_triggers:
        with gr.Row():
            for trigger in row_triggers:
                trigger.change(fn=separate_change, inputs=separate_change_inputs, outputs=separate_change_outputs)

    with gr.Row():
        # The order of `inputs` is the positional argument order passed to
        # separate_music; do not reorder.
        separate_button.click(
            fn=separate_music,
            inputs=[
                input_audio,
                output_dirs,
                export_format,
                model_name,
                karaoke_model,
                reverb_model,
                denoise_model,
                sample_rate,
                shifts,
                batch_size,
                overlap,
                aggression,
                hop_length,
                window_size,
                segments_size,
                post_process_threshold,
                enable_tta,
                enable_denoise,
                high_end_process,
                enable_post_process,
                separate_backing,
                separate_reverb
            ],
            outputs=[
                original_vocals,
                instruments_audio,
                main_vocals,
                backing_vocals
            ],
            api_name="separate_music"
        )
# \ No newline at end of file
# diff --git a/main/app/tabs/inference/inference.py b/main/app/tabs/inference/inference.py
# deleted file mode 100644
# index 437ba78589fc35337e8bd1fdf9145b83f96301e8..0000000000000000000000000000000000000000
# --- a/main/app/tabs/inference/inference.py
# +++ /dev/null
# @@ -1,30 +0,0 @@
import os
import sys

import gradio as gr

sys.path.append(os.getcwd())

from main.app.variables import translations, configs
from main.app.tabs.inference.child.convert import convert_tab
from main.app.tabs.inference.child.separate import separate_tab
from main.app.tabs.inference.child.convert_tts import convert_tts_tab
from main.app.tabs.inference.child.convert_with_whisper import convert_with_whisper_tab


def inference_tab():
    """Build the top-level inference tab and its four child tabs.

    Each child tab (separator, convert, convert-with-Whisper, text-to-speech)
    renders a Markdown heading and then delegates to its own builder. Tab
    visibility is read from ``configs`` and defaults to ``True`` when the key
    is absent.

    Must be called inside an active ``gr.Blocks`` context. Returns ``None``.
    """
    with gr.TabItem(translations["inference"], visible=configs.get("inference_tab", True)):
        with gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)):
            gr.Markdown(f"## {translations['separator_tab']}")
            separate_tab()

        with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)):
            gr.Markdown(f"## {translations['convert_audio']}")
            convert_tab()

        with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)):
            gr.Markdown(f"## {translations['convert_with_whisper']}")
            convert_with_whisper_tab()

        with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)):
            # Unlike the other child tabs, this heading is used as-is
            # without a "## " prefix.
            gr.Markdown(translations["convert_text_markdown"])
            convert_tts_tab()
# diff --git a/main/app/tabs/realtime/realtime.py b/main/app/tabs/realtime/realtime.py
# deleted file mode 100644
# index 0937ed12e76842287ca4790216b9f3c7719bf284..0000000000000000000000000000000000000000
# --- a/main/app/tabs/realtime/realtime.py
# +++ /dev/null
# @@ -1,226 +0,0 @@
import os
import sys

import gradio as gr

sys.path.append(os.getcwd())

from main.app.core.realtime import realtime_start, realtime_stop
from main.app.variables import translations, configs, model_name, index_path, method_f0, embedders_mode, embedders_model
from main.app.core.ui import change_models_choices, get_index, index_strength_show, unlock_f0, hoplength_show, change_embedders_mode, visible, audio_device, change_audio_device_choices, update_audio_device

# Queried once at import time; the dropdowns below are seeded from these maps
# and the refresh button calls change_audio_device_choices.
input_channels_map, output_channels_map = audio_device()


def realtime_tab():
    """Build the realtime voice-conversion tab and wire its events.

    Creates the status label, audio-device selectors, gain/ASIO sliders,
    model/F0/embedder accordions, and the settings accordion, then registers
    the listeners. The start and stop buttons call ``realtime_start`` and
    ``realtime_stop``; both update the status label and the two buttons.

    Must be called inside an active ``gr.Blocks`` context. Returns ``None``.
    """
    # Build the device name lists once instead of re-listing the dict keys
    # three times per dropdown.
    input_device_names = list(input_channels_map.keys())
    output_device_names = list(output_channels_map.keys())
    default_input_device = input_device_names[0] if input_device_names else ""
    default_output_device = output_device_names[0] if output_device_names else ""

    # NOTE(review): the source this was recovered from had lost its
    # indentation; the container nesting below is reconstructed from the
    # statement order and should be confirmed against the rendered layout.
    with gr.TabItem(translations["realtime"], visible=configs.get("realtime_tab", True)):
        gr.Markdown(translations["realtime_markdown"])
        with gr.Row():
            gr.Markdown(translations["realtime_markdown_2"])
        with gr.Row():
            status = gr.Label(label=translations["realtime_latency"], value=translations["realtime_not_startup"])
        with gr.Row():
            monitor = gr.Checkbox(label=translations["monitor"], value=False, interactive=True)
            exclusive_mode = gr.Checkbox(label=translations["exclusive_mode"], value=False, interactive=True)
            vad_enabled = gr.Checkbox(label=translations["vad_enabled"], value=False, interactive=True)
            clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
        with gr.Row():
            with gr.Accordion(translations["audio_device"], open=True):
                with gr.Row():
                    input_audio_device = gr.Dropdown(label=translations["input_audio_device_label"], info=translations["input_audio_device_info"], choices=input_device_names, value=default_input_device, interactive=True)
                    output_audio_device = gr.Dropdown(label=translations["output_audio_device_label"], info=translations["output_audio_device_info"], choices=output_device_names, value=default_output_device, interactive=True)
                    monitor_output_device = gr.Dropdown(label=translations["monitor_output_device_label"], info=translations["monitor_output_device_info"], choices=output_device_names, value=default_output_device, interactive=True, visible=False)
                with gr.Row():
                    input_audio_gain = gr.Slider(minimum=0, maximum=2500, label=translations["input_audio_gain_label"], info=translations["input_audio_gain_info"], value=100, step=1, interactive=True)
                    output_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["output_audio_gain_label"], info=translations["output_audio_gain_info"], value=100, step=1, interactive=True)
                    monitor_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["monitor_audio_gain_label"], info=translations["monitor_audio_gain_info"], value=100, step=1, interactive=True, visible=False)
                with gr.Row(visible=False) as asio_row:
                    input_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["input_asio_channels_label"], info=translations["input_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
                    output_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["output_asio_channels_label"], info=translations["output_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
                    monitor_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["monitor_asio_channels_label"], info=translations["monitor_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
                with gr.Row():
                    refresh_audio_device = gr.Button(value=translations["refresh_audio_device"], variant="secondary")
        with gr.Row():
            start_realtime = gr.Button(value=translations["start_realtime_button"], variant="primary", interactive=True)
            stop_realtime = gr.Button(value=translations["stop_realtime_button"], variant="stop", interactive=False)
        with gr.Row():
            chunk_size = gr.Slider(minimum=2.7, maximum=2730.7, step=0.1, label=translations["chunk_size"], info=translations["chunk_size_info"], value=1024, interactive=True)
            pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
        with gr.Row():
            with gr.Column():
                with gr.Accordion(translations["model_accordion"], open=True):
                    with gr.Row():
                        model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
                        model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
                    with gr.Row():
                        model_refresh = gr.Button(translations["refresh"])
                    with gr.Row():
                        # Hidden until an index file is selected.
                        index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
            with gr.Column():
                with gr.Accordion(translations["f0_method"], open=True):
                    with gr.Group():
                        with gr.Row():
                            onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
                            unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
                        # "hybrid" is filtered out of the F0 choices on this tab.
                        f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)
                    hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
            with gr.Column():
                with gr.Accordion(translations["hubert_model"], open=True):
                    embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
                    embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
                    custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
        with gr.Row():
            with gr.Accordion(translations["setting"], open=True):
                with gr.Row():
                    f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
                    proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
                with gr.Group():
                    with gr.Row():
                        f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value)
                        proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
                    with gr.Row():
                        rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
                        protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
                    with gr.Row():
                        clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
                        filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
                with gr.Column():
                    silent_threshold = gr.Slider(minimum=-90, maximum=-60, label=translations["silent_threshold_label"], info=translations["silent_threshold_info"], value=-90, step=1, interactive=True)
                    extra_convert_size = gr.Slider(minimum=0.1, maximum=5, label=translations["extra_convert_size_label"], info=translations["extra_convert_size_info"], value=0.5, step=0.1, interactive=True)
                    cross_fade_overlap_size = gr.Slider(minimum=0.05, maximum=0.2, label=translations["cross_fade_overlap_size_label"], info=translations["cross_fade_overlap_size_info"], value=0.1, step=0.01, interactive=True)
                with gr.Row():
                    vad_sensitivity = gr.Slider(minimum=0, maximum=3, label=translations["vad_sensitivity_label"], info=translations["vad_sensitivity_info"], value=3, step=1, interactive=True, visible=vad_enabled.value)
                    vad_frame_ms = gr.Slider(minimum=10, maximum=30, label=translations["vad_frame_ms_label"], info=translations["vad_frame_ms_info"], value=30, step=10, interactive=True, visible=vad_enabled.value)

        # update_audio_device is registered on four components with identical
        # inputs and outputs, so both lists are declared once.
        # NOTE(review): monitor_asio_channels appears twice in the outputs
        # (positions 3 and 7), exactly as in the original; confirm against
        # update_audio_device's return value before deduplicating.
        device_update_inputs = [input_audio_device, output_audio_device, monitor_output_device, monitor]
        device_update_outputs = [monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]

        # Event wiring. The gr.Row() wrappers hold no components; they are
        # kept so the component tree stays the same as before.
        with gr.Row():
            model_pth.change(
                fn=get_index,
                inputs=[model_pth],
                outputs=[model_index]
            )
            model_index.change(
                fn=index_strength_show,
                inputs=[model_index],
                outputs=[index_strength]
            )
            model_refresh.click(
                fn=change_models_choices,
                inputs=[],
                outputs=[model_pth, model_index]
            )
        with gr.Row():
            # Takes the choices from unlock_f0's result, drops "hybrid",
            # and resets the selection to "rmvpe".
            unlock_full_method.change(
                fn=lambda unlocked: {"choices": [m for m in unlock_f0(unlocked)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"},
                inputs=[unlock_full_method],
                outputs=[f0_method]
            )
            # There is no hybrid method on this tab, so None is passed as the
            # second argument.
            f0_method.change(
                fn=lambda method: hoplength_show(method, None),
                inputs=[f0_method],
                outputs=[hop_length]
            )
            embed_mode.change(
                fn=change_embedders_mode,
                inputs=[embed_mode],
                outputs=[embedders]
            )
        with gr.Row():
            embedders.change(
                fn=lambda embedders: visible(embedders == "custom"),
                inputs=[embedders],
                outputs=[custom_embedders]
            )
            input_audio_device.change(fn=update_audio_device, inputs=device_update_inputs, outputs=device_update_outputs)
            output_audio_device.change(fn=update_audio_device, inputs=device_update_inputs, outputs=device_update_outputs)
        with gr.Row():
            monitor_output_device.change(fn=update_audio_device, inputs=device_update_inputs, outputs=device_update_outputs)
            monitor.change(fn=update_audio_device, inputs=device_update_inputs, outputs=device_update_outputs)
            f0_autotune.change(
                fn=visible,
                inputs=[f0_autotune],
                outputs=[f0_autotune_strength]
            )
        with gr.Row():
            proposal_pitch.change(
                fn=visible,
                inputs=[proposal_pitch],
                outputs=[proposal_pitch_threshold]
            )
            vad_enabled.change(
                fn=lambda a: [visible(a) for _ in range(2)],
                inputs=[vad_enabled],
                outputs=[vad_sensitivity, vad_frame_ms]
            )
            refresh_audio_device.click(
                fn=change_audio_device_choices,
                inputs=[],
                outputs=[input_audio_device, output_audio_device, monitor_output_device]
            )
        with gr.Row():
            clean_audio.change(
                fn=visible,
                inputs=[clean_audio],
                outputs=[clean_strength]
            )
            # The order of `inputs` is the positional argument order passed
            # to realtime_start; do not reorder.
            start_realtime.click(
                fn=realtime_start,
                inputs=[
                    monitor,
                    exclusive_mode,
                    vad_enabled,
                    input_audio_device,
                    output_audio_device,
                    monitor_output_device,
                    input_audio_gain,
                    output_audio_gain,
                    monitor_audio_gain,
                    input_asio_channels,
                    output_asio_channels,
                    monitor_asio_channels,
                    chunk_size,
                    pitch,
                    model_pth,
                    model_index,
                    index_strength,
                    onnx_f0_mode,
                    f0_method,
                    hop_length,
                    embed_mode,
                    embedders,
                    custom_embedders,
                    f0_autotune,
                    proposal_pitch,
                    f0_autotune_strength,
                    proposal_pitch_threshold,
                    rms_mix_rate,
                    protect,
                    filter_radius,
                    silent_threshold,
                    extra_convert_size,
                    cross_fade_overlap_size,
                    vad_sensitivity,
                    vad_frame_ms,
                    clean_audio,
                    clean_strength
                ],
                outputs=[status, start_realtime, stop_realtime]
            )
            stop_realtime.click(
                fn=realtime_stop,
                inputs=[],
                outputs=[status, start_realtime, stop_realtime]
            )
# \ No newline at end of file
# diff --git a/main/app/tabs/realtime/realtime_client.py b/main/app/tabs/realtime/realtime_client.py
# deleted file mode 100644
# index
578045970f8b00cd2e4de2aaa6a3359d1283525e..0000000000000000000000000000000000000000 --- a/main/app/tabs/realtime/realtime_client.py +++ /dev/null @@ -1,210 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.variables import translations, configs, model_name, index_path, method_f0, embedders_mode, embedders_model -from main.app.core.ui import change_models_choices, get_index, index_strength_show, unlock_f0, hoplength_show, change_embedders_mode, visible, update_dropdowns_from_json, update_button_from_json - -def realtime_client_tab(): - with gr.TabItem(translations["realtime_client"], visible=configs.get("realtime_client_tab", True)): - gr.Markdown(translations["realtime_markdown"]) - with gr.Row(): - gr.Markdown(translations["realtime_markdown_2"]) - with gr.Row(): - gr.Label(label=translations["realtime_latency"], value=translations["realtime_not_startup"], elem_id="realtime-status-info") - with gr.Row(): - monitor = gr.Checkbox(label=translations["monitor"], value=False, interactive=True) - vad_enabled = gr.Checkbox(label=translations["vad_enabled"], value=False, interactive=True) - clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True) - with gr.Row(): - with gr.Accordion(translations["audio_device"], open=True): - with gr.Row(): - input_audio_device = gr.Dropdown(label=translations["input_audio_device_label"], info=translations["input_audio_device_info"], choices=[], value=None, interactive=True) - output_audio_device = gr.Dropdown(label=translations["output_audio_device_label"], info=translations["output_audio_device_info"], choices=[], value=None, interactive=True) - monitor_output_device = gr.Dropdown(label=translations["monitor_output_device_label"], info=translations["monitor_output_device_info"], choices=[], value=None, interactive=True, visible=False) - with gr.Row(): - input_audio_gain = gr.Slider(minimum=0, maximum=2500, label=translations["input_audio_gain_label"], 
info=translations["input_audio_gain_info"], value=100, step=1, interactive=True) - output_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["output_audio_gain_label"], info=translations["output_audio_gain_info"], value=100, step=1, interactive=True) - monitor_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["monitor_audio_gain_label"], info=translations["monitor_audio_gain_info"], value=100, step=1, interactive=True, visible=False) - with gr.Row(): - refresh_audio_device = gr.Button(value=translations["refresh_audio_device"], variant="secondary") - with gr.Row(): - start_realtime = gr.Button(value=translations["start_realtime_button"], variant="primary", interactive=True) - stop_realtime = gr.Button(value=translations["stop_realtime_button"], variant="stop", interactive=False) - with gr.Row(): - chunk_size = gr.Slider(minimum=2.7, maximum=2730.7, step=0.1, label=translations["chunk_size"], info=translations["chunk_size_info"], value=1024, interactive=True) - pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True) - with gr.Row(): - with gr.Column(): - with gr.Accordion(translations["model_accordion"], open=True): - with gr.Row(): - model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True) - model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True) - with gr.Row(): - model_refresh = gr.Button(translations["refresh"]) - with gr.Row(): - index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "") - with gr.Column(): - with gr.Accordion(translations["f0_method"], open=True): - with 
gr.Group(): - with gr.Row(): - onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True) - f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True) - hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False) - with gr.Column(): - with gr.Accordion(translations["hubert_model"], open=True): - embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True) - embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True) - custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom") - with gr.Row(): - with gr.Accordion(translations["setting"], open=True): - with gr.Row(): - f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True) - proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True) - with gr.Group(): - with gr.Row(): - f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value) - proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, 
visible=proposal_pitch.value) - with gr.Row(): - rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True) - protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True) - with gr.Row(): - clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=False) - filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True) - with gr.Column(): - silent_threshold = gr.Slider(minimum=-90, maximum=-60, label=translations["silent_threshold_label"], info=translations["silent_threshold_info"], value=-90, step=1, interactive=True) - extra_convert_size = gr.Slider(minimum=0.1, maximum=5, label=translations["extra_convert_size_label"], info=translations["extra_convert_size_info"], value=0.5, step=0.1, interactive=True) - cross_fade_overlap_size = gr.Slider(minimum=0.05, maximum=0.2, label=translations["cross_fade_overlap_size_label"], info=translations["cross_fade_overlap_size_info"], value=0.1, step=0.01, interactive=True) - with gr.Row(): - vad_sensitivity = gr.Slider(minimum=0, maximum=3, label=translations["vad_sensitivity_label"], info=translations["vad_sensitivity_info"], value=3, step=1, interactive=True, visible=vad_enabled.value) - vad_frame_ms = gr.Slider(minimum=10, maximum=30, label=translations["vad_frame_ms_label"], info=translations["vad_frame_ms_info"], value=30, step=10, interactive=True, visible=vad_enabled.value) - with gr.Row(): - json_audio_hidden = gr.JSON(visible=False) - json_button_hidden = gr.JSON(visible=False) - with gr.Row(): - model_pth.change( - fn=get_index, - inputs=[model_pth], - outputs=[model_index] - ) - model_index.change( - 
fn=index_strength_show, - inputs=[model_index], - outputs=[index_strength] - ) - model_refresh.click( - fn=change_models_choices, - inputs=[], - outputs=[model_pth, model_index] - ) - with gr.Row(): - unlock_full_method.change( - fn=lambda f0_method: {"choices": [m for m in unlock_f0(f0_method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"}, - inputs=[unlock_full_method], - outputs=[f0_method] - ) - f0_method.change( - fn=lambda f0_method: hoplength_show(f0_method, None), - inputs=[f0_method], - outputs=[hop_length] - ) - embed_mode.change( - fn=change_embedders_mode, - inputs=[embed_mode], - outputs=[embedders] - ) - with gr.Row(): - embedders.change( - fn=lambda embedders: visible(embedders == "custom"), - inputs=[embedders], - outputs=[custom_embedders] - ) - f0_autotune.change( - fn=visible, - inputs=[f0_autotune], - outputs=[f0_autotune_strength] - ) - clean_audio.change( - fn=visible, - inputs=[clean_audio], - outputs=[clean_strength] - ) - with gr.Row(): - proposal_pitch.change( - fn=visible, - inputs=[proposal_pitch], - outputs=[proposal_pitch_threshold] - ) - vad_enabled.change( - fn=lambda a: [visible(a) for _ in range(2)], - inputs=[vad_enabled], - outputs=[vad_sensitivity, vad_frame_ms] - ) - refresh_audio_device.click( - fn=None, - js="getAudioDevices", - inputs=[], - outputs=json_audio_hidden - ) - with gr.Row(): - json_audio_hidden.change( - fn=update_dropdowns_from_json, - inputs=[json_audio_hidden], - outputs=[input_audio_device, output_audio_device, monitor_output_device] - ) - json_button_hidden.change( - fn=update_button_from_json, - inputs=[json_button_hidden], - outputs=[start_realtime, stop_realtime] - ) - with gr.Row(): - start_realtime.click( - fn=None, - js="StreamAudioRealtime", - inputs=[ - monitor, - vad_enabled, - input_audio_device, - output_audio_device, - monitor_output_device, - input_audio_gain, - output_audio_gain, - monitor_audio_gain, - chunk_size, - pitch, - model_pth, - model_index, - index_strength, - 
onnx_f0_mode, - f0_method, - hop_length, - embed_mode, - embedders, - custom_embedders, - f0_autotune, - proposal_pitch, - f0_autotune_strength, - proposal_pitch_threshold, - rms_mix_rate, - protect, - filter_radius, - silent_threshold, - extra_convert_size, - cross_fade_overlap_size, - vad_sensitivity, - vad_frame_ms, - clean_audio, - clean_strength - ], - outputs=[json_button_hidden] - ) - stop_realtime.click( - fn=None, - js="StopAudioStream", - inputs=[], - outputs=[json_button_hidden] - ) \ No newline at end of file diff --git a/main/app/tabs/training/child/create_dataset.py b/main/app/tabs/training/child/create_dataset.py deleted file mode 100644 index e306c9bf2c16de041facac8ff41558653d3ed186..0000000000000000000000000000000000000000 --- a/main/app/tabs/training/child/create_dataset.py +++ /dev/null @@ -1,282 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.training import create_dataset -from main.app.core.ui import visible, valueFalse_interactive, create_dataset_change -from main.app.variables import translations, sample_rate_choice, uvr_model, reverb_models, denoise_models, vr_models, mdx_models - -def create_dataset_tab(): - with gr.Row(): - gr.Markdown(translations["create_dataset_markdown_2"]) - with gr.Group(): - with gr.Row(): - separate = gr.Checkbox(label=translations["separator_tab"], value=False, interactive=True) - clean_dataset = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True) - skip_seconds = gr.Checkbox(label=translations["skip"], value=False, interactive=True) - separate_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=False) - with gr.Row(visible=False) as row: - enable_tta = gr.Checkbox(label=translations["enable_tta"], value=False, interactive=False) - high_end_process = gr.Checkbox(label=translations["high_end_process"], value=False, interactive=False) - enable_post_process = 
gr.Checkbox(label=translations["enable_post_process"], value=False, interactive=False) - enable_denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False) - with gr.Row(): - dataset_url = gr.Textbox(label=translations["url_audio"], info=translations["create_dataset_url"], value="", placeholder="https://www.youtube.com/...", interactive=True, scale=5) - output_dataset = gr.Textbox(label=translations["output_data"], info=translations["output_data_info"], value="dataset", placeholder="dataset", interactive=True) - with gr.Row(): - create_dataset_button = gr.Button(translations["createdataset"], variant="primary", scale=2, min_width=4000) - with gr.Row(visible=False) as row_2: - model_name = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True) - reverb_model = gr.Dropdown(label=translations["dereveb_model"], value=list(reverb_models.keys())[0], choices=list(reverb_models.keys()), interactive=True) - denoise_model = gr.Dropdown(label=translations["denoise_model"], value=list(denoise_models.keys())[0], choices=list(denoise_models.keys()), interactive=True, visible=False) - with gr.Row(): - with gr.Column(visible=False) as row_3: - with gr.Group(): - with gr.Row(): - overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True) - with gr.Row(): - window_size = gr.Slider(label=translations["window_size"], info=translations["window_size_info"], minimum=320, maximum=1024, value=512, step=32, interactive=True, visible=False) - with gr.Row(): - shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True) - segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True) - with gr.Row(): - batch_size = 
gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=False) - hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=8192, value=1024, step=1, interactive=True, visible=False) - with gr.Row(): - post_process_threshold = gr.Slider(label=translations['post_process_threshold'], info=translations["post_process_threshold_info"], minimum=0.1, maximum=0.3, value=0.2, step=0.1, interactive=True, visible=False) - aggression = gr.Slider(label=translations['aggression'], info=translations["aggression_info"], minimum=1, maximum=50, value=5, step=1, interactive=True, visible=False) - with gr.Column(): - sample_rate = gr.Radio(choices=sample_rate_choice, value=48000, label=translations["sr"], info=translations["sr_info"], interactive=True) - clean_strength = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.5, label=translations["clean_strength"], info=translations["clean_strength_info"], interactive=True, visible=False) - with gr.Row(): - skip_start = gr.Textbox(label=translations["skip_start"], info=translations["skip_start_info"], value="", placeholder="0,...", interactive=True, visible=skip_seconds.value) - skip_end = gr.Textbox(label=translations["skip_end"], info=translations["skip_end_info"], value="", placeholder="0,...", interactive=True, visible=skip_seconds.value) - create_dataset_info = gr.Textbox(label=translations["create_dataset_info"], value="", interactive=False, lines=2) - with gr.Row(): - separate.change( - fn=lambda a: [visible(a) for _ in range(3)], - inputs=[separate], - outputs=[ - row, - row_2, - row_3 - ] - ) - separate.change( - fn=valueFalse_interactive, - inputs=[separate], - outputs=[separate_reverb] - ) - separate.change( - fn=create_dataset_change, - inputs=[ - model_name, - reverb_model, - enable_post_process, - separate_reverb, - enable_denoise - ], - outputs=[ - reverb_model, - overlap, - 
segments_size, - hop_length, - batch_size, - shifts, - window_size, - aggression, - post_process_threshold, - denoise_model, - enable_tta, - high_end_process, - enable_post_process, - ] - ) - with gr.Row(): - model_name.change( - fn=create_dataset_change, - inputs=[ - model_name, - reverb_model, - enable_post_process, - separate_reverb, - enable_denoise - ], - outputs=[ - reverb_model, - overlap, - segments_size, - hop_length, - batch_size, - shifts, - window_size, - aggression, - post_process_threshold, - denoise_model, - enable_tta, - high_end_process, - enable_post_process, - ] - ) - reverb_model.change( - fn=create_dataset_change, - inputs=[ - model_name, - reverb_model, - enable_post_process, - separate_reverb, - enable_denoise - ], - outputs=[ - reverb_model, - overlap, - segments_size, - hop_length, - batch_size, - shifts, - window_size, - aggression, - post_process_threshold, - denoise_model, - enable_tta, - high_end_process, - enable_post_process, - ] - ) - denoise_model.change( - fn=create_dataset_change, - inputs=[ - model_name, - reverb_model, - enable_post_process, - separate_reverb, - enable_denoise - ], - outputs=[ - reverb_model, - overlap, - segments_size, - hop_length, - batch_size, - shifts, - window_size, - aggression, - post_process_threshold, - denoise_model, - enable_tta, - high_end_process, - enable_post_process, - ] - ) - with gr.Row(): - separate_reverb.change( - fn=create_dataset_change, - inputs=[ - model_name, - reverb_model, - enable_post_process, - separate_reverb, - enable_denoise - ], - outputs=[ - reverb_model, - overlap, - segments_size, - hop_length, - batch_size, - shifts, - window_size, - aggression, - post_process_threshold, - denoise_model, - enable_tta, - high_end_process, - enable_post_process, - ] - ) - enable_denoise.change( - fn=create_dataset_change, - inputs=[ - model_name, - reverb_model, - enable_post_process, - separate_reverb, - enable_denoise - ], - outputs=[ - reverb_model, - overlap, - segments_size, - 
hop_length, - batch_size, - shifts, - window_size, - aggression, - post_process_threshold, - denoise_model, - enable_tta, - high_end_process, - enable_post_process, - ] - ) - with gr.Row(): - skip_seconds.change( - fn=lambda a: [visible(a) for _ in range(2)], - inputs=[skip_seconds], - outputs=[ - skip_start, - skip_end - ] - ) - clean_dataset.change( - fn=visible, - inputs=[clean_dataset], - outputs=[clean_strength] - ) - with gr.Row(): - model_name.change( - fn=lambda a: valueFalse_interactive(a in list(mdx_models.keys()) + list(vr_models.keys())), - inputs=[model_name], - outputs=[enable_denoise] - ) - separate_reverb.change( - fn=valueFalse_interactive, - inputs=[separate_reverb], - outputs=[enable_denoise] - ) - with gr.Row(): - create_dataset_button.click( - fn=create_dataset, - inputs=[ - dataset_url, - output_dataset, - skip_seconds, - skip_start, - skip_end, - separate, - model_name, - reverb_model, - denoise_model, - sample_rate, - shifts, - batch_size, - overlap, - aggression, - hop_length, - window_size, - segments_size, - post_process_threshold, - enable_tta, - enable_denoise, - high_end_process, - enable_post_process, - separate_reverb, - clean_dataset, - clean_strength - ], - outputs=[create_dataset_info], - api_name="create_dataset" - ) \ No newline at end of file diff --git a/main/app/tabs/training/child/create_reference.py b/main/app/tabs/training/child/create_reference.py deleted file mode 100644 index c79fc9b6888c03c7345cb5c8329e5f33c8edeb0a..0000000000000000000000000000000000000000 --- a/main/app/tabs/training/child/create_reference.py +++ /dev/null @@ -1,97 +0,0 @@ -import os -import sys - -import gradio as gr - -sys.path.append(os.getcwd()) - -from main.app.core.training import create_reference -from main.app.core.ui import visible, change_audios_choices, unlock_f0, shutil_move, change_embedders_mode -from main.app.variables import translations, paths_for_files, method_f0, hybrid_f0_method, file_types, configs, embedders_model, embedders_mode 
- -def create_reference_tab(): - with gr.Row(): - gr.Markdown(translations["create_reference_markdown_2"]) - with gr.Row(): - pitch_guidance = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True) - use_energy = gr.Checkbox(label=translations["train&energy"], value=False, interactive=True) - f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True) - proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True) - with gr.Row(): - create_reference_button = gr.Button(translations["create_reference"], variant="primary") - with gr.Row(): - f0_up_key = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True) - proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value) - with gr.Row(): - filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True) - f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value) - with gr.Row(): - with gr.Column(): - with gr.Accordion(translations["input_output"], open=False): - with gr.Column(): - input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) - reference_name = gr.Textbox(label=translations["reference_name"], value="reference", placeholder="reference", info=translations["reference_name_info"], interactive=True) - with gr.Column(): - refresh_audio = gr.Button(translations["refresh"]) - with gr.Column(): - upload_audio = 
gr.Files(label=translations["drop_audio"], file_types=file_types) - with gr.Column(): - play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Column() as f0_method_column: - with gr.Accordion(label=translations["f0_method"], open=False): - with gr.Group(): - with gr.Row(): - onnx_f0 = gr.Checkbox(label=translations["f0_onnx_mode"], value=False, interactive=True) - unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], value=False, interactive=True) - f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True) - f0_hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=f0_method.value == "hybrid") - with gr.Row(): - alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False) - with gr.Column(): - with gr.Accordion(label=translations["hubert_model"], open=False): - with gr.Row(): - version = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True) - with gr.Group(): - embedder_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True) - embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True) - with gr.Row(): - embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom") - with gr.Row(): - create_reference_info = 
gr.Textbox(label=translations["reference_info"], value="", interactive=False, lines=2) - with gr.Row(): - f0_autotune.change(fn=visible, inputs=[f0_autotune], outputs=[f0_autotune_strength]) - proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold]) - unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[f0_method]) - with gr.Row(): - input_audio.change(fn=lambda audio: audio, inputs=[input_audio], outputs=[play_audio]) - refresh_audio.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio]) - f0_method.change(fn=lambda method: [visible(method == "hybrid") for _ in range(2)], inputs=[f0_method], outputs=[f0_hybrid_method, alpha]) - with gr.Row(): - upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio], outputs=[input_audio]) - embedder_mode.change(fn=change_embedders_mode, inputs=[embedder_mode], outputs=[embedders]) - embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[embedders_custom]) - with gr.Row(): - pitch_guidance.change(fn=visible, inputs=[pitch_guidance], outputs=[f0_method_column]) - create_reference_button.click( - fn=create_reference, - inputs=[ - input_audio, - reference_name, - pitch_guidance, - use_energy, - version, - embedders, - embedder_mode, - f0_method, - onnx_f0, - f0_up_key, - filter_radius, - f0_autotune, - f0_autotune_strength, - proposal_pitch, - proposal_pitch_threshold, - alpha - ], - outputs=[create_reference_info], - api_name="create_reference" - ) \ No newline at end of file diff --git a/main/app/tabs/training/child/training.py b/main/app/tabs/training/child/training.py deleted file mode 100644 index 3eb94e244f39919fefbc1363f738651d85b434f4..0000000000000000000000000000000000000000 --- a/main/app/tabs/training/child/training.py +++ /dev/null @@ -1,259 +0,0 @@ -import os -import sys - -import gradio as gr - 
-sys.path.append(os.getcwd()) - -from main.app.core.process import zip_file -from main.app.core.training import preprocess, extract, create_index, training -from main.app.variables import translations, model_name, index_path, method_f0, embedders_mode, embedders_model, pretrainedD, pretrainedG, config, file_types, hybrid_f0_method, reference_list -from main.app.core.ui import gr_warning, visible, unlock_f0, hoplength_show, change_models_choices, get_gpu_info, change_embedders_mode, pitch_guidance_lock, vocoders_lock, unlock_ver, unlock_vocoder, change_pretrained_choices, gpu_number_str, shutil_move, change_reference_choices - -def training_model_tab(): - with gr.Row(): - gr.Markdown(translations["training_markdown"]) - with gr.Row(): - with gr.Column(): - with gr.Row(): - with gr.Column(): - training_name = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True) - training_sr = gr.Radio(label=translations["sample_rate"], info=translations["sample_rate_info"], choices=["32k", "40k", "48k"], value="48k", interactive=True) - training_ver = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True) - with gr.Row(): - clean_dataset = gr.Checkbox(label=translations["clear_dataset"], value=False, interactive=True) - process_effects = gr.Checkbox(label=translations["preprocess_effect"], value=False, interactive=True) - training_f0 = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True) - custom_reference = gr.Checkbox(label=translations["custom_reference"], value=False, interactive=True) - checkpointing1 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True) - upload = gr.Checkbox(label=translations["upload_dataset"], value=False, interactive=True) - with gr.Row(): - preprocess_split_audio_mode = 
gr.Radio(label=translations["split_audio_mode"], info=translations["split_audio_mode_info"], value="Automatic", choices=["Automatic", "Simple", "Skip"], interactive=True) - preprocess_normalization_mode = gr.Radio(label=translations["normalization_mode"], info=translations["normalization_mode_info"], value="none", choices=["none", "pre", "post"], interactive=True) - with gr.Row(visible=custom_reference.value) as custom_reference_row: - with gr.Accordion(translations["custom_reference"], open=True): - reference_name = gr.Dropdown(label=translations["reference_name"], info=translations["reference_name_info"], choices=reference_list, value=reference_list[0] if len(reference_list) >= 1 else "", allow_custom_value=True, interactive=True) - reference_refresh = gr.Button(translations["refresh"], scale=2) - with gr.Row(visible=clean_dataset.value) as clean_dataset_row: - clean_dataset_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.7, step=0.1, interactive=True) - with gr.Column(): - preprocess_button = gr.Button(translations["preprocess_button"], scale=2) - upload_dataset = gr.Files(label=translations["drop_audio"], file_types=file_types, visible=upload.value) - preprocess_info = gr.Textbox(label=translations["preprocess_info"], value="", interactive=False, container=True, lines=2) - with gr.Column(): - with gr.Row(): - with gr.Column(): - with gr.Accordion(label=translations["f0_method"], open=False): - with gr.Group(): - with gr.Row(): - onnx_f0_mode2 = gr.Checkbox(label=translations["f0_onnx_mode"], value=False, interactive=True) - unlock_full_method4 = gr.Checkbox(label=translations["f0_unlock"], value=False, interactive=True) - autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True) - extract_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True) - extract_hybrid_method = 
gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=extract_method.value == "hybrid") - extract_hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False) - f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value) - alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False) - with gr.Accordion(label=translations["hubert_model"], open=False): - with gr.Group(): - embed_mode2 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True) - extract_embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True) - with gr.Row(): - extract_embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=extract_embedders.value == "custom") - with gr.Column(): - extract_button = gr.Button(translations["extract_button"], scale=2) - extract_info = gr.Textbox(label=translations["extract_info"], value="", interactive=False, lines=2) - with gr.Column(): - with gr.Row(): - with gr.Column(): - total_epochs = gr.Slider(label=translations["total_epoch"], info=translations["total_epoch_info"], minimum=1, maximum=10000, value=300, step=1, interactive=True) - save_epochs = gr.Slider(label=translations["save_epoch"], info=translations["save_epoch_info"], minimum=1, maximum=10000, value=50, step=1, 
interactive=True) - with gr.Column(): - index_button = gr.Button(f"3. {translations['create_index']}", variant="primary", scale=2) - training_button = gr.Button(f"4. {translations['training_model']}", variant="primary", scale=2) - with gr.Row(): - with gr.Accordion(label=translations["setting"], open=False): - with gr.Row(): - index_algorithm = gr.Radio(label=translations["index_algorithm"], info=translations["index_algorithm_info"], choices=["Auto", "Faiss", "KMeans"], value="Auto", interactive=True) - with gr.Row(): - cache_in_gpu = gr.Checkbox(label=translations["cache_in_gpu"], info=translations["cache_in_gpu_info"], value=True, interactive=True) - rms_extract = gr.Checkbox(label=translations["train&energy"], info=translations["train&energy_info"], value=False, interactive=True) - overtraining_detector = gr.Checkbox(label=translations["overtraining_detector"], info=translations["overtraining_detector_info"], value=False, interactive=True) - with gr.Row(): - custom_dataset = gr.Checkbox(label=translations["custom_dataset"], info=translations["custom_dataset_info"], value=False, interactive=True) - save_only_latest = gr.Checkbox(label=translations["save_only_latest"], info=translations["save_only_latest_info"], value=True, interactive=True) - save_every_weights = gr.Checkbox(label=translations["save_every_weights"], info=translations["save_every_weights_info"], value=True, interactive=True) - with gr.Row(): - clean_up = gr.Checkbox(label=translations["cleanup_training"], info=translations["cleanup_training_info"], value=False, interactive=True) - not_use_pretrain = gr.Checkbox(label=translations["not_use_pretrain_2"], info=translations["not_use_pretrain_info"], value=False, interactive=True) - custom_pretrain = gr.Checkbox(label=translations["custom_pretrain"], info=translations["custom_pretrain_info"], value=False, interactive=True) - with gr.Column(): - dataset_path = gr.Textbox(label=translations["dataset_folder"], value="dataset", interactive=True, 
visible=custom_dataset.value) - with gr.Column(): - with gr.Row(visible=False) as simple_option: - chunk_len = gr.Slider(minimum=0.5, maximum=5.0, value=3.0, step=0.1, label=translations["chunk_length"], info=translations["chunk_length_info"], interactive=True) - overlap_len = gr.Slider(minimum=0.0, maximum=0.4, value=0.3, step=0.1, label=translations["overlap_length"], info=translations["overlap_length_info"], interactive=True) - threshold = gr.Slider(minimum=1, maximum=100, value=50, step=1, label=translations["threshold"], interactive=True, visible=overtraining_detector.value) - with gr.Accordion(translations["setting_cpu_gpu"], open=False): - with gr.Column(): - gpu_number = gr.Textbox(label=translations["gpu_number"], value=gpu_number_str(), info=translations["gpu_number_info"], interactive=True) - gpu_info = gr.Textbox(label=translations["gpu_info"], value=get_gpu_info(), info=translations["gpu_info_2"], interactive=False) - cpu_core = gr.Slider(label=translations["cpu_core"], info=translations["cpu_core_info"], minimum=1, maximum=os.cpu_count(), value=os.cpu_count(), step=1, interactive=True) - train_batch_size = gr.Slider(label=translations["batch_size"], info=translations["batch_size_info"], minimum=1, maximum=64, value=8, step=1, interactive=True) - with gr.Group(): - multiscale_mel_loss = gr.Checkbox(label=translations["multiscale_mel_loss"], info=translations["multiscale_mel_loss_info"], value=False, interactive=True) - vocoders = gr.Radio(label=translations["vocoder"], info=translations["vocoder_info"], choices=["Default", "MRF-HiFi-GAN", "RefineGAN"], value="Default", interactive=True) - with gr.Row(): - deterministic = gr.Checkbox(label=translations["deterministic"], info=translations["deterministic_info"], value=False, interactive=config.device.startswith("cuda")) - benchmark = gr.Checkbox(label=translations["benchmark"], info=translations["benchmark_info"], value=False, interactive=config.device.startswith("cuda")) - with gr.Row(): - optimizer = 
gr.Radio(label=translations["optimizer"], info=translations["optimizer_info"], value="AdamW", choices=["AdamW", "RAdam", "AnyPrecisionAdamW"], interactive=True) - with gr.Row(): - model_author = gr.Textbox(label=translations["training_author"], info=translations["training_author_info"], value="", placeholder=translations["training_author"], interactive=True) - with gr.Row(): - with gr.Column(): - with gr.Accordion(translations["custom_pretrain_info"], open=False, visible=custom_pretrain.value and not not_use_pretrain.value) as pretrain_setting: - pretrained_D = gr.Dropdown(label=translations["pretrain_file"].format(dg="D"), choices=pretrainedD, value=pretrainedD[0] if len(pretrainedD) > 0 else '', interactive=True, allow_custom_value=True) - pretrained_G = gr.Dropdown(label=translations["pretrain_file"].format(dg="G"), choices=pretrainedG, value=pretrainedG[0] if len(pretrainedG) > 0 else '', interactive=True, allow_custom_value=True) - refresh_pretrain = gr.Button(translations["refresh"], scale=2) - with gr.Row(): - training_info = gr.Textbox(label=translations["train_info"], value="", interactive=False, lines=3) - with gr.Row(): - with gr.Column(): - with gr.Accordion(translations["export_model"], open=False): - with gr.Row(): - model_file = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True) - index_file = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True) - with gr.Row(): - refresh_file = gr.Button(f"1. 
{translations['refresh']}", scale=2) - zip_model = gr.Button(translations["zip_model"], variant="primary", scale=2) - with gr.Row(): - zip_output = gr.File(label=translations["output_zip"], file_types=[".zip"], interactive=False, visible=False) - with gr.Row(): - vocoders.change(fn=pitch_guidance_lock, inputs=[vocoders], outputs=[training_f0]) - training_f0.change(fn=vocoders_lock, inputs=[training_f0, vocoders], outputs=[vocoders]) - unlock_full_method4.change(fn=unlock_f0, inputs=[unlock_full_method4], outputs=[extract_method]) - with gr.Row(): - refresh_file.click(fn=change_models_choices, inputs=[], outputs=[model_file, index_file]) - zip_model.click(fn=zip_file, inputs=[training_name, model_file, index_file], outputs=[zip_output]) - dataset_path.change(fn=lambda folder: os.makedirs(folder, exist_ok=True), inputs=[dataset_path], outputs=[]) - with gr.Row(): - upload.change(fn=visible, inputs=[upload], outputs=[upload_dataset]) - overtraining_detector.change(fn=visible, inputs=[overtraining_detector], outputs=[threshold]) - clean_dataset.change(fn=visible, inputs=[clean_dataset], outputs=[clean_dataset_row]) - with gr.Row(): - custom_dataset.change(fn=lambda custom_dataset: [visible(custom_dataset), "dataset"],inputs=[custom_dataset], outputs=[dataset_path, dataset_path]) - training_ver.change(fn=unlock_vocoder, inputs=[training_ver, vocoders], outputs=[vocoders]) - vocoders.change(fn=unlock_ver, inputs=[training_ver, vocoders], outputs=[training_ver]) - with gr.Row(): - custom_reference.change(fn=visible, inputs=[custom_reference], outputs=[custom_reference_row]) - extract_method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[extract_method, extract_hybrid_method], outputs=[extract_hybrid_method, alpha, extract_hop_length]) - extract_hybrid_method.change(fn=hoplength_show, inputs=[extract_method, extract_hybrid_method], outputs=[extract_hop_length]) - with gr.Row(): - 
autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength]) - preprocess_split_audio_mode.change(fn=lambda a: visible(a == "Simple"), inputs=[preprocess_split_audio_mode], outputs=[simple_option]) - upload_dataset.upload( - fn=lambda files, folder: [shutil_move(f.name, os.path.join(folder, os.path.split(f.name)[1])) for f in files] if folder != "" else gr_warning(translations["dataset_folder1"]), - inputs=[upload_dataset, dataset_path], - outputs=[], - api_name="upload_dataset" - ) - with gr.Row(): - not_use_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting]) - custom_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting]) - refresh_pretrain.click(fn=change_pretrained_choices, inputs=[], outputs=[pretrained_D, pretrained_G]) - with gr.Row(): - preprocess_button.click( - fn=preprocess, - inputs=[ - training_name, - training_sr, - cpu_core, - preprocess_split_audio_mode, - process_effects, - dataset_path, - clean_dataset, - clean_dataset_strength, - chunk_len, - overlap_len, - preprocess_normalization_mode - ], - outputs=[preprocess_info], - api_name="preprocess" - ) - with gr.Row(): - embed_mode2.change(fn=change_embedders_mode, inputs=[embed_mode2], outputs=[extract_embedders]) - extract_embedders.change(fn=lambda extract_embedders: visible(extract_embedders == "custom"), inputs=[extract_embedders], outputs=[extract_embedders_custom]) - reference_refresh.click(fn=change_reference_choices, inputs=[], outputs=[reference_name]) - with gr.Row(): - extract_button.click( - fn=extract, - inputs=[ - training_name, - training_ver, - extract_method, - training_f0, - extract_hop_length, - cpu_core, - gpu_number, - training_sr, - extract_embedders, - extract_embedders_custom, - onnx_f0_mode2, - embed_mode2, - autotune, - f0_autotune_strength, - extract_hybrid_method, - rms_extract, - alpha - ], - 
def training_tab():
    """Build the training tab group and its three sub-tabs.

    The outer tab is titled with ``translations["training_model"]`` and is shown
    or hidden by the ``create_and_training_tab`` config flag. Inside it, one
    sub-tab is created per entry of ``sub_tabs``; each sub-tab renders a markdown
    header and then calls its builder function. Every visibility flag defaults
    to ``True`` when absent from ``configs``.
    """
    # (title key, visibility flag, markdown key, markdown prefix, builder), in display order.
    sub_tabs = (
        ("createdataset", "create_dataset_tab", "create_dataset_markdown", "", create_dataset_tab),
        ("create_reference", "create_reference_tab", "create_reference_markdown", "", create_reference_tab),
        ("training_model", "training_tab", "training_model", "## ", training_model_tab),
    )

    group_visible = configs.get("create_and_training_tab", True)
    with gr.TabItem(translations["training_model"], visible=group_visible):
        for title_key, flag_key, markdown_key, prefix, build in sub_tabs:
            with gr.TabItem(translations[title_key], visible=configs.get(flag_key, True)):
                gr.Markdown(f"{prefix}{translations[markdown_key]}")
                build()
model_options = {}

# Pitch (F0) extraction method names offered as UI choices; `method_f0_full`
# is the extended list and `hybrid_f0_method` the "hybrid[a+b]" combinations.
method_f0 = ["mangio-crepe-full", "crepe-full", "fcpe", "rmvpe", "harvest", "pyin", "hybrid"]
method_f0_full = ["pm-ac", "pm-cc", "pm-shs", "dio", "mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "crepe-tiny", "crepe-small", "crepe-medium", "crepe-large", "crepe-full", "fcpe", "fcpe-legacy", "fcpe-previous", "rmvpe", "rmvpe-clipping", "rmvpe-medfilt", "rmvpe-clipping-medfilt", "harvest", "yin", "pyin", "swipe", "piptrack", "penn", "mangio-penn", "djcm", "djcm-clipping", "djcm-medfilt", "djcm-clipping-medfilt", "swift", "pesto", "hybrid"]
hybrid_f0_method = ["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"]

# Embedder backends and model names offered as UI choices.
embedders_mode = ["fairseq", "onnx", "transformers", "spin", "whisper"]
embedders_model = ["contentvec_base", "hubert_base", "vietnamese_hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "custom"]
spin_model = ["spin-v1", "spin-v2"]
whisper_model = ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"]

# Accepted audio extensions (with leading dot) and the matching export formats.
# Defined before the directory scans so the scan below reuses the same list
# instead of carrying a second copy of it.
file_types = [".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"]
export_format_choices = ["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"]

# Directory scans performed once at import time; the paths come from config.json.
paths_for_files = sorted(
    os.path.abspath(os.path.join(root, f))
    for root, _, files in os.walk(configs["audios_path"])
    for f in files
    if os.path.splitext(f)[1].lower() in file_types
)
# isdir() already returns False for missing paths, so no separate exists() check is needed.
reference_list = sorted(
    name
    for name in os.listdir(configs["reference_path"])
    if os.path.isdir(os.path.join(configs["reference_path"], name))
)
# Weight files, excluding generator/discriminator checkpoints ("G_*" / "D_*").
model_name = sorted(
    model
    for model in os.listdir(configs["weights_path"])
    if model.endswith((".pth", ".onnx")) and not model.startswith(("G_", "D_"))
)
index_path = sorted(
    os.path.join(root, name)
    for root, _, files in os.walk(configs["logs_path"], topdown=False)
    for name in files
    if name.endswith(".index") and "trained" not in name
)

# Custom pretrained checkpoints, kept in directory-listing order as before.
pretrainedD = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "D" in model]
pretrainedG = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "G" in model]

presets_file = sorted(f for f in os.listdir(configs["presets_path"]) if f.endswith(".conversion.json"))
audio_effect_presets_file = sorted(f for f in os.listdir(configs["presets_path"]) if f.endswith(".effect.json"))
f0_file = sorted(
    os.path.abspath(os.path.join(root, f))
    for root, _, files in os.walk(configs["f0_path"])
    for f in files
    if f.endswith(".txt")
)

language = configs.get("language", "vi-VN")
theme = configs.get("theme", "NoCrypt/miku")

edgetts = configs.get("edge_tts", ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"])
google_tts_voice = configs.get("google_tts_voice", ["vi", "en"])

# config.json stores these as {display name: file name} mappings. Default to an
# empty dict (not "") so a missing key cannot break the `uvr_model` expansion
# below, which previously called .keys() on the str default.
vr_models = configs.get("vr_models", {})
demucs_models = configs.get("demucs_models", {})
mdx_models = configs.get("mdx_models", {})
karaoke_models = configs.get("karaoke_models", {})
reverb_models = configs.get("reverb_models", {})
denoise_models = configs.get("denoise_models", {})
uvr_model = [*demucs_models, *vr_models, *mdx_models]

font = configs.get("font", "https://fonts.googleapis.com/css2?family=Courgette&display=swap")
sample_rate_choice = [8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000]
csv_path = configs["csv_path"]

# With --allow_all_disk on Windows, expose every logical drive; otherwise none.
if "--allow_all_disk" in sys.argv and sys.platform == "win32":
    try:
        import win32api
    except ImportError:
        import subprocess

        # Argument list instead of a shell string, so an interpreter path that
        # contains spaces still works. A failed install surfaces as ImportError
        # on the retry below, as it did before.
        subprocess.run([python, "-m", "pip", "install", "pywin32"], check=False)
        import win32api

    allow_disk = win32api.GetLogicalDriveStrings().split('\x00')[:-1]
else:
    allow_disk = []

# Best-effort load of the downloadable-model catalogue into `models`
# ({Filename: first cell containing "huggingface"}). The sheet is read from the
# local cache when present, otherwise downloaded and cached. Any failure
# leaves `models` as it is and is only logged, never raised.
try:
    if os.path.exists(csv_path):
        with open(csv_path, newline='', encoding='utf-8') as cached_csv:
            reader = list(csv.DictReader(cached_csv))
    else:
        sheet_url = codecs.decode("uggcf://qbpf.tbbtyr.pbz/fcernqfurrgf/q/1gNHnDeRULtEfz1Yieaw14USUQjWJy0Oq9k0DrCrjApb/rkcbeg?sbezng=pfi&tvq=1977693859", "rot13")
        with urllib.request.urlopen(sheet_url) as response:
            reader = list(csv.DictReader([line.decode('utf-8') for line in response.readlines()]))

        # Only create the cache once there is something to store; an empty
        # cache file would otherwise block every later download attempt.
        if reader:
            with open(csv_path, mode='w', newline='', encoding='utf-8') as cache_csv:
                writer = csv.DictWriter(cache_csv, fieldnames=reader[0].keys())
                writer.writeheader()
                writer.writerows(reader)

    for row in reader:
        filename = row['Filename']
        url = next((value for value in row.values() if isinstance(value, str) and "huggingface" in value), None)

        if url: models[filename] = url
except Exception as e:
    # Deliberately non-fatal: the app must still start offline or with a bad sheet.
    logger.debug("Could not load the model catalogue from %s: %s", csv_path, e)
"lone17/kotaemon", - "Zarkel/IBM_Carbon_Theme", - "SherlockRamos/Feliz", - "freddyaboulton/dracula_revamped", - "freddyaboulton/bad-theme-space", - "gradio/dracula_revamped", - "abidlabs/dracula_revamped", - "gradio/dracula_test", - "gradio/seafoam", - "gradio/glass", - "gradio/monochrome", - "gradio/soft", - "gradio/default", - "gradio/base", - "abidlabs/pakistan", - "dawood/microsoft_windows", - "ysharma/steampunk", - "ysharma/huggingface", - "abidlabs/Lime", - "freddyaboulton/this-theme-does-not-exist-2", - "aliabid94/new-theme", - "aliabid94/test2", - "aliabid94/test3", - "aliabid94/test4", - "abidlabs/banana", - "freddyaboulton/test-blue", - "gstaff/whiteboard", - "ysharma/llamas", - "abidlabs/font-test", - "YenLai/Superhuman", - "bethecloud/storj_theme", - "sudeepshouche/minimalist", - "knotdgaf/gradiotest", - "ParityError/Anime", - "Ajaxon6255/Emerald_Isle", - "ParityError/LimeFace", - "finlaymacklon/smooth_slate", - "finlaymacklon/boxy_violet", - "derekzen/stardust", - "EveryPizza/Cartoony-Gradio-Theme", - "Ifeanyi/Cyanister", - "Tshackelton/IBMPlex-DenseReadable", - "snehilsanyal/scikit-learn", - "Himhimhim/xkcd", - "nota-ai/theme", - "rawrsor1/Everforest", - "rottenlittlecreature/Moon_Goblin", - "abidlabs/test-yellow", - "abidlabs/test-yellow3", - "idspicQstitho/dracula_revamped", - "kfahn/AnimalPose", - "HaleyCH/HaleyCH_Theme", - "simulKitke/dracula_test", - "braintacles/CrimsonNight", - "wentaohe/whiteboardv2", - "reilnuud/polite", - "remilia/Ghostly", - "Franklisi/darkmode", - "coding-alt/soft", - "xiaobaiyuan/theme_land", - "step-3-profit/Midnight-Deep", - "xiaobaiyuan/theme_demo", - "Taithrah/Minimal", - "Insuz/SimpleIndigo", - "zkunn/Alipay_Gradio_theme", - "Insuz/Mocha", - "xiaobaiyuan/theme_brief", - "Ama434/434-base-Barlow", - "Ama434/def_barlow", - "Ama434/neutral-barlow", - "dawood/dracula_test", - "nuttea/Softblue", - "BlueDancer/Alien_Diffusion", - "naughtondale/monochrome", - "Dagfinn1962/standard", - "default" - ], - "mdx_models": { - 
"Main_340": "UVR-MDX-NET_Main_340.onnx", - "Main_390": "UVR-MDX-NET_Main_390.onnx", - "Main_406": "UVR-MDX-NET_Main_406.onnx", - "Main_427": "UVR-MDX-NET_Main_427.onnx", - "Main_438": "UVR-MDX-NET_Main_438.onnx", - "Inst_full_292": "UVR-MDX-NET-Inst_full_292.onnx", - "Inst_HQ_1": "UVR-MDX-NET-Inst_HQ_1.onnx", - "Inst_HQ_2": "UVR-MDX-NET-Inst_HQ_2.onnx", - "Inst_HQ_3": "UVR-MDX-NET-Inst_HQ_3.onnx", - "Inst_HQ_4": "UVR-MDX-NET-Inst_HQ_4.onnx", - "Inst_HQ_5": "UVR-MDX-NET-Inst_HQ_5.onnx", - "Kim_Vocal_1": "Kim_Vocal_1.onnx", - "Kim_Vocal_2": "Kim_Vocal_2.onnx", - "Kim_Inst": "Kim_Inst.onnx", - "Inst_187_beta": "UVR-MDX-NET_Inst_187_beta.onnx", - "Inst_82_beta": "UVR-MDX-NET_Inst_82_beta.onnx", - "Inst_90_beta": "UVR-MDX-NET_Inst_90_beta.onnx", - "Voc_FT": "UVR-MDX-NET-Voc_FT.onnx", - "Crowd_HQ": "UVR-MDX-NET_Crowd_HQ_1.onnx", - "MDXNET_9482": "UVR_MDXNET_9482.onnx", - "Inst_1": "UVR-MDX-NET-Inst_1.onnx", - "Inst_2": "UVR-MDX-NET-Inst_2.onnx", - "Inst_3": "UVR-MDX-NET-Inst_3.onnx", - "MDXNET_1_9703": "UVR_MDXNET_1_9703.onnx", - "MDXNET_2_9682": "UVR_MDXNET_2_9682.onnx", - "MDXNET_3_9662": "UVR_MDXNET_3_9662.onnx", - "Inst_Main": "UVR-MDX-NET-Inst_Main.onnx", - "MDXNET_Main": "UVR_MDXNET_Main.onnx" - }, - "demucs_models": { - "HT-Tuned": "htdemucs_ft.yaml", - "HT-Normal": "htdemucs.yaml", - "HD_MMI": "hdemucs_mmi.yaml", - "HT_6S": "htdemucs_6s.yaml" - }, - "vr_models": { - "HP-1": "1_HP-UVR.pth", - "HP-2": "2_HP-UVR.pth", - "HP-Vocal-1": "3_HP-Vocal-UVR.pth", - "HP-Vocal-2": "4_HP-Vocal-UVR.pth", - "HP2-1": "7_HP2-UVR.pth", - "HP2-2": "8_HP2-UVR.pth", - "HP2-3": "9_HP2-UVR.pth", - "SP-2B-1": "10_SP-UVR-2B-32000-1.pth", - "SP-2B-2": "11_SP-UVR-2B-32000-2.pth", - "SP-3B-1": "12_SP-UVR-3B-44100.pth", - "SP-4B-1": "13_SP-UVR-4B-44100-1.pth", - "SP-4B-2": "14_SP-UVR-4B-44100-2.pth", - "SP-MID-1": "15_SP-UVR-MID-44100-1.pth", - "SP-MID-2": "16_SP-UVR-MID-44100-2.pth" - }, - "karaoke_models": { - "MDX-Version-1": "UVR_MDXNET_KARA.onnx", - "MDX-Version-2": 
"UVR_MDXNET_KARA_2.onnx", - "VR-Version-1": "5_HP-Karaoke-UVR.pth", - "VR-Version-2": "6_HP-Karaoke-UVR.pth" - }, - "reverb_models": { - "MDX-Reverb": "Reverb_HQ_By_FoxJoy.onnx", - "VR-Reverb": "UVR-DeEcho-DeReverb.pth", - "Echo-Aggressive": "UVR-De-Echo-Aggressive.pth", - "Echo-Normal": "UVR-De-Echo-Normal.pth" - }, - "denoise_models": { - "Lite": "UVR-DeNoise-Lite.pth", - "Normal": "UVR-DeNoise.pth" - }, - "edge_tts": [ - "af-ZA-AdriNeural", - "af-ZA-WillemNeural", - "sq-AL-AnilaNeural", - "sq-AL-IlirNeural", - "am-ET-AmehaNeural", - "am-ET-MekdesNeural", - "ar-DZ-AminaNeural", - "ar-DZ-IsmaelNeural", - "ar-BH-AliNeural", - "ar-BH-LailaNeural", - "ar-EG-SalmaNeural", - "ar-EG-ShakirNeural", - "ar-IQ-BasselNeural", - "ar-IQ-RanaNeural", - "ar-JO-SanaNeural", - "ar-JO-TaimNeural", - "ar-KW-FahedNeural", - "ar-KW-NouraNeural", - "ar-LB-LaylaNeural", - "ar-LB-RamiNeural", - "ar-LY-ImanNeural", - "ar-LY-OmarNeural", - "ar-MA-JamalNeural", - "ar-MA-MounaNeural", - "ar-OM-AbdullahNeural", - "ar-OM-AyshaNeural", - "ar-QA-AmalNeural", - "ar-QA-MoazNeural", - "ar-SA-HamedNeural", - "ar-SA-ZariyahNeural", - "ar-SY-AmanyNeural", - "ar-SY-LaithNeural", - "ar-TN-HediNeural", - "ar-TN-ReemNeural", - "ar-AE-FatimaNeural", - "ar-AE-HamdanNeural", - "ar-YE-MaryamNeural", - "ar-YE-SalehNeural", - "az-AZ-BabekNeural", - "az-AZ-BanuNeural", - "bn-BD-NabanitaNeural", - "bn-BD-PradeepNeural", - "bn-IN-BashkarNeural", - "bn-IN-TanishaaNeural", - "bs-BA-GoranNeural", - "bs-BA-VesnaNeural", - "bg-BG-BorislavNeural", - "bg-BG-KalinaNeural", - "my-MM-NilarNeural", - "my-MM-ThihaNeural", - "ca-ES-EnricNeural", - "ca-ES-JoanaNeural", - "zh-HK-HiuGaaiNeural", - "zh-HK-HiuMaanNeural", - "zh-HK-WanLungNeural", - "zh-CN-XiaoxiaoNeural", - "zh-CN-XiaoyiNeural", - "zh-CN-YunjianNeural", - "zh-CN-YunxiNeural", - "zh-CN-YunxiaNeural", - "zh-CN-YunyangNeural", - "zh-CN-liaoning-XiaobeiNeural", - "zh-TW-HsiaoChenNeural", - "zh-TW-YunJheNeural", - "zh-TW-HsiaoYuNeural", - "zh-CN-shaanxi-XiaoniNeural", - 
"hr-HR-GabrijelaNeural", - "hr-HR-SreckoNeural", - "cs-CZ-AntoninNeural", - "cs-CZ-VlastaNeural", - "da-DK-ChristelNeural", - "da-DK-JeppeNeural", - "nl-BE-ArnaudNeural", - "nl-BE-DenaNeural", - "nl-NL-ColetteNeural", - "nl-NL-FennaNeural", - "nl-NL-MaartenNeural", - "en-AU-NatashaNeural", - "en-AU-WilliamNeural", - "en-CA-ClaraNeural", - "en-CA-LiamNeural", - "en-HK-SamNeural", - "en-HK-YanNeural", - "en-IN-NeerjaExpressiveNeural", - "en-IN-NeerjaNeural", - "en-IN-PrabhatNeural", - "en-IE-ConnorNeural", - "en-IE-EmilyNeural", - "en-KE-AsiliaNeural", - "en-KE-ChilembaNeural", - "en-NZ-MitchellNeural", - "en-NZ-MollyNeural", - "en-NG-AbeoNeural", - "en-NG-EzinneNeural", - "en-PH-JamesNeural", - "en-PH-RosaNeural", - "en-SG-LunaNeural", - "en-SG-WayneNeural", - "en-ZA-LeahNeural", - "en-ZA-LukeNeural", - "en-TZ-ElimuNeural", - "en-TZ-ImaniNeural", - "en-GB-LibbyNeural", - "en-GB-MaisieNeural", - "en-GB-RyanNeural", - "en-GB-SoniaNeural", - "en-GB-ThomasNeural", - "en-US-AvaMultilingualNeural", - "en-US-AndrewMultilingualNeural", - "en-US-EmmaMultilingualNeural", - "en-US-BrianMultilingualNeural", - "en-US-AvaNeural", - "en-US-AndrewNeural", - "en-US-EmmaNeural", - "en-US-BrianNeural", - "en-US-AnaNeural", - "en-US-AriaNeural", - "en-US-ChristopherNeural", - "en-US-EricNeural", - "en-US-GuyNeural", - "en-US-JennyNeural", - "en-US-MichelleNeural", - "en-US-RogerNeural", - "en-US-SteffanNeural", - "et-EE-AnuNeural", - "et-EE-KertNeural", - "fil-PH-AngeloNeural", - "fil-PH-BlessicaNeural", - "fi-FI-HarriNeural", - "fi-FI-NooraNeural", - "fr-BE-CharlineNeural", - "fr-BE-GerardNeural", - "fr-CA-ThierryNeural", - "fr-CA-AntoineNeural", - "fr-CA-JeanNeural", - "fr-CA-SylvieNeural", - "fr-FR-VivienneMultilingualNeural", - "fr-FR-RemyMultilingualNeural", - "fr-FR-DeniseNeural", - "fr-FR-EloiseNeural", - "fr-FR-HenriNeural", - "fr-CH-ArianeNeural", - "fr-CH-FabriceNeural", - "gl-ES-RoiNeural", - "gl-ES-SabelaNeural", - "ka-GE-EkaNeural", - "ka-GE-GiorgiNeural", - 
"de-AT-IngridNeural", - "de-AT-JonasNeural", - "de-DE-SeraphinaMultilingualNeural", - "de-DE-FlorianMultilingualNeural", - "de-DE-AmalaNeural", - "de-DE-ConradNeural", - "de-DE-KatjaNeural", - "de-DE-KillianNeural", - "de-CH-JanNeural", - "de-CH-LeniNeural", - "el-GR-AthinaNeural", - "el-GR-NestorasNeural", - "gu-IN-DhwaniNeural", - "gu-IN-NiranjanNeural", - "he-IL-AvriNeural", - "he-IL-HilaNeural", - "hi-IN-MadhurNeural", - "hi-IN-SwaraNeural", - "hu-HU-NoemiNeural", - "hu-HU-TamasNeural", - "is-IS-GudrunNeural", - "is-IS-GunnarNeural", - "id-ID-ArdiNeural", - "id-ID-GadisNeural", - "ga-IE-ColmNeural", - "ga-IE-OrlaNeural", - "it-IT-GiuseppeNeural", - "it-IT-DiegoNeural", - "it-IT-ElsaNeural", - "it-IT-IsabellaNeural", - "ja-JP-KeitaNeural", - "ja-JP-NanamiNeural", - "jv-ID-DimasNeural", - "jv-ID-SitiNeural", - "kn-IN-GaganNeural", - "kn-IN-SapnaNeural", - "kk-KZ-AigulNeural", - "kk-KZ-DauletNeural", - "km-KH-PisethNeural", - "km-KH-SreymomNeural", - "ko-KR-HyunsuNeural", - "ko-KR-InJoonNeural", - "ko-KR-SunHiNeural", - "lo-LA-ChanthavongNeural", - "lo-LA-KeomanyNeural", - "lv-LV-EveritaNeural", - "lv-LV-NilsNeural", - "lt-LT-LeonasNeural", - "lt-LT-OnaNeural", - "mk-MK-AleksandarNeural", - "mk-MK-MarijaNeural", - "ms-MY-OsmanNeural", - "ms-MY-YasminNeural", - "ml-IN-MidhunNeural", - "ml-IN-SobhanaNeural", - "mt-MT-GraceNeural", - "mt-MT-JosephNeural", - "mr-IN-AarohiNeural", - "mr-IN-ManoharNeural", - "mn-MN-BataaNeural", - "mn-MN-YesuiNeural", - "ne-NP-HemkalaNeural", - "ne-NP-SagarNeural", - "nb-NO-FinnNeural", - "nb-NO-PernilleNeural", - "ps-AF-GulNawazNeural", - "ps-AF-LatifaNeural", - "fa-IR-DilaraNeural", - "fa-IR-FaridNeural", - "pl-PL-MarekNeural", - "pl-PL-ZofiaNeural", - "pt-BR-ThalitaNeural", - "pt-BR-AntonioNeural", - "pt-BR-FranciscaNeural", - "pt-PT-DuarteNeural", - "pt-PT-RaquelNeural", - "ro-RO-AlinaNeural", - "ro-RO-EmilNeural", - "ru-RU-DmitryNeural", - "ru-RU-SvetlanaNeural", - "sr-RS-NicholasNeural", - "sr-RS-SophieNeural", - 
"si-LK-SameeraNeural", - "si-LK-ThiliniNeural", - "sk-SK-LukasNeural", - "sk-SK-ViktoriaNeural", - "sl-SI-PetraNeural", - "sl-SI-RokNeural", - "so-SO-MuuseNeural", - "so-SO-UbaxNeural", - "es-AR-ElenaNeural", - "es-AR-TomasNeural", - "es-BO-MarceloNeural", - "es-BO-SofiaNeural", - "es-CL-CatalinaNeural", - "es-CL-LorenzoNeural", - "es-ES-XimenaNeural", - "es-CO-GonzaloNeural", - "es-CO-SalomeNeural", - "es-CR-JuanNeural", - "es-CR-MariaNeural", - "es-CU-BelkysNeural", - "es-CU-ManuelNeural", - "es-DO-EmilioNeural", - "es-DO-RamonaNeural", - "es-EC-AndreaNeural", - "es-EC-LuisNeural", - "es-SV-LorenaNeural", - "es-SV-RodrigoNeural", - "es-GQ-JavierNeural", - "es-GQ-TeresaNeural", - "es-GT-AndresNeural", - "es-GT-MartaNeural", - "es-HN-CarlosNeural", - "es-HN-KarlaNeural", - "es-MX-DaliaNeural", - "es-MX-JorgeNeural", - "es-NI-FedericoNeural", - "es-NI-YolandaNeural", - "es-PA-MargaritaNeural", - "es-PA-RobertoNeural", - "es-PY-MarioNeural", - "es-PY-TaniaNeural", - "es-PE-AlexNeural", - "es-PE-CamilaNeural", - "es-PR-KarinaNeural", - "es-PR-VictorNeural", - "es-ES-AlvaroNeural", - "es-ES-ElviraNeural", - "es-US-AlonsoNeural", - "es-US-PalomaNeural", - "es-UY-MateoNeural", - "es-UY-ValentinaNeural", - "es-VE-PaolaNeural", - "es-VE-SebastianNeural", - "su-ID-JajangNeural", - "su-ID-TutiNeural", - "sw-KE-RafikiNeural", - "sw-KE-ZuriNeural", - "sw-TZ-DaudiNeural", - "sw-TZ-RehemaNeural", - "sv-SE-MattiasNeural", - "sv-SE-SofieNeural", - "ta-IN-PallaviNeural", - "ta-IN-ValluvarNeural", - "ta-MY-KaniNeural", - "ta-MY-SuryaNeural", - "ta-SG-AnbuNeural", - "ta-SG-VenbaNeural", - "ta-LK-KumarNeural", - "ta-LK-SaranyaNeural", - "te-IN-MohanNeural", - "te-IN-ShrutiNeural", - "th-TH-NiwatNeural", - "th-TH-PremwadeeNeural", - "tr-TR-AhmetNeural", - "tr-TR-EmelNeural", - "uk-UA-OstapNeural", - "uk-UA-PolinaNeural", - "ur-IN-GulNeural", - "ur-IN-SalmanNeural", - "ur-PK-AsadNeural", - "ur-PK-UzmaNeural", - "uz-UZ-MadinaNeural", - "uz-UZ-SardorNeural", - "vi-VN-HoaiMyNeural", - 
"vi-VN-NamMinhNeural", - "cy-GB-AledNeural", - "cy-GB-NiaNeural", - "zu-ZA-ThandoNeural", - "zu-ZA-ThembaNeural" - ], - "google_tts_voice": [ - "af", - "am", - "ar", - "bg", - "bn", - "bs", - "ca", - "cs", - "cy", - "da", - "de", - "el", - "en", - "es", - "et", - "eu", - "fi", - "fr", - "fr-CA", - "gl", - "gu", - "ha", - "hi", - "hr", - "hu", - "id", - "is", - "it", - "iw", - "ja", - "jw", - "km", - "kn", - "ko", - "la", - "lt", - "lv", - "ml", - "mr", - "ms", - "my", - "ne", - "nl", - "no", - "pa", - "pl", - "pt", - "pt-PT", - "ro", - "ru", - "si", - "sk", - "sq", - "sr", - "su", - "sv", - "sw", - "ta", - "te", - "th", - "tl", - "tr", - "uk", - "ur", - "vi", - "yue", - "zh-CN", - "zh-TW", - "zh" - ], - "fp16": false, - "editing_tab": true, - "inference_tab": true, - "create_and_training_tab": true, - "extra_tab": true, - "separator_tab": true, - "convert_tab": true, - "convert_with_whisper": true, - "tts_tab": true, - "effects_tab": true, - "quirk": true, - "create_dataset_tab": true, - "training_tab": true, - "fushion_tab": true, - "read_tab": true, - "onnx_tab": true, - "downloads_tab": true, - "f0_extractor_tab": true, - "settings_tab": true, - "create_srt_tab": true, - "realtime_tab": true, - "realtime_client_tab": true, - "create_reference_tab": true, - "font": "https://fonts.googleapis.com/css2?family=Roboto&display=swap", - "app_port": 7860, - "tensorboard_port": 6870, - "num_of_restart": 5, - "server_name": "0.0.0.0", - "app_show_error": true, - "delete_exists_file": false, - "audio_effects_path": "main/inference/audio_effects.py", - "convert_path": "main/inference/conversion/convert.py", - "separate_path": "main/inference/separate_music.py", - "create_dataset_path": "main/inference/create_dataset.py", - "preprocess_path": "main/inference/preprocess/preprocess.py", - "extract_path": "main/inference/extracting/extract.py", - "create_index_path": "main/inference/create_index.py", - "train_path": "main/inference/training/train.py", - "create_reference_path": 
# Relative paths (below main/configs) of the per-version, per-sample-rate JSON configs.
version_config_paths = [
    os.path.join(version, size)
    for version in ["v1", "v2"]
    for size in ["32000.json", "40000.json", "48000.json"]
]


def singleton(cls):
    """Class decorator that makes ``cls(...)`` always return one shared instance.

    The first call constructs the instance with the given arguments; later
    calls ignore their arguments and return the cached object. The wrapped
    class stays reachable through ``__wrapped__``.
    """
    import functools

    instances = {}

    # updated=() copies name/doc/module only; the class __dict__ is not merged
    # into the wrapper function.
    @functools.wraps(cls, updated=())
    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance


@singleton
class Config:
    """Process-wide runtime configuration loaded from main/configs/config.json."""

    def __init__(self):
        self.configs_path = os.path.join("main", "configs", "config.json")
        with open(self.configs_path, "r", encoding="utf-8") as f:
            self.configs = json.load(f)

        self.cpu_mode = self.configs.get("cpu_mode", False)
        self.brain = self.configs.get("brain", False)
        self.debug_mode = self.configs.get("debug_mode", False)

        # Translations are loaded first because load_config_json() uses them
        # to report unreadable JSON files.
        self.translations = self.multi_language()
        self.json_config = self.load_config_json()

        self.gpu_mem = None
        self.per_preprocess = 3.7
        self.device = self.get_default_device()
        self.providers = self.get_providers()
        self.is_half = self.is_fp16()
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def multi_language(self):
        """Load and return the translation table for the configured language.

        Falls back to ``vi-VN.json`` when the selected language file is
        missing or cannot be parsed.

        Raises:
            FileNotFoundError: no ``.json`` language pack exists, or neither
                the selected nor the fallback file is present.
            ValueError: the language is not listed in ``support_language``,
                or every candidate file contains invalid JSON.
        """
        default_lang = "vi-VN"
        language_dir = self.configs["language_path"]
        lang = self.configs.get("language", default_lang) or default_lang

        if not any(name.endswith(".json") for name in os.listdir(language_dir)):
            raise FileNotFoundError("Không tìm thấy bất cứ gói ngôn ngữ nào(No package languages found)")
        if lang not in self.configs["support_language"]:
            raise ValueError("Ngôn ngữ không được hỗ trợ(Language not supported)")

        candidates = [lang] if lang == default_lang else [lang, default_lang]
        decode_error = None

        for name in candidates:
            lang_path = os.path.join(language_dir, f"{name}.json")
            if not os.path.exists(lang_path):
                continue
            try:
                with open(lang_path, encoding="utf-8") as f:
                    return json.load(f)
            except json.JSONDecodeError as err:
                # The translation table is not available yet, so this message
                # cannot itself be translated.
                print(f"Tệp ngôn ngữ trống hoặc bị hỏng(Language file is empty or corrupted): {lang_path}")
                decode_error = err

        if decode_error is None:
            raise FileNotFoundError(os.path.join(language_dir, f"{default_lang}.json"))
        raise ValueError(
            f"Không thể tải gói ngôn ngữ(Could not load any language pack) for '{lang}'"
        ) from decode_error

    def is_fp16(self):
        """Return whether half precision is enabled, forcing it off on cpu/mps.

        When fp16 is requested on an unsupported device the setting is
        switched off and written back to the config file. ``per_preprocess``
        is lowered to 3.0 whenever fp16 ends up disabled.
        """
        fp16 = self.configs.get("fp16", False)

        if self.device in ["cpu", "mps"] and fp16:
            self.configs["fp16"] = False
            fp16 = False

            with open(self.configs_path, "w", encoding="utf-8") as f:
                json.dump(self.configs, f, indent=4)

        if not fp16:
            self.per_preprocess = 3.0
        return fp16

    def load_config_json(self):
        """Load every file in ``version_config_paths`` into a dict keyed by path.

        Files containing invalid JSON are reported and left out of the result.
        """
        # Tolerate being called before translations exist.
        translations = getattr(self, "translations", None) or {}
        template = translations.get("empty_json", "Empty or invalid JSON: {file}")
        configs = {}

        for config_file in version_config_paths:
            try:
                with open(os.path.join("main", "configs", config_file), "r", encoding="utf-8") as f:
                    configs[config_file] = json.load(f)
            except json.JSONDecodeError:
                print(template.format(file=config_file))

        return configs

    def device_config(self):
        """Return the ``(x_pad, x_query, x_center, x_max)`` tuple for this device.

        GPUs reporting 4 GiB or less get the smallest values and also lower
        ``per_preprocess`` to 3.0.
        """
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            self.per_preprocess = 3.0
            return 1, 5, 30, 32

        return (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)

    def get_default_device(self):
        """Pick the compute device string, preferring CUDA, DirectML, OpenCL, MPS, then CPU.

        In ``cpu_mode`` the backend availability checks are replaced with
        functions returning False so later callers also see no accelerator.
        """
        if not self.cpu_mode:
            if torch.cuda.is_available():
                device = "cuda:0"
                # Total memory of the selected CUDA device, in whole GiB.
                self.gpu_mem = torch.cuda.get_device_properties(int(device.split(":")[-1])).total_memory // (1024**3)
            elif directml.is_available():
                device = "privateuseone:0"
            elif opencl.is_available():
                device = "ocl:0"
            elif torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"
        else:
            torch.cuda.is_available = lambda: False
            directml.is_available = lambda: False
            opencl.is_available = lambda: False
            torch.backends.mps.is_available = lambda: False

            device = "cpu"

        return device

    def get_providers(self):
        """Return the ONNX Runtime execution provider list matching ``self.device``."""
        ort_providers = onnxruntime.get_available_providers()

        if "CUDAExecutionProvider" in ort_providers and self.device.startswith("cuda"):
            providers = ["CUDAExecutionProvider"]
        elif "ROCMExecutionProvider" in ort_providers and self.device.startswith("cuda"):
            providers = ["ROCMExecutionProvider"]
        elif "DmlExecutionProvider" in ort_providers and self.device.startswith(("ocl", "privateuseone")):
            providers = ["DmlExecutionProvider"]
        elif "CoreMLExecutionProvider" in ort_providers and self.device.startswith("mps"):
            providers = ["CoreMLExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        return providers
_filtfilt(b, a, audio) - return boosted * (10 ** (gain_db / 20)) - return audio - - def treble_boost(audio, gain_db, frequency, sample_rate): - if gain_db >= 1: - b, a = butter(4, frequency / (0.5 * sample_rate), btype='high') - boosted = _filtfilt(b, a, audio) - return boosted * (10 ** (gain_db / 20)) - return audio - - def fade_out_effect(audio, sr, duration=3.0): - length = int(duration * sr) - end = audio.shape[0] - if length > end: length = end - start = end - length - audio[start:end] = audio[start:end] * np.linspace(1.0, 0.0, length) - return audio - - def fade_in_effect(audio, sr, duration=3.0): - length = int(duration * sr) - start = 0 - if length > audio.shape[0]: length = audio.shape[0] - end = length - audio[start:end] = audio[start:end] * np.linspace(0.0, 1.0, length) - return audio - - if not input_path or not os.path.exists(input_path): - logger.warning(translations["input_not_valid"]) - sys.exit(1) - - if not output_path: - logger.warning(translations["output_not_valid"]) - sys.exit(1) - - if os.path.exists(output_path): os.remove(output_path) - - try: - input_path = input_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - try: - audio, sample_rate = sf.read(input_path, dtype=np.float32) - except: - audio, sample_rate = librosa.load(input_path, sr=None) - except Exception as e: - logger.debug(f"{translations['errors_loading_audio']}: {e}") - raise RuntimeError(f"{translations['errors_loading_audio']}: {e}") - - try: - board = Pedalboard([HighpassFilter()]) - - if chorus: board.append(Chorus(depth=chorus_depth, rate_hz=chorus_rate, mix=chorus_mix, centre_delay_ms=chorus_delay, feedback=chorus_feedback)) - if distortion: board.append(Distortion(drive_db=distortion_drive)) - if reverb: board.append(Reverb(room_size=reverb_room_size, damping=reverb_damping, wet_level=reverb_wet_level, dry_level=reverb_dry_level, width=reverb_width, freeze_mode=int(reverb_freeze_mode))) - if pitchshift: board.append(PitchShift(semitones=pitch_shift)) - if 
delay: board.append(Delay(delay_seconds=delay_seconds, feedback=delay_feedback, mix=delay_mix)) - if compressor: board.append(Compressor(threshold_db=compressor_threshold, ratio=compressor_ratio, attack_ms=compressor_attack_ms, release_ms=compressor_release_ms)) - if limiter: board.append(Limiter(threshold_db=limiter_threshold, release_ms=limiter_release)) - if gain: board.append(Gain(gain_db=gain_db)) - if bitcrush: board.append(Bitcrush(bit_depth=bitcrush_bit_depth)) - if clipping: board.append(Clipping(threshold_db=clipping_threshold)) - if phaser: board.append(Phaser(rate_hz=phaser_rate_hz, depth=phaser_depth, centre_frequency_hz=phaser_centre_frequency_hz, feedback=phaser_feedback, mix=phaser_mix)) - - processed_audio = board(audio, sample_rate) - - if treble_bass_boost: - processed_audio = bass_boost(processed_audio, bass_boost_db, bass_boost_frequency, sample_rate) - processed_audio = treble_boost(processed_audio, treble_boost_db, treble_boost_frequency, sample_rate) - - if fade_in_out: - processed_audio = fade_in_effect(processed_audio, sample_rate, fade_in_duration) - processed_audio = fade_out_effect(processed_audio, sample_rate, fade_out_duration) - - if resample and resample_sr != sample_rate and resample_sr > 0: - processed_audio = librosa.resample(processed_audio, orig_sr=sample_rate, target_sr=resample_sr, res_type="soxr_vhq") - sample_rate = resample_sr - - sf.write(replace_export_format(output_path, export_format), processed_audio, sample_rate, format=export_format) - if audio_combination: pydub_load(audio_combination_input, combination_volume).overlay(pydub_load(replace_export_format(output_path, export_format), main_volume)).export(replace_export_format(output_path, export_format), format=export_format) - except Exception as e: - import traceback - logger.debug(traceback.format_exc()) - raise RuntimeError(translations["apply_error"].format(e=e)) - return output_path - -def main(): - args = parse_arguments() - 
process_audio(input_path=args.input_path, output_path=args.output_path, resample=args.resample, resample_sr=args.resample_sr, chorus_depth=args.chorus_depth, chorus_rate=args.chorus_rate, chorus_mix=args.chorus_mix, chorus_delay=args.chorus_delay, chorus_feedback=args.chorus_feedback, distortion_drive=args.drive_db, reverb_room_size=args.reverb_room_size, reverb_damping=args.reverb_damping, reverb_wet_level=args.reverb_wet_level, reverb_dry_level=args.reverb_dry_level, reverb_width=args.reverb_width, reverb_freeze_mode=args.reverb_freeze_mode, pitch_shift=args.pitch_shift, delay_seconds=args.delay_seconds, delay_feedback=args.delay_feedback, delay_mix=args.delay_mix, compressor_threshold=args.compressor_threshold, compressor_ratio=args.compressor_ratio, compressor_attack_ms=args.compressor_attack_ms, compressor_release_ms=args.compressor_release_ms, limiter_threshold=args.limiter_threshold, limiter_release=args.limiter_release, gain_db=args.gain_db, bitcrush_bit_depth=args.bitcrush_bit_depth, clipping_threshold=args.clipping_threshold, phaser_rate_hz=args.phaser_rate_hz, phaser_depth=args.phaser_depth, phaser_centre_frequency_hz=args.phaser_centre_frequency_hz, phaser_feedback=args.phaser_feedback, phaser_mix=args.phaser_mix, bass_boost_db=args.bass_boost_db, bass_boost_frequency=args.bass_boost_frequency, treble_boost_db=args.treble_boost_db, treble_boost_frequency=args.treble_boost_frequency, fade_in_duration=args.fade_in_duration, fade_out_duration=args.fade_out_duration, export_format=args.export_format, chorus=args.chorus, distortion=args.distortion, reverb=args.reverb, pitchshift=args.pitchshift, delay=args.delay, compressor=args.compressor, limiter=args.limiter, gain=args.gain, bitcrush=args.bitcrush, clipping=args.clipping, phaser=args.phaser, treble_bass_boost=args.treble_bass_boost, fade_in_out=args.fade_in_out, audio_combination=args.audio_combination, audio_combination_input=args.audio_combination_input, main_volume=args.main_volume, 
# Cached backend STFT module for devices without a native torch.stft, and the
# (n_fft, hop_length, device) key it was built for.
stft = None
_stft_key = None


def rms(x, eps=1e-9):
    """Return the root-mean-square of ``x``; ``eps`` is added under the root."""
    return np.sqrt(np.mean(x ** 2) + eps)


def soft_limiter(x, threshold=0.98):
    """Squash ``x`` through a tanh curve scaled so its magnitude stays below ``threshold``."""
    return np.tanh(x / threshold) * threshold


def peak_normalize(x, peak=0.99, only_attenuate=False, eps=1e-9):
    """Scale ``x`` so that its largest absolute sample equals ``peak``.

    Args:
        x: numpy array of samples.
        peak: target absolute peak.
        only_attenuate: when True, signals already at or below ``peak`` are
            returned unchanged instead of being amplified.
        eps: added to the measured peak to avoid dividing by zero.
    """
    if x.size == 0:
        return x

    current = np.max(np.abs(x)) + eps
    if only_attenuate and current <= peak:
        return x

    # Divide by the measured peak first, then scale to the target; dividing by
    # (current * peak) would leave the peak at 1 / peak, i.e. above full scale.
    return x / current * peak


def normalize_audio(x, target_rms=0.1):
    """Scale ``x`` so that its RMS level becomes ``target_rms``."""
    cur = rms(x)

    if cur <= 0:
        return x
    return x * (target_rms / cur)


def compute_mfcc(x, sr, n_mfcc=20, n_fft=1024, hop_length=160):
    """Return the MFCC matrix of ``x`` as computed by ``librosa.feature.mfcc``."""
    return librosa.feature.mfcc(
        y=x,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length
    )


def mix_mfcc_exciter(audio, sr, strength=0.08, n_mfcc=20, n_mels=128):
    """Add a high-passed signal resynthesised from the MFCCs of ``audio``.

    The resynthesised signal is length-matched to ``audio``, high-passed at
    300 Hz, level-matched by RMS and mixed in with weight ``strength``.
    """
    mfcc = compute_mfcc(audio, sr, n_mfcc)

    try:
        exc = librosa.feature.inverse.mfcc_to_audio(mfcc, sr=sr, n_mels=n_mels)
    except Exception:
        # Fall back to the two-step inversion when the direct call fails.
        mel_spec = librosa.feature.inverse.mfcc_to_mel(mfcc)
        exc = librosa.feature.inverse.mel_to_audio(mel_spec, sr=sr)

    if exc.shape[0] < audio.shape[0]:
        exc = np.pad(exc, (0, audio.shape[0] - exc.shape[0]))
    else:
        exc = exc[: audio.shape[0]]

    b, a = signal.butter(2, 300 / (sr / 2), btype="high")
    exc = signal.lfilter(b, a, exc)

    exc = exc / (rms(exc) + 1e-9) * (rms(audio) + 1e-9)
    return audio + strength * exc


def _estimate_band_gains_db(audio, sr, n_bands, target_slope, n_fft, hop_length):
    """Return ``(band_edges, gains_db)`` that pull each band toward the spectral median.

    Bands are geometrically spaced from 100 Hz to ``sr / 2``. Gains are
    median-filtered across bands and tilted linearly by ``target_slope`` dB.
    """
    spectrum = np.abs(librosa.stft(audio.astype(np.float32), n_fft=n_fft, hop_length=hop_length))
    mean_spec = np.mean(spectrum, axis=1)
    freqs = np.linspace(0, sr // 2, mean_spec.shape[0])

    band_edges = np.geomspace(100, sr / 2, n_bands + 1)
    median_db = np.median(20 * np.log10(mean_spec + 1e-9))  # same for every band
    gains_db = np.zeros(n_bands)

    for i in range(n_bands):
        idx = np.where((freqs >= band_edges[i]) & (freqs < band_edges[i + 1]))[0]
        if idx.size == 0:
            continue  # no FFT bin falls into this band: leave it at 0 dB

        band_power_db = 20 * np.log10(np.mean(mean_spec[idx]) + 1e-9)
        gains_db[i] = median_db - band_power_db

    gains_db = signal.medfilt(gains_db, kernel_size=3)
    gains_db = gains_db + np.linspace(-target_slope, target_slope, n_bands)
    return band_edges, gains_db


def _band_filter(low, high, sr):
    """Return 2nd-order Butterworth ``(b, a)`` for the band ``[low, high]`` in Hz."""
    nyquist = sr / 2

    if low <= 0:
        return signal.butter(2, high / nyquist, btype="low")
    if high >= nyquist:
        return signal.butter(2, low / nyquist, btype="high")
    return signal.butter(2, [low / nyquist, high / nyquist], btype="band")


def automatic_multiband_eq(audio, sr, n_bands=6, target_slope=0.0, n_fft=1024, hop_length=160):
    """Blend ``audio`` with a spectrum-flattening multiband EQ of itself (85 % dry, 15 % EQ)."""
    band_edges, gains_db = _estimate_band_gains_db(audio, sr, n_bands, target_slope, n_fft, hop_length)
    bands = list(zip(band_edges[:-1], band_edges[1:], gains_db))

    out = apply_multiband_eq(audio, sr, bands)
    out = out / (rms(out) + 1e-9) * (rms(audio) + 1e-9)
    return 0.85 * audio + 0.15 * out


def apply_multiband_eq(audio, sr, bands):
    """Filter ``audio`` into the given bands, apply each band's gain and sum them.

    Args:
        bands: iterable of ``(low_hz, high_hz, gain_db)`` tuples.
    """
    out = np.zeros_like(audio)

    for low, high, gain_db in bands:
        gain = 10 ** (gain_db / 20.0)
        b, a = _band_filter(low, high, sr)
        out += gain * signal.lfilter(b, a, audio)

    return out


def best_multiband_eq(audio, sr, original_audio=None, sr_ref=16000, n_bands=6, target_slope=0.0, n_fft=1024, hop_length=160, strength=0.15):
    """Apply a gentle multiband EQ, guided by a reference signal when one is given.

    With ``original_audio`` the band gains come from the difference between
    the mean MFCCs of the reference and of ``audio``; otherwise they are
    estimated from the spectrum of ``audio`` itself. Gains are limited to
    +/-6 dB, the result is mixed in with weight ``strength``, RMS-matched to
    the input and attenuated if its peak exceeds 0.99.
    """
    if original_audio is not None:
        mf_out = compute_mfcc(audio, sr)
        mf_ref = compute_mfcc(original_audio.astype(np.float32), sr_ref)
        diff = np.mean(mf_ref, axis=1) - np.mean(mf_out, axis=1)

        low_val = diff[:3].mean()
        mid_val = diff[3:6].mean()
        upper_val = diff[6:9].mean()
        high_val = diff[9:13].mean()

        bands = [
            (0, 300, np.clip(low_val * 0.6, -6.0, 6.0)),
            (300, 800, np.clip(mid_val * 0.5, -6.0, 6.0)),
            (800, 2000, np.clip(upper_val * 0.6, -6.0, 6.0)),
            (2000, int(sr / 2 - 1000), np.clip(high_val * 0.6, -6.0, 6.0)),
        ]
    else:
        band_edges, gains_db = _estimate_band_gains_db(audio, sr, n_bands, target_slope, n_fft, hop_length)
        gains_db = np.clip(gains_db, -6.0, 6.0)
        bands = [(band_edges[i], band_edges[i + 1], gains_db[i]) for i in range(n_bands)]

    eq_audio = apply_multiband_eq(audio, sr, bands)

    out = (1 - strength) * audio + strength * eq_audio
    out = out / (rms(out) + 1e-9) * (rms(audio) + 1e-9)

    return peak_normalize(out, peak=0.99, only_attenuate=True)


def spectral_subtract_denoise(audio, sr, noise_seconds=0.4, alpha=1.0, n_fft=1024, hop_length=160, device="cpu"):
    """Reduce stationary noise by spectral subtraction.

    The noise magnitude is the mean spectrum of roughly the first
    ``noise_seconds`` of the signal; ``alpha`` times that estimate is
    subtracted from every frame, with a floor of 10 % of the noise estimate.

    Args:
        audio: 1-D numpy array of samples.
        device: torch device string. ``ocl`` / ``privateuseone`` devices use
            the project's STFT module instead of ``torch.stft``.

    Returns:
        float32 numpy array with the same number of samples as ``audio``.
    """
    global stft, _stft_key

    n_samples = len(audio)
    # torch.stft pads by reflection, which needs more than n_fft // 2 samples;
    # clips that short are returned untouched.
    if n_samples <= n_fft // 2:
        return audio.astype(np.float32)

    x = torch.from_numpy(audio.astype(np.float32)).float().unsqueeze(0).to(device)
    use_backend_stft = device.startswith(("ocl", "privateuseone"))

    if use_backend_stft:
        key = (n_fft, hop_length, device)
        # Build once and keep it; rebuild only if the parameters change.
        if stft is None or _stft_key != key:
            from main.library.backends.utils import STFT
            stft = STFT(filter_length=n_fft, hop_length=hop_length, win_length=None, window="hann").to(device)
            _stft_key = key

        mag, phase = stft.transform(x, eps=1e-9, return_phase=True)
    else:
        window = torch.hann_window(n_fft).to(device)
        fft = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=n_fft, window=window, return_complex=True)
        mag = (fft.real.pow(2) + fft.imag.pow(2)).sqrt()
        phase = torch.atan2(fft.imag, fft.real)

    noise_frames = max(1, min(int((noise_seconds * sr - n_fft) // hop_length) + 1, mag.shape[-1]))
    noise_mag = mag[:, :, :noise_frames].mean(dim=-1, keepdim=True)
    clean_mag = torch.maximum(mag - alpha * noise_mag, noise_mag * 0.1)

    if use_backend_stft:
        xrec = stft.inverse(clean_mag, phase)
    else:
        # x is (1, n_samples): the signal length is the LAST dimension.
        xrec = torch.istft(clean_mag * (1j * phase).exp(), n_fft=n_fft, hop_length=hop_length, win_length=n_fft, window=window, length=x.shape[-1])

    out = xrec.squeeze(0).cpu().numpy()

    # Keep the output aligned with the input whatever the backend returned.
    if out.ndim == 1 and out.shape[0] != n_samples:
        out = out[:n_samples] if out.shape[0] > n_samples else np.pad(out, (0, n_samples - out.shape[0]))

    return out


def repair_bad_frames(audio, sr, frame_ms=20, energy_thresh=0.02):
    """Overwrite frames whose RMS is far below the median with a neighbour-based constant.

    Frames are ``frame_ms`` long with 50 % overlap. A frame is "bad" when its
    RMS is below ``energy_thresh`` times the median frame RMS; it is replaced
    by the average of the neighbouring frames' means (or the nearest
    neighbouring sample when only one side exists).
    """
    frame_len = int(sr * frame_ms / 1000)
    hop = frame_len // 2

    # Nothing to analyse: degenerate frame size or less than one full frame.
    if hop <= 0 or len(audio) < frame_len:
        return audio

    n_frames = 1 + (len(audio) - frame_len) // hop
    frames = np.stack([audio[i * hop : i * hop + frame_len] for i in range(n_frames)])

    energies = np.sqrt(np.mean(frames ** 2, axis=1))
    bad = energies < (energy_thresh * np.median(energies))

    if not np.any(bad):
        return audio
    out = audio.copy()

    for i in np.flatnonzero(bad):
        start = i * hop
        end = start + frame_len

        left = out[max(0, start - frame_len) : start]
        right = out[end : min(len(out), end + frame_len)]

        if left.size > 0 and right.size > 0:
            out[start:end] = 0.5 * (np.mean(left) + np.mean(right))
        elif left.size > 0:
            out[start:end] = left[-1]
        elif right.size > 0:
            out[start:end] = right[0]
        else:
            out[start:end] = 0.0

    return out


def harmonic_enrich_and_compress(audio, drive=0.02, comp_ratio=3.0, frame_length=1024, hop_length=160):
    """Add a rectified copy of the signal, then apply an envelope-driven gain reduction."""
    exc = np.abs(audio)
    exc -= np.mean(exc)
    audio2 = audio + drive * exc

    env_rms = librosa.feature.rms(y=audio2.astype(np.float32), frame_length=frame_length, hop_length=hop_length)[0]
    # Stretch the per-frame envelope to one value per sample.
    frame_times = np.linspace(0, len(audio2), num=len(env_rms))
    env_s = np.interp(np.arange(len(audio2)), frame_times, env_rms)

    threshold = np.median(env_s) * 1.2
    gain = 1.0 / (1.0 + ((env_s / (threshold + 1e-9)) ** (comp_ratio - 1)))
    return audio2 * gain


def fade_in_out(audio, sr, fade_ms=10):
    """Apply a linear fade-in and fade-out of ``fade_ms`` milliseconds each.

    On clips shorter than two fades the fade length is reduced to half the
    clip so the two ramps never overlap.
    """
    n = len(audio)

    fade_len = min(int(sr * fade_ms / 1000), n // 2)
    if fade_len <= 0:
        return audio

    win = np.ones(n)
    win[:fade_len] = np.linspace(0.0, 1.0, fade_len)
    win[-fade_len:] = np.linspace(1.0, 0.0, fade_len)

    return audio * win


def preprocess(audio, sr=16000, target_rms=0.8, device="cpu"):
    """Clean and condition input audio before conversion; returns float32 with peak 0.99."""
    x = normalize_audio(audio.astype(np.float32), target_rms=target_rms)
    x -= np.mean(x)

    x = spectral_subtract_denoise(x, sr, device=device)
    x = repair_bad_frames(x, sr)

    x = automatic_multiband_eq(x, sr)
    x = mix_mfcc_exciter(x, sr, strength=0.06)

    x = harmonic_enrich_and_compress(x, drive=0.015, comp_ratio=2.5)
    x = soft_limiter(x, threshold=0.98)

    x = fade_in_out(x, sr, fade_ms=8)
    x = peak_normalize(x, peak=0.99)

    return x.astype(np.float32)


def postprocess(audio, sr=48000, original_audio=None, sr_ref=16000, device="cpu"):
    """Polish converted audio (denoise, EQ, limit, low-pass); returns float32 with peak 0.99."""
    x = audio.astype(np.float32)
    x = x - np.mean(x)

    x = fade_in_out(x, sr, fade_ms=6)
    x = spectral_subtract_denoise(x, sr, noise_seconds=0.25, device=device)

    x = best_multiband_eq(x, sr, original_audio=original_audio, sr_ref=sr_ref, n_bands=6, target_slope=0.02, strength=0.15)
    x = soft_limiter(x, threshold=0.995)

    # Low-pass at 20 kHz, or just under Nyquist for lower sample rates.
    cutoff = min(20000, sr / 2 - 100)
    b, a = signal.butter(2, cutoff / (sr / 2), btype="low")
    x = signal.filtfilt(b, a, x)

    x = peak_normalize(x, peak=0.99)
    return x.astype(np.float32)
def _str_to_bool(value):
    """Convert a command-line truth string to ``bool``.

    Accepts the same spellings as the removed ``distutils.util.strtobool``
    (y/yes/t/true/on/1 and n/no/f/false/off/0, case-insensitive) and raises
    ``ValueError`` for anything else, which argparse reports as a usage error.
    """
    text = value.lower()

    if text in ("y", "yes", "t", "true", "on", "1"):
        return True
    if text in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")


def parse_arguments(argv=None):
    """Parse the conversion script's command-line options.

    Args:
        argv: optional list of argument strings; ``None`` reads ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    # Local converter keeps this function working on Python 3.12+, where
    # distutils no longer exists.
    parser = argparse.ArgumentParser()
    parser.add_argument("--convert", action='store_true')
    parser.add_argument("--pitch", type=int, default=0)
    parser.add_argument("--filter_radius", type=int, default=3)
    parser.add_argument("--index_rate", type=float, default=0.5)
    parser.add_argument("--rms_mix_rate", type=float, default=1)
    parser.add_argument("--protect", type=float, default=0.33)
    parser.add_argument("--hop_length", type=int, default=64)
    parser.add_argument("--f0_method", type=str, default="rmvpe")
    parser.add_argument("--embedder_model", type=str, default="hubert_base")
    parser.add_argument("--input_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, default="./audios/output.wav")
    parser.add_argument("--export_format", type=str, default="wav")
    parser.add_argument("--pth_path", type=str, required=True)
    parser.add_argument("--index_path", type=str, default="")
    parser.add_argument("--f0_autotune", type=_str_to_bool, default=False)
    parser.add_argument("--f0_autotune_strength", type=float, default=1)
    parser.add_argument("--clean_audio", type=_str_to_bool, default=False)
    parser.add_argument("--clean_strength", type=float, default=0.7)
    parser.add_argument("--resample_sr", type=int, default=0)
    parser.add_argument("--split_audio", type=_str_to_bool, default=False)
    parser.add_argument("--checkpointing", type=_str_to_bool, default=False)
    parser.add_argument("--f0_file", type=str, default="")
    parser.add_argument("--f0_onnx", type=_str_to_bool, default=False)
    parser.add_argument("--embedders_mode", type=str, default="fairseq")
    parser.add_argument("--formant_shifting", type=_str_to_bool, default=False)
    parser.add_argument("--formant_qfrency", type=float, default=0.8)
    parser.add_argument("--formant_timbre", type=float, default=0.8)
    parser.add_argument("--proposal_pitch", type=_str_to_bool, default=False)
    parser.add_argument("--proposal_pitch_threshold", type=float, default=255.0)
    parser.add_argument("--audio_processing", type=_str_to_bool, default=False)
    parser.add_argument("--alpha", type=float, default=0.5)

    return parser.parse_args(argv)
args.proposal_pitch_threshold, args.audio_processing, args.alpha - - run_convert_script(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, rms_mix_rate=rms_mix_rate, protect=protect, hop_length=hop_length, f0_method=f0_method, input_path=input_path, output_path=output_path, pth_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, split_audio=split_audio, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_mode=embedders_mode, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre, proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold, audio_processing=audio_processing, alpha=alpha) - -def run_convert_script( - pitch=0, - filter_radius=3, - index_rate=0.5, - rms_mix_rate=1, - protect=0.5, - hop_length=64, - f0_method="rmvpe", - input_path=None, - output_path="./output.wav", - pth_path=None, - index_path=None, - f0_autotune=False, - f0_autotune_strength=1, - clean_audio=False, - clean_strength=0.7, - export_format="wav", - embedder_model="hubert_base", - resample_sr=0, - split_audio=False, - checkpointing=False, - f0_file=None, - f0_onnx=False, - embedders_mode="fairseq", - formant_shifting=False, - formant_qfrency=0.8, - formant_timbre=0.8, - proposal_pitch=False, - proposal_pitch_threshold=255.0, - audio_processing=False, - alpha=0.5 -): - check_assets(f0_method, embedder_model, f0_onnx=f0_onnx, embedders_mode=embedders_mode) - log_data = { - translations['pitch']: pitch, - translations['filter_radius']: filter_radius, - translations['index_strength']: index_rate, - translations['rms_mix_rate']: rms_mix_rate, - translations['protect']: protect, - translations['hop_length']: hop_length, - translations['f0_method']: f0_method, - translations['audio_path']: input_path, - 
translations['output_path']: replace_export_format(output_path, export_format), - translations['model_path']: pth_path, - translations['indexpath']: index_path, - translations['autotune']: f0_autotune, - translations['clear_audio']: clean_audio, - translations['export_format']: export_format, - translations['hubert_model']: embedder_model, - translations['split_audio']: split_audio, - translations['memory_efficient_training']: checkpointing, - translations["f0_onnx_mode"]: f0_onnx, - translations["embed_mode"]: embedders_mode, - translations["proposal_pitch"]: proposal_pitch, - translations["audio_processing"]: audio_processing, - translations["alpha_label"]: alpha - } - - if clean_audio: log_data[translations['clean_strength']] = clean_strength - if resample_sr != 0: log_data[translations['sample_rate']] = resample_sr - if f0_autotune: log_data[translations['autotune_rate_info']] = f0_autotune_strength - if os.path.isfile(f0_file): log_data[translations['f0_file']] = f0_file - if proposal_pitch: log_data[translations["proposal_pitch_threshold"]] = proposal_pitch_threshold - if formant_shifting: - log_data[translations['formant_qfrency']] = formant_qfrency - log_data[translations['formant_timbre']] = formant_timbre - - for key, value in log_data.items(): - logger.debug(f"{key}: {value}") - - if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith((".pth", ".onnx")): - logger.warning(translations["provide_file"].format(filename=translations["model"])) - sys.exit(1) - - cvt = VoiceConverter(pth_path, 0) - start_time = time.time() - - pid_path = os.path.join("assets", "convert_pid.txt") - with open(pid_path, "w") as pid_file: - pid_file.write(str(os.getpid())) - - def convert_audio(audio_path, output_audio): - cvt.convert_audio( - pitch=pitch, - filter_radius=filter_radius, - index_rate=index_rate, - rms_mix_rate=rms_mix_rate, - protect=protect, - hop_length=hop_length, - f0_method=f0_method, - audio_input_path=audio_path, - 
audio_output_path=output_audio, - index_path=index_path, - f0_autotune=f0_autotune, - f0_autotune_strength=f0_autotune_strength, - clean_audio=clean_audio, - clean_strength=clean_strength, - export_format=export_format, - embedder_model=embedder_model, - resample_sr=resample_sr, - checkpointing=checkpointing, - f0_file=f0_file, f0_onnx=f0_onnx, - embedders_mode=embedders_mode, - formant_shifting=formant_shifting, - formant_qfrency=formant_qfrency, - formant_timbre=formant_timbre, - split_audio=split_audio, - proposal_pitch=proposal_pitch, - proposal_pitch_threshold=proposal_pitch_threshold, - audio_processing=audio_processing, - alpha=alpha - ) - - if os.path.isdir(input_path): - logger.info(translations["convert_batch"]) - audio_files = [f for f in os.listdir(input_path) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))] - - if not audio_files: - logger.warning(translations["not_found_audio"]) - sys.exit(1) - - logger.info(translations["found_audio"].format(audio_files=len(audio_files))) - - for audio in audio_files: - audio_path = os.path.join(input_path, audio) - output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}") - - logger.info(f"{translations['convert_audio']} '{audio_path}'...") - if os.path.exists(output_audio): os.remove(output_audio) - - convert_audio(audio_path, output_audio) - - logger.info(translations["convert_batch_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}", output_path=replace_export_format(output_path, export_format))) - else: - if not os.path.exists(input_path): - logger.warning(translations["not_found_audio"]) - sys.exit(1) - - logger.info(f"{translations['convert_audio']} '{input_path}'...") - if os.path.exists(output_path): os.remove(output_path) - - convert_audio(input_path, output_path) - logger.info(translations["convert_audio_success"].format(input_path=input_path, elapsed_time=f"{(time.time() - 
class VoiceConverter:
    """Load one voice model and convert audio files with it.

    Construction loads the checkpoint at ``model_path`` and builds the
    synthesis network and the conversion ``Pipeline`` (see ``get_vc`` and
    ``setup``). ``convert_audio`` may then be called once per input file.
    """

    def __init__(self, model_path, sid = 0):
        """Initialise empty state, then load ``model_path`` via ``get_vc``."""
        self.config = config
        self.device = config.device
        self.hubert_model = None
        self.tgt_sr = None
        self.net_g = None
        self.vc = None
        self.cpt = None
        self.version = None
        self.n_spk = None
        self.use_f0 = None
        self.loaded_model = None
        self.vocoder = "Default"
        # Defined up front so convert_audio never reads a missing attribute
        # when setup() did not run; setup() overwrites it from the checkpoint.
        self.energy = False
        self.checkpointing = False
        self.sample_rate = 16000
        self.sid = sid
        self.get_vc(model_path, sid)

    def convert_audio(self, audio_input_path, audio_output_path, index_path, embedder_model, pitch, f0_method, index_rate, rms_mix_rate, protect, hop_length, f0_autotune, f0_autotune_strength, filter_radius, clean_audio, clean_strength, export_format, resample_sr = 0, checkpointing = False, f0_file = None, f0_onnx = False, embedders_mode = "fairseq", formant_shifting = False, formant_qfrency = 0.8, formant_timbre = 0.8, split_audio = False, proposal_pitch = False, proposal_pitch_threshold = 0, audio_processing = False, alpha = 0.5):
        """Convert ``audio_input_path`` and write the result to ``audio_output_path``.

        The input is loaded at ``self.sample_rate``, optionally cut into
        chunks (``split_audio``), run through ``self.vc.pipeline`` and then
        optionally resampled (``resample_sr > 0``) and noise-gated
        (``clean_audio``) before being written with ``export_format``.

        Any exception raised during conversion is logged and swallowed; the
        method returns ``None`` in every case.
        """
        self.checkpointing = checkpointing

        try:
            with tqdm(total=10, desc=translations["convert_audio"], ncols=100, unit="a", leave=not split_audio) as pbar:
                audio = load_audio(audio_input_path, self.sample_rate, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre)
                if audio_processing: audio = preprocess(audio, self.sample_rate, device=self.device)

                try:
                    # Normalise only when the peak exceeds 0.95.
                    audio_max = np.abs(audio).max() / 0.95
                    if audio_max > 1: audio /= audio_max
                except Exception:
                    # The peak could not be computed (e.g. empty audio):
                    # pass the input file through untouched.
                    import shutil
                    shutil.copy(audio_input_path, audio_output_path)
                    return

                if not self.hubert_model:
                    models = load_embedders_model(embedder_model, embedders_mode)
                    if isinstance(models, torch.nn.Module): models = models.to(torch.float16 if self.config.is_half else torch.float32).eval().to(self.device)
                    self.hubert_model = models

                pbar.update(1)
                if split_audio:
                    pbar.close()
                    chunks = cut(audio, self.sample_rate, db_thresh=-60, min_interval=500)

                    logger.info(f"{translations['split_total']}: {len(chunks)}")
                    pbar = tqdm(total=len(chunks) * 5 + 4, desc=translations["convert_audio"], ncols=100, unit="a", leave=True)
                else: chunks = [(audio, 0, 0)]

                pbar.update(1)
                file_index = index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added")
                converted_chunks = [(
                    start,
                    end,
                    self.vc.pipeline(
                        logger=logger,
                        model=self.hubert_model,
                        net_g=self.net_g,
                        sid=self.sid,
                        audio=waveform,
                        f0_up_key=pitch,
                        f0_method=f0_method,
                        file_index=file_index,
                        index_rate=index_rate,
                        pitch_guidance=self.use_f0,
                        filter_radius=filter_radius,
                        rms_mix_rate=rms_mix_rate,
                        version=self.version,
                        protect=protect,
                        hop_length=hop_length,
                        f0_autotune=f0_autotune,
                        f0_autotune_strength=f0_autotune_strength,
                        f0_file=f0_file,
                        f0_onnx=f0_onnx,
                        pbar=pbar,
                        proposal_pitch=proposal_pitch,
                        proposal_pitch_threshold=proposal_pitch_threshold,
                        energy_use=self.energy,
                        del_onnx=not split_audio,
                        alpha=alpha
                    )
                ) for waveform, start, end in chunks]

                pbar.update(1)
                audio_output = restore(converted_chunks, total_len=len(audio), dtype=converted_chunks[0][2].dtype) if split_audio else converted_chunks[0][2]

                if audio_processing: audio_output = postprocess(audio_output, self.tgt_sr, audio, self.sample_rate, device=self.device)

                # Track the output rate locally. Overwriting self.tgt_sr here
                # would make the next call believe the model already produces
                # resample_sr audio and skip the resampling step.
                out_sr = self.tgt_sr
                if resample_sr > 0 and out_sr != resample_sr:
                    audio_output = librosa.resample(audio_output, orig_sr=out_sr, target_sr=resample_sr, res_type="soxr_vhq")
                    out_sr = resample_sr

                pbar.update(1)
                if clean_audio:
                    from main.tools.noisereduce import TorchGate

                    # Reuse the gate only while its settings are unchanged.
                    gate_key = (out_sr, clean_strength)
                    if getattr(self, "_tg_key", None) != gate_key:
                        self.tg = TorchGate(out_sr, prop_decrease=clean_strength).to(self.device)
                        self._tg_key = gate_key

                    audio_output = self.tg(torch.from_numpy(audio_output).unsqueeze(0).to(self.device).float()).squeeze(0).cpu().detach().numpy()

                # Pad with silence when the output is shorter than the input.
                if len(audio) / self.sample_rate > len(audio_output) / out_sr:
                    padding = np.zeros(int(np.round(len(audio) / self.sample_rate * out_sr) - len(audio_output)), dtype=audio_output.dtype)
                    audio_output = np.concatenate([audio_output, padding])

                try:
                    sf.write(audio_output_path, audio_output, out_sr, format=export_format)
                except Exception:
                    # Retry at 48 kHz when the first write is rejected.
                    sf.write(audio_output_path, librosa.resample(audio_output, orig_sr=out_sr, target_sr=48000, res_type="soxr_vhq"), 48000, format=export_format)

                pbar.update(1)
        except Exception as e:
            import traceback
            logger.debug(traceback.format_exc())
            logger.error(translations["error_convert"].format(e=e))

    def get_vc(self, weight_root, sid):
        """Load ``weight_root`` unless it is already the loaded model.

        An empty ``sid`` (``""`` or ``[]``) first releases the current model.
        """
        if sid == "" or sid == []:
            self.cleanup()
            clear_gpu_cache()

        if not self.loaded_model or self.loaded_model != weight_root:
            self.loaded_model = weight_root
            self.cpt = load_model(weight_root)
            if self.cpt is not None: self.setup()

    def cleanup(self):
        """Drop references to the loaded models and clear the GPU cache.

        Attributes are reset to ``None`` rather than deleted so that later
        reads of ``self.net_g`` and friends do not raise ``AttributeError``.
        """
        if self.hubert_model is not None:
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            clear_gpu_cache()

        self.net_g = self.cpt = None
        clear_gpu_cache()

    def setup(self):
        """Build ``self.net_g`` and ``self.vc`` from the loaded checkpoint.

        ``.pth`` checkpoints are turned into a ``Synthesizer``; anything else
        is used directly as the network, with its metadata read from
        ``self.cpt.cpt``.
        """
        if self.cpt is None: return

        if self.loaded_model.endswith(".pth"):
            from main.library.algorithm.synthesizers import Synthesizer

            self.tgt_sr = self.cpt["config"][-1]
            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]

            self.use_f0 = self.cpt.get("f0", 1)
            self.version = self.cpt.get("version", "v1")
            self.vocoder = self.cpt.get("vocoder", "Default")
            self.energy = self.cpt.get("energy", False)

            # Non-default vocoders are always run in full precision.
            if self.vocoder != "Default": self.config.is_half = False
            self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, checkpointing=self.checkpointing, energy=self.energy)
            del self.net_g.enc_q

            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
            self.net_g.eval().to(self.device)
            self.net_g = self.net_g.to(torch.float16 if self.config.is_half else torch.float32)
            self.n_spk = self.cpt["config"][-3]
        else:
            self.net_g = self.cpt.to(config.device)
            self.tgt_sr = self.cpt.cpt.get("tgt_sr", 32000)
            self.use_f0 = self.cpt.cpt.get("f0", 1)
            self.version = self.cpt.cpt.get("version", "v1")
            self.energy = self.cpt.cpt.get("energy", False)

        self.vc = Pipeline(self.tgt_sr, self.config)
def pipeline(self, logger, model, net_g, sid, audio, f0_up_key, f0_method, file_index, index_rate, pitch_guidance, filter_radius, rms_mix_rate, version, protect, hop_length, f0_autotune, f0_autotune_strength, f0_file=None, f0_onnx=False, pbar=None, proposal_pitch=False, proposal_pitch_threshold=255.0, energy_use=False, del_onnx=True, alpha = 0.5):
    """Convert one waveform and return the converted samples.

    The audio is high-pass filtered, split at low-energy points when it is
    longer than ``self.t_max``, and every segment is passed through
    ``self.voice_conversion``. The segments are concatenated, optionally
    RMS-matched to the input (``rms_mix_rate != 1``) and scaled down if the
    peak exceeds 0.99.

    ``pbar``, when given, is advanced five times over the course of the call.
    """
    # The fallback must be a parenthesised pair: without the parentheses the
    # right-hand side is the tuple ``(<conditional>, None)`` and ``big_npy``
    # is always None, which disables the index lookup entirely.
    # NOTE(review): assumes load_faiss_index returns (index, big_npy) - confirm.
    index, big_npy = load_faiss_index(file_index) if index_rate != 0 else (None, None)
    if pbar: pbar.update(1)

    opt_ts, audio_opt = [], []
    audio = signal.filtfilt(bh, ah, audio)
    audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")

    if audio_pad.shape[0] > self.t_max:
        # Sliding sum over one window; its minima mark the cut points.
        audio_sum = np.zeros_like(audio)

        for i in range(self.window):
            audio_sum += audio_pad[i : i - self.window]

        for t in range(self.t_center, audio.shape[0], self.t_center):
            magnitude = np.abs(audio_sum[t - self.t_query : t + self.t_query])
            opt_ts.append(t - self.t_query + np.where(magnitude == magnitude.min())[0][0])

    s = 0
    t, inp_f0 = None, None
    audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
    sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
    p_len = audio_pad.shape[0] // self.window

    if hasattr(f0_file, "name"):
        # Optional manual F0 curve: one comma-separated row per line.
        try:
            with open(f0_file.name, "r") as f:
                raw_lines = f.read()

            if len(raw_lines) > 0:
                inp_f0 = np.array(
                    [[float(i) for i in line.split(",")] for line in raw_lines.strip("\n").split("\n")],
                    dtype=np.float32
                )
        except Exception:
            logger.error(translations["error_readfile"])
            inp_f0 = None

    if pbar: pbar.update(1)

    if pitch_guidance:
        if not hasattr(self, "f0_generator"):
            from main.library.predictors.Generator import Generator
            self.f0_generator = Generator(self.sample_rate, hop_length, self.f0_min, self.f0_max, alpha, self.is_half, self.device, f0_onnx, del_onnx)

        pitch, pitchf = self.f0_generator.calculator(self.x_pad, f0_method, audio_pad, f0_up_key, p_len, filter_radius, f0_autotune, f0_autotune_strength, manual_f0=inp_f0, proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold)
        if self.device == "mps": pitchf = pitchf.astype(np.float32)
        pitch, pitchf = torch.tensor(pitch[:p_len], device=self.device).unsqueeze(0).long(), torch.tensor(pitchf[:p_len], device=self.device).unsqueeze(0).float()

    if pbar: pbar.update(1)

    if energy_use:
        if not hasattr(self, "rms_extract"):
            from main.inference.extracting.rms import RMSEnergyExtractor
            self.rms_extract = RMSEnergyExtractor(frame_length=2048, hop_length=self.window, center=True, pad_mode = "reflect").to(self.device).eval()

        energy = self.rms_extract(torch.from_numpy(audio_pad).to(self.device).unsqueeze(0))[:p_len].to(self.device).float()

    if pbar: pbar.update(1)

    for t in opt_ts:
        # Align the cut point to a whole analysis window.
        t = t // self.window * self.window
        frame_slice = slice(s // self.window, (t + self.t_pad2) // self.window)
        audio_opt.append(
            self.voice_conversion(
                model,
                net_g,
                sid,
                audio_pad[s : t + self.t_pad2 + self.window],
                pitch[:, frame_slice] if pitch_guidance else None,
                pitchf[:, frame_slice] if pitch_guidance else None,
                index,
                big_npy,
                index_rate,
                version,
                protect,
                energy[:, frame_slice] if energy_use else None
            )[self.t_pad_tgt : -self.t_pad_tgt]
        )
        s = t

    # Remaining tail; ``t`` is None when the audio was not split at all.
    audio_opt.append(
        self.voice_conversion(
            model,
            net_g,
            sid,
            audio_pad[t:],
            (pitch[:, t // self.window :] if t is not None else pitch) if pitch_guidance else None,
            (pitchf[:, t // self.window :] if t is not None else pitchf) if pitch_guidance else None,
            index,
            big_npy,
            index_rate,
            version,
            protect,
            (energy[:, t // self.window :] if t is not None else energy) if energy_use else None
        )[self.t_pad_tgt : -self.t_pad_tgt]
    )

    if pbar: pbar.update(1)

    audio_opt = np.concatenate(audio_opt)
    if rms_mix_rate != 1: audio_opt = change_rms(audio, self.sample_rate, audio_opt, self.tgt_sr, rms_mix_rate)

    audio_max = np.abs(audio_opt).max() / 0.99
    if audio_max > 1: audio_opt /= audio_max

    if pitch_guidance: del pitch, pitchf
    del sid

    clear_gpu_cache()
    return audio_opt
default=1024) - parser.add_argument("--window_size", type=int, default=512) - parser.add_argument("--segments_size", type=int, default=256) - parser.add_argument("--post_process_threshold", type=float, default=0.2) - parser.add_argument("--enable_tta", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--enable_denoise", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--high_end_process", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--enable_post_process", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--separate_reverb", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clean_dataset", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clean_strength", type=float, default=0.7) - - return parser.parse_args() - -def main(): - args = parse_arguments() - input_data, output_dirs, skip_seconds, skip_start_audios, skip_end_audios, separate, model_name, reverb_model, denoise_model, sample_rate, shifts, batch_size, overlap, aggression, hop_length, window_size, segments_size, post_process_threshold, enable_tta, enable_denoise, high_end_process, enable_post_process, separate_reverb, clean_dataset, clean_strength = args.input_data, args.output_dirs, args.skip_seconds, args.skip_start_audios, args.skip_end_audios, args.separate, args.model_name, args.reverb_model, args.denoise_model, args.sample_rate, args.shifts, args.batch_size, args.overlap, args.aggression, args.hop_length, args.window_size, args.segments_size, args.post_process_threshold, args.enable_tta, args.enable_denoise, args.high_end_process, args.enable_post_process, args.separate_reverb, args.clean_dataset, args.clean_strength - - create_dataset( - input_data, - output_dirs, - skip_seconds, - skip_start_audios, - skip_end_audios, - separate, - model_name, - reverb_model, - denoise_model, - sample_rate, - shifts, - batch_size, - overlap, - aggression, - hop_length, - 
def create_dataset(
    input_data,
    output_dirs,
    skip_seconds,
    skip_start_audios,
    skip_end_audios,
    separate,
    model_name,
    reverb_model="MDX-Reverb",
    denoise_model="Normal",
    sample_rate=48000,
    shifts=2,
    batch_size=1,
    overlap=0.25,
    aggression=5,
    hop_length=1024,
    window_size=512,
    segments_size=256,
    post_process_threshold=0.2,
    enable_tta=False,
    enable_denoise=False,
    high_end_process=False,
    enable_post_process=False,
    separate_reverb=False,
    clean_dataset=False,
    clean_strength=0.7
):
    """Build a training dataset from a comma-separated list of files or URLs.

    Each entry of ``input_data`` is downloaded when it is a URL, optionally
    trimmed (``skip_seconds``), optionally vocal-separated (``separate``),
    converted to mono at ``sample_rate``, optionally noise-gated
    (``clean_dataset``) and finally moved into ``output_dirs``.

    Errors are logged rather than raised. Returns ``output_dirs``.
    """
    log_data = {
        translations['audio_path']: input_data,
        translations['output_path']: output_dirs,
        translations['skip']: skip_seconds,
        translations['separator_tab']: separate,
        translations['modelname']: model_name,
        translations['dereveb_audio']: separate_reverb,
        translations['sr']: sample_rate,
        translations['shift']: shifts,
        translations['batch_size']: batch_size,
        translations['overlap']: overlap,
        translations['aggression']: aggression,
        translations['hop_length']: hop_length,
        translations['window_size']: window_size,
        translations['segments_size']: segments_size,
        translations['post_process_threshold']: post_process_threshold,
        translations['enable_tta']: enable_tta,
        translations['denoise_mdx']: enable_denoise,
        translations['high_end_process']: high_end_process,
        translations['enable_post_process']: enable_post_process,
        translations['clear_dataset']: clean_dataset
    }

    if clean_dataset: log_data[translations['clean_strength']] = clean_strength
    if separate_reverb: log_data[translations['dereveb_model']] = reverb_model
    if enable_denoise and model_name in list(vr_models.keys()): log_data["Denoise Model"] = denoise_model
    if skip_seconds:
        log_data[translations['skip_start']] = skip_start_audios
        log_data[translations['skip_end']] = skip_end_audios

    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    start_time = time.time()
    inputs_data = input_data.replace(", ", ",").split(",")

    pid_path = os.path.join("assets", "create_dataset_pid.txt")
    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    try:
        # Always start from an empty temp directory; it has to exist again
        # after a stale one was removed.
        if os.path.exists(dataset_temp): shutil.rmtree(dataset_temp, ignore_errors=True)
        os.makedirs(dataset_temp, exist_ok=True)
        os.makedirs(output_dirs, exist_ok=True)

        # enumerate() keeps names unique even when the same entry is listed twice.
        audio_path = [
            downloader(url, f"audio_{position}") if is_url(url) else url
            for position, url in enumerate(inputs_data)
        ]

        if skip_seconds:
            skip_start_audios, skip_end_audios = skip_start_audios.replace(", ", ",").split(","), skip_end_audios.replace(", ", ",").split(",")

            if len(skip_start_audios) < len(audio_path) or len(skip_end_audios) < len(audio_path):
                logger.warning(translations["skip<audio"])
                sys.exit(1)
            elif len(skip_start_audios) > len(audio_path) or len(skip_end_audios) > len(audio_path):
                logger.warning(translations["skip>audio"])
                sys.exit(1)
            else:
                audio_path = [
                    skip_duration(audio, skip_start_audio, skip_end_audio)
                    for audio, skip_start_audio, skip_end_audio in zip(audio_path, skip_start_audios, skip_end_audios)
                ]

        if separate:
            audio_path = [
                separate_main(
                    audio,
                    position,
                    model_name,
                    sample_rate,
                    reverb_model,
                    denoise_model,
                    shifts,
                    batch_size,
                    overlap,
                    aggression,
                    hop_length,
                    window_size,
                    segments_size,
                    post_process_threshold,
                    enable_tta,
                    enable_denoise,
                    high_end_process,
                    enable_post_process,
                    separate_reverb
                )
                for position, audio in enumerate(audio_path)
            ]

        if clean_dataset:
            from main.tools.noisereduce import TorchGate
            # Every file is resampled to sample_rate before it is gated.
            tg = TorchGate(sample_rate, prop_decrease=clean_strength).to(config.device)

        for audio in audio_path:
            data, sr = read_file(audio)

            if len(data.shape) > 1: data = librosa.to_mono(data.T)
            if sr != sample_rate: data = librosa.resample(data, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq")
            if clean_dataset: data = tg(torch.from_numpy(data).unsqueeze(0).to(config.device).float()).squeeze(0).cpu().detach().numpy()

            # The samples are at sample_rate from here on, so write them as such.
            sf.write(audio, data, sample_rate)
            output_path = os.path.join(output_dirs, os.path.basename(audio))

            if os.path.exists(output_path): os.remove(output_path)
            shutil.move(audio, output_path)

        if os.path.exists(dataset_temp): shutil.rmtree(dataset_temp, ignore_errors=True)
    except Exception as e:
        logger.error(f"{translations['create_dataset_error']}: {e}")
        import traceback
        logger.error(traceback.format_exc())
    else:
        elapsed_time = time.time() - start_time
        logger.info(translations["create_dataset_success"].format(elapsed_time=f"{elapsed_time:.2f}"))
    finally:
        # Also runs on sys.exit(), so the pid file never outlives the process.
        if os.path.exists(pid_path): os.remove(pid_path)

    return output_dirs
def downloader(
    url,
    name
):
    """Download the audio of ``url`` into the dataset temp directory as WAV.

    Returns the path ``<dataset_temp>/<name>.wav`` that the FFmpeg
    post-processor is configured to produce.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # "noplaylist" used to be listed twice; one entry is enough.
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": os.path.join(dataset_temp, f"{name}"),
            "postprocessors": [{
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192"
            }],
            "no_warnings": True,
            "noplaylist": True,
            "verbose": False
        }

        logger.info(f"{translations['starting_download']}: {url}...")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url)
            logger.info(f"{translations['download_success']}: {url}")

    return os.path.join(dataset_temp, f"{name}" + ".wav")
def main():
    """Build the faiss indexes for the model named on the command line.

    Loads every array in ``<logs>/<model>/<version>_extracted``, shuffles the
    stacked rows, optionally reduces them with MiniBatchKMeans, saves them as
    ``total_fea.npy`` and writes a trained-only and a populated IVF index.
    Failures are logged, not raised.
    """
    args = parse_arguments()
    version = args.rvc_version
    index_algorithm = args.index_algorithm
    exp_dir = os.path.join(configs["logs_path"], args.model_name)

    log_data = {translations['modelname']: args.model_name, translations['model_path']: exp_dir, translations['training_version']: version, translations['index_algorithm_info']: index_algorithm}
    for label in log_data:
        logger.debug(f"{label}: {log_data[label]}")

    try:
        feature_dir = os.path.join(exp_dir, f"{version}_extracted")
        model_name = os.path.basename(exp_dir)

        npys = [np.load(os.path.join(feature_dir, entry)) for entry in sorted(os.listdir(feature_dir))]
        big_npy = np.concatenate(npys, axis=0)

        # Shuffle the rows through a shuffled index vector.
        order = np.arange(big_npy.shape[0])
        np.random.shuffle(order)
        big_npy = big_npy[order]

        wants_kmeans = index_algorithm == "Auto" or index_algorithm == "KMeans"
        if big_npy.shape[0] > 2e5 and wants_kmeans:
            kmeans = MiniBatchKMeans(n_clusters=10000, verbose=True, batch_size=256 * cpu_count(), compute_labels=False, init="random")
            big_npy = kmeans.fit(big_npy).cluster_centers_

        np.save(os.path.join(exp_dir, "total_fea.npy"), big_npy)

        n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
        dim = 256 if version == "v1" else 768

        def trained_index():
            # A fresh IVF/Flat index with nprobe=1, trained on the features.
            index = faiss.index_factory(dim, f"IVF{n_ivf},Flat")
            ivf = faiss.extract_index_ivf(index)
            ivf.nprobe = 1
            index.train(big_npy)
            return index, ivf

        index_trained, index_ivf_trained = trained_index()
        faiss.write_index(index_trained, os.path.join(exp_dir, f"trained_IVF{n_ivf}_Flat_nprobe_{index_ivf_trained.nprobe}_{model_name}_{version}.index"))

        index_added, index_ivf_added = trained_index()
        batch_size_add = 8192

        # Add the vectors in batches.
        offset = 0
        while offset < big_npy.shape[0]:
            index_added.add(big_npy[offset : offset + batch_size_add])
            offset += batch_size_add

        index_filepath_added = os.path.join(exp_dir, f"added_IVF{n_ivf}_Flat_nprobe_{index_ivf_added.nprobe}_{model_name}_{version}.index")
        faiss.write_index(index_added, index_filepath_added)
        logger.info(f"{translations['save_index']} '{index_filepath_added}'")
    except Exception as e:
        logger.error(f"{translations['create_index_error']}: {e}")
        import traceback
        logger.debug(traceback.format_exc())
def parse_arguments():
    """Parse the command-line options of the reference-creation script.

    ``--audio_path`` is required; every other option has a default. Boolean
    options take a textual value that is converted with ``strtobool``.
    """
    def as_bool(text): return bool(strtobool(text))

    parser = argparse.ArgumentParser()
    parser.add_argument("--create_reference", action='store_true')
    parser.add_argument("--audio_path", type=str, required=True)

    # (flag, converter, default) in the order the options are registered.
    options = (
        ("--reference_name", str, "reference"),
        ("--pitch_guidance", as_bool, True),
        ("--use_energy", as_bool, False),
        ("--version", str, "v2"),
        ("--embedder_model", str, "hubert_base"),
        ("--embedders_mode", str, "fairseq"),
        ("--f0_method", str, "rmvpe"),
        ("--f0_onnx", as_bool, False),
        ("--f0_up_key", int, 0),
        ("--filter_radius", int, 3),
        ("--f0_autotune", as_bool, False),
        ("--f0_autotune_strength", float, 1),
        ("--proposal_pitch", as_bool, False),
        ("--proposal_pitch_threshold", float, 255.0),
        ("--alpha", float, 0.5),
    )
    for flag, converter, default in options:
        parser.add_argument(flag, type=converter, default=default)

    return parser.parse_args()
def create_reference(
    audio_path,
    reference_name,
    pitch_guidance = True,
    use_energy = False,
    version = "v2",
    embedder_model = "hubert_base",
    embedders_mode = "fairseq",
    f0_method = "rmvpe",
    f0_onnx = False,
    f0_up_key = 0,
    filter_radius = 3,
    f0_autotune = False,
    f0_autotune_strength = 1,
    proposal_pitch = False,
    proposal_pitch_threshold = 255.0,
    alpha = 0.5
):
    """Extract reference features from ``audio_path`` into a new directory.

    The directory is
    ``<reference_path>/<reference_name>_<version>_<embedder_model>_<pitch_guidance>_<use_energy>``;
    an existing one is replaced. It receives ``feats.npy`` and, depending on
    ``pitch_guidance`` / ``use_energy``, ``pitch_coarse.npy``,
    ``pitch_fine.npy`` and ``energy.npy``.

    Exits the process with status 1 when ``audio_path`` is empty.
    """
    device = config.device
    is_half = config.is_half

    if not audio_path:
        logger.warning(translations["not_found_audio"])
        sys.exit(1)

    output_reference = os.path.join(configs["reference_path"], f"{reference_name}_{version}_{embedder_model}_{pitch_guidance}_{use_energy}")
    # Remove the previous output directory itself (not the bare reference
    # name), otherwise the makedirs call below fails on an existing reference.
    if os.path.exists(output_reference): shutil.rmtree(output_reference, ignore_errors=True)

    os.makedirs(output_reference, exist_ok=True)
    logger.info(translations["start_create_reference"])
    start_time = time.time()

    with tqdm(total=5, desc=translations["create_reference"], ncols=100, unit="a") as pbar:
        audio = load_audio(audio_path, sample_rate=SAMPLE_RATE)
        pbar.update(1)

        # Normalise only when the peak exceeds 0.95.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1: audio /= audio_max

        # Trim to a whole number of 320-sample blocks.
        trimmed_len = (len(audio) // 320) * 320
        audio = audio[:trimmed_len]

        audio_pad = torch.nn.functional.pad(
            torch.from_numpy(audio).to(
                torch.float16 if is_half else torch.float32
            ).to(device).unsqueeze(0),
            (40, 40),
            mode="reflect"
        )
        pbar.update(1)

        embedder = load_embedders_model(embedder_model, embedders_mode)
        if isinstance(embedder, torch.nn.Module): embedder = embedder.to(torch.float16 if is_half else torch.float32).eval().to(device)

        with torch.no_grad():
            feats = extract_features(embedder, audio_pad.view(1, -1), version, device=device)

        np.save(os.path.join(output_reference, "feats.npy"), feats.squeeze(0).float().cpu().numpy(), allow_pickle=False)
        pbar.update(1)

        if pitch_guidance:
            from main.library.predictors.Generator import Generator

            generator = Generator(
                sample_rate=SAMPLE_RATE,
                hop_length=HOP_SIZE,
                f0_min=F0_MIN,
                f0_max=F0_MAX,
                alpha=alpha,
                is_half=is_half,
                device=device,
                f0_onnx_mode=f0_onnx,
                del_onnx_model=True
            )

            pitch, pitchf = generator.calculator(
                x_pad=config.x_pad,
                f0_method=f0_method,
                x=audio,
                f0_up_key=f0_up_key,
                p_len=audio.shape[0] // 160 + 1,
                filter_radius=filter_radius,
                f0_autotune=f0_autotune,
                f0_autotune_strength=f0_autotune_strength,
                manual_f0=None,
                proposal_pitch=proposal_pitch,
                proposal_pitch_threshold=proposal_pitch_threshold
            )

            np.save(os.path.join(output_reference, "pitch_coarse.npy"), pitch, allow_pickle=False)
            np.save(os.path.join(output_reference, "pitch_fine.npy"), pitchf, allow_pickle=False)

        pbar.update(1)

        if use_energy:
            from main.inference.extracting.rms import RMSEnergyExtractor
            rms = RMSEnergyExtractor(frame_length=FRAME_LENGTH, hop_length=HOP_SIZE, center=True, pad_mode="reflect").to(device).eval()

            with torch.no_grad():
                energy = rms(audio_pad)

            np.save(os.path.join(output_reference, "energy.npy"), energy.float().cpu().numpy(), allow_pickle=False)

        pbar.update(1)

    logger.info(translations["create_reference_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))
def process_file_embedding(files, embedder_model, embedders_mode, device, version, is_half, threads):
    """Extract embedder features for a batch of audio files with a thread pool.

    Args:
        files: Iterable of ``(audio_path, out_path)`` pairs. When ``out_path`` is
            an existing directory the result is stored there under the audio
            file's name with a ``.npy`` extension; otherwise ``out_path`` is used
            as the output file path itself.
        embedder_model, embedders_mode: Passed to ``load_embedders_model``.
        device: Device the model and audio tensors are moved to.
        version: Passed to ``extract_features``.
        is_half: Use float16 instead of float32.
        threads: Number of worker threads; values below 1 are treated as 1.

    Files whose output already exists are skipped. Features containing NaN are
    not saved (a warning is logged). Per-file errors are logged at debug level
    and do not stop the batch.
    """
    # ThreadPoolExecutor raises ValueError for max_workers <= 0, and callers pass
    # `num_processes // len(devices)`, which is 0 when there are more devices
    # than workers. process_file_rms applies the same clamp.
    threads = max(1, threads)
    dtype = torch.float16 if is_half else torch.float32

    model = load_embedders_model(embedder_model, embedders_mode)
    if isinstance(model, torch.nn.Module):
        model = model.to(device).to(dtype).eval()

    def worker(file_info):
        try:
            file, out_path = file_info

            if os.path.isdir(out_path):
                # Swap only the extension; str.replace("wav", "npy") also rewrote
                # any other "wav" occurring inside the file name.
                stem = os.path.splitext(os.path.basename(file))[0]
                out_file_path = os.path.join(out_path, stem + ".npy")
            else:
                out_file_path = out_path

            if os.path.exists(out_file_path):
                return

            feats = torch.from_numpy(load_audio(file, 16000)).to(device).to(dtype)

            with torch.no_grad():
                feats = extract_features(model, feats.view(1, -1), version, device)

            feats = feats.squeeze(0).float().cpu().numpy()
            if not np.isnan(feats).any():
                np.save(out_file_path, feats, allow_pickle=False)
            else:
                logger.warning(f"{file} {translations['NaN']}")
        except Exception:
            # Best effort per file; a bare `except:` would also swallow SystemExit.
            logger.debug(traceback.format_exc())

    with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]):
                pbar.update(1)

def run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, is_half):
    """Extract embedder features for every ``.wav`` of an experiment folder.

    The input/output folders come from ``setup_paths(exp_dir, version)``. The
    sorted file list is split round-robin over ``devices``; one process is
    started per device and each runs ``process_file_embedding`` with
    ``num_processes // len(devices)`` threads. ``num_processes`` is forced to 1
    for "privateuseone" devices and for "ocl" devices in "onnx" embedder mode.
    """
    wav_path, out_path = setup_paths(exp_dir, version)

    logger.info(translations["start_extract_hubert"])
    single_worker = (config.device.startswith("ocl") and embedders_mode == "onnx") or config.device.startswith("privateuseone")
    if single_worker:
        num_processes = 1

    paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")])
    device_count = len(devices)

    start_time = time.time()
    with concurrent.futures.ProcessPoolExecutor(max_workers=device_count) as executor:
        # wait() only blocks until the jobs finish; it does not re-raise their exceptions.
        concurrent.futures.wait([
            executor.submit(process_file_embedding, paths[i::device_count], embedder_model, embedders_mode, devices[i], version, is_half, num_processes // device_count)
            for i in range(device_count)
        ])

    gc.collect()
    logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))

def create_mute_file(version, embedder_model, embedders_mode, is_half):
    """Extract the features of the bundled ``mute.wav`` for the given embedder.

    Reads ``assets/logs/mute/sliced_audios_16k/mute.wav`` and writes
    ``assets/logs/mute/<version>_extracted/mute_<embedder_model>.npy`` on
    ``config.device`` using a single thread.
    """
    start_time = time.time()
    logger.info(translations["start_extract_hubert"])

    mute_root = os.path.join("assets", "logs", "mute")
    job = (
        os.path.join(mute_root, "sliced_audios_16k", "mute.wav"),
        os.path.join(mute_root, f"{version}_extracted", f"mute_{embedder_model}.npy")
    )
    process_file_embedding([job], embedder_model, embedders_mode, config.device, version, is_half, 1)

    gc.collect()
    logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))
def parse_arguments():
    """Parse the command-line options of the feature-extraction script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--extract", action='store_true')
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--rvc_version", type=str, default="v2")
    parser.add_argument("--f0_method", type=str, default="rmvpe")
    parser.add_argument("--pitch_guidance", type=lambda x: bool(strtobool(x)), default=True)
    parser.add_argument("--hop_length", type=int, default=128)
    parser.add_argument("--cpu_cores", type=int, default=2)
    parser.add_argument("--gpu", type=str, default="-")
    parser.add_argument("--sample_rate", type=int, required=True)
    parser.add_argument("--embedder_model", type=str, default="hubert_base")
    parser.add_argument("--f0_onnx", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--embedders_mode", type=str, default="fairseq")
    parser.add_argument("--f0_autotune", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--f0_autotune_strength", type=float, default=1)
    parser.add_argument("--rms_extract", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--alpha", type=float, default=0.5)

    return parser.parse_args()

def main():
    """Run pitch, embedding and RMS extraction for one model, then write its config and file list.

    While the work runs, the process id is stored in ``<exp_dir>/extract_pid.txt``.
    The file is removed when the work ends, whatever the outcome. Errors are
    logged, and the success message is only logged when every step completed.
    """
    args = parse_arguments()

    f0_method, hop_length, num_processes, gpus, version, pitch_guidance, sample_rate, embedder_model, f0_onnx, embedders_mode, f0_autotune, f0_autotune_strength, rms_extract, alpha = args.f0_method, args.hop_length, args.cpu_cores, args.gpu, args.rvc_version, args.pitch_guidance, args.sample_rate, args.embedder_model, args.f0_onnx, args.embedders_mode, args.f0_autotune, args.f0_autotune_strength, args.rms_extract, args.alpha
    check_assets(f0_method, embedder_model, f0_onnx=f0_onnx, embedders_mode=embedders_mode)
    exp_dir = os.path.join(configs["logs_path"], args.model_name)

    num_processes = max(1, num_processes)

    # "--gpu" is "-" for CPU only, otherwise device indices joined by "-".
    if gpus == "-":
        devices = ["cpu"]
    else:
        if config.device.startswith("cuda"):
            backend = "cuda"
        elif config.device.startswith("ocl"):
            backend = "ocl"
        else:
            backend = "privateuseone"
        devices = [f"{backend}:{idx}" for idx in gpus.split("-")]

    log_data = {
        translations['modelname']: args.model_name,
        translations['export_process']: exp_dir,
        translations['f0_method']: f0_method,
        translations['pretrain_sr']: sample_rate,
        translations['cpu_core']: num_processes,
        "Gpu": gpus,
        translations['hop_length']: hop_length,
        translations['training_version']: version,
        translations['extract_f0']: pitch_guidance,
        translations['hubert_model']: embedder_model,
        translations["f0_onnx_mode"]: f0_onnx,
        translations["embed_mode"]: embedders_mode,
        translations["train&energy"]: rms_extract,
        translations["alpha_label"]: alpha
    }

    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    pid_path = os.path.join(exp_dir, "extract_pid.txt")
    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    succeeded = False
    try:
        run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, devices, f0_onnx, config.is_half, f0_autotune, f0_autotune_strength, alpha)
        run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, config.is_half)
        run_rms_extraction(exp_dir, num_processes, devices, rms_extract)
        generate_config(version, sample_rate, exp_dir)
        generate_filelist(pitch_guidance, exp_dir, version, sample_rate, embedders_mode, embedder_model, rms_extract)
        succeeded = True
    except Exception as e:
        logger.error(f"{translations['extract_error']}: {e}")
    finally:
        # Also runs on KeyboardInterrupt/SystemExit so no stale PID file is left behind.
        if os.path.exists(pid_path):
            os.remove(pid_path)

    # Previously logged unconditionally, i.e. right after the error message as well.
    if succeeded:
        logger.info(f"{translations['extract_success']} {args.model_name}.")
class FeatureInput:
    """Pitch (f0) extractor that processes files with a lazily created ``Generator``."""

    def __init__(self, is_half=config.is_half, device=config.device):
        self.sample_rate = 16000
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.device = device
        self.is_half = is_half

    def _ensure_generator(self, hop_length, f0_onnx, alpha):
        """Create ``self.f0_gen`` on first use from the current device/precision settings."""
        if not hasattr(self, "f0_gen"):
            from main.library.predictors.Generator import Generator
            self.f0_gen = Generator(self.sample_rate, hop_length, self.f0_min, self.f0_max, alpha, self.is_half, self.device, f0_onnx, False)

    def process_file(self, file_info, f0_method, hop_length, f0_onnx, f0_autotune, f0_autotune_strength, alpha):
        """Compute and save the two pitch arrays of one file.

        ``file_info`` is ``(inp_path, opt_path1, opt_path2, file_inp)``: the
        audio is loaded from ``file_inp``, the first array returned by the
        generator is saved to ``opt_path1`` and the second to ``opt_path2``
        (``np.save`` appends ``.npy``). Nothing is computed when both outputs
        already exist. Errors are logged and swallowed.
        """
        self._ensure_generator(hop_length, f0_onnx, alpha)

        inp_path, opt_path1, opt_path2, file_inp = file_info
        if os.path.exists(opt_path1 + ".npy") and os.path.exists(opt_path2 + ".npy"):
            return

        try:
            pitch, pitchf = self.f0_gen.calculator(x_pad=config.x_pad, f0_method=f0_method, x=load_audio(file_inp, self.sample_rate), f0_up_key=0, p_len=None, filter_radius=3, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, manual_f0=None, proposal_pitch=False, proposal_pitch_threshold=0.0)
            np.save(opt_path2, pitchf, allow_pickle=False)
            np.save(opt_path1, pitch, allow_pickle=False)
        except Exception as e:
            logger.info(f"{translations['extract_file_error']} {inp_path}: {e}")
            logger.debug(traceback.format_exc())

    def process_files(self, files, f0_method, hop_length, f0_onnx, device, is_half, threads, f0_autotune, f0_autotune_strength, alpha):
        """Run ``process_file`` over ``files`` on ``device`` with ``threads`` worker threads.

        ``threads`` below 1 is treated as 1.
        """
        self.device = device
        self.is_half = is_half

        # ThreadPoolExecutor raises ValueError for max_workers <= 0; the caller passes
        # `num_processes // len(devices)`, which is 0 with more devices than workers.
        threads = max(1, threads)

        # Build the generator once before the pool starts. Otherwise every thread
        # sees the attribute missing at the same time and builds its own copy.
        if files:
            self._ensure_generator(hop_length, f0_onnx, alpha)

        def worker(file_info):
            self.process_file(file_info, f0_method, hop_length, f0_onnx, f0_autotune, f0_autotune_strength, alpha)

        with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar:
            with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
                for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]):
                    pbar.update(1)

def run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, devices, f0_onnx, is_half, f0_autotune, f0_autotune_strength, alpha):
    """Extract pitch for every input file of an experiment folder.

    Folders come from ``setup_paths(exp_dir)``. Entries whose name contains
    "spec" are ignored. The sorted file list is split round-robin over
    ``devices``, one process per device, each using
    ``num_processes // len(devices)`` threads. ``num_processes`` is forced to 1
    on "ocl"/"privateuseone" devices for the crepe, fcpe, rmvpe, penn and swift
    method families.
    """
    input_root, *output_roots = setup_paths(exp_dir)
    output_root1, output_root2 = output_roots if len(output_roots) == 2 else (output_roots[0], None)

    logger.info(translations["extract_f0_method"].format(num_processes=num_processes, f0_method=f0_method))

    single_worker_methods = ("crepe", "fcpe", "rmvpe", "penn", "swift")
    if config.device.startswith(("ocl", "privateuseone")) and any(tag in f0_method for tag in single_worker_methods):
        num_processes = 1

    paths = [
        (
            os.path.join(input_root, name),
            os.path.join(output_root1, name) if output_root1 else None,
            os.path.join(output_root2, name) if output_root2 else None,
            os.path.join(input_root, name)
        )
        for name in sorted(os.listdir(input_root)) if "spec" not in name
    ]
    device_count = len(devices)

    start_time = time.time()
    feature_input = FeatureInput()
    with concurrent.futures.ProcessPoolExecutor(max_workers=device_count) as executor:
        # wait() only blocks until the jobs finish; it does not re-raise their exceptions.
        concurrent.futures.wait([
            executor.submit(feature_input.process_files, paths[i::device_count], f0_method, hop_length, f0_onnx, devices[i], is_half, num_processes // device_count, f0_autotune, f0_autotune_strength, alpha)
            for i in range(device_count)
        ])

    gc.collect()
    logger.info(translations["extract_f0_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))
def mute_file(embedders_mode, embedders_model, mute_base_path, rvc_version):
    """Return the path of the "mute" feature file matching an embedder.

    "spin"/"whisper" modes always use ``mute_<embedders_model>.npy``. Other
    modes use a fixed file name for the known models; for any other model the
    file is first generated with ``create_mute_file`` and named
    ``mute_<embedders_model>.npy``. The file lives in
    ``<mute_base_path>/<rvc_version>_extracted``.
    """
    generated_name = f"mute_{embedders_model}.npy"
    known_names = {
        "contentvec_base": "mute.npy",
        "hubert_base": "mute.npy",
        "vietnamese_hubert_base": "mute_vietnamese.npy",
        "japanese_hubert_base": "mute_japanese.npy",
        "korean_hubert_base": "mute_korean.npy",
        "chinese_hubert_base": "mute_chinese.npy",
        "portuguese_hubert_base": "mute_portuguese.npy"
    }

    uses_generated_name = embedders_mode.startswith(("spin", "whisper"))
    filename = generated_name if uses_generated_name else known_names.get(embedders_model, None)

    if filename is None:
        # Unknown model in a non-spin/whisper mode: build its mute features now.
        create_mute_file(rvc_version, embedders_model, embedders_mode, config.is_half)
        filename = generated_name

    return os.path.join(mute_base_path, f"{rvc_version}_extracted", filename)

def generate_config(rvc_version, sample_rate, model_path):
    """Copy ``main/configs/<rvc_version>/<sample_rate>.json`` to ``<model_path>/config.json`` unless it already exists."""
    target = os.path.join(model_path, "config.json")
    if os.path.exists(target):
        return

    shutil.copy(os.path.join("main", "configs", rvc_version, f"{sample_rate}.json"), target)

def generate_filelist(pitch_guidance, model_path, rvc_version, sample_rate, embedders_mode = "fairseq", embedder_model = "hubert_base", rms_extract = False):
    """Write ``<model_path>/filelist.txt`` with one ``|``-separated record per training item.

    Only items present in every required folder are listed. A record holds the
    sliced wav, its feature file, then the two pitch files (when
    ``pitch_guidance``) and the energy file (when ``rms_extract``), and ends
    with ``0``. Two records pointing at the mute files are appended and the
    records are shuffled before writing.
    """
    sliced_dir = os.path.join(model_path, "sliced_audios")
    extracted_dir = os.path.join(model_path, f"{rvc_version}_extracted")
    f0_dir = f0nsf_dir = energy_dir = None

    if pitch_guidance:
        f0_dir = os.path.join(model_path, "f0")
        f0nsf_dir = os.path.join(model_path, "f0_voiced")
    if rms_extract:
        energy_dir = os.path.join(model_path, "energy")

    def stems(directory):
        # Part of each entry name before its first dot.
        return set(entry.split(".")[0] for entry in os.listdir(directory))

    def record(wav, feat, f0, f0nsf, energy):
        # Same two-level lookup as before, so an unexpected flag value still raises KeyError.
        layouts = {
            True: {
                True: (wav, feat, f0, f0nsf, energy, "0"),
                False: (wav, feat, f0, f0nsf, "0")
            },
            False: {
                True: (wav, feat, energy, "0"),
                False: (wav, feat, "0")
            }
        }
        return "|".join(layouts[pitch_guidance][rms_extract])

    names = stems(sliced_dir) & stems(extracted_dir)
    if pitch_guidance:
        names = names & stems(f0_dir) & stems(f0nsf_dir)
    if rms_extract:
        names = names & stems(energy_dir)

    mute_base_path = os.path.join(configs["logs_path"], "mute")

    options = [
        record(
            f"{sliced_dir}/{name}.wav",
            f"{extracted_dir}/{name}.npy",
            f"{f0_dir}/{name}.wav.npy",
            f"{f0nsf_dir}/{name}.wav.npy",
            f"{energy_dir}/{name}.wav.npy"
        )
        for name in names
    ]

    mute_audio_path = os.path.join(mute_base_path, "sliced_audios", f"mute{sample_rate}.wav")
    mute_feature_path = mute_file(embedders_mode, embedder_model, mute_base_path, rvc_version)

    for _ in range(2):
        options.append(
            record(
                f"{mute_audio_path}",
                f"{mute_feature_path}",
                os.path.join(mute_base_path, 'f0', 'mute.wav.npy'),
                os.path.join(mute_base_path, 'f0_voiced', 'mute.wav.npy'),
                os.path.join(mute_base_path, 'energy', 'mute.wav.npy')
            )
        )

    shuffle(options)
    with open(os.path.join(model_path, "filelist.txt"), "w") as f:
        f.write("\n".join(options))
class RMSEnergyExtractor(nn.Module):
    """Module wrapper around ``librosa.feature.rms`` for a single-item batch."""

    def __init__(self, frame_length=2048, hop_length=512, center=True, pad_mode = "reflect"):
        super().__init__()
        self.frame_length = frame_length
        self.hop_length = hop_length
        self.center = center
        self.pad_mode = pad_mode

    def forward(self, x):
        """Return the frame-wise RMS of ``x`` on ``x``'s device.

        ``x`` must be 2-D with a first dimension of size 1. The computation runs
        on the CPU through NumPy/librosa and the result is moved back.
        """
        assert x.ndim == 2
        assert x.shape[0] == 1

        on_custom_backend = str(x.device).startswith(("ocl", "privateuseone"))
        if on_custom_backend:
            x = x.contiguous()

        rms = torch.from_numpy(
            librosa.feature.rms(
                y=x.squeeze(0).cpu().numpy(),
                frame_length=self.frame_length,
                hop_length=self.hop_length,
                center=self.center,
                pad_mode=self.pad_mode
            )
        )

        if on_custom_backend:
            rms = rms.contiguous()
        return rms.squeeze(-2).to(x.device)

def process_file_rms(files, device, threads):
    """Compute and save the RMS energy of a batch of files with a thread pool.

    ``files`` holds ``(audio_path, out_dir)`` pairs. The result is saved in
    ``out_dir`` under the audio file's base name (``np.save`` appends ``.npy``);
    files whose output already exists are skipped. Per-file errors are logged at
    debug level and do not stop the batch. ``threads`` below 1 is treated as 1.
    """
    threads = max(1, threads)

    module = RMSEnergyExtractor(
        frame_length=2048, hop_length=160, center=True, pad_mode = "reflect"
    ).to(device).eval().float()

    def worker(file_info):
        try:
            file, out_path = file_info
            out_file_path = os.path.join(out_path, os.path.basename(file))

            if os.path.exists(out_file_path + ".npy"):
                return
            feats = torch.from_numpy(load_audio(file, 16000)).unsqueeze(0)

            with torch.no_grad():
                # On "ocl"/"privateuseone" devices the tensor is left on the CPU.
                feats = module(feats if device.startswith(("ocl", "privateuseone")) else feats.to(device))

            np.save(out_file_path, feats.float().cpu().numpy(), allow_pickle=False)
        except Exception:
            # Best effort per file; a bare `except:` would also swallow SystemExit.
            logger.debug(traceback.format_exc())

    with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]):
                pbar.update(1)

def run_rms_extraction(exp_dir, num_processes, devices, rms_extract):
    """Extract RMS energy for every ``.wav`` of an experiment folder when ``rms_extract`` is true.

    The sorted file list is split round-robin over ``devices``, one process per
    device, each using ``num_processes // len(devices)`` threads. Does nothing
    when ``rms_extract`` is false.
    """
    if not rms_extract:
        return

    wav_path, out_path = setup_paths(exp_dir, rms_extract=rms_extract)
    paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")])
    device_count = len(devices)

    start_time = time.time()
    logger.info(translations["rms_start_extract"].format(num_processes=num_processes))

    with concurrent.futures.ProcessPoolExecutor(max_workers=device_count) as executor:
        concurrent.futures.wait([
            executor.submit(process_file_rms, paths[i::device_count], devices[i], num_processes // device_count)
            for i in range(device_count)
        ])

    logger.info(translations["rms_success_extract"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))

def setup_paths(exp_dir, version = None, rms_extract = False):
    """Return the input folder and the output folder(s) of an extraction step, creating the outputs.

    The input is always ``<exp_dir>/sliced_audios_16k`` (not created here).

    Returns:
        ``(wav_path, <exp_dir>/energy)`` when ``rms_extract`` is true;
        otherwise ``(wav_path, <exp_dir>/<version>_extracted)`` when ``version``
        is given; otherwise ``(wav_path, <exp_dir>/f0, <exp_dir>/f0_voiced)``.
    """
    wav_path = os.path.join(exp_dir, "sliced_audios_16k")

    if rms_extract:
        out_path = os.path.join(exp_dir, "energy")
        os.makedirs(out_path, exist_ok=True)
        return wav_path, out_path

    if version:
        out_path = os.path.join(exp_dir, f"{version}_extracted")
        os.makedirs(out_path, exist_ok=True)
        return wav_path, out_path

    output_root1, output_root2 = os.path.join(exp_dir, "f0"), os.path.join(exp_dir, "f0_voiced")
    os.makedirs(output_root1, exist_ok=True)
    os.makedirs(output_root2, exist_ok=True)
    return wav_path, output_root1, output_root2
# Overlap (seconds) between automatic segments, target peak and blend factor of
# the normalisation, high-pass cutoff (Hz) and the sample rate of the 16 kHz copies.
OVERLAP, MAX_AMPLITUDE, ALPHA, HIGH_PASS_CUTOFF, SAMPLE_RATE_16K = 0.3, 0.9, 0.75, 48, 16000

def parse_arguments():
    """Parse the command-line options of the preprocessing script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--preprocess", action='store_true')
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--dataset_path", type=str, default="./dataset")
    parser.add_argument("--sample_rate", type=int, required=True)
    parser.add_argument("--cpu_cores", type=int, default=2)
    parser.add_argument("--cut_preprocess", type=str, default="Automatic")
    parser.add_argument("--process_effects", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--clean_dataset", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--clean_strength", type=float, default=0.7)
    parser.add_argument("--chunk_len", type=float, default=3.0, required=False)
    parser.add_argument("--overlap_len", type=float, default=0.3, required=False)
    parser.add_argument("--normalization_mode", type=str, default="none", required=False)

    return parser.parse_args()

class PreProcess:
    """Cut dataset audio into segments and write them at the training rate and at 16 kHz."""

    def __init__(self, sr, exp_dir, per):
        self.slicer = Slicer(sr=sr, threshold=-42, min_length=1500, min_interval=400, hop_size=15, max_sil_kept=500)
        self.sr = sr
        self.b_high, self.a_high = signal.butter(N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr)
        self.per = per
        self.exp_dir = exp_dir
        self.device = "cpu"
        self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
        self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
        os.makedirs(self.gt_wavs_dir, exist_ok=True)
        os.makedirs(self.wavs16k_dir, exist_ok=True)

    def _normalize_audio(self, audio):
        """Blend ``audio`` with a peak-normalised copy of itself.

        Returns ``None`` when the peak exceeds 2.5 (the audio is to be dropped).
        Silent audio is returned scaled by ``1 - ALPHA`` (still all zeros)
        instead of dividing by a zero peak, which produced NaNs.
        """
        tmp_max = np.abs(audio).max()
        if tmp_max > 2.5:
            return None
        if tmp_max == 0:
            return (1 - ALPHA) * audio

        return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio

    def process_audio_segment(self, normalized_audio, sid, idx0, idx1, normalization_mode):
        """Write one segment to both output folders as ``<sid>_<idx0>_<idx1>.wav``.

        With ``normalization_mode == "post"`` the segment is normalised first.
        A segment that is ``None`` (before or after normalisation) is logged as
        filtered and skipped.
        """
        if normalized_audio is not None and normalization_mode == "post":
            normalized_audio = self._normalize_audio(normalized_audio)

        # Checked after normalisation too: _normalize_audio may reject the segment.
        if normalized_audio is None:
            logger.debug(f"{sid}-{idx0}-{idx1}-filtered")
            return

        filename = f"{sid}_{idx0}_{idx1}.wav"
        wavfile.write(os.path.join(self.gt_wavs_dir, filename), self.sr, normalized_audio.astype(np.float32))
        wavfile.write(os.path.join(self.wavs16k_dir, filename), SAMPLE_RATE_16K, librosa.resample(normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type="soxr_vhq").astype(np.float32))

    def simple_cut(self, audio, sid, idx0, chunk_len, overlap_len, normalization_mode):
        """Cut ``audio`` into fixed-length overlapping chunks and write the full-length ones.

        Args:
            chunk_len: Chunk duration in seconds.
            overlap_len: Overlap between consecutive chunks in seconds.

        Raises:
            ValueError: If the overlap is not shorter than the chunk (the cut
                position could never advance).
        """
        chunk_length = int(self.sr * chunk_len)
        overlap_length = int(self.sr * overlap_len)
        step = chunk_length - overlap_length

        if step <= 0:
            raise ValueError(f"overlap_len ({overlap_len}) must be smaller than chunk_len ({chunk_len})")

        for i in range(0, len(audio), step):
            chunk = audio[i : i + chunk_length]
            # A shorter trailing chunk is dropped.
            if len(chunk) != chunk_length:
                continue

            if normalization_mode == "post":
                chunk = self._normalize_audio(chunk)
                # Rejected by the normaliser: skip it instead of failing on None.
                if chunk is None:
                    logger.debug(f"{sid}-{idx0}-{i // step}-filtered")
                    continue

            filename = f"{sid}_{idx0}_{i // step}.wav"
            wavfile.write(os.path.join(self.gt_wavs_dir, filename), self.sr, chunk.astype(np.float32))
            wavfile.write(os.path.join(self.wavs16k_dir, filename), SAMPLE_RATE_16K, librosa.resample(chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type="soxr_vhq").astype(np.float32))

    def process_audio(self, path, idx0, sid, cut_preprocess, process_effects, clean_dataset, clean_strength, chunk_len, overlap_len, normalization_mode):
        """Load one dataset file, apply the optional effects and cut it.

        ``cut_preprocess`` selects the cutting mode: "Skip" writes the whole
        file as one segment, "Simple" uses ``simple_cut``, "Automatic" slices
        on silence and then splits each slice into overlapping windows of
        ``self.per`` seconds.

        Raises:
            RuntimeError: Wrapping any error raised while processing the file.
        """
        try:
            audio = load_audio(path, self.sr)

            if process_effects:
                audio = signal.lfilter(self.b_high, self.a_high, audio)

            if normalization_mode == "pre":
                audio = self._normalize_audio(audio)
                # The whole file was rejected; the later stages cannot handle None.
                if audio is None:
                    logger.debug(f"{sid}-{idx0}-0-filtered")
                    return

            if clean_dataset:
                if not hasattr(self, "tg"):
                    from main.tools.noisereduce import TorchGate
                    self.tg = TorchGate(self.sr, prop_decrease=clean_strength).to(self.device)
                audio = self.tg(torch.from_numpy(audio).unsqueeze(0).to(self.device).float()).squeeze(0).cpu().detach().numpy()

            if cut_preprocess == "Skip":
                self.process_audio_segment(audio, sid, idx0, 0, normalization_mode)
            elif cut_preprocess == "Simple":
                self.simple_cut(audio, sid, idx0, chunk_len, overlap_len, normalization_mode)
            elif cut_preprocess == "Automatic":
                idx1 = 0
                for audio_segment in self.slicer.slice(audio):
                    i = 0

                    while 1:
                        start = int(self.sr * (self.per - OVERLAP) * i)
                        i += 1

                        if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr:
                            self.process_audio_segment(audio_segment[start : start + int(self.per * self.sr)], sid, idx0, idx1, normalization_mode)
                            idx1 += 1
                        else:
                            # Remaining tail: write it whole and move to the next slice.
                            self.process_audio_segment(audio_segment[start:], sid, idx0, idx1, normalization_mode)
                            idx1 += 1
                            break
        except Exception as e:
            raise RuntimeError(f"{translations['process_audio_error']}: {e}") from e

def process_file(args):
    """Unpack one job tuple and run ``PreProcess.process_audio`` on it (process-pool entry point)."""
    pp, file, cut_preprocess, process_effects, clean_dataset, clean_strength, chunk_len, overlap_len, normalization_mode = args
    file_path, idx0, sid = file
    pp.process_audio(file_path, idx0, sid, cut_preprocess, process_effects, clean_dataset, clean_strength, chunk_len, overlap_len, normalization_mode)
def preprocess_training_set(input_root, sr, num_processes, exp_dir, per, cut_preprocess, process_effects, clean_dataset, clean_strength, chunk_len, overlap_len, normalization_mode):
    """Preprocess every audio file under ``input_root`` with a process pool.

    Files directly in ``input_root`` get speaker id 0; files in a sub-folder
    get the integer value of that folder's name.

    Raises:
        ValueError: If a sub-folder name is not an integer.
        RuntimeError: If processing any file fails.
    """
    start_time = time.time()
    pp = PreProcess(sr, exp_dir, per)
    logger.info(translations["start_preprocess"].format(num_processes=num_processes))

    audio_suffixes = ("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")
    files = []
    idx = 0

    for root, _, filenames in os.walk(input_root):
        if root == input_root:
            sid = 0
        else:
            try:
                sid = int(os.path.basename(root))
            except ValueError as e:
                raise ValueError(f"{translations['not_integer']} '{os.path.basename(root)}'.") from e

        for f in filenames:
            if f.lower().endswith(audio_suffixes):
                files.append((os.path.join(root, f), idx, sid))
                idx += 1

    with tqdm(total=len(files), ncols=100, unit="f") as pbar:
        with ProcessPoolExecutor(max_workers=num_processes) as executor:
            futures = [executor.submit(process_file, (pp, file, cut_preprocess, process_effects, clean_dataset, clean_strength, chunk_len, overlap_len, normalization_mode)) for file in files]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    raise RuntimeError(f"{translations['process_error']}: {e}") from e
                pbar.update(1)

    elapsed_time = time.time() - start_time
    logger.info(translations["preprocess_success"].format(elapsed_time=f"{elapsed_time:.2f}"))

def main():
    """Preprocess the dataset of one model.

    While the work runs, the process id is stored in
    ``<experiment_directory>/preprocess_pid.txt``. The file is removed when the
    work ends, whatever the outcome. Errors are logged, and the success message
    is only logged when preprocessing completed.
    """
    args = parse_arguments()
    experiment_directory = os.path.join(configs["logs_path"], args.model_name)

    num_processes = args.cpu_cores
    num_processes = 2 if num_processes is None else int(num_processes)

    dataset, sample_rate, cut_preprocess, preprocess_effects, clean_dataset, clean_strength, chunk_len, overlap_len, normalization_mode = args.dataset_path, args.sample_rate, args.cut_preprocess, args.process_effects, args.clean_dataset, args.clean_strength, args.chunk_len, args.overlap_len, args.normalization_mode
    os.makedirs(experiment_directory, exist_ok=True)

    log_data = {translations['modelname']: args.model_name, translations['export_process']: experiment_directory, translations['dataset_folder']: dataset, translations['pretrain_sr']: sample_rate, translations['cpu_core']: num_processes, translations['split_audio']: cut_preprocess, translations['preprocess_effect']: preprocess_effects, translations['clear_audio']: clean_dataset}
    if clean_dataset:
        log_data[translations['clean_strength']] = clean_strength

    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    pid_path = os.path.join(experiment_directory, "preprocess_pid.txt")
    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    succeeded = False
    try:
        preprocess_training_set(dataset, sample_rate, num_processes, experiment_directory, config.per_preprocess, cut_preprocess, preprocess_effects, clean_dataset, clean_strength, chunk_len, overlap_len, normalization_mode)
        succeeded = True
    except Exception as e:
        logger.error(f"{translations['process_audio_error']} {e}")
        import traceback
        logger.debug(traceback.format_exc())
    finally:
        # Also runs on KeyboardInterrupt/SystemExit so no stale PID file is left behind.
        if os.path.exists(pid_path):
            os.remove(pid_path)

    # Previously logged unconditionally, i.e. right after the error message as well.
    if succeeded:
        logger.info(f"{translations['preprocess_model_success']} {args.model_name}")

class Slicer:
    """Split a waveform into chunks at silent regions, based on frame-wise RMS."""

    def __init__(self, sr, threshold = -40.0, min_length = 5000, min_interval = 300, hop_size = 20, max_sil_kept = 5000):
        """Convert the settings to internal units.

        ``threshold`` is in dB and stored as a linear amplitude; ``min_length``,
        ``min_interval``, ``hop_size`` and ``max_sil_kept`` are in milliseconds.
        ``hop_size`` is stored in samples, the other three in frames (hops).
        """
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """Return the samples between frames ``begin`` and ``end`` (end clamped to the waveform length).

        A waveform with more than one dimension is sliced along its second axis.
        """
        start_idx = begin * self.hop_size
        end_idx = end * self.hop_size

        if len(waveform.shape) > 1:
            return waveform[:, start_idx:min(waveform.shape[1], end_idx)]
        return waveform[start_idx:min(waveform.shape[0], end_idx)]

    def slice(self, waveform):
        """Return the list of non-silent chunks of ``waveform``.

        The waveform is returned unchanged, as a single-item list, when it is
        short enough or when no cut point is found.
        """
        samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform
        # NOTE(review): this compares a sample count with self.min_length, which
        # __init__ stores in frames; confirm whether a frame count was intended.
        if samples.shape[0] <= self.min_length: return [waveform]
        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start, clip_start = None, 0

        for i, rms in enumerate(rms_list):
            # Silent frame: remember where the silent run started and keep scanning.
            if rms < self.threshold:
                if silence_start is None: silence_start = i
                continue

            # Loud frame with no pending silence: nothing to decide.
            if silence_start is None: continue

            # First loud frame after a silent run: decide whether to cut inside it.
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = (i - silence_start >= self.min_interval and i - clip_start >= self.min_length)
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue

            if i - silence_start <= self.max_sil_kept:
                # Short silence: cut once, at its quietest frame.
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                # Medium silence: search the quietest frames near both ends.
                pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                # Long silence: keep at most max_sil_kept frames on each side.
                pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)
                sil_tags.append((0, pos_r) if silence_start == 0 else ((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos_r))
                clip_start = pos_r

            silence_start = None

        # Silence running to the end of the audio.
        total_frames = rms_list.shape[0]
        if (silence_start is not None and total_frames - silence_start >= self.min_interval): sil_tags.append((rms_list[silence_start : min(total_frames, silence_start + self.max_sil_kept) + 1].argmin() + silence_start, total_frames + 1))

        if not sil_tags: return [waveform]

        chunks = []
        if sil_tags[0][0] > 0: chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))

        # Each chunk runs from the end of one silence tag to the start of the next.
        for (_, prev_end), (next_start, _) in zip(sil_tags, sil_tags[1:]):
            chunks.append(self._apply_slice(waveform, prev_end, next_start))

        if sil_tags[-1][1] < total_frames: chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
        return chunks
def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"):
    """Return the frame-wise root-mean-square of ``y``.

    ``y`` is padded with ``frame_length // 2`` samples on both sides (using
    ``pad_mode``), cut into windows of ``frame_length`` samples that start
    every ``hop_length`` samples along the last axis, and the RMS of each
    window is computed.

    Args:
        y: Input array; framing happens along its last axis.
        frame_length: Number of samples per analysis window.
        hop_length: Number of samples between the starts of two windows.
        pad_mode: ``mode`` argument forwarded to ``np.pad``.

    Returns:
        An array of shape ``(..., 1, n_frames)``.
    """
    half_frame = int(frame_length // 2)
    padded = np.pad(y, (half_frame, half_frame), mode=pad_mode)

    # Fewer samples than one window: there is no frame to measure.
    if padded.shape[-1] < frame_length:
        return np.zeros(padded.shape[:-1] + (1, 0))

    # sliding_window_view gives a bounds-checked, read-only view, unlike the
    # raw as_strided call which can address memory outside the array when the
    # shape/stride arithmetic is off.
    windows = np.lib.stride_tricks.sliding_window_view(padded, frame_length, axis=-1)
    frames = windows[..., ::hop_length, :]

    power = np.mean(np.abs(frames) ** 2, axis=-1)
    return np.sqrt(power)[..., np.newaxis, :]
class Audio:
    """Manage the sounddevice streams of the realtime voice changer.

    Blocks captured by the input stream are converted through
    ``callbacks.change_voice`` and put on ``mon_queue``; the output stream and
    the optional monitor stream take converted blocks from that queue.
    """

    def __init__(self, callbacks, f0_up_key = 0, index_rate = 0.5, protect = 0.5, filter_radius = 3, rms_mix_rate = 1, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0, input_audio_gain = 1.0, output_audio_gain = 1.0, monitor_audio_gain = 1.0, monitor = False):
        self.callbacks = callbacks
        self.mon_queue = Queue()  # converted blocks waiting to be played
        self.performance = [0, 0, 0]
        self.input_stream = None
        self.output_stream = None
        self.monitor = None
        self.running = False
        self.input_audio_gain = input_audio_gain
        self.output_audio_gain = output_audio_gain
        self.monitor_audio_gain = monitor_audio_gain
        self.use_monitor = monitor
        self.f0_up_key = f0_up_key
        self.index_rate = index_rate
        self.protect = protect
        self.filter_radius = filter_radius
        self.rms_mix_rate = rms_mix_rate
        self.f0_autotune = f0_autotune
        self.f0_autotune_strength = f0_autotune_strength
        self.proposal_pitch = proposal_pitch
        self.proposal_pitch_threshold = proposal_pitch_threshold

    def get_input_audio_device(self, index):
        """Return the usable input device with this index, or None."""
        audioinput, _ = list_audio_device()
        return next((device for device in audioinput if device.index == index), None)

    def get_output_audio_device(self, index):
        """Return the usable output device with this index, or None."""
        _, audiooutput = list_audio_device()
        return next((device for device in audiooutput if device.index == index), None)

    def process_data(self, indata):
        """Apply the input gain, downmix to mono and run the voice conversion."""
        indata = indata * self.input_audio_gain
        unpacked_data = librosa.to_mono(indata.T)

        return self.callbacks.change_voice(unpacked_data, self.f0_up_key, self.index_rate, self.protect, self.filter_radius, self.rms_mix_rate, self.f0_autotune, self.f0_autotune_strength, self.proposal_pitch, self.proposal_pitch_threshold)

    def process_data_with_time(self, indata):
        """Convert one block, publish its timing and return the audio."""
        out_wav, _, perf, _ = self.process_data(indata)
        self.performance = perf

        self.callbacks.emit_to(self.performance)
        return out_wav

    def audio_stream_callback(self, indata, frames, times, status):
        """Input stream callback: convert the block and queue the result."""
        try:
            out_wav = self.process_data_with_time(indata)
            self.mon_queue.put(out_wav)
        except Exception as e:
            logger.error(translations["error_occurred"].format(e=e))
            logger.debug(traceback.format_exc())

    def audio_queue(self, outdata, gain):
        """Output stream callback body: fill ``outdata`` from the queue.

        The first queued block is played and every other block queued at
        that moment is discarded.
        """
        try:
            mon_wav = self.mon_queue.get()

            while self.mon_queue.qsize() > 0:
                self.mon_queue.get()

            output_channels = outdata.shape[1]
            outdata[:] = (np.repeat(mon_wav, output_channels).reshape(-1, output_channels) * gain)
        except Exception as e:
            # The buffer handed to the callback is not pre-initialised, so
            # play silence rather than whatever happens to be in it.
            outdata.fill(0)
            logger.error(translations["error_occurred"].format(e=e))
            logger.debug(traceback.format_exc())

    def run_audio_stream(self, block_frame, input_device_id, output_device_id, output_monitor_id, input_audio_sample_rate, output_monitor_sample_rate, input_max_channel, output_max_channel, output_monitor_max_channel, input_extra_setting, output_extra_setting, output_monitor_extra_setting):
        """Create and start the input, output and (optional) monitor streams."""
        self.input_stream = sd.InputStream(
            callback=self.audio_stream_callback,
            latency="low",
            dtype=np.float32,
            device=input_device_id,
            blocksize=block_frame,
            samplerate=input_audio_sample_rate,
            channels=input_max_channel,
            extra_settings=input_extra_setting
        )
        self.output_stream = sd.OutputStream(
            callback=lambda outdata, frames, times, status: self.audio_queue(outdata, self.output_audio_gain),
            latency="low",
            dtype=np.float32,
            device=output_device_id,
            blocksize=block_frame,
            samplerate=input_audio_sample_rate,
            channels=output_max_channel,
            extra_settings=output_extra_setting
        )
        self.input_stream.start()
        self.output_stream.start()

        if self.use_monitor:
            self.monitor = sd.OutputStream(
                callback=lambda outdata, frames, times, status: self.audio_queue(outdata, self.monitor_audio_gain),
                latency="low",
                dtype=np.float32,
                device=output_monitor_id,
                blocksize=block_frame,
                samplerate=output_monitor_sample_rate,
                channels=output_monitor_max_channel,
                extra_settings=output_monitor_extra_setting
            )
            self.monitor.start()

    def stop(self):
        """Close every open stream and mark the instance as not running."""
        self.running = False

        if self.input_stream is not None:
            self.input_stream.close()
            self.input_stream = None

        if self.output_stream is not None:
            self.output_stream.close()
            self.output_stream = None

        if self.monitor is not None:
            self.monitor.close()
            self.monitor = None

    @staticmethod
    def _host_api_settings(device, max_channels, exclusive, asio_channel):
        """Return ``(extra_settings, channels)`` for one device.

        WASAPI devices get ``WasapiSettings``; ASIO devices with a selected
        channel (``asio_channel != -1``) get ``AsioSettings`` and are opened
        with a single channel; anything else uses no extra settings.
        """
        if "WASAPI" in device.host_api:
            return sd.WasapiSettings(exclusive=exclusive, auto_convert=not exclusive), max_channels
        if "ASIO" in device.host_api and asio_channel != -1:
            return sd.AsioSettings(channel_selectors=[asio_channel]), 1
        return None, max_channels

    def start(self, input_device_id, output_device_id, output_monitor_id, exclusive_mode, asio_input_channel, asio_output_channel, asio_output_monitor_channel, read_chunk_size, input_audio_sample_rate, output_monitor_sample_rate):
        """Stop any running streams and start new ones on the given devices.

        Failures (unknown device, stream creation error) are logged and leave
        ``self.running`` False; nothing is raised to the caller.
        """
        self.stop()

        try:
            input_audio_device = self.get_input_audio_device(input_device_id)
            output_audio_device = self.get_output_audio_device(output_device_id)

            # Fail with a clear message instead of an AttributeError on None.
            if input_audio_device is None: raise RuntimeError(f"input audio device {input_device_id} not found")
            if output_audio_device is None: raise RuntimeError(f"output audio device {output_device_id} not found")

            wasapi_exclusive_mode = bool(exclusive_mode)

            input_extra_setting, input_channels = self._host_api_settings(input_audio_device, input_audio_device.max_input_channels, wasapi_exclusive_mode, asio_input_channel)
            # Each device is inspected on its own: the output settings depend
            # on the output device's host API, not on the input device's.
            output_extra_setting, output_channels = self._host_api_settings(output_audio_device, output_audio_device.max_output_channels, wasapi_exclusive_mode, asio_output_channel)

            output_monitor_extra_setting, monitor_channels = None, None

            if self.use_monitor:
                output_monitor_device = self.get_output_audio_device(output_monitor_id)
                if output_monitor_device is None: raise RuntimeError(f"monitor audio device {output_monitor_id} not found")

                output_monitor_extra_setting, monitor_channels = self._host_api_settings(output_monitor_device, output_monitor_device.max_output_channels, wasapi_exclusive_mode, asio_output_monitor_channel)

            block_frame = int((read_chunk_size * 128 / 48000) * input_audio_sample_rate)

            self.run_audio_stream(block_frame, input_device_id, output_device_id, output_monitor_id, input_audio_sample_rate, output_monitor_sample_rate, input_channels, output_channels, monitor_channels, input_extra_setting, output_extra_setting, output_monitor_extra_setting)
            self.running = True
        except Exception as e:
            logger.error(translations["error_occurred"].format(e=e))
            logger.debug(traceback.format_exc())
class AudioCallbacks:
    """Glue between the audio streams and the realtime voice conversion model.

    Builds the ``VoiceChanger`` and the ``Audio`` stream manager, loads the
    model through ``initialize`` and exposes ``change_voice`` as the
    per-block conversion entry point used by ``Audio``.
    """

    # Processing time of the most recent block as reported through ``emit_to``.
    # Defined on the class so it can be read before any block was processed.
    latency = 0

    def __init__(self, pass_through = False, read_chunk_size = 192, cross_fade_overlap_size = 0.1, input_sample_rate = 48000, output_sample_rate = 48000, extra_convert_size = 0.5, model_path = None, index_path = None, f0_method = "rmvpe", f0_onnx = False, embedder_model = "hubert_base", embedders_mode = "fairseq", sample_rate = 16000, hop_length = 160, silent_threshold = -90, f0_up_key = 0, index_rate = 0.5, protect = 0.5, filter_radius = 3, rms_mix_rate = 1, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0, input_audio_gain = 1.0, output_audio_gain = 1.0, monitor_audio_gain = 1.0, monitor = False, vad_enabled = False, vad_sensitivity = 3, vad_frame_ms = 30, clean_audio = False, clean_strength = 0.7):
        self.pass_through = pass_through
        self.input_sample_rate = input_sample_rate
        self.output_sample_rate = output_sample_rate
        # Serialises calls into the voice changer.
        self.lock = threading.Lock()
        self.vc = VoiceChanger(
            read_chunk_size,
            cross_fade_overlap_size,
            input_sample_rate,
            extra_convert_size
        )
        self.audio = Audio(
            self,
            f0_up_key,
            index_rate,
            protect,
            filter_radius,
            rms_mix_rate,
            f0_autotune,
            f0_autotune_strength,
            proposal_pitch,
            proposal_pitch_threshold,
            input_audio_gain,
            output_audio_gain,
            monitor_audio_gain,
            monitor
        )
        self.initialize(
            model_path,
            index_path,
            f0_method,
            f0_onnx,
            embedder_model,
            embedders_mode,
            sample_rate,
            hop_length,
            silent_threshold,
            vad_enabled,
            vad_sensitivity,
            vad_frame_ms,
            clean_audio,
            clean_strength
        )

    def emit_to(self, performance):
        """Store the second entry of ``performance`` as the current latency."""
        self.latency = performance[1]

    def initialize(self, model_path, index_path = None, f0_method = "rmvpe", f0_onnx = False, embedder_model = "hubert_base", embedders_mode = "fairseq", sample_rate = 16000, hop_length = 160, silent_threshold = -90, vad_enabled = False, vad_sensitivity = 3, vad_frame_ms = 30, clean_audio = False, clean_strength = 0.7):
        """Create an ``RVC_Realtime`` model and hand it to the voice changer."""
        self.vc.initialize(
            RVC_Realtime(
                model_path,
                index_path,
                f0_method,
                f0_onnx,
                embedder_model,
                embedders_mode,
                sample_rate,
                hop_length,
                silent_threshold,
                self.input_sample_rate,
                self.output_sample_rate,
                vad_enabled,
                vad_sensitivity,
                vad_frame_ms,
                clean_audio,
                clean_strength
            )
        )

    def change_voice(self, received_data, f0_up_key = 0, index_rate = 0.5, protect = 0.5, filter_radius = 3, rms_mix_rate = 1, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0):
        """Convert one block of audio.

        Returns:
            ``(audio, volume, performance, None)``.  In pass-through mode the
            input is returned untouched together with its RMS volume.  When
            the voice changer raises ``RuntimeError`` the error is logged and
            a single zero sample is returned.
        """
        if self.pass_through:
            vol = float(np.sqrt(np.square(received_data).mean(dtype=np.float32)))
            return received_data, vol, [0, 0, 0], None

        try:
            with self.lock:
                audio, vol, perf = self.vc.on_request(received_data, f0_up_key, index_rate, protect, filter_radius, rms_mix_rate, f0_autotune, f0_autotune_strength, proposal_pitch, proposal_pitch_threshold)

            return audio, vol, perf, None
        except RuntimeError as e:
            import traceback

            logger.debug(traceback.format_exc())
            logger.error(translations["error_occurred"].format(e=e))

            return np.zeros(1, dtype=np.float32), 0, [0, 0, 0], None
class Inference:
    """Wrap a loaded voice model and expose a single ``inference`` call."""

    def get_synthesizer(self, model_path):
        """Load ``model_path`` and prepare ``self.net_g``; returns ``self``.

        A ``.pth`` file is treated as a checkpoint dictionary from which a
        ``Synthesizer`` is built; anything else is used as returned by
        ``load_model`` after being moved to the configured device.
        """
        loaded = load_model(model_path)

        if not model_path.endswith(".pth"):
            self.model = loaded
            self.net_g = self.model.to(config.device)

            meta = self.model.cpt
            self.tgt_sr = meta.get("tgt_sr", 32000)
            self.use_f0 = meta.get("f0", 1)
            self.version = meta.get("version", "v1")
            self.energy = meta.get("energy", False)
            return self

        from main.library.algorithm.synthesizers import Synthesizer

        self.tgt_sr = loaded["config"][-1]
        # Match the speaker-embedding size to the stored weights.
        loaded["config"][-3] = loaded["weight"]["emb_g.weight"].shape[0]

        self.use_f0 = loaded.get("f0", 1)
        self.version = loaded.get("version", "v1")
        self.vocoder = loaded.get("vocoder", "Default")
        self.energy = loaded.get("energy", False)

        # Non-default vocoders switch half precision off globally.
        if self.vocoder != "Default":
            config.is_half = False

        hidden_dim = 768 if self.version == "v2" else 256
        synthesizer = Synthesizer(
            *loaded["config"],
            use_f0=self.use_f0,
            text_enc_hidden_dim=hidden_dim,
            vocoder=self.vocoder,
            checkpointing=False,
            energy=self.energy
        )

        synthesizer.load_state_dict(loaded["weight"], strict=False)
        precision = torch.float16 if config.is_half else torch.float32
        synthesizer.eval().to(config.device).to(precision)
        synthesizer.remove_weight_norm()

        self.net_g = synthesizer
        self.model = loaded
        return self

    def inference(self, feats, p_len, sid, pitch, pitchf, energy):
        """Run the synthesizer and return its first waveform clipped to [-1, 1]."""
        generated = self.net_g.infer(feats, p_len, pitch, pitchf, sid, energy)
        waveform = generated[0][0, 0]

        # In-place clamp, so the returned tensor is the synthesizer's own output.
        return waveform.clamp_(-1.0, 1.0)
inference.energy - self.f0_method = f0_method - self.f0_min = 50.0 - self.f0_max = 1100.0 - self.device = config.device - self.is_half = config.is_half - self.dtype = torch.float16 if self.is_half else torch.float32 - self.model_window = self.tgt_sr // 100 - self.sid = torch.tensor([sid], device=self.device, dtype=torch.int64) - self.resamplers = {} - - def execute(self, audio, pitch = None, pitchf = None, f0_up_key = 0, index_rate = 0.5, audio_feats_len = 0, silence_front = 0, skip_head = None, return_length = None, protect = 0.5, filter_radius = 3, rms_mix_rate = 1, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0): - with torch.no_grad(): - assert audio.dim() == 1, audio.dim() - formant_length = int(np.ceil(return_length * 1.0)) - - pitch, pitchf = self.predictor.realtime_calculator(audio[silence_front:], self.f0_method, pitch, pitchf, f0_up_key, filter_radius, f0_autotune, f0_autotune_strength, proposal_pitch, proposal_pitch_threshold) if self.use_f0 else (None, None) - energy = self.rms(audio[silence_front:].to(self.device).unsqueeze(0)) if self.energy else None - - feats = extract_features(self.embedder, audio.view(1, -1), self.inference.version, device=self.device) - feats = torch.cat((feats, feats[:, -1:, :]), 1) - feats0 = feats.detach().clone() if protect < 0.5 and self.use_f0 else None - - if (not isinstance(self.index[0], type(None)) and not isinstance(self.index[1], type(None)) and index_rate != 0): - skip_offset = skip_head // 2 - npy = feats[0][skip_offset :].cpu().numpy() - - if self.is_half: npy = npy.astype(np.float32) - - score, ix = self.index[0].search(npy, k=8) - weight = np.square(1 / score) - - npy = np.sum(self.index[1][ix] * np.expand_dims(weight / weight.sum(axis=1, keepdims=True), axis=2), axis=1) - if self.is_half: npy = npy.astype(np.float16) - - feats[0][skip_offset :] = (torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats[0][skip_offset :]) 
def create_pipeline(model_path=None, index_path=None, f0_method="rmvpe", f0_onnx=False, embedder_model="hubert_base", embedders_mode="fairseq", sample_rate=16000, hop_length=160):
    """Build a ready-to-use ``Pipeline`` for realtime conversion.

    Loads the synthesizer, then (only if the model needs them) the pitch
    predictor and the RMS energy extractor, the optional faiss index and the
    embedder.

    Args:
        model_path: Path of the voice model to load.
        index_path: Path of the faiss index, or None to run without one.
        f0_method: Pitch extraction method handed to the pipeline.
        f0_onnx: Forwarded to the pitch predictor as ``f0_onnx_mode``.
        embedder_model: Name of the embedder to load.
        embedders_mode: Forwarded to ``load_embedders_model``.
        sample_rate: Sample rate the pitch predictor works at.
        hop_length: Hop length of the pitch predictor.

    Returns:
        The configured ``Pipeline`` instance.
    """
    inference = Inference().get_synthesizer(model_path)

    if inference.use_f0:
        from main.library.predictors.Generator import Generator
        predictor = Generator(sample_rate=sample_rate, hop_length=hop_length, f0_min=50.0, f0_max=1100.0, alpha=0.5, is_half=config.is_half, device=config.device, f0_onnx_mode=f0_onnx, del_onnx_model=False)
    else: predictor = None

    if inference.energy:
        from main.inference.extracting.rms import RMSEnergyExtractor
        rms = RMSEnergyExtractor(frame_length=2048, hop_length=160, center=True, pad_mode="reflect").to(config.device).eval()
    else: rms = None

    if index_path is None:
        # index_path defaults to None; calling .strip() on it would raise
        # AttributeError.  (None, None) is the pipeline's "no index" value.
        index, index_reconstruct = None, None
    else:
        # Drop quotes/whitespace left over from pasted paths and prefer the
        # "added" index file over the "trained" one.
        cleaned_index_path = index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added")
        index, index_reconstruct = load_faiss_index(cleaned_index_path)

    embedder = load_embedders_model(embedder_model, embedders_mode=embedders_mode)
    if isinstance(embedder, torch.nn.Module): embedder = embedder.to(config.device).to(torch.float16 if config.is_half else torch.float32)

    pipeline = Pipeline(
        inference,
        embedder,
        predictor,
        rms,
        (index, index_reconstruct),
        f0_method
    )

    return pipeline
f0_method - self.f0_onnx = f0_onnx - self.embedder_model = embedder_model - self.embedders_mode = embedders_mode - self.sample_rate = sample_rate - self.hop_length = hop_length - self.pipeline = None - self.convert_buffer = None - self.pitch_buffer = None - self.pitchf_buffer = None - self.return_length = 0 - self.skip_head = 0 - self.silence_front = 0 - self.resample_in = None - self.resample_out = None - self.vad = None - self.tg = None - self.input_sample_rate = input_sample_rate - self.output_sample_rate = output_sample_rate - self.vad_enabled = vad_enabled - self.vad_sensitivity = vad_sensitivity - self.vad_frame_ms = vad_frame_ms - self.clean_audio = clean_audio - self.clean_strength = clean_strength - self.input_sensitivity = 10 ** (silent_threshold / 20) - self.window_size = sample_rate // 100 - self.dtype = torch.float16 if config.is_half else torch.float32 - - def initialize(self): - check_assets(self.f0_method, self.embedder_model, f0_onnx=self.f0_onnx, embedders_mode=self.embedders_mode) - - if self.vad_enabled: - from main.inference.realtime.vad_utils import VADProcessor - self.vad = VADProcessor(sensitivity_mode=self.vad_sensitivity, sample_rate=self.sample_rate, frame_duration_ms=self.vad_frame_ms) - else: self.vad = None - - if self.clean_audio: - from main.tools.noisereduce import TorchGate - self.tg = TorchGate(self.sample_rate, prop_decrease=self.clean_strength).to(config.device) - else: self.tg = None - - self.pipeline = create_pipeline( - model_path=self.model_path, - index_path=self.index_path, - f0_method=self.f0_method, - f0_onnx=self.f0_onnx, - embedder_model=self.embedder_model, - embedders_mode=self.embedders_mode, - sample_rate=self.sample_rate, - hop_length=self.hop_length, - ) - - self.resample_in = tat.Resample( - orig_freq=self.input_sample_rate, - new_freq=self.sample_rate, - dtype=torch.float32 - ).to(config.device) - self.resample_out = tat.Resample( - orig_freq=self.pipeline.tgt_sr, - new_freq=self.output_sample_rate, - 
dtype=torch.float32 - ).to(config.device) - - def realloc(self, block_frame, extra_frame, crossfade_frame, sola_search_frame): - block_frame_16k = int(block_frame / self.input_sample_rate * self.sample_rate) - crossfade_frame_16k = int(crossfade_frame / self.input_sample_rate * self.sample_rate) - sola_search_frame_16k = int(sola_search_frame / self.input_sample_rate * self.sample_rate) - extra_frame_16k = int(extra_frame / self.input_sample_rate * self.sample_rate) - - convert_size_16k = block_frame_16k + sola_search_frame_16k + extra_frame_16k + crossfade_frame_16k - if (modulo := convert_size_16k % self.window_size) != 0: convert_size_16k = convert_size_16k + (self.window_size - modulo) - - self.convert_feature_size_16k = convert_size_16k // self.window_size - self.skip_head = extra_frame_16k // self.window_size - self.return_length = self.convert_feature_size_16k - self.skip_head - self.silence_front = extra_frame_16k - (self.window_size * 5) if self.silence_front else 0 - - audio_buffer_size = block_frame_16k + crossfade_frame_16k - - self.audio_buffer = torch.zeros(audio_buffer_size, dtype=self.dtype, device=config.device) - self.convert_buffer = torch.zeros(convert_size_16k, dtype=self.dtype, device=config.device) - self.pitch_buffer = torch.zeros(self.convert_feature_size_16k + 1, dtype=torch.int64, device=config.device) - self.pitchf_buffer = torch.zeros(self.convert_feature_size_16k + 1, dtype=self.dtype, device=config.device) - - def inference(self, audio_in, f0_up_key = 0, index_rate = 0.5, protect = 0.5, filter_radius = 3, rms_mix_rate = 1, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0): - if self.pipeline is None: - raise RuntimeError(translations["create_pipeline_error"]) - - audio_in_16k = self.resample_in(torch.as_tensor(audio_in, dtype=torch.float32, device=config.device)).to(self.dtype) - circular_write(audio_in_16k, self.audio_buffer) - - vol_t = self.audio_buffer.square().mean().sqrt() - 
vol = max(vol_t.item(), 0) - - if self.vad is not None: - is_speech = self.vad.is_speech(audio_in_16k.cpu().numpy().copy()) - if not is_speech: - self.pipeline.execute( - self.convert_buffer, - self.pitch_buffer, - self.pitchf_buffer, - f0_up_key, - index_rate, - self.convert_feature_size_16k, - self.silence_front, - self.skip_head, - self.return_length, - protect, - filter_radius, - rms_mix_rate, - f0_autotune, - f0_autotune_strength, - proposal_pitch, - proposal_pitch_threshold - ) - return None, vol - - if vol < self.input_sensitivity: - self.pipeline.execute( - self.convert_buffer, - self.pitch_buffer, - self.pitchf_buffer, - f0_up_key, - index_rate, - self.convert_feature_size_16k, - self.silence_front, - self.skip_head, - self.return_length, - protect, - filter_radius, - rms_mix_rate, - f0_autotune, - f0_autotune_strength, - proposal_pitch, - proposal_pitch_threshold - ) - - return None, vol - - circular_write(audio_in_16k, self.convert_buffer) - - audio_model = self.pipeline.execute( - self.convert_buffer, - self.pitch_buffer, - self.pitchf_buffer, - f0_up_key, - index_rate, - self.convert_feature_size_16k, - self.silence_front, - self.skip_head, - self.return_length, - protect, - filter_radius, - rms_mix_rate, - f0_autotune, - f0_autotune_strength, - proposal_pitch, - proposal_pitch_threshold - ) - - if self.tg is not None: audio_model = self.tg(audio_model.unsqueeze(0)).squeeze(0) - audio_out = self.resample_out(audio_model * vol_t.sqrt()) - - return audio_out, vol - -class VoiceChanger: - def __init__(self, read_chunk_size, cross_fade_overlap_size, input_sample_rate, extra_convert_size): - self.block_frame = read_chunk_size * 128 - self.crossfade_frame = int(cross_fade_overlap_size * input_sample_rate) - self.extra_frame = int(extra_convert_size * input_sample_rate) - self.sola_search_frame = input_sample_rate // 100 - self.vc_model = None - self.sola_buffer = None - self.generate_strength() - - def initialize(self, vc_model): - self.vc_model = vc_model - 
self.vc_model.realloc(self.block_frame, self.extra_frame, self.crossfade_frame, self.sola_search_frame) - self.vc_model.initialize() - - def generate_strength(self): - self.fade_in_window = (0.5 * np.pi * torch.linspace(0.0, 1.0, steps=self.crossfade_frame, device=config.device, dtype=torch.float32)).sin() ** 2 - self.fade_out_window = 1 - self.fade_in_window - self.sola_buffer = torch.zeros(self.crossfade_frame, device=config.device, dtype=torch.float32) - - def process_audio(self, audio_in, f0_up_key = 0, index_rate = 0.5, protect = 0.5, filter_radius = 3, rms_mix_rate = 1, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0): - block_size = audio_in.shape[0] - audio, vol = self.vc_model.inference(audio_in, f0_up_key, index_rate, protect, filter_radius, rms_mix_rate, f0_autotune, f0_autotune_strength, proposal_pitch, proposal_pitch_threshold) - - if audio is None: return np.zeros(block_size, dtype=np.float32), vol - - conv_input = audio[None, None, : self.crossfade_frame + self.sola_search_frame] - cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) - cor_den = (F.conv1d(conv_input ** 2, torch.ones(1, 1, self.crossfade_frame, device=config.device)) + 1e-8).sqrt() - sola_offset = (cor_nom[0, 0] / cor_den[0, 0]).argmax() - - audio = audio[sola_offset:] - audio[: self.crossfade_frame] *= self.fade_in_window - audio[: self.crossfade_frame] += (self.sola_buffer * self.fade_out_window) - - self.sola_buffer[:] = audio[block_size : block_size + self.crossfade_frame] - return audio[: block_size].detach().cpu().numpy(), vol - - @torch.no_grad() - def on_request(self, audio_in, f0_up_key = 0, index_rate = 0.5, protect = 0.5, filter_radius = 3, rms_mix_rate = 1, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0): - if self.vc_model is None: - raise RuntimeError(translations["voice_changer_selected_error"]) - - start = time.perf_counter() - result, vol = 
class VADProcessor:
    """Frame-based voice-activity detector built on top of ``webrtcvad``.

    Audio is converted to 16-bit PCM, cut into fixed-size frames and handed
    to ``webrtcvad.Vad`` frame by frame.
    """

    def __init__(self, sensitivity_mode=3, sample_rate=16000, frame_duration_ms=30):
        """Validate the configuration and create the underlying detector.

        Args:
            sensitivity_mode: Mode forwarded unchanged to ``webrtcvad.Vad``.
            sample_rate: Sample rate in Hz; only 8000 and 16000 are accepted.
            frame_duration_ms: Frame length in ms; only 10, 20 or 30.

        Raises:
            ValueError: If ``sample_rate`` or ``frame_duration_ms`` is not
                one of the accepted values.
        """
        if sample_rate not in (8000, 16000):
            raise ValueError(f"sample_rate must be 8000 or 16000, got {sample_rate!r}")
        if frame_duration_ms not in (10, 20, 30):
            raise ValueError(f"frame_duration_ms must be 10, 20 or 30, got {frame_duration_ms!r}")

        self.vad = webrtcvad.Vad(sensitivity_mode)
        self.sample_rate = sample_rate
        # Number of samples per analysed frame.
        self.frame_length = int(sample_rate * (frame_duration_ms / 1000.0))

    def is_speech(self, audio_chunk):
        """Return ``True`` when any complete frame of ``audio_chunk`` is speech.

        Args:
            audio_chunk: NumPy array of samples. A 2-D array with a single
                column is flattened; any other multi-dimensional array is
                averaged over axis 1. Values outside [-1, 1] are clipped
                before the conversion to int16.

        Returns:
            ``True`` if the detector flags at least one frame, otherwise
            ``False``. Empty input and any detector error also give ``False``.
        """
        if audio_chunk.ndim > 1 and audio_chunk.shape[1] == 1:
            audio_chunk = audio_chunk.flatten()
        elif audio_chunk.ndim > 1:
            audio_chunk = np.mean(audio_chunk, axis=1)

        # Bail out before any reduction: np.max() raises ValueError on an
        # empty array, which made the original empty-input branch unreachable.
        if audio_chunk.size == 0:
            return False

        if np.max(np.abs(audio_chunk)) > 1.0:
            audio_chunk = np.clip(audio_chunk, -1.0, 1.0)

        audio_chunk = (audio_chunk * 32767).astype(np.int16)
        num_frames = len(audio_chunk) // self.frame_length

        if num_frames == 0:
            # Shorter than one frame: zero-pad so a single frame can be tested.
            padding = np.zeros(self.frame_length - len(audio_chunk), dtype=np.int16)
            audio_chunk = np.concatenate((audio_chunk, padding))
            num_frames = 1

        try:
            # Samples after the last complete frame are not analysed.
            for i in range(num_frames):
                start = i * self.frame_length
                frame = audio_chunk[start:start + self.frame_length].tobytes()
                if self.vad.is_speech(frame, self.sample_rate):
                    return True

            return False
        except Exception:
            # Deliberate best-effort: a detector failure counts as "no speech".
            return False
f47a7c778d13a9a5acd690a83bf38be5d801f1e4..0000000000000000000000000000000000000000 --- a/main/inference/separate_music.py +++ /dev/null @@ -1,613 +0,0 @@ -import os -import sys -import time -import argparse - -from distutils.util import strtobool - -sys.path.append(os.getcwd()) - -from main.library.utils import pydub_load -from main.library.uvr5_lib.separator import Separator -from main.app.variables import config, logger, translations, vr_models, demucs_models, mdx_models, karaoke_models, reverb_models, denoise_models - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--separate_music", action='store_true') - parser.add_argument("--input_path", type=str, required=True) - parser.add_argument("--output_dirs", type=str, default="./audios") - parser.add_argument("--export_format", type=str, default="wav") - parser.add_argument("--model_name", type=str, default="MDXNET_Main") - parser.add_argument("--karaoke_model", type=str, default="MDX-Version-1") - parser.add_argument("--reverb_model", type=str, default="MDX-Reverb") - parser.add_argument("--denoise_model", type=str, default="Normal") - parser.add_argument("--sample_rate", type=int, default=44100) - parser.add_argument("--shifts", type=int, default=2) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--overlap", type=float, default=0.25) - parser.add_argument("--aggression", type=int, default=5) - parser.add_argument("--hop_length", type=int, default=1024) - parser.add_argument("--window_size", type=int, default=512) - parser.add_argument("--segments_size", type=int, default=256) - parser.add_argument("--post_process_threshold", type=float, default=0.2) - parser.add_argument("--enable_tta", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--enable_denoise", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--high_end_process", type=lambda x: bool(strtobool(x)), default=False) - 
def separate(
    input_path,
    output_dirs,
    export_format="wav",
    model_name="MDXNET_Main",
    karaoke_model="MDX-Version-1",
    reverb_model="MDX-Reverb",
    denoise_model="Normal",
    sample_rate=44100,
    shifts=2,
    batch_size=1,
    overlap=0.25,
    aggression=5,
    hop_length=1024,
    window_size=512,
    segments_size=256,
    post_process_threshold=0.2,
    enable_tta=False,
    enable_denoise=False,
    high_end_process=False,
    enable_post_process=False,
    separate_backing=False,
    separate_reverb=False
):
    """Separate one audio file, or every file in a directory, into stems.

    The process id is written to ``assets/separate_pid.txt`` while the job
    runs and the file is removed afterwards. Errors are logged rather than
    raised.

    Args:
        input_path: Audio file or directory of audio files. Surrounding
            spaces, double quotes and newlines are stripped.
        output_dirs: Output location; its parent directory is used when it
            has one, otherwise the value itself.
        Remaining arguments are forwarded unchanged to ``_separate``.

    Returns:
        A list with one ``_separate`` result per processed file. The list is
        empty, or partial, when an error interrupted the run.
    """
    start_time = time.time()
    pid_path = os.path.join("assets", "separate_pid.txt")
    # Bound before the try block so the final return can never hit an
    # unbound local when an early step fails.
    output_files = []

    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    try:
        input_path = input_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        output_dirs = os.path.dirname(output_dirs) or output_dirs

        log_data = {
            translations['audio_path']: input_path,
            translations['output_path']: output_dirs,
            translations['export_format']: export_format,
            translations['shift']: shifts,
            translations['segments_size']: segments_size,
            translations['overlap']: overlap,
            translations['modelname']: model_name,
            translations['denoise_mdx']: enable_denoise,
            translations['hop_length']: hop_length,
            translations['batch_size']: batch_size,
            translations['sr']: sample_rate,
            translations['separator_backing']: separate_backing,
            translations['dereveb_audio']: separate_reverb,
            translations['aggression']: aggression,
            translations['window_size']: window_size,
            translations['post_process_threshold']: post_process_threshold,
            translations['enable_tta']: enable_tta,
            translations['high_end_process']: high_end_process,
            translations['enable_post_process']: enable_post_process
        }

        if separate_backing: log_data[translations['backing_model_ver']] = karaoke_model
        if separate_reverb: log_data[translations['dereveb_model']] = reverb_model
        if enable_denoise and model_name in vr_models: log_data["Denoise Model"] = denoise_model

        for key, value in log_data.items():
            logger.debug(f"{key}: {value}")

        if os.path.isdir(input_path):
            files = [os.path.join(input_path, f) for f in os.listdir(input_path)]
        else:
            files = [input_path]

        for file in files:
            if not os.path.isfile(file): continue

            # Pass the individual file: the original forwarded ``input_path``,
            # which handed the directory itself to ``_separate``.
            output_files.append(_separate(
                input_path=file,
                output_dirs=output_dirs,
                model_name=model_name,
                karaoke_model=karaoke_model,
                reverb_model=reverb_model,
                denoise_model=denoise_model,
                export_format=export_format,
                sample_rate=sample_rate,
                shifts=shifts,
                batch_size=batch_size,
                overlap=overlap,
                aggression=aggression,
                hop_length=hop_length,
                window_size=window_size,
                segments_size=segments_size,
                post_process_threshold=post_process_threshold,
                enable_tta=enable_tta,
                enable_denoise=enable_denoise,
                high_end_process=high_end_process,
                enable_post_process=enable_post_process,
                separate_backing=separate_backing,
                separate_reverb=separate_reverb
            ))
    except Exception as e:
        logger.error(f"{translations['separator_error']}: {e}")
        import traceback
        logger.debug(traceback.format_exc())
    finally:
        # Always drop the PID marker, whatever happened above.
        if os.path.exists(pid_path): os.remove(pid_path)

    elapsed_time = time.time() - start_time

    # NOTE(review): this message is emitted even after a logged error, as in
    # the original; confirm whether a consumer depends on it.
    logger.info(translations["separator_success"].format(elapsed_time=f"{elapsed_time:.2f}"))
    return output_files
def vr_main(
    input_path,
    output_dirs,
    model_name,
    export_format="wav",
    batch_size=1,
    window_size=512,
    aggression=5,
    enable_denoise=False,
    denoise_model="Normal",
    enable_tta=False,
    enable_post_process=False,
    post_process_threshold=0.2,
    high_end_process=False,
    sample_rate=44100,
    mode="original"
):
    # Run a separation through ``separate_main`` with the VR-specific options
    # and optionally pass every produced stem through a denoise model, then
    # hand the resulting file names to ``process_file`` for renaming.
    #
    # Returns whatever ``process_file`` returns for ``mode``.

    # Stops the process (via sys.exit) when either path does not exist.
    exists_file(input_path, output_dirs)

    logger.info(f"{translations['separator_process_2']}...")

    output_list = separate_main(
        audio_file=input_path,
        model_filename=model_name,
        export_format=export_format,
        output_dir=output_dirs,
        batch_size=batch_size,
        window_size=window_size,
        aggression=aggression,
        enable_tta=enable_tta,
        enable_post_process=enable_post_process,
        post_process_threshold=post_process_threshold,
        high_end_process=high_end_process,
        sample_rate=sample_rate
    )

    if enable_denoise:
        denoise_list = []
        for audio in output_list:
            # Entries of output_list are treated as names relative to output_dirs.
            audio_path = os.path.join(output_dirs, audio)

            denoise_file = separate_main(
                audio_file=audio_path,
                model_filename=denoise_models.get(denoise_model, denoise_model),
                export_format=export_format,
                output_dir=output_dirs,
                batch_size=batch_size,
                window_size=window_size,
                aggression=aggression,
                enable_tta=enable_tta,
                enable_post_process=enable_post_process,
                post_process_threshold=post_process_threshold,
                high_end_process=high_end_process,
                sample_rate=sample_rate
            )

            # The un-denoised stem is replaced by its denoised version below.
            if os.path.exists(audio_path): os.remove(audio_path)

            for file in denoise_file:
                file_path = os.path.join(output_dirs, file)

                # Drop the extracted noise; keep the clean stem under the name
                # that precedes the "_(No Noise)_" marker.
                if "_(Noise)_" in file and os.path.exists(file_path): os.remove(file_path)
                elif "_(No Noise)_" in file:
                    filename = "".join([file.split("_(No Noise)_")[0], ".", export_format])
                    os.rename(file_path, os.path.join(output_dirs, filename))

                    # NOTE(review): assumed to belong to the elif branch, since
                    # `filename` is only bound there - confirm against the
                    # original indentation.
                    denoise_list.append(filename)

    logger.info(translations["separator_success_2"])
    return process_file(denoise_list if enable_denoise else output_list, output_dirs, export_format, mode)
def process_file(input_list, output_dirs, export_format="wav", mode="original"):
    """Rename separator outputs to their canonical names for ``mode``.

    Args:
        input_list: File names, relative to ``output_dirs``, produced by the
            separator.
        output_dirs: Directory holding the files.
        export_format: Extension (without dot) used for the target names.
        mode: One of ``"original"``, ``"4stem"``, ``"reverb"``, ``"karaoke"``.

    Returns:
        ``(reverb, no_reverb)`` for ``"reverb"``, ``(main, backing)`` for
        ``"karaoke"``, otherwise ``(original_vocals, instruments)``. The paths
        are returned even when the corresponding file was not produced.
    """
    demucs_inst = []

    reverb_audio, no_reverb_audio = None, None
    main_audio = os.path.join(output_dirs, f"Main_Vocals.{export_format}")
    backing_audio = os.path.join(output_dirs, f"Backing_Vocals.{export_format}")
    original_audio = os.path.join(output_dirs, f"Original_Vocals.{export_format}")
    instruments_audio = os.path.join(output_dirs, f"Instruments.{export_format}")

    for file in input_list:
        file_path = os.path.join(output_dirs, file)
        found = os.path.exists(file_path)
        if not found: logger.warning(translations["not_found"].format(name=file_path))

        target = None

        if mode == "original":
            if "_(Instrumental)_" in file: target = instruments_audio
            elif "_(Vocals)_" in file: target = original_audio
        elif mode == "4stem":
            if "_(Vocals)_" in file: target = original_audio
            elif "_(Drums)_" in file or "_(Bass)_" in file or "_(Other)_" in file:
                # Only existing stems can be mixed into the instrumental.
                if found: demucs_inst.append(file_path)
        elif mode == "reverb":
            filename = file.split("_(")[0]

            reverb_audio = os.path.join(output_dirs, "".join([filename, "_Reverb.", export_format]))
            no_reverb_audio = os.path.join(output_dirs, "".join([filename, "_No_Reverb.", export_format]))

            if "_(Reverb)_" in file or "_(Echo)_" in file: target = reverb_audio
            elif "_(No Reverb)_" in file or "_(No Echo)_" in file: target = no_reverb_audio
        elif mode == "karaoke":
            if "_(Instrumental)_" in file: target = backing_audio
            elif "_(Vocals)_" in file: target = main_audio

        # A missing file was already reported above; renaming it would only
        # raise FileNotFoundError. os.replace also overwrites on Windows.
        if target is not None and found: os.replace(file_path, target)

    if mode == "reverb": return reverb_audio, no_reverb_audio
    if mode == "karaoke": return main_audio, backing_audio

    if mode == "4stem":
        if demucs_inst:
            # Overlay the non-vocal stems into a single instrumental track.
            demucs_audio = pydub_load(demucs_inst[0])
            for file in demucs_inst[1:]:
                demucs_audio = demucs_audio.overlay(pydub_load(file))

            demucs_audio.export(instruments_audio, format=export_format)

            for f in demucs_inst:
                if os.path.exists(f): os.remove(f)
        else:
            # Nothing to mix: report it instead of failing on demucs_inst[0].
            logger.warning(translations["not_found"].format(name=instruments_audio))

    return original_audio, instruments_audio
def _build_separator(output_dir, export_format, sample_rate, mdx_params, demucs_params, vr_params):
    """Create a ``Separator`` with the options shared by every call site."""
    return Separator(
        logger=logger,
        output_dir=output_dir,
        output_format=export_format,
        output_bitrate=None,
        normalization_threshold=0.9,
        sample_rate=sample_rate,
        mdx_params=mdx_params,
        demucs_params=demucs_params,
        vr_params=vr_params
    )

def separate_main(
    audio_file=None,
    model_filename="UVR-MDX-NET_Main_340.onnx",
    export_format="wav",
    output_dir=".",
    segment_size=256,
    overlap=0.25,
    batch_size=1,
    hop_length=1024,
    enable_denoise=False,
    shifts=2,
    window_size=512,
    aggression=5,
    enable_tta=False,
    enable_post_process=False,
    post_process_threshold=0.2,
    high_end_process=False,
    sample_rate=44100
):
    """Load ``model_filename`` and separate ``audio_file``.

    The requested settings are tried first. If that attempt raises an
    ordinary exception, the separation is retried once with the built-in
    default settings (``enable_denoise`` is still honoured).

    Returns:
        Whatever ``Separator.separate`` returns for ``audio_file``.

    Raises:
        Any exception raised by the fallback attempt.
    """
    try:
        separator = _build_separator(
            output_dir,
            export_format,
            sample_rate,
            mdx_params={
                "hop_length": hop_length,
                "segment_size": segment_size,
                "overlap": overlap,
                "batch_size": batch_size,
                "enable_denoise": enable_denoise
            },
            demucs_params={
                "segment_size": segment_size,
                "shifts": shifts,
                "overlap": overlap,
                "segments_enabled": config.configs.get("demucs_segments_enable", True)
            },
            vr_params={
                "batch_size": batch_size,
                "window_size": window_size,
                "aggression": aggression,
                "enable_tta": enable_tta,
                "enable_post_process": enable_post_process,
                "post_process_threshold": post_process_threshold,
                "high_end_process": high_end_process
            }
        )
        separator.load_model(model_filename=model_filename)

        return separator.separate(audio_file)
    except Exception as e:
        # ``except Exception`` instead of a bare ``except`` so SystemExit and
        # KeyboardInterrupt are no longer swallowed and retried.
        logger.debug(translations["default_setting"])
        logger.debug(f"{type(e).__name__}: {e}")

        separator = _build_separator(
            output_dir,
            export_format,
            44100,
            mdx_params={
                "hop_length": 1024,
                "segment_size": 256,
                "overlap": 0.25,
                "batch_size": 1,
                "enable_denoise": enable_denoise
            },
            demucs_params={
                "segment_size": 128,
                "shifts": 2,
                "overlap": 0.25,
                "segments_enabled": config.configs.get("demucs_segments_enable", True)
            },
            vr_params={
                "batch_size": 1,
                "window_size": 512,
                "aggression": 5,
                "enable_tta": False,
                "enable_post_process": False,
                "post_process_threshold": 0.2,
                "high_end_process": False
            }
        )
        separator.load_model(model_filename=model_filename)

        return separator.separate(audio_file)
class TextAudioLoader(tdata.Dataset):
    """Dataset yielding spectrogram, waveform, phone features and speaker id.

    Each entry of the training file list is a row laid out as
    ``[wav, phone, (pitch, pitchf,) (energy,) sid]``; the optional columns
    are present according to ``pitch_guidance`` and ``energy``.
    """

    def __init__(self, hparams, pitch_guidance=True, energy=False):
        """Read the file list named by ``hparams.training_files`` and filter it."""
        self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files)
        self.max_wav_value = hparams.max_wav_value
        self.sample_rate = hparams.sample_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 5000)
        self.pitch_guidance = pitch_guidance
        self.energy = energy
        self._filter()

    def _filter(self):
        """Keep rows whose second column length is within the configured bounds.

        Also records a per-row length estimate derived from the audio file
        size; ``DistributedBucketSampler`` reads it as ``dataset.lengths``.
        """
        audiopaths_and_text_new, lengths = [], []

        for item in self.audiopaths_and_text:
            audiopath, text = item[0], item[1]

            if self.min_text_len <= len(text) <= self.max_text_len:
                audiopaths_and_text_new.append(item)
                lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))

        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_sid(self, sid):
        """Return ``sid`` as a one-element ``LongTensor`` (0 if not an integer)."""
        try:
            sid = torch.LongTensor([int(sid)])
        except ValueError:
            sid = torch.LongTensor([0])

        return sid

    def get_audio_text_pair(self, audiopath_and_text):
        """Load every tensor for one file-list row.

        Returns:
            ``(spec, wav, phone, [pitch, pitchf,] sid[, energy])``, with the
            optional items present according to ``pitch_guidance`` and
            ``energy``. Spectrogram, waveform and label tensors are trimmed
            to a common frame count.
        """
        # Columns after wav and phone. The original first ran a leftover
        # branch block with the pitch / no-pitch cases swapped, which raised
        # IndexError without pitch guidance and loaded everything twice
        # otherwise; only the consistent column mapping is kept.
        extra = audiopath_and_text[2:]
        pitch = pitchf = energy = sid = None

        if self.pitch_guidance and self.energy: pitch, pitchf, energy, sid = extra
        elif self.pitch_guidance: pitch, pitchf, sid = extra
        elif self.energy: energy, sid = extra
        else: sid = extra[0]  # only the speaker id follows the phone column

        spec, wav = self.get_audio(audiopath_and_text[0])
        dv = self.get_sid(sid)

        phone, pitch, pitchf, energy = self.get_labels(
            audiopath_and_text[1],
            pitch=pitch,
            pitchf=pitchf,
            energy=energy
        )

        len_phone = phone.size()[0]
        len_spec = spec.size()[-1]

        if len_phone != len_spec:
            # Trim everything to the shorter of the two frame counts.
            len_min = min(len_phone, len_spec)
            len_wav = len_min * self.hop_length
            spec, wav, phone = spec[:, :len_min], wav[:, :len_wav], phone[:len_min, :]
            if self.pitch_guidance: pitch, pitchf = pitch[:len_min], pitchf[:len_min]
            if self.energy: energy = energy[:len_min]

        outputs = [spec, wav, phone, dv]
        if self.pitch_guidance: outputs[3:3] = [pitch, pitchf]
        if self.energy: outputs.append(energy)

        return tuple(outputs)

    def get_labels(self, phone, pitch=None, pitchf=None, energy=None):
        """Load label arrays from ``.npy`` paths.

        Phone features are repeated twice along axis 0 and every array is
        capped at 900 frames. Labels whose path is falsy come back as ``None``.
        """
        phone = np.repeat(np.load(phone), 2, axis=0)
        n_num = min(phone.shape[0], 900)

        return (
            torch.FloatTensor(phone[:n_num, :]),
            torch.LongTensor(np.load(pitch)[:n_num]) if pitch else None,
            torch.FloatTensor(np.load(pitchf)[:n_num]) if pitchf else None,
            torch.FloatTensor(np.load(energy)[:n_num]) if energy else None
        )

    def get_audio(self, filename):
        """Return ``(spec, audio)`` for ``filename``, caching the spectrogram.

        The spectrogram is read from the sibling ``.spec.pt`` file when it
        loads cleanly; otherwise it is recomputed and the cache rewritten.

        Raises:
            ValueError: If the file's sample rate differs from the configured one.
        """
        audio, sample_rate = load_wav_to_torch(filename)
        if sample_rate != self.sample_rate: raise ValueError(translations["sr_does_not_match"].format(sample_rate=sample_rate, sample_rate2=self.sample_rate))

        audio_norm = audio.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")

        spec = None
        if os.path.exists(spec_filename):
            try:
                spec = torch.load(spec_filename, weights_only=True)
            except Exception:
                # Unreadable cache: fall through and rebuild it.
                spec = None

        if spec is None:
            spec = spectrogram_torch(audio_norm, self.filter_length, self.hop_length, self.win_length, center=False).squeeze(0)
            torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)

        return spec, audio_norm

    def __getitem__(self, index):
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
class DistributedBucketSampler(tdata.distributed.DistributedSampler):
    """Distributed sampler that batches samples of similar length together.

    Samples are assigned to buckets ``(boundaries[i], boundaries[i + 1]]``
    by ``dataset.lengths``; samples outside every bucket are dropped. Each
    bucket is padded, by repeating its own samples, to a multiple of
    ``num_replicas * batch_size``.
    """

    def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        # Work on a copy: empty buckets are pruned from this list, and the
        # caller's sequence must not be modified as a side effect.
        self.boundaries = list(boundaries)
        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        """Group sample indices by length and compute padded bucket sizes."""
        buckets = [[] for _ in range(len(self.boundaries) - 1)]

        for i, length in enumerate(self.lengths):
            idx_bucket = self._bisect(length)
            if idx_bucket != -1: buckets[idx_bucket].append(i)

        # Walk backwards so popping does not shift indices still to visit.
        for i in range(len(buckets) - 1, -1, -1):
            if len(buckets[i]) == 0:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        total_batch_size = self.num_replicas * self.batch_size
        num_samples_per_bucket = []

        for bucket in buckets:
            len_bucket = len(bucket)
            # Round up to the next multiple of total_batch_size.
            num_samples_per_bucket.append(len_bucket + ((total_batch_size - (len_bucket % total_batch_size)) % total_batch_size))

        return buckets, num_samples_per_bucket

    def __iter__(self):
        """Yield this rank's batches (lists of dataset indices) for the epoch."""
        g = torch.Generator()
        g.manual_seed(self.epoch)
        indices, batches = [], []

        if self.shuffle:
            for bucket in self.buckets:
                indices.append(torch.randperm(len(bucket), generator=g).tolist())
        else:
            for bucket in self.buckets:
                indices.append(list(range(len(bucket))))

        for i, bucket in enumerate(self.buckets):
            len_bucket = len(bucket)
            ids_bucket = indices[i]
            rem = self.num_samples_per_bucket[i] - len_bucket
            # Pad by repeating the bucket, then keep every num_replicas-th
            # entry starting at this rank.
            ids_bucket = (ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)])[self.rank :: self.num_replicas]

            for j in range(len(ids_bucket) // self.batch_size):
                batches.append([bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]])

        if self.shuffle: batches = [batches[i] for i in torch.randperm(len(batches), generator=g).tolist()]
        self.batches = batches
        assert len(self.batches) * self.batch_size == self.num_samples

        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """Return the bucket index whose range contains ``x``, or -1."""
        if hi is None: hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2

            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: return mid
            elif x <= self.boundaries[mid]: return self._bisect(x, lo, mid)
            else: return self._bisect(x, mid + 1, hi)
        else: return -1

    def __len__(self):
        """Number of batches this rank yields per epoch."""
        return self.num_samples // self.batch_size
def feature_loss(fmap_r, fmap_g):
    """Feature-matching loss: summed mean absolute difference of feature maps, times 2.

    ``fmap_r`` / ``fmap_g`` are iterables (one entry per discriminator) of
    iterables of tensors; the real maps are detached before comparison.
    """
    total = 0
    for real_maps, fake_maps in zip(fmap_r, fmap_g):
        for real_layer, fake_layer in zip(real_maps, fake_maps):
            difference = real_layer.float().detach() - fake_layer.float()
            total += difference.abs().mean()

    return total * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    Returns ``(total, real_terms, fake_terms)`` where the two lists hold the
    per-discriminator terms as Python floats.
    """
    total = 0
    real_terms, fake_terms = [], []

    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = (1 - real_out.float()).pow(2).mean()
        fake_term = fake_out.float().pow(2).mean()

        total += real_term + fake_term
        real_terms.append(real_term.item())
        fake_terms.append(fake_term.item())

    return total, real_terms, fake_terms


def generator_loss(disc_outputs):
    """Least-squares generator loss.

    Returns ``(total, terms)``; unlike ``discriminator_loss`` the per-discriminator
    terms are kept as tensors.
    """
    total = 0
    terms = []

    for fake_out in disc_outputs:
        term = (1 - fake_out.float()).pow(2).mean()
        terms.append(term)
        total += term

    return total, terms


def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """Masked KL term between posterior and prior, averaged over ``z_mask``.

    All inputs are cast to float32 first.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask))

    scaled_sq_error = 0.5 * ((z_p - m_p) ** 2) * (-2.0 * logs_p).exp()
    kl = logs_p - logs_q - 0.5
    kl += scaled_sq_error

    masked_sum = (kl * z_mask).sum()
    return masked_sum / z_mask.sum()
class MultiScaleMelSpectrogramLoss(torch.nn.Module):
    """Sum of a spectral loss over log10-mel spectrograms at several STFT resolutions.

    Args:
        sample_rate: Sample rate passed to ``librosa.filters.mel``.
        n_mels: Number of mel bands for each scale.
        window_lengths: FFT/window size for each scale, paired with ``n_mels``
            by position (``zip`` stops at the shorter sequence).
        loss_fn: Callable comparing two log-mel tensors. ``None`` (default)
            creates a fresh ``torch.nn.L1Loss`` for this instance.
    """

    def __init__(self, sample_rate=24000, n_mels=(5, 10, 20, 40, 80, 160, 320), window_lengths=(32, 64, 128, 256, 512, 1024, 2048), loss_fn=None):
        super().__init__()
        self.sample_rate = sample_rate
        # A default evaluated in the signature would be one module object shared
        # by every instance; build it per instance instead.
        self.loss_fn = torch.nn.L1Loss() if loss_fn is None else loss_fn
        self.log_base = torch.tensor(10.0).log()
        self.hann_window = {}
        self.mel_banks = {}
        self.stft_params = list(zip(n_mels, window_lengths))

    def mel_spectrogram(self, wav, n_mels, window_length):
        """Return the mel-projected STFT magnitude of ``wav`` for one scale.

        Windows and mel filterbanks are cached per (size, dtype, device).
        """
        dtype_device = str(wav.dtype) + "_" + str(wav.device)
        win_key = str(window_length) + "_" + dtype_device
        # The filterbank shape depends on both n_mels and n_fft, so both must be
        # in the key; otherwise two scales sharing n_mels would reuse a bank
        # built for the wrong FFT size.
        mel_key = str(n_mels) + "_" + str(window_length) + "_" + dtype_device

        if win_key not in self.hann_window:
            self.hann_window[win_key] = torch.hann_window(window_length, device=wav.device, dtype=torch.float32)

        wav = wav.float().squeeze(1)
        window = self.hann_window[win_key]

        if str(wav.device).startswith(("ocl", "privateuseone")):
            # On these backends the STFT is computed on the CPU and the
            # magnitude is moved back to the input device afterwards.
            stft = torch.stft(wav.cpu(), n_fft=window_length, hop_length=window_length // 4, window=window.cpu(), return_complex=True)
            magnitude = (stft.real.pow(2) + stft.imag.pow(2) + 1e-6).sqrt().to(wav.device, dtype=torch.float32)
        else:
            stft = torch.stft(wav, n_fft=window_length, hop_length=window_length // 4, window=window, return_complex=True)
            magnitude = (stft.real.pow(2) + stft.imag.pow(2) + 1e-6).sqrt()

        if mel_key not in self.mel_banks:
            self.mel_banks[mel_key] = torch.from_numpy(librosa.filters.mel(sr=self.sample_rate, n_mels=n_mels, n_fft=window_length, fmin=0, fmax=None)).to(device=wav.device, dtype=torch.float32)

        return self.mel_banks[mel_key] @ magnitude

    def forward(self, real, fake):
        """Return ``loss_fn`` summed over all scales on log10 mel spectrograms."""
        loss = 0.0
        for n_mels, window_length in self.stft_params:
            real_log_mel = self.mel_spectrogram(real, n_mels, window_length).clamp(min=1e-5).log() / self.log_base
            fake_log_mel = self.mel_spectrogram(fake, n_mels, window_length).clamp(min=1e-5).log() / self.log_base
            loss += self.loss_fn(real_log_mel, fake_log_mel)

        return loss
_TRUE_STRINGS = ("y", "yes", "t", "true", "on", "1")
_FALSE_STRINGS = ("n", "no", "f", "false", "off", "0")


def _str_to_bool(value):
    """Convert a command-line truth string to ``bool``.

    Accepts the same case-insensitive spellings as ``distutils.util.strtobool``
    without needing ``distutils``, which was removed in Python 3.12.

    Raises:
        argparse.ArgumentTypeError: for any other string, so argparse reports a
            normal usage error.
    """
    text = value.lower()
    if text in _TRUE_STRINGS:
        return True
    if text in _FALSE_STRINGS:
        return False

    raise argparse.ArgumentTypeError(f"invalid truth value {value!r}")


def parse_arguments():
    """Parse the training command line (``sys.argv``) and return the namespace.

    ``--model_name`` and ``--save_every_epoch`` are required. Boolean options
    take an explicit value (e.g. ``--cleanup true``); ``--train`` is a plain flag.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", action='store_true')
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--rvc_version", type=str, default="v2")
    parser.add_argument("--save_every_epoch", type=int, required=True)
    parser.add_argument("--save_only_latest", type=_str_to_bool, default=True)
    parser.add_argument("--save_every_weights", type=_str_to_bool, default=True)
    parser.add_argument("--total_epoch", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--gpu", type=str, default="0")
    parser.add_argument("--pitch_guidance", type=_str_to_bool, default=True)
    parser.add_argument("--g_pretrained_path", type=str, default="")
    parser.add_argument("--d_pretrained_path", type=str, default="")
    parser.add_argument("--overtraining_detector", type=_str_to_bool, default=False)
    parser.add_argument("--overtraining_threshold", type=int, default=50)
    parser.add_argument("--cleanup", type=_str_to_bool, default=False)
    parser.add_argument("--cache_data_in_gpu", type=_str_to_bool, default=False)
    parser.add_argument("--model_author", type=str)
    parser.add_argument("--vocoder", type=str, default="Default")
    parser.add_argument("--checkpointing", type=_str_to_bool, default=False)
    parser.add_argument("--deterministic", type=_str_to_bool, default=False)
    parser.add_argument("--benchmark", type=_str_to_bool, default=False)
    parser.add_argument("--optimizer", type=str, default="AdamW")
    parser.add_argument("--energy_use", type=_str_to_bool, default=False)
    parser.add_argument("--use_custom_reference", type=_str_to_bool, default=False)
    parser.add_argument("--reference_path", type=str, default="")
    parser.add_argument("--multiscale_mel_loss", type=_str_to_bool, default=False)

    return parser.parse_args()
def main():
    """Validate the experiment, pick a device and launch the training worker(s).

    Driven entirely by the module-level command-line settings:

    1. Log the effective settings at debug level.
    2. Require at least one ``sliced_audios/*.wav`` whose sample rate equals
       ``config.data.sample_rate``; otherwise warn and exit with status 1.
    3. Choose the device from ``main_config.device`` and the ``--gpu`` ids
       (dash-separated), falling back to CPU with a warning.
    4. If ``cleanup`` is set, delete ``*.0`` files, ``D_*/G_*.pth`` checkpoints,
       ``added*/trained*.index`` files and ``eval`` folders under the
       experiment directory.
    5. Restore the overtraining-detector history and start one ``run`` process
       per entry in ``gpus``, waiting for all of them.

    Any ``Exception`` is logged rather than propagated. ``sys.exit`` still
    terminates the process because ``SystemExit`` is not an ``Exception``.
    """
    global training_file_path, last_loss_gen_all, smoothed_loss_gen_history, loss_gen_history, loss_disc_history, smoothed_loss_disc_history, overtrain_save_epoch, model_author, vocoder, checkpointing, gpus, energy_use

    log_data = {
        translations['modelname']: model_name,
        translations["save_every_epoch"]: save_every_epoch,
        translations["total_e"]: total_epoch,
        translations["dorg"].format(pretrainG=pretrainG, pretrainD=pretrainD): "",
        translations['training_version']: version,
        "Gpu": gpus,
        translations['batch_size']: batch_size,
        translations['training_f0']: pitch_guidance,
        translations['save_only_latest']: save_only_latest,
        translations['save_every_weights']: save_every_weights,
        translations['cache_in_gpu']: cache_data_in_gpu,
        translations['overtraining_detector']: overtraining_detector,
        translations['threshold']: overtraining_threshold,
        translations['cleanup_training']: cleanup,
        translations['memory_efficient_training']: checkpointing,
        translations["optimizer"]: optimizer_choice,
        translations["train&energy"]: energy_use,
        translations["multiscale_mel_loss"]: multiscale_mel_loss
    }

    if model_author: log_data[translations["model_author"].format(model_author=model_author)] = ""
    if vocoder != "Default": log_data[translations['vocoder']] = vocoder

    for key, value in log_data.items():
        logger.debug(f"{key}: {value}" if value != "" else f"{key} {value}")

    try:
        # Rendezvous settings read by init_process_group(init_method="env://") in run().
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = str(randint(20000, 55555))

        wavs = glob.glob(os.path.join(os.path.join(experiment_dir, "sliced_audios"), "*.wav"))
        if not wavs:
            logger.warning(translations["not_found_dataset"])
            sys.exit(1)

        # Only the first file is inspected.
        _, sr = load_wav_to_torch(wavs[0])
        if sr != config.data.sample_rate:
            logger.warning(translations["training_sr"].format(sr_1=config.data.sample_rate, sr_2=sr))
            sys.exit(1)

        if torch.cuda.is_available() and main_config.device.startswith("cuda"):
            device, gpus = torch.device("cuda"), [int(item) for item in gpus.split("-")]
            n_gpus = len(gpus)
        elif opencl.is_available() and main_config.device.startswith("ocl"):
            device, gpus = torch.device("ocl"), [int(item) for item in gpus.split("-")]
            n_gpus = len(gpus)
        elif directml.is_available() and main_config.device.startswith("privateuseone"):
            device, gpus = torch.device("privateuseone"), [int(item) for item in gpus.split("-")]
            n_gpus = len(gpus)
        elif torch.backends.mps.is_available() and main_config.device.startswith("mps"):
            device, gpus = torch.device("mps"), [0]
            n_gpus = 1
        else:
            device, gpus = torch.device("cpu"), [0]
            n_gpus = 1
            logger.warning(translations["not_gpu"])

        def start():
            """Spawn one ``run`` process per device id and record their PIDs in config.json."""
            children = []
            pid_data = {"process_pids": []}

            # The PID list is stored next to the existing configuration, so the
            # current file content is merged in before it is rewritten.
            with open(config_save_path, "r") as pid_file:
                try:
                    pid_data.update(json.load(pid_file))
                except json.JSONDecodeError:
                    pass

            # NOTE(review): a "process_pids" list already present in the file is
            # kept and appended to, so PIDs of earlier runs accumulate — confirm
            # that whatever consumes this list expects that.
            with open(config_save_path, "w") as pid_file:
                for rank, device_id in enumerate(gpus):
                    subproc = mp.Process(target=run, args=(rank, n_gpus, experiment_dir, pretrainG, pretrainD, pitch_guidance, total_epoch, save_every_weights, config, device, device_id, model_author, vocoder, checkpointing, energy_use))
                    children.append(subproc)
                    subproc.start()
                    pid_data["process_pids"].append(subproc.pid)

                json.dump(pid_data, pid_file, indent=4)

            for i in range(n_gpus):
                children[i].join()

        def load_from_json(file_path):
            """Return the four saved loss histories, or four empty lists if the file is absent."""
            if os.path.exists(file_path):
                with open(file_path, "r") as f:
                    data = json.load(f)
                    return (data.get("loss_disc_history", []), data.get("smoothed_loss_disc_history", []), data.get("loss_gen_history", []), data.get("smoothed_loss_gen_history", []))

            return [], [], [], []

        def continue_overtrain_detector(training_file_path):
            """Reload the module-level loss histories saved by a previous run."""
            # The ``global`` statement in main() does not extend to nested
            # functions; without this declaration the loaded lists were bound to
            # locals of this helper and silently thrown away.
            global loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history

            if overtraining_detector and os.path.exists(training_file_path):
                (loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history) = load_from_json(training_file_path)

        if cleanup:
            # Bottom-up walk so the content of an "eval" folder is handled
            # before the folder itself is removed.
            for root, dirs, files in os.walk(experiment_dir, topdown=False):
                for name in files:
                    file_path = os.path.join(root, name)
                    file_name, file_extension = os.path.splitext(name)
                    if (file_extension == ".0" or (file_name.startswith(("D_", "G_")) and file_extension == ".pth") or (file_name.startswith(("added", "trained")) and file_extension == ".index")): os.remove(file_path)

                for name in dirs:
                    if name == "eval":
                        folder_path = os.path.join(root, name)

                        for item in os.listdir(folder_path):
                            item_path = os.path.join(folder_path, item)
                            if os.path.isfile(item_path): os.remove(item_path)

                        os.rmdir(folder_path)

        continue_overtrain_detector(training_file_path)
        start()
    except Exception as e:
        logger.error(f"{translations['training_error']} {e}")
        import traceback
        logger.debug(traceback.format_exc())
def run(rank, n_gpus, experiment_dir, pretrainG, pretrainD, pitch_guidance, custom_total_epoch, custom_save_every_weights, config, device, device_id, model_author, vocoder, checkpointing, energy_use):
    """Training worker: build data, models and optimizers, then loop over epochs.

    Executed once per device by ``main``. The worker

    * joins the process group (``nccl`` only for CUDA off Windows, else ``gloo``);
    * seeds the RNGs from ``config.train.seed``;
    * builds the bucketed ``DataLoader``, generator, discriminator and optimizers;
    * resumes from the latest ``G``/``D`` checkpoints, or — if that fails —
      starts at epoch 1 from the optional pretrained weights;
    * prepares the inference ``reference`` used for audio previews;
    * calls ``train_and_evaluate`` for every remaining epoch.

    Exits with status 1 when the loader yields fewer than 3 batches or the
    pretrained weights cannot be loaded.
    """
    global global_step, smoothed_value_gen, smoothed_value_disc, optimizer_choice

    smoothed_value_gen, smoothed_value_disc = 0, 0
    is_cuda = device.type == "cuda"

    # Non-CUDA devices always form a single-process group (world size 1, rank 0).
    dist.init_process_group(backend="gloo" if sys.platform == "win32" or not is_cuda else "nccl", init_method="env://", world_size=n_gpus if is_cuda else 1, rank=rank if is_cuda else 0)

    torch.manual_seed(config.train.seed)
    if is_cuda: torch.cuda.manual_seed(config.train.seed)
    elif device.type == "ocl": opencl.pytorch_ocl.manual_seed_all(config.train.seed)

    if torch.cuda.is_available(): torch.cuda.set_device(device_id)

    # Only rank 0 writes TensorBoard summaries.
    writer_eval = SummaryWriter(log_dir=os.path.join(experiment_dir, "eval")) if rank == 0 else None

    from main.inference.training.data_utils import (
        DistributedBucketSampler,
        TextAudioCollate,
        TextAudioLoader
    )

    train_dataset = TextAudioLoader(config.data, pitch_guidance=pitch_guidance, energy=energy_use)
    train_loader = DataLoader(
        train_dataset,
        num_workers=4,
        shuffle=False,
        pin_memory=True,
        collate_fn=TextAudioCollate(pitch_guidance=pitch_guidance, energy=energy_use),
        batch_sampler=DistributedBucketSampler(train_dataset, batch_size * n_gpus, [50, 100, 200, 300, 400, 500, 600, 700, 800, 900], num_replicas=n_gpus, rank=rank, shuffle=True),
        persistent_workers=True,
        prefetch_factor=8
    )

    if len(train_loader) < 3:
        logger.warning(translations["not_enough_data"])
        sys.exit(1)

    from main.library.algorithm.synthesizers import Synthesizer
    from main.library.algorithm.discriminators import MultiPeriodDiscriminator

    net_g = Synthesizer(
        config.data.filter_length // 2 + 1,
        config.train.segment_size // config.data.hop_length,
        **config.model,
        use_f0=pitch_guidance,
        sr=config.data.sample_rate,
        vocoder=vocoder,
        checkpointing=checkpointing,
        energy=energy_use
    )
    net_d = MultiPeriodDiscriminator(
        version,
        config.model.use_spectral_norm,
        checkpointing=checkpointing
    )

    net_g, net_d = (net_g.cuda(device_id), net_d.cuda(device_id)) if torch.cuda.is_available() else (net_g.to(device), net_d.to(device))

    if optimizer_choice == "AnyPrecisionAdamW" and main_config.brain:
        from main.inference.training.anyprecision_optimizer import AnyPrecisionAdamW
        optimizer_optim = AnyPrecisionAdamW
    elif optimizer_choice == "RAdam":
        optimizer_optim = torch.optim.RAdam
    else:
        optimizer_optim = torch.optim.AdamW

    optim_g = optimizer_optim(net_g.parameters(), config.train.learning_rate * g_lr_coeff, betas=config.train.betas, eps=config.train.eps)
    optim_d = optimizer_optim(net_d.parameters(), config.train.learning_rate * d_lr_coeff, betas=config.train.betas, eps=config.train.eps)
    fn_mel_loss = MultiScaleMelSpectrogramLoss(sample_rate=config.data.sample_rate) if multiscale_mel_loss else torch.nn.L1Loss()

    # DirectML / OpenCL models are used unwrapped.
    if not device.type.startswith(("privateuseone", "ocl")):
        net_g, net_d = (DDP(net_g, device_ids=[device_id]), DDP(net_d, device_ids=[device_id])) if torch.cuda.is_available() else (DDP(net_g), DDP(net_d))

    scaler_dict = {}
    try:
        logger.info(translations["start_training"])

        _, _, _, epoch_str, scaler_dict = load_checkpoint(logger, (os.path.join(experiment_dir, "D_latest.pth") if save_only_latest else latest_checkpoint_path(experiment_dir, "D_*.pth")), net_d, optim_d)
        _, _, _, epoch_str, _ = load_checkpoint(logger, (os.path.join(experiment_dir, "G_latest.pth") if save_only_latest else latest_checkpoint_path(experiment_dir, "G_*.pth")), net_g, optim_g)

        epoch_str += 1
        global_step = (epoch_str - 1) * len(train_loader)
    except Exception:
        # No usable checkpoint: start from scratch, optionally from pretrained
        # weights. A bare ``except:`` here would also swallow KeyboardInterrupt
        # and SystemExit raised while resuming.
        # NOTE(review): assumes load_checkpoint / latest_checkpoint_path signal a
        # missing checkpoint with an ordinary Exception — confirm in training/utils.
        check = ["", "None"]
        epoch_str, global_step = 1, 0
        strict = main_configs.get("pretrain_strict", True)
        try:
            if pretrainG not in check:
                if rank == 0: logger.info(translations["import_pretrain"].format(dg="G", pretrain=pretrainG))

                ckptG = torch.load(pretrainG, map_location="cpu", weights_only=True)["model"]
                net_g.module.load_state_dict(ckptG, strict=strict) if hasattr(net_g, "module") else net_g.load_state_dict(ckptG, strict=strict)
                del ckptG

            if pretrainD not in check:
                if rank == 0: logger.info(translations["import_pretrain"].format(dg="D", pretrain=pretrainD))

                ckptD = torch.load(pretrainD, map_location="cpu", weights_only=True)["model"]
                net_d.module.load_state_dict(ckptD, strict=strict) if hasattr(net_d, "module") else net_d.load_state_dict(ckptD, strict=strict)
                del ckptD
        except Exception as e:
            logger.warning(translations["checkpointing_err"])
            logger.error(e)
            sys.exit(1)

    # last_epoch = epoch_str - 2 so a resumed run continues the decay schedule.
    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=config.train.lr_decay, last_epoch=epoch_str - 2)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=config.train.lr_decay, last_epoch=epoch_str - 2)
    scaler = GradScaler(device=device, enabled=main_config.is_half and device.type == "cuda")
    cache = []

    if len(scaler_dict) > 0: scaler.load_state_dict(scaler_dict)

    if use_custom_reference and os.path.isfile(os.path.join(reference_path, "feats.npy")):
        import numpy as np

        logger.info(translations["using_reference"].format(reference_name=reference_path))
        phone = np.repeat(np.load(os.path.join(reference_path, "feats.npy")), 2, axis=0)

        reference = (
            torch.FloatTensor(phone).unsqueeze(0).to(device),
            torch.LongTensor([phone.shape[0]]).to(device),
            torch.LongTensor(np.load(os.path.join(reference_path, "pitch_coarse.npy"))[:-1]).unsqueeze(0).to(device) if pitch_guidance else None,
            torch.FloatTensor(np.load(os.path.join(reference_path, "pitch_fine.npy"))[:-1]).unsqueeze(0).to(device) if pitch_guidance else None,
            torch.LongTensor([0]).to(device),
            torch.FloatTensor(np.load(os.path.join(reference_path, "energy.npy"))[:-1]).unsqueeze(0).to(device) if energy_use else None
        )
    else:
        # Fall back to the first training batch. The positions of the speaker id
        # and energy shift by two when the batch carries no pitch tensors.
        info = next(iter(train_loader))
        reference = (info[0].to(device), info[1].to(device))

        if pitch_guidance:
            reference += (info[2].to(device), info[3].to(device), info[8].to(device))
            reference += (info[9].to(device),) if energy_use else (None,)
        else:
            reference += (None, None, info[6].to(device))
            reference += (info[7].to(device),) if energy_use else (None,)

    for epoch in range(epoch_str, total_epoch + 1):
        train_and_evaluate(rank, epoch, config, [net_g, net_d], [optim_g, optim_d], scaler, train_loader, writer_eval, cache, custom_save_every_weights, custom_total_epoch, device, device_id, reference, model_author, vocoder, energy_use, fn_mel_loss)
        scheduler_g.step()
        scheduler_d.step()
enumerate(train_loader): - cache.append((batch_idx, [tensor.to(device_id if device.type == "ocl" else device, non_blocking=True) for tensor in info])) - else: shuffle(cache) - else: data_iterator = enumerate(train_loader) - - epoch_recorder = EpochRecorder() - - autocast_enabled = main_config.is_half and device.type == "cuda" - autocast_dtype = torch.float32 if not autocast_enabled else (torch.bfloat16 if main_config.brain else torch.float16) - autocasts = autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype) if not device.type.startswith("ocl") else nullcontext() - - with tqdm(total=len(train_loader), leave=False) as pbar: - for batch_idx, info in data_iterator: - if device.type == "cuda" and not cache_data_in_gpu: info = [tensor.cuda(device_id, non_blocking=True) for tensor in info] - elif device.type in ["privateuseone", "ocl"] and not cache_data_in_gpu: info = [tensor.to(device_id if device.type == "ocl" else device, non_blocking=True) for tensor in info] - else: info = [tensor.to(device) for tensor in info] - - phone, phone_lengths = info[0], info[1] - if pitch_guidance: - pitch, pitchf = info[2], info[3] - spec, spec_lengths, wave, sid = info[4], info[5], info[6], info[8] - energy = info[9] if energy_use else None - else: - pitch = pitchf = None - spec, spec_lengths, wave, sid = info[2], info[3], info[4], info[6] - energy = info[7] if energy_use else None - - with autocasts: - y_hat, ids_slice, _, z_mask, (_, z_p, m_p, logs_p, _, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid, energy) - wave = commons.slice_segments(wave, ids_slice * config.data.hop_length, config.train.segment_size, dim=3) - - for _ in range(d_step_per_g_step): - with autocasts: - y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) - loss_disc, losses_disc_r, losses_disc_g = losses.discriminator_loss(y_d_hat_r, y_d_hat_g) - - optim_d.zero_grad() - - if autocast_enabled: - scaler.scale(loss_disc).backward() - scaler.unscale_(optim_d) - 
grad_norm_d = commons.clip_grad_value(net_d.parameters(), None) - scaler.step(optim_d) - else: - loss_disc.backward() - grad_norm_d = commons.clip_grad_value(net_d.parameters(), None) - optim_d.step() - - with autocasts: - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) - - if multiscale_mel_loss: - loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 - else: - y_hat_mel = mel_spectrogram_torch( - y_hat.float().squeeze(1), - config.data.filter_length, - config.data.n_mel_channels, - config.data.sample_rate, - config.data.hop_length, - config.data.win_length, - config.data.mel_fmin, - config.data.mel_fmax - ) - loss_mel = fn_mel_loss( - mel_spectrogram_torch( - wave.float().squeeze(1), - config.data.filter_length, - config.data.n_mel_channels, - config.data.sample_rate, - config.data.hop_length, - config.data.win_length, - config.data.mel_fmin, - config.data.mel_fmax - ), - y_hat_mel - ) * config.train.c_mel - - if device.type == "privateuseone": - loss_kl = (losses.kl_loss(z_p.detach().cpu(), logs_q.detach().cpu(), m_p.detach().cpu(), logs_p.detach().cpu(), z_mask.detach().cpu()) * config.train.c_kl).to(device) - else: - loss_kl = losses.kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl - - loss_fm = losses.feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = losses.generator_loss(y_d_hat_g) - - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl - if loss_gen_all < lowest_value["value"]: lowest_value = {"step": global_step, "value": loss_gen_all, "epoch": epoch} - - optim_g.zero_grad() - if autocast_enabled: - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - else: - loss_gen_all.backward() - grad_norm_g = commons.clip_grad_value(net_g.parameters(), None) - optim_g.step() - - global_step += 1 - - avg_losses["grad_d_50"].append(grad_norm_d) - avg_losses["grad_g_50"].append(grad_norm_g) - 
avg_losses["disc_loss_50"].append(loss_disc.detach()) - avg_losses["adv_loss_50"].append(loss_gen.detach()) - avg_losses["fm_loss_50"].append(loss_fm.detach()) - avg_losses["kl_loss_50"].append(loss_kl.detach()) - avg_losses["mel_loss_50"].append(loss_mel.detach()) - avg_losses["gen_loss_50"].append(loss_gen_all.detach()) - - if rank == 0 and global_step % 50 == 0: - scalar_dict = { - "grad_avg_50/norm_d": sum(avg_losses["grad_d_50"]) / len(avg_losses["grad_d_50"]), - "grad_avg_50/norm_g": sum(avg_losses["grad_g_50"]) / len(avg_losses["grad_g_50"]), - "loss_avg_50/d/adv": torch.stack(list(avg_losses["disc_loss_50"])).mean(), - "loss_avg_50/g/adv": torch.stack(list(avg_losses["adv_loss_50"])).mean(), - "loss_avg_50/g/fm": torch.stack(list(avg_losses["fm_loss_50"])).mean(), - "loss_avg_50/g/kl": torch.stack(list(avg_losses["kl_loss_50"])).mean(), - "loss_avg_50/g/mel": torch.stack(list(avg_losses["mel_loss_50"])).mean(), - "loss_avg_50/g/total": torch.stack(list(avg_losses["gen_loss_50"])).mean() - } - - summarize( - writer=writer, - global_step=global_step, - scalars=scalar_dict - ) - - pbar.update(1) - - with torch.no_grad(): - clear_gpu_cache() - - if rank == 0: - mel = spec_to_mel_torch( - spec, - config.data.filter_length, - config.data.n_mel_channels, - config.data.sample_rate, - config.data.mel_fmin, - config.data.mel_fmax - ) - y_mel = commons.slice_segments( - mel, - ids_slice, - config.train.segment_size // config.data.hop_length, - dim=3 - ) - y_hat_mel = mel_spectrogram_torch( - y_hat.float().squeeze(1), - config.data.filter_length, - config.data.n_mel_channels, - config.data.sample_rate, - config.data.hop_length, - config.data.win_length, - config.data.mel_fmin, - config.data.mel_fmax - ) - - scalar_dict = { - "loss/g/total": loss_gen_all, - "loss/d/adv": loss_disc, - "learning_rate": optim_g.param_groups[0]["lr"], - "grad/norm_d": grad_norm_d, - "grad/norm_g": grad_norm_g, - "loss/g/adv": loss_gen, - "loss/g/fm": loss_fm, - "loss/g/mel": loss_mel, - 
"loss/g/kl": loss_kl - } - - scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)}) - scalar_dict.update({f"loss/d_r/{i}": v for i, v in enumerate(losses_disc_r)}) - scalar_dict.update({f"loss/d_g/{i}": v for i, v in enumerate(losses_disc_g)}) - - image_dict = { - "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), - "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - } - - if epoch % save_every_epoch == 0: - with autocasts: - with torch.no_grad(): - o, *_ = net_g.module.infer(*reference) if hasattr(net_g, "module") else net_g.infer(*reference) - - summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict, - audios={ - f"gen/audio_{global_step:07d}": o[0, :, :] - }, - audio_sample_rate=config.data.sample_rate - ) - else: - summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict - ) - - def check_overtraining(smoothed_loss_history, threshold, epsilon=0.004): - if len(smoothed_loss_history) < threshold + 1: return False - - for i in range(-threshold, -1): - if smoothed_loss_history[i + 1] > smoothed_loss_history[i]: return True - if abs(smoothed_loss_history[i + 1] - smoothed_loss_history[i]) >= epsilon: return False - - return True - - def update_exponential_moving_average(smoothed_loss_history, new_value, smoothing=0.987): - smoothed_value = new_value if not smoothed_loss_history else (smoothing * smoothed_loss_history[-1] + (1 - smoothing) * new_value) - smoothed_loss_history.append(smoothed_value) - - return smoothed_value - - def save_to_json(file_path, loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history): - with open(file_path, "w") as f: - json.dump({"loss_disc_history": loss_disc_history, "smoothed_loss_disc_history": smoothed_loss_disc_history, "loss_gen_history": loss_gen_history, 
"smoothed_loss_gen_history": smoothed_loss_gen_history}, f) - - model_add, model_del = [], [] - done = False - - if rank == 0: - if epoch % save_every_epoch == False: - checkpoint_suffix = f"{'latest' if save_only_latest else global_step}.pth" - - save_checkpoint( - logger, - net_g, - optim_g, - config.train.learning_rate, - epoch, - os.path.join(experiment_dir, "G_" + checkpoint_suffix), - scaler - ) - save_checkpoint( - logger, - net_d, - optim_d, - config.train.learning_rate, - epoch, - os.path.join(experiment_dir, "D_" + checkpoint_suffix), - scaler - ) - - if custom_save_every_weights: model_add.append(os.path.join(main_configs["weights_path"], f"{model_name}_{epoch}e_{global_step}s.pth")) - - if overtraining_detector and epoch > 1: - current_loss_disc, current_loss_gen = float(loss_disc), float(lowest_value["value"]) - - loss_disc_history.append(current_loss_disc) - loss_gen_history.append(current_loss_gen) - - smoothed_value_disc = update_exponential_moving_average(smoothed_loss_disc_history, current_loss_disc) - smoothed_value_gen = update_exponential_moving_average(smoothed_loss_gen_history, current_loss_gen) - - is_overtraining_disc = check_overtraining(smoothed_loss_disc_history, overtraining_threshold * 2) - is_overtraining_gen = check_overtraining(smoothed_loss_gen_history, overtraining_threshold, 0.01) - - consecutive_increases_disc = (consecutive_increases_disc + 1) if is_overtraining_disc else 0 - consecutive_increases_gen = (consecutive_increases_gen + 1) if is_overtraining_gen else 0 - - if epoch % save_every_epoch == 0: save_to_json(training_file_path, loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history) - - if (is_overtraining_gen and consecutive_increases_gen == overtraining_threshold or is_overtraining_disc and consecutive_increases_disc == (overtraining_threshold * 2)): - logger.info(translations["overtraining_find"].format(epoch=epoch, smoothed_value_gen=f"{smoothed_value_gen:.3f}", 
smoothed_value_disc=f"{smoothed_value_disc:.3f}")) - done = True - else: - logger.info(translations["best_epoch"].format(epoch=epoch, smoothed_value_gen=f"{smoothed_value_gen:.3f}", smoothed_value_disc=f"{smoothed_value_disc:.3f}")) - for file in glob.glob(os.path.join(main_configs["weights_path"], f"{model_name}_*e_*s_best_epoch.pth")): - model_del.append(file) - - model_add.append(os.path.join(main_configs["weights_path"], f"{model_name}_{epoch}e_{global_step}s_best_epoch.pth")) - - if epoch >= custom_total_epoch: - logger.info(translations["success_training"].format(epoch=epoch, global_step=global_step, loss_gen_all=round(loss_gen_all.item(), 3))) - logger.info(translations["training_info"].format(lowest_value_rounded=round(float(lowest_value["value"]), 3), lowest_value_epoch=lowest_value['epoch'], lowest_value_step=lowest_value['step'])) - model_add.append(os.path.join(main_configs["weights_path"], f"{model_name}_{epoch}e_{global_step}s.pth")) - done = True - - for m in model_del: - os.remove(m) - - if model_add: - ckpt = (net_g.module.state_dict() if hasattr(net_g, "module") else net_g.state_dict()) - for m in model_add: - extract_model(ckpt=ckpt, sr=config.data.sample_rate, pitch_guidance=pitch_guidance == True, name=model_name, model_path=m, epoch=epoch, step=global_step, version=version, hps=hps, model_author=model_author, vocoder=vocoder, energy_use=energy_use) - - lowest_value_rounded = round(float(lowest_value["value"]), 3) - - if epoch > 1 and overtraining_detector: logger.info(translations["model_training_info"].format(model_name=model_name, epoch=epoch, global_step=global_step, epoch_recorder=epoch_recorder.record(), lowest_value_rounded=lowest_value_rounded, lowest_value_epoch=lowest_value['epoch'], lowest_value_step=lowest_value['step'], remaining_epochs_gen=(overtraining_threshold - consecutive_increases_gen), remaining_epochs_disc=((overtraining_threshold * 2) - consecutive_increases_disc), smoothed_value_gen=f"{smoothed_value_gen:.3f}", 
smoothed_value_disc=f"{smoothed_value_disc:.3f}")) - elif epoch > 1 and overtraining_detector == False: logger.info(translations["model_training_info_2"].format(model_name=model_name, epoch=epoch, global_step=global_step, epoch_recorder=epoch_recorder.record(), lowest_value_rounded=lowest_value_rounded, lowest_value_epoch=lowest_value['epoch'], lowest_value_step=lowest_value['step'])) - else: logger.info(translations["model_training_info_3"].format(model_name=model_name, epoch=epoch, global_step=global_step, epoch_recorder=epoch_recorder.record())) - - logger.debug(f"loss_gen_all: {loss_gen_all} loss_gen: {loss_gen} loss_fm: {loss_fm} loss_mel: {loss_mel} loss_kl: {loss_kl}") - last_loss_gen_all = loss_gen_all - - if done: - pid_file_path = os.path.join(experiment_dir, "config.json") - with open(pid_file_path, "r") as pid_file: - pid_data = json.load(pid_file) - - with open(pid_file_path, "w") as pid_file: - pid_data.pop("process_pids", None) - json.dump(pid_data, pid_file, indent=4) - - if os.path.exists(os.path.join(experiment_dir, "train_pid.txt")): os.remove(os.path.join(experiment_dir, "train_pid.txt")) - sys.exit(0) - - with torch.no_grad(): - clear_gpu_cache() - -if __name__ == "__main__": - mp.set_start_method("spawn") - main() \ No newline at end of file diff --git a/main/inference/training/utils.py b/main/inference/training/utils.py deleted file mode 100644 index 404ef67f319dd2f88fa4c90a54596e9c8801860a..0000000000000000000000000000000000000000 --- a/main/inference/training/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import sys -import glob -import torch - -import numpy as np -import soundfile as sf -import matplotlib.pyplot as plt - -from collections import OrderedDict - -sys.path.append(os.getcwd()) - -from main.app.variables import config, translations - -MATPLOTLIB_FLAG = False - -def optimizer_device(optimizer, device="cpu"): - for state in optimizer.state.values(): - for k, v in state.items(): - if torch.is_tensor(v): state[k] = 
def replace_keys_in_dict(d, old_key_part, new_key_part):
    """Return a copy of ``d`` with a substring replaced in every string key.

    Nested dict values are processed recursively. Non-string keys and
    non-dict values are carried over unchanged. The result is an
    ``OrderedDict`` when the input is one, otherwise a plain dict; the
    input mapping itself is not modified.
    """
    renamed = OrderedDict() if isinstance(d, OrderedDict) else {}

    for key, value in d.items():
        if isinstance(value, dict):
            value = replace_keys_in_dict(value, old_key_part, new_key_part)
        if isinstance(key, str):
            key = key.replace(old_key_part, new_key_part)
        renamed[key] = value

    return renamed


def load_checkpoint(logger, checkpoint_path, model, optimizer=None, load_opt=1):
    """Load a checkpoint file into ``model`` (and optionally ``optimizer``).

    Legacy ``.weight_v`` / ``.weight_g`` key parts are renamed to the
    ``.parametrizations.weight.original1`` / ``original0`` form before the
    weights are applied. Entries missing from the checkpoint keep the
    model's current values, and loading is non-strict.

    Returns ``(model, optimizer, learning_rate, iteration, scaler_state)``,
    where a missing learning rate defaults to 0 and a missing scaler state
    to ``{}``.
    """
    assert os.path.isfile(checkpoint_path), translations["not_found_checkpoint"].format(checkpoint_path=checkpoint_path)

    raw = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    with_v = replace_keys_in_dict(raw, ".weight_v", ".parametrizations.weight.original1")
    checkpoint_dict = replace_keys_in_dict(with_v, ".weight_g", ".parametrizations.weight.original0")

    # Unwrap a parallel wrapper (anything exposing ``.module``) before touching weights.
    current_state = model.module.state_dict() if hasattr(model, "module") else model.state_dict()
    new_state_dict = {name: checkpoint_dict["model"].get(name, tensor) for name, tensor in current_state.items()}

    if hasattr(model, "module"):
        model.module.load_state_dict(new_state_dict, strict=False)
    else:
        model.load_state_dict(new_state_dict, strict=False)

    if optimizer and load_opt == 1:
        optimizer.load_state_dict(checkpoint_dict.get("optimizer", {}))

    logger.debug(translations["save_checkpoint"].format(checkpoint_path=checkpoint_path, checkpoint_dict=checkpoint_dict['iteration']))

    learning_rate = checkpoint_dict.get("learning_rate", 0)
    scaler_state = checkpoint_dict.get("scaler", {})
    return (model, optimizer, learning_rate, checkpoint_dict["iteration"], scaler_state)
def summarize(writer, global_step, scalars=None, histograms=None, images=None, audios=None, audio_sample_rate=22050):
    """Forward scalars, histograms, images and audio clips to ``writer``.

    Each mapping goes from tag to value; omitted (``None``) mappings are
    skipped. Images are written with ``dataformats="HWC"`` and audio with
    ``audio_sample_rate``.
    """
    # ``None`` defaults replace the former shared ``{}`` default objects.
    for tag, value in (scalars or {}).items():
        writer.add_scalar(tag, value, global_step)

    for tag, value in (histograms or {}).items():
        writer.add_histogram(tag, value, global_step)

    for tag, value in (images or {}).items():
        writer.add_image(tag, value, global_step, dataformats="HWC")

    for tag, value in (audios or {}).items():
        writer.add_audio(tag, value, global_step, audio_sample_rate)


def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    """Return the matching checkpoint whose file name holds the largest number.

    ``regex`` is a glob pattern joined onto ``dir_path``. Returns ``None``
    when nothing matches.
    """
    def numeric_suffix(path):
        # Only the file name is inspected, so digits in directory names
        # cannot distort the ordering.
        digits = "".join(filter(str.isdigit, os.path.basename(path)))
        # A name without digits (e.g. "G_latest.pth") used to crash on
        # int(""); it now sorts before every numbered checkpoint.
        return int(digits) if digits else -1

    checkpoints = sorted(glob.glob(os.path.join(dir_path, regex)), key=numeric_suffix)
    return checkpoints[-1] if checkpoints else None


def plot_spectrogram_to_numpy(spectrogram):
    """Render ``spectrogram`` with matplotlib and return the figure as a uint8 RGB array.

    The array has shape ``(height, width, 3)``. The matplotlib backend is
    switched to "Agg" on the first call only, tracked by the module-level
    ``MATPLOTLIB_FLAG``.
    """
    global MATPLOTLIB_FLAG

    if not MATPLOTLIB_FLAG:
        plt.switch_backend("Agg")
        MATPLOTLIB_FLAG = True

    fig, ax = plt.subplots(figsize=(10, 2))
    plt.colorbar(ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none"), ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()
    fig.canvas.draw()
    plt.close(fig)

    try:
        data = np.array(fig.canvas.renderer.buffer_rgba(), dtype=np.uint8).reshape(fig.canvas.get_width_height()[::-1] + (4,))[:, :, :3]
    except Exception:
        # Fallback for canvases without ``renderer.buffer_rgba``. A bare
        # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
        # ``np.frombuffer`` replaces the deprecated binary mode of
        # ``np.fromstring``; ``.copy()`` keeps the result writable as before.
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8).reshape(fig.canvas.get_width_height()[::-1] + (3,)).copy()

    return data
def load_filepaths_and_text(filename, split="|"):
    """Read a UTF-8 text file and split every stripped line on ``split``.

    Returns one list of fields per line, in file order.
    """
    rows = []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            rows.append(raw_line.strip().split(split))
    return rows


class HParams:
    """Hyper-parameter container usable both as an object and as a mapping.

    Keyword arguments become instance attributes; dict values are wrapped
    recursively in ``HParams``. Item access, membership, length and the
    ``keys``/``items``/``values`` views all operate on the instance
    ``__dict__``.
    """

    def __init__(self, **kwargs):
        for name, value in kwargs.items():
            if isinstance(value, dict):
                value = HParams(**value)
            self[name] = value

    def keys(self):
        return vars(self).keys()

    def items(self):
        return vars(self).items()

    def values(self):
        return vars(self).values()

    def __len__(self):
        return len(vars(self))

    def __getitem__(self, key):
        return vars(self)[key]

    def __setitem__(self, key, value):
        vars(self)[key] = value

    def __contains__(self, key):
        return key in vars(self)

    def __repr__(self):
        return repr(vars(self))
self.conv_v = nn.Conv1d(channels, channels, 1) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - - self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - nn.init.xavier_uniform_(self.conv_v.weight) - nn.init.xavier_uniform_(self.conv_o.weight) - - if proximal_init: - with torch.no_grad(): - self.conv_k.weight.copy_(self.conv_q.weight) - self.conv_k.bias.copy_(self.conv_q.bias) - - def forward(self, x, c, attn_mask=None): - q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c) - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - return self.conv_o(x) - - def attention(self, query, key, value, mask=None): - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - scores = (query / math.sqrt(self.k_channels)) @ key.transpose(-2, -1) - - if self.window_size is not None: - assert (t_s == t_t) - scores += self._relative_position_to_absolute_position(self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), self._get_relative_embeddings(self.emb_rel_k, t_s, onnx=self.onnx)), onnx=self.onnx) - - if self.proximal_bias: - assert t_s == t_t - scores += self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) - - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert (t_s == t_t) - scores = scores.masked_fill((torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)) == 0, -1e4) - - p_attn = self.drop(F.softmax(scores, 
dim=-1)) - output = p_attn @ value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - - if self.window_size is not None: output += self._matmul_with_relative_values(self._absolute_position_to_relative_position(p_attn, onnx=self.onnx), self._get_relative_embeddings(self.emb_rel_v, t_s, onnx=self.onnx)) - return (output.transpose(2, 3).contiguous().view(b, d, t_t)), p_attn - - def _matmul_with_relative_values(self, x, y): - return x @ y.unsqueeze(0) - - def _matmul_with_relative_keys(self, x, y): - return x @ y.unsqueeze(0).transpose(-2, -1) - - def _get_relative_embeddings(self, relative_embeddings, length, onnx=False): - if onnx: - pad_length = (length - (self.window_size + 1)).clamp(min=0) - slice_start_position = ((self.window_size + 1) - length).clamp(min=0) - - return (F.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0]) if pad_length > 0 else relative_embeddings)[:, slice_start_position:(slice_start_position + 2 * length - 1)] - else: - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - - return (F.pad(relative_embeddings, convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) if pad_length > 0 else relative_embeddings)[:, slice_start_position:(slice_start_position + 2 * length - 1)] - - def _relative_position_to_absolute_position(self, x, onnx=False): - batch, heads, length, _ = x.size() - - return (F.pad(F.pad(x, [0, 1, 0, 0, 0, 0, 0, 0]).view([batch, heads, length * 2 * length]), [0, length - 1, 0, 0, 0, 0]).view([batch, heads, length + 1, 2 * length - 1]) if onnx else F.pad(F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])).view([batch, heads, length * 2 * length]), convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])).view([batch, heads, length + 1, 2 * length - 1]))[:, :, :length, length - 1 :] - - def _absolute_position_to_relative_position(self, x, onnx=False): - batch, heads, length, _ = x.size() - - return (F.pad(F.pad(x, [0, length - 1, 0, 
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block with masking.

    The input is masked, padded and passed through ``conv_1``; the
    activation (``x * sigmoid(1.702 * x)`` when ``activation == "gelu"``,
    ReLU otherwise) and dropout follow; the result is masked, padded, passed
    through ``conv_2`` and masked again. Padding is causal (left only) or
    "same" depending on ``causal``. With ``onnx`` set, the pad amounts are
    passed to ``F.pad`` as a literal list instead of going through
    ``convert_pad_shape``.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0, activation=None, causal=False, onnx=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal
        self.onnx = onnx

        if causal:
            self.padding = self._causal_padding
        else:
            self.padding = self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        hidden = self.conv_1(self.padding(x * x_mask))

        if self.activation == "gelu":
            hidden = hidden * (1.702 * hidden).sigmoid()
        else:
            hidden = hidden.relu()

        hidden = self.drop(hidden) * x_mask
        return self.conv_2(self.padding(hidden)) * x_mask

    def _causal_padding(self, x):
        # A kernel of size 1 needs no padding.
        if self.kernel_size == 1:
            return x

        left = self.kernel_size - 1
        if self.onnx:
            return F.pad(x, [left, 0, 0, 0, 0, 0])
        return F.pad(x, convert_pad_shape([[0, 0], [0, 0], [left, 0]]))

    def _same_padding(self, x):
        if self.kernel_size == 1:
            return x

        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        if self.onnx:
            return F.pad(x, [left, right, 0, 0, 0, 0])
        return F.pad(x, convert_pad_shape([[0, 0], [0, 0], [left, right]]))
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise ``m.weight`` from N(mean, std) when the class name contains "Conv"."""
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    """Flatten a list of ``[before, after]`` pairs in reverse pair order.

    The pairs are reversed so the last pair comes first, which is the flat
    layout ``torch.nn.functional.pad`` takes.
    """
    return [item for sublist in pad_shape[::-1] for item in sublist]


def slice_segments(x, ids_str, segment_size=4, dim=2):
    """Cut one fixed-length segment out of every batch item.

    Args:
        x: batched tensor; sliced along axis 1 when ``dim == 2`` and along
            axis 2 when ``dim == 3``.
        ids_str: per-item start indices (indexable; each entry provides ``.item()``).
        segment_size: length of every segment.
        dim: 2 or 3, selecting the slicing layout described above.

    Returns:
        A tensor shaped like ``x`` with the sliced axis reduced to ``segment_size``.

    Raises:
        ValueError: if ``dim`` is neither 2 nor 3.
    """
    # Previously any other value left ``ret`` unbound and surfaced as an
    # UnboundLocalError; fail early with a clear message instead.
    if dim not in (2, 3):
        raise ValueError(f"slice_segments supports dim=2 or dim=3, got dim={dim}")

    if dim == 2:
        ret = torch.zeros_like(x[:, :segment_size])
    else:
        ret = torch.zeros_like(x[:, :, :segment_size])

    for i in range(x.size(0)):
        idx_str = ids_str[i].item()
        idx_end = idx_str + segment_size

        if dim == 2:
            ret[i] = x[i, idx_str:idx_end]
        else:
            ret[i] = x[i, :, idx_str:idx_end]

    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice a randomly positioned segment from each item of a 3-D batch.

    Start indices are drawn uniformly from
    ``[0, x_lengths - segment_size + 1)``; ``x_lengths`` defaults to the
    size of the last axis. Returns ``(segments, start_indices)``.
    """
    b, _, t = x.size()
    if x_lengths is None:
        x_lengths = t

    ids_str = (torch.rand([b]).to(device=x.device) * (x_lengths - segment_size + 1)).to(dtype=torch.long)

    return slice_segments(x, ids_str, segment_size, dim=3), ids_str


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: tanh of the first n_channels[0] channels of the sum
    # multiplied by sigmoid of the remaining channels.
    n_channels_int = n_channels[0]
    in_act = input_a + input_b

    return in_act[:, :n_channels_int, :].tanh() * in_act[:, n_channels_int:, :].sigmoid()


def sequence_mask(length, max_length=None):
    """Return a boolean mask of shape ``(len(length), max_length)``.

    Entry ``[i, j]`` is True when ``j < length[i]``. ``max_length`` defaults
    to ``length.max()``.
    """
    if max_length is None:
        max_length = length.max()
    return torch.arange(max_length, dtype=length.dtype, device=length.device).unsqueeze(0) < length.unsqueeze(1)


def clip_grad_value(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise and return their total norm before clamping.

    Args:
        parameters: a tensor or an iterable of tensors; entries whose
            ``grad`` is ``None`` are ignored.
        clip_value: gradients are clamped in place to
            ``[-clip_value, clip_value]``; ``None`` disables clamping.
        norm_type: order of the norm that is accumulated.

    Returns:
        The ``norm_type``-norm over all gradients, measured before clamping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        if p.grad is None:
            continue

        total_norm += (p.grad.data.norm(norm_type)).item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)

    return total_norm ** (1.0 / norm_type)
class DiscriminatorS(torch.nn.Module):
    """1-D convolutional discriminator.

    ``forward`` returns the flattened output of the final convolution plus
    the list of every layer's activation (the final layer included). While
    training with ``checkpointing`` enabled, each convolution and its
    activation run through ``torch.utils.checkpoint.checkpoint``.
    """

    def __init__(self, use_spectral_norm=False, checkpointing=False):
        super().__init__()
        self.checkpointing = checkpointing
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = torch.nn.ModuleList([
            norm_f(torch.nn.Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=padding))
            for c_in, c_out, kernel, stride, groups, padding in layer_specs
        ])
        self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        use_checkpoint = self.training and self.checkpointing
        fmap = []

        for conv in self.convs:
            if use_checkpoint:
                x = checkpoint(self.lrelu, checkpoint(conv, x, use_reentrant = False), use_reentrant = False)
            else:
                x = self.lrelu(conv(x))
            fmap.append(x)

        x = self.conv_post(x)
        fmap.append(x)

        return x.flatten(1, -1), fmap


class DiscriminatorP(torch.nn.Module):
    """2-D convolutional discriminator over a signal folded by ``period``.

    The last axis of the 3-D input is reflect-padded up to a multiple of
    ``period`` and viewed as ``(batch, channels, -1, period)`` before the
    convolutions. ``forward`` returns the flattened final output plus the
    list of every layer's activation.
    """

    def __init__(self, period, kernel_size=5, use_spectral_norm=False, checkpointing=False):
        super().__init__()
        self.period = period
        self.checkpointing = checkpointing
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        in_channels = [1, 32, 128, 512, 1024]
        out_channels = [32, 128, 512, 1024, 1024]
        strides = [3, 3, 3, 3, 1]
        padding = (get_padding(kernel_size, 1), 0)

        layers = []
        for c_in, c_out, stride in zip(in_channels, out_channels, strides):
            layers.append(norm_f(torch.nn.Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=padding)))

        self.convs = torch.nn.ModuleList(layers)
        self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        fmap = []
        b, c, t = x.shape

        remainder = t % self.period
        if remainder != 0:
            x = F.pad(x, (0, (self.period - remainder)), "reflect")
        x = x.view(b, c, -1, self.period)

        use_checkpoint = self.training and self.checkpointing
        for conv in self.convs:
            if use_checkpoint:
                x = checkpoint(self.lrelu, checkpoint(conv, x, use_reentrant = False), use_reentrant = False)
            else:
                x = self.lrelu(conv(x))
            fmap.append(x)

        x = self.conv_post(x)
        fmap.append(x)
        return x.flatten(1, -1), fmap
class TextEncoder(torch.nn.Module):
    """Encode phone features (plus optional pitch and energy) into prior statistics.

    ``forward`` returns ``(m, logs, x_mask)``: the projected encoder output
    split into two halves of ``out_channels`` channels each, and the
    sequence mask built from ``lengths``.
    """

    def __init__(self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, embedding_dim, f0=True, energy=False, onnx=False):
        super().__init__()
        dropout = float(p_dropout)

        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = dropout
        self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
        self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)

        # Pitch and energy embeddings exist only when the matching flag is set.
        if f0:
            self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
        else:
            self.emb_pitch = None

        if energy:
            self.emb_energy = torch.nn.Linear(1, hidden_channels)
        else:
            self.emb_energy = None

        self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, dropout, onnx=onnx)
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths, energy):
        x = self.emb_phone(phone)

        if pitch is not None:
            x += self.emb_pitch(pitch)
        if energy is not None:
            x += self.emb_energy(energy.unsqueeze(-1))

        scaled = self.lrelu(x * math.sqrt(self.hidden_channels))
        x = scaled.transpose(1, -1)
        x_mask = sequence_mask(lengths, x.size(2)).unsqueeze(1).to(x.dtype)

        stats = self.proj(self.encoder(x * x_mask, x_mask)) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)

        return m, logs, x_mask


class PosteriorEncoder(torch.nn.Module):
    """Encode an input sequence into a sampled latent and its statistics.

    ``forward`` returns ``(z, m, logs, x_mask)`` where
    ``z = (m + randn_like(m) * exp(logs)) * x_mask``.
    """

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WaveNet(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g = None):
        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)

        hidden = self.enc((self.pre(x) * x_mask), x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)

        z = (m + torch.randn_like(m) * logs.exp()) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
in range(n_layers)] - paddings = [(kernel_size * d - d) // 2 for d in dilations] - - for i in range(n_layers): - in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilations[i], padding=paddings[i]) - in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - res_skip_channels = (hidden_channels if i == n_layers - 1 else 2 * hidden_channels) - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None): - output = x.clone().zero_() - n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - g_l = (g[:, i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels, :] if g is not None else 0) - res_skip_acts = self.res_skip_layers[i](self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))) - - if i < self.n_layers - 1: - x = (x + (res_skip_acts[:, : self.hidden_channels, :])) * x_mask - output = output + res_skip_acts[:, self.hidden_channels :, :] - else: output = output + res_skip_acts - - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - if hasattr(self.cond_layer, "parametrizations") and "weight" in self.cond_layer.parametrizations: parametrize.remove_parametrizations(self.cond_layer, "weight", leave_parametrized=True) - else: torch.nn.utils.remove_weight_norm(self.cond_layer) - - for l in self.in_layers: - if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) - else: torch.nn.utils.remove_weight_norm(l) - - for l in self.res_skip_layers: - if hasattr(l, "parametrizations") and "weight" in l.parametrizations: 
class LayerNorm(torch.nn.Module):
    """Layer normalisation applied over axis 1 of the input.

    The input is transposed so axis 1 becomes the last axis, normalised with
    the learnable ``gamma``/``beta``, and transposed back. With ``onnx`` set
    the normalised shape comes from the stored ``channels`` instead of the
    runtime tensor size.
    """

    def __init__(self, channels, eps=1e-5, onnx=False):
        super().__init__()
        self.channels = channels
        self.eps = eps
        self.onnx = onnx
        self.gamma = torch.nn.Parameter(torch.ones(channels))
        self.beta = torch.nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)

        if self.onnx:
            normalized_shape = (self.channels,)
        else:
            normalized_shape = (x.size(-1),)

        normed = F.layer_norm(x, normalized_shape, self.gamma, self.beta, self.eps)
        return normed.transpose(1, -1)
dilations): - super(ResBlockBase, self).__init__() - - self.convs1 = torch.nn.ModuleList([create_conv1d_layer(channels, kernel_size, d) for d in dilations]) - self.convs1.apply(init_weights) - - self.convs2 = torch.nn.ModuleList([create_conv1d_layer(channels, kernel_size, 1) for _ in dilations]) - self.convs2.apply(init_weights) - - def forward(self, x, x_mask=None): - for c1, c2 in zip(self.convs1, self.convs2): - x = c2(apply_mask(torch.nn.functional.leaky_relu(c1(apply_mask(torch.nn.functional.leaky_relu(x, LRELU_SLOPE), x_mask)), LRELU_SLOPE), x_mask)) + x - - return apply_mask(x, x_mask) - - def remove_weight_norm(self): - for conv in self.convs1 + self.convs2: - if hasattr(conv, "parametrizations") and "weight" in conv.parametrizations: parametrize.remove_parametrizations(conv, "weight", leave_parametrized=True) - else: remove_weight_norm(conv) - -class ResBlock(ResBlockBase): - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock, self).__init__(channels, kernel_size, dilation) - -class Log(torch.nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = x.clamp_min(1e-5).log() * x_mask - return y, (-y).sum(dim=[1, 2]) - else: return x.exp() * x_mask - -class Flip(torch.nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - - if not reverse: return x, torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - else: return x - -class ElementwiseAffine(torch.nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = torch.nn.Parameter(torch.zeros(channels, 1)) - self.logs = torch.nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: return ((self.m + self.logs.exp() * x) * x_mask), (self.logs * x_mask).sum(dim=[1, 2]) - else: return (x - self.m) * (-self.logs).exp() * x_mask - -class ResidualCouplingBlock(torch.nn.Module): - def __init__(self, channels, 
hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0): - super(ResidualCouplingBlock, self).__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - self.flows = torch.nn.ModuleList() - - for _ in range(n_flows): - self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) - self.flows.append(Flip()) - - def forward(self, x, x_mask, g = None, reverse = False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow.forward(x, x_mask, g=g, reverse=reverse) - - return x - - def remove_weight_norm(self): - for i in range(self.n_flows): - self.flows[i * 2].remove_weight_norm() - -class ResidualCouplingLayer(torch.nn.Module): - def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False): - assert channels % 2 == 0, "Channels/2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WaveNet(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) - self.post = torch.nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = x.split([self.half_channels] * 2, 1) - stats = self.post(self.enc((self.pre(x0) * x_mask), x_mask, g=g)) * x_mask - - if not self.mean_only: 
m, logs = stats.split([self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: return torch.cat([x0, (m + x1 * logs.exp() * x_mask)], 1), logs.sum(dim=[1, 2]) - else: return torch.cat([x0, ((x1 - m) * (-logs).exp() * x_mask)], 1) - - def remove_weight_norm(self): - self.enc.remove_weight_norm() \ No newline at end of file diff --git a/main/library/algorithm/stftpitchshift.py b/main/library/algorithm/stftpitchshift.py deleted file mode 100644 index ac8eb986c1bb21ed067ee75282082b2927420222..0000000000000000000000000000000000000000 --- a/main/library/algorithm/stftpitchshift.py +++ /dev/null @@ -1,250 +0,0 @@ -import numpy as np - -from numpy.lib.stride_tricks import sliding_window_view - -def istft(frames, framesize, hopsize): - frames = np.atleast_2d(frames) - assert frames.ndim == 2 - - analysis_window_size = np.ravel(framesize)[0] - synthesis_window_size = np.ravel(framesize)[-1] - - assert analysis_window_size >= synthesis_window_size - - A = asymmetric_analysis_window(analysis_window_size, synthesis_window_size) if analysis_window_size != synthesis_window_size else symmetric_window(analysis_window_size) - S = asymmetric_synthesis_window(analysis_window_size, synthesis_window_size) if analysis_window_size != synthesis_window_size else symmetric_window(synthesis_window_size) - - W = S * hopsize / np.sum(A * S) - N = frames.shape[0] * hopsize + analysis_window_size - - y = np.zeros((N), float) - - frames[:, 0] = 0 - frames[:, -1] = 0 - frames0 = sliding_window_view(y, analysis_window_size, writeable=True)[::hopsize] - frames1 = np.fft.irfft(frames, axis=-1, norm='forward') * W - - for i in range(min(len(frames0), len(frames1))): - frames0[i] += frames1[i] - - return y - -def asymmetric_synthesis_window(analysis_window_size, synthesis_window_size): - n = analysis_window_size - m = synthesis_window_size // 2 - - right = symmetric_window(2 * m) - window = np.zeros(n) - - window[n-m-m:n-m] = np.square(right[:m]) / 
symmetric_window(2 * n - 2 * m)[n-m-m:n-m] - window[-m:] = right[-m:] - - return window - -def asymmetric_analysis_window(analysis_window_size, synthesis_window_size): - n = analysis_window_size - m = synthesis_window_size // 2 - - window = np.zeros(n) - window[:n-m] = symmetric_window(2 * n - 2 * m)[:n-m] - window[-m:] = symmetric_window(2 * m)[-m:] - - return window - -def symmetric_window(symmetric_window_size): - n = symmetric_window_size - window = 0.5 - 0.5 * np.cos(2 * np.pi * np.arange(n) / n) - - return window - -def stft(x, framesize, hopsize): - x = np.atleast_1d(x) - assert x.ndim == 1 - - analysis_window_size = np.ravel(framesize)[0] - synthesis_window_size = np.ravel(framesize)[-1] - - assert analysis_window_size >= synthesis_window_size - - W = asymmetric_analysis_window(analysis_window_size, synthesis_window_size) if analysis_window_size != synthesis_window_size else symmetric_window(analysis_window_size) - - frames0 = sliding_window_view(x, analysis_window_size, writeable=False)[::hopsize] - frames1 = np.fft.rfft(frames0 * W, axis=-1, norm='forward') - - return frames1 - -def normalize(frames, frames0): - for i in range(len(frames)): - a = np.real(frames0[i]) - b = np.real(frames[i]) - a = np.dot(a, a) - b = np.dot(b, b) - - if b == 0: continue - frames[i] = np.real(frames[i]) * np.sqrt(a / b) + 1j * np.imag(frames[i]) - - return frames - -def lowpass(cepstrum, quefrency): - cepstrum[1:quefrency] *= 2 - cepstrum[quefrency+1:] = 0 - - return cepstrum - -def lifter(frames, quefrency): - envelopes = np.zeros(frames.shape) - - for i, frame in enumerate(frames): - with np.errstate(divide='ignore', invalid='ignore'): - spectrum = np.log10(np.real(frame)) - - envelopes[i] = np.power(10, np.real(np.fft.rfft(lowpass(np.fft.irfft(spectrum, norm='forward'), quefrency), norm='forward'))) - - return envelopes - -def resample(x, factor): - if factor == 1: return x.copy() - y = np.zeros(x.shape, dtype=x.dtype) - - n = len(x) - m = int(n * factor) - - i = 
np.arange(min(n, m)) - k = i * (n / m) - - j = np.trunc(k).astype(int) - k = k - j - - ok = (0 <= j) & (j < n - 1) - y[i[ok]] = k[ok] * x[j[ok] + 1] + (1 - k[ok]) * x[j[ok]] - - return y - -def shiftpitch(frames, factors, samplerate): - for i in range(len(frames)): - magnitudes = np.vstack([resample(np.real(frames[i]), factor) for factor in factors]) - frequencies = np.vstack([resample(np.imag(frames[i]), factor) * factor for factor in factors]) - - magnitudes[(frequencies <= 0) | (frequencies >= samplerate / 2)] = 0 - mask = np.argmax(magnitudes, axis=0) - - magnitudes = np.take_along_axis(magnitudes, mask[None,:], axis=0) - frequencies = np.take_along_axis(frequencies, mask[None,:], axis=0) - - frames[i] = magnitudes + 1j * frequencies - - return frames - -def wrap(x): - return (x + np.pi) % (2 * np.pi) - np.pi - -def encode(frames, framesize, hopsize, samplerate): - M, N = frames.shape - analysis_framesize = np.ravel(framesize)[0] - - freqinc = samplerate / analysis_framesize - phaseinc = 2 * np.pi * hopsize / analysis_framesize - - buffer = np.zeros(N) - data = np.zeros((M, N), complex) - - for m, frame in enumerate(frames): - arg = np.angle(frame) - delta = arg - buffer - - buffer = arg - - i = np.arange(N) - data[m] = np.abs(frame) + 1j * ((i + (wrap(delta - i * phaseinc) / phaseinc)) * freqinc) - - return data - -def decode(frames, framesize, hopsize, samplerate): - M, N = frames.shape - analysis_framesize = np.ravel(framesize)[0] - synthesis_framesize = np.ravel(framesize)[-1] - - freqinc = samplerate / analysis_framesize - phaseinc = 2 * np.pi * hopsize / analysis_framesize - timeshift = 2 * np.pi * synthesis_framesize * np.arange(N) / N if synthesis_framesize != analysis_framesize else 0 - - buffer = np.zeros(N) - data = np.zeros((M, N), complex) - - for m, frame in enumerate(frames): - i = np.arange(N) - delta = (i + ((np.imag(frame) - i * freqinc) / freqinc)) * phaseinc - buffer += delta - arg = buffer.copy() - arg -= timeshift - data[m] = 
np.real(frame) * np.exp(1j * arg) - - return data - -class StftPitchShift: - def __init__(self, framesize, hopsize, samplerate): - self.framesize = framesize - self.hopsize = hopsize - self.samplerate = samplerate - - def shiftpitch(self, input, factors = 1, quefrency = 0, distortion = 1, normalization = False): - input = np.atleast_1d(input) - dtype = input.dtype - shape = input.shape - - input = np.squeeze(input) - if input.ndim != 1: raise ValueError('input.ndim != 1') - - if np.issubdtype(dtype, np.integer): - a, b = np.iinfo(dtype).min, np.iinfo(dtype).max - input = ((input.astype(float) - a) / (b - a)) * 2 - 1 - elif not np.issubdtype(dtype, np.floating): raise TypeError('not np.issubdtype(dtype, np.floating)') - - def isnotnormal(x): - return (np.isinf(x)) | (np.isnan(x)) | (abs(x) < np.finfo(x.dtype).tiny) - - framesize = self.framesize - hopsize = self.hopsize - samplerate = self.samplerate - - factors = np.asarray(factors).flatten() - quefrency = int(quefrency * samplerate) - - frames = encode(stft(input, framesize, hopsize), framesize, hopsize, samplerate) - - if normalization: frames0 = frames.copy() - - if quefrency: - envelopes = lifter(frames, quefrency) - mask = isnotnormal(envelopes) - - frames.real /= envelopes - frames.real[mask] = 0 - - if distortion != 1: - envelopes[mask] = 0 - - for i in range(len(envelopes)): - envelopes[i] = resample(envelopes[i], distortion) - - mask = isnotnormal(envelopes) - - frames = shiftpitch(frames, factors, samplerate) - frames.real *= envelopes - frames.real[mask] = 0 - else: frames = shiftpitch(frames, factors, samplerate) - - if normalization: frames = normalize(frames, frames0) - - output = istft(decode(frames, framesize, hopsize, samplerate), framesize, hopsize) - output.resize(shape, refcheck=False) - - if np.issubdtype(dtype, np.integer): - a, b = np.iinfo(dtype).min, np.iinfo(dtype).max - output = (((output + 1) / 2) * (b - a) + a).clip(a, b).astype(dtype) - elif output.dtype != dtype: output = 
output.astype(dtype) - - assert output.dtype == dtype - assert output.shape == shape - - return output \ No newline at end of file diff --git a/main/library/algorithm/synthesizers.py b/main/library/algorithm/synthesizers.py deleted file mode 100644 index 33be6670c4400d35c94961a11a875fa22bd0af30..0000000000000000000000000000000000000000 --- a/main/library/algorithm/synthesizers.py +++ /dev/null @@ -1,96 +0,0 @@ -import os -import sys -import torch - -sys.path.append(os.getcwd()) - -from main.library.algorithm.residuals import ResidualCouplingBlock -from main.library.algorithm.encoders import TextEncoder, PosteriorEncoder -from main.library.algorithm.commons import slice_segments, rand_slice_segments - -class Synthesizer(torch.nn.Module): - def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, use_f0, text_enc_hidden_dim=768, vocoder="Default", checkpointing=False, onnx=False, energy=False, **kwargs): - super(Synthesizer, self).__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.spk_embed_dim = spk_embed_dim - self.use_f0 = use_f0 - self.enc_p = TextEncoder(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout), 
text_enc_hidden_dim, f0=use_f0, energy=energy, onnx=onnx) - - if use_f0: - if vocoder == "RefineGAN": - from main.library.generators.refinegan import RefineGANGenerator - self.dec = RefineGANGenerator(sample_rate=sr, upsample_rates=upsample_rates, num_mels=inter_channels, checkpointing=checkpointing) - elif vocoder in ["MRF-HiFi-GAN", "MRF HiFi-GAN"]: - from main.library.generators.mrf_hifigan import HiFiGANMRFGenerator - self.dec = HiFiGANMRFGenerator(in_channel=inter_channels, upsample_initial_channel=upsample_initial_channel, upsample_rates=upsample_rates, upsample_kernel_sizes=upsample_kernel_sizes, resblock_kernel_sizes=resblock_kernel_sizes, resblock_dilations=resblock_dilation_sizes, gin_channels=gin_channels, sample_rate=sr, harmonic_num=8, checkpointing=checkpointing) - else: - from main.library.generators.nsf_hifigan import HiFiGANNRFGenerator - self.dec = HiFiGANNRFGenerator(inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, checkpointing=checkpointing) - else: - from main.library.generators.hifigan import HiFiGANGenerator - self.dec = HiFiGANGenerator(inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels) - self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels) - - def remove_weight_norm(self): - for module in [self.dec, self.flow, self.enc_q]: - module.remove_weight_norm() - - @torch.jit.ignore - def forward(self, phone, phone_lengths, pitch = None, pitchf = None, y = None, y_lengths = None, ds = None, energy = None): - g = self.emb_g(ds).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, 
phone_lengths, energy) - - if y is not None: - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) - - return (self.dec(z_slice, slice_segments(pitchf, ids_slice, self.segment_size, 2), g=g) if self.use_f0 else self.dec(z_slice, g=g)), ids_slice, x_mask, y_mask, (z, self.flow(z, y_mask, g=g), m_p, logs_p, m_q, logs_q) - else: return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) - - @torch.jit.export - def infer(self, phone, phone_lengths, pitch = None, nsff0 = None, sid = None, energy = None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, energy) - z_p = (m_p + logs_p.exp() * torch.randn_like(m_p) * 0.66666) * x_mask - - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, nsff0, g=g) if self.use_f0 else self.dec(z * x_mask, g=g) - - return o, x_mask, (z, z_p, m_p, logs_p) - -class SynthesizerONNX(Synthesizer): - def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, use_f0, text_enc_hidden_dim=768, vocoder="Default", checkpointing=False, energy=False, **kwargs): - super().__init__(spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, use_f0, text_enc_hidden_dim, vocoder, checkpointing, True, energy) - - def forward(self, phone, phone_lengths, g=None, rnd=None, pitch=None, nsff0=None, energy=None): - g = self.emb_g(g).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, energy) - z_p = (m_p + logs_p.exp() * rnd) * 
x_mask - - z = self.flow(z_p, x_mask, g=g, reverse=True) - - return self.dec( - (z * x_mask)[:, :, :None], - nsff0, - g=g - ) if self.use_f0 else self.dec( - (z * x_mask)[:, :, :None], - g=g - ) \ No newline at end of file diff --git a/main/library/architectures/demucs_separator.py b/main/library/architectures/demucs_separator.py deleted file mode 100644 index ba41520d9b0f1568a4d5525dc821b0524c3014ed..0000000000000000000000000000000000000000 --- a/main/library/architectures/demucs_separator.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -import sys -import yaml -import torch -import warnings - -import numpy as np - -from hashlib import sha256 - -sys.path.append(os.getcwd()) - -from main.app.variables import config -from main.library.utils import clear_gpu_cache -from main.library.uvr5_lib import spec_utils, common_separator -from main.library.uvr5_lib.demucs import hdemucs, states, apply - -warnings.filterwarnings("ignore") -sys.path.insert(0, os.path.join(os.getcwd(), "main", "library", "uvr5_lib")) - -DEMUCS_4_SOURCE_MAPPER = { - common_separator.CommonSeparator.BASS_STEM: 0, - common_separator.CommonSeparator.DRUM_STEM: 1, - common_separator.CommonSeparator.OTHER_STEM: 2, - common_separator.CommonSeparator.VOCAL_STEM: 3 -} - -class DemucsSeparator(common_separator.CommonSeparator): - def __init__(self, common_config, arch_config): - super().__init__(config=common_config) - self.segment_size = arch_config.get("segment_size", "Default") - self.shifts = arch_config.get("shifts", 2) - self.overlap = arch_config.get("overlap", 0.25) - self.segments_enabled = arch_config.get("segments_enabled", True) - self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER - self.audio_file_path = None - self.audio_file_base = None - self.demucs_model_instance = None - if config.configs.get("demucs_cpu_mode", False): self.torch_device = torch.device("cpu") - - def separate(self, audio_file_path): - source = None - inst_source = {} - self.audio_file_path = audio_file_path - 
self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] - mix = self.prepare_mix(self.audio_file_path) - self.demucs_model_instance = hdemucs.HDemucs(sources=["drums", "bass", "other", "vocals"]) - self.demucs_model_instance = get_demucs_model(name=os.path.splitext(os.path.basename(self.model_path))[0], repo=os.path.dirname(self.model_path)) - self.demucs_model_instance = apply.demucs_segments(self.segment_size, self.demucs_model_instance) - self.demucs_model_instance.to(self.torch_device) - self.demucs_model_instance.eval() - source = self.demix_demucs(mix) - del self.demucs_model_instance - clear_gpu_cache() - output_files = [] - - if isinstance(inst_source, np.ndarray): - inst_source[self.demucs_source_map[common_separator.CommonSeparator.VOCAL_STEM]] = spec_utils.reshape_sources(inst_source[self.demucs_source_map[common_separator.CommonSeparator.VOCAL_STEM]], source[self.demucs_source_map[common_separator.CommonSeparator.VOCAL_STEM]]) - source = inst_source - - if isinstance(source, np.ndarray): - source_length = len(source) - - if source_length == 2: - self.demucs_source_map = { - common_separator.CommonSeparator.INST_STEM: 0, - common_separator.CommonSeparator.VOCAL_STEM: 1 - } - elif source_length == 6: - self.demucs_source_map = { - common_separator.CommonSeparator.BASS_STEM: 0, - common_separator.CommonSeparator.DRUM_STEM: 1, - common_separator.CommonSeparator.OTHER_STEM: 2, - common_separator.CommonSeparator.VOCAL_STEM: 3, - common_separator.CommonSeparator.GUITAR_STEM: 4, - common_separator.CommonSeparator.PIANO_STEM: 5 - } - else: self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER - - for stem_name, stem_value in self.demucs_source_map.items(): - if self.output_single_stem is not None: - if stem_name.lower() != self.output_single_stem.lower(): - continue - - stem_path = os.path.join(f"{self.audio_file_base}_({stem_name})_{self.model_name}.{self.output_format.lower()}") - self.final_process(stem_path, source[stem_value].T, stem_name) - 
output_files.append(stem_path) - - return output_files - - def demix_demucs(self, mix): - processed = {} - mix = torch.tensor(mix, dtype=torch.float32) - ref = mix.mean(0) - mix = (mix - ref.mean()) / ref.std() - mix_infer = mix - - with torch.no_grad(): - sources = apply.apply_model(model=self.demucs_model_instance, mix=mix_infer[None], shifts=self.shifts, split=self.segments_enabled, overlap=self.overlap, static_shifts=max(self.shifts, 1), set_progress_bar=None, device=self.torch_device, progress=True)[0] - - sources = (sources * ref.std() + ref.mean()).cpu().numpy() - sources[[0, 1]] = sources[[1, 0]] - - processed[mix] = sources[:, :, 0:None].copy() - return np.concatenate([s[:, :, 0:None] for s in list(processed.values())], axis=-1) - -class LocalRepo: - def __init__(self, root): - self.root = root - self.scan() - - def scan(self): - self._models, self._checksums = {}, {} - for filename in os.listdir(self.root): - filepath = os.path.join(self.root, filename) - if not os.path.isfile(filepath): continue - - if os.path.splitext(filename)[1] == ".th": - stem = os.path.splitext(filename)[0] - - if "-" in stem: - xp_sig, checksum = stem.split("-", 1) - self._checksums[xp_sig] = checksum - else: xp_sig = stem - - if xp_sig in self._models: raise RuntimeError - self._models[xp_sig] = filepath - - def has_model(self, sig): - return sig in self._models - - def get_model(self, sig): - try: - file = self._models[sig] - except KeyError: - raise RuntimeError - - if sig in self._checksums: check_checksum(file, self._checksums[sig]) - return states.load_model(file) - -class BagOnlyRepo: - def __init__(self, root, model_repo): - self.root = root - self.model_repo = model_repo - self.scan() - - def scan(self): - self._bags = {} - for filename in os.listdir(self.root): - filepath = os.path.join(self.root, filename) - - if os.path.isfile(filepath) and os.path.splitext(filename)[1] == ".yaml": - stem = os.path.splitext(filename)[0] - self._bags[stem] = filepath - - def 
get_model(self, name): - try: - yaml_file = self._bags[name] - except KeyError: - raise RuntimeError - - with open(yaml_file, 'r') as f: - bag = yaml.safe_load(f) - - return apply.BagOfModels([self.model_repo.get_model(sig) for sig in bag["models"]], bag.get("weights"), bag.get("segment")) - -def check_checksum(path, checksum): - sha = sha256() - - with open(path, "rb") as file: - while 1: - buf = file.read(2 ** 20) - if not buf: break - sha.update(buf) - - actual_checksum = sha.hexdigest()[:len(checksum)] - if actual_checksum != checksum: raise RuntimeError - -def get_demucs_model(name, repo = None): - model_repo = LocalRepo(repo) - return (model_repo.get_model(name) if model_repo.has_model(name) else BagOnlyRepo(repo, model_repo).get_model(name)).eval() \ No newline at end of file diff --git a/main/library/architectures/mdx_separator.py b/main/library/architectures/mdx_separator.py deleted file mode 100644 index 11b3bdb4cecba46ea58b52613504b4dc3ed13c1b..0000000000000000000000000000000000000000 --- a/main/library/architectures/mdx_separator.py +++ /dev/null @@ -1,240 +0,0 @@ -import os -import sys -import onnx -import torch -import platform -import warnings -import onnx2torch - -import numpy as np -import onnxruntime as ort - -from tqdm import tqdm - -sys.path.append(os.getcwd()) - -from main.library.uvr5_lib import spec_utils -from main.library.uvr5_lib.common_separator import CommonSeparator - -warnings.filterwarnings("ignore") - -class MDXSeparator(CommonSeparator): - def __init__(self, common_config, arch_config): - super().__init__(config=common_config) - self.segment_size = arch_config.get("segment_size") - self.overlap = arch_config.get("overlap") - self.batch_size = arch_config.get("batch_size", 1) - self.hop_length = arch_config.get("hop_length") - self.enable_denoise = arch_config.get("enable_denoise") - self.compensate = self.model_data["compensate"] - self.dim_f = self.model_data["mdx_dim_f_set"] - self.dim_t = 2 ** self.model_data["mdx_dim_t_set"] - 
self.n_fft = self.model_data["mdx_n_fft_scale_set"] - self.config_yaml = self.model_data.get("config_yaml", None) - self.load_model() - self.n_bins = 0 - self.trim = 0 - self.chunk_size = 0 - self.gen_size = 0 - self.stft = None - self.primary_source = None - self.secondary_source = None - self.audio_file_path = None - self.audio_file_base = None - - def load_model(self): - if self.segment_size == self.dim_t: - ort_session_options = ort.SessionOptions() - ort_session_options.log_severity_level = 3 - ort_inference_session = ort.InferenceSession(self.model_path, providers=self.onnx_execution_provider, sess_options=ort_session_options) - self.model_run = lambda spek: ort_inference_session.run(None, {"input": spek.cpu().numpy()})[0] - else: - self.model_run = onnx2torch.convert(onnx.load(self.model_path)) if platform.system() == 'Windows' else onnx2torch.convert(self.model_path) - self.model_run.to(self.torch_device).eval() - - def separate(self, audio_file_path): - self.audio_file_path = audio_file_path - self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] - mix = self.prepare_mix(self.audio_file_path) - mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold) - source = self.demix(mix) - output_files = [] - - if not isinstance(self.primary_source, np.ndarray): - self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold).T - - if not isinstance(self.secondary_source, np.ndarray): - raw_mix = self.demix(mix, is_match_mix=True) - - if self.invert_using_spec: - self.secondary_source = spec_utils.invert_stem(raw_mix, source) - else: - self.secondary_source = mix.T - source.T - - if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): - self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") - self.final_process(self.secondary_stem_output_path, 
self.secondary_source, self.secondary_stem_name) - output_files.append(self.secondary_stem_output_path) - - if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): - self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") - if not isinstance(self.primary_source, np.ndarray): self.primary_source = source.T - - self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) - output_files.append(self.primary_stem_output_path) - - return output_files - - def initialize_model_settings(self): - self.n_bins = self.n_fft // 2 + 1 - self.trim = self.n_fft // 2 - self.chunk_size = self.hop_length * (self.segment_size - 1) - self.gen_size = self.chunk_size - 2 * self.trim - self.stft = STFT(self.n_fft, self.hop_length, self.dim_f, self.torch_device) - - def initialize_mix(self, mix, is_ckpt=False): - if is_ckpt: - pad = self.gen_size + self.trim - (mix.shape[-1] % self.gen_size) - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) - - num_chunks = mixture.shape[-1] // self.gen_size - mix_waves = [mixture[:, i * self.gen_size : i * self.gen_size + self.chunk_size] for i in range(num_chunks)] - else: - mix_waves = [] - n_sample = mix.shape[1] - - pad = self.gen_size - n_sample % self.gen_size - mix_p = np.concatenate((np.zeros((2, self.trim)), mix, np.zeros((2, pad)), np.zeros((2, self.trim))), 1) - - i = 0 - while i < n_sample + pad: - mix_waves.append(np.array(mix_p[:, i : i + self.chunk_size])) - i += self.gen_size - - mix_waves_tensor = torch.tensor(mix_waves, dtype=torch.float32).to(self.torch_device) - return mix_waves_tensor, pad - - def demix(self, mix, is_match_mix=False): - self.initialize_model_settings() - tar_waves_ = [] - - if is_match_mix: - chunk_size = self.hop_length * (self.segment_size - 1) - overlap = 0.02 - else: - chunk_size = 
self.chunk_size - overlap = self.overlap - - gen_size = chunk_size - 2 * self.trim - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, gen_size + self.trim - ((mix.shape[-1]) % gen_size)), dtype="float32")), 1) - step = int((1 - overlap) * chunk_size) - - result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - total = 0 - - for i in tqdm(range(0, mixture.shape[-1], step), ncols=100, unit="f"): - total += 1 - start = i - end = min(i + chunk_size, mixture.shape[-1]) - - chunk_size_actual = end - start - window = None - - if overlap != 0: - window = np.hanning(chunk_size_actual) - window = np.tile(window[None, None, :], (1, 2, 1)) - - mix_part_ = mixture[:, start:end] - - if end != i + chunk_size: - pad_size = (i + chunk_size) - end - mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype="float32")), axis=-1) - - mix_waves = torch.tensor([mix_part_], dtype=torch.float32).to(self.torch_device).split(self.batch_size) - - with torch.no_grad(): - batches_processed = 0 - - for mix_wave in mix_waves: - batches_processed += 1 - tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix) - - if window is not None: - tar_waves[..., :chunk_size_actual] *= window - divider[..., start:end] += window - else: divider[..., start:end] += 1 - - result[..., start:end] += tar_waves[..., : end - start] - - tar_waves = result / divider - tar_waves_.append(tar_waves) - tar_waves = np.concatenate(np.vstack(tar_waves_)[:, :, self.trim : -self.trim], axis=-1)[:, : mix.shape[-1]] - - source = tar_waves[:, 0:None] - - if not is_match_mix: - source *= self.compensate - - return source - - def run_model(self, mix, is_match_mix=False): - spek = self.stft(mix.to(self.torch_device)) - spek[:, :, :3, :] *= 0 - - if is_match_mix: - spec_pred = spek.cpu().numpy() - else: - if self.enable_denoise: - spec_pred_neg = self.model_run(-spek) - spec_pred_pos = self.model_run(spek) 
- spec_pred = (spec_pred_neg * -0.5) + (spec_pred_pos * 0.5) - else: - spec_pred = self.model_run(spek) - - result = self.stft.inverse(torch.tensor(spec_pred).to(self.torch_device)).cpu().detach().numpy() - return result - -class STFT: - def __init__(self, n_fft, hop_length, dim_f, device): - self.n_fft = n_fft - self.hop_length = hop_length - self.dim_f = dim_f - self.device = device - self.hann_window = torch.hann_window(window_length=self.n_fft, periodic=True) - - def __call__(self, input_tensor): - is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] - if is_non_standard_device: input_tensor = input_tensor.cpu() - - batch_dimensions = input_tensor.shape[:-2] - channel_dim, time_dim = input_tensor.shape[-2:] - - permuted_stft_output = torch.stft(input_tensor.reshape([-1, time_dim]), n_fft=self.n_fft, hop_length=self.hop_length, window=self.hann_window.to(input_tensor.device), center=True, return_complex=False).permute([0, 3, 1, 2]) - final_output = permuted_stft_output.reshape([*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]]).reshape([*batch_dimensions, channel_dim * 2, -1, permuted_stft_output.shape[-1]]) - - if is_non_standard_device: final_output = final_output.to(self.device) - return final_output[..., : self.dim_f, :] - - def pad_frequency_dimension(self, input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins): - return torch.cat([input_tensor, torch.zeros([*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim]).to(input_tensor.device)], -2) - - def calculate_inverse_dimensions(self, input_tensor): - channel_dim, freq_dim, time_dim = input_tensor.shape[-3:] - - return input_tensor.shape[:-3], channel_dim, freq_dim, time_dim, self.n_fft // 2 + 1 - - def prepare_for_istft(self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim): - permuted_tensor = padded_tensor.reshape([*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim]).reshape([-1, 2, 
num_freq_bins, time_dim]).permute([0, 2, 3, 1]) - - return permuted_tensor[..., 0] + permuted_tensor[..., 1] * 1.0j - - def inverse(self, input_tensor): - is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] - if is_non_standard_device: input_tensor = input_tensor.cpu() - - batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = self.calculate_inverse_dimensions(input_tensor) - final_output = torch.istft(self.prepare_for_istft(self.pad_frequency_dimension(input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins), batch_dimensions, channel_dim, num_freq_bins, time_dim), n_fft=self.n_fft, hop_length=self.hop_length, window=self.hann_window.to(input_tensor.device), center=True).reshape([*batch_dimensions, 2, -1]) - - if is_non_standard_device: final_output = final_output.to(self.device) - return final_output \ No newline at end of file diff --git a/main/library/architectures/vr_separator.py b/main/library/architectures/vr_separator.py deleted file mode 100644 index 8d9fd6bd7d6efc93fe0dcdc317d771d371bce849..0000000000000000000000000000000000000000 --- a/main/library/architectures/vr_separator.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import sys -import math -import torch -import librosa -import audioread - -import numpy as np - -from tqdm import tqdm - -sys.path.append(os.getcwd()) - -from main.app.variables import configs -from main.library.uvr5_lib import spec_utils -from main.library.uvr5_lib.vr_network import nets -from main.library.uvr5_lib.vr_network import nets_new -from main.library.uvr5_lib.common_separator import CommonSeparator -from main.library.uvr5_lib.vr_network.model_param_init import ModelParameters - -class VRSeparator(CommonSeparator): - def __init__(self, common_config, arch_config): - super().__init__(config=common_config) - self.model_capacity = 32, 128 - self.is_vr_51_model = False - - if "nout" in self.model_data.keys() and "nout_lstm" in self.model_data.keys(): - self.model_capacity = 
self.model_data["nout"], self.model_data["nout_lstm"] - self.is_vr_51_model = True - - self.model_params = ModelParameters(os.path.join(configs["binary_path"], "vr_params.bin"), f"{self.model_data['vr_model_param']}.json") - self.enable_tta = arch_config.get("enable_tta", False) - self.enable_post_process = arch_config.get("enable_post_process", False) - self.post_process_threshold = arch_config.get("post_process_threshold", 0.2) - self.batch_size = arch_config.get("batch_size", 1) - self.window_size = arch_config.get("window_size", 512) - self.high_end_process = arch_config.get("high_end_process", False) - self.input_high_end_h = None - self.input_high_end = None - self.aggression = float(int(arch_config.get("aggression", 5)) / 100) - self.aggressiveness = {"value": self.aggression, "split_bin": self.model_params.param["band"]["1"]["crop_stop"], "aggr_correction": self.model_params.param.get("aggr_correction")} - self.model_samplerate = self.model_params.param["sr"] - self.wav_subtype = "PCM_16" - - def separate(self, audio_file_path, custom_output_names=None): - self.primary_source = None - self.secondary_source = None - self.audio_file_path = audio_file_path - self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] - nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227] - vr_5_1_models = [56817, 218409] - model_size = math.ceil(os.stat(self.model_path).st_size / 1024) - nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x - model_size)) - - if nn_arch_size in vr_5_1_models or self.is_vr_51_model: - self.model_run = nets_new.CascadedNet(self.model_params.param["bins"] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1]) - self.is_vr_51_model = True - else: - self.model_run = nets.determine_model_capacity(self.model_params.param["bins"] * 2, nn_arch_size) - - self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu", weights_only=True)) - 
self.model_run.to(self.torch_device) - - y_spec, v_spec = self.inference_vr(self.loading_mix(), self.torch_device, self.aggressiveness) - y_spec = np.nan_to_num(y_spec, nan=0.0, posinf=0.0, neginf=0.0) - v_spec = np.nan_to_num(v_spec, nan=0.0, posinf=0.0, neginf=0.0) - - output_files = [] - if self.output_single_stem and (self.output_single_stem.lower() != self.primary_stem_name.lower() and self.output_single_stem.lower() != self.secondary_stem_name.lower()): - self.output_single_stem = None - - if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): - if not isinstance(self.primary_source, np.ndarray): - self.primary_source = self.spec_to_wav(y_spec).T - if not self.model_samplerate == 44100: - self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T - - self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") - self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) - output_files.append(self.primary_stem_output_path) - - if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): - if not isinstance(self.secondary_source, np.ndarray): - self.secondary_source = self.spec_to_wav(v_spec).T - if not self.model_samplerate == 44100: - self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T - - self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") - self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) - output_files.append(self.secondary_stem_output_path) - - return output_files - - def loading_mix(self): - X_wave, X_spec_s = {}, {} - bands_n = len(self.model_params.param["band"]) - - audio_file = 
spec_utils.write_array_to_mem(self.audio_file_path, subtype=self.wav_subtype) - is_mp3 = audio_file.endswith(".mp3") if isinstance(audio_file, str) else False - - self.logger.debug(f"loading_mix iteraring through {bands_n} bands") - for d in tqdm(range(bands_n, 0, -1)): - bp = self.model_params.param["band"][str(d)] - - if d == bands_n: - X_wave[d], _ = librosa.load(audio_file, sr=bp["sr"], mono=False, dtype=np.float32, res_type="soxr_vhq") - X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model) - - if not np.any(X_wave[d]) and is_mp3: - X_wave[d] = rerun_mp3(audio_file, bp["sr"]) - - if X_wave[d].ndim == 1: - X_wave[d] = np.asarray([X_wave[d], X_wave[d]]) - else: - X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=self.model_params.param["band"][str(d + 1)]["sr"], target_sr=bp["sr"], res_type="soxr_vhq") - X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model) - - if d == bands_n and self.high_end_process: - self.input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (self.model_params.param["pre_filter_stop"] - self.model_params.param["pre_filter_start"]) - self.input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - self.input_high_end_h : bp["n_fft"] // 2, :] - - X_spec = spec_utils.combine_spectrograms(X_spec_s, self.model_params, is_v51_model=self.is_vr_51_model) - del X_wave, X_spec_s, audio_file - - return X_spec - - def inference_vr(self, X_spec, device, aggressiveness): - def _execute(X_mag_pad, roi_size): - X_dataset = [] - patches = (X_mag_pad.shape[2] - 2 * self.model_run.offset) // roi_size - - for i in tqdm(range(patches)): - start = i * roi_size - X_mag_window = X_mag_pad[:, :, start : start + self.window_size] - X_dataset.append(X_mag_window) - - total_iterations = patches // self.batch_size if not self.enable_tta else (patches // self.batch_size) * 2 - X_dataset = 
np.asarray(X_dataset) - self.model_run.eval() - - with torch.no_grad(): - mask = [] - - for i in tqdm(range(0, patches, self.batch_size)): - X_batch = X_dataset[i : i + self.batch_size] - X_batch = torch.from_numpy(X_batch).to(device) - pred = self.model_run.predict_mask(X_batch) - - if not pred.size()[3] > 0: - raise ValueError - - pred = pred.detach().cpu().numpy() - pred = np.concatenate(pred, axis=2) - mask.append(pred) - - if len(mask) == 0: - raise ValueError - - mask = np.concatenate(mask, axis=2) - - return mask - - def postprocess(mask, X_mag, X_phase): - is_non_accom_stem = False - for stem in CommonSeparator.NON_ACCOM_STEMS: - if stem == self.primary_stem_name: is_non_accom_stem = True - - mask = spec_utils.adjust_aggr(mask, is_non_accom_stem, aggressiveness) - if self.enable_post_process: mask = spec_utils.merge_artifacts(mask, thres=self.post_process_threshold) - - y_spec = mask * X_mag * np.exp(1.0j * X_phase) - v_spec = (1 - mask) * X_mag * np.exp(1.0j * X_phase) - - return y_spec, v_spec - - X_mag, X_phase = spec_utils.preprocess(X_spec) - n_frame = X_mag.shape[2] - pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset) - X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - X_mag_pad /= X_mag_pad.max() - mask = _execute(X_mag_pad, roi_size) - - if self.enable_tta: - pad_l += roi_size // 2 - pad_r += roi_size // 2 - X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - X_mag_pad /= X_mag_pad.max() - mask_tta = _execute(X_mag_pad, roi_size) - mask_tta = mask_tta[:, :, roi_size // 2 :] - mask = (mask[:, :, :n_frame] + mask_tta[:, :, :n_frame]) * 0.5 - else: - mask = mask[:, :, :n_frame] - - y_spec, v_spec = postprocess(mask, X_mag, X_phase) - - return y_spec, v_spec - - def spec_to_wav(self, spec): - if self.high_end_process and isinstance(self.input_high_end, np.ndarray) and self.input_high_end_h: - input_high_end_ = spec_utils.mirroring("mirroring", spec, 
self.input_high_end, self.model_params) - wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, self.input_high_end_h, input_high_end_, is_v51_model=self.is_vr_51_model) - else: - wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, is_v51_model=self.is_vr_51_model) - - return wav - -def rerun_mp3(audio_file, sample_rate=44100): - with audioread.audio_open(audio_file) as f: - track_length = int(f.duration) - - return librosa.load(audio_file, duration=track_length, mono=False, sr=sample_rate)[0] \ No newline at end of file diff --git a/main/library/backends/directml.py b/main/library/backends/directml.py deleted file mode 100644 index 9e4d2ca3f2c5e1d922df43a2864ef0c73c6e58de..0000000000000000000000000000000000000000 --- a/main/library/backends/directml.py +++ /dev/null @@ -1,42 +0,0 @@ -import os -import gc -import sys -import torch -import subprocess - -sys.path.append(os.getcwd()) - -from main.library.embedders import fairseq -from main.library.backends.utils import GRU - -try: - import torch_directml -except: - torch_directml = None - -torch_available = torch_directml != None - -def device_count(): - return torch_directml.device_count() if torch_available else 0 - -def device_name(device_id = 0): - return torch_directml.device_name(device_id) if torch_available else "" - -def is_available(): - return torch_directml.is_available() if torch_available else False - -def empty_cache(): - empty_cache_path = os.path.join("main", "library", "backends", "dml_empty_cache", "empty_cache.exe") - - if torch_available and os.path.exists(empty_cache_path): - subprocess.run([empty_cache_path], capture_output=True, text=True) - gc.collect() - -def forward_dml(ctx, x, scale): - ctx.scale = scale - res = x.clone().detach() - return res - -if torch_available: - torch.nn.GRU = GRU - fairseq.GradMultiply.forward = forward_dml \ No newline at end of file diff --git a/main/library/backends/dml_empty_cache/empty_cache.exe 
b/main/library/backends/dml_empty_cache/empty_cache.exe deleted file mode 100644 index 1d2e2e1b434214fcdbe32e47f2adfd97a5d21687..0000000000000000000000000000000000000000 Binary files a/main/library/backends/dml_empty_cache/empty_cache.exe and /dev/null differ diff --git a/main/library/backends/dml_empty_cache/empty_cache.exp b/main/library/backends/dml_empty_cache/empty_cache.exp deleted file mode 100644 index 10fc0d553fa359ba3991141fd0c39f28ff4fec75..0000000000000000000000000000000000000000 Binary files a/main/library/backends/dml_empty_cache/empty_cache.exp and /dev/null differ diff --git a/main/library/backends/dml_empty_cache/empty_cache.lib b/main/library/backends/dml_empty_cache/empty_cache.lib deleted file mode 100644 index f100ca99732e72a8752437f802cdab08cc91684b..0000000000000000000000000000000000000000 Binary files a/main/library/backends/dml_empty_cache/empty_cache.lib and /dev/null differ diff --git a/main/library/backends/dml_empty_cache/empty_cache.obj b/main/library/backends/dml_empty_cache/empty_cache.obj deleted file mode 100644 index 511aec14b6ea8b7c8ef6bf3fc3a42a3d1aca1585..0000000000000000000000000000000000000000 Binary files a/main/library/backends/dml_empty_cache/empty_cache.obj and /dev/null differ diff --git a/main/library/backends/opencl.py b/main/library/backends/opencl.py deleted file mode 100644 index ca2a29e180cdd5e1c2505e634fd765bcc871d539..0000000000000000000000000000000000000000 --- a/main/library/backends/opencl.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import sys -import torch -import platform -import subprocess - -try: - import pytorch_ocl -except: - pytorch_ocl = None - -sys.path.append(os.getcwd()) - -from main.library.backends.utils import GRU - -torch_available = pytorch_ocl != None -if torch_available: adaptive_orig = torch.nn.AdaptiveAvgPool2d - -def check_amd_gpu(gpu): - for i in ["RX", "AMD", "Vega", "Radeon", "FirePro"]: - return i in gpu - -def get_amd_gpu_windows(): - gpus = "" - - try: - gpus = 
subprocess.check_output("wmic path win32_VideoController get name", shell=True, stderr=subprocess.DEVNULL) - except subprocess.CalledProcessError: - gpus = subprocess.check_output('powershell "Get-CimInstance Win32_VideoController | Select-Object -ExpandProperty Name"', shell=True, stderr=subprocess.DEVNULL) - - return [gpu.strip() for gpu in gpus.decode().split('\n')[1:] if check_amd_gpu(gpu)] - -def get_amd_gpu_linux(): - try: - return [gpu for gpu in subprocess.check_output("lspci | grep VGA", shell=True).decode().split('\n') if check_amd_gpu(gpu)] - except: - return [] - -def get_gpu_list(): - return (get_amd_gpu_windows() if platform.system() == "Windows" else get_amd_gpu_linux()) if torch_available else [] - -def device_count(): - return len(get_gpu_list()) if torch_available else 0 - -def device_name(device_id = 0): - return (get_gpu_list()[device_id] if device_id >= 0 and device_id < device_count() else "") if torch_available else "" - -def is_available(): - return (device_count() > 0) if torch_available else False - -def group_norm(x, num_groups, weight=None, bias=None, eps=1e-5): - N, C = x.shape[:2] - assert C % num_groups == 0 - - shape = (N, num_groups, C // num_groups) + x.shape[2:] - x_reshaped = x.view(shape) - - dims = (2,) + tuple(range(3, x_reshaped.dim())) - mean = x_reshaped.mean(dim=dims, keepdim=True) - var = x_reshaped.var(dim=dims, keepdim=True, unbiased=False) - - x_norm = (x_reshaped - mean) / (var + eps).sqrt() - x_norm = x_norm.view_as(x) - - if weight is not None: - weight = weight.view(1, C, *([1] * (x.dim() - 2))) - x_norm = x_norm * weight - - if bias is not None: - bias = bias.view(1, C, *([1] * (x.dim() - 2))) - x_norm = x_norm + bias - - return x_norm - -def script(f, *_, **__): - f.graph = pytorch_ocl.torch._C.Graph() - return f - -def AdaptiveAvgPool2d(input): - input = input[0] if isinstance(input, tuple) else input - return adaptive_orig(input) - -if torch_available: - torch.nn.GRU = GRU - torch.nn.AdaptiveAvgPool2d = 
AdaptiveAvgPool2d - torch.nn.functional.group_norm = group_norm - torch.jit.script = script \ No newline at end of file diff --git a/main/library/backends/utils.py b/main/library/backends/utils.py deleted file mode 100644 index eb8ca822625d212922188fd155669a7f3c997ff9..0000000000000000000000000000000000000000 --- a/main/library/backends/utils.py +++ /dev/null @@ -1,149 +0,0 @@ -import torch - -import numpy as np -import torch.nn.functional as F - -from librosa.util import pad_center -from scipy.signal import get_window - -class STFT(torch.nn.Module): - def __init__(self, filter_length=1024, hop_length=512, win_length=None, window="hann", pad_mode="reflect"): - super(STFT, self).__init__() - self.filter_length = filter_length - self.hop_length = hop_length - self.pad_amount = int(self.filter_length / 2) - self.cutoff = int(self.filter_length / 2 + 1) - self.win_length = win_length - self.pad_mode = pad_mode - self.hann_window = {} - - fourier_basis = np.fft.fft(np.eye(self.filter_length)) - fourier_basis = np.vstack([np.real(fourier_basis[:self.cutoff, :]), np.imag(fourier_basis[:self.cutoff, :])]) - forward_basis = torch.FloatTensor(fourier_basis) - inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis)) - - if win_length is None or not win_length: win_length = filter_length - assert filter_length >= win_length - - fft_window = torch.from_numpy(pad_center(get_window(window, win_length, fftbins=True), size=filter_length)).float() - forward_basis *= fft_window - inverse_basis = (inverse_basis.T * fft_window).T - - self.register_buffer("forward_basis", forward_basis.float()) - self.register_buffer("inverse_basis", inverse_basis.float()) - self.register_buffer("fft_window", fft_window.float()) - - def transform(self, input_data, eps=1e-9, return_phase=False, center=True): - if center: input_data = F.pad(input_data, (self.pad_amount, self.pad_amount), mode=self.pad_mode) - forward_transform = (self.forward_basis @ input_data.unfold(1, self.filter_length, 
self.hop_length).permute(0, 2, 1)) - - real_part = forward_transform[:, :self.cutoff, :] - imag_part = forward_transform[:, self.cutoff:, :] - magnitude = (real_part**2 + imag_part**2 + eps).sqrt() - - if return_phase: - phase = imag_part.data.atan2(real_part.data) - return magnitude, phase - - return magnitude - - def inverse(self, magnitude, phase): - cat = torch.cat([magnitude * phase.cos(), magnitude * phase.sin()], dim=1) - fold = torch.nn.Fold(output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length), kernel_size=(1, self.filter_length), stride=(1, self.hop_length)) - - inverse_transform = fold(self.inverse_basis @ cat)[:, 0, 0, self.pad_amount : -self.pad_amount] - window_square_sum = fold(self.fft_window.cpu().pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0))[:, 0, 0, self.pad_amount : -self.pad_amount].to(cat.device) if str(cat.device).startswith("ocl") else fold(self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0))[:, 0, 0, self.pad_amount : -self.pad_amount] - - return inverse_transform / window_square_sum - -class GRU(torch.nn.RNNBase): - def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=True, dropout=0.0, bidirectional=False, device=None, dtype=None): - super().__init__("GRU", input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional, device=device, dtype=dtype) - - @staticmethod - def _gru_cell(x, hx, weight_ih, bias_ih, weight_hh, bias_hh): - gate_x = F.linear(x, weight_ih, bias_ih) - gate_h = F.linear(hx, weight_hh, bias_hh) - - i_r, i_i, i_n = gate_x.chunk(3, 1) - h_r, h_i, h_n = gate_h.chunk(3, 1) - - resetgate = (i_r + h_r).sigmoid() - inputgate = (i_i + h_i).sigmoid() - newgate = (i_n + resetgate * h_n).tanh() - - hy = newgate + inputgate * (hx - newgate) - return hy - - def _gru_layer(self, x, hx, weights): - weight_ih, weight_hh, bias_ih, bias_hh = weights - outputs = [] - - for x_t in x.unbind(1): 
- hx = self._gru_cell(x_t, hx, weight_ih, bias_ih, weight_hh, bias_hh) - outputs.append(hx) - - return torch.stack(outputs, dim=1), hx - - def _gru(self, x, hx): - if not self.batch_first: x = x.permute(1, 0, 2) - num_directions = 2 if self.bidirectional else 1 - - h_n = [] - output_fwd, output_bwd = x, x - - for layer in range(self.num_layers): - fwd_idx = layer * num_directions - bwd_idx = fwd_idx + 1 if self.bidirectional else None - - weights_fwd = self._get_weights(fwd_idx) - h_fwd = hx[fwd_idx] - - out_fwd, h_out_fwd = self._gru_layer(output_fwd, h_fwd, weights_fwd) - h_n.append(h_out_fwd) - - if self.bidirectional: - weights_bwd = self._get_weights(bwd_idx) - h_bwd = hx[bwd_idx] - - reversed_input = torch.flip(output_bwd, dims=[1]) - out_bwd, h_out_bwd = self._gru_layer(reversed_input, h_bwd, weights_bwd) - - out_bwd = torch.flip(out_bwd, dims=[1]) - h_n.append(h_out_bwd) - - output_fwd = torch.cat([out_fwd, out_bwd], dim=2) - output_bwd = output_fwd - else: output_fwd = out_fwd - - if layer < self.num_layers - 1 and self.dropout > 0: - output_fwd = F.dropout(output_fwd, p=self.dropout, training=self.training) - if self.bidirectional: output_bwd = output_fwd - - output = output_fwd - h_n = torch.stack(h_n, dim=0) - - if not self.batch_first: output = output.permute(1, 0, 2) - return output, h_n - - def _get_weights(self, layer_idx): - weights = self._all_weights[layer_idx] - - weight_ih = getattr(self, weights[0]) - weight_hh = getattr(self, weights[1]) - - bias_ih = getattr(self, weights[2]) if self.bias else None - bias_hh = getattr(self, weights[3]) if self.bias else None - - return weight_ih, weight_hh, bias_ih, bias_hh - - def forward(self, input, hx=None): - if input.dim() != 3: raise ValueError - - batch_size = input.size(0) if self.batch_first else input.size(1) - num_directions = 2 if self.bidirectional else 1 - - if hx is None: hx = torch.zeros(self.num_layers * num_directions, batch_size, self.hidden_size, dtype=input.dtype, device=input.device) - 
- self.check_forward_args(input, hx, batch_sizes=None) - return self._gru(input, hx) \ No newline at end of file diff --git a/main/library/backends/zluda.py b/main/library/backends/zluda.py deleted file mode 100644 index c8c474072decebca1c222ff9ab9fca83822fa9e1..0000000000000000000000000000000000000000 --- a/main/library/backends/zluda.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch - -if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): - class STFT: - def __init__(self): - self.device = "cuda" - self.fourier_bases = {} - - def _get_fourier_basis(self, n_fft): - if n_fft in self.fourier_bases: - return self.fourier_bases[n_fft] - - fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to( - self.device - ) - - cutoff = n_fft // 2 + 1 - fourier_basis = torch.cat( - [fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]], dim=0 - ) - - self.fourier_bases[n_fft] = fourier_basis - return fourier_basis - - def transform(self, input, n_fft, hop_length, window): - fourier_basis = self._get_fourier_basis(n_fft) - fourier_basis = fourier_basis * window - - pad_amount = n_fft // 2 - input = torch.nn.functional.pad( - input, (pad_amount, pad_amount), mode="reflect" - ) - - input_frames = input.unfold(1, n_fft, hop_length).permute(0, 2, 1) - fourier_transform = fourier_basis @ input_frames - cutoff = n_fft // 2 + 1 - - return torch.complex( - fourier_transform[:, :cutoff, :], fourier_transform[:, cutoff:, :] - ) - - stft = STFT() - _torch_stft = torch.stft - - def z_stft(input, window, *args, **kwargs): - if ( - kwargs.get("win_length") == None - and kwargs.get("center") == None - and kwargs.get("return_complex") == True - ): - return stft.transform( - input, kwargs.get("n_fft"), kwargs.get("hop_length"), window - ) - else: - return _torch_stft( - input=input.cpu(), window=window.cpu(), *args, **kwargs - ).to(input.device) - - def z_jit(f, *_, **__): - f.graph = torch._C.Graph() - return f - - torch.stft = z_stft - torch.jit.script = 
z_jit - torch.backends.cudnn.enabled = False - torch.backends.cuda.enable_flash_sdp(False) - torch.backends.cuda.enable_math_sdp(True) - torch.backends.cuda.enable_mem_efficient_sdp(False) \ No newline at end of file diff --git a/main/library/embedders/fairseq.py b/main/library/embedders/fairseq.py deleted file mode 100644 index 89818491fc1b6ffad0640c40fa9e615fc15be8fa..0000000000000000000000000000000000000000 --- a/main/library/embedders/fairseq.py +++ /dev/null @@ -1,1454 +0,0 @@ -import re -import sys -import math -import uuid -import torch -import types -import contextlib - -import numpy as np -import torch.nn.functional as F - -from torch import nn -from omegaconf import DictConfig, open_dict - -class Dictionary: - def __init__(self, *args, **kwargs): - pass - -fairseq = types.ModuleType("fairseq") -fairseq_data = types.ModuleType("fairseq.data") -fairseq_data_dictionary = types.ModuleType("fairseq.data.dictionary") -fairseq_data_dictionary.Dictionary = Dictionary -fairseq.data = fairseq_data -fairseq_data.dictionary = fairseq_data_dictionary -sys.modules["fairseq"] = fairseq -sys.modules["fairseq.data"] = fairseq_data -sys.modules["fairseq.data.dictionary"] = fairseq_data_dictionary - -def load_model(filename): - state = torch.load(filename, map_location="cpu", weights_only=False) - - model = HubertModel(HubertConfig(**state['cfg']['model']), num_classes=int(state['model']['label_embs_concat'].shape[0])) - model.load_state_dict(state['model'], strict=False) - - return model - -def softmax(x, dim, onnx_trace = False): - return F.softmax(x.float(), dim=dim) if onnx_trace else F.softmax(x, dim=dim, dtype=torch.float32) - -def log_softmax(x, dim, onnx_trace = False): - return F.log_softmax(x.float(), dim=dim) if onnx_trace else F.log_softmax(x, dim=dim, dtype=torch.float32) - -def eval_str_dict(x, type=dict): - if x is None: return None - if isinstance(x, str): x = eval(x) - return x - -def with_incremental_state(cls): - cls.__bases__ = (FairseqIncrementalState,) 
+ tuple(b for b in cls.__bases__ if b != FairseqIncrementalState) - return cls - -def quant_noise(module, p, block_size): - if p <= 0: return module - assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) - is_conv = module.weight.ndim == 4 - if not is_conv: assert (module.weight.size(1) % block_size == 0) - else: - if module.kernel_size == (1, 1): assert (module.in_channels % block_size == 0) - else: - k = module.kernel_size[0] * module.kernel_size[1] - assert k % block_size == 0 - - def _forward_pre_hook(mod, input): - if mod.training: - if not is_conv: - weight = mod.weight - in_features = weight.size(1) - out_features = weight.size(0) - mask = torch.zeros(in_features // block_size * out_features, device=weight.device) - mask.bernoulli_(p) - mask = mask.repeat_interleave(block_size, -1).view(-1, in_features) - else: - weight = mod.weight - in_channels = mod.in_channels - out_channels = mod.out_channels - - if mod.kernel_size == (1, 1): - mask = torch.zeros(int(in_channels // block_size * out_channels), device=weight.device) - mask.bernoulli_(p) - mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels) - else: - mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device) - mask.bernoulli_(p) - mask = (mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])) - - mask = mask.to(torch.bool) - s = 1 / (1 - p) - mod.weight.data = s * weight.masked_fill(mask, 0) - - module.register_forward_pre_hook(_forward_pre_hook) - return module - -class FairseqDropout(nn.Module): - def __init__(self, p, module_name=None): - super().__init__() - self.p = p - self.module_name = module_name - self.apply_during_inference = False - - def forward(self, x, inplace = False): - return F.dropout(x, p=self.p, training=True, inplace=inplace) if self.p > 0 and (self.training or self.apply_during_inference) else x - - def make_generation_fast_(self, name, retain_dropout = False, retain_dropout_modules = None, **kwargs): - if 
retain_dropout: - if (retain_dropout_modules is None or self.module_name in retain_dropout_modules): self.apply_during_inference = True - -class FairseqIncrementalState(object): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.init_incremental_state() - - def init_incremental_state(self): - self._incremental_state_id = str(uuid.uuid4()) - - def _get_full_incremental_state_key(self, key): - return "{}.{}".format(self._incremental_state_id, key) - - def get_incremental_state(self, incremental_state, key): - full_key = self._get_full_incremental_state_key(key) - if incremental_state is None or full_key not in incremental_state: return None - return incremental_state[full_key] - - def set_incremental_state(self, incremental_state, key, value): - if incremental_state is not None: incremental_state[self._get_full_incremental_state_key(key)] = value - return incremental_state - -class FairseqDecoder(nn.Module): - def __init__(self, dictionary): - super().__init__() - self.dictionary = dictionary - self.onnx_trace = False - self.adaptive_softmax = None - - def forward(self, prev_output_tokens, encoder_out=None, **kwargs): - x, extra = self.extract_features(prev_output_tokens, encoder_out=encoder_out, **kwargs) - return self.output_layer(x), extra - - def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs): - pass - - def output_layer(self, features, **kwargs): - pass - - def get_normalized_probs(self, net_output, log_probs, sample = None): - return self.get_normalized_probs_scriptable(net_output, log_probs, sample) - - def get_normalized_probs_scriptable(self, net_output, log_probs, sample = None): - if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None: - if sample is not None: - assert "target" in sample - target = sample["target"] - else: target = None - out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) - return out.exp_() if not log_probs else out - - logits = net_output[0] - 
return log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) if log_probs else softmax(logits, dim=-1, onnx_trace=self.onnx_trace) - - def max_positions(self): - return 1e6 - - def upgrade_state_dict_named(self, state_dict, name): - return state_dict - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - -@with_incremental_state -class FairseqIncrementalDecoder(FairseqDecoder): - def __init__(self, dictionary): - super().__init__(dictionary) - - def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs): - pass - - def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs): - pass - - def reorder_incremental_state(self, incremental_state, new_order): - pass - - def reorder_incremental_state_scripting(self, incremental_state, new_order): - for module in self.modules(): - if hasattr(module, "reorder_incremental_state"): - result = module.reorder_incremental_state(incremental_state, new_order) - if result is not None: incremental_state = result - - def set_beam_size(self, beam_size): - if getattr(self, "_beam_size", -1) != beam_size: - seen = set() - - def apply_set_beam_size(module): - if (module != self and hasattr(module, "set_beam_size") and module not in seen): - seen.add(module) - module.set_beam_size(beam_size) - - self.apply(apply_set_beam_size) - self._beam_size = beam_size - -class MultiheadAttention(FairseqIncrementalDecoder): - def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, dictionary=None, q_noise=0.0, qn_block_size=8, xformers_att_config=None, xformers_blocksparse_layout=None, xformers_blocksparse_blocksize=16): - super().__init__(dictionary) - xformers_att_config = eval_str_dict(xformers_att_config) - self.use_xformers = xformers_att_config is not None - if self.use_xformers: raise ImportError - self.embed_dim = embed_dim - self.kdim = 
kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim - self.num_heads = num_heads - self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__) - self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == self.embed_dim) - self.scaling = self.head_dim**-0.5 - self.self_attention = self_attention - self.encoder_decoder_attention = encoder_decoder_attention - assert not self.self_attention or self.qkv_same_dim - self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size) - self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size) - self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size) - self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size) - if add_bias_kv: self.bias_k, self.bias_v = nn.Parameter(torch.Tensor(1, 1, embed_dim)), nn.Parameter(torch.Tensor(1, 1, embed_dim)) - else: self.bias_k = self.bias_v = None - self.add_zero_attn = add_zero_attn - self.beam_size = 1 - self.reset_parameters() - self.onnx_trace = False - self.skip_embed_dim_check = False - self.init_incremental_state() - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - def reset_parameters(self): - if self.qkv_same_dim: - nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) - nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) - nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) - else: - nn.init.xavier_uniform_(self.k_proj.weight) - nn.init.xavier_uniform_(self.v_proj.weight) - nn.init.xavier_uniform_(self.q_proj.weight) - - nn.init.xavier_uniform_(self.out_proj.weight) - if self.out_proj.bias is not None: nn.init.constant_(self.out_proj.bias, 0.0) - if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k) - if self.bias_v is not 
None: nn.init.xavier_normal_(self.bias_v) - - def _get_reserve_head_index(self, num_heads_to_keep: int): - k_proj_heads_norm, q_proj_heads_norm, v_proj_heads_norm = [], [], [] - for i in range(self.num_heads): - start_idx = i * self.head_dim - end_idx = (i + 1) * self.head_dim - k_proj_heads_norm.append((self.k_proj.weight[start_idx:end_idx]).abs().sum().tolist() + (self.k_proj.bias[start_idx:end_idx]).abs().sum().tolist()) - q_proj_heads_norm.append((self.q_proj.weight[start_idx:end_idx]).abs().sum().tolist() + (self.q_proj.bias[start_idx:end_idx]).abs().sum().tolist()) - v_proj_heads_norm.append((self.v_proj.weight[start_idx:end_idx]).abs().sum().tolist() + (self.v_proj.bias[start_idx:end_idx]).abs().sum().tolist()) - - heads_norm = [] - for i in range(self.num_heads): - heads_norm.append(k_proj_heads_norm[i] + q_proj_heads_norm[i] + v_proj_heads_norm[i]) - - sorted_head_index = sorted(range(self.num_heads), key=lambda k: heads_norm[k], reverse=True) - reserve_head_index = [] - for i in range(num_heads_to_keep): - reserve_head_index.append((sorted_head_index[i] * self.head_dim, (sorted_head_index[i] + 1) * self.head_dim)) - return reserve_head_index - - def _adaptive_prune_heads(self, reserve_head_index): - new_q_weight, new_q_bias, new_k_weight, new_k_bias, new_v_weight, new_v_bias, new_out_proj_weight = [], [], [], [], [], [], [] - for ele in reserve_head_index: - start_idx, end_idx = ele - new_q_weight.append(self.q_proj.weight[start_idx:end_idx]) - new_q_bias.append(self.q_proj.bias[start_idx:end_idx]) - new_k_weight.append(self.k_proj.weight[start_idx:end_idx]) - new_k_bias.append(self.k_proj.bias[start_idx:end_idx]) - new_v_weight.append(self.v_proj.weight[start_idx:end_idx]) - new_v_bias.append(self.v_proj.bias[start_idx:end_idx]) - new_out_proj_weight.append(self.out_proj.weight[:, start_idx:end_idx]) - new_q_weight = torch.cat(new_q_weight).detach() - new_k_weight = torch.cat(new_k_weight).detach() - new_v_weight = torch.cat(new_v_weight).detach() - 
new_out_proj_weight = torch.cat(new_out_proj_weight, dim=-1).detach() - new_q_weight.requires_grad = True - new_k_weight.requires_grad = True - new_v_weight.requires_grad = True - new_out_proj_weight.requires_grad = True - new_q_bias = torch.cat(new_q_bias).detach() - new_q_bias.requires_grad = True - new_k_bias = torch.cat(new_k_bias).detach() - new_k_bias.requires_grad = True - new_v_bias = torch.cat(new_v_bias).detach() - new_v_bias.requires_grad = True - self.q_proj.weight = nn.Parameter(new_q_weight) - self.q_proj.bias = nn.Parameter(new_q_bias) - self.k_proj.weight = nn.Parameter(new_k_weight) - self.k_proj.bias = nn.Parameter(new_k_bias) - self.v_proj.weight = nn.Parameter(new_v_weight) - self.v_proj.bias = nn.Parameter(new_v_bias) - self.out_proj.weight = nn.Parameter(new_out_proj_weight) - self.num_heads = len(reserve_head_index) - self.embed_dim = self.head_dim * self.num_heads - self.q_proj.out_features = self.embed_dim - self.k_proj.out_features = self.embed_dim - self.v_proj.out_features = self.embed_dim - - def _set_skip_embed_dim_check(self): - self.skip_embed_dim_check = True - - def _pad_masks(self, key_padding_mask, attn_mask): - if attn_mask is not None: - shape = attn_mask.size()[:-1] + torch.Size([1]) - attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(shape)], dim=-1) - - if key_padding_mask is not None: - shape = key_padding_mask.size()[:-1] + torch.Size([1]) - key_padding_mask = torch.cat([key_padding_mask, key_padding_mask.new_zeros(shape)], dim=-1) - - return key_padding_mask, attn_mask - - def _add_bias(self, k, v, key_padding_mask, attn_mask, bsz): - assert self.bias_k is not None or self.bias_v is not None - key_padding_mask, attn_mask = self._pad_masks(key_padding_mask=key_padding_mask, attn_mask=attn_mask) - return torch.cat([k, self.bias_k.repeat(1, bsz, 1)]), torch.cat([v, self.bias_v.repeat(1, bsz, 1)]), key_padding_mask, attn_mask - - def _append_zero_attn(self, k, v, key_padding_mask, attn_mask): - zero_attn_shape = 
k.size()[:-2] + torch.Size([1]) + k.size()[-1:] - key_padding_mask, attn_mask = self._pad_masks(key_padding_mask=key_padding_mask, attn_mask=attn_mask) - return torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=-2), torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=-2), key_padding_mask, attn_mask - - def forward(self, query, key, value, key_padding_mask = None, incremental_state = None, need_weights = True, static_kv = False, attn_mask = None, before_softmax = False, need_head_weights = False): - if need_head_weights: need_weights = True - is_tpu = query.device.type == "xla" - tgt_len, bsz, embed_dim = query.size() - src_len = tgt_len - if not self.skip_embed_dim_check: assert (embed_dim == self.embed_dim) - assert list(query.size()) == [tgt_len, bsz, embed_dim] - if key is not None: - src_len, key_bsz, _ = key.size() - if not torch.jit.is_scripting(): - assert value is not None - assert src_len, key_bsz == value.shape[:2] - - if (not self.onnx_trace and not is_tpu and incremental_state is None and not static_kv and not torch.jit.is_scripting() and not self.skip_embed_dim_check): - assert key is not None and value is not None - return F.multi_head_attention_forward(query, key, value, self.embed_dim, self.num_heads, torch.empty([0]), torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), self.bias_k, self.bias_v, self.add_zero_attn, self.dropout_module.p, self.out_proj.weight, self.out_proj.bias, self.training or self.dropout_module.apply_during_inference, key_padding_mask.bool() if key_padding_mask is not None else None, need_weights, attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, v_proj_weight=self.v_proj.weight) - - if incremental_state is not None: - saved_state = self._get_input_buffer(incremental_state) - if saved_state is not None and "prev_key" in saved_state: - if static_kv: - assert self.encoder_decoder_attention and not 
self.self_attention - key = value = None - else: saved_state = None - - if self.self_attention: - q = self.q_proj(query) - k = self.k_proj(query) - v = self.v_proj(query) - elif self.encoder_decoder_attention: - q = self.q_proj(query) - if key is None: - assert value is None - k = v = None - else: - if self.beam_size > 1 and bsz == key.size(1): - key = key.view(key.size(0), -1, self.beam_size, key.size(2))[:, :, 0, :] - if key_padding_mask is not None: key_padding_mask = key_padding_mask.view(-1, self.beam_size, key_padding_mask.size(1))[:, 0, :] - k = self.k_proj(key) - v = self.v_proj(key) - else: - assert key is not None and value is not None - q = self.q_proj(query) - k = self.k_proj(key) - v = self.v_proj(value) - - q *= self.scaling - if self.bias_k is not None: - assert self.bias_v is not None - k, v, attn_mask, key_padding_mask = self._add_bias(k, v, attn_mask, key_padding_mask, bsz) - - q = (q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)) - kv_bsz = bsz - if k is not None: - kv_bsz = k.size(1) - k = (k.contiguous().view(-1, kv_bsz * self.num_heads, self.head_dim).transpose(0, 1)) - - if v is not None: v = (v.contiguous().view(-1, kv_bsz * self.num_heads, self.head_dim).transpose(0, 1)) - if saved_state is not None: - if "prev_key" in saved_state: - _prev_key = saved_state["prev_key"] - assert _prev_key is not None - - kv_bsz = _prev_key.size(0) - prev_key = _prev_key.view(kv_bsz * self.num_heads, -1, self.head_dim) - - if static_kv: k = prev_key - else: - assert k is not None - k = torch.cat([prev_key, k], dim=1) - src_len = k.size(1) - - if "prev_value" in saved_state: - _prev_value = saved_state["prev_value"] - assert _prev_value is not None or kv_bsz == _prev_value.size(0) - prev_value = _prev_value.view(kv_bsz * self.num_heads, -1, self.head_dim) - if static_kv: v = prev_value - else: - assert v is not None - v = torch.cat([prev_value, v], dim=1) - - prev_key_padding_mask = None - if "prev_key_padding_mask" in 
saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] - assert k is not None and v is not None - key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(key_padding_mask=key_padding_mask, prev_key_padding_mask=prev_key_padding_mask, batch_size=kv_bsz, src_len=k.size(1), static_kv=static_kv) - saved_state["prev_key"] = k.view(kv_bsz, self.num_heads, -1, self.head_dim) - saved_state["prev_value"] = v.view(kv_bsz, self.num_heads, -1, self.head_dim) - saved_state["prev_key_padding_mask"] = key_padding_mask - assert incremental_state is not None - incremental_state = self._set_input_buffer(incremental_state, saved_state) - - assert k is not None - assert k.size(1) == src_len - - if key_padding_mask is not None and key_padding_mask.dim() == 0: key_padding_mask = None - - if key_padding_mask is not None: - assert key_padding_mask.size(0) == kv_bsz - assert key_padding_mask.size(1) == src_len - - if self.add_zero_attn: - assert v is not None - src_len += 1 - k, v, key_padding_mask, attn_mask = self._append_zero_attn(k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask) - - if self.encoder_decoder_attention and bsz != kv_bsz: - attn_weights = torch.einsum("bxhtd,bhsd->bxhts", q.view((kv_bsz, -1, self.num_heads) + q.size()[1:]), k.view((kv_bsz, self.num_heads) + k.size()[1:])) - attn_weights = attn_weights.reshape((-1,) + attn_weights.size()[-2:]) - else: attn_weights = q.bmm(k.transpose(1, 2)) - - assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] - - if attn_mask is not None: - attn_mask = attn_mask.unsqueeze(0) - if self.onnx_trace: attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) - attn_weights += attn_mask - - if key_padding_mask is not None: - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(kv_bsz, -1, self.num_heads, tgt_len, src_len).masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2).unsqueeze(3).to(torch.bool), float("-inf")) 
if not is_tpu else attn_weights.transpose(0, 2).masked_fill(key_padding_mask, float("-inf")).transpose(0, 2) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if before_softmax: return attn_weights, v - attn_weights_float = softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace) - attn_weights = attn_weights_float.type_as(attn_weights) - attn_probs = self.dropout_module(attn_weights) - assert v is not None - attn = None - - if self.encoder_decoder_attention and bsz != kv_bsz: - attn = torch.einsum("bxhts,bhsd->bxhtd", attn_probs.view((kv_bsz, -1, self.num_heads) + attn_probs.size()[1:]), v.view((kv_bsz, self.num_heads) + v.size()[1:])) - attn = attn.reshape((-1,) + attn.size()[-2:]) - else: attn = attn_probs.bmm(v) - assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] - - attn = attn.contiguous().view(tgt_len, bsz, self.embed_dim) if self.onnx_trace and attn.size(1) == 1 else attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim) - attn = self.out_proj(attn) - attn_weights = None - - if need_weights: - attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) - if not need_head_weights: attn_weights = attn_weights.mean(dim=0) - - return attn, attn_weights - - @staticmethod - def _append_prev_key_padding_mask(key_padding_mask, prev_key_padding_mask, batch_size, src_len, static_kv): - if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask - elif prev_key_padding_mask is not None and key_padding_mask is not None: new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), key_padding_mask.float()], dim=1) - elif prev_key_padding_mask is not None: - if src_len > prev_key_padding_mask.size(1): - filler = torch.zeros((batch_size, src_len - prev_key_padding_mask.size(1)), device=prev_key_padding_mask.device) - new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), filler.float()], dim=1) - else: 
new_key_padding_mask = prev_key_padding_mask.float() - elif key_padding_mask is not None: - if src_len > key_padding_mask.size(1): - filler = torch.zeros((batch_size, src_len - key_padding_mask.size(1)), device=key_padding_mask.device) - new_key_padding_mask = torch.cat([filler.float(), key_padding_mask.float()], dim=1) - else: new_key_padding_mask = key_padding_mask.float() - else: new_key_padding_mask = prev_key_padding_mask - return new_key_padding_mask - - @torch.jit.export - def reorder_incremental_state(self, incremental_state, new_order): - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - for k in input_buffer.keys(): - input_buffer_k = input_buffer[k] - if input_buffer_k is not None: - if self.encoder_decoder_attention: - if input_buffer_k.size(0) * self.beam_size == new_order.size(0): return incremental_state - elif self.beam_size > 1: input_buffer[k] = input_buffer_k.index_select(0, new_order.reshape(-1, self.beam_size)[:, 0] // self.beam_size) - else: input_buffer[k] = input_buffer_k.index_select(0, new_order) - else: input_buffer[k] = input_buffer_k.index_select(0, new_order) - incremental_state = self._set_input_buffer(incremental_state, input_buffer) - return incremental_state - - def set_beam_size(self, beam_size): - self.beam_size = beam_size - - def _get_input_buffer(self, incremental_state): - result = self.get_incremental_state(incremental_state, "attn_state") - return result if result is not None else {} - - def _set_input_buffer(self, incremental_state, buffer): - return self.set_incremental_state(incremental_state, "attn_state", buffer) - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." 
if name != "" else "" - items_to_add, keys_to_remove = {}, [] - for k in state_dict.keys(): - if k.endswith(prefix + "in_proj_weight"): - dim = int(state_dict[k].shape[0] / 3) - items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] - items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim] - items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :] - keys_to_remove.append(k) - k_bias = prefix + "in_proj_bias" - if k_bias in state_dict.keys(): - dim = int(state_dict[k].shape[0] / 3) - items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim] - items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][dim : 2 * dim] - items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :] - keys_to_remove.append(prefix + "in_proj_bias") - - for k in keys_to_remove: - del state_dict[k] - - for key, value in items_to_add.items(): - state_dict[key] = value - -def init_bert_params(module): - def normal_(data): - data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) - - if isinstance(module, nn.Linear): - normal_(module.weight.data) - if module.bias is not None: module.bias.data.zero_() - if isinstance(module, nn.Embedding): - normal_(module.weight.data) - if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - if isinstance(module, MultiheadAttention): - normal_(module.q_proj.weight.data) - normal_(module.k_proj.weight.data) - normal_(module.v_proj.weight.data) - -def make_conv_pos(e, k, g): - pos_conv = nn.Conv1d(e, e, kernel_size=k, padding=k // 2, groups=g) - dropout = 0 - nn.init.normal_(pos_conv.weight, mean=0, std=math.sqrt((4 * (1.0 - dropout)) / (k * e))) - nn.init.constant_(pos_conv.bias, 0) - return nn.Sequential(nn.utils.parametrizations.weight_norm(pos_conv, name="weight", dim=2), SamePad(k), nn.GELU()) - -def is_xla_tensor(tensor): - return torch.is_tensor(tensor) and tensor.device.type == "xla" - -def index_put(tensor, indices, value): - if is_xla_tensor(tensor): - for _ in 
range(indices.dim(), tensor.dim()): - indices = indices.unsqueeze(-1) - - if indices.size(-1) < tensor.size(-1): indices = indices.expand_as(tensor) - tensor = tensor.mul(~indices).add(value.mul(indices)) - else: tensor[indices] = value - - return tensor - -def pad_to_multiple(x, multiple, dim=-1, value=0): - if x is None: return None, 0 - tsz = x.size(dim) - m = tsz / multiple - remainder = math.ceil(m) * multiple - tsz - if m.is_integer(): return x, 0 - return F.pad(x, (*((0,) * (-1 - dim) * 2), 0, remainder), value=value), remainder - -def compute_mask_indices(shape, padding_mask, mask_prob, mask_length, mask_type = "static", mask_other = 0.0, min_masks = 0, no_overlap = False, min_space = 0, require_same_masks = True, mask_dropout = 0.0, add_masks = False, seed = None, epoch = None, indices = None, idc_select_ver = 1, num_mask_ver = 2): - bsz, all_sz = shape - mask = np.full((bsz, all_sz), False) - if num_mask_ver == 1: all_num_mask = max(min_masks, int(mask_prob * all_sz / float(mask_length) + np.random.rand())) - mask_idcs = [] - - for i in range(bsz): - seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6) if seed is not None and epoch is not None and indices is not None else None - rng = np.random.default_rng(seed_i) - - if padding_mask is not None: - sz = all_sz - padding_mask[i].long().sum().item() - assert sz >= 0, sz - else: sz = all_sz - - if num_mask_ver == 1: num_mask = max(min_masks, int(mask_prob * sz / float(mask_length) + np.random.rand())) if padding_mask is not None else all_num_mask - elif num_mask_ver == 2: num_mask = max(min_masks, int(mask_prob * sz / float(mask_length) + rng.random())) - else: raise ValueError - - if mask_type == "static": lengths = np.full(num_mask, mask_length) - elif mask_type == "uniform": lengths = rng.randint(mask_other, mask_length * 2 + 1, size=num_mask) - elif mask_type == "normal": lengths = [max(1, int(round(x))) for x in rng.normal(mask_length, mask_other, size=num_mask)] - elif mask_type == "poisson": 
lengths = [int(round(x)) for x in rng.poisson(mask_length, size=num_mask)] - else: raise Exception - - if sum(lengths) == 0: - if mask_type == "static": raise ValueError - else: lengths = [min(mask_length, sz - 1)] - - if no_overlap: - mask_idc = [] - - def arrange(s, e, length, keep_length): - span_start = rng.randint(s, e - length) - mask_idc.extend(span_start + i for i in range(length)) - new_parts = [] - if span_start - s - min_space >= keep_length: new_parts.append((s, span_start - min_space + 1)) - if e - span_start - length - min_space > keep_length: new_parts.append((span_start + length + min_space, e)) - return new_parts - - parts = [(0, sz)] - min_length = min(lengths) - for length in sorted(lengths, reverse=True): - lens = np.fromiter((e - s if e - s >= length + min_space else 0 for s, e in parts), np.int32) - l_sum = np.sum(lens) - if l_sum == 0: break - s, e = parts.pop(rng.choice(len(parts), p=lens / np.sum(lens))) - parts.extend(arrange(s, e, length, min_length)) - mask_idc = np.asarray(mask_idc) - else: - if idc_select_ver == 1: - min_len = min(lengths) - if sz - min_len <= num_mask: min_len = sz - num_mask - 1 - mask_idc = rng.choice(sz - min_len, num_mask, replace=False) - elif idc_select_ver == 2: mask_idc = rng.choice(sz, num_mask, replace=False) - else: raise ValueError - - mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])]) - - mask_idc = np.unique(mask_idc[mask_idc < sz]) - if len(mask_idc) >= sz: raise ValueError - mask_idcs.append(mask_idc) - - target_len = None - if require_same_masks: target_len = max([len(m) for m in mask_idcs]) if add_masks else min([len(m) for m in mask_idcs]) - - for i, mask_idc in enumerate(mask_idcs): - if target_len is not None and len(mask_idc) > target_len: mask_idc = rng.choice(mask_idc, target_len, replace=False) - mask[i, mask_idc] = True - - if target_len is not None and len(mask_idc) < target_len: - to_mask = rng.choice(np.flatnonzero(~mask[i]), 
target_len - len(mask_idc), replace=False) - mask[i, to_mask] = True - - if mask_dropout > 0: - masked = np.flatnonzero(mask[i]) - mask[i, rng.choice(masked, np.rint(len(masked) * mask_dropout).astype(int), replace=False)] = False - - return mask - -def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True): - return nn.LayerNorm(normalized_shape, eps, elementwise_affine) - -def prune_state_dict(state_dict, model_cfg): - arch = None - if model_cfg is not None: arch = (model_cfg._name if isinstance(model_cfg, DictConfig) else getattr(model_cfg, "arch", None)) - if not model_cfg or arch is None or arch == "ptt_transformer": return state_dict - encoder_layers_to_keep = getattr(model_cfg, "encoder_layers_to_keep", None) - decoder_layers_to_keep = getattr(model_cfg, "decoder_layers_to_keep", None) - if not encoder_layers_to_keep and not decoder_layers_to_keep: return state_dict - - def create_pruning_pass(layers_to_keep, layer_name): - keep_layers = sorted(int(layer_string) for layer_string in layers_to_keep.split(",")) - mapping_dict = {} - for i in range(len(keep_layers)): - mapping_dict[str(keep_layers[i])] = str(i) - - return {"substitution_regex": re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name)), "mapping_dict": mapping_dict} - - pruning_passes, new_state_dict = [], {} - if encoder_layers_to_keep: pruning_passes.append(create_pruning_pass(encoder_layers_to_keep, "encoder")) - if decoder_layers_to_keep: pruning_passes.append(create_pruning_pass(decoder_layers_to_keep, "decoder")) - - for layer_name in state_dict.keys(): - match = re.search(r"\.layers\.(\d+)\.", layer_name) - if not match: - new_state_dict[layer_name] = state_dict[layer_name] - continue - - original_layer_number = match.group(1) - for pruning_pass in pruning_passes: - if original_layer_number in pruning_pass["mapping_dict"] and pruning_pass["substitution_regex"].search(layer_name): - substitution_match = pruning_pass["substitution_regex"].search(layer_name) - 
new_state_dict[(layer_name[: substitution_match.start(1)] + pruning_pass["mapping_dict"][original_layer_number] + layer_name[substitution_match.end(1) :])] = state_dict[layer_name] - - with open_dict(model_cfg) if isinstance(model_cfg, DictConfig) else contextlib.ExitStack(): - if hasattr(model_cfg, "encoder_layers_to_keep"): model_cfg.encoder_layers_to_keep = None - if hasattr(model_cfg, "decoder_layers_to_keep"): model_cfg.decoder_layers_to_keep = None - - return new_state_dict - -def relu_squared(x): - return F.relu(x).pow(2) - -def get_activation_fn(activation): - def gelu(x): - return nn.functional.gelu(x.float()).type_as(x) - - def gelu_accurate(x): - if not hasattr(gelu_accurate, "_a"): - gelu_accurate._a = math.sqrt(2 / math.pi) - return (0.5 * x * (1 + (gelu_accurate._a * (x + 0.044715 * x.pow(3))).tanh())) - - if activation == "relu": return F.relu - elif activation == "relu_squared": return relu_squared - elif activation == "gelu": return gelu - elif activation == "gelu_fast": return gelu_accurate - elif activation == "gelu_accurate": return gelu_accurate - elif activation == "tanh": return torch.tanh - elif activation == "linear": return lambda x: x - elif activation == "swish": return nn.SiLU - else: raise RuntimeError - -class SamePad(nn.Module): - def __init__(self, kernel_size, causal=False): - super().__init__() - if causal: self.remove = kernel_size - 1 - else: self.remove = int(kernel_size % 2 == 0) - - def forward(self, x): - if self.remove > 0: x = x[:, :, : -self.remove] - return x - -class TransformerSentenceEncoderLayer(nn.Module): - def __init__(self, embedding_dim = 768, ffn_embedding_dim = 3072, num_attention_heads = 8, dropout = 0.1, attention_dropout = 0.1, activation_dropout = 0.1, activation_fn = "relu", layer_norm_first = False): - super().__init__() - self.embedding_dim = embedding_dim - self.dropout = dropout - self.activation_dropout = activation_dropout - self.activation_fn = get_activation_fn(activation_fn) - self.self_attn = 
MultiheadAttention(self.embedding_dim, num_attention_heads, dropout=attention_dropout, self_attention=True) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(self.activation_dropout) - self.dropout3 = nn.Dropout(dropout) - self.layer_norm_first = layer_norm_first - self.self_attn_layer_norm = LayerNorm(self.embedding_dim) - self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) - self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) - self.final_layer_norm = LayerNorm(self.embedding_dim) - - def forward(self, x, self_attn_mask=None, self_attn_padding_mask=None, need_weights=False, att_args=None): - residual = x - if self.layer_norm_first: - x = self.self_attn_layer_norm(x) - x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=self_attn_padding_mask, attn_mask=self_attn_mask, need_weights=False) - x = residual + self.dropout1(x) - residual = x - x = self.fc2(self.dropout2(self.activation_fn(self.fc1(self.final_layer_norm(x))))) - layer_result = x - x = residual + self.dropout3(x) - else: - x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=self_attn_padding_mask, need_weights=False) - x = self.self_attn_layer_norm(residual + self.dropout1(x)) - residual = x - x = self.fc2(self.dropout2(self.activation_fn(self.fc1(x)))) - layer_result = x - x = self.final_layer_norm(residual + self.dropout3(x)) - - return x, (attn, layer_result) - -class AdapterFast(nn.Module): - def __init__(self, adapter_num, input_dim, hidden_dim, act_fn): - super().__init__() - self.adapter_num = adapter_num - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.W_a = nn.Parameter(torch.empty(adapter_num, hidden_dim, input_dim)) - self.W_b = nn.Parameter(torch.empty(adapter_num, input_dim, hidden_dim)) - self.b_a = nn.Parameter(torch.empty(adapter_num, hidden_dim)) - self.b_b = nn.Parameter(torch.empty(adapter_num, input_dim)) - self.ln_W = nn.Parameter(torch.empty(adapter_num, input_dim)) - self.ln_b = 
nn.Parameter(torch.empty(adapter_num, input_dim)) - self.act_fn = nn.Identity() - if act_fn == "relu": self.act_fn = nn.ReLU() - elif act_fn == "gelu": self.act_fn = nn.GELU() - elif act_fn == "selu": self.act_fn = nn.SELU() - else: raise ValueError - self.input_dim = input_dim - self.reset_parameters() - - def reset_parameters(self): - for ii in range(self.adapter_num): - nn.init.kaiming_uniform_(self.W_a[ii], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.W_b[ii], a=math.sqrt(5)) - fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_a[ii]) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - nn.init.uniform_(self.b_a[ii], -bound, bound) - fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_b[ii]) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - nn.init.uniform_(self.b_b[ii], -bound, bound) - - nn.init.ones_(self.ln_W) - nn.init.zeros_(self.ln_b) - - def forward(self, x, adapter_id): - ii = adapter_id - return F.linear(self.act_fn(F.linear(F.layer_norm(x, (self.input_dim, ), self.ln_W[ii], self.ln_b[ii]), self.W_a[ii], self.b_a[ii])), self.W_b[ii], self.b_b[ii]) - - def extra_repr(self): - return ('adapter={}, input_dim={}, hidden_dim={}'.format(self.adapter_num, self.input_dim, self.hidden_dim)) - -class FeedForwardModule(nn.Module): - def __init__(self, input_feat, hidden_units, dropout1, dropout2, activation_fn="swish", bias=True): - super(FeedForwardModule, self).__init__() - self.layer_norm = LayerNorm(input_feat) - self.w_1 = nn.Linear(input_feat, hidden_units, bias=bias) - self.w_2 = nn.Linear(hidden_units, input_feat, bias=bias) - self.dropout1 = nn.Dropout(dropout1) - self.dropout2 = nn.Dropout(dropout2) - self.activation = get_activation_fn(activation_fn)(hidden_units) - - def forward(self, x): - return self.dropout2(self.w_2(self.dropout1(self.activation(self.w_1(self.layer_norm(x)))))) - -class ConvolutionModule(nn.Module): - def __init__(self, embed_dim, channels, depthwise_kernel_size, dropout, activation_fn="swish", bias=False, 
export=False): - super(ConvolutionModule, self).__init__() - assert (depthwise_kernel_size - 1) % 2 == 0 - self.layer_norm = LayerNorm(embed_dim, export=export) - self.pointwise_conv1 = nn.Conv1d(embed_dim, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias) - self.glu = nn.GLU(dim=1) - self.depthwise_conv = nn.Conv1d(channels, channels, depthwise_kernel_size, stride=1, padding=(depthwise_kernel_size - 1) // 2, groups=channels, bias=bias) - self.batch_norm = nn.BatchNorm1d(channels) - self.activation = get_activation_fn(activation_fn)(channels) - self.pointwise_conv2 = nn.Conv1d(channels, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias) - self.dropout = nn.Dropout(dropout) - - def forward(self, x): - return self.dropout(self.pointwise_conv2(self.activation(self.batch_norm(self.depthwise_conv(self.glu(self.pointwise_conv1(self.layer_norm(x).transpose(1, 2)))))))).transpose(1, 2) - -def rotate_half(x): - x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=x1.ndim - 1) - -def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): - cos, sin = (cos[offset : q.shape[0] + offset, ...], sin[offset : q.shape[0] + offset, ...]) - return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) - -class RotaryPositionalEmbedding(nn.Module): - def __init__(self, dim, base=10000, precision=torch.half): - super().__init__() - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer("inv_freq", inv_freq) - self.seq_len_cached = 0 - self.cos_cached = torch.empty(self.seq_len_cached, 1, 1, dim) - self.sin_cached = torch.empty(self.seq_len_cached, 1, 1, dim) - self.precision = precision - - def forward(self, x, seq_len = 0): - if seq_len > self.seq_len_cached: - self.seq_len_cached = seq_len - freqs = torch.einsum("i,j->ij", torch.arange(seq_len, device=x.device).type_as(self.inv_freq), self.inv_freq) - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.cos_cached 
= emb.cos().view(emb.size(0), 1, 1, emb.size(1)) - self.sin_cached = emb.sin().view(emb.size(0), 1, 1, emb.size(1)) - return self.cos_cached, self.sin_cached - -class ESPNETMultiHeadedAttention(nn.Module): - def __init__(self, n_feat, n_head, dropout): - super(ESPNETMultiHeadedAttention, self).__init__() - assert n_feat % n_head == 0 - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.attn = None - self.dropout = nn.Dropout(p=dropout) - - def forward_qkv(self, query, key, value, **kwargs): - n_batch = query.size(0) - return self.linear_q(query).view(n_batch, -1, self.h, self.d_k).transpose(1, 2), self.linear_k(key).view(n_batch, -1, self.h, self.d_k).transpose(1, 2), self.linear_v(value).view(n_batch, -1, self.h, self.d_k).transpose(1, 2) - - def forward_attention(self, value, scores, mask): - n_batch = value.size(0) - if mask is not None: - scores = scores.masked_fill(mask.unsqueeze(1).unsqueeze(2).to(bool), float("-inf")) - self.attn = scores.softmax(dim=-1) - else: self.attn = scores.softmax(dim=-1) - - return self.linear_out(((self.dropout(self.attn) @ value).transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k))) - - def forward(self, query, key, value, key_padding_mask=None, **kwargs): - q, k, v = self.forward_qkv(query.transpose(0, 1), key.transpose(0, 1), value.transpose(0, 1)) - return self.forward_attention(v, (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k), key_padding_mask).transpose(0, 1), None - -class RelPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): - def __init__(self, n_feat, n_head, dropout, zero_triu=False): - super().__init__(n_feat, n_head, dropout) - self.zero_triu = zero_triu - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.pos_bias_u = nn.Parameter(torch.zeros(self.h, self.d_k)) - self.pos_bias_v = 
nn.Parameter(torch.zeros(self.h, self.d_k)) - nn.init.xavier_uniform_(self.pos_bias_u) - nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x): - x = torch.cat([torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype), x], dim=-1).view(*x.size()[:2], x.size(3) + 1, x.size(2))[:, :, 1:].view_as(x)[:, :, :, : x.size(-1) // 2 + 1] - if self.zero_triu: x = x * torch.ones((x.size(2), x.size(3)), device=x.device).tril(x.size(3) - x.size(2))[None, None, :, :] - return x - - def forward(self, query, key, value, pos_emb, key_padding_mask=None, **kwargs): - pos_emb = pos_emb.transpose(0, 1) - q, k, v = self.forward_qkv(query.transpose(0, 1), key.transpose(0, 1), value.transpose(0, 1)) - q = q.transpose(1, 2) - - return self.forward_attention(v, (((q + self.pos_bias_u).transpose(1, 2) @ k.transpose(-2, -1)) + self.rel_shift(((q + self.pos_bias_v).transpose(1, 2) @ self.linear_pos(pos_emb).view(pos_emb.size(0), -1, self.h, self.d_k).transpose(1, 2).transpose(-2, -1)))) / math.sqrt(self.d_k), key_padding_mask).transpose(0, 1), None - -class RotaryPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): - def __init__(self, n_feat, n_head, dropout, precision, rotary_emd_base=10000): - super().__init__(n_feat, n_head, dropout) - precision = torch.float - self.rotary_ndims = self.d_k - if precision == "fp16": precision = torch.half - self.rotary_emb = RotaryPositionalEmbedding(self.rotary_ndims, base=rotary_emd_base, precision=precision) - - def forward(self, query, key, value, key_padding_mask=None, **kwargs): - T, B, C = value.size() - query = query.view(T, B, self.h, self.d_k) - key = key.view(T, B, self.h, self.d_k) - value = value.view(T, B, self.h, self.d_k) - cos, sin = self.rotary_emb(value, seq_len=T) - query, key = apply_rotary_pos_emb(query, key, cos, sin, offset=0) - query = query.view(T, B, self.h * self.d_k) - key = key.view(T, B, self.h * self.d_k) - value = value.view(T, B, self.h * self.d_k) - q, k, v = self.forward_qkv(query.transpose(0, 1), 
key.transpose(0, 1), value.transpose(0, 1)) - return self.forward_attention(v, (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k), key_padding_mask).transpose(0, 1), None - -class ConformerEncoderLayer(nn.Module): - def __init__(self, embed_dim, ffn_embed_dim, attention_heads, dropout, use_fp16, depthwise_conv_kernel_size=31, activation_fn="swish", attn_type=None, pos_enc_type="abs"): - self.pos_enc_type = pos_enc_type - super(ConformerEncoderLayer, self).__init__() - self.ffn1 = FeedForwardModule(embed_dim, ffn_embed_dim, dropout, dropout) - self.self_attn_layer_norm = LayerNorm(embed_dim, export=False) - self.self_attn_dropout = nn.Dropout(dropout) - if attn_type == "espnet": - if self.pos_enc_type == "rel_pos": self.self_attn = RelPositionMultiHeadedAttention(embed_dim, attention_heads, dropout=dropout) - elif self.pos_enc_type == "rope": self.self_attn = RotaryPositionMultiHeadedAttention(embed_dim, attention_heads, dropout=dropout, precision=use_fp16) - elif self.pos_enc_type == "abs": self.self_attn = ESPNETMultiHeadedAttention(embed_dim, attention_heads, dropout=dropout) - else: raise Exception - else: self.self_attn = MultiheadAttention(embed_dim, attention_heads, dropout=dropout) - self.conv_module = ConvolutionModule(embed_dim=embed_dim, channels=embed_dim, depthwise_kernel_size=depthwise_conv_kernel_size, dropout=dropout, activation_fn=activation_fn) - self.ffn2 = FeedForwardModule(embed_dim, ffn_embed_dim, dropout, dropout, activation_fn=activation_fn) - self.final_layer_norm = LayerNorm(embed_dim, export=False) - - def forward(self, x, encoder_padding_mask, position_emb = None): - residual = x - x = self.ffn1(x) * 0.5 + residual - residual = x - x = self.self_attn_layer_norm(x) - if self.pos_enc_type == "rel_pos": x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask, pos_emb=position_emb, need_weights=False) - else: x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask, 
need_weights=False) - x = self.self_attn_dropout(x) - x = x + residual - residual = x - x = residual + self.conv_module(x.transpose(0, 1)).transpose(0, 1) - residual = x - x = self.ffn2(x) - layer_result = x - x = self.final_layer_norm(x * 0.5 + residual) - return x, (attn, layer_result) - -class ConformerWav2Vec2EncoderLayer(ConformerEncoderLayer): - def forward(self, x, self_attn_mask=None, self_attn_padding_mask=None, need_weights=False, att_args=None, position_emb=None): - return super().forward(x, self_attn_padding_mask, position_emb) - -class TransformerSentenceEncoderWithAdapterLayer(TransformerSentenceEncoderLayer): - def __init__(self, embedding_dim = 768, ffn_embedding_dim = 3072, num_attention_heads = 8, dropout = 0.1, attention_dropout = 0.1, activation_dropout = 0.1, activation_fn = "relu", layer_norm_first = False, adapter_num=201, adapter_dim=64, adapter_act_fn="relu"): - super().__init__(embedding_dim=embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, layer_norm_first=layer_norm_first) - self.adapter_num = adapter_num - self.adapter_dim = adapter_dim - self.adapter_layer = AdapterFast(adapter_num, self.embedding_dim, self.adapter_dim, adapter_act_fn) - - def forward(self, x, self_attn_mask=None, self_attn_padding_mask=None, need_weights=False, att_args=None, corpus_key=None): - x, (attn, layer_result) = super().forward(x=x, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, need_weights=need_weights, att_args=att_args) - assert corpus_key is not None - assert len(set(corpus_key)) == 1 - return x + self.adapter_layer(x, corpus_key[0]), (attn, layer_result) - -class TransposeLast(nn.Module): - def __init__(self, deconstruct_idx=None, tranpose_dim=-2): - super().__init__() - self.deconstruct_idx = deconstruct_idx - self.tranpose_dim = tranpose_dim - - def 
forward(self, x): - if self.deconstruct_idx is not None: x = x[self.deconstruct_idx] - return x.transpose(self.tranpose_dim, -1) - -class TransformerEncoder(nn.Module): - def build_encoder_layer(self, args, **kwargs): - if args.layer_type == "transformer": layer = TransformerSentenceEncoderLayer(embedding_dim=self.embedding_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, dropout=self.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, layer_norm_first=args.layer_norm_first) - elif args.layer_type == "conformer": layer = ConformerWav2Vec2EncoderLayer(embed_dim=self.embedding_dim, ffn_embed_dim=args.encoder_ffn_embed_dim, attention_heads=args.encoder_attention_heads, dropout=args.dropout, depthwise_conv_kernel_size=args.depthwise_conv_kernel_size, activation_fn="swish", attn_type=args.attn_type, use_fp16=args.fp16, pos_enc_type="abs") - elif args.layer_type == "trf_adp": - use_adp = False - if args.adp_trf_idx == "all": use_adp = True - else: - if kwargs.get("layer_idx", None) in list(range(*[int(g) for g in args.adp_trf_idx.split(":")])): use_adp = True - - layer = TransformerSentenceEncoderWithAdapterLayer(embedding_dim=self.embedding_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, dropout=self.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, layer_norm_first=args.layer_norm_first, adapter_num=args.adp_num, adapter_dim=args.adp_dim, adapter_act_fn=args.adp_act_fn) if use_adp else TransformerSentenceEncoderLayer(embedding_dim=self.embedding_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, dropout=self.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, 
layer_norm_first=args.layer_norm_first,) - - return layer - - def __init__(self, args): - super().__init__() - self.dropout = args.dropout - self.embedding_dim = args.encoder_embed_dim - self.required_seq_len_multiple = args.required_seq_len_multiple - pos_conv_depth = getattr(args, "pos_conv_depth", 1) - if pos_conv_depth > 1: - num_layers = args.pos_conv_depth - k = max(3, args.conv_pos // num_layers) - - def make_conv_block(e, k, g, l): - return nn.Sequential(*[nn.Sequential(nn.Conv1d(e, e, kernel_size=k, padding=k // 2, groups=g), SamePad(k), TransposeLast(), LayerNorm(e, elementwise_affine=False), TransposeLast(), nn.GELU()) for _ in range(l)]) - - self.pos_conv = make_conv_block(self.embedding_dim, k, args.conv_pos_groups, num_layers) - else: self.pos_conv = make_conv_pos(self.embedding_dim, args.conv_pos, args.conv_pos_groups) - - self.layers = nn.ModuleList([self.build_encoder_layer(args, layer_idx=ii) for ii in range(args.encoder_layers)]) - self.layer_norm_first = args.layer_norm_first - self.layer_norm = LayerNorm(self.embedding_dim) - self.layerdrop = args.encoder_layerdrop - self.apply(init_bert_params) - - def forward(self, x, padding_mask=None, layer=None, corpus_key=None): - x, layer_results = self.extract_features(x, padding_mask, layer, corpus_key=corpus_key) - if self.layer_norm_first and layer is None: x = self.layer_norm(x) - return x, layer_results - - def extract_features(self, x, padding_mask=None, tgt_layer=None, min_layer=0, corpus_key=None): - if padding_mask is not None: x = index_put(x, padding_mask, 0) - x = x + self.pos_conv(x.transpose(1, 2)).transpose(1, 2) - if not self.layer_norm_first: x = self.layer_norm(x) - x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0) - if pad_length > 0 and padding_mask is None: - padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool) - padding_mask[:, -pad_length:] = True - else: padding_mask, _ = pad_to_multiple(padding_mask, 
self.required_seq_len_multiple, dim=-1, value=True) - x = F.dropout(x, p=self.dropout, training=self.training).transpose(0, 1) - layer_results = [] - r = None - - for i, layer in enumerate(self.layers): - dropout_probability = np.random.random() if self.layerdrop > 0 else 1 - if not self.training or (dropout_probability > self.layerdrop): - layer_check = layer - if (corpus_key is None) or (not isinstance(layer_check, (TransformerSentenceEncoderWithAdapterLayer))): x, (z, lr) = layer(x, self_attn_padding_mask=padding_mask, need_weights=False) - else: x, (z, lr) = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, corpus_key=corpus_key) - if i >= min_layer: layer_results.append((x, z, lr)) - if i == tgt_layer: - r = x - break - - if r is not None: x = r - x = x.transpose(0, 1) - - if pad_length > 0: - x = x[:, :-pad_length] - def undo_pad(a, b, c): - return (a[:-pad_length], b[:-pad_length] if b is not None else b, c[:-pad_length]) - - layer_results = [undo_pad(*u) for u in layer_results] - - return x, layer_results - - def max_positions(self): - return self.args.max_positions - - def upgrade_state_dict_named(self, state_dict, name): - return state_dict - -class Fp32GroupNorm(nn.GroupNorm): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def forward(self, input): - output = F.group_norm(input.float(), self.num_groups, self.weight.float() if self.weight is not None else None, self.bias.float() if self.bias is not None else None, self.eps) - return output.type_as(input) - -class Fp32LayerNorm(nn.LayerNorm): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def forward(self, input): - output = F.layer_norm(input.float(), self.normalized_shape, self.weight.float() if self.weight is not None else None, self.bias.float() if self.bias is not None else None, self.eps) - return output.type_as(input) - -class ConvFeatureExtractionModel(nn.Module): - def __init__(self, conv_layers, dropout = 0.0, mode = 
"default", conv_bias = False): - super().__init__() - assert mode in {"default", "layer_norm"} - - def block(n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False): - def make_conv(): - conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias) - nn.init.kaiming_normal_(conv.weight) - return conv - - assert (is_layer_norm and is_group_norm) == False - - if is_layer_norm: return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.Sequential(TransposeLast(), Fp32LayerNorm(dim, elementwise_affine=True), TransposeLast()), nn.GELU()) - elif is_group_norm: return nn.Sequential(make_conv(), nn.Dropout(p=dropout), Fp32GroupNorm(dim, dim, affine=True), nn.GELU()) - else: return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) - - in_d = 1 - self.conv_layers = nn.ModuleList() - for i, cl in enumerate(conv_layers): - assert len(cl) == 3 - (dim, k, stride) = cl - self.conv_layers.append(block(in_d, dim, k, stride, is_layer_norm=mode == "layer_norm", is_group_norm=mode == "default" and i == 0, conv_bias=conv_bias)) - in_d = dim - - def forward(self, x): - x = x.unsqueeze(1) - for conv in self.conv_layers: - x = conv(x) - - return x - -class GradMultiply(torch.autograd.Function): - @staticmethod - def forward(ctx, x, scale): - ctx.scale = scale - res = x.new(x) - return res - - @staticmethod - def backward(ctx, grad): - return grad * ctx.scale, None - -class BaseFairseqModel(nn.Module): - def __init__(self): - super().__init__() - self._is_generation_fast = False - - def get_targets(self, sample, net_output): - return sample["target"] - - def extract_features(self, *args, **kwargs): - return self(*args, **kwargs) - - def load_state_dict(self, state_dict, strict=True, model_cfg = None, args = None): - self.upgrade_state_dict(state_dict) - new_state_dict = prune_state_dict(state_dict, model_cfg) - return super().load_state_dict(new_state_dict, strict) - - def upgrade_state_dict(self, state_dict): - 
self.upgrade_state_dict_named(state_dict, "") - - def upgrade_state_dict_named(self, state_dict, name): - assert state_dict is not None - - def do_upgrade(m, prefix): - if len(prefix) > 0: prefix += "." - for n, c in m.named_children(): - name = prefix + n - if hasattr(c, "upgrade_state_dict_named"): c.upgrade_state_dict_named(state_dict, name) - elif hasattr(c, "upgrade_state_dict"): c.upgrade_state_dict(state_dict) - do_upgrade(c, name) - - do_upgrade(self, name) - - def make_generation_fast_(self, **kwargs): - if self._is_generation_fast: return - self._is_generation_fast = True - - def apply_remove_weight_norm(module): - try: - nn.utils.remove_weight_norm(module) - except (AttributeError, ValueError): - return - - self.apply(apply_remove_weight_norm) - def apply_make_generation_fast_(module, prefix): - if len(prefix) > 0: prefix += "." - - base_func = BaseFairseqModel.make_generation_fast_ - for n, m in module.named_modules(): - if (m != self and hasattr(m, "make_generation_fast_") and m.make_generation_fast_.__func__ is not base_func): m.make_generation_fast_(name=prefix + n, **kwargs) - - apply_make_generation_fast_(self, "") - self.eval() - -class HubertConfig: - def __init__( - self, - _name = None, - label_rate = 50, - encoder_layers_1 = 3, - logit_temp_ctr = 0.1, - num_negatives = 100, - cross_sample_negatives = 0, - ctr_layers = [-6], - crop_seq_to_multiple = 1, - extractor_mode = "default", - encoder_layers = 12, - encoder_embed_dim = 768, - encoder_ffn_embed_dim = 3072, - encoder_attention_heads = 12, - activation_fn = "gelu", - layer_type = "transformer", - dropout = 0.1, - attention_dropout = 0.1, - activation_dropout = 0.0, - encoder_layerdrop = 0.0, - dropout_input = 0.0, - dropout_features = 0.0, - final_dim = 0, - untie_final_proj = False, - layer_norm_first = False, - conv_feature_layers = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", - conv_bias = False, - logit_temp = 0.1, - target_glu = False, - feature_grad_mult = 1.0, - mask_length = 
10, - mask_prob = 0.65, - mask_selection = "static", - mask_other = 0.0, - no_mask_overlap = False, - mask_min_space = 1, - mask_channel_length = 10, - mask_channel_prob = 0.0, - mask_channel_selection = "static", - mask_channel_other = 0.0, - no_mask_channel_overlap = False, - mask_channel_min_space = 1, - conv_pos = 128, - conv_pos_groups = 16, - conv_pos_batch_norm = False, - latent_temp = (2, 0.5, 0.999995), - skip_masked = False, - skip_nomask = False, - checkpoint_activations = False, - required_seq_len_multiple = 2, - depthwise_conv_kernel_size = 31, - attn_type = "", - pos_enc_type = "abs", - fp16 = False - ): - self._name = _name - self.label_rate = label_rate - self.encoder_layers_1 = encoder_layers_1 - self.logit_temp_ctr = logit_temp_ctr - self.num_negatives = num_negatives - self.cross_sample_negatives = cross_sample_negatives - self.ctr_layers = ctr_layers - self.crop_seq_to_multiple = crop_seq_to_multiple - self.extractor_mode = extractor_mode - self.encoder_layers = encoder_layers - self.encoder_embed_dim = encoder_embed_dim - self.encoder_ffn_embed_dim = encoder_ffn_embed_dim - self.encoder_attention_heads = encoder_attention_heads - self.activation_fn = activation_fn - self.layer_type = layer_type - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.encoder_layerdrop = encoder_layerdrop - self.dropout_input = dropout_input - self.dropout_features = dropout_features - self.final_dim = final_dim - self.untie_final_proj = untie_final_proj - self.layer_norm_first = layer_norm_first - self.conv_feature_layers = conv_feature_layers - self.conv_bias = conv_bias - self.logit_temp = logit_temp - self.target_glu = target_glu - self.feature_grad_mult = feature_grad_mult - self.mask_length = mask_length - self.mask_prob = mask_prob - self.mask_selection = mask_selection - self.mask_other = mask_other - self.no_mask_overlap = no_mask_overlap - self.mask_min_space = mask_min_space - 
self.mask_channel_length = mask_channel_length - self.mask_channel_prob = mask_channel_prob - self.mask_channel_selection = mask_channel_selection - self.mask_channel_other = mask_channel_other - self.no_mask_channel_overlap = no_mask_channel_overlap - self.mask_channel_min_space = mask_channel_min_space - self.conv_pos = conv_pos - self.conv_pos_groups = conv_pos_groups - self.conv_pos_batch_norm = conv_pos_batch_norm - self.latent_temp = latent_temp - self.skip_masked = skip_masked - self.skip_nomask = skip_nomask - self.checkpoint_activations = checkpoint_activations - self.required_seq_len_multiple = required_seq_len_multiple - self.depthwise_conv_kernel_size = depthwise_conv_kernel_size - self.attn_type = attn_type - self.pos_enc_type = pos_enc_type - self.fp16 = fp16 - -class HubertModel(BaseFairseqModel): - def __init__(self, cfg, num_classes): - super().__init__() - feature_enc_layers = eval(cfg.conv_feature_layers) - self.embed = feature_enc_layers[-1][0] - self.feature_extractor = ConvFeatureExtractionModel(conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias) - feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers]) - self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / 16000 - self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None) - self.mask_prob = cfg.mask_prob - self.mask_selection = cfg.mask_selection - self.mask_other = cfg.mask_other - self.mask_length = cfg.mask_length - self.no_mask_overlap = cfg.no_mask_overlap - self.mask_min_space = cfg.mask_min_space - self.mask_channel_prob = cfg.mask_channel_prob - self.mask_channel_selection = cfg.mask_channel_selection - self.mask_channel_other = cfg.mask_channel_other - self.mask_channel_length = cfg.mask_channel_length - self.no_mask_channel_overlap = cfg.no_mask_channel_overlap - self.mask_channel_min_space = cfg.mask_channel_min_space - self.dropout_input = 
nn.Dropout(cfg.dropout_input) - self.dropout_features = nn.Dropout(cfg.dropout_features) - self.feature_grad_mult = cfg.feature_grad_mult - self.logit_temp = cfg.logit_temp - self.skip_masked = cfg.skip_masked - self.skip_nomask = cfg.skip_nomask - final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim - self.mask_emb = nn.Parameter(torch.FloatTensor(cfg.encoder_embed_dim).uniform_()) - self.encoder = TransformerEncoder(cfg) - self.layer_norm = LayerNorm(self.embed) - self.target_glu = None - if cfg.target_glu: self.target_glu = nn.Sequential(nn.Linear(final_dim, final_dim * 2), nn.GLU()) - self.untie_final_proj = cfg.untie_final_proj - self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim) - self.num_classes = [num_classes] - self.label_embs_concat = nn.Parameter(torch.FloatTensor(sum(self.num_classes), final_dim)) - nn.init.uniform_(self.label_embs_concat) - - def upgrade_state_dict_named(self, state_dict, name): - super().upgrade_state_dict_named(state_dict, name) - return state_dict - - def apply_mask(self, x, padding_mask, target_list): - B, T, C = x.shape - if self.mask_prob > 0: - mask_indices = torch.from_numpy(compute_mask_indices((B, T), padding_mask, self.mask_prob, self.mask_length, self.mask_selection, self.mask_other, min_masks=2, no_overlap=self.no_mask_overlap, min_space=self.mask_min_space)).to(x.device) - x[mask_indices] = self.mask_emb - else: mask_indices = None - - if self.mask_channel_prob > 0: x[(torch.from_numpy(compute_mask_indices((B, C), None, self.mask_channel_prob, self.mask_channel_length, self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, min_space=self.mask_channel_min_space)).to(x.device).unsqueeze(1).expand(-1, T, -1))] = 0 - return x, mask_indices - - def compute_nce(self, x, pos, negs): - neg_is_pos = (pos == negs).all(-1) - logits = torch.cosine_similarity(x.float(), torch.cat([pos.unsqueeze(0), negs], dim=0).float(), dim=-1).type_as(x) - logits /= 
self.logit_temp - if neg_is_pos.any(): logits[1:][neg_is_pos] = float("-inf") - return logits.transpose(0, 1) - - def forward_features(self, source): - if self.feature_grad_mult > 0: - features = self.feature_extractor(source) - if self.feature_grad_mult != 1.0: features = GradMultiply.apply(features, self.feature_grad_mult) - else: - with torch.no_grad(): - features = self.feature_extractor(source) - return features - - def forward_targets(self, features, target_list): - feat_tsz = features.size(2) - targ_tsz = min([t.size(1) for t in target_list]) - if self.feat2tar_ratio * feat_tsz > targ_tsz: - feat_tsz = int(targ_tsz / self.feat2tar_ratio) - features = features[..., :feat_tsz] - - return features, [t[:, (torch.arange(feat_tsz).float() * self.feat2tar_ratio).long()] for t in target_list] - - def forward_padding_mask(self, features, padding_mask): - extra = padding_mask.size(1) % features.size(1) - if extra > 0: padding_mask = padding_mask[:, :-extra] - return padding_mask.view(padding_mask.size(0), features.size(1), -1).all(-1) - - def forward(self, source, target_list = None, padding_mask = None, mask = True, features_only = False, output_layer = None): - features = self.forward_features(source) - if target_list is not None: features, target_list = self.forward_targets(features, target_list) - features_pen = features.float().pow(2).mean() - features = self.layer_norm(features.transpose(1, 2)) - unmasked_features = features.clone() - if padding_mask is not None: padding_mask = self.forward_padding_mask(features, padding_mask) - if self.post_extract_proj is not None: features = self.post_extract_proj(features) - features = self.dropout_input(features) - unmasked_features = self.dropout_features(unmasked_features) - if mask: x, mask_indices = self.apply_mask(features, padding_mask, target_list) - else: x, mask_indices = features, None - x, _ = self.encoder(x, padding_mask=padding_mask, layer=None if output_layer is None else output_layer - 1) - if features_only: 
return {"x": x, "padding_mask": padding_mask, "features": features} - - def compute_pred(proj_x, target, label_embs): - y = torch.index_select(label_embs, 0, target.long()) - negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1) - if self.target_glu: - y = self.target_glu(y) - negs = self.target_glu(negs) - - return self.compute_nce(proj_x, y, negs) - - label_embs_list = self.label_embs_concat.split(self.num_classes, 0) - if not self.skip_masked: - masked_indices = torch.logical_and(~padding_mask, mask_indices) - proj_x_m = self.final_proj(x[masked_indices]) - logit_m_list = [compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) for i, (proj_x_m, t) in enumerate(zip(proj_x_m.chunk(len(target_list), dim=-1) if self.untie_final_proj else [proj_x_m for _ in range(len(target_list))], target_list))] - else: logit_m_list = [None for _ in target_list] - - if not self.skip_nomask: - nomask_indices = torch.logical_and(~padding_mask, ~mask_indices) - proj_x_u = self.final_proj(x[nomask_indices]) - logit_u_list = [compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i]) for i, (proj_x_u, t) in enumerate(zip(proj_x_u.chunk(len(target_list), dim=-1) if self.untie_final_proj else [proj_x_u for _ in range(len(target_list))], target_list))] - else: logit_u_list = [None for _ in target_list] - - return {"logit_m_list": logit_m_list, "logit_u_list": logit_u_list, "padding_mask": padding_mask, "features_pen": features_pen} - - def extract_features(self, source, padding_mask = None, mask = False, ret_conv = False, output_layer = None): - res = self.forward(source, padding_mask=padding_mask, mask=mask, features_only=True, output_layer=output_layer) - return res["features"] if ret_conv else res["x"], res["padding_mask"] - - def get_logits(self, net_output, is_masked=True): - return [x.float() for x in (net_output["logit_m_list"] if is_masked else net_output["logit_u_list"]) if x is not None] - - def get_targets(self, net_output, is_masked=True): - return 
[x.new_zeros(x.size(0), dtype=torch.long) for x in self.get_logits(net_output, is_masked)] - - def get_extra_losses(self, net_output): - extra_losses, names = [], [] - if "features_pen" in net_output: - extra_losses.append(net_output["features_pen"]) - names.append("features_pen") - - return extra_losses, names - - def remove_pretraining_modules(self): - self.target_glu = None - self.final_proj = None \ No newline at end of file diff --git a/main/library/embedders/onnx.py b/main/library/embedders/onnx.py deleted file mode 100644 index 2424c06d2e37a9d212127cf5a50e37435498c36f..0000000000000000000000000000000000000000 --- a/main/library/embedders/onnx.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch -import onnxruntime - -class HubertModelONNX: - def __init__(self, embedder_model_path, providers, device): - sess_options = onnxruntime.SessionOptions() - sess_options.log_severity_level = 3 - self.model = onnxruntime.InferenceSession(embedder_model_path, sess_options=sess_options, providers=providers) - self.final_proj = self._final_proj - self.device = device - - def _final_proj(self, source): - return source - - def extract_features(self, source, padding_mask = None, output_layer = None): - logits = self.model.run([self.model.get_outputs()[0].name, self.model.get_outputs()[1].name], {"feats": source.detach().cpu().numpy()}) - return [torch.as_tensor(logits[int(output_layer != 9)], dtype=torch.float32, device=self.device)] \ No newline at end of file diff --git a/main/library/embedders/ppg.py b/main/library/embedders/ppg.py deleted file mode 100644 index be315b6301c315f9492f5872ae26d680cbaa1724..0000000000000000000000000000000000000000 --- a/main/library/embedders/ppg.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import sys -import torch - -sys.path.append(os.getcwd()) - -from main.library.speaker_diarization.whisper import Whisper, ModelDimensions, log_mel_spectrogram, pad_or_trim - -class WhisperModel(torch.nn.Module): - def __init__(self, model_path, device): - 
super().__init__() - checkpoint = torch.load(model_path, map_location="cpu") - dims = ModelDimensions(**checkpoint["dims"]) - self.final_proj = torch.nn.Linear(dims.n_text_state, 768) - self.model = Whisper(dims) - self.model.load_state_dict(checkpoint["model_state_dict"]) - self.model = self.model.to(device) - del self.model.decoder - - def forward(self, audio): - ppgln = audio.shape[1] // 320 - mel = log_mel_spectrogram(pad_or_trim(audio[0])).to(audio.device) - - with torch.no_grad(): - ppg_raw = self.model.encoder(mel.unsqueeze(0)) - ppg_projected = self.final_proj(ppg_raw) - ppg = ppg_projected.data.float() - ppg = ppg[:, :ppgln, :] - - return [ppg] - - def extract_features(self, source, padding_mask = None, output_layer = None): - return self.forward(source) \ No newline at end of file diff --git a/main/library/embedders/transformers.py b/main/library/embedders/transformers.py deleted file mode 100644 index f99284a3ce751dce80e7c422794bb26e761ad3dc..0000000000000000000000000000000000000000 --- a/main/library/embedders/transformers.py +++ /dev/null @@ -1,10 +0,0 @@ -from torch import nn -from transformers import HubertModel - -class HubertModelWithFinalProj(HubertModel): - def __init__(self, config): - super().__init__(config) - self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) - - def extract_features(self, source, padding_mask = None, output_layer = None): - return self.forward(source) \ No newline at end of file diff --git a/main/library/generators/hifigan.py b/main/library/generators/hifigan.py deleted file mode 100644 index 06863292f6d059c2cea069706281d354180d5e60..0000000000000000000000000000000000000000 --- a/main/library/generators/hifigan.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import sys -import torch - -import torch.nn.utils.parametrize as parametrize - -from torch.nn.utils import remove_weight_norm -from torch.nn.utils.parametrizations import weight_norm - -sys.path.append(os.getcwd()) - -from 
main.library.algorithm.commons import init_weights -from main.library.algorithm.residuals import ResBlock, LRELU_SLOPE - -class HiFiGANGenerator(torch.nn.Module): - def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): - super(HiFiGANGenerator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - self.ups = torch.nn.ModuleList() - self.resblocks = torch.nn.ModuleList() - - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(k - u) // 2))) - ch = upsample_initial_channel // (2 ** (i + 1)) - - for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes): - self.resblocks.append(ResBlock(ch, k, d)) - - self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, g = None): - x = self.conv_pre(x) - if g is not None: x += self.cond(g) - - for i in range(self.num_upsamples): - x = self.ups[i](torch.nn.functional.leaky_relu(x, LRELU_SLOPE)) - xs = None - - for j in range(self.num_kernels): - if xs is None: xs = self.resblocks[i * self.num_kernels + j](x) - else: xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - - return self.conv_post(torch.nn.functional.leaky_relu(x)).tanh() - - def remove_weight_norm(self): - for l in self.ups: - if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) - else: remove_weight_norm(l) - - for l in self.resblocks: - l.remove_weight_norm() \ 
No newline at end of file diff --git a/main/library/generators/mrf_hifigan.py b/main/library/generators/mrf_hifigan.py deleted file mode 100644 index c6fb42d1503b62d651ff274d8b03ddae12a2acf6..0000000000000000000000000000000000000000 --- a/main/library/generators/mrf_hifigan.py +++ /dev/null @@ -1,161 +0,0 @@ -import math -import torch - -import numpy as np -import torch.nn as nn -import torch.nn.functional as F -import torch.nn.utils.parametrize as parametrize - -from torch.nn.utils import remove_weight_norm -from torch.utils.checkpoint import checkpoint -from torch.nn.utils.parametrizations import weight_norm - -LRELU_SLOPE = 0.1 - -class MRFLayer(nn.Module): - def __init__(self, channels, kernel_size, dilation): - super().__init__() - self.conv1 = weight_norm(nn.Conv1d(channels, channels, kernel_size, padding=(kernel_size * dilation - dilation) // 2, dilation=dilation)) - self.conv2 = weight_norm(nn.Conv1d(channels, channels, kernel_size, padding=kernel_size // 2, dilation=1)) - - def forward(self, x): - return x + self.conv2(F.leaky_relu(self.conv1(F.leaky_relu(x, LRELU_SLOPE)), LRELU_SLOPE)) - - def remove_weight_norm(self): - if hasattr(self.conv1, "parametrizations") and "weight" in self.conv1.parametrizations: parametrize.remove_parametrizations(self.conv1, "weight", leave_parametrized=True) - else: remove_weight_norm(self.conv1) - - if hasattr(self.conv2, "parametrizations") and "weight" in self.conv2.parametrizations: parametrize.remove_parametrizations(self.conv2, "weight", leave_parametrized=True) - else: remove_weight_norm(self.conv2) - -class MRFBlock(nn.Module): - def __init__(self, channels, kernel_size, dilations): - super().__init__() - self.layers = nn.ModuleList() - - for dilation in dilations: - self.layers.append(MRFLayer(channels, kernel_size, dilation)) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - - return x - - def remove_weight_norm(self): - for layer in self.layers: - layer.remove_weight_norm() - -class 
SineGenerator(nn.Module): - def __init__(self, samp_rate, harmonic_num = 0, sine_amp = 0.1, noise_std = 0.003, voiced_threshold = 0): - super(SineGenerator, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - uv = torch.ones_like(f0) * (f0 > self.voiced_threshold) - if uv.device.type == "privateuseone": uv = uv.float() - - return uv - - def _f02sine(self, f0_values): - rad_values = (f0_values / self.sampling_rate) % 1 - rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], dtype=f0_values.dtype, device=f0_values.device) - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1) % 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - - return (torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi).sin() - - def forward(self, f0): - with torch.no_grad(): - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, dtype=f0.dtype, device=f0.device) - f0_buf[:, :, 0] = f0[:, :, 0] - - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) - - sine_waves = self._f02sine(f0_buf) * self.sine_amp - uv = self._f02uv(f0) - sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves)) - - return sine_waves - -class SourceModuleHnNSF(nn.Module): - def __init__(self, sampling_rate, harmonic_num = 0, sine_amp = 0.1, add_noise_std = 0.003, voiced_threshold = 0): - super(SourceModuleHnNSF, self).__init__() - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.l_sin_gen = SineGenerator(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold) - self.l_linear = 
nn.Linear(harmonic_num + 1, 1) - self.l_tanh = nn.Tanh() - - def forward(self, x): - return self.l_tanh(self.l_linear(self.l_sin_gen(x).to(dtype=self.l_linear.weight.dtype))) - -class HiFiGANMRFGenerator(nn.Module): - def __init__(self, in_channel, upsample_initial_channel, upsample_rates, upsample_kernel_sizes, resblock_kernel_sizes, resblock_dilations, gin_channels, sample_rate, harmonic_num, checkpointing = False): - super().__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.checkpointing = checkpointing - self.f0_upsample = nn.Upsample(scale_factor=np.prod(upsample_rates)) - self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num) - self.conv_pre = weight_norm(nn.Conv1d(in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3)) - self.upsamples = nn.ModuleList() - self.noise_convs = nn.ModuleList() - stride_f0s = [math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 for i in range(len(upsample_rates))] - - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.upsamples.append(weight_norm(nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), kernel_size=k, stride=u, padding=((k - u) // 2) if u % 2 == 0 else (u // 2 + u % 2), output_padding=u % 2))) - stride = stride_f0s[i] - kernel = 1 if stride == 1 else stride * 2 - stride % 2 - self.noise_convs.append(nn.Conv1d(1, upsample_initial_channel // (2 ** (i + 1)), kernel_size=kernel, stride=stride, padding=0 if stride == 1 else (kernel - stride) // 2)) - - self.mrfs = nn.ModuleList() - for i in range(len(self.upsamples)): - channel = upsample_initial_channel // (2 ** (i + 1)) - self.mrfs.append(nn.ModuleList([MRFBlock(channel, kernel_size=k, dilations=d) for k, d in zip(resblock_kernel_sizes, resblock_dilations)])) - - self.conv_post = weight_norm(nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3)) - if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 
- - def forward(self, x, f0, g = None): - har_source = self.m_source(self.f0_upsample(f0[:, None, :]).transpose(-1, -2)).transpose(-1, -2) - x = self.conv_pre(x) - if g is not None: x += self.cond(g) - - for ups, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs): - x = F.leaky_relu(x, LRELU_SLOPE) - - if self.training and self.checkpointing: - x = checkpoint(ups, x, use_reentrant=False) + noise_conv(har_source) - xs = sum([checkpoint(layer, x, use_reentrant=False) for layer in mrf]) - else: - x = ups(x) + noise_conv(har_source) - xs = sum([layer(x) for layer in mrf]) - - x = xs / self.num_kernels - - return self.conv_post(F.leaky_relu(x)).tanh() - - def remove_weight_norm(self): - if hasattr(self.conv_pre, "parametrizations") and "weight" in self.conv_pre.parametrizations: parametrize.remove_parametrizations(self.conv_pre, "weight", leave_parametrized=True) - else: remove_weight_norm(self.conv_pre) - - for up in self.upsamples: - if hasattr(up, "parametrizations") and "weight" in up.parametrizations: parametrize.remove_parametrizations(up, "weight", leave_parametrized=True) - else: remove_weight_norm(up) - - for mrf in self.mrfs: - for block in mrf: - block.remove_weight_norm() - - if hasattr(self.conv_post, "parametrizations") and "weight" in self.conv_post.parametrizations: parametrize.remove_parametrizations(self.conv_post, "weight", leave_parametrized=True) - else: remove_weight_norm(self.conv_post) \ No newline at end of file diff --git a/main/library/generators/nsf_hifigan.py b/main/library/generators/nsf_hifigan.py deleted file mode 100644 index 03db71db3488e909958f0e28e62d0af98e18b0c0..0000000000000000000000000000000000000000 --- a/main/library/generators/nsf_hifigan.py +++ /dev/null @@ -1,122 +0,0 @@ -import os -import sys -import math -import torch - -import numpy as np -import torch.nn.functional as F -import torch.nn.utils.parametrize as parametrize - -from torch.nn.utils import remove_weight_norm -from torch.utils.checkpoint import 
checkpoint -from torch.nn.utils.parametrizations import weight_norm - -sys.path.append(os.getcwd()) - -from main.library.algorithm.commons import init_weights -from main.library.algorithm.residuals import ResBlock, LRELU_SLOPE - -class SineGen(torch.nn.Module): - def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0, flag_for_pulse=False): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - uv = torch.ones_like(f0) * (f0 > self.voiced_threshold) - if uv.device.type == "privateuseone": uv = uv.float() - - return uv - - def _f02sine(self, f0, upp): - rad = f0 / self.sampling_rate * torch.arange(1, upp + 1, dtype=f0.dtype, device=f0.device) - rad += F.pad((torch.fmod(rad[:, :-1, -1:].float() + 0.5, 1.0) - 0.5).cumsum(dim=1).fmod(1.0).to(f0), (0, 0, 1, 0), mode='constant') - rad = rad.reshape(f0.shape[0], -1, 1) - rad *= torch.arange(1, self.dim + 1, dtype=f0.dtype, device=f0.device).reshape(1, 1, -1) - rand_ini = torch.rand(1, 1, self.dim, device=f0.device) - rand_ini[..., 0] = 0 - rad += rand_ini - - return (2 * np.pi * rad).sin() - - def forward(self, f0, upp): - with torch.no_grad(): - f0 = f0.unsqueeze(-1) - sine_waves = self._f02sine(f0, upp) * self.sine_amp - uv = F.interpolate(self._f02uv(f0).transpose(2, 1), scale_factor=float(upp), mode="nearest").transpose(2, 1) - sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves)) - - return sine_waves - -class SourceModuleHnNSF(torch.nn.Module): - def __init__(self, sample_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0): - super(SourceModuleHnNSF, self).__init__() - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.l_sin_gen = SineGen(sample_rate, harmonic_num, 
sine_amp, add_noise_std, voiced_threshod) - self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x, upsample_factor = 1): - return self.l_tanh(self.l_linear(self.l_sin_gen(x, upsample_factor).to(dtype=self.l_linear.weight.dtype))) - -class HiFiGANNRFGenerator(torch.nn.Module): - def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, sr, checkpointing = False): - super(HiFiGANNRFGenerator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.upp = math.prod(upsample_rates) - self.f0_upsamp = torch.nn.Upsample(scale_factor=self.upp) - self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0) - - self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - self.checkpointing = checkpointing - - self.ups = torch.nn.ModuleList() - self.noise_convs = torch.nn.ModuleList() - - channels = [upsample_initial_channel // (2 ** (i + 1)) for i in range(self.num_upsamples)] - stride_f0s = [math.prod(upsample_rates[i + 1 :]) if i + 1 < self.num_upsamples else 1 for i in range(self.num_upsamples)] - - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), channels[i], k, u, padding=((k - u) // 2) if u % 2 == 0 else (u // 2 + u % 2), output_padding=u % 2))) - stride = stride_f0s[i] - kernel = 1 if stride == 1 else stride * 2 - stride % 2 - self.noise_convs.append(torch.nn.Conv1d(1, channels[i], kernel_size=kernel, stride=stride, padding=0 if stride == 1 else (kernel - stride) // 2)) - - self.resblocks = torch.nn.ModuleList([ResBlock(channels[i], k, d) for i in range(len(self.ups)) for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)]) - self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, 
bias=False) - - self.ups.apply(init_weights) - if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, f0, g = None): - har_source = self.m_source(f0, self.upp).transpose(1, 2) - x = self.conv_pre(x) - if g is not None: x += self.cond(g) - - for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): - x = F.leaky_relu(x, LRELU_SLOPE) - - if self.training and self.checkpointing: - x = checkpoint(ups, x, use_reentrant=False) + noise_convs(har_source) - xs = sum([checkpoint(resblock, x, use_reentrant=False) for j, resblock in enumerate(self.resblocks) if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)]) - else: - x = ups(x) + noise_convs(har_source) - xs = sum([resblock(x) for j, resblock in enumerate(self.resblocks) if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)]) - - x = xs / self.num_kernels - - return self.conv_post(F.leaky_relu(x)).tanh() - - def remove_weight_norm(self): - for l in self.ups: - if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) - else: remove_weight_norm(l) - - for l in self.resblocks: - l.remove_weight_norm() \ No newline at end of file diff --git a/main/library/generators/refinegan.py b/main/library/generators/refinegan.py deleted file mode 100644 index f52357d41e33732d877a96d491a80d9831527f40..0000000000000000000000000000000000000000 --- a/main/library/generators/refinegan.py +++ /dev/null @@ -1,192 +0,0 @@ -import os -import sys -import torch -import torchaudio - -import numpy as np -import torch.nn as nn -import torch.nn.functional as F -import torch.nn.utils.parametrize as parametrize - -from torch.utils.checkpoint import checkpoint -from torch.nn.utils import remove_weight_norm -from torch.nn.utils.parametrizations import weight_norm - -sys.path.append(os.getcwd()) - -from main.library.algorithm.commons import init_weights, get_padding - -class 
ResBlock(nn.Module): - def __init__(self, channels, kernel_size = 7, dilation = (1, 3, 5), leaky_relu_slope = 0.2): - super().__init__() - self.leaky_relu_slope = leaky_relu_slope - self.convs1 = nn.ModuleList([weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=d, padding=get_padding(kernel_size, d))) for d in dilation]) - self.convs1.apply(init_weights) - self.convs2 = nn.ModuleList([weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=1, padding=get_padding(kernel_size, 1))) for _ in dilation]) - self.convs2.apply(init_weights) - - def forward(self, x): - for c1, c2 in zip(self.convs1, self.convs2): - x = c2(F.leaky_relu(c1(F.leaky_relu(x, self.leaky_relu_slope)), self.leaky_relu_slope)) + x - - return x - - def remove_weight_norm(self): - for c1, c2 in zip(self.convs1, self.convs2): - if hasattr(c1, "parametrizations") and "weight" in c1.parametrizations: parametrize.remove_parametrizations(c1, "weight", leave_parametrized=True) - else: remove_weight_norm(c1) - - if hasattr(c2, "parametrizations") and "weight" in c2.parametrizations: parametrize.remove_parametrizations(c2, "weight", leave_parametrized=True) - else: remove_weight_norm(c2) - -class AdaIN(nn.Module): - def __init__(self, *, channels, leaky_relu_slope = 0.2): - super().__init__() - self.weight = nn.Parameter(torch.ones(channels) * 1e-4) - self.activation = nn.LeakyReLU(leaky_relu_slope) - - def forward(self, x): - return self.activation(x + (torch.randn_like(x) * self.weight[None, :, None])) - -class ParallelResBlock(nn.Module): - def __init__(self, *, in_channels, out_channels, kernel_sizes = (3, 7, 11), dilation = (1, 3, 5), leaky_relu_slope = 0.2): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.input_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=7, stride=1, padding=3) - self.input_conv.apply(init_weights) - self.blocks = 
nn.ModuleList([nn.Sequential(AdaIN(channels=out_channels), ResBlock(out_channels, kernel_size=kernel_size, dilation=dilation, leaky_relu_slope=leaky_relu_slope), AdaIN(channels=out_channels)) for kernel_size in kernel_sizes]) - - def forward(self, x): - x = self.input_conv(x) - return torch.stack([block(x) for block in self.blocks], dim=0).mean(dim=0) - - def remove_weight_norm(self): - remove_weight_norm(self.input_conv) - for block in self.blocks: - block[1].remove_weight_norm() - -class SineGenerator(nn.Module): - def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0): - super(SineGenerator, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - self.merge = nn.Sequential(nn.Linear(self.dim, 1, bias=False), nn.Tanh()) - - def _f02uv(self, f0): - uv = torch.ones_like(f0) * (f0 > self.voiced_threshold) - if uv.device.type == "privateuseone": uv = uv.float() - - return uv - - def _f02sine(self, f0_values): - rad_values = (f0_values / self.sampling_rate) % 1 - rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], dtype=f0_values.dtype, device=f0_values.device) - - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - - tmp_over_one = torch.cumsum(rad_values, 1) % 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - - return (torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi).sin() - - def forward(self, f0): - with torch.no_grad(): - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, dtype=f0.dtype, device=f0.device) - f0_buf[:, :, 0] = f0[:, :, 0] - - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) - - sine_waves = self._f02sine(f0_buf) * 
self.sine_amp - uv = self._f02uv(f0) - sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves)) - - return self.merge(sine_waves) - -class RefineGANGenerator(nn.Module): - def __init__(self, *, sample_rate = 44100, upsample_rates = (8, 8, 2, 2), leaky_relu_slope = 0.2, num_mels = 128, start_channels = 16, gin_channels = 256, checkpointing = False, upsample_initial_channel = 512): - super().__init__() - self.upsample_rates = upsample_rates - self.checkpointing = checkpointing - self.leaky_relu_slope = leaky_relu_slope - self.upp = np.prod(upsample_rates) - self.m_source = SineGenerator(sample_rate) - self.pre_conv = weight_norm(nn.Conv1d(1, 16, 7, 1, padding=3)) - channels = start_channels - size = self.upp - self.downsample_blocks = nn.ModuleList([]) - self.df0 = [] - - for i, _ in enumerate(upsample_rates): - new_size = int(size / upsample_rates[-i - 1]) - self.df0.append([size, new_size]) - size = new_size - - new_channels = channels * 2 - self.downsample_blocks.append(weight_norm(nn.Conv1d(channels, new_channels, 7, 1, padding=3))) - channels = new_channels - - channels = upsample_initial_channel - self.mel_conv = weight_norm(nn.Conv1d(num_mels, channels // 2, 7, 1, padding=3)) - self.mel_conv.apply(init_weights) - - if gin_channels != 0: self.cond = nn.Conv1d(256, channels // 2, 1) - - self.upsample_blocks = nn.ModuleList([]) - self.upsample_conv_blocks = nn.ModuleList([]) - - for rate in upsample_rates: - new_channels = channels // 2 - self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear")) - self.upsample_conv_blocks.append(ParallelResBlock(in_channels=channels + channels // 4, out_channels=new_channels, kernel_sizes=(3, 7, 11), dilation=(1, 3, 5), leaky_relu_slope=leaky_relu_slope)) - channels = new_channels - - self.conv_post = weight_norm(nn.Conv1d(channels, 1, 7, 1, padding=3, bias=False)) - self.conv_post.apply(init_weights) - - def forward(self, mel, f0, g = None): - f0_size 
= mel.shape[-1] - har_source = self.m_source(F.interpolate(f0.unsqueeze(1), size=f0_size * self.upp, mode="linear").transpose(1, 2)).transpose(1, 2) - x = self.pre_conv(har_source) - downs = [] - - for block, (old_size, new_size) in zip(self.downsample_blocks, self.df0): - x = F.leaky_relu(x, self.leaky_relu_slope) - downs.append(x) - x = torchaudio.functional.resample(x.contiguous(), orig_freq=int(f0_size * old_size), new_freq=int(f0_size * new_size), lowpass_filter_width=64, rolloff=0.9475937167399596, resampling_method="sinc_interp_kaiser", beta=14.769656459379492) - x = block(x) - - mel = self.mel_conv(mel) - if g is not None: mel += self.cond(g) - - x = torch.cat([mel, x], dim=1) - for ups, res, down in zip(self.upsample_blocks, self.upsample_conv_blocks, reversed(downs)): - x = F.leaky_relu(x, self.leaky_relu_slope) - x = checkpoint(res, torch.cat([checkpoint(ups, x, use_reentrant=False), down], dim=1), use_reentrant=False) if self.training and self.checkpointing else res(torch.cat([ups(x), down], dim=1)) - - return self.conv_post(F.leaky_relu(x, self.leaky_relu_slope)).tanh() - - def remove_weight_norm(self): - if hasattr(self.pre_conv, "parametrizations") and "weight" in self.pre_conv.parametrizations: parametrize.remove_parametrizations(self.pre_conv, "weight", leave_parametrized=True) - else: remove_weight_norm(self.pre_conv) - - if hasattr(self.mel_conv, "parametrizations") and "weight" in self.mel_conv.parametrizations: parametrize.remove_parametrizations(self.mel_conv, "weight", leave_parametrized=True) - else: remove_weight_norm(self.mel_conv) - - if hasattr(self.conv_post, "parametrizations") and "weight" in self.conv_post.parametrizations: parametrize.remove_parametrizations(self.conv_post, "weight", leave_parametrized=True) - else: remove_weight_norm(self.conv_post) - - for block in self.downsample_blocks: - block.remove_weight_norm() - - for block in self.upsample_conv_blocks: - block.remove_weight_norm() \ No newline at end of file diff --git 
FEATS_LENGTH = 200

def onnx_exporter(input_path, output_path, is_half=False, device="cpu"):
    """Export a voice-conversion checkpoint to ONNX and embed its metadata.

    Args:
        input_path: Path to the checkpoint; read with ``torch.load(..., weights_only=True)``.
        output_path: Destination of the ``.onnx`` file.
        is_half: Cast the network to float16 and post-convert the graph to float16.
        device: Any value not starting with ``"cuda"`` falls back to ``"cpu"``.

    Returns:
        ``output_path`` on success. ``None`` if the checkpoint file is missing
        or the export fails; the reason is logged.
    """
    import subprocess
    import traceback

    if not device.startswith("cuda"): device = "cpu"

    # Previously a missing file produced cpt = None and an opaque TypeError below.
    if not os.path.isfile(input_path):
        logger.error(f"Checkpoint file not found: {input_path}")
        return None

    cpt = torch.load(input_path, map_location="cpu", weights_only=True)
    # Speaker count is taken from the embedding table rather than the stored config.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]

    model_name = cpt.get("model_name", None)
    model_author = cpt.get("author", None)
    epochs = cpt.get("epoch", None)
    steps = cpt.get("step", None)
    version = cpt.get("version", "v1")
    f0 = cpt.get("f0", 1)
    model_hash = cpt.get("model_hash", None)
    vocoder = cpt.get("vocoder", "Default")
    creation_date = cpt.get("creation_date", None)
    energy_use = cpt.get("energy", False)

    text_enc_hidden_dim = 768 if version == "v2" else 256
    tgt_sr = cpt["config"][-1]

    net_g = SynthesizerONNX(*cpt["config"], use_f0=f0, text_enc_hidden_dim=text_enc_hidden_dim, vocoder=vocoder, checkpointing=False, energy=energy_use)
    net_g.load_state_dict(cpt["weight"], strict=False)
    net_g.eval().to(device).to(torch.float16 if is_half else torch.float32)
    net_g.remove_weight_norm()

    # Dummy inputs for tracing; the time axis is declared dynamic below.
    phone = torch.rand(1, FEATS_LENGTH, text_enc_hidden_dim).to(device)
    phone_length = torch.tensor([FEATS_LENGTH]).long().to(device)
    ds = torch.LongTensor([0]).to(device)
    rnd = torch.rand(1, 192, FEATS_LENGTH).to(device)

    args = [phone, phone_length, ds, rnd]
    input_names = ["phone", "phone_lengths", "ds", "rnd"]
    dynamic_axes = {"phone": [1], "rnd": [2]}

    if f0:
        pitch = torch.randint(size=(1, FEATS_LENGTH), low=5, high=255).to(device)
        pitchf = torch.rand(1, FEATS_LENGTH).to(device)

        args += [pitch, pitchf]
        input_names += ["pitch", "pitchf"]
        dynamic_axes.update({"pitch": [1], "pitchf": [1]})

    if energy_use:
        energy = torch.rand(1, FEATS_LENGTH).to(device)
        args.append(energy)

        input_names.append("energy")
        dynamic_axes.update({"energy": [1]})

    try:
        # Export into memory first so the graph can be slimmed before it is written.
        with io.BytesIO() as buffer:
            torch.onnx.export(
                net_g,
                tuple(args),
                buffer,
                do_constant_folding=True,
                opset_version=17,
                verbose=False,
                input_names=input_names,
                output_names=["audio"],
                dynamic_axes=dynamic_axes
            )
            exported = buffer.getvalue()

        model = onnxslim.slim(onnx.load_model_from_string(exported))
        model.metadata_props.append(
            onnx.StringStringEntryProto(
                key="model_info",
                value=json.dumps(
                    {
                        "model_name": model_name,
                        "author": model_author,
                        "epoch": epochs,
                        "step": steps,
                        "version": version,
                        "sr": tgt_sr,
                        "f0": f0,
                        "model_hash": model_hash,
                        "creation_date": creation_date,
                        "vocoder": vocoder,
                        "text_enc_hidden_dim": text_enc_hidden_dim,
                        "energy": energy_use
                    }
                )
            )
        )

        if is_half:
            try:
                import onnxconverter_common
            except ImportError:
                # Argument list instead of a shell string: safe for interpreter paths with spaces.
                subprocess.run([sys.executable, "-m", "pip", "install", "onnxconverter_common"], check=False)
                import onnxconverter_common

            model = onnxconverter_common.convert_float_to_float16(model, keep_io_types=True)

        onnx.save(model, output_path)
        return output_path
    except Exception:
        # Best-effort contract: log the failure and signal it with None.
        logger.error(traceback.format_exc())

    return None
class ONNXRVC:
    """Wrapper around an exported ONNX voice-conversion model.

    Reads the ``model_info`` JSON metadata entry (if present) into
    ``self.cpt`` and runs the model through an onnxruntime session.
    """

    # Default target for infer() results when .to() was never called.
    device = "cpu"

    def __init__(self, model_path, providers, log_severity_level = 3):
        sess_options = onnxruntime.SessionOptions()
        sess_options.log_severity_level = log_severity_level

        # Stay with an empty dict when the model has no "model_info" entry,
        # so the .get() defaults below apply instead of failing on None.
        metadata_dict = {}
        for prop in onnx.load(model_path).metadata_props:
            if prop.key == "model_info":
                metadata_dict = json.loads(prop.value)
                break

        self.cpt = {}
        self.cpt["tgt_sr"] = metadata_dict.get("sr", 32000)
        self.cpt["use_f0"] = metadata_dict.get("f0", 1)
        self.cpt["version"] = metadata_dict.get("version", "v1")
        self.cpt["energy"] = metadata_dict.get("energy", False)
        self.net_g = onnxruntime.InferenceSession(
            model_path,
            sess_options=sess_options,
            providers=providers
        )

    def get_onnx_argument(self, feats, p_len, sid, pitch, pitchf, energy):
        """Build the session feed dict, keyed by the model's input names in order.

        Inputs 0-3 are features, length, speaker id and fresh Gaussian noise;
        pitch/pitchf follow when ``use_f0`` is set, then energy when
        ``energy`` is set.
        """
        names = [node.name for node in self.net_g.get_inputs()]

        inputs = {
            names[0]: feats.cpu().numpy().astype(np.float32),
            names[1]: p_len.cpu().numpy(),
            names[2]: np.array([sid.cpu().item()], dtype=np.int64),
            names[3]: np.random.randn(1, 192, int(p_len)).astype(np.float32)
        }

        optional = []
        if self.cpt["use_f0"]:
            optional.append(pitch.cpu().numpy().astype(np.int64))
            optional.append(pitchf.cpu().numpy().astype(np.float32))
        if self.cpt["energy"]:
            optional.append(energy.cpu().numpy().astype(np.float32))

        for offset, value in enumerate(optional):
            inputs[names[4 + offset]] = value

        return inputs

    def to(self, device = "cpu"):
        """Record the device that ``infer`` results are moved to; returns ``self``."""
        self.device = device
        return self

    def infer(self, feats = None, p_len = None, pitch = None, pitchf = None, sid = None, energy = None):
        """Run the model and return its first output with a leading singleton axis."""
        output = self.net_g.run(
            [self.net_g.get_outputs()[0].name],
            self.get_onnx_argument(feats, p_len, sid, pitch, pitchf, energy)
        )

        # Stack into one ndarray first; the resulting shape is unchanged.
        return torch.as_tensor(np.asarray(output), device=self.device)
CENTS_PER_BIN, PITCH_BINS, SAMPLE_RATE, WINDOW_SIZE = 20, 360, 16000, 1024

class CREPE:
    """CREPE pitch estimator backed by a PyTorch model or an onnxruntime session."""

    def __init__(self, model_path, model_size="full", hop_length=512, batch_size=None, f0_min=50, f0_max=1100, device=None, sample_rate=16000, providers=None, onnx=False, return_periodicity=False):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.hop_length = hop_length
        self.batch_size = batch_size
        self.sample_rate = sample_rate
        self.onnx = onnx
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.return_periodicity = return_periodicity

        if self.onnx:
            import onnxruntime as ort

            sess_options = ort.SessionOptions()
            sess_options.log_severity_level = 3
            self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers)
        else:
            model = MODEL(model_size)
            ckpt = torch.load(model_path, map_location="cpu", weights_only=True)
            model.load_state_dict(ckpt)
            model.eval()
            # Use the resolved device: with device=None the raw argument would
            # leave the model on CPU while the frames are sent to self.device.
            self.model = model.to(self.device)

    def bins_to_frequency(self, bins):
        """Convert pitch-bin indices to Hz, dithered by a triangular random offset in cents."""
        if str(bins.device).startswith(("ocl", "privateuseone")): bins = bins.to(torch.float32)

        cents = CENTS_PER_BIN * bins + 1997.3794084376191
        dither = cents.new_tensor(scipy.stats.triang.rvs(c=0.5, loc=-CENTS_PER_BIN, scale=2 * CENTS_PER_BIN, size=cents.size()))
        return 10 * 2 ** ((cents + dither) / 1200)

    def frequency_to_bins(self, frequency, quantize_fn=torch.floor):
        """Convert a frequency tensor in Hz to integer bin indices, rounded by ``quantize_fn``."""
        return quantize_fn(((1200 * (frequency / 10).log2()) - 1997.3794084376191) / CENTS_PER_BIN).int()

    def viterbi(self, logits):
        """Decode the most likely bin path per sequence; returns ``(bins, frequencies)``."""
        if not hasattr(self, 'transition'):
            # Built once: transition weight falls off linearly and is 0 beyond 12 bins.
            xx, yy = np.meshgrid(range(PITCH_BINS), range(PITCH_BINS))
            transition = np.maximum(12 - abs(xx - yy), 0)
            self.transition = transition / transition.sum(axis=1, keepdims=True)

        with torch.no_grad():
            probs = torch.nn.functional.softmax(logits, dim=1)

        bins = torch.tensor(np.array([librosa.sequence.viterbi(sequence, self.transition).astype(np.int64) for sequence in probs.cpu().numpy()]), device=probs.device)
        return bins, self.bins_to_frequency(bins)

    def preprocess(self, audio, pad=True):
        """Yield batches of normalised frames of ``WINDOW_SIZE`` samples.

        ``audio`` is indexed as (batch, samples); the resampling branch
        squeezes dim 0, so it presumably expects batch == 1 - TODO confirm.
        """
        hop_length = (self.sample_rate // 100) if self.hop_length is None else self.hop_length

        if self.sample_rate != SAMPLE_RATE:
            audio = torch.tensor(librosa.resample(audio.detach().cpu().numpy().squeeze(0), orig_sr=self.sample_rate, target_sr=SAMPLE_RATE, res_type="soxr_vhq"), device=audio.device).unsqueeze(0)
            hop_length = int(hop_length * SAMPLE_RATE / self.sample_rate)

        if pad:
            total_frames = 1 + int(audio.size(1) // hop_length)
            audio = torch.nn.functional.pad(audio, (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
        else: total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length)

        batch_size = total_frames if self.batch_size is None else self.batch_size
        # str() so torch.device objects are accepted as well as plain strings.
        needs_contiguous = str(self.device).startswith(("ocl", "privateuseone"))

        for i in range(0, total_frames, batch_size):
            start = max(0, i * hop_length)
            end = min(audio.size(1), (i + batch_size - 1) * hop_length + WINDOW_SIZE)
            frames = torch.nn.functional.unfold(audio[:, None, None, start:end], kernel_size=(1, WINDOW_SIZE), stride=(1, hop_length))

            frames = frames.transpose(1, 2)
            if needs_contiguous: frames = frames.contiguous()
            frames = frames.reshape(-1, WINDOW_SIZE).to(self.device)

            # Zero-mean, unit-variance per frame; the std is floored at 1e-10.
            frames -= frames.mean(dim=1, keepdim=True)
            frames /= torch.tensor(1e-10, device=frames.device).max(frames.std(dim=1, keepdim=True))

            yield frames

    def periodicity(self, probabilities, bins):
        """Return, for every (batch, frame), the probability stored at the decoded bin."""
        probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)
        periodicity = probs_stacked.gather(1, bins.reshape(-1, 1).to(torch.int64))

        return periodicity.reshape(probabilities.size(0), probabilities.size(2))

    def postprocess(self, probabilities):
        """Mask bins outside [f0_min, f0_max], decode pitch, optionally add periodicity."""
        probabilities = probabilities.detach()
        lowest = int(self.frequency_to_bins(torch.tensor(self.f0_min)))
        highest = int(self.frequency_to_bins(torch.tensor(self.f0_max), torch.ceil))
        probabilities[:, :lowest] = -float('inf')
        probabilities[:, highest:] = -float('inf')

        bins, pitch = self.viterbi(probabilities)

        if not self.return_periodicity: return pitch
        return pitch, self.periodicity(probabilities, bins)

    def compute_f0(self, audio, pad=True):
        """Estimate pitch for ``audio``.

        Returns the pitch tensor, or ``(pitch, periodicity)`` when
        ``return_periodicity`` is set; batches are concatenated along dim 1.
        """
        results = []

        for frames in self.preprocess(audio, pad):
            if self.onnx:
                raw = self.model.run(
                    [self.model.get_outputs()[0].name],
                    {self.model.get_inputs()[0].name: frames.cpu().numpy()}
                )[0]
                probabilities = torch.tensor(raw.transpose(1, 0)[None], device=self.device)
            else:
                with torch.no_grad():
                    probabilities = self.model(
                        frames,
                        embed=False
                    ).reshape(audio.size(0), -1, PITCH_BINS).transpose(1, 2)

            result = self.postprocess(probabilities)
            results.append((result[0].to(audio.device), result[1].to(audio.device)) if isinstance(result, tuple) else result.to(audio.device))

        if self.return_periodicity:
            pitch, periodicity = zip(*results)
            return torch.cat(pitch, 1), torch.cat(periodicity, 1)

        return torch.cat(results, 1)
def mean(signals, win_length=9):
    """NaN-aware moving average over the last axis of a 2-D tensor.

    NaN samples are left out of both the sum and the count. Any window whose
    average is exactly 0 is reported as NaN.
    """
    assert signals.dim() == 2

    conv1d = torch.nn.functional.conv1d
    batch = signals.unsqueeze(1)
    valid = ~torch.isnan(batch)
    half = win_length // 2

    box = torch.ones(batch.size(1), 1, win_length, device=batch.device)
    totals = conv1d(torch.where(valid, batch, torch.zeros_like(batch)), box, stride=1, padding=half)
    # Clamp so windows without any valid sample do not divide by zero.
    counts = conv1d(valid.float(), box, stride=1, padding=half).clamp(min=1)

    averaged = totals / counts
    averaged[averaged == 0] = float("nan")

    return averaged.squeeze(1)

def median(signals, win_length):
    """NaN-aware moving median over the last axis of a 2-D tensor.

    Values are reflect-padded and the validity mask is zero-padded, so padded
    samples never count. Windows without any valid sample yield NaN.
    """
    assert signals.dim() == 2

    pad = torch.nn.functional.pad
    batch = signals.unsqueeze(1)
    valid = ~torch.isnan(batch)
    half = win_length // 2

    values = pad(torch.where(valid, batch, torch.zeros_like(batch)), (half, half), mode="reflect")
    valid = pad(valid.float(), (half, half), mode="constant", value=0)

    # One sliding window per output position, laid out along a new last axis.
    values = values.unfold(2, win_length, 1)
    valid = valid.unfold(2, win_length, 1)

    values = values.contiguous().view(values.size()[:3] + (-1,))
    valid = valid.contiguous().view(valid.size()[:3] + (-1,))

    # Invalid entries become +inf so they sort behind every real value.
    masked = torch.where(valid.bool(), values.float(), float("inf")).to(values)
    ordered, _ = masked.sort(dim=-1)

    # Index of the (lower) median among the valid entries of each window.
    middle = ((valid.sum(dim=-1) - 1) // 2).clamp(min=0).unsqueeze(-1).long()
    pooled = ordered.gather(-1, middle).squeeze(-1)
    pooled[torch.isinf(pooled)] = float("nan")

    return pooled.squeeze(1)
class MODEL(torch.nn.Module):
    """CREPE network: six conv/batch-norm/max-pool stages plus a linear classifier."""

    def __init__(self, model='full'):
        super().__init__()
        # Every capacity uses the same layout scaled by the first layer's width.
        width = {"full": 1024, "large": 768, "medium": 512, "small": 256, "tiny": 128}[model]
        out_channels = [width, width // 8, width // 8, width // 8, width // 4, width // 2]
        in_channels = [1] + out_channels[:-1]
        self.in_features = 2 * width

        make_bn = functools.partial(torch.nn.BatchNorm2d, eps=0.0010000000474974513, momentum=0.0)

        # Registers conv1..conv6 and conv1_BN..conv6_BN in that interleaved order.
        for index, (c_in, c_out) in enumerate(zip(in_channels, out_channels), start=1):
            first = index == 1
            conv = torch.nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=(512, 1) if first else (64, 1), stride=(4, 1) if first else (1, 1))
            setattr(self, f"conv{index}", conv)
            setattr(self, f"conv{index}_BN", make_bn(num_features=c_out))

        self.classifier = torch.nn.Linear(in_features=self.in_features, out_features=PITCH_BINS)

    def forward(self, x, embed=False):
        """Return sigmoid activations per pitch bin, or the stage-5 features if ``embed``."""
        x = self.embed(x)
        if embed: return x

        x = self.layer(x, self.conv6, self.conv6_BN)
        x = x.permute(0, 2, 1, 3).reshape(-1, self.in_features)
        return self.classifier(x).sigmoid()

    def embed(self, x):
        """Run the first five stages on ``x``, which is indexed as (batch, samples)."""
        # Add channel and width axes, then apply stage 1 with its own padding.
        x = self.layer(x[:, None, :, None], self.conv1, self.conv1_BN, (0, 0, 254, 254))

        for index in range(2, 6):
            x = self.layer(x, getattr(self, f"conv{index}"), getattr(self, f"conv{index}_BN"))

        return x

    def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)):
        """One stage: pad -> conv -> ReLU -> batch norm -> (2, 1) max-pool."""
        activated = torch.nn.functional.relu(conv(torch.nn.functional.pad(x, padding)))
        return torch.nn.functional.max_pool2d(batch_norm(activated), (2, 1), (2, 1))
torch.as_tensor( - self.model.run([self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: spec.cpu().numpy().astype(np.float32)})[0], device=self.device - ) - else: - hidden = self.model( - spec.half() if self.is_half else spec.float() - ) - - return hidden - - def infer_from_audio(self, audio, thred=0.03): - if torch.is_tensor(audio): audio = audio.cpu().numpy() - if audio.ndim > 1: audio = audio.squeeze() - - with torch.no_grad(): - padded_audio = self.pad_audio(audio) - hidden = self.inference(padded_audio)[:(audio.shape[-1] // int(SAMPLE_RATE // 100) + 1)] - - f0 = self.decode(hidden.squeeze(0).cpu().numpy(), thred) - if self.kernel_size is not None: f0 = medfilt(f0, kernel_size=self.kernel_size) - - return f0 - - def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100): - f0 = self.infer_from_audio(audio, thred) - f0[(f0 < f0_min) | (f0 > f0_max)] = 0 - - return f0 - - def to_local_average_cents(self, salience, thred=0.05): - center = np.argmax(salience, axis=1) - salience = np.pad(salience, ((0, 0), (4, 4))) - center += 4 - todo_salience, todo_cents_mapping = [], [] - starts = center - 4 - ends = center + 5 - - for idx in range(salience.shape[0]): - todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) - todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) - - todo_salience = np.array(todo_salience) - devided = np.sum(todo_salience * np.array(todo_cents_mapping), 1) / np.sum(todo_salience, 1) - devided[np.max(salience, axis=1) <= thred] = 0 - - return devided - - def decode(self, hidden, thred=0.03): - f0 = 10 * (2 ** (self.to_local_average_cents(hidden, thred=thred) / 1200)) - f0[f0 == 10] = 0 - return f0 - - def pad_audio(self, audio): - audio_len = audio.shape[-1] - - seg_nums = int(np.ceil(audio_len / self.seg_len)) + 1 - pad_len = int(seg_nums * self.seg_len - audio_len + self.seg_len // 2) - - left_pad = np.zeros(int(self.seg_len // 4), dtype=np.float32) - right_pad = np.zeros(int(pad_len 
- self.seg_len // 4), dtype=np.float32) - padded_audio = np.concatenate([left_pad, audio, right_pad], axis=-1) - - segments = [padded_audio[start: start + int(self.seg_len)] for start in range(0, len(padded_audio) - int(self.seg_len) + 1, int(self.seg_len // 2))] - segments = np.stack(segments, axis=0) - segments = torch.from_numpy(segments).unsqueeze(1).to(self.device) - - return segments - - def inference(self, segments): - hidden_segments = torch.cat([ - self.spec2hidden(self.spec_extractor(segments[i:i + self.batch_size].float())) - for i in range(0, len(segments), self.batch_size) - ], dim=0) - - hidden = torch.cat([ - seg[self.seg_frames // 4: int(self.seg_frames * 0.75)] - for seg in hidden_segments - ], dim=0) - - return hidden \ No newline at end of file diff --git a/main/library/predictors/DJCM/decoder.py b/main/library/predictors/DJCM/decoder.py deleted file mode 100644 index bf1003d14a910b8730d59e29a42b55e80ef32262..0000000000000000000000000000000000000000 --- a/main/library/predictors/DJCM/decoder.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import sys -import torch - -import torch.nn as nn -import torch.nn.functional as F - -sys.path.append(os.getcwd()) - -from main.library.predictors.DJCM.encoder import ResEncoderBlock -from main.library.predictors.DJCM.utils import ResConvBlock, BiGRU, init_bn, init_layer, N_CLASS, WINDOW_LENGTH - -class ResDecoderBlock(nn.Module): - def __init__(self, in_channels, out_channels, n_blocks, stride): - super(ResDecoderBlock, self).__init__() - self.conv1 = nn.ConvTranspose2d(in_channels, out_channels, stride, stride, (0, 0), bias=False) - self.bn1 = nn.BatchNorm2d(in_channels, momentum=0.01) - self.conv = nn.ModuleList([ResConvBlock(out_channels * 2, out_channels)]) - - for _ in range(n_blocks - 1): - self.conv.append(ResConvBlock(out_channels, out_channels)) - - self.init_weights() - - def init_weights(self): - init_bn(self.bn1) - init_layer(self.conv1) - - def forward(self, x, concat): - x = 
self.conv1(F.relu_(self.bn1(x))) - x = torch.cat((x, concat), dim=1) - - for each_layer in self.conv: - x = each_layer(x) - - return x - -class Decoder(nn.Module): - def __init__(self, n_blocks): - super(Decoder, self).__init__() - self.de_blocks = nn.ModuleList([ - ResDecoderBlock(384, 384, n_blocks, (1, 2)), - ResDecoderBlock(384, 384, n_blocks, (1, 2)), - ResDecoderBlock(384, 256, n_blocks, (1, 2)), - ResDecoderBlock(256, 128, n_blocks, (1, 2)), - ResDecoderBlock(128, 64, n_blocks, (1, 2)), - ResDecoderBlock(64, 32, n_blocks, (1, 2)) - ]) - - def forward(self, x, concat_tensors): - for i, layer in enumerate(self.de_blocks): - x = layer(x, concat_tensors[-1 - i]) - - return x - -class PE_Decoder(nn.Module): - def __init__(self, n_blocks, seq_layers=1): - super(PE_Decoder, self).__init__() - self.de_blocks = Decoder(n_blocks) - self.after_conv1 = ResEncoderBlock(32, 32, n_blocks, None) - self.after_conv2 = nn.Conv2d(32, 1, (1, 1)) - self.fc = nn.Sequential(BiGRU((1, WINDOW_LENGTH // 2), 1, seq_layers), nn.Linear(WINDOW_LENGTH // 2, N_CLASS), nn.Sigmoid()) - init_layer(self.after_conv2) - - def forward(self, x, concat_tensors): - return self.fc(self.after_conv2(self.after_conv1(self.de_blocks(x, concat_tensors)))).squeeze(1) \ No newline at end of file diff --git a/main/library/predictors/DJCM/encoder.py b/main/library/predictors/DJCM/encoder.py deleted file mode 100644 index b77e4536a60040f976e87d323def66aec83df787..0000000000000000000000000000000000000000 --- a/main/library/predictors/DJCM/encoder.py +++ /dev/null @@ -1,45 +0,0 @@ -import os -import sys - -import torch.nn as nn - -sys.path.append(os.getcwd()) - -from main.library.predictors.DJCM.utils import ResConvBlock - -class ResEncoderBlock(nn.Module): - def __init__(self, in_channels, out_channels, n_blocks, kernel_size): - super(ResEncoderBlock, self).__init__() - self.conv = nn.ModuleList([ResConvBlock(in_channels, out_channels)]) - for _ in range(n_blocks - 1): - 
self.conv.append(ResConvBlock(out_channels, out_channels)) - - self.pool = nn.MaxPool2d(kernel_size) if kernel_size is not None else None - - def forward(self, x): - for each_layer in self.conv: - x = each_layer(x) - - if self.pool is not None: return x, self.pool(x) - return x - -class Encoder(nn.Module): - def __init__(self, in_channels, n_blocks): - super(Encoder, self).__init__() - self.en_blocks = nn.ModuleList([ - ResEncoderBlock(in_channels, 32, n_blocks, (1, 2)), - ResEncoderBlock(32, 64, n_blocks, (1, 2)), - ResEncoderBlock(64, 128, n_blocks, (1, 2)), - ResEncoderBlock(128, 256, n_blocks, (1, 2)), - ResEncoderBlock(256, 384, n_blocks, (1, 2)), - ResEncoderBlock(384, 384, n_blocks, (1, 2)) - ]) - - def forward(self, x): - concat_tensors = [] - - for layer in self.en_blocks: - _, x = layer(x) - concat_tensors.append(_) - - return x, concat_tensors \ No newline at end of file diff --git a/main/library/predictors/DJCM/model.py b/main/library/predictors/DJCM/model.py deleted file mode 100644 index 47e5571efac92a612e6fbe64181cef344e8f9fd1..0000000000000000000000000000000000000000 --- a/main/library/predictors/DJCM/model.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import sys - -import torch.nn as nn - -sys.path.append(os.getcwd()) - -from main.library.predictors.DJCM.decoder import PE_Decoder -from main.library.predictors.DJCM.utils import init_bn, WINDOW_LENGTH -from main.library.predictors.DJCM.encoder import ResEncoderBlock, Encoder - -class LatentBlocks(nn.Module): - def __init__(self, n_blocks, latent_layers): - super(LatentBlocks, self).__init__() - self.latent_blocks = nn.ModuleList([ - ResEncoderBlock(384, 384, n_blocks, None) - for _ in range(latent_layers) - ]) - - def forward(self, x): - for layer in self.latent_blocks: - x = layer(x) - - return x - -class DJCMM(nn.Module): - def __init__(self, in_channels, n_blocks, latent_layers): - super(DJCMM, self).__init__() - self.bn = nn.BatchNorm2d(WINDOW_LENGTH // 2 + 1, momentum=0.01) - self.pe_encoder = 
Encoder(in_channels, n_blocks) - self.pe_latent = LatentBlocks(n_blocks, latent_layers) - self.pe_decoder = PE_Decoder(n_blocks) - init_bn(self.bn) - - def forward(self, spec): - x = self.bn(spec.transpose(1, 3)).transpose(1, 3)[..., :-1] - x, concat_tensors = self.pe_encoder(x) - pe_out = self.pe_decoder(self.pe_latent(x), concat_tensors) - - return pe_out \ No newline at end of file diff --git a/main/library/predictors/DJCM/spec.py b/main/library/predictors/DJCM/spec.py deleted file mode 100644 index 207c7118c93829265830be9a596f6d1535dd76b7..0000000000000000000000000000000000000000 --- a/main/library/predictors/DJCM/spec.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -import sys -import torch - -import numpy as np -import torch.nn as nn - -sys.path.append(os.getcwd()) - -class Spectrogram(nn.Module): - def __init__(self, hop_length, win_length, n_fft=None, clamp=1e-10): - super(Spectrogram, self).__init__() - self.n_fft = win_length if n_fft is None else n_fft - self.hop_length = hop_length - self.win_length = win_length - self.clamp = clamp - self.register_buffer("window", torch.hann_window(win_length), persistent=False) - - def forward(self, audio, center=True): - bs, c, segment_samples = audio.shape - audio = audio.reshape(bs * c, segment_samples) - - if str(audio.device).startswith(("ocl", "privateuseone")): - if not hasattr(self, "stft"): - from main.library.backends.utils import STFT - self.stft = STFT(filter_length=self.n_fft, hop_length=self.hop_length, win_length=self.win_length).to(audio.device) - magnitude = self.stft.transform(audio, 1e-9) - else: - fft = torch.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, center=center, pad_mode="reflect", return_complex=True) - magnitude = (fft.real.pow(2) + fft.imag.pow(2)).sqrt() - - mag = magnitude.transpose(1, 2).clamp(self.clamp, np.inf) - mag = mag.reshape(bs, c, mag.shape[1], mag.shape[2]) - - return mag \ No newline at end of file diff --git 
a/main/library/predictors/DJCM/utils.py b/main/library/predictors/DJCM/utils.py deleted file mode 100644 index 2317e1f4eb01ea0524a99220458af53354d435d9..0000000000000000000000000000000000000000 --- a/main/library/predictors/DJCM/utils.py +++ /dev/null @@ -1,62 +0,0 @@ -import torch - -from torch import nn -from einops.layers.torch import Rearrange - -SAMPLE_RATE, WINDOW_LENGTH, N_CLASS = 16000, 1024, 360 - -def init_layer(layer): - nn.init.xavier_uniform_(layer.weight) - if hasattr(layer, "bias") and layer.bias is not None: layer.bias.data.fill_(0.0) - -def init_bn(bn): - bn.bias.data.fill_(0.0) - bn.weight.data.fill_(1.0) - bn.running_mean.data.fill_(0.0) - bn.running_var.data.fill_(1.0) - -class BiGRU(nn.Module): - def __init__(self, patch_size, channels, depth): - super(BiGRU, self).__init__() - patch_width, patch_height = patch_size - patch_dim = channels * patch_height * patch_width - self.to_patch_embedding = nn.Sequential(Rearrange('b c (w p1) (h p2) -> b (w h) (p1 p2 c)', p1=patch_width, p2=patch_height)) - self.gru = nn.GRU(patch_dim, patch_dim // 2, num_layers=depth, batch_first=True, bidirectional=True) - - def forward(self, x): - x = self.to_patch_embedding(x) - try: - return self.gru(x)[0] - except: - torch.backends.cudnn.enabled = False - return self.gru(x)[0] - -class ResConvBlock(nn.Module): - def __init__(self, in_planes, out_planes): - super(ResConvBlock, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes, momentum=0.01) - self.bn2 = nn.BatchNorm2d(out_planes, momentum=0.01) - self.act1 = nn.PReLU() - self.act2 = nn.PReLU() - self.conv1 = nn.Conv2d(in_planes, out_planes, (3, 3), padding=(1, 1), bias=False) - self.conv2 = nn.Conv2d(out_planes, out_planes, (3, 3), padding=(1, 1), bias=False) - self.is_shortcut = False - - if in_planes != out_planes: - self.shortcut = nn.Conv2d(in_planes, out_planes, (1, 1)) - self.is_shortcut = True - - self.init_weights() - - def init_weights(self): - init_bn(self.bn1) - init_bn(self.bn2) - 
init_layer(self.conv1) - init_layer(self.conv2) - if self.is_shortcut: init_layer(self.shortcut) - - def forward(self, x): - out = self.conv2(self.act2(self.bn2(self.conv1(self.act1(self.bn1(x)))))) - - if self.is_shortcut: return self.shortcut(x) + out - else: return out + x \ No newline at end of file diff --git a/main/library/predictors/FCPE/FCPE.py b/main/library/predictors/FCPE/FCPE.py deleted file mode 100644 index 36a2abcca0ba3c734f2c3a56d7e26a76804dfbd6..0000000000000000000000000000000000000000 --- a/main/library/predictors/FCPE/FCPE.py +++ /dev/null @@ -1,344 +0,0 @@ -import os -import sys -import torch - -import numpy as np -import torch.nn as nn -import onnxruntime as ort -import torch.nn.functional as F - -from einops import rearrange -from torch.nn.utils.parametrizations import weight_norm - -sys.path.append(os.getcwd()) -os.environ["LRU_CACHE_CAPACITY"] = "3" - -from main.library.predictors.FCPE.wav2mel import Wav2Mel -from main.library.predictors.FCPE.encoder import EncoderLayer, ConformerNaiveEncoder -from main.library.predictors.FCPE.utils import l2_regularization, batch_interp_with_replacement_detach, decrypt_model, DotDict - -@torch.no_grad() -def cent_to_f0(cent): - return 10 * 2 ** (cent / 1200) - -@torch.no_grad() -def f0_to_cent(f0): - return 1200 * (f0 / 10).log2() - -@torch.no_grad() -def latent2cents_decoder(cent_table, y, threshold = 0.05, mask = True): - if str(y.device).startswith("privateuseone"): - cent_table = cent_table.cpu() - y = y.cpu() - - B, N, _ = y.size() - ci = cent_table[None, None, :].expand(B, N, -1) - rtn = (ci * y).sum(dim=-1, keepdim=True) / y.sum(dim=-1, keepdim=True) - - if mask: - confident = y.max(dim=-1, keepdim=True)[0] - confident_mask = torch.ones_like(confident) - confident_mask[confident <= threshold] = float("-INF") - rtn = rtn * confident_mask - - return rtn - -@torch.no_grad() -def latent2cents_local_decoder(cent_table, out_dims, y, threshold = 0.05, mask = True): - if 
str(y.device).startswith("privateuseone"): - cent_table = cent_table.cpu() - y = y.cpu() - - B, N, _ = y.size() - ci = cent_table[None, None, :].expand(B, N, -1) - confident, max_index = y.max(dim=-1, keepdim=True) - - local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4) - local_argmax_index[local_argmax_index < 0] = 0 - local_argmax_index[local_argmax_index >= out_dims] = out_dims - 1 - - y_l = y.gather(-1, local_argmax_index) - rtn = (ci.gather(-1, local_argmax_index) * y_l).sum(dim=-1, keepdim=True) / y_l.sum(dim=-1, keepdim=True) - - if mask: - confident_mask = torch.ones_like(confident) - confident_mask[confident <= threshold] = float("-INF") - rtn = rtn * confident_mask - - return rtn - -def cents_decoder(cent_table, y, confidence, threshold = 0.05, mask=True): - if str(y.device).startswith("privateuseone"): - cent_table = cent_table.cpu() - y = y.cpu() - - B, N, _ = y.size() - rtn = (cent_table[None, None, :].expand(B, N, -1) * y).sum(dim=-1, keepdim=True) / y.sum(dim=-1, keepdim=True) - - if mask: - confident = y.max(dim=-1, keepdim=True)[0] - confident_mask = torch.ones_like(confident) - confident_mask[confident <= threshold] = float("-INF") - rtn = rtn * confident_mask - - return (rtn, confident) if confidence else rtn - -def cents_local_decoder(cent_table, y, n_out, confidence, threshold = 0.05, mask=True): - if str(y.device).startswith("privateuseone"): - cent_table = cent_table.cpu() - y = y.cpu() - - B, N, _ = y.size() - confident, max_index = y.max(dim=-1, keepdim=True) - local_argmax_index = (torch.arange(0, 9).to(max_index.device) + (max_index - 4)).clamp(0, n_out - 1) - y_l = y.gather(-1, local_argmax_index) - rtn = (cent_table[None, None, :].expand(B, N, -1).gather(-1, local_argmax_index) * y_l).sum(dim=-1, keepdim=True) / y_l.sum(dim=-1, keepdim=True) - - if mask: - confident_mask = torch.ones_like(confident) - confident_mask[confident <= threshold] = float("-INF") - rtn = rtn * confident_mask - - return (rtn, 
confident) if confidence else rtn - -class PCmer(nn.Module): - def __init__(self, num_layers, num_heads, dim_model, dim_keys, dim_values, residual_dropout, attention_dropout): - super().__init__() - self.num_layers = num_layers - self.num_heads = num_heads - self.dim_model = dim_model - self.dim_values = dim_values - self.dim_keys = dim_keys - self.residual_dropout = residual_dropout - self.attention_dropout = attention_dropout - self._layers = nn.ModuleList([EncoderLayer(self) for _ in range(num_layers)]) - - def forward(self, phone, mask=None): - for layer in self._layers: - phone = layer(phone, mask) - - return phone - -class CFNaiveMelPE(nn.Module): - def __init__(self, input_channels, out_dims, hidden_dims = 512, n_layers = 6, n_heads = 8, f0_max = 1975.5, f0_min = 32.70, use_fa_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0, use_harmonic_emb = False): - super().__init__() - self.input_channels = input_channels - self.out_dims = out_dims - self.hidden_dims = hidden_dims - self.n_layers = n_layers - self.n_heads = n_heads - self.f0_max = f0_max - self.f0_min = f0_min - self.use_fa_norm = use_fa_norm - self.residual_dropout = 0.1 - self.attention_dropout = 0.1 - self.harmonic_emb = nn.Embedding(9, hidden_dims) if use_harmonic_emb else None - self.input_stack = nn.Sequential(nn.Conv1d(input_channels, hidden_dims, 3, 1, 1), nn.GroupNorm(4, hidden_dims), nn.LeakyReLU(), nn.Conv1d(hidden_dims, hidden_dims, 3, 1, 1)) - self.net = ConformerNaiveEncoder(num_layers=n_layers, num_heads=n_heads, dim_model=hidden_dims, use_norm=use_fa_norm, conv_only=conv_only, conv_dropout=conv_dropout, atten_dropout=atten_dropout) - self.norm = nn.LayerNorm(hidden_dims) - self.output_proj = weight_norm(nn.Linear(hidden_dims, out_dims)) - self.cent_table_b = torch.linspace(f0_to_cent(torch.Tensor([f0_min]))[0], f0_to_cent(torch.Tensor([f0_max]))[0], out_dims).detach() - self.register_buffer("cent_table", self.cent_table_b) - self.gaussian_blurred_cent_mask_b = (1200 
* torch.Tensor([self.f0_max / 10.]).log2())[0].detach() - self.register_buffer("gaussian_blurred_cent_mask", self.gaussian_blurred_cent_mask_b) - - def forward(self, x, _h_emb=None): - x = self.input_stack(x.transpose(-1, -2)).transpose(-1, -2) - if self.harmonic_emb is not None: x = x + self.harmonic_emb(torch.LongTensor([0]).to(x.device)) if _h_emb is None else x + self.harmonic_emb(torch.LongTensor([int(_h_emb)]).to(x.device)) - return self.output_proj(self.norm(self.net(x))).sigmoid() - - @torch.no_grad() - def infer(self, mel, decoder = "local_argmax", threshold = 0.05): - latent = self.forward(mel) - return cent_to_f0(latent2cents_decoder(self.cent_table, latent, threshold=threshold) if decoder == "argmax" else latent2cents_local_decoder(self.cent_table, self.out_dims, latent, threshold=threshold)) - -class FCPE_LEGACY(nn.Module): - def __init__(self, input_channel=128, out_dims=360, n_layers=12, n_chans=512, loss_mse_scale=10, loss_l2_regularization=False, loss_l2_regularization_scale=1, loss_grad1_mse=False, loss_grad1_mse_scale=1, f0_max=1975.5, f0_min=32.70, confidence=False, threshold=0.05, use_input_conv=True): - super().__init__() - self.loss_mse_scale = loss_mse_scale - self.loss_l2_regularization = loss_l2_regularization - self.loss_l2_regularization_scale = loss_l2_regularization_scale - self.loss_grad1_mse = loss_grad1_mse - self.loss_grad1_mse_scale = loss_grad1_mse_scale - self.f0_max = f0_max - self.f0_min = f0_min - self.confidence = confidence - self.threshold = threshold - self.use_input_conv = use_input_conv - self.cent_table_b = torch.Tensor(np.linspace(f0_to_cent(torch.Tensor([f0_min]))[0], f0_to_cent(torch.Tensor([f0_max]))[0], out_dims)) - self.register_buffer("cent_table", self.cent_table_b) - self.stack = nn.Sequential(nn.Conv1d(input_channel, n_chans, 3, 1, 1), nn.GroupNorm(4, n_chans), nn.LeakyReLU(), nn.Conv1d(n_chans, n_chans, 3, 1, 1)) - self.decoder = PCmer(num_layers=n_layers, num_heads=8, dim_model=n_chans, dim_keys=n_chans, 
dim_values=n_chans, residual_dropout=0.1, attention_dropout=0.1) - self.norm = nn.LayerNorm(n_chans) - self.n_out = out_dims - self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out)) - - def forward(self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax", output_interp_target_length=None): - x = self.dense_out(self.norm(self.decoder((self.stack(mel.transpose(1, 2)).transpose(1, 2) if self.use_input_conv else mel)))).sigmoid() - - if not infer: - loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, self.gaussian_blurred_cent(f0_to_cent(gt_f0))) - if self.loss_l2_regularization: loss_all = loss_all + l2_regularization(model=self, l2_alpha=self.loss_l2_regularization_scale) - x = loss_all - else: - x = cent_to_f0(cents_decoder(self.cent_table, x, self.confidence, threshold=self.threshold, mask=True) if cdecoder == "argmax" else cents_local_decoder(self.cent_table, x, self.n_out, self.confidence, threshold=self.threshold, mask=True)) - x = (1 + x / 700).log() if not return_hz_f0 else x - - if output_interp_target_length is not None: - x = F.interpolate(torch.where(x == 0, float("nan"), x).transpose(1, 2), size=int(output_interp_target_length), mode="linear").transpose(1, 2) - x = torch.where(x.isnan(), float(0.0), x) - - return x - - def gaussian_blurred_cent(self, cents): - B, N, _ = cents.size() - return (-(self.cent_table[None, None, :].expand(B, N, -1) - cents).square() / 1250).exp() * (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0))).float() - -class InferCFNaiveMelPE(torch.nn.Module): - def __init__(self, args, state_dict): - super().__init__() - self.model = CFNaiveMelPE(input_channels=args.mel.num_mels, out_dims=args.model.out_dims, hidden_dims=args.model.hidden_dims, n_layers=args.model.n_layers, n_heads=args.model.n_heads, f0_max=args.model.f0_max, f0_min=args.model.f0_min, use_fa_norm=args.model.use_fa_norm, conv_only=args.model.conv_only, conv_dropout=args.model.conv_dropout, 
atten_dropout=args.model.atten_dropout, use_harmonic_emb=False) - self.model.load_state_dict(state_dict) - self.model.eval() - self.register_buffer("tensor_device_marker", torch.tensor(1.0).float(), persistent=False) - - def forward(self, mel, decoder_mode = "local_argmax", threshold = 0.006): - with torch.no_grad(): - mels = rearrange(torch.stack([mel], -1), "B T C K -> (B K) T C") - f0s = rearrange(self.model.infer(mels, decoder=decoder_mode, threshold=threshold), "(B K) T 1 -> B T (K 1)", K=1) - - return f0s - - def infer(self, mel, decoder_mode = "local_argmax", threshold = 0.006, f0_min = None, f0_max = None, interp_uv = False, output_interp_target_length = None, return_uv = False): - f0 = self.__call__(mel, decoder_mode, threshold) - f0_for_uv = f0 - - uv = (f0_for_uv < f0_min).type(f0_for_uv.dtype) - f0 = f0 * (1 - uv) - - if interp_uv: f0 = batch_interp_with_replacement_detach(uv.squeeze(-1).bool(), f0.squeeze(-1)).unsqueeze(-1) - if f0_max is not None: f0[f0 > f0_max] = f0_max - if output_interp_target_length is not None: - f0 = F.interpolate(torch.where(f0 == 0, float("nan"), f0).transpose(1, 2), size=int(output_interp_target_length), mode="linear").transpose(1, 2) - f0 = torch.where(f0.isnan(), float(0.0), f0) - - if return_uv: return f0, F.interpolate(uv.transpose(1, 2), size=int(output_interp_target_length), mode="nearest").transpose(1, 2) - else: return f0 - -class FCPEInfer_LEGACY: - def __init__(self, configs, model_path, device=None, dtype=torch.float32, providers=None, onnx=False, f0_min=50, f0_max=1100): - if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - self.dtype = dtype - self.onnx = onnx - self.f0_min = f0_min - self.f0_max = f0_max - self.wav2mel = Wav2Mel(device=self.device, dtype=self.dtype) - - if self.onnx: - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 3 - self.model = ort.InferenceSession(decrypt_model(configs, model_path), sess_options=sess_options, 
providers=providers) - else: - ckpt = torch.load(model_path, map_location="cpu", weights_only=True) - self.args = DotDict(ckpt["config"]) - model = FCPE_LEGACY(input_channel=self.args.model.input_channel, out_dims=self.args.model.out_dims, n_layers=self.args.model.n_layers, n_chans=self.args.model.n_chans, loss_mse_scale=self.args.loss.loss_mse_scale, loss_l2_regularization=self.args.loss.loss_l2_regularization, loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, loss_grad1_mse=self.args.loss.loss_grad1_mse, loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, f0_max=self.f0_max, f0_min=self.f0_min, confidence=self.args.model.confidence) - model.to(self.device).to(self.dtype) - model.load_state_dict(ckpt["model"]) - model.eval() - self.model = model - - @torch.no_grad() - def __call__(self, audio, sr, threshold=0.05, p_len=None): - if not self.onnx: self.model.threshold = threshold - if not hasattr(self, "numpy_threshold") and self.onnx: self.numpy_threshold = np.array(threshold, dtype=np.float32) - - mel = self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype) - - if self.onnx: - return torch.as_tensor( - self.model.run( - [self.model.get_outputs()[0].name], - { - self.model.get_inputs()[0].name: mel.detach().cpu().numpy(), - self.model.get_inputs()[1].name: self.numpy_threshold - } - )[0], - dtype=self.dtype, - device=self.device - ) - else: - return self.model( - mel=mel, - infer=True, - return_hz_f0=True, - output_interp_target_length=p_len - ) - -class FCPEInfer: - def __init__(self, configs, model_path, device=None, dtype=torch.float32, providers=None, onnx=False, f0_min=50, f0_max=1100): - if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - self.dtype = dtype - self.onnx = onnx - self.f0_min = f0_min - self.f0_max = f0_max - self.wav2mel = Wav2Mel(device=self.device, dtype=self.dtype) - - if self.onnx: - sess_options = ort.SessionOptions() - sess_options.log_severity_level 
= 3 - self.model = ort.InferenceSession(decrypt_model(configs, model_path), sess_options=sess_options, providers=providers) - else: - ckpt = torch.load(model_path, map_location="cpu", weights_only=True) - ckpt["config_dict"]["model"]["conv_dropout"] = ckpt["config_dict"]["model"]["atten_dropout"] = 0.0 - self.args = DotDict(ckpt["config_dict"]) - model = InferCFNaiveMelPE(self.args, ckpt["model"]) - self.model = model.to(device).to(self.dtype).eval() - - @torch.no_grad() - def __call__(self, audio, sr, threshold=0.05, p_len=None): - if not hasattr(self, "numpy_threshold") and self.onnx: self.numpy_threshold = np.array(threshold, dtype=np.float32) - mel = self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype) - - if self.onnx: - return torch.as_tensor( - self.model.run( - [self.model.get_outputs()[0].name], - { - self.model.get_inputs()[0].name: mel.detach().cpu().numpy(), - self.model.get_inputs()[1].name: self.numpy_threshold - } - )[0], - dtype=self.dtype, - device=self.device - ) - else: - return self.model.infer( - mel, - threshold=threshold, - f0_min=self.f0_min, - f0_max=self.f0_max, - output_interp_target_length=p_len - ) - -class FCPE: - def __init__(self, configs, model_path, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sample_rate=16000, threshold=0.05, providers=None, onnx=False, legacy=False): - self.model = FCPEInfer_LEGACY if legacy else FCPEInfer - self.fcpe = self.model(configs, model_path, device=device, dtype=dtype, providers=providers, onnx=onnx, f0_min=f0_min, f0_max=f0_max) - self.hop_length = hop_length - self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") - self.threshold = threshold - self.sample_rate = sample_rate - self.dtype = dtype - self.legacy = legacy - - def compute_f0(self, wav, p_len=None): - x = torch.FloatTensor(wav).to(self.dtype).to(self.device) - p_len = (x.shape[0] // self.hop_length) if p_len is None else p_len - - f0 = self.fcpe(x, sr=self.sample_rate, 
threshold=self.threshold, p_len=p_len) - f0 = f0[:] if f0.dim() == 1 else f0[0, :, 0] - - if torch.all(f0 == 0): return f0.cpu().numpy() if p_len is None else np.zeros(p_len) - return f0.cpu().numpy() \ No newline at end of file diff --git a/main/library/predictors/FCPE/attentions.py b/main/library/predictors/FCPE/attentions.py deleted file mode 100644 index cb7940d6dd6a5797ceb99c33777b5c10122cbe82..0000000000000000000000000000000000000000 --- a/main/library/predictors/FCPE/attentions.py +++ /dev/null @@ -1,281 +0,0 @@ -import math -import torch - -import torch.nn.functional as F - -from torch import nn, einsum -from functools import partial -from einops import rearrange, repeat, pack, unpack - -def exists(val): - return val is not None - -def default(value, d): - return value if exists(value) else d - -def empty(tensor): - return tensor.numel() == 0 - -def pad_to_multiple(tensor, multiple, dim=-1, value=0): - seqlen = tensor.shape[dim] - m = seqlen / multiple - if m.is_integer(): return False, tensor - return True, F.pad(tensor, (*((0,) * (-1 - dim) * 2), 0, (math.ceil(m) * multiple - seqlen)), value = value) - -def look_around(x, backward = 1, forward = 0, pad_value = -1, dim = 2): - t = x.shape[1] - dims = (len(x.shape) - dim) * (0, 0) - padded_x = F.pad(x, (*dims, backward, forward), value = pad_value) - return torch.cat([padded_x[:, ind:(ind + t), ...] for ind in range(forward + backward + 1)], dim = dim) - -def rotate_half(x): - x1, x2 = rearrange(x, 'b ... (r d) -> b ... 
r d', r = 2).unbind(dim = -2) - return torch.cat((-x2, x1), dim = -1) - -def apply_rotary_pos_emb(q, k, freqs, scale = 1): - q_len = q.shape[-2] - q_freqs = freqs[..., -q_len:, :] - inv_scale = scale ** -1 - if scale.ndim == 2: scale = scale[-q_len:, :] - q = (q * q_freqs.cos() * scale) + (rotate_half(q) * q_freqs.sin() * scale) - k = (k * freqs.cos() * inv_scale) + (rotate_half(k) * freqs.sin() * inv_scale) - - return q, k - -def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None): - unstructured_block = torch.randn((cols, cols), device=device) - q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced") - q, r = map(lambda t: t.to(device), (q, r)) - if qr_uniform_q: - d = r.diag(0) - q *= d.sign() - - return q.t() - -def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None): - nb_full_blocks = int(nb_rows / nb_columns) - block_list = [] - for _ in range(nb_full_blocks): - block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)) - - remaining_rows = nb_rows - nb_full_blocks * nb_columns - if remaining_rows > 0: block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)[:remaining_rows]) - if scaling == 0: multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1) - elif scaling == 1: multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device=device) - else: raise ValueError(f"{scaling} != 0, 1") - - return multiplier.diag() @ torch.cat(block_list) - -def linear_attention(q, k, v): - return einsum("...ed,...nd->...ne", k, q) if v is None else einsum("...de,...nd,...n->...ne", einsum("...nd,...ne->...de", k, v), q, 1.0 / (einsum("...nd,...d->...n", q, k.sum(dim=-2).type_as(q)) + 1e-8)) - -def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None): - b, h, *_ = data.shape - - data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 - ratio = 
projection_matrix.shape[0] ** -0.5 - data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), repeat(projection_matrix, "j d -> b h j d", b=b, h=h).type_as(data)) - diag_data = (((data**2).sum(dim=-1) / 2.0) * (data_normalizer**2)).unsqueeze(dim=-1) - - return (ratio * ((data_dash - diag_data - data_dash.max(dim=-1, keepdim=True).values).exp() + eps) if is_query else ratio * ((data_dash - diag_data + eps).exp())).type_as(data) - -class SinusoidalEmbeddings(nn.Module): - def __init__(self, dim, scale_base = None, use_xpos = False, theta = 10000): - super().__init__() - inv_freq = 1. / (theta ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer('inv_freq', inv_freq) - self.use_xpos = use_xpos - self.scale_base = scale_base - assert not (use_xpos and not exists(scale_base)) - scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim) - self.register_buffer('scale', scale, persistent = False) - - def forward(self, x): - seq_len, device = x.shape[-2], x.device - t = torch.arange(seq_len, device = x.device).type_as(self.inv_freq) - - freqs = torch.einsum('i , j -> i j', t, self.inv_freq) - freqs = torch.cat((freqs, freqs), dim = -1) - - if not self.use_xpos: return freqs, torch.ones(1, device = device) - - power = (t - (seq_len // 2)) / self.scale_base - scale = self.scale ** rearrange(power, 'n -> n 1') - - return freqs, torch.cat((scale, scale), dim = -1) - -class LocalAttention(nn.Module): - def __init__(self, window_size, causal = False, look_backward = 1, look_forward = None, dropout = 0., shared_qk = False, rel_pos_emb_config = None, dim = None, autopad = False, exact_windowsize = False, scale = None, use_rotary_pos_emb = True, use_xpos = False, xpos_scale_base = None): - super().__init__() - look_forward = default(look_forward, 0 if causal else 1) - assert not (causal and look_forward > 0) - self.scale = scale - self.window_size = window_size - self.autopad = autopad - self.exact_windowsize = exact_windowsize - self.causal = causal - 
self.look_backward = look_backward - self.look_forward = look_forward - self.dropout = nn.Dropout(dropout) - self.shared_qk = shared_qk - self.rel_pos = None - self.use_xpos = use_xpos - if use_rotary_pos_emb and (exists(rel_pos_emb_config) or exists(dim)): - if exists(rel_pos_emb_config): dim = rel_pos_emb_config[0] - self.rel_pos = SinusoidalEmbeddings(dim, use_xpos = use_xpos, scale_base = default(xpos_scale_base, window_size // 2)) - - def forward(self, q, k, v, mask = None, input_mask = None, attn_bias = None, window_size = None): - mask = default(mask, input_mask) - assert not (exists(window_size) and not self.use_xpos) - - _, autopad, pad_value, window_size, causal, look_backward, look_forward, shared_qk = q.shape, self.autopad, -1, default(window_size, self.window_size), self.causal, self.look_backward, self.look_forward, self.shared_qk - (q, packed_shape), (k, _), (v, _) = map(lambda t: pack([t], '* n d'), (q, k, v)) - - if autopad: - orig_seq_len = q.shape[1] - (_, q), (_, k), (_, v) = map(lambda t: pad_to_multiple(t, self.window_size, dim = -2), (q, k, v)) - - b, n, dim_head, device, dtype = *q.shape, q.device, q.dtype - scale = default(self.scale, dim_head ** -0.5) - - assert (n % window_size) == 0 - windows = n // window_size - - if shared_qk: k = F.normalize(k, dim = -1).type(k.dtype) - - seq = torch.arange(n, device = device) - b_t = rearrange(seq, '(w n) -> 1 w n', w = windows, n = window_size) - bq, bk, bv = map(lambda t: rearrange(t, 'b (w n) d -> b w n d', w = windows), (q, k, v)) - - bq = bq * scale - look_around_kwargs = dict(backward = look_backward, forward = look_forward, pad_value = pad_value) - - bk = look_around(bk, **look_around_kwargs) - bv = look_around(bv, **look_around_kwargs) - - if exists(self.rel_pos): - pos_emb, xpos_scale = self.rel_pos(bk) - bq, bk = apply_rotary_pos_emb(bq, bk, pos_emb, scale = xpos_scale) - - bq_t = b_t - bq_k = look_around(b_t, **look_around_kwargs) - bq_t = rearrange(bq_t, '... i -> ... 
i 1') - bq_k = rearrange(bq_k, '... j -> ... 1 j') - - pad_mask = bq_k == pad_value - sim = einsum('b h i e, b h j e -> b h i j', bq, bk) - - if exists(attn_bias): - heads = attn_bias.shape[0] - assert (b % heads) == 0 - - attn_bias = repeat(attn_bias, 'h i j -> (b h) 1 i j', b = b // heads) - sim = sim + attn_bias - - mask_value = -torch.finfo(sim.dtype).max - if shared_qk: - self_mask = bq_t == bq_k - sim = sim.masked_fill(self_mask, -5e4) - del self_mask - - if causal: - causal_mask = bq_t < bq_k - if self.exact_windowsize: causal_mask = causal_mask | (bq_t > (bq_k + (self.window_size * self.look_backward))) - sim = sim.masked_fill(causal_mask, mask_value) - del causal_mask - - sim = sim.masked_fill(((bq_k - (self.window_size * self.look_forward)) > bq_t) | (bq_t > (bq_k + (self.window_size * self.look_backward))) | pad_mask, mask_value) if not causal and self.exact_windowsize else sim.masked_fill(pad_mask, mask_value) - - if exists(mask): - batch = mask.shape[0] - assert (b % batch) == 0 - - h = b // mask.shape[0] - if autopad: _, mask = pad_to_multiple(mask, window_size, dim = -1, value = False) - - mask = repeat(rearrange(look_around(rearrange(mask, '... (w n) -> (...) w n', w = windows, n = window_size), **{**look_around_kwargs, 'pad_value': False}), '... j -> ... 1 j'), 'b ... 
-> (b h) ...', h = h) - sim = sim.masked_fill(~mask, mask_value) - - del mask - - out = rearrange(einsum('b h i j, b h j e -> b h i e', self.dropout(sim.softmax(dim = -1)), bv), 'b w n d -> b (w n) d') - if autopad: out = out[:, :orig_seq_len, :] - - out, *_ = unpack(out, packed_shape, '* n d') - return out - -class FastAttention(nn.Module): - def __init__(self, dim_heads, nb_features=None, ortho_scaling=0, causal=False, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, no_projection=False): - super().__init__() - nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) - self.dim_heads = dim_heads - self.nb_features = nb_features - self.ortho_scaling = ortho_scaling - self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows=self.nb_features, nb_columns=dim_heads, scaling=ortho_scaling, qr_uniform_q=qr_uniform_q) - projection_matrix = self.create_projection() - self.register_buffer("projection_matrix", projection_matrix) - self.generalized_attention = generalized_attention - self.kernel_fn = kernel_fn - self.no_projection = no_projection - self.causal = causal - - @torch.no_grad() - def redraw_projection_matrix(self): - projections = self.create_projection() - self.projection_matrix.copy_(projections) - del projections - - def forward(self, q, k, v): - if self.no_projection: q, k = q.softmax(dim=-1), (k.exp() if self.causal else k.softmax(dim=-2)) - else: - create_kernel = partial(softmax_kernel, projection_matrix=self.projection_matrix, device=q.device) - q, k = create_kernel(q, is_query=True), create_kernel(k, is_query=False) - - attn_fn = linear_attention if not self.causal else self.causal_linear_fn - return attn_fn(q, k, None) if v is None else attn_fn(q, k, v) - -class SelfAttention(nn.Module): - def __init__(self, dim, causal=False, heads=8, dim_head=64, local_heads=0, local_window_size=256, nb_features=None, feature_redraw_interval=1000, generalized_attention=False, kernel_fn=nn.ReLU(), 
qr_uniform_q=False, dropout=0.0, no_projection=False): - super().__init__() - assert dim % heads == 0 - dim_head = default(dim_head, dim // heads) - inner_dim = dim_head * heads - self.fast_attention = FastAttention(dim_head, nb_features, causal=causal, generalized_attention=generalized_attention, kernel_fn=kernel_fn, qr_uniform_q=qr_uniform_q, no_projection=no_projection) - self.heads = heads - self.global_heads = heads - local_heads - self.local_attn = (LocalAttention(window_size=local_window_size, causal=causal, autopad=True, dropout=dropout, look_forward=int(not causal), rel_pos_emb_config=(dim_head, local_heads)) if local_heads > 0 else None) - self.to_q = nn.Linear(dim, inner_dim) - self.to_k = nn.Linear(dim, inner_dim) - self.to_v = nn.Linear(dim, inner_dim) - self.to_out = nn.Linear(inner_dim, dim) - self.dropout = nn.Dropout(dropout) - - @torch.no_grad() - def redraw_projection_matrix(self): - self.fast_attention.redraw_projection_matrix() - - def forward(self, x, context=None, mask=None, context_mask=None, name=None, inference=False, **kwargs): - _, _, _, h, gh = *x.shape, self.heads, self.global_heads - cross_attend = exists(context) - context = default(context, x) - context_mask = default(context_mask, mask) if not cross_attend else context_mask - - q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (self.to_q(x), self.to_k(context), self.to_v(context))) - (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) - - attn_outs = [] - - if not empty(q): - if exists(context_mask): v.masked_fill_(~context_mask[:, None, :, None], 0.0) - if cross_attend: pass - else: out = self.fast_attention(q, k, v) - - attn_outs.append(out) - - if not empty(lq): - assert (not cross_attend), "not cross_attend" - - out = self.local_attn(lq, lk, lv, input_mask=mask) - attn_outs.append(out) - - return self.dropout(self.to_out(rearrange(torch.cat(attn_outs, dim=1), "b h n d -> b n (h d)"))) \ No newline at end of file diff --git 
a/main/library/predictors/FCPE/encoder.py b/main/library/predictors/FCPE/encoder.py deleted file mode 100644 index 14c0eabc65ec79da8c48655f6175a000fc83cadd..0000000000000000000000000000000000000000 --- a/main/library/predictors/FCPE/encoder.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import sys - -import torch.nn as nn -import torch.nn.functional as F - -sys.path.append(os.getcwd()) - -from main.library.predictors.FCPE.attentions import SelfAttention -from main.library.predictors.FCPE.utils import calc_same_padding, Transpose, GLU, Swish - -class ConformerConvModule_LEGACY(nn.Module): - def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0): - super().__init__() - inner_dim = dim * expansion_factor - self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), GLU(dim=1), DepthWiseConv1d_LEGACY(inner_dim, inner_dim, kernel_size=kernel_size, padding=(calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0))), Swish(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout)) - - def forward(self, x): - return self.net(x) - -class ConformerConvModule(nn.Module): - def __init__(self, dim, expansion_factor=2, kernel_size=31, dropout=0): - super().__init__() - inner_dim = dim * expansion_factor - self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), nn.GLU(dim=1), DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=calc_same_padding(kernel_size)[0], groups=inner_dim), nn.SiLU(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout)) - - def forward(self, x): - return self.net(x) - -class DepthWiseConv1d_LEGACY(nn.Module): - def __init__(self, chan_in, chan_out, kernel_size, padding): - super().__init__() - self.padding = padding - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in) - - def forward(self, x): - return self.conv(F.pad(x, self.padding)) - -class 
DepthWiseConv1d(nn.Module): - def __init__(self, chan_in, chan_out, kernel_size, padding, groups): - super().__init__() - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size=kernel_size, padding=padding, groups=groups) - - def forward(self, x): - return self.conv(x) - -class EncoderLayer(nn.Module): - def __init__(self, parent): - super().__init__() - self.conformer = ConformerConvModule_LEGACY(parent.dim_model) - self.norm = nn.LayerNorm(parent.dim_model) - self.dropout = nn.Dropout(parent.residual_dropout) - self.attn = SelfAttention(dim=parent.dim_model, heads=parent.num_heads, causal=False) - - def forward(self, phone, mask=None): - phone = phone + (self.attn(self.norm(phone), mask=mask)) - return phone + (self.conformer(phone)) - -class ConformerNaiveEncoder(nn.Module): - def __init__(self, num_layers, num_heads, dim_model, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0): - super().__init__() - self.num_layers = num_layers - self.num_heads = num_heads - self.dim_model = dim_model - self.use_norm = use_norm - self.residual_dropout = 0.1 - self.attention_dropout = 0.1 - self.encoder_layers = nn.ModuleList([CFNEncoderLayer(dim_model, num_heads, use_norm, conv_only, conv_dropout, atten_dropout) for _ in range(num_layers)]) - - def forward(self, x, mask=None): - for (_, layer) in enumerate(self.encoder_layers): - x = layer(x, mask) - - return x - -class CFNEncoderLayer(nn.Module): - def __init__(self, dim_model, num_heads = 8, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0): - super().__init__() - self.conformer = nn.Sequential(ConformerConvModule(dim_model), nn.Dropout(conv_dropout)) if conv_dropout > 0 else ConformerConvModule(dim_model) - self.norm = nn.LayerNorm(dim_model) - self.dropout = nn.Dropout(0.1) - self.attn = SelfAttention(dim=dim_model, heads=num_heads, causal=False, use_norm=use_norm, dropout=atten_dropout) if not conv_only else None - - def forward(self, x, mask=None): - if self.attn is not None: 
x = x + (self.attn(self.norm(x), mask=mask)) - return x + (self.conformer(x)) \ No newline at end of file diff --git a/main/library/predictors/FCPE/stft.py b/main/library/predictors/FCPE/stft.py deleted file mode 100644 index 2fe5d804f9ca6765519c00556a38e4ce3400c1f5..0000000000000000000000000000000000000000 --- a/main/library/predictors/FCPE/stft.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import sys -import torch - -import numpy as np -import torch.nn.functional as F - -from librosa.filters import mel - -sys.path.append(os.getcwd()) - -class STFT: - def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): - self.target_sr = sr - self.n_mels = n_mels - self.n_fft = n_fft - self.win_size = win_size - self.hop_length = hop_length - self.fmin = fmin - self.fmax = fmax - self.clip_val = clip_val - self.mel_basis = {} - self.hann_window = {} - - def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): - n_fft = self.n_fft - win_size = self.win_size - hop_length = self.hop_length - fmax = self.fmax - factor = 2 ** (keyshift / 12) - win_size_new = int(np.round(win_size * factor)) - hop_length_new = int(np.round(hop_length * speed)) - mel_basis = self.mel_basis if not train else {} - hann_window = self.hann_window if not train else {} - mel_basis_key = str(fmax) + "_" + str(y.device) - - if mel_basis_key not in mel_basis: mel_basis[mel_basis_key] = torch.from_numpy(mel(sr=self.target_sr, n_fft=n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=fmax)).float().to(y.device) - keyshift_key = str(keyshift) + "_" + str(y.device) - if keyshift_key not in hann_window: hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) - - pad_left = (win_size_new - hop_length_new) // 2 - pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left) - - pad = F.pad(y.unsqueeze(1), (pad_left, pad_right), mode="reflect" if pad_right < y.size(-1) else 
"constant").squeeze(1) - n_fft = int(np.round(n_fft * factor)) - - if str(y.device).startswith(("ocl", "privateuseone")): - if not hasattr(self, "stft"): - from main.library.backends.utils import STFT as _STFT - self.stft = _STFT(filter_length=n_fft, hop_length=hop_length_new, win_length=win_size_new).to(y.device) - spec = self.stft.transform(pad, 1e-9) - else: - spec = torch.stft(pad, n_fft, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True) - spec = (spec.real.pow(2) + spec.imag.pow(2) + 1e-9).sqrt() - - if keyshift != 0: - size = n_fft // 2 + 1 - resize = spec.size(1) - spec = (F.pad(spec, (0, 0, 0, size - resize)) if resize < size else spec[:, :size, :]) * win_size / win_size_new - - return ((mel_basis[mel_basis_key] @ spec).clamp(min=self.clip_val) * 1).log() \ No newline at end of file diff --git a/main/library/predictors/FCPE/utils.py b/main/library/predictors/FCPE/utils.py deleted file mode 100644 index b38c6d44c4964a0cdb9996668dbbb0c9119dfcee..0000000000000000000000000000000000000000 --- a/main/library/predictors/FCPE/utils.py +++ /dev/null @@ -1,81 +0,0 @@ -import os -import torch - -from torch import nn -from io import BytesIO -from Crypto.Cipher import AES -from Crypto.Util.Padding import unpad - -def decrypt_model(configs, input_path): - with open(input_path, "rb") as f: - data = f.read() - - with open(os.path.join(configs["binary_path"], "decrypt.bin"), "rb") as f: - key = f.read() - - return BytesIO(unpad(AES.new(key, AES.MODE_CBC, data[:16]).decrypt(data[16:]), AES.block_size)).read() - -def calc_same_padding(kernel_size): - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - -def l2_regularization(model, l2_alpha): - l2_loss = [] - for module in model.modules(): - if type(module) is nn.Conv2d: l2_loss.append((module.weight**2).sum() / 2.0) - - return l2_alpha * sum(l2_loss) - -def torch_interp(x, xp, fp): - 
sort_idx = xp.argsort() - xp = xp[sort_idx] - fp = fp[sort_idx] - - right_idxs = torch.searchsorted(xp, x).clamp(max=len(xp) - 1) - left_idxs = (right_idxs - 1).clamp(min=0) - x_left = xp[left_idxs] - y_left = fp[left_idxs] - - interp_vals = y_left + ((x - x_left) * (fp[right_idxs] - y_left) / (xp[right_idxs] - x_left)) - interp_vals[x < xp[0]] = fp[0] - interp_vals[x > xp[-1]] = fp[-1] - - return interp_vals - -def batch_interp_with_replacement_detach(uv, f0): - result = f0.clone() - for i in range(uv.shape[0]): - interp_vals = torch_interp(torch.where(uv[i])[-1], torch.where(~uv[i])[-1], f0[i][~uv[i]]).detach() - result[i][uv[i]] = interp_vals - - return result - -class DotDict(dict): - def __getattr__(*args): - val = dict.get(*args) - return DotDict(val) if type(val) is dict else val - - __setattr__ = dict.__setitem__ - __delattr__ = dict.__delitem__ - -class Swish(nn.Module): - def forward(self, x): - return x * x.sigmoid() - -class Transpose(nn.Module): - def __init__(self, dims): - super().__init__() - assert len(dims) == 2, "dims == 2" - self.dims = dims - - def forward(self, x): - return x.transpose(*self.dims) - -class GLU(nn.Module): - def __init__(self, dim): - super().__init__() - self.dim = dim - - def forward(self, x): - out, gate = x.chunk(2, dim=self.dim) - return out * gate.sigmoid() \ No newline at end of file diff --git a/main/library/predictors/FCPE/wav2mel.py b/main/library/predictors/FCPE/wav2mel.py deleted file mode 100644 index 5bdff1d48cc4f822e92e70c007241b56f779226a..0000000000000000000000000000000000000000 --- a/main/library/predictors/FCPE/wav2mel.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import sys -import torch - -from torchaudio.transforms import Resample - -sys.path.append(os.getcwd()) - -from main.library.predictors.FCPE.stft import STFT - -class Wav2Mel: - def __init__(self, device=None, dtype=torch.float32): - self.sample_rate = 16000 - self.hop_size = 160 - if device is None: device = "cuda" if torch.cuda.is_available() else 
"cpu" - self.device = device - self.dtype = dtype - self.stft = STFT(16000, 128, 1024, 1024, 160, 0, 8000) - self.resample_kernel = {} - - def extract_nvstft(self, audio, keyshift=0, train=False): - return self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) - - def extract_mel(self, audio, sample_rate, keyshift=0, train=False): - audio = audio.to(self.dtype).to(self.device) - if sample_rate == self.sample_rate: audio_res = audio - else: - key_str = str(sample_rate) - if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample(sample_rate, self.sample_rate, lowpass_filter_width=128) - self.resample_kernel[key_str] = (self.resample_kernel[key_str].to(self.dtype).to(self.device)) - audio_res = self.resample_kernel[key_str](audio) - - mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) - n_frames = int(audio.shape[1] // self.hop_size) + 1 - mel = (torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel) - return mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel - - def __call__(self, audio, sample_rate, keyshift=0, train=False): - return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) \ No newline at end of file diff --git a/main/library/predictors/Generator.py b/main/library/predictors/Generator.py deleted file mode 100644 index 387aa2c7b54f1d13782776512125e059c8b9fb72..0000000000000000000000000000000000000000 --- a/main/library/predictors/Generator.py +++ /dev/null @@ -1,546 +0,0 @@ -import os -import re -import sys -import math -import torch -import parselmouth - -import numba as nb -import numpy as np - -from scipy.signal import medfilt -from librosa import yin, pyin, piptrack - -sys.path.append(os.getcwd()) - -from main.library.predictors.CREPE.filter import mean, median -from main.library.predictors.WORLD.SWIPE import swipe, stonemask -from main.app.variables import config, configs, logger, translations -from main.library.utils import autotune_f0, 
proposal_f0_up_key, circular_write - -@nb.jit(nopython=True) -def post_process(tf0, f0, f0_up_key, manual_x_pad, f0_mel_min, f0_mel_max, manual_f0 = None): - f0 = np.multiply(f0, pow(2, f0_up_key / 12)) - - if manual_f0 is not None: - replace_f0 = np.interp( - list( - range( - np.round( - (manual_f0[:, 0].max() - manual_f0[:, 0].min()) * tf0 + 1 - ).astype(np.int16) - ) - ), - manual_f0[:, 0] * 100, - manual_f0[:, 1] - ) - f0[manual_x_pad * tf0 : manual_x_pad * tf0 + len(replace_f0)] = replace_f0[:f0[manual_x_pad * tf0 : manual_x_pad * tf0 + len(replace_f0)].shape[0]] - - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - - return np.rint(f0_mel).astype(np.int32), f0 - -def realtime_post_process(f0, pitch, pitchf, f0_up_key = 0, f0_mel_min = 50.0, f0_mel_max = 1100.0): - f0 *= 2 ** (f0_up_key / 12) - - f0_mel = 1127.0 * (1.0 + f0 / 700.0).log() - f0_mel = torch.clip((f0_mel - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1, 1, 255, out=f0_mel) - f0_coarse = torch.round(f0_mel, out=f0_mel).long() - - if pitch is not None and pitchf is not None: - circular_write(f0_coarse, pitch) - circular_write(f0, pitchf) - else: - pitch = f0_coarse - pitchf = f0 - - return pitch.unsqueeze(0), pitchf.unsqueeze(0) - -class Generator: - def __init__(self, sample_rate = 16000, hop_length = 160, f0_min = 50, f0_max = 1100, alpha = 0.5, is_half = False, device = "cpu", f0_onnx_mode = False, del_onnx_model = True): - self.sample_rate = sample_rate - self.hop_length = hop_length - self.f0_min = f0_min - self.f0_max = f0_max - self.is_half = is_half - self.device = device - self.providers = config.providers - self.f0_onnx_mode = f0_onnx_mode - self.del_onnx_model = del_onnx_model - self.window = 160 - self.batch_size = 512 - self.alpha = alpha - self.ref_freqs = [49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 69.30, 73.42, 77.78, 82.41, 87.31, 92.50, 
98.00, 103.83, 110.00, 116.54, 123.47, 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, 185.00, 196.00, 207.65, 220.00, 233.08, 246.94, 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99, 392.00, 415.30, 440.00, 466.16, 493.88, 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, 739.99, 783.99, 830.61, 880.00, 932.33, 987.77, 1046.50] - - def calculator(self, x_pad, f0_method, x, f0_up_key = 0, p_len = None, filter_radius = 3, f0_autotune = False, f0_autotune_strength = 1, manual_f0 = None, proposal_pitch = False, proposal_pitch_threshold = 255.0): - if p_len is None: p_len = x.shape[0] // self.window - if "hybrid" in f0_method: logger.debug(translations["hybrid_calc"].format(f0_method=f0_method)) - - model = self.get_f0_hybrid if "hybrid" in f0_method else self.compute_f0 - f0 = model(f0_method, x, p_len, filter_radius if filter_radius % 2 != 0 else filter_radius + 1) - - if proposal_pitch: - up_key = proposal_f0_up_key(f0, proposal_pitch_threshold, configs["limit_f0"]) - logger.debug(translations["proposal_f0"].format(up_key=up_key)) - f0_up_key += up_key - - if f0_autotune: - logger.debug(translations["startautotune"]) - f0 = autotune_f0(self.ref_freqs, f0, f0_autotune_strength) - - return post_process( - self.sample_rate // self.window, - f0, - f0_up_key, - x_pad, - 1127 * math.log(1 + self.f0_min / 700), - 1127 * math.log(1 + self.f0_max / 700), - manual_f0 - ) - - def realtime_calculator(self, audio, f0_method, pitch, pitchf, f0_up_key = 0, filter_radius = 3, f0_autotune = False, f0_autotune_strength = 1, proposal_pitch = False, proposal_pitch_threshold = 255.0): - if torch.is_tensor(audio): audio = audio.cpu().numpy() - p_len = audio.shape[0] // self.window - - f0 = self.compute_f0( - f0_method, - audio, - p_len, - filter_radius if filter_radius % 2 != 0 else filter_radius + 1 - ) - - if f0_autotune: f0 = autotune_f0(self.ref_freqs, f0, f0_autotune_strength) - - if proposal_pitch: - up_key = proposal_f0_up_key(f0, proposal_pitch_threshold, 
configs["limit_f0"]) - f0_up_key += up_key - - return realtime_post_process( - torch.from_numpy(f0).float().to(self.device), - pitch, - pitchf, - f0_up_key, - self.f0_min, - self.f0_max - ) - - def _resize_f0(self, x, target_len): - source = np.array(x) - source[source < 0.001] = np.nan - - return np.nan_to_num( - np.interp( - np.arange(0, len(source) * target_len, len(source)) / target_len, - np.arange(0, len(source)), - source - ) - ) - - def compute_f0(self, f0_method, x, p_len, filter_radius): - if "pm" in f0_method: - f0 = self.get_f0_pm(x, p_len, filter_radius=filter_radius, mode=f0_method.split("-")[1]) - elif f0_method in ["harvest", "dio"]: - f0 = self.get_f0_pyworld(x, p_len, filter_radius, f0_method) - elif "crepe" in f0_method: - split_f0 = f0_method.split("-") - f0 = self.get_f0_mangio_crepe(x, p_len, split_f0[2]) if split_f0[0] == "mangio" else self.get_f0_crepe(x, p_len, split_f0[1], filter_radius=filter_radius) - elif "fcpe" in f0_method: - f0 = self.get_f0_fcpe(x, p_len, legacy="legacy" in f0_method and "previous" not in f0_method, previous="previous" in f0_method, filter_radius=filter_radius) - elif "rmvpe" in f0_method: - f0 = self.get_f0_rmvpe(x, p_len, clipping="clipping" in f0_method, filter_radius=filter_radius) - elif f0_method in ["yin", "pyin", "piptrack"]: - f0 = self.get_f0_librosa(x, p_len, mode=f0_method) - elif "swipe" in f0_method: - f0 = self.get_f0_swipe(x, p_len, filter_radius=filter_radius) - elif "penn" in f0_method: - f0 = self.get_f0_mangio_penn(x, p_len) if f0_method.split("-")[0] == "mangio" else self.get_f0_penn(x, p_len, filter_radius=filter_radius) - elif "djcm" in f0_method: - f0 = self.get_f0_djcm(x, p_len, clipping="clipping" in f0_method, filter_radius=filter_radius) - elif "pesto" in f0_method: - f0 = self.get_f0_pesto(x, p_len) - elif "swift" in f0_method: - f0 = self.get_f0_swift(x, p_len, filter_radius=filter_radius) - else: - raise ValueError(translations["option_not_valid"]) - - if isinstance(f0, tuple): f0 = 
f0[0] - if "medfilt" in f0_method: f0 = medfilt(f0, kernel_size=5) - - return f0 - - def get_f0_hybrid(self, methods_str, x, p_len, filter_radius): - methods_str = re.search(r"hybrid\[(.+)\]", methods_str) - if methods_str: methods = [method.strip() for method in methods_str.group(1).split("+")] - - n = len(methods) - f0_stack = [] - - for method in methods: - f0_stack.append( - self._resize_f0( - self.compute_f0(method, x, p_len, filter_radius), - p_len - ) - ) - - f0_mix = np.zeros(p_len) - - if not f0_stack: return f0_mix - if len(f0_stack) == 1: return f0_stack[0] - - weights = (1 - np.abs(np.arange(n) / (n - 1) - (1 - self.alpha))) ** 2 - weights /= weights.sum() - - stacked = np.vstack(f0_stack) - voiced_mask = np.any(stacked > 0, axis=0) - f0_mix[voiced_mask] = np.exp(np.nansum(np.log(stacked + 1e-6) * weights[:, None], axis=0)[voiced_mask]) - - return f0_mix - - def get_f0_pm(self, x, p_len, filter_radius=3, mode="ac"): - model = parselmouth.Sound( - x, - self.sample_rate - ) - - time_step = self.window / self.sample_rate * 1000 / 1000 - model_mode = {"ac": model.to_pitch_ac, "cc": model.to_pitch_cc, "shs": model.to_pitch_shs}.get(mode, model.to_pitch_ac) - - if mode != "shs": - f0 = ( - model_mode( - time_step=time_step, - voicing_threshold=filter_radius / 10 * 2, - pitch_floor=self.f0_min, - pitch_ceiling=self.f0_max - ).selected_array["frequency"] - ) - else: - f0 = ( - model_mode( - time_step=time_step, - minimum_pitch=self.f0_min, - maximum_frequency_component=self.f0_max - ).selected_array["frequency"] - ) - - pad_size = (p_len - len(f0) + 1) // 2 - - if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - return f0 - - def get_f0_mangio_crepe(self, x, p_len, model="full"): - if not hasattr(self, "mangio_crepe"): - from main.library.predictors.CREPE.CREPE import CREPE - - self.mangio_crepe = CREPE( - os.path.join( - configs["predictors_path"], - f"crepe_{model}.{'onnx' if 
self.f0_onnx_mode else 'pth'}" - ), - model_size=model, - hop_length=self.hop_length, - batch_size=self.hop_length * 2, - f0_min=self.f0_min, - f0_max=self.f0_max, - device=self.device, - sample_rate=self.sample_rate, - providers=self.providers, - onnx=self.f0_onnx_mode, - return_periodicity=False - ) - - x = x.astype(np.float32) - x /= np.quantile(np.abs(x), 0.999) - - audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(dim=0) - if audio.ndim == 2 and audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True).detach() - - f0 = self.mangio_crepe.compute_f0(audio.detach(), pad=True) - if self.f0_onnx_mode and self.del_onnx_model: del self.mangio_crepe.model, self.mangio_crepe - - return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len) - - def get_f0_crepe(self, x, p_len, model="full", filter_radius=3): - if not hasattr(self, "crepe"): - from main.library.predictors.CREPE.CREPE import CREPE - - self.crepe = CREPE( - os.path.join( - configs["predictors_path"], - f"crepe_{model}.{'onnx' if self.f0_onnx_mode else 'pth'}" - ), - model_size=model, - hop_length=self.window, - batch_size=self.batch_size, - f0_min=self.f0_min, - f0_max=self.f0_max, - device=self.device, - sample_rate=self.sample_rate, - providers=self.providers, - onnx=self.f0_onnx_mode, - return_periodicity=True - ) - - f0, pd = self.crepe.compute_f0(torch.tensor(np.copy(x))[None].float(), pad=True) - if self.f0_onnx_mode and self.del_onnx_model: del self.crepe.model, self.crepe - - f0, pd = mean(f0, filter_radius), median(pd, filter_radius) - f0[pd < 0.1] = 0 - - return self._resize_f0(f0[0].cpu().numpy(), p_len) - - def get_f0_fcpe(self, x, p_len, legacy=False, previous=False, filter_radius=3): - if not hasattr(self, "fcpe"): - from main.library.predictors.FCPE.FCPE import FCPE - - self.fcpe = FCPE( - configs, - os.path.join( - configs["predictors_path"], - ("fcpe_legacy" if legacy else ("fcpe" if previous else "ddsp_200k")) + (".onnx" if self.f0_onnx_mode else ".pt") - ), - 
hop_length=self.hop_length, - f0_min=self.f0_min, - f0_max=self.f0_max, - dtype=torch.float32, - device=self.device, - sample_rate=self.sample_rate, - threshold=(filter_radius / 100) if legacy else (filter_radius / 1000 * 2), - providers=self.providers, - onnx=self.f0_onnx_mode, - legacy=legacy - ) - - f0 = self.fcpe.compute_f0(x, p_len) - if self.f0_onnx_mode and self.del_onnx_model: del self.fcpe.fcpe.model, self.fcpe - - return f0 - - def get_f0_rmvpe(self, x, p_len, clipping=False, filter_radius=3): - if not hasattr(self, "rmvpe"): - from main.library.predictors.RMVPE.RMVPE import RMVPE - - self.rmvpe = RMVPE( - os.path.join( - configs["predictors_path"], - "rmvpe" + (".onnx" if self.f0_onnx_mode else ".pt") - ), - is_half=self.is_half, - device=self.device, - onnx=self.f0_onnx_mode, - providers=self.providers - ) - - filter_radius = filter_radius / 100 - f0 = self.rmvpe.infer_from_audio_with_pitch(x, thred=filter_radius, f0_min=self.f0_min, f0_max=self.f0_max) if clipping else self.rmvpe.infer_from_audio(x, thred=filter_radius) - - if self.f0_onnx_mode and self.del_onnx_model: del self.rmvpe.model, self.rmvpe - return self._resize_f0(f0, p_len) - - def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest"): - if not hasattr(self, "pw"): - from main.library.predictors.WORLD.WORLD import PYWORLD - - self.pw = PYWORLD(os.path.join(configs["predictors_path"], "world"), os.path.join(configs["binary_path"], "world.bin")) - - x = x.astype(np.double) - pw = self.pw.harvest if model == "harvest" else self.pw.dio - - f0, t = pw( - x, - fs=self.sample_rate, - f0_ceil=self.f0_max, - f0_floor=self.f0_min, - frame_period=1000 * self.window / self.sample_rate - ) - - f0 = self.pw.stonemask( - x, - self.sample_rate, - t, - f0 - ) - - if filter_radius > 2 and model == "harvest": f0 = medfilt(f0, filter_radius) - elif model == "dio": - for index, pitch in enumerate(f0): - f0[index] = round(pitch, 1) - - return self._resize_f0(f0, p_len) - - def get_f0_swipe(self, x, 
def get_f0_librosa(self, x, p_len, mode="yin"):
    """Estimate F0 with one of the file-level ``yin`` / ``pyin`` / ``piptrack`` functions.

    ``mode == "piptrack"`` picks, per frame, the pitch of the bin with the
    largest magnitude.  ``mode == "yin"`` calls ``yin``; any other mode
    calls ``pyin`` and keeps only the first element of its result.  The
    non-piptrack path also records the chosen function on ``self.yin`` and
    the flag on ``self.if_yin``.  The contour is returned through
    ``_resize_f0``.
    """
    samples = x.astype(np.float32)

    if mode == "piptrack":
        pitches, magnitudes = piptrack(
            y=samples,
            sr=self.sample_rate,
            fmin=self.f0_min,
            fmax=self.f0_max,
            hop_length=self.hop_length,
        )

        strongest_bin = np.argmax(magnitudes, axis=0)
        f0 = pitches[strongest_bin, range(magnitudes.shape[1])]
        return self._resize_f0(f0, p_len)

    self.if_yin = mode == "yin"
    self.yin = yin if self.if_yin else pyin

    f0 = self.yin(
        samples,
        sr=self.sample_rate,
        fmin=self.f0_min,
        fmax=self.f0_max,
        hop_length=self.hop_length
    )

    if not self.if_yin:
        f0 = f0[0]

    return self._resize_f0(f0, p_len)
def get_f0_djcm(self, x, p_len, clipping=False, filter_radius=3):
    """Estimate F0 with DJCM and hand it to ``_resize_f0``.

    The predictor is created on first use and cached on ``self.djcm``; in
    ONNX mode with ``del_onnx_model`` set it is discarded after inference.
    ``filter_radius`` is divided by 10 and passed as the predictor's
    ``thred`` argument.  With ``clipping`` the pitch-bounded entry point is
    used and receives ``self.f0_min`` / ``self.f0_max``.
    """
    if not hasattr(self, "djcm"):
        from main.library.predictors.DJCM.DJCM import DJCM

        weights_path = os.path.join(
            configs["predictors_path"],
            "djcm" + (".onnx" if self.f0_onnx_mode else ".pt")
        )
        self.djcm = DJCM(
            weights_path,
            is_half=self.is_half,
            device=self.device,
            onnx=self.f0_onnx_mode,
            providers=self.providers
        )

    filter_radius /= 10

    if clipping:
        f0 = self.djcm.infer_from_audio_with_pitch(x, thred=filter_radius, f0_min=self.f0_min, f0_max=self.f0_max)
    else:
        f0 = self.djcm.infer_from_audio(x, thred=filter_radius)

    if self.f0_onnx_mode and self.del_onnx_model:
        del self.djcm.model, self.djcm

    return self._resize_f0(f0, p_len)
"pesto"): - from main.library.predictors.PESTO.PESTO import PESTO - - self.pesto = PESTO( - os.path.join( - configs["predictors_path"], - f"pesto.{'onnx' if self.f0_onnx_mode else 'pt'}" - ), - step_size=1000 * self.window / self.sample_rate, - reduction = "alwa", - num_chunks=1, - sample_rate=self.sample_rate, - device=self.device, - providers=self.providers, - onnx=self.f0_onnx_mode - ) - - x = x.astype(np.float32) - x /= np.quantile(np.abs(x), 0.999) - - audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(dim=0) - if audio.ndim == 2 and audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True).detach() - - f0 = self.pesto.compute_f0(audio.detach())[0] - if self.f0_onnx_mode and self.del_onnx_model: del self.pesto.model, self.pesto - - return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len) \ No newline at end of file diff --git a/main/library/predictors/PENN/PENN.py b/main/library/predictors/PENN/PENN.py deleted file mode 100644 index 27350a54d44f3c0925440470afd5a64e5f0b8aae..0000000000000000000000000000000000000000 --- a/main/library/predictors/PENN/PENN.py +++ /dev/null @@ -1,187 +0,0 @@ -import os -import sys -import torch -import librosa -import functools -import torchaudio - -import numpy as np -import torch.nn.functional as F - -sys.path.append(os.getcwd()) - -from main.library.predictors.PENN.core import bins_to_cents, cents_to_frequency -from main.library.predictors.PENN.core import PITCH_BINS, CENTS_PER_BIN, OCTAVE, frequency_to_bins, seconds_to_samples, entropy, interpolate - -SAMPLE_RATE, WINDOW_SIZE = 8000, 1024 - -class Viterbi: - def __init__(self, pitch_bins=1440, hop_length=80, sample_rate=8000, local_pitch_window_size=19, octaves=1200, max_octaves_per_second=32, cents_per_bin=5): - self.pitch_bins = pitch_bins - self.hop_length = hop_length - self.sample_rate = sample_rate - self.window_size = local_pitch_window_size - self.octaves = octaves - self.max_octave = max_octaves_per_second - self.cents_per_bin = cents_per_bin 
def __init__(self, model_path, hop_length = 80, batch_size = None, f0_min = 31, f0_max = 1984, sample_rate = 8000, interp_unvoiced_at = None, device = None, providers = None, onnx = False):
    """Load the PENN model and prepare the Viterbi decoder.

    Args:
        model_path: Path to the ONNX file (``onnx=True``) or to a torch
            checkpoint holding the weights under the ``'model'`` key.
        hop_length: Hop between frames in samples at the module-level
            ``SAMPLE_RATE``; stored in seconds as ``self.hopsize``.
        batch_size: Stored on ``self.batch_size`` for ``compute_f0``.
        f0_min, f0_max: Pitch bounds stored for ``compute_f0``.
        sample_rate: Sample rate of the audio that will be passed in.
        interp_unvoiced_at: Stored for ``compute_f0``; ``None`` disables
            the interpolation step there.
        device: Torch device. Falls back to ``"cuda"`` when available,
            otherwise ``"cpu"``.
        providers: ONNX Runtime execution providers (ONNX mode only).
        onnx: Run through ONNX Runtime instead of PyTorch.
    """
    self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    self.hopsize = hop_length / SAMPLE_RATE
    self.batch_size = batch_size
    self.f0_min = f0_min
    self.f0_max = f0_max
    self.sample_rate = sample_rate
    self.interp_unvoiced_at = interp_unvoiced_at
    self.onnx = onnx
    self.resample_audio = None  # resampler is created lazily by resample()
    self.decoder = Viterbi(PITCH_BINS, hop_length, SAMPLE_RATE, 19, OCTAVE, 32, CENTS_PER_BIN)

    if self.onnx:
        import onnxruntime as ort

        sess_options = ort.SessionOptions()
        sess_options.log_severity_level = 3
        self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers)
    else:
        from main.library.predictors.PENN.fcn import FCN

        model = FCN(256, PITCH_BINS, (2, 2))
        ckpt = torch.load(model_path, map_location="cpu", weights_only=True)
        model.load_state_dict(ckpt['model'])
        model.eval()
        # Move to the *resolved* device: compute_f0 sends frames to
        # self.device, so using the raw (possibly None) argument here would
        # leave the weights on CPU while the inputs go to CUDA.
        self.model = model.to(self.device)
def postprocess(self, logits, fmin, fmax):
    """Mask out-of-range pitch bins, then decode pitch and periodicity.

    ``logits`` is modified in place: bins below ``fmin`` and from the
    ceiling bin of ``fmax`` upwards are set to ``-inf`` along dimension 1.
    Returns the transposed outputs of ``self.decoder`` and the file-level
    ``entropy`` helper.
    """
    masked_value = -float('inf')

    with torch.no_grad():
        lowest_bin = frequency_to_bins(torch.tensor(fmin))
        logits[:, :lowest_bin] = masked_value

        upper_bin = frequency_to_bins(torch.tensor(fmax), torch.ceil)
        logits[:, upper_bin:] = masked_value

        pitch = self.decoder(logits)
        periodicity = entropy(logits)

    return pitch.T, periodicity.T
def infer(self, frames):
    """Run the model on a batch of frames and return its output as a tensor.

    In torch mode the model is called under ``torch.no_grad()``.  In ONNX
    mode the frames are converted to a NumPy array on the CPU, the first
    session output is requested, and the result is wrapped in a new tensor.
    """
    if not self.onnx:
        with torch.no_grad():
            return self.model(frames)

    session = self.model
    output_name = session.get_outputs()[0].name
    input_name = session.get_inputs()[0].name
    outputs = session.run([output_name], {input_name: frames.cpu().numpy()})

    return torch.tensor(outputs[0])
class Block(torch.nn.Sequential):
    """Conv1d -> ReLU -> optional MaxPool1d -> LayerNorm, as one ``Sequential``.

    ``length`` is the time dimension the LayerNorm normalises over together
    with ``out_channels``; ``pooling`` (when given) is unpacked into
    ``MaxPool1d``'s positional arguments.
    """

    def __init__(self, in_channels, out_channels, length=1, pooling=None, kernel_size=32):
        stages = [
            torch.nn.Conv1d(in_channels, out_channels, kernel_size),
            torch.nn.ReLU(),
        ]

        if pooling is not None:
            stages.append(torch.nn.MaxPool1d(*pooling))

        stages.append(torch.nn.LayerNorm((out_channels, length)))
        super().__init__(*stages)
class PESTO:
    """Runs a PESTO pitch model through PyTorch or ONNX Runtime."""

    def __init__(self, model_path, step_size=10, reduction="alwa", num_chunks=1, sample_rate=16000, device=None, providers=None, onnx=False):
        """Store the settings and load the model from ``model_path``.

        ``device`` falls back to ``"cuda"`` when available, otherwise
        ``"cpu"``.  In torch mode the network is rebuilt from the
        hyper-parameters stored in the checkpoint and its ``reduction`` is
        then overridden with the ``reduction`` argument.
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.step_size = step_size
        self.reduction = reduction
        self.num_chunks = num_chunks
        self.sample_rate = sample_rate
        self.onnx = onnx

        if not self.onnx:
            from main.library.predictors.PESTO.model import PPESTO, Resnet1d
            from main.library.predictors.PESTO.preprocessor import Preprocessor

            # NOTE(review): weights_only=False unpickles arbitrary objects;
            # only load checkpoints from a trusted source.
            ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
            hparams = ckpt["hparams"]

            encoder = Resnet1d(**hparams["encoder"])
            preprocessor = Preprocessor(hop_size=step_size, sampling_rate=sample_rate, **ckpt["hcqt_params"])
            model = PPESTO(encoder, preprocessor=preprocessor, crop_kwargs=hparams["pitch_shift"], reduction=hparams["reduction"])
            model.load_state_dict(ckpt["state_dict"], strict=False)

            self.model = model.to(self.device).eval()
            self.model.reduction = self.reduction
        else:
            import onnxruntime as ort

            sess_options = ort.SessionOptions()
            sess_options.log_severity_level = 3
            self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers)

    def compute_f0(self, x):
        """Return ``(predictions, confidence)`` for ``x``.

        ``x`` must have at most two dimensions.  It is split into
        ``self.num_chunks`` chunks along the first dimension, each chunk is
        run through the model, and the per-chunk results are concatenated
        along dimension 0.
        """
        assert x.ndim <= 2

        pitch_parts, confidence_parts = [], []

        with torch.inference_mode(), torch.no_grad():
            for chunk in x.chunk(chunks=self.num_chunks):
                if self.onnx:
                    session = self.model
                    output_names = [session.get_outputs()[0].name, session.get_outputs()[1].name]
                    feeds = {session.get_inputs()[0].name: chunk.cpu().numpy()}
                    raw = session.run(output_names, feeds)

                    pred = torch.tensor(raw[0], device=self.device)
                    conf = torch.tensor(raw[1], device=self.device)
                else:
                    pred, conf = self.model(
                        chunk,
                        sr=self.sample_rate,
                        convert_to_freq=True,
                        return_activations=False
                    )

                pitch_parts.append(pred)
                confidence_parts.append(conf)

            return torch.cat(pitch_parts, dim=0), torch.cat(confidence_parts, dim=0)
a/main/library/predictors/PESTO/model.py b/main/library/predictors/PESTO/model.py deleted file mode 100644 index be1c4079b8a4cd8822e4b26471fb17c4d678287c..0000000000000000000000000000000000000000 --- a/main/library/predictors/PESTO/model.py +++ /dev/null @@ -1,141 +0,0 @@ -import math -import torch - -from functools import partial - -class PPESTO(torch.nn.Module): - def __init__(self, encoder, preprocessor, crop_kwargs = None, reduction = "alwa"): - super(PPESTO, self).__init__() - self.encoder = encoder - self.preprocessor = preprocessor - self.confidence = ConfidenceClassifier() - if crop_kwargs is None: crop_kwargs = {} - self.crop_cqt = CropCQT(**crop_kwargs) - self.reduction = reduction - self.register_buffer('shift', torch.zeros((), dtype=torch.float), persistent=True) - - def forward(self, audio_waveforms, sr = 16000, convert_to_freq = True, return_activations = False): - batch_size = audio_waveforms.size(0) if audio_waveforms.ndim == 2 else None - x = self.preprocessor(audio_waveforms, sr=sr).flatten(0, 1) - - energy = x.mul_(math.log(10) / 10.).exp().squeeze_(1) - vol = energy.sum(dim=-1) - confidence = self.confidence(energy) - - x = self.crop_cqt(x) - activations = self.encoder(x) - - if batch_size is None: confidence.squeeze_(0) - else: - activations = activations.view(batch_size, -1, activations.size(-1)) - confidence = confidence.view(batch_size, -1) - vol = vol.view(batch_size, -1) - - activations = activations.roll(-(self.shift * self.bins_per_semitone).round().int().item(), -1) - preds = self.reduce_activations(activations) - if convert_to_freq: preds = 440 * 2 ** ((preds - 69) / 12) - - if return_activations: return preds, confidence, vol, activations - return preds, confidence - - @property - def bins_per_semitone(self): - return self.preprocessor.hcqt_kwargs["bins_per_semitone"] - - @property - def hop_size(self): - return self.preprocessor.hop_size - - def reduce_activations(self, activations): - device = activations.device - num_bins = 
class Resnet1d(torch.nn.Module):
    """1-D convolutional encoder ending in a Toeplitz linear layer and a softmax.

    Layout: LayerNorm -> prefilter conv (+ ``n_prefilt_layers - 1`` further
    prefilter convs, optionally residual) -> stack of 1x1 convs following
    ``n_chan_layers`` -> flatten -> ``ToeplitzLinear`` -> softmax over the
    last dimension.
    """

    def __init__(self, n_chan_input=1, n_chan_layers=(20, 20, 10, 1), n_prefilt_layers=1, prefilt_kernel_size=15, residual=False, n_bins_in=216, output_dim=128, activation_fn = "leaky", a_lrelu=0.3, p_dropout=0.2, **unused):
        """Build the network.

        Args:
            n_chan_input: Number of input channels.
            n_chan_layers: Channel widths of the conv stack. If fewer than
                five are given, a trailing width of 1 is added. The
                caller's sequence is never modified.
            n_prefilt_layers: Total number of prefilter convolutions
                (``conv1`` counts as the first).
            prefilt_kernel_size: Kernel size of the prefilter convolutions.
            residual: Add a skip connection around each extra prefilter layer.
            n_bins_in: Number of input frequency bins.
            output_dim: Size of the output distribution.
            activation_fn: ``"relu"``, ``"silu"`` or ``"leaky"``.
            a_lrelu: Negative slope used by the leaky ReLU.
            p_dropout: Dropout probability after every activation.
            **unused: Extra keys are accepted and ignored.

        Raises:
            ValueError: If ``activation_fn`` is not one of the supported names.
        """
        super(Resnet1d, self).__init__()
        self.hparams = dict(n_chan_input=n_chan_input, n_chan_layers=n_chan_layers, n_prefilt_layers=n_prefilt_layers, prefilt_kernel_size=prefilt_kernel_size, residual=residual, n_bins_in=n_bins_in, output_dim=output_dim, activation_fn=activation_fn, a_lrelu=a_lrelu, p_dropout=p_dropout)

        if activation_fn == "relu":
            activation_layer = torch.nn.ReLU
        elif activation_fn == "silu":
            activation_layer = torch.nn.SiLU
        elif activation_fn == "leaky":
            activation_layer = partial(torch.nn.LeakyReLU, negative_slope=a_lrelu)
        else:
            raise ValueError

        n_in = n_chan_input
        # Work on a private list: the default is a tuple (no .append) and a
        # caller-supplied list must not be mutated behind the caller's back.
        n_ch = list(n_chan_layers)
        if len(n_ch) < 5:
            n_ch.append(1)

        self.layernorm = torch.nn.LayerNorm(normalized_shape=[n_in, n_bins_in])
        prefilt_padding = prefilt_kernel_size // 2

        self.conv1 = torch.nn.Sequential(torch.nn.Conv1d(in_channels=n_in, out_channels=n_ch[0], kernel_size=prefilt_kernel_size, padding=prefilt_padding, stride=1), activation_layer(), torch.nn.Dropout(p=p_dropout))
        self.n_prefilt_layers = n_prefilt_layers
        self.prefilt_layers = torch.nn.ModuleList([torch.nn.Sequential(torch.nn.Conv1d(in_channels=n_ch[0], out_channels=n_ch[0], kernel_size=prefilt_kernel_size, padding=prefilt_padding, stride=1), activation_layer(), torch.nn.Dropout(p=p_dropout)) for _ in range(n_prefilt_layers - 1)])
        self.residual = residual

        conv_layers = []
        # One 1x1 convolution per consecutive pair of channel widths.
        for c_in, c_out in zip(n_ch[:-1], n_ch[1:]):
            conv_layers.extend([torch.nn.Conv1d(in_channels=c_in, out_channels=c_out, kernel_size=1, padding=0, stride=1), activation_layer(), torch.nn.Dropout(p=p_dropout)])

        self.conv_layers = torch.nn.Sequential(*conv_layers)
        self.flatten = torch.nn.Flatten(start_dim=1)
        self.fc = ToeplitzLinear(n_bins_in * n_ch[-1], output_dim)
        self.final_norm = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the softmax-normalised activations for ``x``."""
        x = self.conv1(self.layernorm(x))

        for p in range(0, self.n_prefilt_layers - 1):
            prefilt_layer = self.prefilt_layers[p]
            x = prefilt_layer(x) + x if self.residual else prefilt_layer(x)

        return self.final_norm(self.fc(self.flatten(self.conv_layers(x))))
class ToLogMagnitude(torch.nn.Module):
    """Convert a spectrogram to ``20 * log10(magnitude)``.

    An input whose last dimension has size 2 is treated as a (real,
    imaginary) pair along that dimension; any other input goes through
    ``abs()``.  Magnitudes are clamped to float32 machine epsilon before the
    logarithm.
    """

    def __init__(self):
        super().__init__()
        self.eps = torch.finfo(torch.float32).eps

    def forward(self, x):
        if x.shape[-1] == 2:
            real, imag = x[..., 0], x[..., 1]
            magnitude = (real ** 2 + imag ** 2).sqrt()
        else:
            magnitude = x.abs()

        # `magnitude` is a fresh tensor, so the in-place ops never touch `x`.
        magnitude.clamp_(min=self.eps)
        magnitude.log10_()
        magnitude.mul_(20)

        return magnitude
def create_cqt_kernels(self, Q, fs, fmin, n_bins=84, bins_per_octave=12, norm=1, window="hann", fmax=None, topbin_check=True, gamma=0):
    """Build the time-domain CQT kernels.

    Args:
        Q: Quality factor used to derive each kernel length.
        fs: Sampling rate.
        fmin: Frequency of the lowest bin.
        n_bins: Number of bins. Only honoured when ``fmax`` is ``None``;
            otherwise the count is derived from ``fmax``.
        bins_per_octave: Bins per octave.
        norm: Order passed to ``np.linalg.norm`` to normalise each kernel;
            a falsy value disables normalisation.
        window: A window name, or a ``("gaussian", attenuation)`` tuple.
        fmax: Optional highest frequency, used to derive the bin count.
        topbin_check: When true, reject kernels above the Nyquist frequency.
        gamma: Offset added to the bin frequencies when computing lengths.

    Returns:
        ``(kernels, fft_len, lengths, freqs)``: a complex64 array of shape
        ``(n_bins, fft_len)``, the power-of-two kernel width, the kernel
        lengths as a float32 tensor, and the bin frequencies.

    Raises:
        ValueError: If the top bin exceeds ``fs / 2`` while
            ``topbin_check`` is on, or if ``window`` is not supported.
    """
    # n_bins is taken as given only when no fmax is supplied; every other
    # combination derives the bin count from fmax.
    if fmax is not None or n_bins is None:
        n_bins = np.ceil(bins_per_octave * np.log2(fmax / fmin))

    freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.double(bins_per_octave))

    if np.max(freqs) > fs / 2 and topbin_check == True:
        raise ValueError("the top CQT bin exceeds the Nyquist frequency")

    lengths = np.ceil(Q * fs / (freqs + gamma / (2.0 ** (1.0 / bins_per_octave) - 1.0)))
    # Smallest power of two that holds the longest kernel.
    fftLen = int(2 ** (np.ceil(np.log2(int(max(lengths))))))
    tempKernel = np.zeros((int(n_bins), int(fftLen)), dtype=np.complex64)

    for k in range(0, int(n_bins)):
        length = lengths[k]
        N = int(length)

        # Centre the kernel in the FFT frame; odd lengths start one sample earlier.
        start = int(np.ceil(fftLen / 2.0 - length / 2.0))
        if length % 2 == 1:
            start -= 1

        if isinstance(window, str):
            sig = get_window(window, N, fftbins=True)
        elif isinstance(window, tuple) and window[0] == "gaussian":
            assert window[1] >= 0
            sigma = np.floor(-N / 2 / np.sqrt(-2 * np.log(10 ** (-window[1] / 20))))
            sig = get_window(("gaussian", sigma), N, fftbins=True)
        else:
            # Previously one of these cases fell through with `sig` unbound.
            raise ValueError(f"unsupported window specification: {window!r}")

        sig = sig * np.exp(np.r_[-length // 2: length // 2] * 1j * 2 * np.pi * freqs[k] / fs) / length

        if norm:
            tempKernel[k, start: start + N] = sig / np.linalg.norm(sig, norm)
        else:
            tempKernel[k, start: start + N] = sig

    return tempKernel, fftLen, torch.tensor(lengths).float(), freqs
class CachedConv1d(torch.nn.Conv1d):
    """``Conv1d`` whose left padding is replaced by a cache of previous input.

    On top of the regular ``Conv1d`` arguments the constructor accepts the
    keyword-only options ``max_batch_size``, ``mirror``, ``mirror_fn``
    (``"reflection"``, ``"zeros"`` or ``"refill"``) and
    ``cumulative_delay``.  ``padding`` must be passed by keyword; the
    underlying convolution is always built with ``padding=0`` and the
    requested amount is handled by ``CachedPadding1d`` instead.
    """

    def __init__(self, *args, **kwargs):
        # Strip the options Conv1d does not know about, and read the
        # requested padding, *before* the kwargs reach Conv1d.__init__.
        # Doing it afterwards made Conv1d reject `mirror` /
        # `max_batch_size` and forced the cached padding to 0.
        max_batch_size = kwargs.pop("max_batch_size", 1)
        mirror = kwargs.pop("mirror", 0)
        mirror_fn = kwargs.pop("mirror_fn", "zeros")
        cumulative_delay = kwargs.pop("cumulative_delay", 0)
        padding = kwargs.get("padding", 0)

        kwargs["padding"] = 0
        super(CachedConv1d, self).__init__(*args, **kwargs)

        if isinstance(padding, int):
            r_pad = padding
        elif isinstance(padding, (list, tuple)):
            r_pad = padding[1]
            padding = padding[0] + padding[1]
        else:
            raise TypeError

        s = self.stride[0]
        cd = cumulative_delay

        self.cumulative_delay = (r_pad + ((s - ((r_pad + cd) % s)) % s) + cd) // s
        self.cache = CachedPadding1d(padding, max_batch_size=max_batch_size)

        # Pick the module that pads `mirror` samples on the right-hand side.
        if mirror == 0:
            mirroring_fn = torch.nn.Identity
        elif mirror_fn == "reflection":
            mirroring_fn = torch.nn.ReflectionPad1d
        elif mirror_fn == "zeros":
            mirroring_fn = torch.nn.ZeroPad1d
        elif mirror_fn == "refill":
            mirroring_fn = RefillPad1d
        else:
            mirroring_fn = torch.nn.Identity

        self.mirror = mirroring_fn((0, mirror))

    def forward(self, x):
        """Prepend the cached samples, apply the right-side mirroring, then convolve."""
        return super(CachedConv1d, self).forward(self.mirror(self.cache(x)))
-import numpy as np -import torch.nn.functional as F - -sys.path.append(os.getcwd()) - -from main.library.predictors.RMVPE.mel import MelSpectrogram - -N_MELS, N_CLASS = 128, 360 - -class RMVPE: - def __init__(self, model_path, is_half, device=None, providers=None, onnx=False): - self.onnx = onnx - - if self.onnx: - import onnxruntime as ort - - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 3 - self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers) - else: - from main.library.predictors.RMVPE.e2e import E2E - - model = E2E(4, 1, (2, 2)) - ckpt = torch.load(model_path, map_location="cpu", weights_only=True) - model.load_state_dict(ckpt) - model.eval() - if is_half: model = model.half() - self.model = model.to(device) - - self.is_half = is_half - self.device = device - self.mel_extractor = MelSpectrogram(N_MELS, 16000, 1024, 160, None, 30, 8000).to(device) - cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191 - self.cents_mapping = np.pad(cents_mapping, (4, 4)) - - def mel2hidden(self, mel): - with torch.no_grad(): - n_frames = mel.shape[-1] - n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames - if n_pad > 0: mel = F.pad(mel, (0, n_pad), mode="constant") - - if self.onnx: - hidden = self.model.run( - [self.model.get_outputs()[0].name], - { - self.model.get_inputs()[0].name: mel.cpu().numpy().astype(np.float32) - } - )[0] - else: - hidden = self.model( - mel.half() if self.is_half else mel.float() - ) - - return hidden[:, :n_frames] - - def decode(self, hidden, thred=0.03): - f0 = 10 * (2 ** (self.to_local_average_cents(hidden, thred=thred) / 1200)) - f0[f0 == 10] = 0 - - return f0 - - def infer_from_audio(self, audio, thred=0.03): - hidden = self.mel2hidden(self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True)) - - return self.decode(hidden.squeeze(0).cpu().numpy().astype(np.float32) if not self.onnx else hidden[0], thred=thred) - - def 
class ConvBlockRes(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU stages with a residual connection.

    When the channel count changes, the residual path goes through a 1x1
    convolution (``self.shortcut``); otherwise the input is added directly.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()

        stages = []
        for stage_in in (in_channels, out_channels):
            stages.append(
                nn.Conv2d(
                    in_channels=stage_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                )
            )
            stages.append(nn.BatchNorm2d(out_channels, momentum=momentum))
            stages.append(nn.ReLU())
        self.conv = nn.Sequential(*stages)

        # The projection is only created when the residual cannot be added as-is.
        self.is_shortcut = in_channels != out_channels
        if self.is_shortcut:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x):
        """Return ``conv(x)`` plus the (optionally projected) input."""
        features = self.conv(x)
        residual = self.shortcut(x) if self.is_shortcut else x
        return features + residual
class Intermediate(nn.Module):
    """Chain of non-pooling ``ResEncoderBlock`` stages.

    The first stage maps ``in_channels`` to ``out_channels``; each of the
    remaining ``n_inters - 1`` stages keeps ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        # One entry per stage: the channel count that stage receives.
        stage_inputs = [in_channels] + [out_channels] * (n_inters - 1)
        self.layers = nn.ModuleList(
            [ResEncoderBlock(channels, out_channels, None, n_blocks, momentum) for channels in stage_inputs]
        )

    def forward(self, x):
        """Feed ``x`` through every stage in order and return the result."""
        out = x
        for stage in self.layers:
            out = stage(out)

        return out
class Decoder(nn.Module):
    """Stack of ``ResDecoderBlock`` stages, halving the channel count at each one."""

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()

        channels = in_channels
        for _ in range(n_decoders):
            halved = channels // 2
            self.layers.append(ResDecoderBlock(channels, halved, stride, n_blocks, momentum))
            channels = halved

    def forward(self, x, concat_tensors):
        """Run the stages, pairing stage ``i`` with ``concat_tensors[-1 - i]``."""
        out = x
        for depth, stage in enumerate(self.layers):
            # Skip tensors are consumed from the end of the list backwards.
            skip = concat_tensors[-1 - depth]
            out = stage(out, skip)

        return out
class BiGRU(nn.Module):
    """Bidirectional, batch-first GRU that returns only the output sequence.

    Args:
        input_features: size of each input feature vector.
        hidden_features: hidden size per direction (the output feature size
            is twice this because the GRU is bidirectional).
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_features, hidden_features, num_layers):
        super().__init__()
        self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True)

    def forward(self, x):
        """Return the GRU output sequence for ``x``, retrying once without cuDNN."""
        try:
            return self.gru(x)[0]
        except Exception:
            # Catch ``Exception`` rather than everything: a bare ``except`` would
            # also trap KeyboardInterrupt/SystemExit and then re-run the layer.
            # NOTE: this switch is process-wide and is not restored afterwards.
            torch.backends.cudnn.enabled = False
            return self.gru(x)[0]
n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-5): - super().__init__() - n_fft = win_length if n_fft is None else n_fft - self.hann_window = {} - mel_basis = mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True) - mel_basis = torch.from_numpy(mel_basis).float() - self.register_buffer("mel_basis", mel_basis) - self.n_fft = win_length if n_fft is None else n_fft - self.hop_length = hop_length - self.win_length = win_length - self.sample_rate = sample_rate - self.n_mel_channels = n_mel_channels - self.clamp = clamp - - def forward(self, audio, keyshift=0, speed=1, center=True): - factor = 2 ** (keyshift / 12) - win_length_new = int(np.round(self.win_length * factor)) - keyshift_key = str(keyshift) + "_" + str(audio.device) - if keyshift_key not in self.hann_window: self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) - - n_fft = int(np.round(self.n_fft * factor)) - hop_length = int(np.round(self.hop_length * speed)) - - if str(audio.device).startswith(("ocl", "privateuseone")): - if not hasattr(self, "stft"): - from main.library.backends.utils import STFT - self.stft = STFT(filter_length=n_fft, hop_length=hop_length, win_length=win_length_new).to(audio.device) - magnitude = self.stft.transform(audio, 1e-9) - else: - fft = torch.stft(audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length_new, window=self.hann_window[keyshift_key], center=center, return_complex=True) - magnitude = (fft.real.pow(2) + fft.imag.pow(2)).sqrt() - - if keyshift != 0: - size = self.n_fft // 2 + 1 - resize = magnitude.size(1) - if resize < size: magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) - magnitude = magnitude[:, :size, :] * self.win_length / win_length_new - - mel_output = self.mel_basis @ magnitude - return mel_output.clamp(min=self.clamp).log() \ No newline at end of file diff --git a/main/library/predictors/SWIFT/SWIFT.py b/main/library/predictors/SWIFT/SWIFT.py deleted file mode 100644 index 
class SWIFT:
    """Pitch detector that runs a SWIFT ONNX model through ONNX Runtime.

    Args:
        model_path: path to the ONNX model file.
        fmin: lowest pitch (Hz) accepted as voiced, inclusive.
        fmax: highest pitch (Hz) accepted as voiced, inclusive.
        confidence_threshold: a frame is voiced only if its confidence is
            strictly greater than this value.
        providers: ONNX Runtime execution providers; ``None`` selects
            ``["CPUExecutionProvider"]``.
    """

    def __init__(self, model_path, fmin = 50, fmax = 1100, confidence_threshold = 0.9, providers = None):
        # Build the default per call instead of sharing one list object
        # between every instance through the signature.
        if providers is None: providers = ["CPUExecutionProvider"]

        self.fmin = fmin
        self.fmax = fmax
        self.confidence_threshold = confidence_threshold

        session_options = onnxruntime.SessionOptions()
        session_options.inter_op_num_threads = 1
        session_options.intra_op_num_threads = 1

        self.pitch_session = onnxruntime.InferenceSession(model_path, session_options, providers=providers)
        self.pitch_input_name = self.pitch_session.get_inputs()[0].name

    def _extract_pitch_and_confidence(self, audio_16k):
        """Run the model on a 1-D signal and return ``(pitch, confidence)``.

        Raises:
            ValueError: if ``audio_16k`` is not one-dimensional or is empty.
            RuntimeError: if the model yields fewer than two outputs.
        """
        if audio_16k.ndim != 1 or len(audio_16k) == 0:
            raise ValueError("audio_16k must be a non-empty 1-D array")

        # Inputs shorter than 256 samples are zero-padded on the right.
        if len(audio_16k) < 256:
            audio_16k = np.pad(audio_16k, (0, 256 - len(audio_16k)), mode="constant")

        outputs = self.pitch_session.run(None, {self.pitch_input_name: audio_16k[None, :].astype(np.float32)})
        if len(outputs) < 2:
            raise RuntimeError("pitch model must return pitch and confidence outputs")

        # Drop the batch dimension that was added for the session call.
        return outputs[0][0], outputs[1][0]

    def _compute_voicing(self, pitch_hz, confidence):
        """Return a boolean mask of frames that are confident and within ``[fmin, fmax]``."""
        return (confidence > self.confidence_threshold) & (pitch_hz >= self.fmin) & (pitch_hz <= self.fmax)

    def _calculate_timestamps(self, n_frames):
        """Return the time in seconds assigned to each of ``n_frames`` frames."""
        frame_centers = np.arange(n_frames) * HOP_LENGTH + ((FRAME_LENGTH - 1) / 2 - ((FRAME_LENGTH - HOP_LENGTH) // 2))
        return frame_centers / SAMPLE_RATE

    def detect_from_array(self, audio_array, sample_rate=SAMPLE_RATE):
        """Return ``(pitch_hz, voicing_mask, timestamps)`` for ``audio_array``.

        Arrays with more than one dimension are averaged over their last axis;
        audio at a rate other than ``SAMPLE_RATE`` is resampled first.
        """
        if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=-1)

        if sample_rate != SAMPLE_RATE:
            audio_16k = librosa.resample(audio_array.astype(np.float32), orig_sr=sample_rate, target_sr=SAMPLE_RATE)
        else:
            audio_16k = audio_array

        pitch_hz, confidence = self._extract_pitch_and_confidence(audio_16k)

        return pitch_hz, self._compute_voicing(pitch_hz, confidence), self._calculate_timestamps(len(pitch_hz))
def sieve(n):
    """Return every prime ``p`` with ``2 <= p <= n`` in ascending order.

    Returns an empty list when ``n < 2`` (including negative ``n``).
    """
    if n < 2:
        return []

    is_prime = [True] * (n + 1)
    is_prime[0] = is_prime[1] = False

    # The bound must include floor(sqrt(n)): stopping one short leaves the
    # square of a prime in the result when it equals n (e.g. 9 for n == 9).
    for num in range(2, math.isqrt(n) + 1):
        if is_prime[num]:
            for multiple in range(num * num, n + 1, num):
                is_prime[multiple] = False

    return [value for value, flag in enumerate(is_prime) if flag]
@nb.jit((nb.float64[:],), nopython=True, cache=True)
def round_matlab_2(x):
    """Return a copy of ``x`` with every element pushed 0.5 away from zero.

    Elements greater than zero get ``+0.5``; all others (including zero) get
    ``-0.5``. No truncation is done here, so the result is only a rounded
    value once the caller converts it to an integer type.
    The jit signature restricts ``x`` to a 1-D float64 array.
    """
    # Work on a copy so the caller's array is left untouched.
    y = x.copy()

    # Masks are taken from the original ``x``, so the two updates are independent.
    y[x > 0] += 0.5
    y[x <= 0] -= 0.5

    return y
fs, f0_floor=50, f0_ceil=1100, frame_period=10): - self.world_dll.Harvest.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(HarvestOption), ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double)] - self.world_dll.Harvest.restype = None - self.world_dll.InitializeHarvestOption.argtypes = [ctypes.POINTER(HarvestOption)] - self.world_dll.InitializeHarvestOption.restype = None - self.world_dll.GetSamplesForHarvest.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_double] - self.world_dll.GetSamplesForHarvest.restype = ctypes.c_int - - option = HarvestOption() - self.world_dll.InitializeHarvestOption(ctypes.byref(option)) - - option.F0Floor = f0_floor - option.F0Ceil = f0_ceil - option.FramePeriod = frame_period - - f0_length = self.world_dll.GetSamplesForHarvest(fs, len(x), option.FramePeriod) - f0 = (ctypes.c_double * f0_length)() - tpos = (ctypes.c_double * f0_length)() - - self.world_dll.Harvest((ctypes.c_double * len(x))(*x), len(x), fs, ctypes.byref(option), tpos, f0) - return np.array(f0, dtype=np.float32), np.array(tpos, dtype=np.float32) - - def dio(self, x, fs, f0_floor=50, f0_ceil=1100, channels_in_octave=2, frame_period=10, speed=1, allowed_range=0.1): - self.world_dll.Dio.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(DioOption), ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double)] - self.world_dll.Dio.restype = None - self.world_dll.InitializeDioOption.argtypes = [ctypes.POINTER(DioOption)] - self.world_dll.InitializeDioOption.restype = None - self.world_dll.GetSamplesForDIO.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_double] - self.world_dll.GetSamplesForDIO.restype = ctypes.c_int - - option = DioOption() - self.world_dll.InitializeDioOption(ctypes.byref(option)) - - option.F0Floor = f0_floor - option.F0Ceil = f0_ceil - option.ChannelsInOctave = channels_in_octave - option.FramePeriod = frame_period - option.Speed = speed - option.AllowedRange = 
def length_to_mask(length, max_len=None, dtype=None, device=None):
    """Build a ``(len(length), max_len)`` mask that is set where ``position < length``.

    Args:
        length: 1-D tensor of per-row lengths.
        max_len: number of mask columns; defaults to the largest length.
        dtype: dtype of the result; defaults to ``length.dtype``.
        device: device of the result; defaults to ``length.device``.
    """
    assert length.dim() == 1

    if max_len is None:
        max_len = length.max().long().item()

    # Row-wise comparison of column positions against each row's length.
    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    mask = positions.expand(length.size(0), max_len) < length.unsqueeze(1)

    out_dtype = length.dtype if dtype is None else dtype
    out_device = length.device if device is None else device

    return torch.as_tensor(mask, dtype=out_dtype, device=out_device)
[math.floor((L_in - L_out) / 2), math.floor((L_in - L_out) / 2)] - - return padding - -class _BatchNorm1d(nn.Module): - def __init__(self, input_shape=None, input_size=None, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True, combine_batch_time=False, skip_transpose=False): - super().__init__() - self.combine_batch_time = combine_batch_time - self.skip_transpose = skip_transpose - - if input_size is None and skip_transpose: input_size = input_shape[1] - elif input_size is None: input_size = input_shape[-1] - - self.norm = nn.BatchNorm1d(input_size, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) - - def forward(self, x): - shape_or = x.shape - - if self.combine_batch_time:x = x.reshape(shape_or[0] * shape_or[1], shape_or[2]) if x.ndim == 3 else x.reshape(shape_or[0] * shape_or[1], shape_or[3], shape_or[2]) - elif not self.skip_transpose: x = x.transpose(-1, 1) - - x_n = self.norm(x) - - if self.combine_batch_time: x_n = x_n.reshape(shape_or) - elif not self.skip_transpose: x_n = x_n.transpose(1, -1) - - return x_n - -class _Conv1d(nn.Module): - def __init__(self, out_channels, kernel_size, input_shape=None, in_channels=None, stride=1, dilation=1, padding="same", groups=1, bias=True, padding_mode="reflect", skip_transpose=False, weight_norm=False, conv_init=None, default_padding=0): - super().__init__() - self.kernel_size = kernel_size - self.stride = stride - self.dilation = dilation - self.padding = padding - self.padding_mode = padding_mode - self.unsqueeze = False - self.skip_transpose = skip_transpose - - if input_shape is None and in_channels is None: raise ValueError - if in_channels is None: in_channels = self._check_input_shape(input_shape) - - self.in_channels = in_channels - self.conv = nn.Conv1d(in_channels, out_channels, self.kernel_size, stride=self.stride, dilation=self.dilation, padding=default_padding, groups=groups, bias=bias) - - if conv_init == "kaiming": nn.init.kaiming_normal_(self.conv.weight) - 
class Linear(torch.nn.Module):
    """``nn.Linear`` wrapper with optional weight-norm capping and 4-D input flattening.

    Args:
        n_neurons: number of output features.
        input_shape: expected input shape, used to infer the input size when
            ``input_size`` is not given.
        input_size: number of input features.
        bias: whether the linear layer has a bias term.
        max_norm: if set, weights are renormalized to this L2 norm on every
            forward call.
        combine_dims: if set, the last two axes of 4-D inputs are merged.

    Raises:
        ValueError: if neither ``input_shape`` nor ``input_size`` is provided.
    """

    def __init__(self, n_neurons, input_shape=None, input_size=None, bias=True, max_norm=None, combine_dims=False):
        super().__init__()
        self.max_norm = max_norm
        self.combine_dims = combine_dims

        if input_size is None:
            if input_shape is None:
                raise ValueError
            if len(input_shape) == 4 and self.combine_dims:
                input_size = input_shape[2] * input_shape[3]
            else:
                input_size = input_shape[-1]

        self.w = nn.Linear(input_size, n_neurons, bias=bias)

    def forward(self, x):
        """Apply the linear layer to ``x`` (after optional flattening/renorm)."""
        if self.combine_dims and x.ndim == 4:
            batch, steps, dim_a, dim_b = x.shape
            x = x.reshape(batch, steps, dim_a * dim_b)

        if self.max_norm is not None:
            capped = torch.renorm(self.w.weight.data, p=2, dim=0, maxnorm=self.max_norm)
            self.w.weight.data = capped

        return self.w(x)
lengths=None): - L = x.shape[-1] - - if lengths is not None: - mask = length_to_mask(lengths * L, max_len=L, device=x.device).unsqueeze(1) - s = (x * mask).sum(dim=2, keepdim=True) / mask.sum(dim=2, keepdim=True) - else: s = x.mean(dim=2, keepdim=True) - - return self.sigmoid(self.conv2(self.relu(self.conv1(s)))) * x - -class AttentiveStatisticsPooling(nn.Module): - def __init__(self, channels, attention_channels=128, global_context=True): - super().__init__() - self.eps = 1e-12 - self.global_context = global_context - self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) if global_context else TDNNBlock(channels, attention_channels, 1, 1) - self.tanh = nn.Tanh() - self.conv = Conv1d(in_channels=attention_channels, out_channels=channels, kernel_size=1) - - def forward(self, x, lengths=None): - L = x.shape[-1] - - def _compute_statistics(x, m, dim=2, eps=self.eps): - mean = (m * x).sum(dim) - return mean, ((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)).sqrt() - - if lengths is None: lengths = torch.ones(x.shape[0], device=x.device) - mask = length_to_mask(lengths * L, max_len=L, device=x.device).unsqueeze(1) - - if self.global_context: - mean, std = _compute_statistics(x, mask / mask.sum(dim=2, keepdim=True).float()) - attn = torch.cat([x, mean.unsqueeze(2).repeat(1, 1, L), std.unsqueeze(2).repeat(1, 1, L)], dim=1) - else: attn = x - - mean, std = _compute_statistics(x, F.softmax(self.conv(self.tanh(self.tdnn(attn))).masked_fill(mask == 0, float("-inf")), dim=2)) - return torch.cat((mean, std), dim=1).unsqueeze(2) - -class SERes2NetBlock(nn.Module): - def __init__(self, in_channels, out_channels, res2net_scale=8, se_channels=128, kernel_size=1, dilation=1, activation=torch.nn.ReLU, groups=1, dropout=0.0): - super().__init__() - self.out_channels = out_channels - self.tdnn1 = TDNNBlock(in_channels, out_channels, kernel_size=1, dilation=1, activation=activation, groups=groups, dropout=dropout) - self.res2net_block = Res2NetBlock(out_channels, 
out_channels, res2net_scale, kernel_size, dilation) - self.tdnn2 = TDNNBlock(out_channels, out_channels, kernel_size=1, dilation=1, activation=activation, groups=groups, dropout=dropout) - self.se_block = SEBlock(out_channels, se_channels, out_channels) - - self.shortcut = None - if in_channels != out_channels: self.shortcut = Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) - - def forward(self, x, lengths=None): - residual = x - if self.shortcut: residual = self.shortcut(x) - - return self.se_block(self.tdnn2(self.res2net_block(self.tdnn1(x))), lengths) + residual - -class ECAPA_TDNN(torch.nn.Module): - def __init__(self, input_size, device="cpu", lin_neurons=192, activation=torch.nn.ReLU, channels=[512, 512, 512, 512, 1536], kernel_sizes=[5, 3, 3, 3, 1], dilations=[1, 2, 3, 4, 1], attention_channels=128, res2net_scale=8, se_channels=128, global_context=True, groups=[1, 1, 1, 1, 1], dropout=0.0): - super().__init__() - assert len(channels) == len(kernel_sizes) - assert len(channels) == len(dilations) - - self.channels = channels - self.blocks = nn.ModuleList() - self.blocks.append(TDNNBlock(input_size, channels[0], kernel_sizes[0], dilations[0], activation, groups[0], dropout)) - - for i in range(1, len(channels) - 1): - self.blocks.append(SERes2NetBlock(channels[i - 1], channels[i], res2net_scale=res2net_scale, se_channels=se_channels, kernel_size=kernel_sizes[i], dilation=dilations[i], activation=activation, groups=groups[i], dropout=dropout)) - - self.mfa = TDNNBlock(channels[-2] * (len(channels) - 2), channels[-1], kernel_sizes[-1], dilations[-1], activation, groups=groups[-1], dropout=dropout) - self.asp = AttentiveStatisticsPooling(channels[-1], attention_channels=attention_channels, global_context=global_context) - self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) - self.fc = Conv1d(in_channels=channels[-1] * 2, out_channels=lin_neurons, kernel_size=1) - - def forward(self, x, lengths=None): - x = x.transpose(1, 2) - xl = [] - 
- for layer in self.blocks: - try: - x = layer(x, lengths=lengths) - except TypeError: - x = layer(x) - - xl.append(x) - - return self.fc(self.asp_bn(self.asp(self.mfa(torch.cat(xl[1:], dim=1)), lengths=lengths))).transpose(1, 2) - -class Classifier(torch.nn.Module): - def __init__(self, input_size, device="cpu", lin_blocks=0, lin_neurons=192, out_neurons=1211): - super().__init__() - self.blocks = nn.ModuleList() - - for _ in range(lin_blocks): - self.blocks.extend([_BatchNorm1d(input_size=input_size), Linear(input_size=input_size, n_neurons=lin_neurons)]) - input_size = lin_neurons - - self.weight = nn.Parameter(torch.FloatTensor(out_neurons, input_size, device=device)) - nn.init.xavier_uniform_(self.weight) - - def forward(self, x): - for layer in self.blocks: - x = layer(x) - - return F.linear(F.normalize(x.squeeze(1)), F.normalize(self.weight)).unsqueeze(1) \ No newline at end of file diff --git a/main/library/speaker_diarization/audio.py b/main/library/speaker_diarization/audio.py deleted file mode 100644 index 04dc98ec6ad127d75c2f60591b4873aeba5ea760..0000000000000000000000000000000000000000 --- a/main/library/speaker_diarization/audio.py +++ /dev/null @@ -1,170 +0,0 @@ -import os -import math -import torch -import random -import torchaudio - -from io import IOBase - -def get_torchaudio_info(file, backend = None): - if not backend: - backends = (torchaudio.list_audio_backends()) - backend = "soundfile" if "soundfile" in backends else backends[0] - - info = torchaudio.info(file["audio"], backend=backend) - if isinstance(file["audio"], IOBase): file["audio"].seek(0) - - return info - -class Audio: - @staticmethod - def power_normalize(waveform): - return waveform / (waveform.square().mean(dim=-1, keepdim=True).sqrt() + 1e-8) - - @staticmethod - def validate_file(file): - if isinstance(file, (str, os.PathLike)): file = {"audio": str(file), "uri": os.path.splitext(os.path.basename(file))[0]} - elif isinstance(file, IOBase): return {"audio": file, "uri": 
"stream"} - else: raise ValueError - - if "waveform" in file: - waveform = file["waveform"] - if len(waveform.shape) != 2 or waveform.shape[0] > waveform.shape[1]: raise ValueError - - sample_rate = file.get("sample_rate", None) - if sample_rate is None: raise ValueError - - file.setdefault("uri", "waveform") - - elif "audio" in file: - if isinstance(file["audio"], IOBase): return file - - path = os.path.abspath(file["audio"]) - file.setdefault("uri", os.path.splitext(os.path.basename(path))[0]) - - else: raise ValueError - - return file - - def __init__(self, sample_rate = None, mono=None, backend = None): - super().__init__() - self.sample_rate = sample_rate - self.mono = mono - - if not backend: - backends = (torchaudio.list_audio_backends()) - backend = "soundfile" if "soundfile" in backends else backends[0] - - self.backend = backend - - def downmix_and_resample(self, waveform, sample_rate): - num_channels = waveform.shape[0] - - if num_channels > 1: - if self.mono == "random": - channel = random.randint(0, num_channels - 1) - waveform = waveform[channel : channel + 1] - elif self.mono == "downmix": waveform = waveform.mean(dim=0, keepdim=True) - - if (self.sample_rate is not None) and (self.sample_rate != sample_rate): - waveform = torchaudio.functional.resample(waveform, sample_rate, self.sample_rate) - sample_rate = self.sample_rate - - return waveform, sample_rate - - def get_duration(self, file): - file = self.validate_file(file) - - if "waveform" in file: - frames = len(file["waveform"].T) - sample_rate = file["sample_rate"] - else: - info = file["torchaudio.info"] if "torchaudio.info" in file else get_torchaudio_info(file, backend=self.backend) - frames = info.num_frames - sample_rate = info.sample_rate - - return frames / sample_rate - - def get_num_samples(self, duration, sample_rate = None): - sample_rate = sample_rate or self.sample_rate - if sample_rate is None: raise ValueError - - return math.floor(duration * sample_rate) - - def __call__(self, 
file): - file = self.validate_file(file) - - if "waveform" in file: - waveform = file["waveform"] - sample_rate = file["sample_rate"] - elif "audio" in file: - waveform, sample_rate = torchaudio.load(file["audio"], backend=self.backend) - if isinstance(file["audio"], IOBase): file["audio"].seek(0) - - channel = file.get("channel", None) - if channel is not None: waveform = waveform[channel : channel + 1] - - return self.downmix_and_resample(waveform, sample_rate) - - def crop(self, file, segment, duration = None, mode="raise"): - file = self.validate_file(file) - - if "waveform" in file: - waveform = file["waveform"] - frames = waveform.shape[1] - sample_rate = file["sample_rate"] - elif "torchaudio.info" in file: - info = file["torchaudio.info"] - frames = info.num_frames - sample_rate = info.sample_rate - else: - info = get_torchaudio_info(file, backend=self.backend) - frames = info.num_frames - sample_rate = info.sample_rate - - channel = file.get("channel", None) - start_frame = math.floor(segment.start * sample_rate) - - if duration: - num_frames = math.floor(duration * sample_rate) - end_frame = start_frame + num_frames - else: - end_frame = math.floor(segment.end * sample_rate) - num_frames = end_frame - start_frame - - if mode == "raise": - if num_frames > frames: raise ValueError - - if end_frame > frames + math.ceil(0.001 * sample_rate): raise ValueError - else: - end_frame = min(end_frame, frames) - start_frame = end_frame - num_frames - - if start_frame < 0: raise ValueError - elif mode == "pad": - pad_start = -min(0, start_frame) - pad_end = max(end_frame, frames) - frames - - start_frame = max(0, start_frame) - end_frame = min(end_frame, frames) - - num_frames = end_frame - start_frame - - if "waveform" in file: data = file["waveform"][:, start_frame:end_frame] - else: - try: - data, _ = torchaudio.load(file["audio"], frame_offset=start_frame, num_frames=num_frames, backend=self.backend) - if isinstance(file["audio"], IOBase): file["audio"].seek(0) - 
except RuntimeError: - if isinstance(file["audio"], IOBase): raise RuntimeError - - waveform, sample_rate = self.__call__(file) - data = waveform[:, start_frame:end_frame] - - file["waveform"] = waveform - file["sample_rate"] = sample_rate - - if channel is not None: data = data[channel : channel + 1, :] - if mode == "pad": data = torch.nn.functional.pad(data, (pad_start, pad_end)) - - return self.downmix_and_resample(data, sample_rate) \ No newline at end of file diff --git a/main/library/speaker_diarization/embedding.py b/main/library/speaker_diarization/embedding.py deleted file mode 100644 index 6d7bd6eea9560d142430fc1ccafccaa77c142695..0000000000000000000000000000000000000000 --- a/main/library/speaker_diarization/embedding.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import sys -import torch - -import numpy as np -import torch.nn.functional as F - -from functools import cached_property -from torch.nn.utils.rnn import pad_sequence - -sys.path.append(os.getcwd()) - -from main.library.speaker_diarization.speechbrain import EncoderClassifier - -class SpeechBrainPretrainedSpeakerEmbedding: - def __init__(self, embedding, device = None): - super().__init__() - - self.embedding = embedding - self.device = device or torch.device("cpu") - self.classifier_ = EncoderClassifier.from_hparams(source=self.embedding, run_opts={"device": self.device}) - - @cached_property - def dimension(self): - *_, dimension = self.classifier_.encode_batch(torch.rand(1, 16000).to(self.device)).shape - return dimension - - @cached_property - def min_num_samples(self): - with torch.inference_mode(): - lower, upper = 2, round(0.5 * self.classifier_.audio_normalizer.sample_rate) - middle = (lower + upper) // 2 - - while lower + 1 < upper: - try: - _ = self.classifier_.encode_batch(torch.randn(1, middle).to(self.device)) - upper = middle - except RuntimeError: - lower = middle - - middle = (lower + upper) // 2 - - return upper - - def __call__(self, waveforms, masks = None): - batch_size, 
num_channels, num_samples = waveforms.shape - assert num_channels == 1 - - waveforms = waveforms.squeeze(dim=1) - - if masks is None: - signals = waveforms.squeeze(dim=1) - wav_lens = signals.shape[1] * torch.ones(batch_size) - else: - batch_size_masks, _ = masks.shape - assert batch_size == batch_size_masks - - imasks = F.interpolate(masks.unsqueeze(dim=1), size=num_samples, mode="nearest").squeeze(dim=1) > 0.5 - signals = pad_sequence([waveform[imask].contiguous() for waveform, imask in zip(waveforms, imasks)], batch_first=True) - wav_lens = imasks.sum(dim=1) - - max_len = wav_lens.max() - if max_len < self.min_num_samples: return np.nan * np.zeros((batch_size, self.dimension)) - - too_short = wav_lens < self.min_num_samples - wav_lens = wav_lens / max_len - wav_lens[too_short] = 1.0 - - embeddings = (self.classifier_.encode_batch(signals, wav_lens=wav_lens).squeeze(dim=1).cpu().numpy()) - embeddings[too_short.cpu().numpy()] = np.nan - - return embeddings \ No newline at end of file diff --git a/main/library/speaker_diarization/encoder.py b/main/library/speaker_diarization/encoder.py deleted file mode 100644 index 8c2c866debc4b64d8457463f24350a1dd7a7ba07..0000000000000000000000000000000000000000 --- a/main/library/speaker_diarization/encoder.py +++ /dev/null @@ -1,245 +0,0 @@ -import os -import sys -import ast -import torch -import itertools -import collections - -sys.path.append(os.getcwd()) - -from main.library.speaker_diarization.speechbrain import if_main_process, ddp_barrier -from main.library.speaker_diarization.features import register_checkpoint_hooks, mark_as_saver, mark_as_loader - -@register_checkpoint_hooks -class CategoricalEncoder: - VALUE_SEPARATOR = " => " - EXTRAS_SEPARATOR = "================\n" - - def __init__(self, starting_index=0, **special_labels): - self.lab2ind = {} - self.ind2lab = {} - self.starting_index = starting_index - self.handle_special_labels(special_labels) - - def handle_special_labels(self, special_labels): - if "unk_label" 
in special_labels: self.add_unk(special_labels["unk_label"]) - - def __len__(self): - return len(self.lab2ind) - - @classmethod - def from_saved(cls, path): - obj = cls() - obj.load(path) - return obj - - def update_from_iterable(self, iterable, sequence_input=False): - label_iterator = itertools.chain.from_iterable(iterable) if sequence_input else iter(iterable) - for label in label_iterator: - self.ensure_label(label) - - def update_from_didataset(self, didataset, output_key, sequence_input=False): - with didataset.output_keys_as([output_key]): - self.update_from_iterable((data_point[output_key] for data_point in didataset), sequence_input=sequence_input) - - def limited_labelset_from_iterable(self, iterable, sequence_input=False, n_most_common=None, min_count=1): - label_iterator = itertools.chain.from_iterable(iterable) if sequence_input else iter(iterable) - counts = collections.Counter(label_iterator) - - for label, count in counts.most_common(n_most_common): - if count < min_count: break - self.add_label(label) - - return counts - - def load_or_create(self, path, from_iterables=[], from_didatasets=[], sequence_input=False, output_key=None, special_labels={}): - try: - if if_main_process(): - if not self.load_if_possible(path): - for iterable in from_iterables: - self.update_from_iterable(iterable, sequence_input) - - for didataset in from_didatasets: - if output_key is None: raise ValueError - self.update_from_didataset(didataset, output_key, sequence_input) - - self.handle_special_labels(special_labels) - self.save(path) - finally: - ddp_barrier() - self.load(path) - - def add_label(self, label): - if label in self.lab2ind: raise KeyError - index = self._next_index() - - self.lab2ind[label] = index - self.ind2lab[index] = label - - return index - - def ensure_label(self, label): - if label in self.lab2ind: return self.lab2ind[label] - else: return self.add_label(label) - - def insert_label(self, label, index): - if label in self.lab2ind: raise KeyError - 
else: self.enforce_label(label, index) - - def enforce_label(self, label, index): - index = int(index) - - if label in self.lab2ind: - if index == self.lab2ind[label]: return - else: del self.ind2lab[self.lab2ind[label]] - - if index in self.ind2lab: - saved_label = self.ind2lab[index] - moving_other = True - else: moving_other = False - - self.lab2ind[label] = index - self.ind2lab[index] = label - - if moving_other: - new_index = self._next_index() - self.lab2ind[saved_label] = new_index - self.ind2lab[new_index] = saved_label - - def add_unk(self, unk_label=""): - self.unk_label = unk_label - return self.add_label(unk_label) - - def _next_index(self): - index = self.starting_index - while index in self.ind2lab: - index += 1 - - return index - - def is_continuous(self): - indices = sorted(self.ind2lab.keys()) - return self.starting_index in indices and all(j - i == 1 for i, j in zip(indices[:-1], indices[1:])) - - def encode_label(self, label, allow_unk=True): - self._assert_len() - - try: - return self.lab2ind[label] - except KeyError: - if hasattr(self, "unk_label") and allow_unk: return self.lab2ind[self.unk_label] - elif hasattr(self, "unk_label") and not allow_unk: raise KeyError - elif not hasattr(self, "unk_label") and allow_unk: raise KeyError - else: raise KeyError - - def encode_label_torch(self, label, allow_unk=True): - return torch.LongTensor([self.encode_label(label, allow_unk)]) - - def encode_sequence(self, sequence, allow_unk=True): - self._assert_len() - return [self.encode_label(label, allow_unk) for label in sequence] - - def encode_sequence_torch(self, sequence, allow_unk=True): - return torch.LongTensor([self.encode_label(label, allow_unk) for label in sequence]) - - def decode_torch(self, x): - self._assert_len() - decoded = [] - - if x.ndim == 1: - for element in x: - decoded.append(self.ind2lab[int(element)]) - else: - for subtensor in x: - decoded.append(self.decode_torch(subtensor)) - - return decoded - - def decode_ndim(self, x): - 
self._assert_len() - try: - decoded = [] - for subtensor in x: - decoded.append(self.decode_ndim(subtensor)) - - return decoded - except TypeError: - return self.ind2lab[int(x)] - - @mark_as_saver - def save(self, path): - self._save_literal(path, self.lab2ind, self._get_extras()) - - def load(self, path): - lab2ind, ind2lab, extras = self._load_literal(path) - self.lab2ind = lab2ind - self.ind2lab = ind2lab - self._set_extras(extras) - - @mark_as_loader - def load_if_possible(self, path, end_of_epoch=False): - del end_of_epoch - - try: - self.load(path) - except FileNotFoundError: - return False - except (ValueError, SyntaxError): - return False - - return True - - def expect_len(self, expected_len): - self.expected_len = expected_len - - def ignore_len(self): - self.expected_len = None - - def _assert_len(self): - if hasattr(self, "expected_len"): - if self.expected_len is None: return - if len(self) != self.expected_len: raise RuntimeError - else: - self.ignore_len() - return - - def _get_extras(self): - extras = {"starting_index": self.starting_index} - if hasattr(self, "unk_label"): extras["unk_label"] = self.unk_label - - return extras - - def _set_extras(self, extras): - if "unk_label" in extras: self.unk_label = extras["unk_label"] - self.starting_index = extras["starting_index"] - - @staticmethod - def _save_literal(path, lab2ind, extras): - with open(path, "w", encoding="utf-8") as f: - for label, ind in lab2ind.items(): - f.write(repr(label) + CategoricalEncoder.VALUE_SEPARATOR + str(ind) + "\n") - - f.write(CategoricalEncoder.EXTRAS_SEPARATOR) - - for key, value in extras.items(): - f.write(repr(key) + CategoricalEncoder.VALUE_SEPARATOR + repr(value) + "\n") - - f.flush() - - @staticmethod - def _load_literal(path): - lab2ind, ind2lab, extras = {}, {}, {} - - with open(path, encoding="utf-8") as f: - for line in f: - if line == CategoricalEncoder.EXTRAS_SEPARATOR: break - literal, ind = line.strip().split(CategoricalEncoder.VALUE_SEPARATOR, maxsplit=1) 
- label = ast.literal_eval(literal) - lab2ind[label] = int(ind) - ind2lab[ind] = label - - for line in f: - literal_key, literal_value = line.strip().split(CategoricalEncoder.VALUE_SEPARATOR, maxsplit=1) - extras[ast.literal_eval(literal_key)] = ast.literal_eval(literal_value) - - return lab2ind, ind2lab, extras \ No newline at end of file diff --git a/main/library/speaker_diarization/features.py b/main/library/speaker_diarization/features.py deleted file mode 100644 index 9ae80b23c4c0ed2080ecb57c174778014fb6acf2..0000000000000000000000000000000000000000 --- a/main/library/speaker_diarization/features.py +++ /dev/null @@ -1,523 +0,0 @@ -import os -import sys -import math -import torch -import inspect -import functools - -sys.path.append(os.getcwd()) - -from main.library.speaker_diarization.speechbrain import MAIN_PROC_ONLY, is_distributed_initialized, main_process_only - -KEYS_MAPPING = {".mutihead_attn": ".multihead_attn", ".convs_intermedite": ".convs_intermediate"} - -def map_old_state_dict_weights(state_dict, mapping): - for replacement_old, replacement_new in mapping.items(): - for old_key in list(state_dict.keys()): - if replacement_old in old_key: state_dict[old_key.replace(replacement_old, replacement_new)] = state_dict.pop(old_key) - - return state_dict - -def hook_on_loading_state_dict_checkpoint(state_dict): - return map_old_state_dict_weights(state_dict, KEYS_MAPPING) - -def torch_patched_state_dict_load(path, device="cpu"): - return hook_on_loading_state_dict_checkpoint(torch.load(path, map_location=device, weights_only=False)) - -@main_process_only -def torch_save(obj, path): - state_dict = obj.state_dict() - torch.save(state_dict, path) - -def torch_recovery(obj, path, end_of_epoch): - del end_of_epoch - - state_dict = torch_patched_state_dict_load(path, "cpu") - try: - obj.load_state_dict(state_dict, strict=True) - except TypeError: - obj.load_state_dict(state_dict) - -def torch_parameter_transfer(obj, path): - incompatible_keys = 
obj.load_state_dict(torch_patched_state_dict_load(path, "cpu"), strict=False) - - for missing_key in incompatible_keys.missing_keys: - pass - for unexpected_key in incompatible_keys.unexpected_keys: - pass - -WEAKREF_MARKER = "WEAKREF" - -def _cycliclrsaver(obj, path): - state_dict = obj.state_dict() - if state_dict.get("_scale_fn_ref") is not None: state_dict["_scale_fn_ref"] = WEAKREF_MARKER - - torch.save(state_dict, path) - -def _cycliclrloader(obj, path, end_of_epoch): - del end_of_epoch - - try: - obj.load_state_dict(torch.load(path, map_location="cpu", weights_only=False), strict=True) - except TypeError: - obj.load_state_dict(torch.load(path, map_location="cpu", weights_only=False)) - -DEFAULT_LOAD_HOOKS = {torch.nn.Module: torch_recovery, torch.optim.Optimizer: torch_recovery, torch.optim.lr_scheduler.ReduceLROnPlateau: torch_recovery, torch.cuda.amp.grad_scaler.GradScaler: torch_recovery} -DEFAULT_SAVE_HOOKS = { torch.nn.Module: torch_save, torch.optim.Optimizer: torch_save, torch.optim.lr_scheduler.ReduceLROnPlateau: torch_save, torch.cuda.amp.grad_scaler.GradScaler: torch_save} -DEFAULT_LOAD_HOOKS[torch.optim.lr_scheduler.LRScheduler] = torch_recovery -DEFAULT_SAVE_HOOKS[torch.optim.lr_scheduler.LRScheduler] = torch_save -DEFAULT_TRANSFER_HOOKS = {torch.nn.Module: torch_parameter_transfer} -DEFAULT_SAVE_HOOKS[torch.optim.lr_scheduler.CyclicLR] = _cycliclrsaver -DEFAULT_LOAD_HOOKS[torch.optim.lr_scheduler.CyclicLR] = _cycliclrloader - -def register_checkpoint_hooks(cls, save_on_main_only=True): - global DEFAULT_LOAD_HOOKS, DEFAULT_SAVE_HOOKS, DEFAULT_TRANSFER_HOOKS - - for name, method in cls.__dict__.items(): - if hasattr(method, "_speechbrain_saver"): DEFAULT_SAVE_HOOKS[cls] = main_process_only(method) if save_on_main_only else method - if hasattr(method, "_speechbrain_loader"): DEFAULT_LOAD_HOOKS[cls] = method - if hasattr(method, "_speechbrain_transfer"): DEFAULT_TRANSFER_HOOKS[cls] = method - - return cls - -def mark_as_saver(method): - sig = 
inspect.signature(method) - - try: - sig.bind(object(), "testpath") - except TypeError: - raise TypeError - - method._speechbrain_saver = True - return method - -def mark_as_transfer(method): - sig = inspect.signature(method) - - try: - sig.bind(object(), "testpath") - except TypeError: - raise TypeError - - method._speechbrain_transfer = True - return method - -def mark_as_loader(method): - sig = inspect.signature(method) - - try: - sig.bind(object(), "testpath", True) - except TypeError: - raise TypeError - - method._speechbrain_loader = True - return method - -def ddp_all_reduce(communication_object, reduce_op): - if MAIN_PROC_ONLY >= 1 or not is_distributed_initialized(): return communication_object - torch.distributed.all_reduce(communication_object, op=reduce_op) - - return communication_object - -def fwd_default_precision(fwd = None, cast_inputs = torch.float32): - if fwd is None: return functools.partial(fwd_default_precision, cast_inputs=cast_inputs) - - wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs) - - @functools.wraps(fwd) - def wrapper(*args, force_allow_autocast = False, **kwargs): - return fwd(*args, **kwargs) if force_allow_autocast else wrapped_fwd(*args, **kwargs) - - return wrapper - -def spectral_magnitude(stft, power = 1, log = False, eps = 1e-14): - spectr = stft.pow(2).sum(-1) - - if power < 1: spectr = spectr + eps - spectr = spectr.pow(power) - - if log: return (spectr + eps).log() - return spectr - -class Filterbank(torch.nn.Module): - def __init__(self, n_mels=40, log_mel=True, filter_shape="triangular", f_min=0, f_max=8000, n_fft=400, sample_rate=16000, power_spectrogram=2, amin=1e-10, ref_value=1.0, top_db=80.0, param_change_factor=1.0, param_rand_factor=0.0, freeze=True): - super().__init__() - self.n_mels = n_mels - self.log_mel = log_mel - self.filter_shape = filter_shape - self.f_min = f_min - self.f_max = f_max - self.n_fft = n_fft - self.sample_rate = sample_rate - self.power_spectrogram = power_spectrogram 
- self.amin = amin - self.ref_value = ref_value - self.top_db = top_db - self.freeze = freeze - self.n_stft = self.n_fft // 2 + 1 - self.db_multiplier = math.log10(max(self.amin, self.ref_value)) - self.device_inp = torch.device("cpu") - self.param_change_factor = param_change_factor - self.param_rand_factor = param_rand_factor - self.multiplier = 10 if self.power_spectrogram == 2 else 20 - - hz = self._to_hz(torch.linspace(self._to_mel(self.f_min), self._to_mel(self.f_max), self.n_mels + 2)) - - band = hz[1:] - hz[:-1] - self.band = band[:-1] - self.f_central = hz[1:-1] - - if not self.freeze: - self.f_central = torch.nn.Parameter(self.f_central / (self.sample_rate * self.param_change_factor)) - self.band = torch.nn.Parameter(self.band / (self.sample_rate * self.param_change_factor)) - - self.all_freqs_mat = torch.linspace(0, self.sample_rate // 2, self.n_stft).repeat(self.f_central.shape[0], 1) - - def forward(self, spectrogram): - f_central_mat = self.f_central.repeat(self.all_freqs_mat.shape[1], 1).transpose(0, 1) - band_mat = self.band.repeat(self.all_freqs_mat.shape[1], 1).transpose(0, 1) - - if not self.freeze: - f_central_mat = f_central_mat * (self.sample_rate * self.param_change_factor * self.param_change_factor) - band_mat = band_mat * (self.sample_rate * self.param_change_factor * self.param_change_factor) - elif self.param_rand_factor != 0 and self.training: - rand_change = (1.0 + torch.rand(2) * 2 * self.param_rand_factor - self.param_rand_factor) - f_central_mat = f_central_mat * rand_change[0] - band_mat = band_mat * rand_change[1] - - fbank_matrix = self._create_fbank_matrix(f_central_mat, band_mat).to(spectrogram.device) - sp_shape = spectrogram.shape - if len(sp_shape) == 4: spectrogram = spectrogram.permute(0, 3, 1, 2).reshape(sp_shape[0] * sp_shape[3], sp_shape[1], sp_shape[2]) - - fbanks = spectrogram @ fbank_matrix - if self.log_mel: fbanks = self._amplitude_to_DB(fbanks) - - if len(sp_shape) == 4: - fb_shape = fbanks.shape - fbanks = 
fbanks.reshape(sp_shape[0], sp_shape[3], fb_shape[1], fb_shape[2]).permute(0, 2, 3, 1) - - return fbanks - - @staticmethod - def _to_mel(hz): - return 2595 * math.log10(1 + hz / 700) - - @staticmethod - def _to_hz(mel): - return 700 * (10 ** (mel / 2595) - 1) - - def _triangular_filters(self, all_freqs, f_central, band): - slope = (all_freqs - f_central) / band - return torch.zeros(1, device=self.device_inp).max((slope + 1.0).min(-slope + 1.0)).transpose(0, 1) - - def _rectangular_filters(self, all_freqs, f_central, band): - left_side = right_size = all_freqs.ge(f_central - band) - right_size = all_freqs.le(f_central + band) - - return (left_side * right_size).float().transpose(0, 1) - - def _gaussian_filters(self, all_freqs, f_central, band, smooth_factor=torch.tensor(2)): - return (-0.5 * ((all_freqs - f_central) / (band / smooth_factor)) ** 2).exp().transpose(0, 1) - - def _create_fbank_matrix(self, f_central_mat, band_mat): - if self.filter_shape == "triangular": fbank_matrix = self._triangular_filters(self.all_freqs_mat, f_central_mat, band_mat) - elif self.filter_shape == "rectangular": fbank_matrix = self._rectangular_filters(self.all_freqs_mat, f_central_mat, band_mat) - else: fbank_matrix = self._gaussian_filters(self.all_freqs_mat, f_central_mat, band_mat) - - return fbank_matrix - - def _amplitude_to_DB(self, x): - x_db = self.multiplier * x.clamp(min=self.amin).log10() - x_db -= self.multiplier * self.db_multiplier - - return x_db.max((x_db.amax(dim=(-2, -1)) - self.top_db).view(x_db.shape[0], 1, 1)) - -class ContextWindow(torch.nn.Module): - def __init__(self, left_frames=0, right_frames=0): - super().__init__() - self.left_frames = left_frames - self.right_frames = right_frames - self.context_len = self.left_frames + self.right_frames + 1 - self.kernel_len = 2 * max(self.left_frames, self.right_frames) + 1 - self.kernel = torch.eye(self.context_len, self.kernel_len) - - if self.right_frames > self.left_frames: self.kernel = torch.roll(self.kernel, 
self.right_frames - self.left_frames, 1) - self.first_call = True - - def forward(self, x): - x = x.transpose(1, 2) - if self.first_call: - self.first_call = False - self.kernel = (self.kernel.repeat(x.shape[1], 1, 1).view(x.shape[1] * self.context_len, self.kernel_len).unsqueeze(1)) - - or_shape = x.shape - if len(or_shape) == 4: x = x.reshape(or_shape[0] * or_shape[2], or_shape[1], or_shape[3]) - - cw_x = torch.nn.functional.conv1d(x, self.kernel.to(x.device), groups=x.shape[1], padding=max(self.left_frames, self.right_frames)) - if len(or_shape) == 4: cw_x = cw_x.reshape(or_shape[0], cw_x.shape[1], or_shape[2], cw_x.shape[-1]) - - return cw_x.transpose(1, 2) - -class FilterProperties: - def __init__(self, window_size = 0, stride = 1, dilation = 1, causal = False): - self.window_size = window_size - self.stride = stride - self.dilation = dilation - self.causal = causal - - def __post_init__(self): - assert self.window_size > 0 - assert self.stride > 0 - assert (self.dilation > 0) - - @staticmethod - def pointwise_filter(): - return FilterProperties(window_size=1, stride=1) - - def get_effective_size(self): - return 1 + ((self.window_size - 1) * self.dilation) - - def get_convolution_padding(self): - if self.window_size % 2 == 0: raise ValueError - if self.causal: return self.get_effective_size() - 1 - - return (self.get_effective_size() - 1) // 2 - - def get_noncausal_equivalent(self): - if not self.causal: return self - return FilterProperties(window_size=(self.window_size - 1) * 2 + 1, stride=self.stride, dilation=self.dilation, causal=False) - - def with_on_top(self, other, allow_approximate=True): - self_size = self.window_size - - if other.window_size % 2 == 0: - if allow_approximate: other_size = other.window_size + 1 - else: raise ValueError - else: other_size = other.window_size - - if (self.causal or other.causal) and not (self.causal and other.causal): - if allow_approximate: return 
self.get_noncausal_equivalent().with_on_top(other.get_noncausal_equivalent()) - else: raise ValueError - - return FilterProperties(self_size + (self.stride * (other_size - 1)), self.stride * other.stride, self.dilation * other.dilation, self.causal) - -class STFT(torch.nn.Module): - def __init__(self, sample_rate, win_length=25, hop_length=10, n_fft=400, window_fn=torch.hamming_window, normalized_stft=False, center=True, pad_mode="constant", onesided=True): - super().__init__() - self.sample_rate = sample_rate - self.win_length = win_length - self.hop_length = hop_length - self.n_fft = n_fft - self.normalized_stft = normalized_stft - self.center = center - self.pad_mode = pad_mode - self.onesided = onesided - self.win_length = int(round((self.sample_rate / 1000.0) * self.win_length)) - self.hop_length = int(round((self.sample_rate / 1000.0) * self.hop_length)) - self.window = window_fn(self.win_length) - - def forward(self, x): - or_shape = x.shape - if len(or_shape) == 3: x = x.transpose(1, 2).reshape(or_shape[0] * or_shape[2], or_shape[1]) - - device = x.device - if str(device) not in ["cuda", "cpu"]: x = x.cpu() - - stft = torch.view_as_real(torch.stft(x, self.n_fft, self.hop_length, self.win_length, self.window.to(x.device), self.center, self.pad_mode, self.normalized_stft, self.onesided, return_complex=True)) - stft = stft.reshape(or_shape[0], or_shape[2], stft.shape[1], stft.shape[2], stft.shape[3]).permute(0, 3, 2, 4, 1) if len(or_shape) == 3 else stft.transpose(2, 1) - - return stft.to(device) - - def get_filter_properties(self): - if not self.center: raise ValueError - return FilterProperties(window_size=self.win_length, stride=self.hop_length) - -class Deltas(torch.nn.Module): - def __init__(self, input_size, window_length=5): - super().__init__() - self.n = (window_length - 1) // 2 - self.denom = self.n * (self.n + 1) * (2 * self.n + 1) / 3 - self.register_buffer("kernel", torch.arange(-self.n, self.n + 1, dtype=torch.float32).repeat(input_size, 1, 1),) 
- - def forward(self, x): - x = x.transpose(1, 2).transpose(2, -1) - or_shape = x.shape - - if len(or_shape) == 4: x = x.reshape(or_shape[0] * or_shape[2], or_shape[1], or_shape[3]) - - x = torch.nn.functional.pad(x, (self.n, self.n), mode="replicate") - delta_coeff = (torch.nn.functional.conv1d(x, self.kernel.to(x.device), groups=x.shape[1]) / self.denom) - - if len(or_shape) == 4: delta_coeff = delta_coeff.reshape(or_shape[0], or_shape[1], or_shape[2], or_shape[3]) - return delta_coeff.transpose(1, -1).transpose(2, -1) - -class Fbank(torch.nn.Module): - def __init__(self, deltas=False, context=False, requires_grad=False, sample_rate=16000, f_min=0, f_max=None, n_fft=400, n_mels=40, filter_shape="triangular", param_change_factor=1.0, param_rand_factor=0.0, left_frames=5, right_frames=5, win_length=25, hop_length=10): - super().__init__() - self.deltas = deltas - self.context = context - self.requires_grad = requires_grad - if f_max is None: f_max = sample_rate / 2 - self.compute_STFT = STFT(sample_rate=sample_rate,n_fft=n_fft,win_length=win_length,hop_length=hop_length) - self.compute_fbanks = Filterbank(sample_rate=sample_rate,n_fft=n_fft,n_mels=n_mels,f_min=f_min,f_max=f_max,freeze=not requires_grad,filter_shape=filter_shape,param_change_factor=param_change_factor,param_rand_factor=param_rand_factor) - self.compute_deltas = Deltas(input_size=n_mels) - self.context_window = ContextWindow(left_frames=left_frames, right_frames=right_frames) - - @fwd_default_precision(cast_inputs=torch.float32) - def forward(self, wav): - fbanks = self.compute_fbanks(spectral_magnitude(self.compute_STFT(wav))) - if self.deltas: - delta1 = self.compute_deltas(fbanks) - fbanks = torch.cat([fbanks, delta1, self.compute_deltas(delta1)], dim=2) - - if self.context: fbanks = self.context_window(fbanks) - return fbanks - - def get_filter_properties(self): - return self.compute_STFT.get_filter_properties() - -@register_checkpoint_hooks -class InputNormalization(torch.nn.Module): - def 
def forward(self, x, lengths, spk_ids = torch.tensor([]), epoch=0):
    """Normalize ``x`` according to ``self.norm_type``.

    Args:
        x: batch of features; the first axis is the batch and the second axis
            is the one ``lengths`` refers to.
        lengths: per-item relative lengths; ``round(lengths[i] * x.shape[1])``
            leading steps of item ``i`` are used to compute its statistics.
        spk_ids: per-item speaker ids, read only when ``norm_type == "speaker"``
            (``int(spk_ids[i][0])`` is used as the dictionary key).
        epoch: current epoch; in "global" mode the running statistics are only
            updated while ``epoch`` is ``None`` or below ``update_until_epoch``.

    Returns:
        The normalized tensor.

    Raises:
        ValueError: if ``norm_type`` is not one of "sentence", "speaker",
            "batch" or "global" (previously this surfaced as an
            UnboundLocalError on ``out``).
    """
    if self.norm_type not in ("sentence", "speaker", "batch", "global"): raise ValueError

    N_batches = x.shape[0]
    current_means, current_stds = [], []

    if self.norm_type == "sentence" or self.norm_type == "speaker": out = torch.empty_like(x)

    for snt_id in range(N_batches):
        # Statistics are computed on the valid (unpadded) part only.
        actual_size = torch.round(lengths[snt_id] * x.shape[1]).int()
        current_mean, current_std = self._compute_current_stats(x[snt_id, 0:actual_size, ...])

        current_means.append(current_mean)
        current_stds.append(current_std)

        if self.norm_type == "sentence": out[snt_id] = (x[snt_id] - current_mean.data) / current_std.data

        if self.norm_type == "speaker":
            spk_id = int(spk_ids[snt_id][0])

            if self.training:
                if spk_id not in self.spk_dict_mean:
                    # First time this speaker is seen: initialise its statistics.
                    self.spk_dict_mean[spk_id] = current_mean
                    self.spk_dict_std[spk_id] = current_std
                    self.spk_dict_count[spk_id] = 1
                else:
                    self.spk_dict_count[spk_id] = (self.spk_dict_count[spk_id] + 1)
                    self.weight = (1 / self.spk_dict_count[spk_id]) if self.avg_factor is None else self.avg_factor

                    self.spk_dict_mean[spk_id] = (1 - self.weight) * self.spk_dict_mean[spk_id].to(current_mean) + self.weight * current_mean
                    self.spk_dict_std[spk_id] = (1 - self.weight) * self.spk_dict_std[spk_id].to(current_std) + self.weight * current_std

                    # Tensor.detach() is not in-place: keep the detached result
                    # (the bare calls used here before had no effect).
                    self.spk_dict_mean[spk_id] = self.spk_dict_mean[spk_id].detach()
                    self.spk_dict_std[spk_id] = self.spk_dict_std[spk_id].detach()

                speaker_mean = self.spk_dict_mean[spk_id].data
                speaker_std = self.spk_dict_std[spk_id].data
            else:
                if spk_id in self.spk_dict_mean:
                    speaker_mean = self.spk_dict_mean[spk_id].data
                    speaker_std = self.spk_dict_std[spk_id].data
                else:
                    # Unknown speaker at inference time: fall back to the
                    # statistics of the current item.
                    speaker_mean = current_mean.data
                    speaker_std = current_std.data

            out[snt_id] = (x[snt_id] - speaker_mean) / speaker_std

    if self.norm_type == "batch" or self.norm_type == "global":
        current_mean = ddp_all_reduce(torch.stack(current_means).mean(dim=0), torch.distributed.ReduceOp.AVG)
        current_std = ddp_all_reduce(torch.stack(current_stds).mean(dim=0), torch.distributed.ReduceOp.AVG)

        if self.norm_type == "batch": out = (x - current_mean.data) / (current_std.data)

        if self.norm_type == "global":
            if self.training:
                if self.count == 0:
                    self.glob_mean = current_mean
                    self.glob_std = current_std
                elif epoch is None or epoch < self.update_until_epoch:
                    self.weight = (1 / (self.count + 1)) if self.avg_factor is None else self.avg_factor
                    self.glob_mean = (1 - self.weight) * self.glob_mean.to(current_mean) + self.weight * current_mean
                    self.glob_std = (1 - self.weight) * self.glob_std.to(current_std) + self.weight * current_std

                # Same as above: assign the detached tensors back.
                self.glob_mean = self.glob_mean.detach()
                self.glob_std = self.glob_std.detach()
                self.count = self.count + 1

            out = (x - self.glob_mean.data.to(x)) / (self.glob_std.data.to(x))

    return out
def get_default_hook(obj, default_hooks):
    """Return the hook registered for the closest class in ``obj``'s MRO.

    The method resolution order of ``type(obj)`` is scanned from most to least
    specific; the first class found in ``default_hooks`` wins. ``None`` is
    returned when no class of the MRO is registered.
    """
    lineage = inspect.getmro(type(obj))
    return next((default_hooks[klass] for klass in lineage if klass in default_hooks), None)
def is_loadable(self, name):
    """Tell whether the loadable ``name`` should be fetched / loaded.

    A name without a registered condition is always loadable. A callable
    condition is invoked and its result returned as-is; any other condition is
    converted with ``bool``.
    """
    try:
        condition = self.conditions[name]
    except KeyError:
        return True

    return condition() if callable(condition) else bool(condition)
def __init__(self, segments = None, uri = None):
    """Build a timeline from an iterable of segments.

    Falsy segments are dropped and duplicates collapse because the segments
    are first gathered in a set; sorted views of the segments and of their
    boundaries are then derived from that set.
    """
    source = () if segments is None else segments
    kept = {item for item in source if item}

    self.segments_set_ = kept
    self.segments_list_ = SortedList(kept)
    self.segments_boundaries_ = SortedList(edge for item in kept for edge in item)
    self.uri = uri
def remove(self, segment):
    """Remove ``segment`` from the timeline in place and return the timeline.

    Nothing happens when the segment is not part of the timeline. Otherwise it
    is removed from the set, from the sorted list, and both of its boundaries
    are removed from the boundary list.
    """
    known = self.segments_set_
    if segment in known:
        known.remove(segment)
        self.segments_list_.remove(segment)

        boundaries = self.segments_boundaries_
        for edge in (segment.start, segment.end):
            boundaries.remove(edge)

    return self
def __repr__(self):
    """Return a debug representation showing the uri and the sorted segments."""
    # The format string used to be empty, so applying `%` to a 2-tuple raised
    # TypeError ("not all arguments converted during string formatting").
    return "<Timeline(uri=%s, segments=%s)>" % (self.uri, list(self.segments_list_))
def covers(self, other):
    """Return True when no gap of this timeline intersects ``other``.

    The gaps of this timeline are computed over the extent of ``other``; as
    soon as one (gap, segment) pair is produced by ``co_iter`` the answer is
    False.
    """
    holes = self.gaps(support=other.extent())
    return not any(True for _ in holes.co_iter(other))
# Defaults for the module-level precision settings. They used to exist only
# after a call to `Segment.set_precision`, so e.g. `bool(segment)` raised
# NameError on a fresh import. `Segment.set_precision` rebinds both names.
AUTO_ROUND_TIME = False
SEGMENT_PRECISION = 1e-6


class Segment:
    """Time interval delimited by ``start`` and ``end``.

    A segment is "empty" (falsy) when ``end - start`` does not exceed
    ``SEGMENT_PRECISION``. Segments compare, hash and sort by their
    ``(start, end)`` pair, which is what the set / sorted-list based
    ``Timeline`` container relies on.

    NOTE: ``start`` / ``end`` take part in the hash, so they should not be
    mutated while the segment is stored in a set, dict or timeline.
    """

    def __init__(self, start, end):
        self.start = start
        self.end = end
        # Apply the optional rounding; this hook was never invoked before
        # because the class is not a dataclass.
        self.__post_init__()

    @staticmethod
    def set_precision(ndigits = None):
        """Configure rounding: ``None`` disables it, otherwise round to ``ndigits`` decimals."""
        global AUTO_ROUND_TIME, SEGMENT_PRECISION

        if ndigits is None:
            AUTO_ROUND_TIME = False
            SEGMENT_PRECISION = 1e-6
        else:
            AUTO_ROUND_TIME = True
            SEGMENT_PRECISION = 10 ** (-ndigits)

    def __bool__(self):
        return bool((self.end - self.start) > SEGMENT_PRECISION)

    def __post_init__(self):
        """Round both boundaries to the configured precision when rounding is enabled."""
        if AUTO_ROUND_TIME:
            object.__setattr__(self, 'start', int(self.start / SEGMENT_PRECISION + 0.5) * SEGMENT_PRECISION)
            object.__setattr__(self, 'end', int(self.end / SEGMENT_PRECISION + 0.5) * SEGMENT_PRECISION)

    # -- value semantics ---------------------------------------------------
    def _key(self):
        """Pair used for equality, hashing and ordering."""
        return (self.start, self.end)

    def __eq__(self, other):
        if not isinstance(other, Segment): return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        return hash(self._key())

    def __lt__(self, other):
        if not isinstance(other, Segment): return NotImplemented
        return self._key() < other._key()

    def __le__(self, other):
        if not isinstance(other, Segment): return NotImplemented
        return self._key() <= other._key()

    def __gt__(self, other):
        if not isinstance(other, Segment): return NotImplemented
        return self._key() > other._key()

    def __ge__(self, other):
        if not isinstance(other, Segment): return NotImplemented
        return self._key() >= other._key()

    # -- geometry ----------------------------------------------------------
    @property
    def duration(self):
        """Length of the segment, or 0. when the segment is empty."""
        return self.end - self.start if self else 0.

    @property
    def middle(self):
        """Mid-point between ``start`` and ``end``."""
        return .5 * (self.start + self.end)

    def __iter__(self):
        yield self.start
        yield self.end

    def copy(self):
        return Segment(start=self.start, end=self.end)

    def __contains__(self, other):
        """True when ``other`` lies entirely inside this segment."""
        return (self.start <= other.start) and (self.end >= other.end)

    def __and__(self, other):
        """Intersection; the result is an empty (falsy) segment when there is no overlap."""
        return Segment(start=max(self.start, other.start), end=min(self.end, other.end))

    def intersects(self, other):
        return (self.start < other.start and other.start < self.end - SEGMENT_PRECISION) or (self.start > other.start and self.start < other.end - SEGMENT_PRECISION) or (self.start == other.start)

    def overlaps(self, t):
        """True when the time ``t`` falls inside the segment (bounds included)."""
        return self.start <= t and self.end >= t

    def __or__(self, other):
        """Smallest segment containing both operands; an empty operand is ignored."""
        if not self: return other
        if not other: return self

        return Segment(start=min(self.start, other.start), end=max(self.end, other.end))

    def __xor__(self, other):
        """Gap between two non-empty segments.

        Raises:
            ValueError: if either operand is empty.
        """
        if (not self) or (not other): raise ValueError

        return Segment(start=min(self.end, other.end), end=max(self.start, other.start))

    # -- display -----------------------------------------------------------
    def _str_helper(self, seconds):
        """Format a number of seconds as ``HH:MM:SS.mmm`` with a sign/space prefix."""
        from datetime import timedelta

        negative = seconds < 0
        td = timedelta(seconds=abs(seconds))

        hours, remainder = divmod(td.seconds + 86400 * td.days, 3600)
        minutes, seconds = divmod(remainder, 60)

        return '%s%02d:%02d:%02d.%03d' % ('-' if negative else ' ', hours, minutes, seconds, td.microseconds / 1000)

    def __str__(self):
        if self: return '[%s --> %s]' % (self._str_helper(self.start), self._str_helper(self.end))
        return '[]'

    def __repr__(self):
        # The format string used to be empty, which made `%` raise TypeError.
        return '<Segment(%g, %g)>' % (self.start, self.end)

    def _repr_png_(self):
        return None
def samples(self, from_duration, mode = 'strict'):
    """Number of window positions associated with ``from_duration``.

    * ``'strict'``: ``floor((from_duration - duration) / step) + 1``
    * ``'loose'``: ``floor((from_duration + duration) / step)``
    * ``'center'``: ``rint(from_duration / step)``

    Any other mode yields ``None``.
    """
    if mode == 'center':
        return int(np.rint((from_duration / self.step)))

    if mode == 'loose':
        return int(np.floor((from_duration + self.duration) / self.step))

    if mode == 'strict':
        whole_steps = int(np.floor((from_duration - self.duration) / self.step))
        return whole_steps + 1

    return None
def segment_to_range(self, segment):
    """Convert ``segment`` to ``(first frame index, number of frames)``.

    The first index is the frame closest to ``segment.start``; the count is
    ``int(segment.duration / step) + 1``.
    """
    first_frame = self.closest_frame(segment.start)
    frame_count = int(segment.duration / self.step) + 1
    return first_frame, frame_count
def run_on_main(func, args=None, kwargs=None, post_func=None, post_args=None, post_kwargs=None, run_post_on_main=False):
    """Run ``func`` on the main process only, then synchronise.

    After the barrier, ``post_func`` (if given) is run either on every process
    (``run_post_on_main=True``) or only on the non-main processes, in which
    case a second barrier follows.
    """
    args = [] if args is None else args
    kwargs = {} if kwargs is None else kwargs
    post_args = [] if post_args is None else post_args
    post_kwargs = {} if post_kwargs is None else post_kwargs

    guarded = main_process_only(func)
    guarded(*args, **kwargs)
    ddp_barrier()

    if post_func is None:
        return

    if run_post_on_main:
        post_func(*post_args, **post_kwargs)
        return

    # Mirror of the first phase: every process except the main one.
    if not if_main_process():
        post_func(*post_args, **post_kwargs)
    ddp_barrier()
def __init__(self, modules=None, hparams=None, run_opts=None, freeze_params=True):
    """Set run options, register the modules and prepare them for inference.

    Args:
        modules: mapping of name -> module, wrapped in a ``ModuleDict``.
        hparams: optional mapping of hyper-parameters; required (and checked
            for the keys in ``HPARAMS_NEEDED``) only when ``HPARAMS_NEEDED``
            is non-empty.
        run_opts: optional mapping overriding the run options below; it takes
            precedence over ``hparams``, which takes precedence over defaults.
        freeze_params: forwarded to ``_prepare_modules``.

    Raises:
        ValueError: if required hyper-parameters are missing.
    """
    super().__init__()

    run_opt_defaults = {
        "device": "cpu",
        "data_parallel_count": -1,
        "data_parallel_backend": False,
        "distributed_launch": False,
        "distributed_backend": "nccl",
        "jit": False,
        "jit_module_keys": None,
        "compile": False,
        "compile_module_keys": None,
        "compile_mode": "reduce-overhead",
        "compile_using_fullgraph": False,
        "compile_using_dynamic_shape_tracing": False,
    }

    for arg, default in run_opt_defaults.items():
        if run_opts is not None and arg in run_opts: setattr(self, arg, run_opts[arg])
        elif hparams is not None and arg in hparams: setattr(self, arg, hparams[arg])
        else: setattr(self, arg, default)

    self.mods = torch.nn.ModuleDict(modules)

    for module in self.mods.values():
        if module is not None: module.to(self.device)

    if self.HPARAMS_NEEDED and hparams is None: raise ValueError

    if hparams is not None:
        for hp in self.HPARAMS_NEEDED:
            if hp not in hparams: raise ValueError

        self.hparams = SimpleNamespace(**hparams)

    self._prepare_modules(freeze_params)

    # `hparams` is optional: calling `.get` on None raised AttributeError, so
    # fall back to a default normalizer when no hyper-parameters were given.
    if hparams is not None: self.audio_normalizer = hparams.get("audio_normalizer", AudioNormalizer())
    else: self.audio_normalizer = AudioNormalizer()
@classmethod
def from_hparams(cls, source, hparams_file="hyperparams.yaml", overrides=None, download_only=False, overrides_must_match=True, **kwargs):
    """Build an instance from a hyper-parameter YAML file located in ``source``.

    Args:
        source: directory holding ``hparams_file`` (and, by default, the
            checkpoints collected by the pretrainer).
        hparams_file: name of the YAML file to load.
        overrides: overrides forwarded to ``load_hyperpyyaml``; ``None`` means
            no overrides (an empty dict is used).
        download_only: when a pretrainer is defined, only collect its files
            and return ``None`` instead of an instance.
        overrides_must_match: forwarded to ``load_hyperpyyaml``.
        **kwargs: forwarded to the class constructor.
    """
    # Avoid sharing one mutable default dict between calls.
    if overrides is None: overrides = {}

    # Read the YAML as UTF-8 explicitly so loading does not depend on the
    # platform's default encoding.
    with open(fetch(filename=hparams_file, source=source), encoding="utf-8") as fin:
        hparams = load_hyperpyyaml(fin, overrides, overrides_must_match=overrides_must_match)

    pretrainer = hparams.get("pretrainer", None)

    if pretrainer is None: return cls(hparams["modules"], hparams, **kwargs)

    run_on_main(pretrainer.collect_files, kwargs={"default_source": source})
    if download_only: return None

    pretrainer.load_collected()
    return cls(hparams["modules"], hparams, **kwargs)
def classify_batch(self, wavs, wav_lens=None):
    """Classify a batch of waveforms.

    Returns a 4-tuple: the classifier output with axis 1 squeezed, the maximum
    along its last axis, the index of that maximum, and the labels decoded
    from those indices by ``hparams.label_encoder``.
    """
    embeddings = self.encode_batch(wavs, wav_lens)
    out_prob = self.mods.classifier(embeddings).squeeze(1)

    best = out_prob.max(dim=-1)
    score, index = best

    labels = self.hparams.label_encoder.decode_torch(index)
    return out_prob, score, index, labels
"bulgarian", "lt": "lithuanian", "la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak", "te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian", "az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian", "mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian", "ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian", "sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala", "km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans", "oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi", "gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek", "fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk", "mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan", "tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian", "ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese", "yue": "cantonese"} -TO_LANGUAGE_CODE = {**{language: code for code, language in LANGUAGES.items()}, "burmese": "my", "valencian": "ca", "flemish": "nl", "haitian": "ht", "letzeburgesch": "lb", "pushto": "ps", "panjabi": "pa", "moldavian": "ro", "moldovan": "ro", "sinhalese": "si", "castilian": "es", "mandarin": "zh"} -_ALIGNMENT_HEADS = {"tiny.en": b"ABzY8J1N>@0{>%R00Bk>$p{7v037`oCl~+#00", "tiny": b"ABzY8bu8Lr0{>%RKn9Fp%m@SkK7Kt=7ytkO", "base.en": b"ABzY8;40c<0{>%RzzG;p*o+Vo09|#PsxSZm00", "base": b"ABzY8KQ!870{>%RzyTQH3`Q^yNP!>##QT-?_)10{>%RpeA61k&I|OI3I$65C{;;pbCHh0B{qLQ;+}v00", "small": b"ABzY8DmU6=0{>%Rpa?J`kvJ6qF(V^F86#Xh7JUGMK}P%R7%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9", "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj", "large-v3": 
def merge_punctuations(alignment, prepended, appended):
    """Fold stand-alone punctuation entries into their neighbouring words, in place.

    Args:
        alignment: sequence of objects exposing mutable ``word`` (str) and
            ``tokens`` (list) attributes.
        prepended: string of punctuation marks that attach to the word that
            follows them.
        appended: string of punctuation marks that attach to the word that
            precedes them.

    Merged-away entries are kept in the list but end up with ``word == ""``
    and ``tokens == []``. Nothing is returned.
    """
    # Pass 1, right to left: a space-prefixed entry whose stripped text is in
    # `prepended` is glued onto the front of the nearest kept entry to its right.
    target = len(alignment) - 1
    for idx in range(len(alignment) - 2, -1, -1):
        current = alignment[idx]
        receiver = alignment[target]

        if current.word.startswith(" ") and current.word.strip() in prepended:
            receiver.word = current.word + receiver.word
            receiver.tokens = current.tokens + receiver.tokens

            current.word = ""
            current.tokens = []
        else:
            target = idx

    # Pass 2, left to right: an entry found in `appended` is glued onto the end
    # of the nearest kept entry to its left, unless that entry ends with a space.
    anchor = 0
    for idx in range(1, len(alignment)):
        receiver = alignment[anchor]
        current = alignment[idx]

        if receiver.word.endswith(" ") or current.word not in appended:
            anchor = idx
            continue

        receiver.word = receiver.word + current.word
        receiver.tokens = receiver.tokens + current.tokens

        current.word = ""
        current.tokens = []
def median_filter(x, filter_width):
    """Apply a sliding median of width ``filter_width`` along the last dimension.

    The last dimension is reflect-padded by ``filter_width // 2`` on each side,
    so the output has the same shape as the input.

    Args:
        x: tensor to filter. Inputs with at most two dimensions are given two
            extra leading axes for the padding call and restored afterwards.
        filter_width: window size; must be a positive odd integer.

    Returns:
        The filtered tensor, or ``x`` itself (unchanged) when its last
        dimension holds at most ``filter_width // 2`` elements and therefore
        cannot be reflect-padded.

    Raises:
        ValueError: if ``filter_width`` is not a positive odd integer.
    """
    pad_width = filter_width // 2

    # Too short to reflect-pad: hand the input back untouched.
    if x.shape[-1] <= pad_width: return x

    # Explicit raise instead of `assert`, which disappears under `python -O`.
    if filter_width <= 0 or filter_width % 2 != 1:
        raise ValueError(f"filter_width must be a positive odd integer, got {filter_width}")

    ndim = x.ndim
    if ndim <= 2: x = x[None, None, :]

    x = F.pad(x, (pad_width, pad_width, 0, 0), mode="reflect")

    # Every window is sorted and its middle element is the median.
    result = x.unfold(-1, filter_width, 1).sort()[0][..., pad_width]
    if ndim <= 2: result = result[0, 0]

    return result
torch.no_grad(): - token_probs = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0][len(tokenizer.sot_sequence) :, : tokenizer.eot].softmax(dim=-1) - text_token_probs = token_probs[np.arange(len(text_tokens)), text_tokens].tolist() - - for hook in hooks: - hook.remove() - - if not (opencl.is_available() or directml.is_available()): - alignment_indices = model.alignment_heads.indices().T - else: - alignment_indices = [(l, h) for l in range(model.alignment_heads.size(0)) for h in range(model.alignment_heads.size(1)) if model.alignment_heads[l, h]] - - weights = (torch.stack([QKs[_l][_h] for _l, _h in alignment_indices])[:, :, : num_frames // 2] * qk_scale).softmax(dim=-1) - std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False) - - if directml.is_available(): - weights = median_filter(((weights - mean) / std).cpu(), medfilt_width).to(weights.device) - else: - weights = median_filter((weights - mean) / std, medfilt_width) - - text_indices, time_indices = dtw(-weights.mean(axis=0)[len(tokenizer.sot_sequence) : -1]) - - words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot]) - if len(word_tokens) <= 1: return [] - - word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)) - jump_times = time_indices[np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)] / TOKENS_PER_SECOND - - return [WordTiming(word, tokens, start, end, probability) for word, tokens, start, end, probability in zip(words, word_tokens, jump_times[word_boundaries[:-1]], jump_times[word_boundaries[1:]], [np.mean(text_token_probs[i:j]) for i, j in zip(word_boundaries[:-1], word_boundaries[1:])])] - -def add_word_timestamps(*, segments, model, tokenizer, mel, num_frames, prepend_punctuations = "\"'“¿([{-", append_punctuations = "\"'.。,,!!??::”)]}、", last_speech_timestamp, **kwargs): - if len(segments) == 0: return - - text_tokens_per_segment = [[token for token in segment["tokens"] if token < tokenizer.eot] for segment in 
segments] - - text_tokens = list(itertools.chain.from_iterable(text_tokens_per_segment)) - alignment = find_alignment(model, tokenizer, text_tokens, mel, num_frames, **kwargs) - - word_durations = np.array([t.end - t.start for t in alignment]) - word_durations = word_durations[word_durations.nonzero()] - - median_duration = min(0.7, float(np.median(word_durations) if len(word_durations) > 0 else 0.0)) - max_duration = median_duration * 2 - - if len(word_durations) > 0: - sentence_end_marks = ".。!!??" - for i in range(1, len(alignment)): - if alignment[i].end - alignment[i].start > max_duration: - if alignment[i].word in sentence_end_marks: alignment[i].end = alignment[i].start + max_duration - elif alignment[i - 1].word in sentence_end_marks: alignment[i].start = alignment[i].end - max_duration - - merge_punctuations(alignment, prepend_punctuations, append_punctuations) - - time_offset = segments[0]["seek"] * HOP_LENGTH / SAMPLE_RATE - word_index = 0 - - for segment, text_tokens in zip(segments, text_tokens_per_segment): - saved_tokens = 0 - words = [] - - while word_index < len(alignment) and saved_tokens < len(text_tokens): - timing = alignment[word_index] - - if timing.word: words.append(dict(word=timing.word, start=round(time_offset + timing.start, 2), end=round(time_offset + timing.end, 2), probability=timing.probability)) - - saved_tokens += len(timing.tokens) - word_index += 1 - - if len(words) > 0: - if words[0]["end"] - last_speech_timestamp > median_duration * 4 and (words[0]["end"] - words[0]["start"] > max_duration or (len(words) > 1 and words[1]["end"] - words[0]["start"] > max_duration * 2)): - if (len(words) > 1 and words[1]["end"] - words[1]["start"] > max_duration): words[0]["end"] = words[1]["start"] = max(words[1]["end"] / 2, words[1]["end"] - max_duration) - words[0]["start"] = max(0, words[0]["end"] - max_duration) - - if (segment["start"] < words[0]["end"] and segment["start"] - 0.5 > words[0]["start"]): words[0]["start"] = max(0, 
@lru_cache(maxsize=None)
def mel_filters(device, n_mels):
    """Return the ``mel_{n_mels}`` array stored in ``assets/mel_filters.npz`` as a tensor on ``device``.

    The archive is looked up under ``configs["speaker_diarization_path"]``.
    Only 80 and 128 are accepted for ``n_mels``. Results are memoised per
    ``(device, n_mels)`` pair by ``lru_cache``.
    """
    assert n_mels in {80, 128}

    archive_path = os.path.join(configs["speaker_diarization_path"], "assets", "mel_filters.npz")
    entry = f"mel_{n_mels}"

    with np.load(archive_path, allow_pickle=False) as archive:
        bank = torch.from_numpy(archive[entry])
        return bank.to(device)
def get_end(segments):
    """Return the ``"end"`` of the last word found when scanning segments backwards.

    Segments are walked from last to first and, inside each, ``"words"`` from
    last to first; the first word encountered supplies the result. When no
    segment contains a word, the ``"end"`` of the last segment is returned, or
    ``None`` if ``segments`` is empty.
    """
    # Evaluated up front, exactly as the default of the original `next(...)` call was.
    fallback = segments[-1]["end"] if segments else None

    for segment in reversed(segments):
        for word in reversed(segment["words"]):
            return word["end"]

    return fallback
clip_timestamps] - - if len(seek_points) == 0: seek_points.append(0) - if len(seek_points) % 2 == 1: seek_points.append(content_frames) - - seek_clips = list(zip(seek_points[::2], seek_points[1::2])) - punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、" - - def decode_with_fallback(segment): - temperatures = ([temperature] if isinstance(temperature, (int, float)) else temperature) - decode_result = None - - for t in temperatures: - kwargs = {**decode_options} - - if t > 0: - kwargs.pop("beam_size", None) - kwargs.pop("patience", None) - else: kwargs.pop("best_of", None) - - decode_result = model.decode(segment, DecodingOptions(**kwargs, temperature=t)) - needs_fallback = False - - if (compression_ratio_threshold is not None and decode_result.compression_ratio > compression_ratio_threshold): needs_fallback = True - if (logprob_threshold is not None and decode_result.avg_logprob < logprob_threshold): needs_fallback = True - if (no_speech_threshold is not None and decode_result.no_speech_prob > no_speech_threshold and logprob_threshold is not None and decode_result.avg_logprob < logprob_threshold): needs_fallback = False - if not needs_fallback: break - - return decode_result - - clip_idx = 0 - seek = seek_clips[clip_idx][0] - - input_stride = exact_div(N_FRAMES, model.dims.n_audio_ctx) - time_precision = (input_stride * HOP_LENGTH / SAMPLE_RATE) - - all_tokens, all_segments = [], [] - prompt_reset_since = 0 - - remaining_prompt_length = model.dims.n_text_ctx // 2 - 1 - - if initial_prompt is not None: - initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip()) - all_tokens.extend(initial_prompt_tokens) - remaining_prompt_length -= len(initial_prompt_tokens) - else: initial_prompt_tokens = [] - - def new_segment(*, start, end, tokens, result): - tokens = tokens.tolist() - return {"seek": seek, "start": start, "end": end, "text": tokenizer.decode([token for token in tokens if token < tokenizer.eot]), "tokens": tokens, "temperature": result.temperature, 
"avg_logprob": result.avg_logprob, "compression_ratio": result.compression_ratio, "no_speech_prob": result.no_speech_prob} - - with tqdm.tqdm(total=content_frames, unit="frames", disable=verbose is not False) as pbar: - last_speech_timestamp = 0.0 - while clip_idx < len(seek_clips): - seek_clip_start, seek_clip_end = seek_clips[clip_idx] - if seek < seek_clip_start: seek = seek_clip_start - - if seek >= seek_clip_end: - clip_idx += 1 - if clip_idx < len(seek_clips): seek = seek_clips[clip_idx][0] - continue - - time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE) - window_end_time = float((seek + N_FRAMES) * HOP_LENGTH / SAMPLE_RATE) - - segment_size = min(N_FRAMES, content_frames - seek, seek_clip_end - seek) - mel_segment = mel[:, seek : seek + segment_size] - - segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE - mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype) - - if carry_initial_prompt: decode_options["prompt"] = initial_prompt_tokens + all_tokens[max(len(initial_prompt_tokens), prompt_reset_since):][-remaining_prompt_length:] - else: decode_options["prompt"] = all_tokens[prompt_reset_since:] - - result = decode_with_fallback(mel_segment) - tokens = torch.tensor(result.tokens) - - if no_speech_threshold is not None: - should_skip = result.no_speech_prob > no_speech_threshold - if (logprob_threshold is not None and result.avg_logprob > logprob_threshold): - should_skip = False - - if should_skip: - seek += segment_size - continue - - previous_seek = seek - current_segments = [] - - def word_anomaly_score(word): - probability = word.get("probability", 0.0) - duration = word["end"] - word["start"] - score = 0.0 - - if probability < 0.15: score += 1.0 - if duration < 0.133: score += (0.133 - duration) * 15 - if duration > 2.0: score += duration - 2.0 - - return score - - def is_segment_anomaly(segment): - if segment is None or not segment["words"]: return False - - words = [w for w in segment["words"] if w["word"] not in 
punctuation] - words = words[:8] - - score = sum(word_anomaly_score(w) for w in words) - - return score >= 3 or score + 0.01 >= len(words) - - def next_words_segment(segments): - return next((s for s in segments if s["words"]), None) - - timestamp_tokens = tokens.ge(tokenizer.timestamp_begin) - single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True] - - consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] - consecutive.add_(1) - - if len(consecutive) > 0: - slices = consecutive.tolist() - if single_timestamp_ending: - slices.append(len(tokens)) - - last_slice = 0 - for current_slice in slices: - sliced_tokens = tokens[last_slice:current_slice] - current_segments.append(new_segment(start=time_offset + (sliced_tokens[0].item() - tokenizer.timestamp_begin) * time_precision, end=time_offset + (sliced_tokens[-1].item() - tokenizer.timestamp_begin) * time_precision, tokens=sliced_tokens, result=result)) - last_slice = current_slice - - if single_timestamp_ending: seek += segment_size - else: seek += (tokens[last_slice - 1].item() - tokenizer.timestamp_begin) * input_stride - else: - duration = segment_duration - - timestamps = tokens[timestamp_tokens.nonzero().flatten()] - if (len(timestamps) > 0 and timestamps[-1].item() != tokenizer.timestamp_begin): duration = (timestamps[-1].item() - tokenizer.timestamp_begin) * time_precision - - current_segments.append(new_segment(start=time_offset, end=time_offset + duration, tokens=tokens, result=result)) - seek += segment_size - - if word_timestamps: - add_word_timestamps(segments=current_segments, model=model, tokenizer=tokenizer, mel=mel_segment, num_frames=segment_size, prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, last_speech_timestamp=last_speech_timestamp) - - if not single_timestamp_ending: - last_word_end = get_end(current_segments) - if last_word_end is not None and last_word_end > time_offset: seek = round(last_word_end * FRAMES_PER_SECOND) - 
- if hallucination_silence_threshold is not None: - threshold = hallucination_silence_threshold - - if not single_timestamp_ending: - last_word_end = get_end(current_segments) - if last_word_end is not None and last_word_end > time_offset: seek = round(last_word_end * FRAMES_PER_SECOND) if (window_end_time - last_word_end) > threshold else (previous_seek + segment_size) - - first_segment = next_words_segment(current_segments) - - if first_segment is not None and is_segment_anomaly(first_segment): - gap = first_segment["start"] - time_offset - - if gap > threshold: - seek = previous_seek + round(gap * FRAMES_PER_SECOND) - continue - - hal_last_end = last_speech_timestamp - - for si in range(len(current_segments)): - segment = current_segments[si] - if not segment["words"]: continue - - if is_segment_anomaly(segment): - next_segment = next_words_segment(current_segments[si + 1 :]) - hal_next_start = next_segment["words"][0]["start"] if next_segment is not None else (time_offset + segment_duration) - - if (segment["start"] - hal_last_end > threshold or segment["start"] < threshold or segment["start"] - time_offset < 2.0) and (hal_next_start - segment["end"] > threshold or is_segment_anomaly(next_segment) or window_end_time - segment["end"] < 2.0): - seek = round(max(time_offset + 1, segment["start"]) * FRAMES_PER_SECOND) - if content_duration - segment["end"] < threshold: seek = content_frames - - current_segments[si:] = [] - break - - hal_last_end = segment["end"] - - last_word_end = get_end(current_segments) - if last_word_end is not None: last_speech_timestamp = last_word_end - - for _, segment in enumerate(current_segments): - if segment["start"] == segment["end"] or segment["text"].strip() == "": - segment["text"] = "" - segment["tokens"] = [] - segment["words"] = [] - - all_segments.extend([{"id": i, **segment} for i, segment in enumerate(current_segments, start=len(all_segments))]) - all_tokens.extend([token for segment in current_segments for token in 
@torch.no_grad()
def detect_language_function(model, mel, tokenizer = None):
    """Pick the most likely language token and report per-language probabilities.

    Args:
        model: object exposing ``dims.n_audio_ctx``, ``dims.n_audio_state``,
            ``encoder`` and ``logits`` (plus ``is_multilingual`` and
            ``num_languages`` when no tokenizer is supplied).
        mel: 2-D input for a single item or 3-D input for a batch. If its last
            two dimensions already equal ``(n_audio_ctx, n_audio_state)`` it is
            used as-is, otherwise it is passed through ``model.encoder`` first.
        tokenizer: tokenizer to use; built with ``get_tokenizer`` when omitted.

    Returns:
        ``(language_tokens, language_probs)``: the arg-max language token per
        item and a ``{language_code: probability}`` dict per item. For a 2-D
        ``mel`` both are returned for the single item rather than as batches.

    Raises:
        ValueError: if the tokenizer has no language, or its language token is
            not part of its ``sot_sequence``.
    """
    if tokenizer is None: tokenizer = get_tokenizer(model.is_multilingual, num_languages=model.num_languages)
    if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
        raise ValueError("This tokenizer has no language token in its sot_sequence, so the language cannot be detected")

    single = mel.ndim == 2
    if single: mel = mel.unsqueeze(0)

    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state): mel = model.encoder(mel)

    n_audio = mel.shape[0]
    logits = model.logits(torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device), mel)[:, 0]

    # Keep only the language tokens: everything else is pushed to -inf.
    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
    mask[list(tokenizer.all_language_tokens)] = False
    logits[:, mask] = -np.inf

    language_tokens = logits.argmax(dim=-1)

    # Softmax and host transfer happen once, not once per (item, language) pair.
    language_token_probs = logits.softmax(dim=-1).cpu()
    language_probs = [
        {
            code: language_token_probs[i, token].item()
            for token, code in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
        }
        for i in range(n_audio)
    ]

    if single:
        language_tokens = language_tokens[0]
        language_probs = language_probs[0]

    return language_tokens, language_probs
@lru_cache(maxsize=None)
def get_encoding(name = "gpt2", num_languages = 99):
    """Build the ``tiktoken.Encoding`` for the vocabulary file ``assets/{name}.tiktoken``.

    Each non-blank line of the vocabulary file holds a base64-encoded token and
    its integer rank. Special tokens are appended after the mergeable ranks, in
    this order: end-of-text, start-of-transcript, one tag per language (the
    first ``num_languages`` keys of ``LANGUAGES``), the task/control tags, and
    1501 timestamp tags in 0.02 steps.

    Results are memoised per ``(name, num_languages)`` by ``lru_cache``.
    """
    vocab_path = os.path.join(configs["speaker_diarization_path"], "assets", f"{name}.tiktoken")

    # `with` closes the handle; `strip()` skips blank lines, which would
    # otherwise break the two-value unpacking below.
    with open(vocab_path, encoding="utf-8") as vocab_file:
        ranks = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in vocab_file if line.strip())}

    n_vocab = len(ranks)

    specials = ["<|endoftext|>", "<|startoftranscript|>", *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]], "<|translate|>", "<|transcribe|>", "<|startoflm|>", "<|startofprev|>", "<|nospeech|>", "<|notimestamps|>", *[f"<|{i * 0.02:.2f}|>" for i in range(1501)]]

    # Special tokens take consecutive ids right after the regular vocabulary.
    special_tokens = {token: n_vocab + offset for offset, token in enumerate(specials)}
    n_vocab += len(specials)

    return tiktoken.Encoding(name=os.path.basename(vocab_path), explicit_n_vocab=n_vocab, pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", mergeable_ranks=ranks, special_tokens=special_tokens)
class TextDecoder(nn.Module):
    """Token decoder: embeds tokens, runs cross-attending blocks, and projects back onto the vocabulary.

    The output projection reuses the token-embedding matrix (transposed).
    """

    def __init__(self, n_vocab, n_ctx, n_state, n_head, n_layer):
        super().__init__()

        self.token_embedding = nn.Embedding(n_vocab, n_state)
        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))

        self.blocks = nn.ModuleList([ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)])
        self.ln = LayerNorm(n_state)

        # -inf strictly above the diagonal, 0 elsewhere; not saved in the state dict.
        self.register_buffer("mask", torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1), persistent=False)

    def forward(self, x, xa, kv_cache = None):
        """Return float32 vocabulary logits for token ids ``x`` given audio features ``xa``.

        Args:
            x: token ids; the positional embedding is sliced by ``x.shape[-1]``.
            xa: audio features handed to every block; its dtype is the dtype
                the decoder computes in.
            kv_cache: optional cache dict. When non-empty, dimension 1 of its
                first value is used as the positional offset.
        """
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        x = (self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]).to(xa.dtype)

        for block in self.blocks:
            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)

        x = self.ln(x)

        # Multiply in x's dtype and cast the *result* to float32. Casting the
        # weight instead makes a half-precision x hit a float32 matrix, which
        # matmul rejects with a dtype-mismatch error.
        return (x @ self.token_embedding.weight.to(x.dtype).transpose(0, 1)).float()
self.register_buffer("alignment_heads", alignment, persistent=False) - - def embed_audio(self, mel): - return self.encoder(mel) - - def logits(self, tokens, audio_features): - return self.decoder(tokens, audio_features) - - def forward(self, mel, tokens): - return self.decoder(tokens, self.encoder(mel)) - - @property - def device(self): - return next(self.parameters()).device - - @property - def is_multilingual(self): - return self.dims.n_vocab >= 51865 - - @property - def num_languages(self): - return self.dims.n_vocab - 51765 - int(self.is_multilingual) - - def install_kv_cache_hooks(self, cache = None): - cache = {**cache} if cache is not None else {} - hooks = [] - - def save_to_cache(module, _, output): - cache[module] = output if module not in cache or output.shape[1] > self.dims.n_text_ctx else torch.cat([cache[module], output], dim=1).detach() - return cache[module] - - def install_hooks(layer: nn.Module): - if isinstance(layer, MultiHeadAttention): - hooks.append(layer.key.register_forward_hook(save_to_cache)) - hooks.append(layer.value.register_forward_hook(save_to_cache)) - - self.decoder.apply(install_hooks) - return cache, hooks - - detect_language = detect_language_function - transcribe = transcribe_function - decode = decode_function - -class ResidualAttentionBlock(nn.Module): - def __init__(self, n_state, n_head, cross_attention = False): - super().__init__() - self.attn = MultiHeadAttention(n_state, n_head) - self.attn_ln = LayerNorm(n_state) - self.cross_attn = (MultiHeadAttention(n_state, n_head) if cross_attention else None) - self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None - - n_mlp = n_state * 4 - self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)) - self.mlp_ln = LayerNorm(n_state) - - def forward(self, x, xa = None, mask = None, kv_cache = None): - x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0] - if self.cross_attn: x = x + self.cross_attn(self.cross_attn_ln(x), xa, 
class MultiHeadAttention(nn.Module):
    """Multi-head attention that serves both self-attention and cross-attention.

    ``forward`` returns the projected attention output together with the
    detached float32 pre-softmax scores.
    """

    def __init__(self, n_state, n_head):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        self.key = Linear(n_state, n_state, bias=False)  # the key projection has no bias
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(self, x, xa = None, mask = None, kv_cache = None):
        """Attend from ``x`` to ``xa`` (or to ``x`` itself when ``xa`` is None).

        Keys/values are read from ``kv_cache`` only for cross-attention
        (``xa`` given) when ``self.key`` already has an entry there; otherwise
        they are computed from the attended source.
        """
        reuse_cached = kv_cache is not None and xa is not None and self.key in kv_cache

        if reuse_cached:
            keys = kv_cache[self.key]
            values = kv_cache[self.value]
        else:
            source = x if xa is None else xa
            keys = self.key(source)
            values = self.value(source)

        attended, scores = self.qkv_attention(self.query(x), keys, values, mask)
        return self.out(attended), scores

    def qkv_attention(self, q, k, v, mask = None):
        """Scaled dot-product attention over ``self.n_head`` heads.

        Returns ``(weighted_values, scores)`` where ``scores`` are the detached
        float32 pre-softmax scores (mask included when one is given).
        """
        _, n_ctx, n_state = q.shape
        # Applied to both q and k, so the scores are scaled by head_dim ** -0.5 overall.
        scale = (n_state // self.n_head) ** -0.25

        def split_heads(t):
            return t.view(*t.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        q = split_heads(q)
        k = split_heads(k)
        v = split_heads(v)

        scores = (q * scale) @ (k * scale).transpose(-1, -2)
        if mask is not None:
            scores = scores + mask[:n_ctx, :n_ctx]
        scores = scores.float()

        weights = F.softmax(scores, dim=-1).to(q.dtype)
        merged = (weights @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
        return merged, scores.detach()
class SequenceRanker:
    """Interface for choosing one candidate sequence per audio item."""

    def rank(self, tokens, sum_logprobs):
        pass

class MaximumLikelihoodRanker(SequenceRanker):
    """Pick the candidate with the highest length-normalised log-probability."""

    def __init__(self, length_penalty):
        self.length_penalty = length_penalty

    def _normaliser(self, length):
        # Plain length when no penalty is configured, otherwise the
        # ((5 + length) / 6) ** penalty form.
        if self.length_penalty is None:
            return length

        return ((5 + length) / 6) ** self.length_penalty

    def rank(self, tokens, sum_logprobs):
        """Return, for every audio item, the index of its best candidate.

        Args:
            tokens: per-audio lists of candidate token sequences.
            sum_logprobs: per-audio lists of cumulative log-probabilities,
                aligned with ``tokens``.
        """
        candidate_lengths = [[len(candidate) for candidate in group] for group in tokens]
        winners = []

        for logprobs, lengths in zip(sum_logprobs, candidate_lengths):
            normalised = [logprob / self._normaliser(length) for logprob, length in zip(logprobs, lengths)]
            winners.append(np.argmax(normalised))

        return winners
class BeamSearchDecoder(TokenDecoder):
    """Token decoder that keeps ``beam_size`` hypotheses per audio item.

    Sequences that end with the end-of-text token are collected in
    ``finished_sequences`` (one dict per audio item, mapping token tuple to
    cumulative log-probability).
    """

    def __init__(self, beam_size, eot, inference, patience = None):
        """Store the search parameters.

        Args:
            beam_size: number of live hypotheses kept per audio item.
            eot: id of the end-of-text token.
            inference: object whose ``rearrange_kv_cache`` is called after
                every step with the surviving source indices.
            patience: multiplier for the number of finished sequences to
                collect; a falsy value (None or 0) falls back to 1.0.
        """
        self.beam_size = beam_size
        self.eot = eot
        self.inference = inference
        self.patience = patience or 1.0
        # Finished sequences to collect per audio item before update() reports completion.
        self.max_candidates = round(beam_size * self.patience)
        self.finished_sequences = None

        assert (self.max_candidates > 0)

    def reset(self):
        """Forget the finished sequences of a previous run."""
        self.finished_sequences = None

    def update(self, tokens, logits, sum_logprobs):
        """Advance every beam by one token.

        Args:
            tokens: token rows, ``beam_size`` consecutive rows per audio item.
            logits: next-token logits, one row per row of ``tokens``.
            sum_logprobs: cumulative log-probabilities per row; overwritten
                in place with the scores of the surviving beams.

        Returns:
            ``(next_tokens, completed)``: the new token tensor and whether
            every audio item has collected ``max_candidates`` finished
            sequences.

        Raises:
            ValueError: if the number of rows is not a multiple of ``beam_size``.
        """
        if tokens.shape[0] % self.beam_size != 0: raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")

        n_audio = tokens.shape[0] // self.beam_size
        if self.finished_sequences is None: self.finished_sequences = [{} for _ in range(n_audio)]

        logprobs = F.log_softmax(logits.float(), dim=-1)
        next_tokens, source_indices, finished_sequences = [], [], []

        for i in range(n_audio):
            scores, sources, finished = {}, {}, {}

            # Expand each beam by its top (beam_size + 1) tokens. Candidates are
            # keyed by the full token tuple, so identical sequences collapse.
            for j in range(self.beam_size):
                idx = i * self.beam_size + j
                prefix = tokens[idx].tolist()
                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
                    sequence = tuple(prefix + [token.item()])
                    scores[sequence] = (sum_logprobs[idx] + logprob).item()
                    sources[sequence] = idx

            saved = 0

            # Walk candidates from best to worst: EOT-terminated ones are set
            # aside as finished, the others fill the next beams until
            # beam_size of them have been kept.
            for sequence in sorted(scores, key=scores.get, reverse=True):
                if sequence[-1] == self.eot: finished[sequence] = scores[sequence]
                else:
                    # In-place write; the reads of sum_logprobs for this audio
                    # item happened in the expansion loop above.
                    sum_logprobs[len(next_tokens)] = scores[sequence]
                    next_tokens.append(sequence)
                    source_indices.append(sources[sequence])

                    saved += 1
                    if saved == self.beam_size: break

            finished_sequences.append(finished)

        # Let the inference object reorder its cache to follow the surviving beams.
        self.inference.rearrange_kv_cache(source_indices)
        assert len(self.finished_sequences) == len(finished_sequences)

        # Merge the newly finished sequences, best first, capped at max_candidates per audio item.
        for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences):
            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
                if len(previously_finished) >= self.max_candidates: break
                previously_finished[seq] = newly_finished[seq]

        return torch.tensor(next_tokens, device=tokens.device), all(len(sequences) >= self.max_candidates for sequences in self.finished_sequences)

    def finalize(self, preceding_tokens, sum_logprobs):
        """Return the collected sequences and their scores for each audio item.

        Audio items with fewer than ``beam_size`` finished sequences are
        topped up with their unfinished beams (highest score first), each
        with the end-of-text token appended.

        Returns:
            ``(tokens, sum_logprobs)``: per audio item a list of token
            tensors and a list of the matching scores.
        """
        sum_logprobs = sum_logprobs.cpu()

        for i, sequences in enumerate(self.finished_sequences):
            if (len(sequences) < self.beam_size):
                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
                    if len(sequences) >= self.beam_size: break

        return [[torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences], [list(sequences.values()) for sequences in self.finished_sequences]
class DecodingTask:
    """One decoding pass (greedy or beam search) over a batch of audio segments.

    The constructor builds the tokenizer, the initial token sequence, the
    inference wrapper, the decoder and the list of logit filters from
    ``options``; ``run`` then produces one ``DecodingResult`` per audio item.
    """

    def __init__(self, model, options):
        self.model = model

        language = options.language or "en"
        tokenizer = get_tokenizer(model.is_multilingual, num_languages=model.num_languages, language=language, task=options.task)

        self.tokenizer = tokenizer
        self.options = self._verify_options(options)

        self.n_group = options.beam_size or options.best_of or 1
        self.n_ctx = model.dims.n_text_ctx
        self.sample_len = options.sample_len or model.dims.n_text_ctx // 2

        self.sot_sequence = tokenizer.sot_sequence
        if self.options.without_timestamps:
            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps

        self.initial_tokens = self._get_initial_tokens()
        self.sample_begin = len(self.initial_tokens)
        self.sot_index = self.initial_tokens.index(tokenizer.sot)
        self.inference = PyTorchInference(model, len(self.initial_tokens))
        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)

        if options.beam_size is not None:
            self.decoder = BeamSearchDecoder(options.beam_size, tokenizer.eot, self.inference, options.patience)
        else:
            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)

        self.logit_filters = []

        if self.options.suppress_blank:
            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
        if self.options.suppress_tokens:
            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))

        if not options.without_timestamps:
            max_initial_timestamp_index = None
            if options.max_initial_timestamp:
                max_initial_timestamp_index = round(self.options.max_initial_timestamp / (CHUNK_LENGTH / model.dims.n_audio_ctx))

            self.logit_filters.append(ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index))

    def _verify_options(self, options):
        """Reject contradictory option combinations; return ``options`` unchanged.

        Raises:
            ValueError: on an invalid combination (see messages).
        """
        if options.beam_size is not None and options.best_of is not None:
            raise ValueError("beam_size and best_of can't be given together")
        if options.temperature == 0 and options.best_of is not None:
            raise ValueError("best_of with greedy sampling (temperature=0) is not compatible")
        if options.patience is not None and options.beam_size is None:
            raise ValueError("patience requires beam_size to be given")
        if options.length_penalty is not None and not (0 <= options.length_penalty <= 1):
            raise ValueError("length_penalty should be a value between 0 and 1")

        return options

    def _get_initial_tokens(self):
        """Build the token prefix fed to the decoder: [prompt] + sot sequence + [prefix]."""
        tokens = list(self.sot_sequence)

        if prefix := self.options.prefix:
            prefix_tokens = (self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix)
            # NOTE(review): when n_ctx // 2 == sample_len the slice below is [-0:],
            # which keeps the whole prefix rather than none of it. Left unchanged
            # because callers may rely on it.
            if self.sample_len is not None:
                prefix_tokens = prefix_tokens[-(self.n_ctx // 2 - self.sample_len):]
            tokens = tokens + prefix_tokens

        if prompt := self.options.prompt:
            prompt_tokens = self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt
            tokens = ([self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1) :] + tokens)

        return tuple(tokens)

    def _get_suppress_tokens(self):
        """Return the sorted, de-duplicated tuple of token ids to suppress.

        ``options.suppress_tokens`` may be None, a comma-separated string or
        an iterable of ints; ``-1`` expands to the tokenizer's non-speech
        tokens. The task / start-of-* special tokens (and ``no_speech`` when
        defined) are always added. The caller's value is never modified.
        """
        raw = self.options.suppress_tokens

        if raw is None:
            suppress_tokens = []
        elif isinstance(raw, str):
            suppress_tokens = [int(t) for t in raw.split(",") if t.strip()]
        else:
            # Work on a copy: the list is extended below and must not leak
            # back into the options object.
            suppress_tokens = list(raw)

        if -1 in suppress_tokens:
            suppress_tokens = [t for t in suppress_tokens if t >= 0]
            suppress_tokens.extend(self.tokenizer.non_speech_tokens)

        suppress_tokens.extend([self.tokenizer.transcribe, self.tokenizer.translate, self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm])

        if self.tokenizer.no_speech is not None:
            suppress_tokens.append(self.tokenizer.no_speech)

        return tuple(sorted(set(suppress_tokens)))

    def _get_audio_features(self, mel):
        """Return encoder features for ``mel``.

        ``mel`` is passed through untouched when its last two dimensions
        already equal ``(n_audio_ctx, n_audio_state)``; otherwise it is run
        through the encoder.

        Raises:
            TypeError: if the features do not have the dtype implied by
                ``options.fp16``.
        """
        if self.options.fp16:
            mel = mel.half()

        already_encoded = mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state)
        audio_features = mel if already_encoded else self.model.encoder(mel)

        expected_dtype = torch.float16 if self.options.fp16 else torch.float32
        if audio_features.dtype != expected_dtype:
            raise TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")

        return audio_features

    def _detect_language(self, audio_features, tokens):
        """Return per-item languages and language probabilities.

        Detection runs only when no language was requested or the task is
        ``"lang_id"``; when no language was requested the detected language
        token is written into ``tokens`` in place.
        """
        languages = [self.options.language] * audio_features.shape[0]
        lang_probs = None

        if self.options.language is None or self.options.task == "lang_id":
            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
            languages = [max(probs, key=probs.get) for probs in lang_probs]

            if self.options.language is None:
                tokens[:, self.sot_index + 1] = lang_tokens

        return languages, lang_probs

    def _main_loop(self, audio_features, tokens):
        """Sample up to ``sample_len`` tokens.

        Returns:
            ``(tokens, sum_logprobs, no_speech_probs)``.
        """
        n_batch = tokens.shape[0]
        sum_logprobs = torch.zeros(n_batch, device=audio_features.device)
        no_speech_probs = [np.nan] * n_batch

        try:
            for i in range(self.sample_len):
                logits = self.inference.logits(tokens, audio_features)

                if (i == 0 and self.tokenizer.no_speech is not None):
                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()

                logits = logits[:, -1]

                if opencl.is_available():
                    # The filters edit logits in place. Run them on a CPU copy
                    # and move the filtered copy back, otherwise the edits are
                    # made on a temporary and lost.
                    filtered = logits.to("cpu")
                    for logit_filter in self.logit_filters:
                        logit_filter.apply(filtered, tokens)
                    logits = filtered.to(logits.device)
                else:
                    for logit_filter in self.logit_filters:
                        logit_filter.apply(logits, tokens)

                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)
                if completed or tokens.shape[-1] > self.n_ctx:
                    break
        finally:
            # Always drop the hooks and cache, even if decoding failed.
            self.inference.cleanup_caching()

        return tokens, sum_logprobs, no_speech_probs

    @torch.no_grad()
    def run(self, mel):
        """Decode ``mel`` and return one ``DecodingResult`` per audio item.

        Raises:
            RuntimeError: if the per-item result fields end up with
                different lengths.
        """
        self.decoder.reset()
        tokenizer = self.tokenizer
        n_audio = mel.shape[0]

        audio_features = self._get_audio_features(mel)
        tokens = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)

        languages, language_probs = self._detect_language(audio_features, tokens)
        if self.options.task == "lang_id":
            return [DecodingResult(audio_features=features, language=language, language_probs=probs) for features, language, probs in zip(audio_features, languages, language_probs)]

        # NOTE(review): tokens are repeated n_group times but audio_features are
        # not, while both are strided by n_group below. This looks consistent
        # only for a single audio item or n_group == 1; confirm before batching.
        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)

        audio_features = audio_features[:: self.n_group]
        no_speech_probs = no_speech_probs[:: self.n_group]

        assert audio_features.shape[0] == len(no_speech_probs) == n_audio

        tokens = tokens.reshape(n_audio, self.n_group, -1)
        sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)

        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
        # Keep only the sampled part of each candidate, up to its first EOT.
        tokens = [[t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens]

        selected = self.sequence_ranker.rank(tokens, sum_logprobs)
        tokens = [t[i].tolist() for i, t in zip(selected, tokens)]

        texts = [tokenizer.decode(t).strip() for t in tokens]
        chosen_logprobs = [lp[i] for i, lp in zip(selected, sum_logprobs)]
        avg_logprobs = [lp / (len(t) + 1) for t, lp in zip(tokens, chosen_logprobs)]

        fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs)
        if len(set(map(len, fields))) != 1:
            raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")

        return [DecodingResult(audio_features=features, language=language, tokens=tokens, text=text, avg_logprob=avg_logprob, no_speech_prob=no_speech_prob, temperature=self.options.temperature, compression_ratio=compression_ratio(text)) for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields)]
def __init__(self, encoding_name, num_languages = 2, language = None, task = None, sot_sequence = ()):
    """Load the encoding and derive the start-of-transcript token sequence.

    The ``sot_sequence`` argument is stored first but then always replaced
    by the sequence computed from ``language`` and ``task``.
    """
    self.encoding = get_encoding(name=encoding_name, num_languages=num_languages)
    self.num_languages = num_languages
    self.language = language
    self.task = task
    self.sot_sequence = sot_sequence
    # Map every special token string of the encoding to its id.
    self.special_tokens = {
        name: self.encoding.encode_single_token(name)
        for name in self.encoding.special_tokens_set
    }

    start_token = self.special_tokens["<|startoftranscript|>"]
    known_languages = tuple(LANGUAGES.keys())[: self.num_languages]
    sequence = [start_token]

    if self.language is not None:
        # Language token ids are computed as an offset from the start token.
        sequence.append(start_token + 1 + known_languages.index(self.language))

    if self.task is not None:
        task_key = "<|transcribe|>" if self.task == "transcribe" else "<|translate|>"
        sequence.append(self.special_tokens[task_key])

    self.sot_sequence = tuple(sequence)
def to_language_token(self, language):
    """Return the id of the ``<|language|>`` special token.

    Raises:
        KeyError: if the tokenizer has no token for ``language``.
    """
    token = self.special_tokens.get(f"<|{language}|>", None)
    # Compare against None explicitly so a token id of 0 is not mistaken
    # for "not found".
    if token is not None:
        return token

    raise KeyError(f"Language {language} not found in tokenizer.")
def check_assets(f0_method, hubert, f0_onnx=False, embedders_mode="fairseq"):
    """Make sure the predictor and embedder files needed for inference exist.

    Missing files are downloaded. The whole check is retried up to
    ``configs["num_of_restart"]`` times (default 5); if it never succeeds a
    warning is logged and the process exits with status 1.

    Args:
        f0_method: pitch method name, or ``hybrid[a+b+...]`` for several.
        hubert: embedder model name (without extension).
        f0_onnx: use the ``.onnx`` variant of the predictor files.
        embedders_mode: ``fairseq``, ``onnx``, ``transformers``, ``spin``
            (treated as ``transformers``) or ``whisper``.
    """
    predictors_url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cerqvpgbef/", "rot13")
    embedders_url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/rzorqqref/", "rot13")
    if embedders_mode == "spin": embedders_mode = "transformers"

    def download_predictor(predictor):
        # Download the predictor if absent; report whether it exists afterwards.
        model_path = os.path.join(configs["predictors_path"], predictor)

        if not os.path.exists(model_path):
            huggingface.HF_download_file(
                predictors_url + predictor,
                model_path
            )

        return os.path.exists(model_path)

    def download_embedder(embedders_mode, hubert):
        # Download the embedder file(s) if absent; report whether they exist afterwards.
        model_path = os.path.join(configs["speaker_diarization_path"], "models", hubert) if embedders_mode == "whisper" else os.path.join(configs["embedders_path"], hubert)

        if embedders_mode != "transformers" and not os.path.exists(model_path):
            if embedders_mode == "whisper":
                huggingface.HF_download_file("".join([codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/fcrnxre_qvnevmngvba/", "rot13"), hubert]), model_path)
            else:
                huggingface.HF_download_file("".join([embedders_url, "fairseq/" if embedders_mode == "fairseq" else "onnx/", hubert]), model_path)
        elif embedders_mode == "transformers":
            # Transformers-style embedders are a directory with two files.
            url = "transformers/" if not hubert.startswith("spin") else "spin/"

            bin_file = os.path.join(model_path, "model.safetensors")
            config_file = os.path.join(model_path, "config.json")

            os.makedirs(model_path, exist_ok=True)

            if not os.path.exists(bin_file): huggingface.HF_download_file("".join([embedders_url, url, hubert, "/model.safetensors"]), bin_file)
            if not os.path.exists(config_file): huggingface.HF_download_file("".join([embedders_url, url, hubert, "/config.json"]), config_file)

            return os.path.exists(bin_file) and os.path.exists(config_file)

        return os.path.exists(model_path)

    def get_modelname(f0_method, f0_onnx=False):
        # Map a pitch method name to its predictor file name (None if it needs no file).
        suffix = ".onnx" if f0_onnx else (".pt" if "crepe" not in f0_method else ".pth")

        if "rmvpe" in f0_method:
            modelname = "rmvpe"
        elif "fcpe" in f0_method:
            modelname = ("fcpe" + ("_legacy" if "legacy" in f0_method and "previous" not in f0_method else "")) if "previous" in f0_method else "ddsp_200k"
        elif "crepe" in f0_method:
            modelname = "crepe_" + f0_method.replace("mangio-", "").split("-")[1]
        elif "penn" in f0_method:
            modelname = "fcn"
        elif "djcm" in f0_method:
            modelname = "djcm"
        elif "pesto" in f0_method:
            modelname = "pesto"
        elif "swift" in f0_method:
            return "swift.onnx"
        else:
            return None

        return modelname + suffix

    # Resolve what is needed once, before the retry loop, so a retry checks
    # exactly the same files as the first attempt.
    if "hybrid" in f0_method:
        methods_match = re.search(r"hybrid\[(.+)\]", f0_method)
        # A malformed "hybrid" string simply yields no predictor to check.
        methods = [name.strip() for name in methods_match.group(1).split("+")] if methods_match else []
    else:
        methods = [f0_method]

    predictor_files = [name for name in (get_modelname(method, f0_onnx) for method in methods) if name is not None]

    embedder_file = None
    if hubert in embedders_model + spin_model + whisper_model:
        embedder_file = hubert
        if embedders_mode != "transformers":
            embedder_file += ".pt" if embedders_mode in ["fairseq", "whisper"] else ".onnx"

    count = configs.get("num_of_restart", 5)

    for _ in range(count):
        results = [download_predictor(name) for name in predictor_files]
        if embedder_file is not None: results.append(download_embedder(embedders_mode, embedder_file))

        if all(results): return

    logger.warning(translations["check_assets_error"].format(count=count))
    sys.exit(1)
def load_audio(file, sample_rate=16000, formant_shifting=False, formant_qfrency=0.8, formant_timbre=0.8):
    """Load an audio file as a flat mono array at ``sample_rate``.

    The file is read with soundfile first and with librosa as a fallback,
    mixed down to mono, resampled if needed and optionally formant-shifted.

    Args:
        file: path to the audio file; surrounding spaces, quotes and
            newlines are stripped.
        sample_rate: target sample rate.
        formant_shifting: apply ``StftPitchShift`` after loading.
        formant_qfrency: quefrency passed to the shifter (scaled by 1e-3).
        formant_timbre: distortion passed to the shifter.

    Raises:
        RuntimeError: if the file is missing or cannot be decoded or
            processed; the original exception is chained as the cause.
    """
    import librosa
    import soundfile as sf

    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not os.path.isfile(file): raise FileNotFoundError(translations["not_found"].format(name=file))

        try:
            audio, sr = sf.read(file, dtype=np.float32)
        except Exception:
            # soundfile could not decode this file; let librosa try. Only
            # real errors are caught, so Ctrl-C and SystemExit still propagate.
            audio, sr = librosa.load(file, sr=None)

        if len(audio.shape) > 1: audio = librosa.to_mono(audio.T)
        if sr != sample_rate: audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq")

        if formant_shifting:
            from main.library.algorithm.stftpitchshift import StftPitchShift

            pitchshifter = StftPitchShift(1024, 32, sample_rate)
            audio = pitchshifter.shiftpitch(audio, factors=1, quefrency=formant_qfrency * 1e-3, distortion=formant_timbre)
    except Exception as e:
        raise RuntimeError(f"{translations['errors_loading_audio']}: {e}") from e

    return audio.flatten()
def cut(audio, sr, db_thresh=-60, min_interval=250):
    """Slice ``audio`` with ``Slicer2`` and return the result of ``slice2``.

    Args:
        audio: samples to slice.
        sr: sample rate handed to the slicer.
        db_thresh: threshold handed to the slicer.
        min_interval: minimum interval handed to the slicer.
    """
    from main.inference.preprocess.slicer2 import Slicer2

    return Slicer2(sr=sr, threshold=db_thresh, min_interval=min_interval).slice2(audio)
def autotune_f0(note_dict, f0, f0_autotune_strength):
    """Pull every pitch value towards its nearest reference note.

    Args:
        note_dict: iterable of reference frequencies.
        f0: array of pitch values.
        f0_autotune_strength: 0 leaves ``f0`` unchanged, 1 snaps each value
            onto the nearest note.

    Returns:
        A new array shaped like ``f0``; the input is not modified.
    """
    def nearest_note(freq):
        return min(note_dict, key=lambda note: abs(note - freq))

    tuned = np.zeros_like(f0)

    for position, freq in enumerate(f0):
        correction = nearest_note(freq) - freq
        tuned[position] = freq + correction * f0_autotune_strength

    return tuned
def circular_write(new_data, target):
    """Shift ``target`` left and append ``new_data`` at its end, in place.

    The first ``len(new_data)`` entries of ``target`` are dropped, the rest
    move to the front and ``new_data`` fills the freed tail.

    Args:
        new_data: tensor to append (length along dim 0 is the shift amount).
        target: buffer tensor, modified in place.

    Returns:
        ``target`` itself.
    """
    offset = new_data.shape[0]

    # Nothing to write: without this guard ``target[:-0]`` is an empty slice
    # and the assignment below fails with a shape mismatch.
    if offset == 0:
        return target

    # Clone the source slice because it overlaps the destination slice.
    target[: -offset] = target[offset :].detach().clone()
    target[-offset :] = new_data

    return target
def __init__(self, config):
    """Initialise the shared separator state from a configuration mapping.

    Args:
        config: Mapping queried with ``.get``; a missing key leaves the
            matching attribute as ``None``.
    """
    self.logger = config.get("logger")
    self.torch_device = config.get("torch_device")
    self.torch_device_cpu = config.get("torch_device_cpu")
    self.torch_device_mps = config.get("torch_device_mps")
    self.onnx_execution_provider = config.get("onnx_execution_provider")
    self.model_name = config.get("model_name")
    self.model_path = config.get("model_path")
    self.model_data = config.get("model_data")
    self.output_dir = config.get("output_dir")
    self.output_format = config.get("output_format")
    self.output_bitrate = config.get("output_bitrate")
    self.normalization_threshold = config.get("normalization_threshold")
    self.enable_denoise = config.get("enable_denoise")
    self.output_single_stem = config.get("output_single_stem")
    self.invert_using_spec = config.get("invert_using_spec")
    self.sample_rate = config.get("sample_rate")
    self.primary_stem_name = None
    self.secondary_stem_name = None

    # A missing / None "model_data" is treated as empty metadata instead of
    # failing with TypeError on the membership tests below.
    model_data = self.model_data or {}

    # Preferred source of stem names: model_data["training"]["instruments"].
    training = model_data["training"] if "training" in model_data else None
    if training and "instruments" in training:
        instruments = training["instruments"]
        if instruments:
            self.primary_stem_name = instruments[0]
            if len(instruments) > 1:
                self.secondary_stem_name = instruments[1]
            else:
                self.secondary_stem_name = self.secondary_stem(self.primary_stem_name)

    # Fallback: the "primary_stem" entry, defaulting to "Vocals".
    if self.primary_stem_name is None:
        self.primary_stem_name = model_data.get("primary_stem", "Vocals")
        self.secondary_stem_name = self.secondary_stem(self.primary_stem_name)

    self.is_karaoke = model_data.get("is_karaoke", False)
    self.is_bv_model = model_data.get("is_bv_model", False)
    self.bv_model_rebalance = model_data.get("is_bv_model_rebalanced", 0)

    # Per-file state, reset by clear_file_specific_paths().
    self.audio_file_path = None
    self.audio_file_base = None
    self.primary_source = None
    self.secondary_source = None
    self.primary_stem_output_path = None
    self.secondary_stem_output_path = None
    self.cached_sources_map = {}


def secondary_stem(self, primary_stem):
    """Return the name of the stem that complements ``primary_stem``.

    Uses ``STEM_PAIR_MAPPER`` when the stem is listed there; otherwise the
    ``NO_STEM`` marker is removed if present, or prepended if absent.
    A falsy ``primary_stem`` is treated as ``NO_STEM`` itself.
    """
    primary_stem = primary_stem if primary_stem else self.NO_STEM

    if primary_stem in self.STEM_PAIR_MAPPER:
        return self.STEM_PAIR_MAPPER[primary_stem]
    if self.NO_STEM in primary_stem:
        return primary_stem.replace(self.NO_STEM, "")
    return f"{self.NO_STEM}{primary_stem}"


def separate(self, audio_file_path):
    """Placeholder hook; this implementation does nothing and returns None."""
    pass


def final_process(self, stem_path, source, stem_name):
    """Write ``source`` to ``stem_path`` and return ``{stem_name: source}``."""
    self.write_audio(stem_path, source)
    return {stem_name: source}


def cached_sources_clear(self):
    """Drop every cached source."""
    self.cached_sources_map = {}


def cached_source_callback(self, model_architecture, model_name=None):
    """Look up cached sources for a model of the given architecture.

    A cache key matches when ``model_name`` is contained in it (``in``
    test); when several keys match, the last one iterated wins.

    Returns:
        ``(key, sources)`` for the match, or ``(None, None)`` when nothing is
        cached for the architecture, no key matches, or ``model_name`` is
        ``None``.
    """
    model, sources = None, None

    # .get(): an architecture that was never cached is a miss, not a KeyError.
    mapper = self.cached_sources_map.get(model_architecture, {})
    if model_name is None:
        return model, sources

    for key, value in mapper.items():
        # Entries stored without a model name have key None; "in None" would raise.
        if key is not None and model_name in key:
            model = key
            sources = value

    return model, sources


def cached_model_source_holder(self, model_architecture, sources, model_name=None):
    """Store ``sources`` under ``model_name`` for ``model_architecture``."""
    existing = self.cached_sources_map.get(model_architecture, {})
    self.cached_sources_map[model_architecture] = {**existing, model_name: sources}


def prepare_mix(self, mix):
    """Return ``mix`` as an array, duplicating a 1-D signal into two rows.

    A non-array ``mix`` is passed to ``librosa.load`` (``mono=False``,
    ``sr=self.sample_rate``); an array is transposed.
    """
    if isinstance(mix, np.ndarray):
        mix = mix.T
    else:
        mix, _ = librosa.load(mix, mono=False, sr=self.sample_rate)

    if mix.ndim == 1:
        mix = np.asfortranarray([mix, mix])

    return mix


def write_audio(self, stem_path, stem_source):
    """Write a stem, using soundfile for inputs of one hour or more and pydub otherwise."""
    # The file is loaded at its native rate (sr=None), so that rate must be
    # passed on: get_duration() otherwise assumes 22050 Hz and over-reports
    # the duration of e.g. 44.1 kHz material by a factor of two.
    samples, native_rate = librosa.load(self.audio_file_path, sr=None)
    duration_seconds = librosa.get_duration(y=samples, sr=native_rate)
    duration_hours = duration_seconds / 3600

    if duration_hours >= 1:
        self.write_audio_soundfile(stem_path, stem_source)
    else:
        self.write_audio_pydub(stem_path, stem_source)


def write_audio_pydub(self, stem_path, stem_source):
    """Normalise a two-column stem and export it through pydub.

    Nothing is written when the normalised peak is below 1e-6. The export
    format is taken from the file extension of ``stem_path``.
    """
    stem_source = normalize(wave=stem_source, max_peak=self.normalization_threshold)

    if np.max(np.abs(stem_source)) < 1e-6:
        return

    if self.output_dir:
        os.makedirs(self.output_dir, exist_ok=True)
        stem_path = os.path.join(self.output_dir, stem_path)

    if stem_source.dtype != np.int16:
        stem_source = (stem_source * 32767).astype(np.int16)

    # Interleave columns 0 and 1 as L R L R ... for the raw PCM buffer.
    interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16)
    interleaved[0::2] = stem_source[:, 0]
    interleaved[1::2] = stem_source[:, 1]

    audio_segment = AudioSegment(
        interleaved.tobytes(),
        frame_rate=self.sample_rate,
        sample_width=stem_source.dtype.itemsize,
        channels=2,
    )

    file_format = stem_path.lower().split(".")[-1]
    if file_format == "m4a":
        file_format = "mp4"
    elif file_format == "mka":
        file_format = "matroska"

    bitrate = "320k" if file_format == "mp3" and self.output_bitrate is None else self.output_bitrate
    audio_segment.export(stem_path, format=file_format, bitrate=bitrate)


def write_audio_soundfile(self, stem_path, stem_source):
    """Write a stem with soundfile.

    The array is handed to ``sf.write`` as-is apart from being made
    C-contiguous. The previous manual interleave copied float samples into an
    int16 buffer (truncating them) and produced a 1-D array, which soundfile
    stores as a mono signal of twice the length.
    """
    # Same destination rule as write_audio_pydub so both writers agree.
    if self.output_dir:
        os.makedirs(self.output_dir, exist_ok=True)
        stem_path = os.path.join(self.output_dir, stem_path)

    # NOTE(review): unlike write_audio_pydub, no normalisation or silence
    # check is applied here — confirm whether that is intended.
    sf.write(stem_path, np.ascontiguousarray(stem_source), self.sample_rate)


def clear_file_specific_paths(self):
    """Reset every attribute that refers to the file currently being processed."""
    self.audio_file_path = None
    self.audio_file_base = None
    self.primary_source = None
    self.secondary_source = None
    self.primary_stem_output_path = None
    self.secondary_stem_output_path = None
# Progress bookkeeping shared by the recursive apply_model() calls. Defined at
# module level so that calling apply_model() on a single model with a progress
# callback no longer fails with NameError before any bag has been processed.
fut_length = 0
bag_num = 1
prog_bar = 0


class DummyPoolExecutor:
    """Executor look-alike that runs submitted work lazily in the calling thread."""

    class DummyResult:
        """Future look-alike: the call is executed when ``result()`` is requested."""

        def __init__(self, func, *args, **kwargs):
            self.func = func
            self.args = args
            self.kwargs = kwargs

        def result(self):
            return self.func(*self.args, **self.kwargs)

    def __init__(self, workers=0):
        # ``workers`` is accepted for signature parity with real executors and ignored.
        pass

    def submit(self, func, *args, **kwargs):
        return DummyPoolExecutor.DummyResult(func, *args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        return


class BagOfModels(torch.nn.Module):
    """Container for several models with identical sources, sample rate and channels.

    Args:
        models: Non-empty list of sub-models.
        weights: Optional per-model list holding one weight per source;
            defaults to 1.0 everywhere.
        segment: When not None, assigned to ``segment`` on every sub-model.
    """

    def __init__(self, models, weights = None, segment = None):
        super().__init__()
        assert len(models) > 0
        first = models[0]

        for other in models:
            assert other.sources == first.sources
            assert other.samplerate == first.samplerate
            assert other.audio_channels == first.audio_channels

            if segment is not None:
                other.segment = segment

        self.audio_channels = first.audio_channels
        self.samplerate = first.samplerate
        self.sources = first.sources
        self.models = torch.nn.ModuleList(models)

        if weights is None:
            weights = [[1.0 for _ in first.sources] for _ in models]
        else:
            assert len(weights) == len(models)

            for weight in weights:
                assert len(weight) == len(first.sources)

        self.weights = weights

    def forward(self, x):
        # A bag is evaluated through apply_model(); returning None here hid misuse.
        raise NotImplementedError("Call `apply_model` on this.")


class TensorChunk:
    """View of ``tensor[..., offset:offset + length]`` that can be padded on demand.

    Wrapping another ``TensorChunk`` composes the offsets and keeps a reference
    to the underlying tensor.
    """

    def __init__(self, tensor, offset=0, length=None):
        total_length = tensor.shape[-1]
        assert offset >= 0
        assert offset < total_length

        length = total_length - offset if length is None else min(total_length - offset, length)

        if isinstance(tensor, TensorChunk):
            self.tensor = tensor.tensor
            self.offset = offset + tensor.offset
        else:
            self.tensor = tensor
            self.offset = offset

        self.length = length
        self.device = tensor.device

    @property
    def shape(self):
        shape = list(self.tensor.shape)
        shape[-1] = self.length
        return shape

    def padded(self, target_length):
        """Return a tensor of ``target_length`` samples centred on the chunk.

        Context is taken from the underlying tensor where available; the
        remainder is zero padding.
        """
        delta = target_length - self.length
        total_length = self.tensor.shape[-1]
        assert delta >= 0

        start = self.offset - delta // 2
        end = start + target_length

        correct_start = max(0, start)
        correct_end = min(total_length, end)

        pad_left = correct_start - start
        pad_right = end - correct_end

        out = torch.nn.functional.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right))

        assert out.shape[-1] == target_length
        return out


def tensor_chunk(tensor_or_chunk):
    """Return the argument as a ``TensorChunk``, wrapping plain tensors."""
    if isinstance(tensor_or_chunk, TensorChunk):
        return tensor_or_chunk

    assert isinstance(tensor_or_chunk, torch.Tensor)
    return TensorChunk(tensor_or_chunk)


def apply_model(model, mix, shifts=1, split=True, overlap=0.25, transition_power=1.0, static_shifts=1, set_progress_bar=None, device=None, progress=False, num_workers=0, pool=None):
    """Apply ``model`` (or every model of a ``BagOfModels``) to ``mix``.

    The function recurses: a bag dispatches to each sub-model, ``shifts``
    averages predictions over random time shifts, ``split`` processes
    overlapping segments blended with a triangular window, and the innermost
    call runs the network on one padded chunk.

    Args:
        model: Model or ``BagOfModels``.
        mix: Tensor or ``TensorChunk`` unpacked as (batch, channels, length).
        shifts: Number of random shifts to average; 0 disables.
        split: Process the mixture in segments of ``model.segment`` seconds.
        overlap: Fractional overlap between consecutive segments.
        transition_power: Exponent applied to the blending window (>= 1).
        static_shifts: Multiplier used only for the progress total.
        set_progress_bar: Optional callback ``(base, increment)``.
        device: Device to run on; defaults to ``mix.device``.
        progress: Wrap the segment loop in a tqdm bar.
        num_workers: Thread count used on CPU when ``pool`` is None.
        pool: Executor used to schedule segments.

    Returns:
        Tensor of estimates (batch, sources, channels, length).
    """
    global fut_length, bag_num, prog_bar

    device = mix.device if device is None else torch.device(device)
    if pool is None:
        if num_workers > 0 and device.type == "cpu":
            pool = concurrent.futures.ThreadPoolExecutor(num_workers)
        else:
            pool = DummyPoolExecutor()

    kwargs = {
        "shifts": shifts,
        "split": split,
        "overlap": overlap,
        "transition_power": transition_power,
        "progress": progress,
        "device": device,
        "pool": pool,
        "set_progress_bar": set_progress_bar,
        "static_shifts": static_shifts,
    }

    if isinstance(model, BagOfModels):
        estimates, fut_length, prog_bar = 0, 0, 0
        totals = [0] * len(model.sources)
        bag_num = len(model.models)

        for sub_model, weight in zip(model.models, model.weights):
            # Move each sub-model to the target device only while it runs.
            original_model_device = next(iter(sub_model.parameters())).device
            sub_model.to(device)
            out = apply_model(sub_model, mix, **kwargs)
            sub_model.to(original_model_device)

            for k, inst_weight in enumerate(weight):
                out[:, k, :, :] *= inst_weight
                totals[k] += inst_weight

            estimates += out
            del out

        for k in range(estimates.shape[1]):
            estimates[:, k, :, :] /= totals[k]

        return estimates

    model.to(device)
    model.eval()
    assert transition_power >= 1
    batch, channels, length = mix.shape

    if shifts:
        kwargs["shifts"] = 0
        max_shift = int(0.5 * model.samplerate)
        mix = tensor_chunk(mix)
        padded_mix = mix.padded(length + 2 * max_shift)
        out = 0

        for _ in range(shifts):
            offset = random.randint(0, max_shift)
            shifted = TensorChunk(padded_mix, offset, length + max_shift - offset)
            shifted_out = apply_model(model, shifted, **kwargs)
            out += shifted_out[..., max_shift - offset :]

        out /= shifts
        return out

    if split:
        kwargs["split"] = False
        out = torch.zeros(batch, len(model.sources), channels, length, device=mix.device)
        sum_weight = torch.zeros(length, device=mix.device)
        segment = int(model.samplerate * model.segment)
        stride = int((1 - overlap) * segment)

        # Triangular window: ramps up to the segment centre and back down.
        weight = torch.cat([torch.arange(1, segment // 2 + 1, device=device), torch.arange(segment - segment // 2, 0, -1, device=device)])
        assert len(weight) == segment
        weight = (weight / weight.max()) ** transition_power

        futures = []
        for offset in range(0, length, stride):
            chunk = TensorChunk(mix, offset, segment)
            futures.append((pool.submit(apply_model, model, chunk, **kwargs), offset))

        total_chunks = len(futures)
        if progress:
            futures = tqdm.tqdm(futures)

        for future, offset in futures:
            if set_progress_bar:
                fut_length = total_chunks * bag_num * static_shifts
                prog_bar += 1
                set_progress_bar(0.1, (0.8 / fut_length * prog_bar))

            chunk_out = future.result()
            chunk_length = chunk_out.shape[-1]

            out[..., offset : offset + segment] += (weight[:chunk_length].to(device) * chunk_out).to(mix.device)
            sum_weight[offset : offset + segment] += weight[:chunk_length].to(mix.device)

        assert sum_weight.min() > 0

        out /= sum_weight
        return out

    valid_length = model.valid_length(length) if hasattr(model, "valid_length") else length
    mix = tensor_chunk(mix)
    padded_mix = mix.padded(valid_length).to(device)

    with torch.no_grad():
        out = model(padded_mix)

    return center_trim(out, length)


def demucs_segments(demucs_segment, demucs_model):
    """Apply a user-selected segment length to ``demucs_model``.

    ``"Default"`` or a value that cannot be converted to ``int`` leaves the
    model untouched. Otherwise the integer is assigned to ``segment`` on the
    model itself, or on every sub-model of a ``BagOfModels``.

    Returns:
        The same ``demucs_model`` object.
    """
    if demucs_segment == "Default":
        return demucs_model

    try:
        segment = int(demucs_segment)
    except (TypeError, ValueError):
        return demucs_model

    # The single-model case previously assigned to an undefined ``sub``; the
    # resulting error was swallowed and the segment was never applied.
    targets = demucs_model.models if isinstance(demucs_model, BagOfModels) else [demucs_model]
    for sub in targets:
        sub.segment = segment

    return demucs_model
module.modules(): - if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)): rescale_conv(sub, reference) - -class BLSTM(nn.Module): - def __init__(self, dim, layers=1, max_steps=None, skip=False): - super().__init__() - assert max_steps is None or max_steps % 4 == 0 - self.max_steps = max_steps - self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim) - self.linear = nn.Linear(2 * dim, dim) - self.skip = skip - - def forward(self, x): - B, C, T = x.shape - y = x - framed = False - - if self.max_steps is not None and T > self.max_steps: - width = self.max_steps - stride = width // 2 - frames = unfold(x, width, stride) - nframes = frames.shape[2] - framed = True - x = frames.permute(0, 2, 1, 3).reshape(-1, C, width) - - x = x.permute(2, 0, 1) - x = self.lstm(x)[0] - x = self.linear(x) - x = x.permute(1, 2, 0) - - if framed: - out = [] - frames = x.reshape(B, -1, C, width) - limit = stride // 2 - - for k in range(nframes): - if k == 0: out.append(frames[:, k, :, :-limit]) - elif k == nframes - 1: out.append(frames[:, k, :, limit:]) - else: out.append(frames[:, k, :, limit:-limit]) - - out = torch.cat(out, -1) - out = out[..., :T] - x = out - - if self.skip: x = x + y - return x - -class LayerScale(nn.Module): - def __init__(self, channels, init = 0): - super().__init__() - self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True)) - self.scale.data[:] = init - - def forward(self, x): - return self.scale[:, None] * x - -class DConv(nn.Module): - def __init__(self, channels, compress = 4, depth = 2, init = 1e-4, norm=True, attn=False, heads=4, ndecay=4, lstm=False, gelu=True, kernel=3, dilate=True): - super().__init__() - assert kernel % 2 == 1 - self.channels = channels - self.compress = compress - self.depth = abs(depth) - dilate = depth > 0 - norm_fn = lambda d: nn.Identity() - if norm: norm_fn = lambda d: nn.GroupNorm(1, d) - hidden = int(channels / compress) - act = nn.GELU if gelu else 
nn.ReLU - self.layers = nn.ModuleList([]) - - for d in range(self.depth): - dilation = 2**d if dilate else 1 - padding = dilation * (kernel // 2) - - mods = [nn.Conv1d(channels, hidden, kernel, dilation=dilation, padding=padding), norm_fn(hidden), act(), nn.Conv1d(hidden, 2 * channels, 1), norm_fn(2 * channels), nn.GLU(1), LayerScale(channels, init)] - - if attn: mods.insert(3, LocalState(hidden, heads=heads, ndecay=ndecay)) - if lstm: mods.insert(3, BLSTM(hidden, layers=2, max_steps=200, skip=True)) - layer = nn.Sequential(*mods) - self.layers.append(layer) - - def forward(self, x): - for layer in self.layers: - x = x + layer(x) - - return x - -class LocalState(nn.Module): - def __init__(self, channels, heads = 4, nfreqs = 0, ndecay = 4): - super().__init__() - assert channels % heads == 0, (channels, heads) - self.heads = heads - self.nfreqs = nfreqs - self.ndecay = ndecay - self.content = nn.Conv1d(channels, channels, 1) - self.query = nn.Conv1d(channels, channels, 1) - self.key = nn.Conv1d(channels, channels, 1) - - if nfreqs: self.query_freqs = nn.Conv1d(channels, heads * nfreqs, 1) - - if ndecay: - self.query_decay = nn.Conv1d(channels, heads * ndecay, 1) - self.query_decay.weight.data *= 0.01 - assert self.query_decay.bias is not None - self.query_decay.bias.data[:] = -2 - - self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1) - - def forward(self, x): - B, C, T = x.shape - heads = self.heads - indexes = torch.arange(T, device=x.device, dtype=x.dtype) - delta = indexes[:, None] - indexes[None, :] - queries = self.query(x).view(B, heads, -1, T) - keys = self.key(x).view(B, heads, -1, T) - dots = torch.einsum("bhct,bhcs->bhts", keys, queries) - dots /= keys.shape[2] ** 0.5 - - if self.nfreqs: - periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype) - freq_kernel = (2 * math.pi * delta / periods.view(-1, 1, 1)).cos() - freq_q = self.query_freqs(x).view(B, heads, -1, T) / self.nfreqs**0.5 - dots += torch.einsum("fts,bhfs->bhts", 
freq_kernel, freq_q) - - if self.ndecay: - decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype) - decay_q = self.query_decay(x).view(B, heads, -1, T) - decay_q = decay_q.sigmoid() / 2 - decay_kernel = -decays.view(-1, 1, 1) * delta.abs() / self.ndecay**0.5 - dots += torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q) - - dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100) - weights = dots.softmax(dim=2) - content = self.content(x).view(B, heads, -1, T) - result = torch.einsum("bhts,bhct->bhcs", weights, content) - - if self.nfreqs: - time_sig = torch.einsum("bhts,fts->bhfs", weights, freq_kernel) - result = torch.cat([result, time_sig], 2) - - result = result.reshape(B, -1, T) - return x + self.proj(result) - -class Demucs(nn.Module): - @capture_init - def __init__(self, sources, audio_channels=2, channels=64, growth=2.0, depth=6, rewrite=True, lstm_layers=0, kernel_size=8, stride=4, context=1, gelu=True, glu=True, norm_starts=4, norm_groups=4, dconv_mode=1, dconv_depth=2, dconv_comp=4, dconv_attn=4, dconv_lstm=4, dconv_init=1e-4, normalize=True, resample=True, rescale=0.1, samplerate=44100, segment=4 * 10): - super().__init__() - self.audio_channels = audio_channels - self.sources = sources - self.kernel_size = kernel_size - self.context = context - self.stride = stride - self.depth = depth - self.resample = resample - self.channels = channels - self.normalize = normalize - self.samplerate = samplerate - self.segment = segment - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - self.skip_scales = nn.ModuleList() - - if glu: - activation = nn.GLU(dim=1) - ch_scale = 2 - else: - activation = nn.ReLU() - ch_scale = 1 - - act2 = nn.GELU if gelu else nn.ReLU - - in_channels = audio_channels - padding = 0 - - for index in range(depth): - norm_fn = lambda d: nn.Identity() - if index >= norm_starts: norm_fn = lambda d: nn.GroupNorm(norm_groups, d) - - encode = [] - encode += [nn.Conv1d(in_channels, channels, 
kernel_size, stride), norm_fn(channels), act2()] - attn = index >= dconv_attn - lstm = index >= dconv_lstm - - if dconv_mode & 1: encode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)] - if rewrite: encode += [nn.Conv1d(channels, ch_scale * channels, 1), norm_fn(ch_scale * channels), activation] - self.encoder.append(nn.Sequential(*encode)) - - decode = [] - out_channels = in_channels if index > 0 else len(self.sources) * audio_channels - - if rewrite: decode += [nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context), norm_fn(ch_scale * channels), activation] - if dconv_mode & 2: decode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)] - decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride, padding=padding)] - - if index > 0: decode += [norm_fn(out_channels), act2()] - self.decoder.insert(0, nn.Sequential(*decode)) - in_channels = channels - channels = int(growth * channels) - - channels = in_channels - self.lstm = BLSTM(channels, lstm_layers) if lstm_layers else None - if rescale: rescale_module(self, reference=rescale) - - def valid_length(self, length): - if self.resample: length *= 2 - - for _ in range(self.depth): - length = math.ceil((length - self.kernel_size) / self.stride) + 1 - length = max(1, length) - - for _ in range(self.depth): - length = (length - 1) * self.stride + self.kernel_size - - if self.resample: length = math.ceil(length / 2) - return int(length) - - def forward(self, mix): - x = mix - length = x.shape[-1] - - if self.normalize: - mono = mix.mean(dim=1, keepdim=True) - mean = mono.mean(dim=-1, keepdim=True) - std = mono.std(dim=-1, keepdim=True) - x = (x - mean) / (1e-5 + std) - else: - mean = 0 - std = 1 - - delta = self.valid_length(length) - length - x = F.pad(x, (delta // 2, delta - delta // 2)) - - if self.resample: x = resample_frac(x, 1, 2) - saved = [] - - for encode in self.encoder: - x = 
encode(x) - saved.append(x) - - if self.lstm: x = self.lstm(x) - - for decode in self.decoder: - skip = saved.pop(-1) - skip = center_trim(skip, x) - x = decode(x + skip) - - if self.resample: x = resample_frac(x, 2, 1) - - x = x * std + mean - x = center_trim(x, length) - x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1)) - return x - - def load_state_dict(self, state, strict=True): - for idx in range(self.depth): - for a in ["encoder", "decoder"]: - for b in ["bias", "weight"]: - new = f"{a}.{idx}.3.{b}" - old = f"{a}.{idx}.2.{b}" - - if old in state and new not in state: state[new] = state.pop(old) - super().load_state_dict(state, strict=strict) - -class ResampleFrac(torch.nn.Module): - def __init__(self, old_sr, new_sr, zeros = 24, rolloff = 0.945): - super().__init__() - gcd = math.gcd(old_sr, new_sr) - self.old_sr = old_sr // gcd - self.new_sr = new_sr // gcd - self.zeros = zeros - self.rolloff = rolloff - self._init_kernels() - - def _init_kernels(self): - if self.old_sr == self.new_sr: return - - kernels = [] - sr = min(self.new_sr, self.old_sr) - sr *= self.rolloff - - self._width = math.ceil(self.zeros * self.old_sr / sr) - idx = torch.arange(-self._width, self._width + self.old_sr).float() - - for i in range(self.new_sr): - t = ((-i / self.new_sr + idx / self.old_sr) * sr).clamp_(-self.zeros, self.zeros) - t *= math.pi - - kernel = sinc(t) * ((t / self.zeros / 2).cos()**2) - kernel.div_(kernel.sum()) - kernels.append(kernel) - - self.register_buffer("kernel", torch.stack(kernels).view(self.new_sr, 1, -1)) - - def forward(self, x, output_length = None, full = False): - if self.old_sr == self.new_sr: return x - shape = x.shape - length = x.shape[-1] - - x = x.reshape(-1, length) - y = F.conv1d(F.pad(x[:, None], (self._width, self._width + self.old_sr), mode='replicate'), self.kernel, stride=self.old_sr).transpose(1, 2).reshape(list(shape[:-1]) + [-1]) - - float_output_length = torch.as_tensor(self.new_sr * length / self.old_sr) - 
def sinc(x):
    """Return ``sin(x) / x`` element-wise, with the value 1 where ``x == 0``."""
    one = torch.tensor(1., device=x.device, dtype=x.dtype)
    return torch.where(x == 0, one, x.sin() / x)


def simple_repr(obj, attrs = None, overrides = None):
    """Build a ``ClassName(attr=value,...)`` string for ``obj``.

    Args:
        obj: Object to describe.
        attrs: Attribute names to consider; defaults to the parameter names
            of ``obj.__class__``'s signature.
        overrides: Optional mapping of attribute name to the value to show
            instead of the attribute's current value.

    An attribute that is neither overridden nor present on ``obj`` is
    skipped. One that is a constructor parameter is shown only when it has no
    default or differs from it; any other attribute is always shown.
    """
    # None instead of a shared mutable ``{}`` default.
    overrides = {} if overrides is None else overrides
    params = inspect.signature(obj.__class__).parameters

    if attrs is None:
        attrs = list(params.keys())

    attrs_repr = []
    for attr in attrs:
        if attr in overrides:
            value = overrides[attr]
        elif hasattr(obj, attr):
            value = getattr(obj, attr)
        else:
            continue

        param = params.get(attr)
        # inspect.Parameter.empty is the public name of inspect._empty.
        if param is None or param.default is inspect.Parameter.empty or value != param.default:
            attrs_repr.append(f"{attr}={value}")

    return f"{obj.__class__.__name__}({','.join(attrs_repr)})"


def resample_frac(x, old_sr, new_sr, zeros = 24, rolloff = 0.945, output_length = None, full = False):
    """Resample ``x`` from ``old_sr`` to ``new_sr`` with a one-off ``ResampleFrac``."""
    resampler = ResampleFrac(old_sr, new_sr, zeros, rolloff).to(x)
    return resampler(x, output_length, full)
capture_init -from main.library.uvr5_lib.demucs.demucs import DConv, rescale_module - -def spectro(x, n_fft=512, hop_length=None, pad=0): - *other, length = x.shape - x = x.reshape(-1, length) - device_type = x.device.type - is_other_gpu = not device_type in ["cuda", "cpu"] - if is_other_gpu: x = x.cpu() - z = torch.stft(x, n_fft * (1 + pad), hop_length or n_fft // 4, window=torch.hann_window(n_fft).to(x), win_length=n_fft, normalized=True, center=True, return_complex=True, pad_mode="reflect") - _, freqs, frame = z.shape - return z.view(*other, freqs, frame) - -def ispectro(z, hop_length=None, length=None, pad=0): - *other, freqs, frames = z.shape - n_fft = 2 * freqs - 2 - z = z.view(-1, freqs, frames) - win_length = n_fft // (1 + pad) - device_type = z.device.type - is_other_gpu = not device_type in ["cuda", "cpu"] - if is_other_gpu: z = z.cpu() - x = torch.istft(z, n_fft, hop_length, window=torch.hann_window(win_length).to(z.real), win_length=win_length, normalized=True, length=length, center=True) - _, length = x.shape - return x.view(*other, length) - -def atan2(y, x): - pi = 2 * torch.asin(torch.tensor(1.0)) - x += ((x == 0) & (y == 0)) * 1.0 - out = torch.atan(y / x) - out += ((y >= 0) & (x < 0)) * pi - out -= ((y < 0) & (x < 0)) * pi - out *= 1 - ((y > 0) & (x == 0)) * 1.0 - out += ((y > 0) & (x == 0)) * (pi / 2) - out *= 1 - ((y < 0) & (x == 0)) * 1.0 - out += ((y < 0) & (x == 0)) * (-pi / 2) - return out - -def _norm(x): - return x[..., 0].abs() ** 2 + x[..., 1].abs() ** 2 - -def _mul_add(a, b, out = None): - target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)]) - if out is None or out.shape != target_shape: out = torch.zeros(target_shape, dtype=a.dtype, device=a.device) - - if out is a: - real_a = a[..., 0] - out[..., 0] = out[..., 0] + (real_a * b[..., 0] - a[..., 1] * b[..., 1]) - out[..., 1] = out[..., 1] + (real_a * b[..., 1] + a[..., 1] * b[..., 0]) - else: - out[..., 0] = out[..., 0] + (a[..., 0] * b[..., 0] - a[..., 1] * 
b[..., 1]) - out[..., 1] = out[..., 1] + (a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0]) - - return out - -def _mul(a, b, out = None): - target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)]) - if out is None or out.shape != target_shape: out = torch.zeros(target_shape, dtype=a.dtype, device=a.device) - - if out is a: - real_a = a[..., 0] - out[..., 0] = real_a * b[..., 0] - a[..., 1] * b[..., 1] - out[..., 1] = real_a * b[..., 1] + a[..., 1] * b[..., 0] - else: - out[..., 0] = a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1] - out[..., 1] = a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0] - - return out - -def _inv(z, out = None): - ez = _norm(z) - if out is None or out.shape != z.shape: out = torch.zeros_like(z) - - out[..., 0] = z[..., 0] / ez - out[..., 1] = -z[..., 1] / ez - - return out - -def _conj(z, out = None): - if out is None or out.shape != z.shape: out = torch.zeros_like(z) - - out[..., 0] = z[..., 0] - out[..., 1] = -z[..., 1] - - return out - -def _invert(M, out = None): - nb_channels = M.shape[-2] - if out is None or out.shape != M.shape: out = torch.empty_like(M) - - if nb_channels == 1: out = _inv(M, out) - elif nb_channels == 2: - det = _mul(M[..., 0, 0, :], M[..., 1, 1, :]) - det = det - _mul(M[..., 0, 1, :], M[..., 1, 0, :]) - invDet = _inv(det) - out[..., 0, 0, :] = _mul(invDet, M[..., 1, 1, :], out[..., 0, 0, :]) - out[..., 1, 0, :] = _mul(-invDet, M[..., 1, 0, :], out[..., 1, 0, :]) - out[..., 0, 1, :] = _mul(-invDet, M[..., 0, 1, :], out[..., 0, 1, :]) - out[..., 1, 1, :] = _mul(invDet, M[..., 0, 0, :], out[..., 1, 1, :]) - else: raise Exception("Torch == 2 Channels") - return out - -def expectation_maximization(y, x, iterations = 2, eps = 1e-10, batch_size = 200): - (nb_frames, nb_bins, nb_channels) = x.shape[:-1] - nb_sources = y.shape[-1] - regularization = torch.cat((torch.eye(nb_channels, dtype=x.dtype, device=x.device)[..., None], torch.zeros((nb_channels, nb_channels, 1), dtype=x.dtype, device=x.device)), dim=2) - 
regularization = (torch.as_tensor(eps)).sqrt() * (regularization[None, None, ...].expand((-1, nb_bins, -1, -1, -1))) - R = [torch.zeros((nb_bins, nb_channels, nb_channels, 2), dtype=x.dtype, device=x.device) for j in range(nb_sources)] - weight = torch.zeros((nb_bins,), dtype=x.dtype, device=x.device) - v = torch.zeros((nb_frames, nb_bins, nb_sources), dtype=x.dtype, device=x.device) - - for _ in range(iterations): - v = torch.mean(y[..., 0, :].abs() ** 2 + y[..., 1, :].abs() ** 2, dim=-2) - for j in range(nb_sources): - R[j] = torch.tensor(0.0, device=x.device) - - weight = torch.tensor(eps, device=x.device) - pos = 0 - batch_size = batch_size if batch_size else nb_frames - - while pos < nb_frames: - t = torch.arange(pos, min(nb_frames, pos + batch_size)) - pos = int(t[-1]) + 1 - - R[j] = R[j] + _covariance(y[t, ..., j]).sum(dim=0) - weight = weight + v[t, ..., j].sum(dim=0) - - R[j] = R[j] / weight[..., None, None, None] - weight = torch.zeros_like(weight) - - if y.requires_grad: y = y.clone() - - pos = 0 - - while pos < nb_frames: - t = torch.arange(pos, min(nb_frames, pos + batch_size)) - pos = int(t[-1]) + 1 - - y[t, ...] 
= torch.tensor(0.0, device=x.device, dtype=x.dtype) - - Cxx = regularization - - for j in range(nb_sources): - Cxx = Cxx + (v[t, ..., j, None, None, None] * R[j][None, ...].clone()) - - inv_Cxx = _invert(Cxx) - - for j in range(nb_sources): - gain = torch.zeros_like(inv_Cxx) - indices = torch.cartesian_prod(torch.arange(nb_channels), torch.arange(nb_channels), torch.arange(nb_channels)) - - for index in indices: - gain[:, :, index[0], index[1], :] = _mul_add(R[j][None, :, index[0], index[2], :].clone(), inv_Cxx[:, :, index[2], index[1], :], gain[:, :, index[0], index[1], :]) - - gain = gain * v[t, ..., None, None, None, j] - - for i in range(nb_channels): - y[t, ..., j] = _mul_add(gain[..., i, :], x[t, ..., i, None, :], y[t, ..., j]) - - return y, v, R - -def wiener(targets_spectrograms, mix_stft, iterations = 1, softmask = False, residual = False, scale_factor = 10.0, eps = 1e-10): - if softmask: y = mix_stft[..., None] * (targets_spectrograms / (eps + targets_spectrograms.sum(dim=-1, keepdim=True).to(mix_stft.dtype)))[..., None, :] - else: - angle = atan2(mix_stft[..., 1], mix_stft[..., 0])[..., None] - nb_sources = targets_spectrograms.shape[-1] - y = torch.zeros(mix_stft.shape + (nb_sources,), dtype=mix_stft.dtype, device=mix_stft.device) - y[..., 0, :] = targets_spectrograms * angle.cos() - y[..., 1, :] = targets_spectrograms * angle.sin() - - if residual: y = torch.cat([y, mix_stft[..., None] - y.sum(dim=-1, keepdim=True)], dim=-1) - if iterations == 0: return y - - max_abs = torch.as_tensor(1.0, dtype=mix_stft.dtype, device=mix_stft.device).max(_norm(mix_stft).sqrt().max() / scale_factor) - mix_stft = mix_stft / max_abs - y = y / max_abs - y = expectation_maximization(y, mix_stft, iterations, eps=eps)[0] - y = y * max_abs - - return y - -def _covariance(y_j): - (nb_frames, nb_bins, nb_channels) = y_j.shape[:-1] - - Cj = torch.zeros((nb_frames, nb_bins, nb_channels, nb_channels, 2), dtype=y_j.dtype, device=y_j.device) - indices = 
def pad1d(x, paddings, mode = "constant", value = 0.0):
    """Pad the last dimension of ``x`` by ``paddings = (left, right)``.

    Behaves like ``F.pad``, except that in "reflect" mode an input no longer
    than the larger padding is first extended with zeros so that the
    reflection is possible. The output is asserted to have the expected
    length and to contain the original signal unchanged.
    """
    original = x
    length = x.shape[-1]
    left, right = paddings

    if mode == "reflect":
        largest = max(left, right)

        if length <= largest:
            # Reflection needs more samples than the pad width: add the
            # shortfall as zeros, on the right first, and reflect the rest.
            shortfall = largest - length + 1
            pre_right = min(right, shortfall)
            pre_left = shortfall - pre_right
            x = F.pad(x, (pre_left, pre_right))
            paddings = (left - pre_left, right - pre_right)

    out = F.pad(x, paddings, mode, value)

    assert out.shape[-1] == length + left + right
    assert (out[..., left : left + length] == original).all()
    return out
= empty - self.norm = norm - self.pad = pad - - if freq: - kernel_size = [kernel_size, 1] - stride = [stride, 1] - pad = [pad, 0] - klass = nn.Conv2d - - self.conv = klass(chin, chout, kernel_size, stride, pad) - if self.empty: return - - self.norm1 = norm_fn(chout) - self.rewrite = None - - if rewrite: - self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context) - self.norm2 = norm_fn(2 * chout) - - self.dconv = None - if dconv: self.dconv = DConv(chout, **dconv_kw) - - def forward(self, x, inject=None): - if not self.freq and x.dim() == 4: - B, C, Fr, T = x.shape - x = x.view(B, -1, T) - - if not self.freq: - le = x.shape[-1] - if not le % self.stride == 0: x = F.pad(x, (0, self.stride - (le % self.stride))) - - y = self.conv(x) - if self.empty: return y - - if inject is not None: - assert inject.shape[-1] == y.shape[-1], (inject.shape, y.shape) - - if inject.dim() == 3 and y.dim() == 4: inject = inject[:, :, None] - y = y + inject - - y = F.gelu(self.norm1(y)) - - if self.dconv: - if self.freq: - B, C, Fr, T = y.shape - y = y.permute(0, 2, 1, 3).reshape(-1, C, T) - - y = self.dconv(y) - if self.freq: y = y.view(B, Fr, C, T).permute(0, 2, 1, 3) - - if self.rewrite: - z = self.norm2(self.rewrite(y)) - z = F.glu(z, dim=1) - else: z = y - - return z - -class MultiWrap(nn.Module): - def __init__(self, layer, split_ratios): - super().__init__() - self.split_ratios = split_ratios - self.layers = nn.ModuleList() - self.conv = isinstance(layer, HEncLayer) - assert not layer.norm - assert layer.freq - assert layer.pad - - if not self.conv: assert not layer.context_freq - - for _ in range(len(split_ratios) + 1): - lay = deepcopy(layer) - - if self.conv: lay.conv.padding = (0, 0) - else: lay.pad = False - - for m in lay.modules(): - if hasattr(m, "reset_parameters"): m.reset_parameters() - - self.layers.append(lay) - - def forward(self, x, skip=None, length=None): - B, C, Fr, T = x.shape - ratios = list(self.split_ratios) + [1] - start = 0 - outs = [] - - for 
class HDecLayer(nn.Module):
    """Decoder layer built around one transposed convolution.

    Unless ``empty`` is set, the input is summed with ``skip``, optionally
    passed through a "rewrite" convolution gated with GLU and an optional
    ``DConv`` module, and then upsampled by ``conv_tr``.

    Args:
        chin: Input channels.
        chout: Output channels of the transposed convolution.
        last: If true, no GELU is applied to the output.
        kernel_size: Kernel size of the transposed convolution.
        stride: Stride of the transposed convolution.
        norm_groups: Number of groups for ``nn.GroupNorm`` when ``norm`` is set.
        empty: If true, only ``conv_tr`` and ``norm2`` are built and applied.
        freq: If true, use 2-D (transposed) convolutions with kernel
            ``[kernel_size, 1]``; otherwise 1-D ones.
        dconv: If true, add a ``DConv`` module before the transposed convolution.
        norm: If true, normalize with ``nn.GroupNorm``; otherwise ``nn.Identity``.
        context: Context size of the rewrite convolution (kernel
            ``1 + 2 * context``, padding ``context``).
        dconv_kw: Keyword arguments forwarded to ``DConv``. ``None`` (the
            default) means no extra arguments.
        pad: If true, ``kernel_size // 4`` samples are trimmed from the
            transposed-convolution output (see ``forward``).
        context_freq: If true, the rewrite kernel is ``1 + 2 * context`` on
            every axis; otherwise it is ``[1, 1 + 2 * context]``.
        rewrite: If true, add the rewrite convolution followed by GLU.
    """

    def __init__(self, chin, chout, last=False, kernel_size=8, stride=4, norm_groups=1, empty=False, freq=True, dconv=True, norm=True, context=1, dconv_kw=None, pad=True, context_freq=True, rewrite=True):
        super().__init__()
        # Use a fresh dict per call instead of a shared mutable ``{}`` default.
        dconv_kw = {} if dconv_kw is None else dconv_kw

        def norm_fn(num_channels):
            if norm:
                return nn.GroupNorm(norm_groups, num_channels)
            return nn.Identity()

        pad = kernel_size // 4 if pad else 0

        # Scalar settings are stored before they are turned into 2-D lists below.
        self.pad = pad
        self.last = last
        self.freq = freq
        self.chin = chin
        self.empty = empty
        self.stride = stride
        self.kernel_size = kernel_size
        self.norm = norm
        self.context_freq = context_freq

        klass = nn.Conv1d
        klass_tr = nn.ConvTranspose1d
        if freq:
            kernel_size = [kernel_size, 1]
            stride = [stride, 1]
            klass = nn.Conv2d
            klass_tr = nn.ConvTranspose2d

        self.conv_tr = klass_tr(chin, chout, kernel_size, stride)
        self.norm2 = norm_fn(chout)

        if self.empty:
            # An empty layer only upsamples; no rewrite/DConv sub-module exists.
            return

        self.rewrite = None
        if rewrite:
            if context_freq:
                self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context)
            else:
                self.rewrite = klass(chin, 2 * chin, [1, 1 + 2 * context], 1, [0, context])

            self.norm1 = norm_fn(2 * chin)

        self.dconv = None
        if dconv:
            self.dconv = DConv(chin, **dconv_kw)

    def forward(self, x, skip, length):
        """Run the layer.

        Args:
            x: Input tensor. When ``freq`` is set, a 3-D input is reshaped to
                4-D with ``chin`` channels.
            skip: Tensor added to ``x``; must be ``None`` for an ``empty`` layer.
            length: Output length along the last axis; only used when ``freq``
                is false.

        Returns:
            A tuple ``(z, y)``: ``z`` is the layer output and ``y`` is the
            tensor that was fed to the transposed convolution.
        """
        if self.freq and x.dim() == 3:
            batch, _, frames = x.shape
            x = x.view(batch, self.chin, -1, frames)

        if self.empty:
            assert skip is None
            y = x
        else:
            x = x + skip

            if self.rewrite is not None:
                # The rewrite conv doubles the channels; GLU halves them again.
                y = F.glu(self.norm1(self.rewrite(x)), dim=1)
            else:
                y = x

            if self.dconv is not None:
                if self.freq:
                    # Fold the third axis into the batch so DConv sees a 3-D tensor.
                    batch, chans, n_freq, frames = y.shape
                    y = y.permute(0, 2, 1, 3).reshape(-1, chans, frames)

                y = self.dconv(y)

                if self.freq:
                    y = y.view(batch, n_freq, chans, frames).permute(0, 2, 1, 3)

        z = self.norm2(self.conv_tr(y))

        if self.freq:
            if self.pad:
                # Trim ``pad`` entries from both ends of the second-to-last axis.
                z = z[..., self.pad : -self.pad, :]
        else:
            z = z[..., self.pad : self.pad + length]
            assert z.shape[-1] == length, (z.shape[-1], length)

        if not self.last:
            z = F.gelu(z)
        return z, y
kernel_size - self.context = context - self.stride = stride - self.depth = depth - self.channels = channels - self.samplerate = samplerate - self.segment = segment - self.nfft = nfft - self.hop_length = nfft // 4 - self.wiener_iters = wiener_iters - self.end_iters = end_iters - self.freq_emb = None - self.hybrid = hybrid - self.hybrid_old = hybrid_old - if hybrid_old: assert hybrid - if hybrid: assert wiener_iters == end_iters - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - - if hybrid: - self.tencoder = nn.ModuleList() - self.tdecoder = nn.ModuleList() - - chin = audio_channels - chin_z = chin - - if self.cac: chin_z *= 2 - - chout = channels_time or channels - chout_z = channels - freqs = nfft // 2 - - for index in range(depth): - lstm = index >= dconv_lstm - attn = index >= dconv_attn - norm = index >= norm_starts - freq = freqs > 1 - stri = stride - ker = kernel_size - - if not freq: - assert freqs == 1 - - ker = time_stride * 2 - stri = time_stride - - pad = True - last_freq = False - - if freq and freqs <= kernel_size: - ker = freqs - pad = False - last_freq = True - - kw = { - "kernel_size": ker, - "stride": stri, - "freq": freq, - "pad": pad, - "norm": norm, - "rewrite": rewrite, - "norm_groups": norm_groups, - "dconv_kw": {"lstm": lstm, "attn": attn, "depth": dconv_depth, "compress": dconv_comp, "init": dconv_init, "gelu": True}, - } - - kwt = dict(kw) - kwt["freq"] = 0 - kwt["kernel_size"] = kernel_size - kwt["stride"] = stride - kwt["pad"] = True - kw_dec = dict(kw) - - multi = False - - if multi_freqs and index < multi_freqs_depth: - multi = True - kw_dec["context_freq"] = False - - if last_freq: - chout_z = max(chout, chout_z) - chout = chout_z - - enc = HEncLayer(chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw) - if hybrid and freq: - tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc, empty=last_freq, **kwt) - self.tencoder.append(tenc) - - if multi: enc = MultiWrap(enc, multi_freqs) - - 
self.encoder.append(enc) - if index == 0: - chin = self.audio_channels * len(self.sources) - chin_z = chin - - if self.cac: chin_z *= 2 - - dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2, last=index == 0, context=context, **kw_dec) - if multi: dec = MultiWrap(dec, multi_freqs) - - if hybrid and freq: - tdec = HDecLayer(chout, chin, dconv=dconv_mode & 2, empty=last_freq, last=index == 0, context=context, **kwt) - self.tdecoder.insert(0, tdec) - - self.decoder.insert(0, dec) - chin = chout - chin_z = chout_z - chout = int(growth * chout) - chout_z = int(growth * chout_z) - - if freq: - if freqs <= kernel_size: freqs = 1 - else: freqs //= stride - - if index == 0 and freq_emb: - self.freq_emb = ScaledEmbedding(freqs, chin_z, smooth=emb_smooth, scale=emb_scale) - self.freq_emb_scale = freq_emb - - if rescale: rescale_module(self, reference=rescale) - - def _spec(self, x): - hl = self.hop_length - nfft = self.nfft - - if self.hybrid: - assert hl == nfft // 4 - le = int(math.ceil(x.shape[-1] / hl)) - pad = hl // 2 * 3 - x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect") if not self.hybrid_old else pad1d(x, (pad, pad + le * hl - x.shape[-1])) - - z = spectro(x, nfft, hl)[..., :-1, :] - if self.hybrid: - assert z.shape[-1] == le + 4, (z.shape, x.shape, le) - z = z[..., 2 : 2 + le] - - return z - - def _ispec(self, z, length=None, scale=0): - hl = self.hop_length // (4**scale) - z = F.pad(z, (0, 0, 0, 1)) - - if self.hybrid: - z = F.pad(z, (2, 2)) - pad = hl // 2 * 3 - le = hl * int(math.ceil(length / hl)) + 2 * pad if not self.hybrid_old else hl * int(math.ceil(length / hl)) - x = ispectro(z, hl, length=le) - x = x[..., pad : pad + length] if not self.hybrid_old else x[..., :length] - else: x = ispectro(z, hl, length) - - return x - - def _magnitude(self, z): - if self.cac: - B, C, Fr, T = z.shape - m = torch.view_as_real(z).permute(0, 1, 4, 2, 3) - m = m.reshape(B, C * 2, Fr, T) - else: m = z.abs() - - return m - - def _mask(self, z, m): - niters = 
self.wiener_iters - if self.cac: - B, S, C, Fr, T = m.shape - out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3) - out = torch.view_as_complex(out.contiguous()) - return out - - if self.training: niters = self.end_iters - - if niters < 0: - z = z[:, None] - return z / (1e-8 + z.abs()) * m - else: return self._wiener(m, z, niters) - - def _wiener(self, mag_out, mix_stft, niters): - init = mix_stft.dtype - wiener_win_len = 300 - residual = self.wiener_residual - B, S, C, Fq, T = mag_out.shape - mag_out = mag_out.permute(0, 4, 3, 2, 1) - mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1)) - outs = [] - - for sample in range(B): - pos = 0 - out = [] - - for pos in range(0, T, wiener_win_len): - frame = slice(pos, pos + wiener_win_len) - z_out = wiener(mag_out[sample, frame], mix_stft[sample, frame], niters, residual=residual) - out.append(z_out.transpose(-1, -2)) - - outs.append(torch.cat(out, dim=0)) - - out = torch.view_as_complex(torch.stack(outs, 0)) - out = out.permute(0, 4, 3, 2, 1).contiguous() - - if residual: out = out[:, :-1] - assert list(out.shape) == [B, S, C, Fq, T] - return out.to(init) - - def forward(self, mix): - x = mix - length = x.shape[-1] - z = self._spec(mix) - mag = self._magnitude(z).to(mix.device) - x = mag - B, C, Fq, T = x.shape - mean = x.mean(dim=(1, 2, 3), keepdim=True) - std = x.std(dim=(1, 2, 3), keepdim=True) - x = (x - mean) / (1e-5 + std) - - if self.hybrid: - xt = mix - meant = xt.mean(dim=(1, 2), keepdim=True) - stdt = xt.std(dim=(1, 2), keepdim=True) - xt = (xt - meant) / (1e-5 + stdt) - - saved, saved_t, lengths, lengths_t = [], [], [], [] - - for idx, encode in enumerate(self.encoder): - lengths.append(x.shape[-1]) - inject = None - - if self.hybrid and idx < len(self.tencoder): - lengths_t.append(xt.shape[-1]) - tenc = self.tencoder[idx] - xt = tenc(xt) - - if not tenc.empty: saved_t.append(xt) - else: inject = xt - - x = encode(x, inject) - - if idx == 0 and self.freq_emb is not None: - frs = 
torch.arange(x.shape[-2], device=x.device) - emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x) - x = x + self.freq_emb_scale * emb - - saved.append(x) - - x = torch.zeros_like(x) - if self.hybrid: xt = torch.zeros_like(x) - - for idx, decode in enumerate(self.decoder): - skip = saved.pop(-1) - x, pre = decode(x, skip, lengths.pop(-1)) - - if self.hybrid: offset = self.depth - len(self.tdecoder) - - if self.hybrid and idx >= offset: - tdec = self.tdecoder[idx - offset] - length_t = lengths_t.pop(-1) - - if tdec.empty: - assert pre.shape[2] == 1, pre.shape - - pre = pre[:, :, 0] - xt, _ = tdec(pre, None, length_t) - else: - skip = saved_t.pop(-1) - xt, _ = tdec(xt, skip, length_t) - - assert len(saved) == 0 - assert len(lengths_t) == 0 - assert len(saved_t) == 0 - - S = len(self.sources) - x = x.view(B, S, -1, Fq, T) - x = x * std[:, None] + mean[:, None] - device_type = x.device.type - device_load = f"{device_type}:{x.device.index}" if not device_type == "mps" else device_type - x_is_other_gpu = not device_type in ["cuda", "cpu"] - if x_is_other_gpu: x = x.cpu() - zout = self._mask(z, x) - x = self._ispec(zout, length) - if x_is_other_gpu: x = x.to(device_load) - - if self.hybrid: - xt = xt.view(B, S, -1, length) - xt = xt * stdt[:, None] + meant[:, None] - x = xt + x - - return x \ No newline at end of file diff --git a/main/library/uvr5_lib/demucs/htdemucs.py b/main/library/uvr5_lib/demucs/htdemucs.py deleted file mode 100644 index db0249bbf7a2ab5311b567092f50927a9c4a625d..0000000000000000000000000000000000000000 --- a/main/library/uvr5_lib/demucs/htdemucs.py +++ /dev/null @@ -1,598 +0,0 @@ -import os -import sys -import math -import torch -import random - -import numpy as np - -from torch import nn -from einops import rearrange -from fractions import Fraction -from torch.nn import functional as F - -sys.path.append(os.getcwd()) - -from main.app.variables import translations -from main.library.uvr5_lib.demucs.states import capture_init -from 
def create_sin_embedding(length, dim, shift=0, device="cpu", max_period=10000):
    """Build a sinusoidal positional embedding of shape ``(length, 1, dim)``.

    The first ``dim // 2`` features are cosines and the last ``dim // 2`` are
    sines of ``position / max_period ** (i / (dim // 2 - 1))``.

    Args:
        length: Number of positions.
        dim: Embedding size; must be even.
        shift: Offset added to every position index.
        device: Device on which the tensors are created.
        max_period: Base of the geometric progression of periods.

    Returns:
        A tensor of shape ``(length, 1, dim)``.
    """
    assert dim % 2 == 0
    half_dim = dim // 2
    pos = shift + torch.arange(length, device=device).view(-1, 1, 1)
    adim = torch.arange(half_dim, device=device).view(1, 1, -1)

    # With dim == 2 the divisor ``half_dim - 1`` is 0 and the exponent would be
    # 0 / 0 = NaN, poisoning the whole embedding. Clamping to 1 gives the single
    # frequency an exponent of 0 (period 1); larger dims are unaffected.
    denom = max(half_dim - 1, 1)

    if str(device).startswith("ocl"):
        # NOTE(review): "ocl" devices get explicit float32 tensors for the
        # division, presumably a backend workaround; confirm before changing.
        exponent = adim.to(torch.float32) / torch.tensor(denom, dtype=torch.float32, device=device)
    else:
        exponent = adim / denom

    phase = pos / (max_period ** exponent)
    return torch.cat([phase.cos(), phase.sin()], dim=-1)
class MyTransformerEncoderLayer(nn.TransformerEncoderLayer):
    """``nn.TransformerEncoderLayer`` with optional group norm, output norm and layer scale.

    Args:
        d_model, nhead, dim_feedforward, dropout, activation, layer_norm_eps,
        batch_first, norm_first, device, dtype: Forwarded to
            ``nn.TransformerEncoderLayer``.
        group_norm: If truthy, ``norm1``/``norm2`` are replaced by
            ``MyGroupNorm`` with ``int(group_norm)`` groups.
        norm_out: If truthy and ``norm_first`` is set, a ``MyGroupNorm`` with
            ``int(norm_out)`` groups is applied to the layer output.
        layer_scale: If true, both residual branches are scaled by a
            ``LayerScale`` initialised to ``init_values``.
        init_values: Initial value for ``LayerScale``.
        auto_sparsity: Stored on the instance; not otherwise used here.
        sparse, mask_type, mask_random_seed, sparse_attn_window, global_window,
        sparsity: Accepted for signature compatibility; unused by this class.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu, group_norm=0, norm_first=False, norm_out=False, layer_norm_eps=1e-5, layer_scale=False, init_values=1e-4, device=None, dtype=None, sparse=False, mask_type="diag", mask_random_seed=42, sparse_attn_window=500, global_window=50, auto_sparsity=False, sparsity=0.95, batch_first=False):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            layer_norm_eps=layer_norm_eps,
            batch_first=batch_first,
            norm_first=norm_first,
            device=device,
            dtype=dtype,
        )
        self.auto_sparsity = auto_sparsity

        if group_norm:
            self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)

        self.norm_out = None
        # Logical ``and``, not bitwise ``&``: ``norm_out`` doubles as the group
        # count, and ``True & 2`` is 0, which silently disabled the output norm
        # for even group counts.
        if self.norm_first and norm_out:
            self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model, **factory_kwargs)

        self.gamma_1 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
        self.gamma_2 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Apply self-attention and feed-forward blocks with scaled residuals.

        Args:
            src: Input sequence.
            src_mask: Optional attention mask passed to the self-attention block.
            src_key_padding_mask: Optional key padding mask passed to the
                self-attention block.

        Returns:
            A tensor with the same shape as ``src``.
        """
        x = src

        if self.norm_first:
            # Pre-norm: normalize inside each residual branch.
            x = x + self.gamma_1(self._sa_block(self.norm1(x), src_mask, src_key_padding_mask))
            x = x + self.gamma_2(self._ff_block(self.norm2(x)))
            if self.norm_out is not None:
                x = self.norm_out(x)
        else:
            # Post-norm: normalize after each residual sum.
            x = self.norm1(x + self.gamma_1(self._sa_block(x, src_mask, src_key_padding_mask)))
            x = self.norm2(x + self.gamma_2(self._ff_block(x)))

        return x
emb == "cape": - self.cape_mean_normalize = cape_mean_normalize - self.cape_augment = cape_augment - self.cape_glob_loc_scale = cape_glob_loc_scale - - if emb == "scaled": self.position_embeddings = ScaledEmbedding(max_positions, dim, scale=0.2) - - self.lr = lr - activation = F.gelu if gelu else F.relu - - if norm_in: - self.norm_in = nn.LayerNorm(dim) - self.norm_in_t = nn.LayerNorm(dim) - elif norm_in_group: - self.norm_in = MyGroupNorm(int(norm_in_group), dim) - self.norm_in_t = MyGroupNorm(int(norm_in_group), dim) - else: - self.norm_in = nn.Identity() - self.norm_in_t = nn.Identity() - - self.layers = nn.ModuleList() - self.layers_t = nn.ModuleList() - - kwargs_common = { - "d_model": dim, - "nhead": num_heads, - "dim_feedforward": hidden_dim, - "dropout": dropout, - "activation": activation, - "group_norm": group_norm, - "norm_first": norm_first, - "norm_out": norm_out, - "layer_scale": layer_scale, - "mask_type": mask_type, - "mask_random_seed": mask_random_seed, - "sparse_attn_window": sparse_attn_window, - "global_window": global_window, - "sparsity": sparsity, - "auto_sparsity": auto_sparsity, - "batch_first": True, - } - - kwargs_classic_encoder = dict(kwargs_common) - kwargs_classic_encoder.update({"sparse": sparse_self_attn}) - kwargs_cross_encoder = dict(kwargs_common) - kwargs_cross_encoder.update({"sparse": sparse_cross_attn}) - - for idx in range(num_layers): - if idx % 2 == self.classic_parity: - self.layers.append(MyTransformerEncoderLayer(**kwargs_classic_encoder)) - self.layers_t.append(MyTransformerEncoderLayer(**kwargs_classic_encoder)) - else: - self.layers.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder)) - self.layers_t.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder)) - - def forward(self, x, xt): - B, C, Fr, T1 = x.shape - - pos_emb_2d = create_2d_sin_embedding(C, Fr, T1, x.device, self.max_period) - pos_emb_2d = rearrange(pos_emb_2d, "b c fr t1 -> b (t1 fr) c") - - x = rearrange(x, "b c fr t1 -> b (t1 fr) c") - x 
class CrossTransformerEncoderLayer(nn.Module):
    """Transformer layer in which ``q`` attends to a second sequence ``k``.

    Args:
        d_model: Feature size of both sequences.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward block.
        dropout: Dropout probability used in attention and feed-forward blocks.
        activation: Callable, or one of the strings ``"relu"`` / ``"gelu"``.
        layer_norm_eps: Epsilon for the normalization layers.
        layer_scale: If true, both residual branches are scaled by a
            ``LayerScale`` initialised to ``init_values``.
        init_values: Initial value for ``LayerScale``.
        norm_first: If true, normalize inside the residual branches (pre-norm);
            otherwise after each residual sum (post-norm).
        group_norm: If truthy, use ``MyGroupNorm`` with ``int(group_norm)``
            groups instead of ``nn.LayerNorm``.
        norm_out: If truthy and ``norm_first`` is set, a ``MyGroupNorm`` with
            ``int(norm_out)`` groups is applied to the layer output.
        auto_sparsity: Stored on the instance; not otherwise used here.
        sparse, mask_type, mask_random_seed, sparse_attn_window, global_window,
        sparsity: Accepted for signature compatibility; unused by this class.
        device, dtype: Factory arguments for every parameterised sub-module.
        batch_first: Passed to ``nn.MultiheadAttention``.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu, layer_norm_eps=1e-5, layer_scale=False, init_values=1e-4, norm_first=False, group_norm=False, norm_out=False, sparse=False, mask_type="diag", mask_random_seed=42, sparse_attn_window=500, global_window=50, sparsity=0.95, auto_sparsity=None, device=None, dtype=None, batch_first=False):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.auto_sparsity = auto_sparsity
        # The attention block gets the same device/dtype as the linear and norm
        # layers; leaving it out produced a mixed-dtype module when ``dtype`` was given.
        self.cross_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, **factory_kwargs)
        self.linear1 = nn.Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, **factory_kwargs)
        self.norm_first = norm_first

        if group_norm:
            self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm3 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
        else:
            self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)

        self.norm_out = None
        # Logical ``and``, not bitwise ``&``: ``norm_out`` doubles as the group
        # count, and ``True & 2`` is 0, which silently disabled the output norm
        # for even group counts.
        if self.norm_first and norm_out:
            self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model, **factory_kwargs)

        self.gamma_1 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
        self.gamma_2 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        if isinstance(activation, str):
            self.activation = self._get_activation_fn(activation)
        else:
            self.activation = activation

    def forward(self, q, k, mask=None):
        """Cross-attend ``q`` to ``k`` and apply the feed-forward block.

        Args:
            q: Query sequence; the residual stream.
            k: Sequence used as both keys and values.
            mask: Optional attention mask passed to the attention block.

        Returns:
            A tensor with the same shape as ``q``.
        """
        if self.norm_first:
            x = q + self.gamma_1(self._ca_block(self.norm1(q), self.norm2(k), mask))
            x = x + self.gamma_2(self._ff_block(self.norm3(x)))

            if self.norm_out is not None:
                x = self.norm_out(x)
        else:
            x = self.norm1(q + self.gamma_1(self._ca_block(q, k, mask)))
            x = self.norm2(x + self.gamma_2(self._ff_block(x)))

        return x

    def _ca_block(self, q, k, attn_mask=None):
        """Cross-attention (keys and values both taken from ``k``) followed by dropout."""
        x = self.cross_attn(q, k, k, attn_mask=attn_mask, need_weights=False)[0]
        return self.dropout1(x)

    def _ff_block(self, x):
        """Two-layer feed-forward network with dropout after each layer."""
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)

    def _get_activation_fn(self, activation):
        """Map ``"relu"`` / ``"gelu"`` to the matching function.

        Raises:
            RuntimeError: If ``activation`` is any other string.
        """
        if activation == "relu":
            return F.relu
        if activation == "gelu":
            return F.gelu
        raise RuntimeError(translations["activation"].format(activation=activation))
self.use_train_segment = use_train_segment - self.nfft = nfft - self.hop_length = nfft // 4 - self.wiener_iters = wiener_iters - self.end_iters = end_iters - self.freq_emb = None - assert wiener_iters == end_iters - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - self.tencoder = nn.ModuleList() - self.tdecoder = nn.ModuleList() - chin = audio_channels - chin_z = chin - if self.cac: chin_z *= 2 - chout = channels_time or channels - chout_z = channels - freqs = nfft // 2 - - for index in range(depth): - norm = index >= norm_starts - freq = freqs > 1 - stri = stride - ker = kernel_size - - if not freq: - assert freqs == 1 - ker = time_stride * 2 - stri = time_stride - - pad = True - last_freq = False - - if freq and freqs <= kernel_size: - ker = freqs - pad = False - last_freq = True - - kw = { - "kernel_size": ker, - "stride": stri, - "freq": freq, - "pad": pad, - "norm": norm, - "rewrite": rewrite, - "norm_groups": norm_groups, - "dconv_kw": {"depth": dconv_depth, "compress": dconv_comp, "init": dconv_init, "gelu": True}, - } - - kwt = dict(kw) - kwt["freq"] = 0 - kwt["kernel_size"] = kernel_size - kwt["stride"] = stride - kwt["pad"] = True - kw_dec = dict(kw) - multi = False - - if multi_freqs and index < multi_freqs_depth: - multi = True - kw_dec["context_freq"] = False - - if last_freq: - chout_z = max(chout, chout_z) - chout = chout_z - - enc = HEncLayer(chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw) - if freq: - tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc, empty=last_freq, **kwt) - self.tencoder.append(tenc) - - if multi: enc = MultiWrap(enc, multi_freqs) - - self.encoder.append(enc) - if index == 0: - chin = self.audio_channels * len(self.sources) - chin_z = chin - if self.cac: chin_z *= 2 - - dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2, last=index == 0, context=context, **kw_dec) - if multi: dec = MultiWrap(dec, multi_freqs) - - if freq: - tdec = HDecLayer(chout, chin, dconv=dconv_mode 
& 2, empty=last_freq, last=index == 0, context=context, **kwt) - self.tdecoder.insert(0, tdec) - - self.decoder.insert(0, dec) - chin = chout - chin_z = chout_z - chout = int(growth * chout) - chout_z = int(growth * chout_z) - - if freq: - if freqs <= kernel_size: freqs = 1 - else: freqs //= stride - - if index == 0 and freq_emb: - self.freq_emb = ScaledEmbedding(freqs, chin_z, smooth=emb_smooth, scale=emb_scale) - self.freq_emb_scale = freq_emb - - if rescale: rescale_module(self, reference=rescale) - transformer_channels = channels * growth ** (depth - 1) - - if bottom_channels: - self.channel_upsampler = nn.Conv1d(transformer_channels, bottom_channels, 1) - self.channel_downsampler = nn.Conv1d(bottom_channels, transformer_channels, 1) - self.channel_upsampler_t = nn.Conv1d(transformer_channels, bottom_channels, 1) - self.channel_downsampler_t = nn.Conv1d(bottom_channels, transformer_channels, 1) - transformer_channels = bottom_channels - - if t_layers > 0: self.crosstransformer = CrossTransformerEncoder(dim=transformer_channels, emb=t_emb, hidden_scale=t_hidden_scale, num_heads=t_heads, num_layers=t_layers, cross_first=t_cross_first, dropout=t_dropout, max_positions=t_max_positions, norm_in=t_norm_in, norm_in_group=t_norm_in_group, group_norm=t_group_norm, norm_first=t_norm_first, norm_out=t_norm_out, max_period=t_max_period, weight_decay=t_weight_decay, lr=t_lr, layer_scale=t_layer_scale, gelu=t_gelu, sin_random_shift=t_sin_random_shift, weight_pos_embed=t_weight_pos_embed, cape_mean_normalize=t_cape_mean_normalize, cape_augment=t_cape_augment, cape_glob_loc_scale=t_cape_glob_loc_scale, sparse_self_attn=t_sparse_self_attn, sparse_cross_attn=t_sparse_cross_attn, mask_type=t_mask_type, mask_random_seed=t_mask_random_seed, sparse_attn_window=t_sparse_attn_window, global_window=t_global_window, sparsity=t_sparsity, auto_sparsity=t_auto_sparsity) - else: self.crosstransformer = None - - def _spec(self, x): - hl = self.hop_length - nfft = self.nfft - assert hl == 
def valid_length(self, length):
    """Return the number of samples the model expects for an input of `length` samples.

    When `self.use_train_segment` is false the input length is returned
    unchanged. Otherwise the result is the training segment length,
    `int(self.segment * self.samplerate)`.

    Raises:
        ValueError: if the training segment length is shorter than `length`.
    """
    if self.use_train_segment:
        training_length = int(self.segment * self.samplerate)

        # An input longer than the training segment cannot be handled in this mode.
        if length > training_length:
            raise ValueError(translations["length_or_training_length"].format(length=length, training_length=training_length))

        return training_length

    return length
def load_model(path_or_package, strict=False):
    """Rebuild a model from a serialized package.

    Args:
        path_or_package: either an already-loaded package dict, or a path
            (str / os.PathLike) to a file readable by `torch.load`. The
            package must provide the keys "klass", "args", "kwargs" and "state".
        strict: when true, every stored keyword argument is passed to the
            model class as-is. When false, keyword arguments that are not in
            the class signature are dropped, with a warning for each one.

    Returns:
        The instantiated model with its state restored through `set_state`.

    Raises:
        ValueError: if `path_or_package` is neither a dict nor a path.
    """
    if isinstance(path_or_package, dict):
        package = path_or_package
    elif isinstance(path_or_package, (str, os.PathLike)):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # NOTE(security): weights_only=False lets the checkpoint unpickle
            # arbitrary objects (needed here because the package stores the
            # model class itself). Only load checkpoints from trusted sources.
            package = torch.load(path_or_package, map_location="cpu", weights_only=False)
    else:
        raise ValueError(f"{translations['type_not_valid']} {path_or_package}.")

    klass = package["klass"]
    args = package["args"]
    kwargs = package["kwargs"]

    if not strict:
        accepted = inspect.signature(klass).parameters
        filtered = {}

        # Build a filtered copy instead of deleting from `kwargs`, so a package
        # dict supplied by the caller is left untouched and can be reused.
        for key, value in kwargs.items():
            if key in accepted:
                filtered[key] = value
            else:
                warnings.warn(translations["del_parameter"] + key)

        kwargs = filtered

    model = klass(*args, **kwargs)
    set_state(model, package["state"])

    return model
def __init__(
    self,
    logger,
    model_file_dir=config.configs["uvr5_path"],
    output_dir=None,
    output_format="wav",
    output_bitrate=None,
    normalization_threshold=0.9,
    sample_rate=44100,
    mdx_params={
        "hop_length": 1024,
        "segment_size": 256,
        "overlap": 0.25,
        "batch_size": 1,
        "enable_denoise": False
    },
    demucs_params={
        "segment_size": "Default",
        "shifts": 2,
        "overlap": 0.25,
        "segments_enabled": True
    },
    vr_params={
        "batch_size": 1,
        "window_size": 512,
        "aggression": 5,
        "enable_tta": False,
        "enable_post_process": False,
        "post_process_threshold": 0.2,
        "high_end_process": False
    }
):
    """Configure the separator and select the compute device.

    Args:
        logger: logger used for all status messages.
        model_file_dir: directory holding model files; created if missing.
        output_dir: directory for results; falls back to `now_dir` when None.
            Created if missing.
        output_format: output audio format; None falls back to "wav".
        output_bitrate: stored as given and forwarded to the architecture class.
        normalization_threshold: must satisfy 0 < value <= 1.
        sample_rate: converted with `int()` and stored.
        mdx_params, demucs_params, vr_params: per-architecture settings,
            stored under the keys "MDX", "Demucs" and "VR".

    Raises:
        ValueError: if `normalization_threshold` is outside (0, 1].
    """
    # Validate first so an invalid argument fails before any directory is created.
    if normalization_threshold <= 0 or normalization_threshold > 1:
        raise ValueError(f"normalization_threshold must be greater than 0 and at most 1, got {normalization_threshold!r}")

    self.logger = logger
    self.logger.info(translations["separator_info"].format(output_dir=output_dir, output_format=output_format))
    self.model_file_dir = model_file_dir
    self.output_dir = output_dir if output_dir is not None else now_dir
    os.makedirs(self.model_file_dir, exist_ok=True)
    os.makedirs(self.output_dir, exist_ok=True)
    self.output_format = output_format if output_format is not None else "wav"
    self.output_bitrate = output_bitrate
    self.normalization_threshold = normalization_threshold
    self.sample_rate = int(sample_rate)

    # The default parameter dicts are created once at definition time and would
    # otherwise be shared by every instance; keep a private shallow copy so a
    # later modification cannot leak between instances or back to the caller.
    arch_params = {"MDX": mdx_params, "Demucs": demucs_params, "VR": vr_params}
    self.arch_specific_params = {
        arch: (dict(params) if isinstance(params, dict) else params)
        for arch, params in arch_params.items()
    }

    self.torch_device = None
    self.torch_device_cpu = None
    self.torch_device_mps = None
    self.onnx_execution_provider = None
    self.model_instance = None
    self.setup_torch_device()
def get_model_hash(self, model_path):
    """Return the MD5 hex digest used to identify a model file.

    Only the final 10000 * 1024 bytes of the file are hashed. When the file
    is too small for that seek to succeed, the whole file is hashed instead.

    Args:
        model_path: path of the model file.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    tail_size = 10000 * 1024

    try:
        with open(model_path, "rb") as f:
            # whence=2 seeks relative to the end; raises for files shorter than tail_size.
            f.seek(-tail_size, 2)
            return hashlib.md5(f.read()).hexdigest()
    except IOError:
        # Fallback: hash the entire file, closing the handle deterministically.
        with open(model_path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()
def load_model_data_from_yaml(self, yaml_config_filename):
    """Load model metadata from a YAML configuration file.

    Args:
        yaml_config_filename: path to the YAML file. If that path does not
            exist it is looked up inside `self.model_file_dir` instead.

    Returns:
        The object produced by parsing the YAML document.
    """
    if os.path.exists(yaml_config_filename):
        model_data_yaml_filepath = yaml_config_filename
    else:
        model_data_yaml_filepath = os.path.join(self.model_file_dir, yaml_config_filename)

    # NOTE(security): FullLoader is not restricted to plain data types. These
    # configs may be downloaded; consider yaml.safe_load if they only ever
    # contain scalars, lists and mappings -- confirm before switching.
    with open(model_data_yaml_filepath, encoding="utf-8") as f:
        return yaml.load(f, Loader=yaml.FullLoader)
def separate(self, audio_file_path):
    """Run the loaded model on one audio file and return its output files.

    The model call runs inside a `torch.amp.autocast` context. Afterwards the
    GPU cache is cleared, the model's per-file paths are reset, and the
    elapsed time is logged as HH:MM:SS.

    Args:
        audio_file_path: path of the audio file to separate.

    Returns:
        Whatever `self.model_instance.separate` returns for the file.
    """
    self.logger.info(f"{translations['starting_separator']}: {audio_file_path}")
    started_at = time.perf_counter()

    # The "ocl" device type is mapped to "cpu" for the autocast context.
    device_type = self.torch_device.type
    autocast_device = "cpu" if device_type == "ocl" else device_type
    use_half = config.is_half
    autocast_dtype = torch.float16 if use_half else torch.float32

    with torch.amp.autocast(autocast_device, enabled=use_half, dtype=autocast_dtype):
        output_files = self.model_instance.separate(audio_file_path)

    clear_gpu_cache()
    self.model_instance.clear_file_specific_paths()
    self.logger.debug(translations["separator_success_3"])

    elapsed_seconds = int(time.perf_counter() - started_at)
    duration = time.strftime('%H:%M:%S', time.gmtime(elapsed_seconds))
    self.logger.info(f"{translations['separator_duration']}: {duration}")

    return output_files
def make_padding(width, cropsize, offset):
    """Compute left/right padding and the region-of-interest size for cropping.

    Args:
        width: length of the axis being padded.
        cropsize: size of one crop window.
        offset: margin removed from each side of a crop window.

    Returns:
        A tuple `(left, right, roi_size)`. `roi_size` is `cropsize - 2 * offset`,
        or `cropsize` when that difference is zero. `left` equals `offset`, and
        `right` is `roi_size - (width % roi_size) + offset`.
    """
    left = offset
    roi_size = cropsize - 2 * left

    # A zero-sized region would make the modulo below fail; use the full crop instead.
    if roi_size == 0:
        roi_size = cropsize

    remainder = width % roi_size
    right = roi_size - remainder + left

    return left, right, roi_size
def spectrogram_to_image(spec, mode="magnitude"):
    """Convert a spectrogram into an 8-bit image array.

    Args:
        spec: real or complex array.
        mode: "magnitude" uses log10 of the squared magnitude (plus 1e-8);
            "phase" uses the angle of complex input, or the values themselves
            for real input.

    Returns:
        A uint8 array rescaled so the minimum maps to 0 and the maximum to 255.
        For 3-D input the axes are reordered with `transpose(1, 2, 0)` and the
        per-pixel maximum over the last axis is prepended as an extra channel.

    Raises:
        ValueError: if `mode` is neither "magnitude" nor "phase".
    """
    if mode == "magnitude":
        magnitude = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(magnitude ** 2 + 1e-8)
    elif mode == "phase":
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            # Copy so the in-place rescaling below never touches the caller's array.
            y = np.array(spec)

            # Integer input cannot hold the fractional scale factor in place.
            if not np.issubdtype(y.dtype, np.floating):
                y = y.astype(np.float64)
    else:
        raise ValueError(f"Unsupported mode: {mode!r} (expected 'magnitude' or 'phase')")

    y -= y.min()
    peak = y.max()

    # A constant input has peak == 0; skip scaling instead of dividing by zero.
    if peak > 0:
        y *= 255 / peak

    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img
def align_wave_head_and_tail(a, b):
    """Trim two multi-channel waves to the length of the shorter one.

    Args:
        a, b: 2-D arrays indexed as [channel, sample].

    Returns:
        A tuple `(a_trimmed, b_trimmed)` where both keep every channel and
        only the first `min(a[0].size, b[0].size)` samples.
    """
    length = min(a[0].size, b[0].size)

    # Slice the sample axis only. Slicing the channel axis with the same bound
    # silently dropped channels whenever the common length was smaller than
    # the channel count.
    return a[:, :length], b[:, :length]
def wave_to_spectrogram(wave, hop_length, n_fft, mp, band, is_v51_model=False):
    """Compute a two-channel STFT of `wave`, applying the model's channel mixing.

    Args:
        wave: 1-D array (duplicated into two channels) or an array whose
            first two rows are the channels.
        hop_length, n_fft: forwarded to `librosa.stft`.
        mp: object whose `param` mapping holds the "reverse", "mid_side" and
            "mid_side_b2" flags, consulted in that order for non-v5.1 models.
        band: band identifier forwarded to `convert_channels`.
        is_v51_model: when true the channels are transformed unchanged and
            `convert_channels` is applied to the resulting spectrogram.

    Returns:
        Array stacking the first and second channel spectrograms.
    """
    if wave.ndim == 1:
        wave = np.asfortranarray([wave, wave])

    first, second = wave[0], wave[1]

    if is_v51_model:
        # v5.1 models do their channel conversion on the spectrogram instead.
        channels = (np.asfortranarray(first), np.asfortranarray(second))
    elif mp.param["reverse"]:
        channels = (np.flip(np.asfortranarray(first)), np.flip(np.asfortranarray(second)))
    elif mp.param["mid_side"]:
        channels = (
            np.asfortranarray(np.add(first, second) / 2),
            np.asfortranarray(np.subtract(first, second)),
        )
    elif mp.param["mid_side_b2"]:
        channels = (
            np.asfortranarray(np.add(second, first * 0.5)),
            np.asfortranarray(np.subtract(first, second * 0.5)),
        )
    else:
        channels = (np.asfortranarray(first), np.asfortranarray(second))

    spec = np.asfortranarray([librosa.stft(channel, n_fft=n_fft, hop_length=hop_length) for channel in channels])

    if is_v51_model:
        spec = convert_channels(spec, mp, band)

    return spec
np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mp.param["mid_side"]: return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) - elif mp.param["mid_side_b2"]: return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) - - return np.asfortranarray([wave_left, wave_right]) - -def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None, is_v51_model=False): - bands_n = len(mp.param["band"]) - offset = 0 - - for d in range(1, bands_n + 1): - bp = mp.param["band"][str(d)] - spec_s = np.zeros(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) - h = bp["crop_stop"] - bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] - offset += h - - if d == bands_n: - if extra_bins_h: - max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] - - if bp["hpf_start"] > 0: - if is_v51_model: spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1) - else: spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - - wave = spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model) if bands_n == 1 else np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)) - else: - sr = mp.param["band"][str(d + 1)]["sr"] - if d == 1: - if is_v51_model: spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"]) - else: spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - - try: - wave = librosa.resample(spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model), orig_sr=bp["sr"], target_sr=sr, res_type="soxr_vhq") - except ValueError as e: - logger.error(f"{translations['resample_error']}: {e}") - logger.error(f"{translations['shapes']} Spec_s: {spec_s.shape}, SR: {sr}, {translations['wav_resolution']}: {wav_resolution}") - else: - if is_v51_model: - spec_s *= 
def get_lp_filter_mask(n_bins, bin_start, bin_stop):
    """Build a low-pass mask as a column vector of shape (n_bins, 1).

    The mask is made of three stacked segments: `bin_start - 1` ones, a
    linear ramp from 1 down to 0 over `bin_stop - bin_start + 1` bins, and
    `n_bins - bin_stop` zeros.
    """
    passband = np.ones((bin_start - 1, 1))
    rolloff = np.linspace(1, 0, bin_stop - bin_start + 1).reshape(-1, 1)
    stopband = np.zeros((n_bins - bin_stop, 1))

    return np.vstack((passband, rolloff, stopband))
def mirroring(a, spec_m, input_high_end, mp):
    """Limit `input_high_end` using a flipped band of `spec_m` as the ceiling.

    The band is the slice of `spec_m` along axis 1 that ends 10 bins below
    `mp.param["pre_filter_start"]` and has the same height as
    `input_high_end`; its magnitudes are flipped along axis 1.

    Args:
        a: "mirroring" gives the flipped magnitudes the phase of
            `input_high_end`; "mirroring2" multiplies them by
            `input_high_end * 1.7`.
        spec_m: source spectrogram.
        input_high_end: high-frequency bins to be limited.
        mp: object whose `param` mapping provides "pre_filter_start".

    Returns:
        Element-wise, `input_high_end` where its magnitude does not exceed the
        candidate's and the candidate elsewhere. None for any other `a`.
    """
    if a not in ("mirroring", "mirroring2"):
        return None

    band_stop = mp.param["pre_filter_start"] - 10
    band_start = band_stop - input_high_end.shape[1]
    reflected = np.flip(np.abs(spec_m[:, band_start:band_stop, :]), 1)

    if a == "mirroring":
        candidate = reflected * np.exp(1.0j * np.angle(input_high_end))
    else:
        candidate = np.multiply(reflected, input_high_end * 1.7)

    return np.where(np.abs(input_high_end) <= np.abs(candidate), input_high_end, candidate)
def wave_to_spectrogram_no_mp(wave):
    """Compute the STFT of `wave` with n_fft=2048 and hop_length=1024.

    A mono wave is expanded to two identical channels so the result always
    has a leading channel axis, mirroring what `spectrogram_to_wave_no_mp`
    does for mono waves.

    Args:
        wave: 1-D mono samples or a 2-D [channel, sample] array.

    Returns:
        The complex spectrogram with a leading channel axis.
    """
    spec = librosa.stft(wave, n_fft=2048, hop_length=1024)

    # The STFT of a 1-D signal is 2-D (bins x frames), so mono input is detected
    # by ndim == 2. The previous `ndim == 1` check could never be true, which
    # left mono spectrograms without a channel axis.
    if spec.ndim == 2:
        spec = np.asfortranarray([spec, spec])

    return spec
def to_shape(x, target_shape):
    """Zero-pad `x` at the end of each axis so it reaches `target_shape`.

    Args:
        x: array to pad.
        target_shape: desired size per axis, paired with `x.shape` in order.

    Returns:
        A padded copy of `x`; existing values keep their positions and the
        new trailing entries are zero.
    """
    pad_widths = tuple((0, wanted - current) for current, wanted in zip(x.shape, target_shape))

    return np.pad(x, pad_widths, mode="constant")
def adjust_leading_silence(target_audio, reference_audio, silence_threshold=0.01, frame_length=1024):
    """Give ``target_audio`` the same amount of leading silence as ``reference_audio``.

    The end of the leading silence is located frame by frame: the first
    frame of ``frame_length`` samples whose peak absolute value exceeds
    ``silence_threshold`` marks where the signal starts. The target is then
    zero-padded at the front, or trimmed at the front, by the difference
    between the two start positions.

    Args:
        target_audio: 1-D array, or 2-D array indexed as ``[channel, sample]``.
        reference_audio: Array of the same layout used as the timing reference.
        silence_threshold: Peak amplitude above which a frame counts as signal.
        frame_length: Number of samples inspected per step.

    Returns:
        The padded or trimmed audio; ``target_audio`` itself when both
        signals already start at the same frame.
    """
    def find_silence_end(audio):
        # For 2-D input only the row with the largest summed magnitude is analysed.
        if len(audio.shape) == 2:
            channel = np.argmax(np.sum(np.abs(audio), axis=1))
            audio_mono = audio[channel]
        else:
            audio_mono = audio

        for i in range(0, len(audio_mono), frame_length):
            if np.max(np.abs(audio_mono[i : i + frame_length])) > silence_threshold:
                return i

        # No frame crossed the threshold: the whole signal is silence.
        return len(audio_mono)

    # A previous revision also computed this difference in milliseconds inside
    # a try/except that swallowed every exception; the value was never used,
    # so that dead code is gone.
    silence_difference = find_silence_end(reference_audio) - find_silence_end(target_audio)
    is_multichannel = len(target_audio.shape) == 2

    if silence_difference > 0:
        # The reference starts later: prepend zeros to delay the target.
        padding = np.zeros((target_audio.shape[0], silence_difference)) if is_multichannel else np.zeros(silence_difference)
        return np.hstack((padding, target_audio))

    if silence_difference < 0:
        # The reference starts earlier: drop the surplus samples from the front.
        return target_audio[:, -silence_difference:] if is_multichannel else target_audio[-silence_difference:]

    return target_audio
def augment_audio(export_path, audio_file, rate, is_normalization, wav_type_set, save_format=None, is_pitch=False, is_time_correction=True):
    """Pitch-shift or time-stretch an audio file and write the result to ``export_path``.

    The file is loaded at 44100 Hz without down-mixing; a 1-D result is
    duplicated into two identical rows. Three processing paths exist:

    * ``is_time_correction`` false: ``change_pitch_semitones`` is applied with
      ``semitone_shift=-rate`` and the output is written at the original rate.
    * ``is_time_correction`` true and ``is_pitch`` true: each of the first two
      rows is passed to ``pitch_shift`` with ``rate`` as the step count.
    * ``is_time_correction`` true and ``is_pitch`` false: each of the first two
      rows is passed to ``time_stretch`` with ``rate`` as the tempo factor.

    Args:
        export_path: Destination path handed to ``sf.write``.
        audio_file: Path of the audio file to load.
        rate: Semitone count or tempo factor, depending on the path taken.
        is_normalization: Forwarded to ``normalize``.
        wav_type_set: ``subtype`` argument for ``sf.write``.
        save_format: Optional callable invoked with ``export_path`` after the
            file is written. ``None`` (the default) skips this step.
        is_pitch: Select pitch shifting rather than time stretching.
        is_time_correction: Use the rubberband-based helpers when true.
    """
    wav, sr = librosa.load(audio_file, sr=44100, mono=False)
    if wav.ndim == 1: wav = np.asfortranarray([wav, wav])

    if not is_time_correction:
        wav_mix = change_pitch_semitones(wav, 44100, semitone_shift=-rate)[0]
    else:
        transform = pitch_shift if is_pitch else time_stretch
        wav_1, wav_2 = transform(wav[0], sr, rate, rbargs=None), transform(wav[1], sr, rate, rbargs=None)

        # The two channels are processed independently and may come back with
        # different lengths; pad the shorter one so they can be stacked.
        if wav_1.shape > wav_2.shape: wav_2 = to_shape(wav_2, wav_1.shape)
        if wav_1.shape < wav_2.shape: wav_1 = to_shape(wav_1, wav_2.shape)

        wav_mix = np.asfortranarray([wav_1, wav_2])

    sf.write(export_path, normalize(wav_mix.T, is_normalization), sr, subtype=wav_type_set)

    # The parameter defaults to None; calling it unconditionally raised
    # "TypeError: 'NoneType' object is not callable" for callers relying on
    # the default.
    if save_format is not None: save_format(export_path)
def align_audio(file1, file2, file2_aligned, file_subtracted, wav_type_set, is_save_aligned, command_Text, save_format, align_window, align_intro_val, db_analysis, set_progress_bar, phase_option, phase_shifts, is_match_silence, is_spec_match):
    """Align ``file2`` against ``file1``, subtract it, and write the result to disk.

    Steps, in order:

    1. Load both files at 44100 Hz; when only one of them is 1-D it is
       duplicated into two rows so both have the same layout.
    2. Apply the polarity selected by ``phase_option`` to the second track and,
       if ``is_match_silence`` is set, match its leading silence to the first.
    3. For every entry of ``align_intro_val`` estimate a sample offset by
       cross-correlating a one-second excerpt, and build a shifted copy of the
       second track. Duplicate copies are discarded.
    4. For every copy produce subtraction candidates, either through
       ``time_correction`` (when ``align_window`` is non-empty) or by a plain
       subtraction at each dB adjustment in ``db_analysis[1]``.
    5. Combine the candidates, clip to ``[-1, 1]`` and write ``file_subtracted``
       (and ``file2_aligned`` when ``is_save_aligned`` is set).

    Args:
        file1: Path of the reference audio (the minuend).
        file2: Path of the audio to align and subtract.
        file2_aligned: Output path for the aligned second track.
        file_subtracted: Output path for the subtraction result.
        wav_type_set: ``subtype`` argument for ``sf.write``.
        is_save_aligned: Also write the aligned second track.
        command_Text: Callable receiving status messages.
        save_format: Callable invoked with each written path.
        align_window: Window lengths forwarded to ``time_correction``; a falsy
            value selects the plain-subtraction path.
        align_intro_val: Iterable of divisors selecting where the excerpt used
            for offset estimation is taken; ``1`` means one second in.
        db_analysis: Indexable pair; item 0 is a dB gain applied to the second
            track, item 1 an iterable of additional dB adjustments to try.
        set_progress_bar: Callable receiving ``(0.1, fraction)`` updates.
        phase_option: One of ``AUTO_PHASE``, ``POSITIVE_PHASE``, ``NEGATIVE_PHASE``.
        phase_shifts: Forwarded to ``time_correction``.
        is_match_silence: Match leading silence before aligning.
        is_spec_match: Combine candidates with ``ensemble_for_align``.
    """
    global progress_value
    progress_value = 0
    is_mono = False

    def get_diff(a, b):
        # Lag, in samples, at which the full cross-correlation of a and b peaks.
        return np.correlate(a, b, "full").argmax() - (b.shape[0] - 1)

    def progress_bar(length):
        global progress_value
        progress_value += 1

        # Stretch the denominator once the estimate would reach 0.9 so the
        # reported fraction stays below that value.
        if (0.90 / length * progress_value) >= 0.9: length = progress_value + 1
        set_progress_bar(0.1, (0.9 / length * progress_value))

    def load_track(path):
        # On macOS an mp3 is loaded with an explicit duration read via audioread.
        if path.endswith(".mp3") and is_macos:
            return librosa.load(path, duration=rerun_mp3(path), sr=44100, mono=False)
        return librosa.load(path, sr=44100, mono=False)

    wav1, sr1 = load_track(file1)
    wav2, _ = load_track(file2)

    if wav1.ndim == 1 and wav2.ndim == 1: is_mono = True
    elif wav1.ndim == 1: wav1 = np.asfortranarray([wav1, wav1])
    elif wav2.ndim == 1: wav2 = np.asfortranarray([wav2, wav2])

    if phase_option == AUTO_PHASE:
        if check_if_phase_inverted(wav1, wav2, is_mono=is_mono): wav2 = -wav2
    elif phase_option == POSITIVE_PHASE: wav2 = +wav2
    elif phase_option == NEGATIVE_PHASE: wav2 = -wav2

    if is_match_silence: wav2 = adjust_leading_silence(wav2, wav1)

    wav1_length = int(librosa.get_duration(y=wav1, sr=44100))
    wav2_length = int(librosa.get_duration(y=wav2, sr=44100))

    # From here on multi-channel audio is indexed as [sample, channel].
    if not is_mono:
        wav1 = wav1.transpose()
        wav2 = wav2.transpose()

    wav2_org = wav2.copy()

    command_Text(translations["process_file"])
    seconds_length = min(wav1_length, wav2_length)
    wav2_aligned_sources = []

    for sec_len in align_intro_val:
        sec_seg = 1 if sec_len == 1 else int(seconds_length // sec_len)
        index = sr1 * sec_seg

        # Only the first channel determines the offset. The right-channel
        # correlation computed here previously was discarded, so it is no
        # longer calculated.
        if is_mono: diff = get_diff(wav1[index : index + sr1], wav2[index : index + sr1])
        else: diff = get_diff(wav1[index : index + sr1, 0], wav2[index : index + sr1, 0])

        if diff > 0: wav2_aligned = np.append(np.zeros(diff) if is_mono else np.zeros((diff, 2)), wav2_org, axis=0)
        elif diff < 0: wav2_aligned = wav2_org[-diff:]
        else: wav2_aligned = wav2_org

        if not any(np.array_equal(wav2_aligned, source) for source in wav2_aligned_sources): wav2_aligned_sources.append(wav2_aligned)

    unique_sources = len(wav2_aligned_sources)
    # Candidates keyed by their mean absolute amplitude; equal keys overwrite.
    sub_mapper_big_mapper = {}

    for s in wav2_aligned_sources:
        wav2_aligned = match_mono_array_shapes(s, wav1) if is_mono else match_array_shapes(s, wav1, is_swap=True)

        if align_window:
            wav_sub = time_correction(wav1, wav2_aligned, seconds_length, align_window=align_window, db_analysis=db_analysis, progress_bar=progress_bar, unique_sources=unique_sources, phase_shifts=phase_shifts)
            sub_mapper_big_mapper[np.abs(wav_sub).mean()] = wav_sub
        else:
            wav2_aligned = wav2_aligned * np.power(10, db_analysis[0] / 20)

            for db_adjustment in db_analysis[1]:
                # The candidate must exist before its mean can be used as the
                # key; the previous code read ``wav_sub`` before assigning it
                # on this path and raised UnboundLocalError.
                wav_sub = wav1 - (wav2_aligned * (10 ** (db_adjustment / 20)))
                sub_mapper_big_mapper[np.abs(wav_sub).mean()] = wav_sub

    candidates = list(sub_mapper_big_mapper.values())
    wav_sub = ensemble_for_align(candidates) if is_spec_match and len(candidates) >= 2 else ensemble_wav(candidates)
    wav_sub = np.clip(wav_sub, -1, +1)

    command_Text(translations["save_instruments"])

    if is_save_aligned or is_spec_match:
        wav1 = match_mono_array_shapes(wav1, wav_sub) if is_mono else match_array_shapes(wav1, wav_sub, is_swap=True)
        wav2_aligned = wav1 - wav_sub

        if is_spec_match:
            if wav1.ndim == 1 and wav2.ndim == 1:
                wav2_aligned = np.asfortranarray([wav2_aligned, wav2_aligned]).T
                wav1 = np.asfortranarray([wav1, wav1]).T

            wav2_aligned = ensemble_for_align([wav2_aligned, wav1])
            wav_sub = wav1 - wav2_aligned

        if is_save_aligned:
            sf.write(file2_aligned, wav2_aligned, sr1, subtype=wav_type_set)
            save_format(file2_aligned)

    sf.write(file_subtracted, wav_sub, sr1, subtype=wav_type_set)
    save_format(file_subtracted)
def ensemble_wav(waveforms, split_size=240):
    """Stitch together the quietest segment from each of several waveforms.

    Every waveform is cut into ``split_size`` consecutive pieces with
    ``np.array_split``. For each piece position, the piece with the smallest
    mean absolute value across all waveforms is kept (the first waveform wins
    ties), and the kept pieces are concatenated in order.

    Args:
        waveforms: Sequence of arrays to choose from.
        split_size: Number of pieces each waveform is divided into.

    Returns:
        The concatenation of the selected pieces.
    """
    chunked = [np.array_split(waveform, split_size) for waveform in waveforms]
    selected = []

    for position in range(split_size):
        options = [pieces[position] for pieces in chunked]
        quietest = np.argmin([np.abs(option).mean() for option in options])
        selected.append(options[quietest])

    return np.concatenate(selected)
def align_audio_test(wav1, wav2, sr1=44100):
    """Shift ``wav2`` so it lines up with ``wav1`` and return the shifted copy.

    Both inputs are 2-D arrays indexed as ``[channel, sample]``. The offset is
    estimated by cross-correlating the first channel of each signal over the
    ``sr1`` samples that start ``sr1`` samples in.

    Args:
        wav1: Reference audio.
        wav2: Audio to align.
        sr1: Both the excerpt length and the excerpt start, in samples.

    Returns:
        ``wav2`` transposed to ``[sample, channel]`` and either padded with
        leading zero rows, trimmed at the front, or unchanged.
    """
    def get_diff(a, b):
        # Lag, in samples, at which the full cross-correlation of a and b peaks.
        return np.correlate(a, b, "full").argmax() - (b.shape[0] - 1)

    wav1 = wav1.transpose()
    wav2 = wav2.transpose()
    wav2_org = wav2.copy()
    index = sr1
    diff = get_diff(wav1[index : index + sr1, 0], wav2[index : index + sr1, 0])

    if diff > 0:
        # Pad with as many columns as the audio has channels. A fixed width of
        # one column cannot be concatenated with multi-channel audio.
        wav2_aligned = np.append(np.zeros((diff, wav2_org.shape[1])), wav2_org, axis=0)
    elif diff < 0:
        wav2_aligned = wav2_org[-diff:]
    else:
        wav2_aligned = wav2_org

    return wav2_aligned
class Conv2DBNActiv(nn.Module):
    """2-D convolution followed by batch normalisation and an activation.

    The three layers are stored in ``self.conv`` as an ``nn.Sequential``; the
    convolution has no bias term.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Convolution padding.
        dilation: Convolution dilation.
        activ: Activation class, instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(nn.Conv2d(nin, nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, bias=False), nn.BatchNorm2d(nout), activ())

    # Defined as ``forward`` rather than by overriding ``__call__``: calling the
    # module still works through ``nn.Module.__call__``, which also runs
    # registered hooks, and ``.forward()`` is no longer unimplemented.
    def forward(self, input_tensor):
        """Apply convolution, batch normalisation and activation to ``input_tensor``."""
        return self.conv(input_tensor)
class ASPPModule(nn.Module):
    """Parallel pooling and dilated-convolution branches merged by a 1x1 bottleneck.

    Five branches are always present: a pooled branch (``conv1``), a 1x1
    convolution (``conv2``) and three separable convolutions with the three
    dilation rates (``conv3``-``conv5``). Depending on ``nn_architecture`` one
    or two extra branches are added. All branch outputs are concatenated on
    the channel axis and reduced to ``nout`` channels by ``bottleneck``.

    Args:
        nn_architecture: Integer identifier compared against ``six_layer`` and
            ``seven_layer`` to decide how many branches are used.
        nin: Number of input channels; every branch also outputs ``nin`` channels.
        nout: Number of channels produced by the bottleneck.
        dilations: Three dilation rates for ``conv3``, ``conv4`` and ``conv5``.
        activ: Activation class handed to every convolution block.
    """

    def __init__(self, nn_architecture, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.nn_architecture = nn_architecture
        # Architecture identifiers that use six and seven branches respectively.
        self.six_layer = [129605]
        self.seven_layer = [537238, 537227, 33966]
        # Built unconditionally and before the other branches; it is only
        # registered on the module if one of the branches below assigns it.
        extra_conv = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)

        # Pools the third axis down to size 1 and keeps the last axis as is.
        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ))
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)

        if self.nn_architecture in self.six_layer:
            self.conv6 = extra_conv
            nin_x = 6
        elif self.nn_architecture in self.seven_layer:
            # NOTE(review): conv6 and conv7 are the same module object, so they
            # share parameters. Presumably existing checkpoints depend on
            # this; confirm before changing.
            self.conv6 = extra_conv
            self.conv7 = extra_conv
            nin_x = 7
        else:
            nin_x = 5

        # nin_x is the number of concatenated branches, each with nin channels.
        self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * nin_x, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

    def forward(self, input_tensor):
        """Run every branch on ``input_tensor`` and return the bottleneck output."""
        # Requires a 4-D input; the last two sizes are the interpolation target.
        _, _, h, w = input_tensor.size()

        # The pooled branch is resized back to the input's last two dimensions.
        feat1 = F.interpolate(self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True)
        feat2 = self.conv2(input_tensor)
        feat3 = self.conv3(input_tensor)
        feat4 = self.conv4(input_tensor)
        feat5 = self.conv5(input_tensor)

        if self.nn_architecture in self.six_layer:
            feat6 = self.conv6(input_tensor)
            out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6), dim=1)
        elif self.nn_architecture in self.seven_layer:
            feat6 = self.conv6(input_tensor)
            feat7 = self.conv7(input_tensor)
            out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
        else:
            out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)

        bottleneck_output = self.bottleneck(out)
        return bottleneck_output
class ASPPModule(nn.Module):
    """Five parallel convolution branches fused by a 1x1 bottleneck.

    The branches are a pooled 1x1 convolution (``conv1``), a plain 1x1
    convolution (``conv2``) and three 3x3 convolutions whose padding and
    dilation are taken from ``dilations`` (``conv3``-``conv5``). Their outputs
    are concatenated on the channel axis and reduced back to ``nout`` channels.

    Args:
        nin: Number of input channels.
        nout: Number of channels produced by every branch and by the bottleneck.
        dilations: Three values used as both padding and dilation.
        activ: Activation class handed to every convolution block.
        dropout: When true, ``Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
        super().__init__()

        def pointwise():
            # 1x1 convolution block without padding.
            return Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)

        def dilated(rate):
            # 3x3 convolution block; the same value serves as padding and dilation.
            return Conv2DBNActiv(nin, nout, 3, 1, rate, rate, activ=activ)

        # Sub-modules are created in the same order as their attribute numbers.
        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pointwise())
        self.conv2 = pointwise()
        self.conv3 = dilated(dilations[0])
        self.conv4 = dilated(dilations[1])
        self.conv5 = dilated(dilations[2])
        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)

        if dropout:
            self.dropout = nn.Dropout2d(0.1)
        else:
            self.dropout = None

    def forward(self, input_tensor):
        """Run all branches on ``input_tensor`` and return the fused result."""
        _, _, height, width = input_tensor.size()

        # The pooled branch is resized back to the input's last two dimensions.
        pooled = F.interpolate(self.conv1(input_tensor), size=(height, width), mode="bilinear", align_corners=True)
        features = [pooled]
        for branch in (self.conv2, self.conv3, self.conv4, self.conv5):
            features.append(branch(input_tensor))

        fused = self.bottleneck(torch.cat(features, dim=1))

        if self.dropout is None:
            return fused

        return self.dropout(fused)
def int_keys(pairs):
    """Build a dict from ``pairs``, turning decimal-digit string keys into ints.

    Intended as an ``object_pairs_hook`` for ``json.loads``. Keys that are not
    strings, or that contain anything other than decimal digits (including the
    empty string and signed numbers), are kept unchanged. A later pair
    overwrites an earlier one that maps to the same key.

    Args:
        pairs: Iterable of ``(key, value)`` tuples.

    Returns:
        A dict holding the converted keys and the original values.
    """
    # ``str.isdecimal`` rather than ``str.isdigit``: the latter also accepts
    # characters such as superscript digits, which ``int()`` rejects with
    # ValueError.
    return {(int(key) if isinstance(key, str) and key.isdecimal() else key): value for key, value in pairs}
def determine_model_capacity(n_fft_bins, nn_architecture):
    """Build a ``CascadedASPPNet`` sized for the given architecture identifier.

    Each supported identifier belongs to one of three capacity groups. A
    group's table holds nine tuples that ``CascadedASPPNet`` unpacks, in order,
    into its two stage-1 band networks, the stage-2 bridge and network, the
    stage-3 bridge and network, and the three output convolutions.

    Args:
        n_fft_bins: FFT size forwarded to ``CascadedASPPNet``.
        nn_architecture: Integer architecture identifier.

    Returns:
        The constructed ``CascadedASPPNet``.

    Raises:
        ValueError: If ``nn_architecture`` is not in any capacity group.
            Previously this case failed with ``UnboundLocalError``.
    """
    sp_model_arch = [31191, 33966, 129605]
    hp_model_arch = [123821, 123812]
    hp2_model_arch = [537238, 537227]

    # The groups are disjoint, so exactly one branch applies.
    if nn_architecture in sp_model_arch:
        model_capacity_data = [(2, 16), (2, 16), (18, 8, 1, 1, 0), (8, 16), (34, 16, 1, 1, 0), (16, 32), (32, 2, 1), (16, 2, 1), (16, 2, 1)]
    elif nn_architecture in hp_model_arch:
        model_capacity_data = [(2, 32), (2, 32), (34, 16, 1, 1, 0), (16, 32), (66, 32, 1, 1, 0), (32, 64), (64, 2, 1), (32, 2, 1), (32, 2, 1)]
    elif nn_architecture in hp2_model_arch:
        model_capacity_data = [(2, 64), (2, 64), (66, 32, 1, 1, 0), (32, 64), (130, 64, 1, 1, 0), (64, 128), (128, 2, 1), (64, 2, 1), (64, 2, 1)]
    else:
        raise ValueError(f"Unsupported nn_architecture: {nn_architecture!r}")

    return CascadedASPPNet(n_fft_bins, model_capacity_data, nn_architecture)
class BaseNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and an LSTM side branch.

    ``enc1`` is a stride-1 convolution block; ``enc2``-``enc5`` are stride-2
    encoders that widen the channels to 2, 4, 6 and 8 times ``nout``. After
    the ASPP module, ``dec4``-``dec2`` each take the matching encoder output
    as a skip connection. The output of ``lstm_dec2`` is concatenated to the
    ``dec2`` output on the channel axis before ``dec1``.

    Args:
        nin: Number of input channels.
        nout: Base channel count.
        nin_lstm: Forwarded to ``LSTMModule`` as ``nin_lstm``.
        nout_lstm: Forwarded to ``LSTMModule`` as ``nout_lstm``.
        dilations: Forwarded to ``ASPPModule``.
    """

    def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))):
        super(BaseNet, self).__init__()
        self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1)
        self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1)
        self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1)
        self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
        # Decoder input widths are the upsampled channels plus the skip channels.
        self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
        self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
        self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm)
        # The extra "+ 1" is the channel contributed by lstm_dec2.
        self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)

    # Defined as ``forward`` rather than by overriding ``__call__``, matching
    # ``CascadedNet.forward`` in this file. Calling the module still works
    # through ``nn.Module.__call__``, which also runs registered hooks.
    def forward(self, input_tensor):
        """Encode, apply ASPP, decode with skip connections, and return the result."""
        encoded1 = self.enc1(input_tensor)
        encoded2 = self.enc2(encoded1)
        encoded3 = self.enc3(encoded2)
        encoded4 = self.enc4(encoded3)

        bottleneck = self.dec2(self.dec3(self.dec4(self.aspp(self.enc5(encoded4)), encoded4), encoded3), encoded2)
        bottleneck = self.dec1(torch.cat([bottleneck, self.lstm_dec2(bottleneck)], dim=1), encoded1)

        return bottleneck
h1], dim=1) - - l2 = self.stg2_low_band_net(l2_in) - h2 = self.stg2_high_band_net(h2_in) - - aux2 = torch.cat([l2, h2], dim=2) - f3_in = torch.cat([input_tensor, aux1, aux2], dim=1) - f3 = self.stg3_full_band_net(f3_in) - - mask = self.out(f3).sigmoid() - mask = F.pad(input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), mode="replicate") - - if self.training: - aux = self.aux_out(torch.cat([aux1, aux2], dim=1)).sigmoid() - aux = F.pad(input=aux, pad=(0, 0, 0, self.output_bin - aux.size()[2]), mode="replicate") - - return mask, aux - else: - return mask - - def predict_mask(self, input_tensor): - mask = self.forward(input_tensor) - - if self.offset > 0: - mask = mask[:, :, :, self.offset : -self.offset] - assert mask.size()[3] > 0 - - return mask - - def predict(self, input_tensor): - mask = self.forward(input_tensor) - pred_mag = input_tensor * mask - - if self.offset > 0: - pred_mag = pred_mag[:, :, :, self.offset : -self.offset] - assert pred_mag.size()[3] > 0 - - return pred_mag \ No newline at end of file diff --git a/main/tools/gdown.py b/main/tools/gdown.py deleted file mode 100644 index 1fc48a6201b7d8f9a0ce75828c09b603ab69f1a9..0000000000000000000000000000000000000000 --- a/main/tools/gdown.py +++ /dev/null @@ -1,118 +0,0 @@ -import os -import re -import sys -import json -import tqdm -import codecs -import tempfile -import requests - -from urllib.parse import urlparse, parse_qs, unquote - -sys.path.append(os.getcwd()) - -from main.app.variables import translations - -def parse_url(url): - parsed = urlparse(url) - is_download_link = parsed.path.endswith("/uc") - if not parsed.hostname in ("drive.google.com", "docs.google.com"): return None, is_download_link - file_id = parse_qs(parsed.query).get("id", [None])[0] - - if file_id is None: - for pattern in (r"^/file/d/(.*?)/(edit|view)$", r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$", r"^/document/d/(.*?)/(edit|htmlview|view)$", r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", 
r"^/presentation/d/(.*?)/(edit|htmlview|view)$", r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$", r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$"): - match = re.match(pattern, parsed.path) - if match: - file_id = match.group(1) - break - return file_id, is_download_link - -def get_url_from_gdrive_confirmation(contents): - for pattern in (r'href="(\/uc\?export=download[^"]+)', r'href="/open\?id=([^"]+)"', r'"downloadUrl":"([^"]+)'): - match = re.search(pattern, contents) - if match: - url = match.group(1) - if pattern == r'href="/open\?id=([^"]+)"': url = (codecs.decode("uggcf://qevir.hfrepbagrag.tbbtyr.pbz/qbjaybnq?vq=", "rot13") + url + "&confirm=t&uuid=" + re.search(r'(.*)

', contents) - if match: raise Exception(match.group(1)) - raise Exception(translations["gdown_error"]) - -def _get_session(use_cookies, return_cookies_file=False): - sess = requests.session() - sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}) - cookies_file = os.path.join(os.path.expanduser("~"), ".cache/gdown/cookies.json") - - if os.path.exists(cookies_file) and use_cookies: - with open(cookies_file) as f: - for k, v in json.load(f): - sess.cookies[k] = v - return (sess, cookies_file) if return_cookies_file else sess - -def gdown_download(url=None, output=None): - file_id = None - - if url is None: raise ValueError(translations["gdown_value_error"]) - - if "/file/d/" in url: file_id = url.split("/d/")[1].split("/")[0] - elif "open?id=" in url: file_id = url.split("open?id=")[1].split("/")[0] - elif "/download?id=" in url: file_id = url.split("/download?id=")[1].split("&")[0] - - if file_id: - url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/hp?vq=', 'rot13')}{file_id}" - url_origin = url - - sess, cookies_file = _get_session(use_cookies=True, return_cookies_file=True) - gdrive_file_id, is_gdrive_download_link = parse_url(url) - - if gdrive_file_id: - url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/hp?vq=', 'rot13')}{gdrive_file_id}" - url_origin = url - is_gdrive_download_link = True - - while 1: - res = sess.get(url, stream=True, verify=True) - if url == url_origin and res.status_code == 500: - url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/bcra?vq=', 'rot13')}{gdrive_file_id}" - continue - - os.makedirs(os.path.dirname(cookies_file), exist_ok=True) - with open(cookies_file, "w") as f: - json.dump([(k, v) for k, v in sess.cookies.items() if not k.startswith("download_warning_")], f, indent=2) - - if "Content-Disposition" in res.headers: break - if not (gdrive_file_id and is_gdrive_download_link): break - - try: - url = get_url_from_gdrive_confirmation(res.text) - except Exception as e: - raise Exception(e) - - if 
gdrive_file_id and is_gdrive_download_link: - content_disposition = unquote(res.headers["Content-Disposition"]) - filename_from_url = (re.search(r"filename\*=UTF-8''(.*)", content_disposition) or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)).group(1).replace(os.path.sep, "_") - else: filename_from_url = os.path.basename(url) - - output = os.path.join(output or ".", filename_from_url) - tmp_file = tempfile.mktemp(suffix=tempfile.template, prefix=os.path.basename(output), dir=os.path.dirname(output)) - f = open(tmp_file, "ab") - - if tmp_file is not None and f.tell() != 0: res = sess.get(url, headers={"Range": f"bytes={f.tell()}-"}, stream=True, verify=True) - - try: - with tqdm.tqdm(desc=os.path.basename(output), total=int(res.headers.get("Content-Length", 0)), ncols=100, unit="byte") as pbar: - for chunk in res.iter_content(chunk_size=512 * 1024): - f.write(chunk) - pbar.update(len(chunk)) - - pbar.close() - if tmp_file: f.close() - finally: - os.rename(tmp_file, output) - sess.close() - - return output - return None \ No newline at end of file diff --git a/main/tools/huggingface.py b/main/tools/huggingface.py deleted file mode 100644 index b5fa07f1155c5d03fbfc991cbcb6297539780793..0000000000000000000000000000000000000000 --- a/main/tools/huggingface.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import tqdm -import requests - -try: - import wget -except: - wget = None - -def HF_download_file(url, output_path=None): - url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip() - output_path = os.path.basename(url) if output_path is None else (os.path.join(output_path, os.path.basename(url)) if os.path.isdir(output_path) else output_path) - - if wget != None: wget.download(url, out=output_path) - else: - response = requests.get(url, stream=True, timeout=300) - - if response.status_code == 200: - progress_bar = tqdm.tqdm(total=int(response.headers.get("content-length", 0)), desc=os.path.basename(url), ncols=100, unit="byte", 
leave=False) - - with open(output_path, "wb") as f: - for chunk in response.iter_content(chunk_size=10 * 1024 * 1024): - progress_bar.update(len(chunk)) - f.write(chunk) - - progress_bar.close() - else: raise ValueError(response.status_code) - - return output_path \ No newline at end of file diff --git a/main/tools/mediafire.py b/main/tools/mediafire.py deleted file mode 100644 index 4ebb5b60df4f4262c53ffe443ac1ef06ec0e2c80..0000000000000000000000000000000000000000 --- a/main/tools/mediafire.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import sys -import requests - -from bs4 import BeautifulSoup - -def Mediafire_Download(url, output=None, filename=None): - if not filename: filename = url.split('/')[-2] - if not output: output = os.path.dirname(os.path.realpath(__file__)) - output_file = os.path.join(output, filename) - - sess = requests.session() - sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}) - - try: - with requests.get(BeautifulSoup(sess.get(url).content, "html.parser").find(id="downloadButton").get("href"), stream=True) as r: - r.raise_for_status() - - with open(output_file, "wb") as f: - total_length = int(r.headers.get('content-length')) - download_progress = 0 - - for chunk in r.iter_content(chunk_size=1024): - download_progress += len(chunk) - f.write(chunk) - - sys.stdout.write(f"\r[{filename}]: {int(100 * download_progress / total_length)}% ({round(download_progress / 1024 / 1024, 2)}mb/{round(total_length / 1024 / 1024, 2)}mb)") - sys.stdout.flush() - - sys.stdout.write("\n") - return output_file - except Exception as e: - raise RuntimeError(e) \ No newline at end of file diff --git a/main/tools/meganz.py b/main/tools/meganz.py deleted file mode 100644 index 7336ea583604059326f11ca1f9e309f7f665bfdd..0000000000000000000000000000000000000000 --- a/main/tools/meganz.py +++ /dev/null @@ -1,130 +0,0 @@ -import os -import re -import sys -import json -import tqdm -import codecs -import random -import base64 -import 
struct -import shutil -import requests -import tempfile - -from Crypto.Cipher import AES -from Crypto.Util import Counter - -sys.path.append(os.getcwd()) - -from main.app.variables import translations - -def makebyte(x): - return codecs.latin_1_encode(x)[0] - -def a32_to_str(a): - return struct.pack('>%dI' % len(a), *a) - -def get_chunks(size): - p, s = 0, 0x20000 - - while p + s < size: - yield(p, s) - p += s - - if s < 0x100000: s += 0x20000 - - yield(p, size - p) - -def aes_cbc_decrypt(data, key): - aes_cipher = AES.new(key, AES.MODE_CBC, makebyte('\0' * 16)) - return aes_cipher.decrypt(data) - -def decrypt_attr(attr, key): - attr = codecs.latin_1_decode(aes_cbc_decrypt(attr, a32_to_str(key)))[0].rstrip('\0') - return json.loads(attr[4:]) if attr[:6] == 'MEGA{"' else False - -def _api_request(data): - sequence_num = random.randint(0, 0xFFFFFFFF) - params = {'id': sequence_num} - sequence_num += 1 - - if not isinstance(data, list): data = [data] - json_resp = json.loads(requests.post('{0}://g.api.{1}/cs'.format('https', 'mega.co.nz'), params=params, data=json.dumps(data), timeout=160).text) - if isinstance(json_resp, int): raise Exception(json_resp) - - return json_resp[0] - -def base64_url_decode(data): - data += '=='[(2 - len(data) * 3) % 4:] - - for search, replace in (('-', '+'), ('_', '/'), (',', '')): - data = data.replace(search, replace) - - return base64.b64decode(data) - -def str_to_a32(b): - if isinstance(b, str): b = makebyte(b) - if len(b) % 4: b += b'\0' * (4 - len(b) % 4) - return struct.unpack('>%dI' % (len(b) / 4), b) - -def base64_to_a32(s): - return str_to_a32(base64_url_decode(s)) - -def mega_download_file(file_handle, file_key, dest_path=None): - file_key = base64_to_a32(file_key) - file_data = _api_request({'a': 'g', 'g': 1, 'p': file_handle}) - - k = (file_key[0] ^ file_key[4], file_key[1] ^ file_key[5], file_key[2] ^ file_key[6], file_key[3] ^ file_key[7]) - iv = file_key[4:6] + (0, 0) - - if 'g' not in file_data: raise 
Exception(translations["file_not_access"]) - - file_size = file_data['s'] - attribs = decrypt_attr(base64_url_decode(file_data['at']), k) - input_file = requests.get(file_data['g'], stream=True).raw - - temp_output_file = tempfile.NamedTemporaryFile(mode='w+b', prefix='megapy_', delete=False) - k_str = a32_to_str(k) - aes = AES.new(k_str, AES.MODE_CTR, counter=Counter.new(128, initial_value=((iv[0] << 32) + iv[1]) << 64)) - - mac_str = b'\0' * 16 - mac_encryptor = AES.new(k_str, AES.MODE_CBC, mac_str) - iv_str = a32_to_str([iv[0], iv[1], iv[0], iv[1]]) - - with tqdm.tqdm(total=file_size, ncols=100, unit="byte") as pbar: - for _, chunk_size in get_chunks(file_size): - chunk = aes.decrypt(input_file.read(chunk_size)) - temp_output_file.write(chunk) - pbar.update(len(chunk)) - encryptor = AES.new(k_str, AES.MODE_CBC, iv_str) - - for i in range(0, len(chunk) - 16, 16): - block = chunk[i:i + 16] - encryptor.encrypt(block) - - i = (i + 16) if file_size > 16 else 0 - block = chunk[i:i + 16] - if len(block) % 16: block += b'\0' * (16 - (len(block) % 16)) - - mac_str = mac_encryptor.encrypt(encryptor.encrypt(block)) - - file_mac = str_to_a32(mac_str) - temp_output_file.close() - - if (file_mac[0] ^ file_mac[1], file_mac[2] ^ file_mac[3]) != file_key[6:8]: raise ValueError(translations["mac_not_match"]) - - file_path = os.path.join(dest_path, attribs['n']) - if os.path.exists(file_path): os.remove(file_path) - - shutil.move(temp_output_file.name, file_path) - return file_path - -def mega_download_url(url, dest_path=None): - if '/file/' in url: - url = url.replace(' ', '') - file_id = re.findall(r'\W\w\w\w\w\w\w\w\w\W', url)[0][1:-1] - path = f'{file_id}!{url[re.search(file_id, url).end() + 1:]}'.split('!') - elif '!' 
in url: path = re.findall(r'/#!(.*)', url)[0].split('!') - else: raise Exception(translations["missing_url"]) - - return mega_download_file(path[0], path[1], dest_path) \ No newline at end of file diff --git a/main/tools/noisereduce.py b/main/tools/noisereduce.py deleted file mode 100644 index ff904e377824a79676dac8289d23b2cbb40ac716..0000000000000000000000000000000000000000 --- a/main/tools/noisereduce.py +++ /dev/null @@ -1,80 +0,0 @@ -import os -import sys -import torch - -from torch.nn.functional import conv1d, conv2d - -sys.path.append(os.getcwd()) - -@torch.no_grad() -def temperature_sigmoid(x, x0, temp_coeff): - return ((x - x0) / temp_coeff).sigmoid() - -@torch.no_grad() -def linspace(start, stop, num = 50, endpoint = True, **kwargs): - return torch.linspace(start, stop, num, **kwargs) if endpoint else torch.linspace(start, stop, num + 1, **kwargs)[:-1] - -@torch.no_grad() -def amp_to_db(x, eps=torch.finfo(torch.float32).eps, top_db=40): - x_db = 20 * (x + eps).log10() - return x_db.max((x_db.max(-1).values - top_db).unsqueeze(-1)) - -class TorchGate(torch.nn.Module): - @torch.no_grad() - def __init__(self, sr, nonstationary = False, n_std_thresh_stationary = 1.5, n_thresh_nonstationary = 1.3, temp_coeff_nonstationary = 0.1, n_movemean_nonstationary = 20, prop_decrease = 1.0, n_fft = 1024, win_length = None, hop_length = None, freq_mask_smooth_hz = 500, time_mask_smooth_ms = 50): - super().__init__() - self.sr = sr - self.nonstationary = nonstationary - assert 0.0 <= prop_decrease <= 1.0 - self.prop_decrease = prop_decrease - self.n_fft = n_fft - self.win_length = self.n_fft if win_length is None else win_length - self.hop_length = self.win_length // 4 if hop_length is None else hop_length - self.n_std_thresh_stationary = n_std_thresh_stationary - self.temp_coeff_nonstationary = temp_coeff_nonstationary - self.n_movemean_nonstationary = n_movemean_nonstationary - self.n_thresh_nonstationary = n_thresh_nonstationary - self.freq_mask_smooth_hz = 
freq_mask_smooth_hz - self.time_mask_smooth_ms = time_mask_smooth_ms - self.register_buffer("smoothing_filter", self._generate_mask_smoothing_filter()) - - @torch.no_grad() - def _generate_mask_smoothing_filter(self): - if self.freq_mask_smooth_hz is None and self.time_mask_smooth_ms is None: return None - n_grad_freq = (1 if self.freq_mask_smooth_hz is None else int(self.freq_mask_smooth_hz / (self.sr / (self.n_fft / 2)))) - if n_grad_freq < 1: raise ValueError - - n_grad_time = (1 if self.time_mask_smooth_ms is None else int(self.time_mask_smooth_ms / ((self.hop_length / self.sr) * 1000))) - if n_grad_time < 1: raise ValueError - if n_grad_time == 1 and n_grad_freq == 1: return None - - smoothing_filter = torch.outer(torch.cat([linspace(0, 1, n_grad_freq + 1, endpoint=False), linspace(1, 0, n_grad_freq + 2)])[1:-1], torch.cat([linspace(0, 1, n_grad_time + 1, endpoint=False), linspace(1, 0, n_grad_time + 2)])[1:-1]).unsqueeze(0).unsqueeze(0) - return smoothing_filter / smoothing_filter.sum() - - @torch.no_grad() - def _stationary_mask(self, X_db): - std_freq_noise, mean_freq_noise = torch.std_mean(X_db, dim=-1) - return X_db > (mean_freq_noise + std_freq_noise * self.n_std_thresh_stationary).unsqueeze(2) - - @torch.no_grad() - def _nonstationary_mask(self, X_abs): - X_smoothed = (conv1d(X_abs.reshape(-1, 1, X_abs.shape[-1]), torch.ones(self.n_movemean_nonstationary, dtype=X_abs.dtype, device=X_abs.device).view(1, 1, -1), padding="same").view(X_abs.shape) / self.n_movemean_nonstationary) - return temperature_sigmoid(((X_abs - X_smoothed) / X_smoothed), self.n_thresh_nonstationary, self.temp_coeff_nonstationary) - - def forward(self, x): - assert x.ndim == 2 - if x.shape[-1] < self.win_length * 2: raise Exception - - if str(x.device).startswith(("ocl", "privateuseone")): - if not hasattr(self, "stft"): - from main.library.backends.utils import STFT - self.stft = STFT(filter_length=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, 
pad_mode="constant").to(x.device) - X, phase = self.stft.transform(x, eps=1e-9, return_phase=True) - else: - X = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, return_complex=True, pad_mode="constant", center=True, window=torch.hann_window(self.win_length).to(x.device)) - - sig_mask = self.prop_decrease * ((self._nonstationary_mask(X.abs()) if self.nonstationary else self._stationary_mask(amp_to_db(X.abs()))).float() * 1.0 - 1.0) + 1.0 - if self.smoothing_filter is not None: sig_mask = conv2d(sig_mask.unsqueeze(1), self.smoothing_filter.to(sig_mask.dtype), padding="same") - Y = X * sig_mask.squeeze(1) - - return self.stft.inverse(Y, phase) if hasattr(self, "stft") else torch.istft(Y, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, center=True, window=torch.hann_window(self.win_length).to(Y.device)).to(dtype=x.dtype) \ No newline at end of file diff --git a/main/tools/pixeldrain.py b/main/tools/pixeldrain.py deleted file mode 100644 index 64741b6d9417055efb66ffddae84087f8d52552d..0000000000000000000000000000000000000000 --- a/main/tools/pixeldrain.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -import requests - -def pixeldrain(url, output_dir): - try: - response = requests.get(f"https://pixeldrain.com/api/file/{url.split('pixeldrain.com/u/')[1]}") - - if response.status_code == 200: - file_path = os.path.join(output_dir, (response.headers.get("Content-Disposition").split("filename=")[-1].strip('";'))) - - with open(file_path, "wb") as newfile: - newfile.write(response.content) - return file_path - else: return None - except Exception as e: - raise RuntimeError(e) \ No newline at end of file