Spaces:

EleshVaishnav
/

VoiceConversionWebUI

Build error

App Files Files Community

Elesh Vaishnav committed on Oct 18, 2025

Commit

51e88fc

verified ·

1 Parent(s): 5682687

Upload 13 files

Browse files

Files changed (13) hide show

.gitignore +24 -0
Dockerfile +34 -0
LICENSE +21 -0
Makefile +22 -0
README.md +96 -14
TERMS_OF_USE.md +52 -0
app.py +154 -0
core.py +2423 -0
docker-compose.yaml +16 -0
requirements.txt +50 -0
run-applio.sh +9 -0
run-install.sh +174 -0
run-tensorboard.sh +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,24 @@

+*.exe
+*.pt
+*.onnx
+*.pyc
+*.pth
+*.index
+*.mp3
+*.flac
+*.ogg
+*.m4a
+*.bin
+*.wav
+*.txt
+*.zip
+*.png
+*.safetensors
+assets/audios
+assets/datasets
+logs
+rvc/models
+env
+venv
+.venv

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

+# syntax=docker/dockerfile:1
+FROM python:3.10-bullseye
+# Expose the required port.
+# NOTE: app.py binds to 127.0.0.1 unless "--server-name" is passed, so the
+# port is only reachable from outside the container when the command line
+# supplies a different server name (for example "--server-name 0.0.0.0").
+EXPOSE 6969
+# Set up working directory
+WORKDIR /app
+# Install system dependencies, clean up cache to keep image size small.
+# apt-get is used instead of apt because apt's command-line interface is not
+# guaranteed stable for scripted use; --no-install-recommends skips optional
+# packages that the application does not need.
+RUN apt-get update && \
+    apt-get install -y -qq --no-install-recommends ffmpeg && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+# Copy application files into the container
+COPY . .
+# Create a virtual environment in the app directory and install dependencies.
+# torch/torchaudio are pinned and pulled from the CUDA 12.8 wheel index before
+# requirements.txt (if present) is installed.
+RUN python3 -m venv /app/.venv && \
+    . /app/.venv/bin/activate && \
+    pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir python-ffmpeg && \
+    pip install --no-cache-dir torch==2.7.1 torchvision torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128 && \
+    if [ -f "requirements.txt" ]; then pip install --no-cache-dir -r requirements.txt; fi
+# Define volumes for persistent storage
+VOLUME ["/app/logs/"]
+# Put the virtual environment first on PATH so "python3" below resolves to it
+ENV PATH="/app/.venv/bin:$PATH"
+# Run the app; extra "docker run" arguments replace CMD and go to python3
+ENTRYPOINT ["python3"]
+CMD ["app.py"]

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 AI Hispano
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

Makefile ADDED Viewed

	@@ -0,0 +1,22 @@

+# None of the targets below produce a file of the same name, so all are phony.
+.PHONY: help run-install run-applio run-tensorboard
+.ONESHELL:
+# With .ONESHELL the whole recipe runs in a single shell, where only the last
+# command's exit status would count; -e stops at the first failing command.
+.SHELLFLAGS := -ec
+# The "## text" after each target name is what the help recipe extracts.
+help: ## Show help message
+	@grep -hE '^[A-Za-z0-9_ \-]*?:.*##.*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+# Refresh the package index first so the install sees current package lists.
+run-install: ## Install dependencies
+	apt-get update
+	apt-get -y install build-essential python3-dev ffmpeg
+	pip install --upgrade setuptools wheel
+	pip install pip==24.1
+	pip install -r requirements.txt
+run-applio: ## Run Applio
+	python app.py --share
+run-tensorboard: ## Run Tensorboard
+	python core.py tensorboard

README.md CHANGED Viewed

@@ -1,14 +1,96 @@
----
-title: VoiceConversionWebUI
-emoji: 📊
-colorFrom: indigo
-colorTo: purple
-sdk: gradio
-sdk_version: 5.49.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: VC
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+<h1 align="center">
+  <a href="https://applio.org" target="_blank"><img src="https://github.com/IAHispano/Applio/assets/133521603/78e975d8-b07f-47ba-ab23-5a31592f322a" alt="Applio"></a>
+</h1>
+<p align="center">
+    <img alt="Contributors" src="https://img.shields.io/github/contributors/iahispano/applio?style=for-the-badge&color=FFFFFF" />
+    <img alt="Release" src="https://img.shields.io/github/release/iahispano/applio?style=for-the-badge&color=FFFFFF" />
+    <img alt="Stars" src="https://img.shields.io/github/stars/iahispano/applio?style=for-the-badge&color=FFFFFF" />
+    <img alt="Fork" src="https://img.shields.io/github/forks/iahispano/applio?style=for-the-badge&color=FFFFFF" />
+    <img alt="Issues" src="https://img.shields.io/github/issues/iahispano/applio?style=for-the-badge&color=FFFFFF" />
+</p>
+<p align="center">A simple, high-quality voice conversion tool, focused on ease of use and performance.</p>
+<p align="center">
+  <a href="https://applio.org" target="_blank">🌐 Website</a>
+  •
+  <a href="https://docs.applio.org" target="_blank">📚 Documentation</a>
+  •
+  <a href="https://discord.gg/urxFjYmYYh" target="_blank">☎️ Discord</a>
+</p>
+<p align="center">
+  <a href="https://github.com/IAHispano/Applio-Plugins" target="_blank">🛒 Plugins</a>
+  •
+  <a href="https://huggingface.co/IAHispano/Applio/tree/main/Compiled" target="_blank">📦 Compiled</a>
+  •
+  <a href="https://applio.org/playground" target="_blank">🎮 Playground</a>
+  •
+  <a href="https://colab.research.google.com/github/iahispano/applio/blob/main/assets/Applio.ipynb" target="_blank">🔎 Google Colab (UI)</a>
+  •
+  <a href="https://colab.research.google.com/github/iahispano/applio/blob/main/assets/Applio_NoUI.ipynb" target="_blank">🔎 Google Colab (No UI)</a>
+</p>
+> [!NOTE]
+> Applio will no longer receive frequent updates. Going forward, development will focus mainly on security patches, dependency updates, and occasional feature improvements. This is because the project is already stable and mature with limited room for further improvements. Pull requests are still welcome and will be reviewed.
+## Introduction
+Applio is a powerful voice conversion tool focused on simplicity, quality, and performance. Whether you're an artist, developer, or researcher, Applio offers a straightforward platform for high-quality voice transformations. Its flexible design allows for customization through plugins and configurations, catering to a wide range of projects.
+## Terms of Use and Commercial Usage
+Using Applio responsibly is essential.
+- Users must respect copyrights, intellectual property, and privacy rights.
+- Applio is intended for lawful and ethical purposes, including personal, academic, and investigative projects.
+- Commercial usage is permitted, provided users adhere to legal and ethical guidelines, secure appropriate rights and permissions, and comply with the [MIT license](./LICENSE).
+The source code and model weights in this repository are licensed under the permissive [MIT license](./LICENSE), allowing modification, redistribution, and commercial use.
+However, if you choose to use this official version of Applio (as provided in this repository, without significant modification), you must also comply with our [Terms of Use](./TERMS_OF_USE.md). These terms apply to our integrations, configurations, and default project behavior, and are intended to ensure responsible and ethical use without limiting their use in any way.
+For commercial use, we recommend contacting us at [support@applio.org](mailto:support@applio.org) to ensure your usage aligns with ethical standards. All audio generated with Applio must comply with applicable copyright laws. If you find Applio helpful, consider supporting its development [through a donation](https://ko-fi.com/iahispano).
+By using the official version of Applio, you accept full responsibility for complying with both the MIT license and our Terms of Use. Applio and its contributors are not liable for misuse. For full legal details, see the [Terms of Use](./TERMS_OF_USE.md).
+## Getting Started
+### 1. Installation
+Run the installation script based on your operating system:
+- **Windows:** Double-click `run-install.bat`.
+- **Linux/macOS:** Execute `run-install.sh`.
+### 2. Running Applio
+Start Applio using:
+- **Windows:** Double-click `run-applio.bat`.
+- **Linux/macOS:** Run `run-applio.sh`.
+This launches the Gradio interface in your default browser.
+### 3. Optional: TensorBoard Monitoring
+To monitor training or visualize data:
+- **Windows:** Run `run-tensorboard.bat`.
+- **Linux/macOS:** Run `run-tensorboard.sh`.
+For more detailed instructions, visit the [documentation](https://docs.applio.org).
+## References
+Applio is made possible thanks to these projects and their references:
+- [gradio-screen-recorder](https://huggingface.co/spaces/gstaff/gradio-screen-recorder) by gstaff
+- [rvc-cli](https://github.com/blaisewf/rvc-cli) by blaisewf
+### Contributors
+<a href="https://github.com/IAHispano/Applio/graphs/contributors" target="_blank">
+  <img src="https://contrib.rocks/image?repo=IAHispano/Applio" />
+</a>

TERMS_OF_USE.md ADDED Viewed

	@@ -0,0 +1,52 @@

+# Terms of Use
+## Responsibilities of the User
+By using Applio, you agree to the following responsibilities:
+### 1. Respect Intellectual Property and Privacy Rights
+- Ensure that any audio or material processed through Applio is either owned by you or used with explicit permission from the rightful owner.
+- Respect copyrights, intellectual property rights, and privacy rights of all individuals and entities.
+### 2. Avoid Harmful or Unethical Use
+- Do not use Applio to create or distribute content that harms, defames, or infringes upon the rights of others.
+- Avoid any activities that may violate ethical standards, promote hate speech, or facilitate illegal conduct.
+### 3. Adhere to Local Laws and Regulations
+- Familiarize yourself with and comply with the laws and regulations governing the use of AI, voice transformation tools, and generated content in your jurisdiction.
+## Disclaimer of Liability
+Applio and its contributors disclaim all liability for any misuse or unintended consequences arising from the use of this tool.
+- **No Warranty**: Applio is provided "as is" without any warranty, express or implied.
+- **User Responsibility**: You bear full responsibility for how you choose to use Applio and any outcomes resulting from that use.
+- **No Endorsement**: Applio does not endorse or support any activities or content created with this tool that result in harm, illegal activity, or unethical practices.
+## Permitted Use Cases
+Applio is designed for:
+- **Personal Projects**: Experimentation and creative endeavors for personal enrichment.
+- **Academic Research**: Advancing scientific understanding and education.
+- **Investigative Purposes**: Analyzing data in lawful and ethical contexts.
+- **Commercial Use**: Creating content for commercial purposes, provided that appropriate rights and permissions are obtained and all legal and ethical standards are adhered to.
+## Prohibited Activities
+The following uses are explicitly prohibited:
+- **Harmful Applications**: Generating audio to defame, harm, or manipulate others.
+- **Unauthorized Distribution**: Sharing content that violates copyrights or the rights of others.
+- **Deceptive Practices**: Creating content intended to deceive or defraud others.
+## Training Data
+All official models distributed by Applio have been trained on publicly available datasets such as [VCTK](https://huggingface.co/datasets/IAHispano/Applio-Dataset). We strive to maintain transparency and ethical practices in the development and distribution of our tools.
+## Amendments
+Applio reserves the right to modify these terms at any time. Continued use of the tool signifies your acceptance of any updated terms.

app.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import gradio as gr
+import sys
+import os
+import logging
+from typing import Any
+DEFAULT_SERVER_NAME = "127.0.0.1"
+DEFAULT_PORT = 6969
+MAX_PORT_ATTEMPTS = 10
+# Set up logging
+logging.getLogger("uvicorn").setLevel(logging.WARNING)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+# Add current directory to sys.path
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+# Zluda hijack
+import rvc.lib.zluda
+# Import Tabs
+from tabs.inference.inference import inference_tab
+from tabs.train.train import train_tab
+from tabs.extra.extra import extra_tab
+from tabs.report.report import report_tab
+from tabs.download.download import download_tab
+from tabs.tts.tts import tts_tab
+from tabs.voice_blender.voice_blender import voice_blender_tab
+from tabs.plugins.plugins import plugins_tab
+from tabs.settings.settings import settings_tab
+from tabs.realtime.realtime import realtime_tab
+# Run prerequisites
+# This call executes when the module is loaded, before the interface is defined.
+from core import run_prerequisites_script
+run_prerequisites_script(
+    pretraineds_hifigan=True,
+    models=True,
+    exe=True,
+)
+# Initialize i18n
+# `i18n` is the callable that wraps the user-facing labels in the interface below.
+from assets.i18n.i18n import I18nAuto
+i18n = I18nAuto()
+# Start Discord presence if enabled
+# RPCManager is only imported when load_config_presence() returns a truthy value.
+from tabs.settings.sections.presence import load_config_presence
+if load_config_presence():
+    from assets.discord_presence import RPCManager
+    RPCManager.start_presence()
+# Check installation
+import assets.installation_checker as installation_checker
+installation_checker.check_installation()
+# Load theme
+# Falls back to "ParityError/Interstellar" when load_theme() returns a falsy value.
+import assets.themes.loadThemes as loadThemes
+my_applio = loadThemes.load_theme() or "ParityError/Interstellar"
+with gr.Blocks(
+    theme=my_applio, title="Applio", css="footer{display:none !important}"
+) as Applio:
+    gr.Markdown("# Applio")
+    gr.Markdown(
+        i18n(
+            "A simple, high-quality voice conversion tool focused on ease of use and performance."
+        )
+    )
+    gr.Markdown(
+        i18n(
+            "[Support](https://discord.gg/urxFjYmYYh) — [GitHub](https://github.com/IAHispano/Applio)"
+        )
+    )
+    with gr.Tab(i18n("Inference")):
+        inference_tab()
+    with gr.Tab(i18n("Training")):
+        train_tab()
+    with gr.Tab(i18n("TTS")):
+        tts_tab()
+    with gr.Tab(i18n("Voice Blender")):
+        voice_blender_tab()
+    with gr.Tab(i18n("Realtime")):
+        realtime_tab()
+    with gr.Tab(i18n("Plugins")):
+        plugins_tab()
+    with gr.Tab(i18n("Download")):
+        download_tab()
+    with gr.Tab(i18n("Report a Bug")):
+        report_tab()
+    with gr.Tab(i18n("Extra")):
+        extra_tab()
+    with gr.Tab(i18n("Settings")):
+        settings_tab()
+    gr.Markdown(
+        """
+    <div style="text-align: center; font-size: 0.9em; text-color: a3a3a3;">
+    By using Applio, you agree to comply with ethical and legal standards, respect intellectual property and privacy rights, avoid harmful or prohibited uses, and accept full responsibility for any outcomes, while Applio disclaims liability and reserves the right to amend these terms.
+    </div>
+    """
+    )
+def launch_gradio(server_name: str, server_port: int) -> None:
+    Applio.launch(
+        favicon_path="assets/ICON.ico",
+        share="--share" in sys.argv,
+        inbrowser="--open" in sys.argv,
+        server_name=server_name,
+        server_port=server_port,
+    )
+def get_value_from_args(key: str, default: Any = None) -> Any:
+    if key in sys.argv:
+        index = sys.argv.index(key) + 1
+        if index < len(sys.argv):
+            return sys.argv[index]
+    return default
+if __name__ == "__main__":
+    port = int(get_value_from_args("--port", DEFAULT_PORT))
+    server = get_value_from_args("--server-name", DEFAULT_SERVER_NAME)
+    for _ in range(MAX_PORT_ATTEMPTS):
+        try:
+            launch_gradio(server, port)
+            break
+        except OSError:
+            print(
+                f"Failed to launch on port {port}, trying again on port {port - 1}..."
+            )
+            port -= 1
+        except Exception as error:
+            print(f"An error occurred launching Gradio: {error}")
+            break

core.py ADDED Viewed

	@@ -0,0 +1,2423 @@

+import os
+import sys
+import json
+import argparse
+import subprocess
+from functools import lru_cache
+from distutils.util import strtobool
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+current_script_directory = os.path.dirname(os.path.realpath(__file__))
+logs_path = os.path.join(current_script_directory, "logs")
+from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
+from rvc.train.process.model_blender import model_blender
+from rvc.train.process.model_information import model_information
+from rvc.lib.tools.analyzer import analyze_audio
+from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
+from rvc.lib.tools.model_download import model_download_pipeline
+python = sys.executable
+# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
+@lru_cache(maxsize=1)  # Cache only one result since the file is static
+def load_voices_data():
+    with open(
+        os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r", encoding="utf-8"
+    ) as file:
+        return json.load(file)
+voices_data = load_voices_data()
+locales = list({voice["ShortName"] for voice in voices_data})
+@lru_cache(maxsize=None)
+def import_voice_converter():
+    from rvc.infer.infer import VoiceConverter
+    return VoiceConverter()
+@lru_cache(maxsize=1)
+def get_config():
+    from rvc.configs.config import Config
+    return Config()
+# Infer
+def run_infer_script(
+    pitch: int,
+    index_rate: float,
+    volume_envelope: float,
+    protect: float,
+    f0_method: str,
+    input_path: str,
+    output_path: str,
+    pth_path: str,
+    index_path: str,
+    split_audio: bool,
+    f0_autotune: bool,
+    f0_autotune_strength: float,
+    proposed_pitch: bool,
+    proposed_pitch_threshold: float,
+    clean_audio: bool,
+    clean_strength: float,
+    export_format: str,
+    embedder_model: str,
+    embedder_model_custom: str = None,
+    formant_shifting: bool = False,
+    formant_qfrency: float = 1.0,
+    formant_timbre: float = 1.0,
+    post_process: bool = False,
+    reverb: bool = False,
+    pitch_shift: bool = False,
+    limiter: bool = False,
+    gain: bool = False,
+    distortion: bool = False,
+    chorus: bool = False,
+    bitcrush: bool = False,
+    clipping: bool = False,
+    compressor: bool = False,
+    delay: bool = False,
+    reverb_room_size: float = 0.5,
+    reverb_damping: float = 0.5,
+    reverb_wet_gain: float = 0.5,
+    reverb_dry_gain: float = 0.5,
+    reverb_width: float = 0.5,
+    reverb_freeze_mode: float = 0.5,
+    pitch_shift_semitones: float = 0.0,
+    limiter_threshold: float = -6,
+    limiter_release_time: float = 0.01,
+    gain_db: float = 0.0,
+    distortion_gain: float = 25,
+    chorus_rate: float = 1.0,
+    chorus_depth: float = 0.25,
+    chorus_center_delay: float = 7,
+    chorus_feedback: float = 0.0,
+    chorus_mix: float = 0.5,
+    bitcrush_bit_depth: int = 8,
+    clipping_threshold: float = -6,
+    compressor_threshold: float = 0,
+    compressor_ratio: float = 1,
+    compressor_attack: float = 1.0,
+    compressor_release: float = 100,
+    delay_seconds: float = 0.5,
+    delay_feedback: float = 0.0,
+    delay_mix: float = 0.5,
+    sid: int = 0,
+):
+    kwargs = {
+        "audio_input_path": input_path,
+        "audio_output_path": output_path,
+        "model_path": pth_path,
+        "index_path": index_path,
+        "volume_envelope": volume_envelope,
+        "pitch": pitch,
+        "index_rate": index_rate,
+        "protect": protect,
+        "f0_method": f0_method,
+        "pth_path": pth_path,
+        "index_path": index_path,
+        "split_audio": split_audio,
+        "f0_autotune": f0_autotune,
+        "f0_autotune_strength": f0_autotune_strength,
+        "proposed_pitch": proposed_pitch,
+        "proposed_pitch_threshold": proposed_pitch_threshold,
+        "clean_audio": clean_audio,
+        "clean_strength": clean_strength,
+        "export_format": export_format,
+        "embedder_model": embedder_model,
+        "embedder_model_custom": embedder_model_custom,
+        "post_process": post_process,
+        "formant_shifting": formant_shifting,
+        "formant_qfrency": formant_qfrency,
+        "formant_timbre": formant_timbre,
+        "reverb": reverb,
+        "pitch_shift": pitch_shift,
+        "limiter": limiter,
+        "gain": gain,
+        "distortion": distortion,
+        "chorus": chorus,
+        "bitcrush": bitcrush,
+        "clipping": clipping,
+        "compressor": compressor,
+        "delay": delay,
+        "reverb_room_size": reverb_room_size,
+        "reverb_damping": reverb_damping,
+        "reverb_wet_level": reverb_wet_gain,
+        "reverb_dry_level": reverb_dry_gain,
+        "reverb_width": reverb_width,
+        "reverb_freeze_mode": reverb_freeze_mode,
+        "pitch_shift_semitones": pitch_shift_semitones,
+        "limiter_threshold": limiter_threshold,
+        "limiter_release": limiter_release_time,
+        "gain_db": gain_db,
+        "distortion_gain": distortion_gain,
+        "chorus_rate": chorus_rate,
+        "chorus_depth": chorus_depth,
+        "chorus_delay": chorus_center_delay,
+        "chorus_feedback": chorus_feedback,
+        "chorus_mix": chorus_mix,
+        "bitcrush_bit_depth": bitcrush_bit_depth,
+        "clipping_threshold": clipping_threshold,
+        "compressor_threshold": compressor_threshold,
+        "compressor_ratio": compressor_ratio,
+        "compressor_attack": compressor_attack,
+        "compressor_release": compressor_release,
+        "delay_seconds": delay_seconds,
+        "delay_feedback": delay_feedback,
+        "delay_mix": delay_mix,
+        "sid": sid,
+    }
+    infer_pipeline = import_voice_converter()
+    infer_pipeline.convert_audio(
+        **kwargs,
+    )
+    return f"File {input_path} inferred successfully.", output_path.replace(
+        ".wav", f".{export_format.lower()}"
+    )
+# Batch infer
+def run_batch_infer_script(
+    pitch: int,
+    index_rate: float,
+    volume_envelope: float,
+    protect: float,
+    f0_method: str,
+    input_folder: str,
+    output_folder: str,
+    pth_path: str,
+    index_path: str,
+    split_audio: bool,
+    f0_autotune: bool,
+    f0_autotune_strength: float,
+    proposed_pitch: bool,
+    proposed_pitch_threshold: float,
+    clean_audio: bool,
+    clean_strength: float,
+    export_format: str,
+    embedder_model: str,
+    embedder_model_custom: str = None,
+    formant_shifting: bool = False,
+    formant_qfrency: float = 1.0,
+    formant_timbre: float = 1.0,
+    post_process: bool = False,
+    reverb: bool = False,
+    pitch_shift: bool = False,
+    limiter: bool = False,
+    gain: bool = False,
+    distortion: bool = False,
+    chorus: bool = False,
+    bitcrush: bool = False,
+    clipping: bool = False,
+    compressor: bool = False,
+    delay: bool = False,
+    reverb_room_size: float = 0.5,
+    reverb_damping: float = 0.5,
+    reverb_wet_gain: float = 0.5,
+    reverb_dry_gain: float = 0.5,
+    reverb_width: float = 0.5,
+    reverb_freeze_mode: float = 0.5,
+    pitch_shift_semitones: float = 0.0,
+    limiter_threshold: float = -6,
+    limiter_release_time: float = 0.01,
+    gain_db: float = 0.0,
+    distortion_gain: float = 25,
+    chorus_rate: float = 1.0,
+    chorus_depth: float = 0.25,
+    chorus_center_delay: float = 7,
+    chorus_feedback: float = 0.0,
+    chorus_mix: float = 0.5,
+    bitcrush_bit_depth: int = 8,
+    clipping_threshold: float = -6,
+    compressor_threshold: float = 0,
+    compressor_ratio: float = 1,
+    compressor_attack: float = 1.0,
+    compressor_release: float = 100,
+    delay_seconds: float = 0.5,
+    delay_feedback: float = 0.0,
+    delay_mix: float = 0.5,
+    sid: int = 0,
+):
+    kwargs = {
+        "audio_input_paths": input_folder,
+        "audio_output_path": output_folder,
+        "model_path": pth_path,
+        "index_path": index_path,
+        "pitch": pitch,
+        "index_rate": index_rate,
+        "volume_envelope": volume_envelope,
+        "protect": protect,
+        "f0_method": f0_method,
+        "pth_path": pth_path,
+        "index_path": index_path,
+        "split_audio": split_audio,
+        "f0_autotune": f0_autotune,
+        "f0_autotune_strength": f0_autotune_strength,
+        "proposed_pitch": proposed_pitch,
+        "proposed_pitch_threshold": proposed_pitch_threshold,
+        "clean_audio": clean_audio,
+        "clean_strength": clean_strength,
+        "export_format": export_format,
+        "embedder_model": embedder_model,
+        "embedder_model_custom": embedder_model_custom,
+        "post_process": post_process,
+        "formant_shifting": formant_shifting,
+        "formant_qfrency": formant_qfrency,
+        "formant_timbre": formant_timbre,
+        "reverb": reverb,
+        "pitch_shift": pitch_shift,
+        "limiter": limiter,
+        "gain": gain,
+        "distortion": distortion,
+        "chorus": chorus,
+        "bitcrush": bitcrush,
+        "clipping": clipping,
+        "compressor": compressor,
+        "delay": delay,
+        "reverb_room_size": reverb_room_size,
+        "reverb_damping": reverb_damping,
+        "reverb_wet_level": reverb_wet_gain,
+        "reverb_dry_level": reverb_dry_gain,
+        "reverb_width": reverb_width,
+        "reverb_freeze_mode": reverb_freeze_mode,
+        "pitch_shift_semitones": pitch_shift_semitones,
+        "limiter_threshold": limiter_threshold,
+        "limiter_release": limiter_release_time,
+        "gain_db": gain_db,
+        "distortion_gain": distortion_gain,
+        "chorus_rate": chorus_rate,
+        "chorus_depth": chorus_depth,
+        "chorus_delay": chorus_center_delay,
+        "chorus_feedback": chorus_feedback,
+        "chorus_mix": chorus_mix,
+        "bitcrush_bit_depth": bitcrush_bit_depth,
+        "clipping_threshold": clipping_threshold,
+        "compressor_threshold": compressor_threshold,
+        "compressor_ratio": compressor_ratio,
+        "compressor_attack": compressor_attack,
+        "compressor_release": compressor_release,
+        "delay_seconds": delay_seconds,
+        "delay_feedback": delay_feedback,
+        "delay_mix": delay_mix,
+        "sid": sid,
+    }
+    infer_pipeline = import_voice_converter()
+    infer_pipeline.convert_audio_batch(
+        **kwargs,
+    )
+    return f"Files from {input_folder} inferred successfully."
+# TTS
+def run_tts_script(
+    tts_file: str,
+    tts_text: str,
+    tts_voice: str,
+    tts_rate: int,
+    pitch: int,
+    index_rate: float,
+    volume_envelope: float,
+    protect: float,
+    f0_method: str,
+    output_tts_path: str,
+    output_rvc_path: str,
+    pth_path: str,
+    index_path: str,
+    split_audio: bool,
+    f0_autotune: bool,
+    f0_autotune_strength: float,
+    proposed_pitch: bool,
+    proposed_pitch_threshold: float,
+    clean_audio: bool,
+    clean_strength: float,
+    export_format: str,
+    embedder_model: str,
+    embedder_model_custom: str = None,
+    sid: int = 0,
+):
+    tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
+    if os.path.exists(output_tts_path) and os.path.abspath(output_tts_path).startswith(
+        os.path.abspath("assets")
+    ):
+        os.remove(output_tts_path)
+    command_tts = [
+        *map(
+            str,
+            [
+                python,
+                tts_script_path,
+                tts_file,
+                tts_text,
+                tts_voice,
+                tts_rate,
+                output_tts_path,
+            ],
+        ),
+    ]
+    subprocess.run(command_tts)
+    infer_pipeline = import_voice_converter()
+    infer_pipeline.convert_audio(
+        pitch=pitch,
+        index_rate=index_rate,
+        volume_envelope=volume_envelope,
+        protect=protect,
+        f0_method=f0_method,
+        audio_input_path=output_tts_path,
+        audio_output_path=output_rvc_path,
+        model_path=pth_path,
+        index_path=index_path,
+        split_audio=split_audio,
+        f0_autotune=f0_autotune,
+        f0_autotune_strength=f0_autotune_strength,
+        proposed_pitch=proposed_pitch,
+        proposed_pitch_threshold=proposed_pitch_threshold,
+        clean_audio=clean_audio,
+        clean_strength=clean_strength,
+        export_format=export_format,
+        embedder_model=embedder_model,
+        embedder_model_custom=embedder_model_custom,
+        sid=sid,
+        formant_shifting=None,
+        formant_qfrency=None,
+        formant_timbre=None,
+        post_process=None,
+        reverb=None,
+        pitch_shift=None,
+        limiter=None,
+        gain=None,
+        distortion=None,
+        chorus=None,
+        bitcrush=None,
+        clipping=None,
+        compressor=None,
+        delay=None,
+        sliders=None,
+    )
+    return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
+        ".wav", f".{export_format.lower()}"
+    )
+# Preprocess
+def run_preprocess_script(
+    model_name: str,
+    dataset_path: str,
+    sample_rate: int,
+    cpu_cores: int,
+    cut_preprocess: str,
+    process_effects: bool,
+    noise_reduction: bool,
+    clean_strength: float,
+    chunk_len: float,
+    overlap_len: float,
+    normalization_mode: str = "none",
+):
+    preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py")
+    command = [
+        python,
+        preprocess_script_path,
+        *map(
+            str,
+            [
+                os.path.join(logs_path, model_name),
+                dataset_path,
+                sample_rate,
+                cpu_cores,
+                cut_preprocess,
+                process_effects,
+                noise_reduction,
+                clean_strength,
+                chunk_len,
+                overlap_len,
+                normalization_mode,
+            ],
+        ),
+    ]
+    subprocess.run(command)
+    return f"Model {model_name} preprocessed successfully."
+# Extract
+def run_extract_script(
+    model_name: str,
+    f0_method: str,
+    cpu_cores: int,
+    gpu: int,
+    sample_rate: int,
+    embedder_model: str,
+    embedder_model_custom: str = None,
+    include_mutes: int = 2,
+):
+    model_path = os.path.join(logs_path, model_name)
+    extract = os.path.join("rvc", "train", "extract", "extract.py")
+    command_1 = [
+        python,
+        extract,
+        *map(
+            str,
+            [
+                model_path,
+                f0_method,
+                cpu_cores,
+                gpu,
+                sample_rate,
+                embedder_model,
+                embedder_model_custom,
+                include_mutes,
+            ],
+        ),
+    ]
+    subprocess.run(command_1)
+    return f"Model {model_name} extracted successfully."
+# Train
+def run_train_script(
+    model_name: str,
+    save_every_epoch: int,
+    save_only_latest: bool,
+    save_every_weights: bool,
+    total_epoch: int,
+    sample_rate: int,
+    batch_size: int,
+    gpu: int,
+    overtraining_detector: bool,
+    overtraining_threshold: int,
+    pretrained: bool,
+    cleanup: bool,
+    index_algorithm: str = "Auto",
+    cache_data_in_gpu: bool = False,
+    custom_pretrained: bool = False,
+    g_pretrained_path: str = None,
+    d_pretrained_path: str = None,
+    vocoder: str = "HiFi-GAN",
+    checkpointing: bool = False,
+):
+    if pretrained == True:
+        from rvc.lib.tools.pretrained_selector import pretrained_selector
+        if custom_pretrained == False:
+            pg, pd = pretrained_selector(str(vocoder), int(sample_rate))
+        else:
+            if g_pretrained_path is None or d_pretrained_path is None:
+                raise ValueError(
+                    "Please provide the path to the pretrained G and D models."
+                )
+            pg, pd = g_pretrained_path, d_pretrained_path
+    else:
+        pg, pd = "", ""
+    train_script_path = os.path.join("rvc", "train", "train.py")
+    command = [
+        python,
+        train_script_path,
+        *map(
+            str,
+            [
+                model_name,
+                save_every_epoch,
+                total_epoch,
+                pg,
+                pd,
+                gpu,
+                batch_size,
+                sample_rate,
+                save_only_latest,
+                save_every_weights,
+                cache_data_in_gpu,
+                overtraining_detector,
+                overtraining_threshold,
+                cleanup,
+                vocoder,
+                checkpointing,
+            ],
+        ),
+    ]
+    subprocess.run(command)
+    run_index_script(model_name, index_algorithm)
+    return f"Model {model_name} trained successfully."
+# Index
+def run_index_script(model_name: str, index_algorithm: str):
+    index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")
+    command = [
+        python,
+        index_script_path,
+        os.path.join(logs_path, model_name),
+        index_algorithm,
+    ]
+    subprocess.run(command)
+    return f"Index file for {model_name} generated successfully."
+# Model information
+def run_model_information_script(pth_path: str):
+    print(model_information(pth_path))
+    return model_information(pth_path)
+# Model blender
+def run_model_blender_script(
+    model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
+):
+    message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
+    return message, model_blended
+# Tensorboard
+def run_tensorboard_script():
+    launch_tensorboard_pipeline()
+# Download
+def run_download_script(model_link: str):
+    model_download_pipeline(model_link)
+    return f"Model downloaded successfully."
+# Prerequisites
+def run_prerequisites_script(
+    pretraineds_hifigan: bool,
+    models: bool,
+    exe: bool,
+):
+    prequisites_download_pipeline(
+        pretraineds_hifigan,
+        models,
+        exe,
+    )
+    return "Prerequisites installed successfully."
+# Audio analyzer
+def run_audio_analyzer_script(
+    input_path: str, save_plot_path: str = "logs/audio_analysis.png"
+):
+    audio_info, plot_path = analyze_audio(input_path, save_plot_path)
+    print(
+        f"Audio info of {input_path}: {audio_info}",
+        f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
+    )
+    return audio_info, plot_path
+# Parse arguments
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Run the main.py script with specific parameters."
+    )
+    subparsers = parser.add_subparsers(
+        title="subcommands", dest="mode", help="Choose a mode"
+    )
+    # Parser for 'infer' mode
+    infer_parser = subparsers.add_parser("infer", help="Run inference")
+    pitch_description = (
+        "Set the pitch of the audio. Higher values result in a higher pitch."
+    )
+    infer_parser.add_argument(
+        "--pitch",
+        type=int,
+        help=pitch_description,
+        choices=range(-24, 25),
+        default=0,
+    )
+    index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
+    infer_parser.add_argument(
+        "--index_rate",
+        type=float,
+        help=index_rate_description,
+        choices=[i / 100.0 for i in range(0, 101)],
+        default=0.3,
+    )
+    volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
+    infer_parser.add_argument(
+        "--volume_envelope",
+        type=float,
+        help=volume_envelope_description,
+        choices=[i / 100.0 for i in range(0, 101)],
+        default=1,
+    )
+    protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
+    infer_parser.add_argument(
+        "--protect",
+        type=float,
+        help=protect_description,
+        choices=[i / 1000.0 for i in range(0, 501)],
+        default=0.33,
+    )
+    f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
+    infer_parser.add_argument(
+        "--f0_method",
+        type=str,
+        help=f0_method_description,
+        choices=[
+            "crepe",
+            "crepe-tiny",
+            "rmvpe",
+            "fcpe",
+            "swift",
+            "hybrid[crepe+rmvpe]",
+            "hybrid[crepe+fcpe]",
+            "hybrid[rmvpe+fcpe]",
+            "hybrid[crepe+rmvpe+fcpe]",
+        ],
+        default="rmvpe",
+    )
+    infer_parser.add_argument(
+        "--input_path",
+        type=str,
+        help="Full path to the input audio file.",
+        required=True,
+    )
+    infer_parser.add_argument(
+        "--output_path",
+        type=str,
+        help="Full path to the output audio file.",
+        required=True,
+    )
+    pth_path_description = "Full path to the RVC model file (.pth)."
+    infer_parser.add_argument(
+        "--pth_path", type=str, help=pth_path_description, required=True
+    )
+    index_path_description = "Full path to the index file (.index)."
+    infer_parser.add_argument(
+        "--index_path", type=str, help=index_path_description, required=True
+    )
+    split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
+    infer_parser.add_argument(
+        "--split_audio",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=split_audio_description,
+        default=False,
+    )
+    f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
+    infer_parser.add_argument(
+        "--f0_autotune",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=f0_autotune_description,
+        default=False,
+    )
+    f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
+    infer_parser.add_argument(
+        "--f0_autotune_strength",
+        type=float,
+        help=f0_autotune_strength_description,
+        choices=[(i / 10) for i in range(11)],
+        default=1.0,
+    )
+    proposed_pitch_description = "Proposed Pitch"
+    infer_parser.add_argument(
+        "--proposed_pitch",
+        type=bool,
+        help=proposed_pitch_description,
+        choices=[True, False],
+        default=False,
+    )
+    proposed_pitch_threshold_description = "Proposed Pitch Threshold"
+    infer_parser.add_argument(
+        "--proposed_pitch_threshold",
+        type=float,
+        help=proposed_pitch_threshold_description,
+        choices=[i for i in range(50, 1200)],
+        default=155.0,
+    )
+    clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
+    infer_parser.add_argument(
+        "--clean_audio",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=clean_audio_description,
+        default=False,
+    )
+    clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
+    infer_parser.add_argument(
+        "--clean_strength",
+        type=float,
+        help=clean_strength_description,
+        choices=[(i / 10) for i in range(11)],
+        default=0.7,
+    )
+    export_format_description = "Select the desired output audio format."
+    infer_parser.add_argument(
+        "--export_format",
+        type=str,
+        help=export_format_description,
+        choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+        default="WAV",
+    )
+    embedder_model_description = (
+        "Choose the model used for generating speaker embeddings."
+    )
+    infer_parser.add_argument(
+        "--embedder_model",
+        type=str,
+        help=embedder_model_description,
+        choices=[
+            "contentvec",
+            "spin",
+            "spin-v2",
+            "chinese-hubert-base",
+            "japanese-hubert-base",
+            "korean-hubert-base",
+            "custom",
+        ],
+        default="contentvec",
+    )
+    embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
+    infer_parser.add_argument(
+        "--embedder_model_custom",
+        type=str,
+        help=embedder_model_custom_description,
+        default=None,
+    )
+    formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
+    infer_parser.add_argument(
+        "--formant_shifting",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=formant_shifting_description,
+        default=False,
+        required=False,
+    )
+    formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
+    infer_parser.add_argument(
+        "--formant_qfrency",
+        type=float,
+        help=formant_qfrency_description,
+        default=1.0,
+        required=False,
+    )
+    formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
+    infer_parser.add_argument(
+        "--formant_timbre",
+        type=float,
+        help=formant_timbre_description,
+        default=1.0,
+        required=False,
+    )
+    sid_description = "Speaker ID for multi-speaker models."
+    infer_parser.add_argument(
+        "--sid",
+        type=int,
+        help=sid_description,
+        default=0,
+        required=False,
+    )
+    post_process_description = "Apply post-processing effects to the output audio."
+    infer_parser.add_argument(
+        "--post_process",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=post_process_description,
+        default=False,
+        required=False,
+    )
+    reverb_description = "Apply reverb effect to the output audio."
+    infer_parser.add_argument(
+        "--reverb",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=reverb_description,
+        default=False,
+        required=False,
+    )
+    pitch_shift_description = "Apply pitch shifting effect to the output audio."
+    infer_parser.add_argument(
+        "--pitch_shift",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=pitch_shift_description,
+        default=False,
+        required=False,
+    )
+    limiter_description = "Apply limiter effect to the output audio."
+    infer_parser.add_argument(
+        "--limiter",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=limiter_description,
+        default=False,
+        required=False,
+    )
+    gain_description = "Apply gain effect to the output audio."
+    infer_parser.add_argument(
+        "--gain",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=gain_description,
+        default=False,
+        required=False,
+    )
+    distortion_description = "Apply distortion effect to the output audio."
+    infer_parser.add_argument(
+        "--distortion",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=distortion_description,
+        default=False,
+        required=False,
+    )
+    chorus_description = "Apply chorus effect to the output audio."
+    infer_parser.add_argument(
+        "--chorus",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=chorus_description,
+        default=False,
+        required=False,
+    )
+    bitcrush_description = "Apply bitcrush effect to the output audio."
+    infer_parser.add_argument(
+        "--bitcrush",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=bitcrush_description,
+        default=False,
+        required=False,
+    )
+    clipping_description = "Apply clipping effect to the output audio."
+    infer_parser.add_argument(
+        "--clipping",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=clipping_description,
+        default=False,
+        required=False,
+    )
+    compressor_description = "Apply compressor effect to the output audio."
+    infer_parser.add_argument(
+        "--compressor",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=compressor_description,
+        default=False,
+        required=False,
+    )
+    delay_description = "Apply delay effect to the output audio."
+    infer_parser.add_argument(
+        "--delay",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=delay_description,
+        default=False,
+        required=False,
+    )
+    reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
+    infer_parser.add_argument(
+        "--reverb_room_size",
+        type=float,
+        help=reverb_room_size_description,
+        default=0.5,
+        required=False,
+    )
+    reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
+    infer_parser.add_argument(
+        "--reverb_damping",
+        type=float,
+        help=reverb_damping_description,
+        default=0.5,
+        required=False,
+    )
+    reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
+    infer_parser.add_argument(
+        "--reverb_wet_gain",
+        type=float,
+        help=reverb_wet_gain_description,
+        default=0.5,
+        required=False,
+    )
+    reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
+    infer_parser.add_argument(
+        "--reverb_dry_gain",
+        type=float,
+        help=reverb_dry_gain_description,
+        default=0.5,
+        required=False,
+    )
+    reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
+    infer_parser.add_argument(
+        "--reverb_width",
+        type=float,
+        help=reverb_width_description,
+        default=0.5,
+        required=False,
+    )
+    reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
+    infer_parser.add_argument(
+        "--reverb_freeze_mode",
+        type=float,
+        help=reverb_freeze_mode_description,
+        default=0.5,
+        required=False,
+    )
+    pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
+    infer_parser.add_argument(
+        "--pitch_shift_semitones",
+        type=float,
+        help=pitch_shift_semitones_description,
+        default=0.0,
+        required=False,
+    )
+    limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect."
+    infer_parser.add_argument(
+        "--limiter_threshold",
+        type=float,
+        help=limiter_threshold_description,
+        default=-6,
+        required=False,
+    )
+    limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
+    infer_parser.add_argument(
+        "--limiter_release_time",
+        type=float,
+        help=limiter_release_time_description,
+        default=0.01,
+        required=False,
+    )
+    gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
+    infer_parser.add_argument(
+        "--gain_db",
+        type=float,
+        help=gain_db_description,
+        default=0.0,
+        required=False,
+    )
+    distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
+    infer_parser.add_argument(
+        "--distortion_gain",
+        type=float,
+        help=distortion_gain_description,
+        default=25,
+        required=False,
+    )
+    chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
+    infer_parser.add_argument(
+        "--chorus_rate",
+        type=float,
+        help=chorus_rate_description,
+        default=1.0,
+        required=False,
+    )
+    chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
+    infer_parser.add_argument(
+        "--chorus_depth",
+        type=float,
+        help=chorus_depth_description,
+        default=0.25,
+        required=False,
+    )
+    chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
+    infer_parser.add_argument(
+        "--chorus_center_delay",
+        type=float,
+        help=chorus_center_delay_description,
+        default=7,
+        required=False,
+    )
+    chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
+    infer_parser.add_argument(
+        "--chorus_feedback",
+        type=float,
+        help=chorus_feedback_description,
+        default=0.0,
+        required=False,
+    )
+    chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
+    infer_parser.add_argument(
+        "--chorus_mix",
+        type=float,
+        help=chorus_mix_description,
+        default=0.5,
+        required=False,
+    )
+    bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect."
+    infer_parser.add_argument(
+        "--bitcrush_bit_depth",
+        type=int,
+        help=bitcrush_bit_depth_description,
+        default=8,
+        required=False,
+    )
+    clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect."
+    infer_parser.add_argument(
+        "--clipping_threshold",
+        type=float,
+        help=clipping_threshold_description,
+        default=-6,
+        required=False,
+    )
+    compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect."
+    infer_parser.add_argument(
+        "--compressor_threshold",
+        type=float,
+        help=compressor_threshold_description,
+        default=0,
+        required=False,
+    )
+    compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
+    infer_parser.add_argument(
+        "--compressor_ratio",
+        type=float,
+        help=compressor_ratio_description,
+        default=1,
+        required=False,
+    )
+    compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect."
+    infer_parser.add_argument(
+        "--compressor_attack",
+        type=float,
+        help=compressor_attack_description,
+        default=1.0,
+        required=False,
+    )
+    compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect."
+    infer_parser.add_argument(
+        "--compressor_release",
+        type=float,
+        help=compressor_release_description,
+        default=100,
+        required=False,
+    )
+    delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
+    infer_parser.add_argument(
+        "--delay_seconds",
+        type=float,
+        help=delay_seconds_description,
+        default=0.5,
+        required=False,
+    )
+    delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
+    infer_parser.add_argument(
+        "--delay_feedback",
+        type=float,
+        help=delay_feedback_description,
+        default=0.0,
+        required=False,
+    )
+    delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
+    infer_parser.add_argument(
+        "--delay_mix",
+        type=float,
+        help=delay_mix_description,
+        default=0.5,
+        required=False,
+    )
+    # Parser for 'batch_infer' mode
+    batch_infer_parser = subparsers.add_parser(
+        "batch_infer",
+        help="Run batch inference",
+    )
+    batch_infer_parser.add_argument(
+        "--pitch",
+        type=int,
+        help=pitch_description,
+        choices=range(-24, 25),
+        default=0,
+    )
+    batch_infer_parser.add_argument(
+        "--index_rate",
+        type=float,
+        help=index_rate_description,
+        choices=[i / 100.0 for i in range(0, 101)],
+        default=0.3,
+    )
+    batch_infer_parser.add_argument(
+        "--volume_envelope",
+        type=float,
+        help=volume_envelope_description,
+        choices=[i / 100.0 for i in range(0, 101)],
+        default=1,
+    )
+    batch_infer_parser.add_argument(
+        "--protect",
+        type=float,
+        help=protect_description,
+        choices=[i / 1000.0 for i in range(0, 501)],
+        default=0.33,
+    )
+    batch_infer_parser.add_argument(
+        "--f0_method",
+        type=str,
+        help=f0_method_description,
+        choices=[
+            "crepe",
+            "crepe-tiny",
+            "rmvpe",
+            "fcpe",
+            "swift",
+            "hybrid[crepe+rmvpe]",
+            "hybrid[crepe+fcpe]",
+            "hybrid[rmvpe+fcpe]",
+            "hybrid[crepe+rmvpe+fcpe]",
+        ],
+        default="rmvpe",
+    )
+    batch_infer_parser.add_argument(
+        "--input_folder",
+        type=str,
+        help="Path to the folder containing input audio files.",
+        required=True,
+    )
+    batch_infer_parser.add_argument(
+        "--output_folder",
+        type=str,
+        help="Path to the folder for saving output audio files.",
+        required=True,
+    )
+    batch_infer_parser.add_argument(
+        "--pth_path", type=str, help=pth_path_description, required=True
+    )
+    batch_infer_parser.add_argument(
+        "--index_path", type=str, help=index_path_description, required=True
+    )
+    batch_infer_parser.add_argument(
+        "--split_audio",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=split_audio_description,
+        default=False,
+    )
+    batch_infer_parser.add_argument(
+        "--f0_autotune",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=f0_autotune_description,
+        default=False,
+    )
+    batch_infer_parser.add_argument(
+        "--f0_autotune_strength",
+        type=float,
+        help=clean_strength_description,
+        choices=[(i / 10) for i in range(11)],
+        default=1.0,
+    )
+    proposed_pitch_description = "Proposed Pitch adjustment"
+    batch_infer_parser.add_argument(
+        "--proposed_pitch",
+        type=bool,
+        help=proposed_pitch_description,
+        choices=[True, False],
+        default=False,
+    )
+    proposed_pitch_threshold_description = "Proposed Pitch adjustment value"
+    batch_infer_parser.add_argument(
+        "--proposed_pitch_threshold",
+        type=float,
+        help=proposed_pitch_threshold_description,
+        choices=[i for i in range(50, 1200)],
+        default=155.0,
+    )
+    batch_infer_parser.add_argument(
+        "--clean_audio",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=clean_audio_description,
+        default=False,
+    )
+    batch_infer_parser.add_argument(
+        "--clean_strength",
+        type=float,
+        help=clean_strength_description,
+        choices=[(i / 10) for i in range(11)],
+        default=0.7,
+    )
+    batch_infer_parser.add_argument(
+        "--export_format",
+        type=str,
+        help=export_format_description,
+        choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+        default="WAV",
+    )
+    batch_infer_parser.add_argument(
+        "--embedder_model",
+        type=str,
+        help=embedder_model_description,
+        choices=[
+            "contentvec",
+            "spin",
+            "spin-v2",
+            "chinese-hubert-base",
+            "japanese-hubert-base",
+            "korean-hubert-base",
+            "custom",
+        ],
+        default="contentvec",
+    )
+    batch_infer_parser.add_argument(
+        "--embedder_model_custom",
+        type=str,
+        help=embedder_model_custom_description,
+        default=None,
+    )
+    batch_infer_parser.add_argument(
+        "--formant_shifting",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=formant_shifting_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--formant_qfrency",
+        type=float,
+        help=formant_qfrency_description,
+        default=1.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--formant_timbre",
+        type=float,
+        help=formant_timbre_description,
+        default=1.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--sid",
+        type=int,
+        help=sid_description,
+        default=0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--post_process",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=post_process_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--reverb",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=reverb_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--pitch_shift",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=pitch_shift_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--limiter",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=limiter_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--gain",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=gain_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--distortion",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=distortion_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--chorus",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=chorus_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--bitcrush",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=bitcrush_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--clipping",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=clipping_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--compressor",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=compressor_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--delay",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=delay_description,
+        default=False,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--reverb_room_size",
+        type=float,
+        help=reverb_room_size_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--reverb_damping",
+        type=float,
+        help=reverb_damping_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--reverb_wet_gain",
+        type=float,
+        help=reverb_wet_gain_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--reverb_dry_gain",
+        type=float,
+        help=reverb_dry_gain_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--reverb_width",
+        type=float,
+        help=reverb_width_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--reverb_freeze_mode",
+        type=float,
+        help=reverb_freeze_mode_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--pitch_shift_semitones",
+        type=float,
+        help=pitch_shift_semitones_description,
+        default=0.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--limiter_threshold",
+        type=float,
+        help=limiter_threshold_description,
+        default=-6,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--limiter_release_time",
+        type=float,
+        help=limiter_release_time_description,
+        default=0.01,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--gain_db",
+        type=float,
+        help=gain_db_description,
+        default=0.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--distortion_gain",
+        type=float,
+        help=distortion_gain_description,
+        default=25,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--chorus_rate",
+        type=float,
+        help=chorus_rate_description,
+        default=1.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--chorus_depth",
+        type=float,
+        help=chorus_depth_description,
+        default=0.25,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--chorus_center_delay",
+        type=float,
+        help=chorus_center_delay_description,
+        default=7,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--chorus_feedback",
+        type=float,
+        help=chorus_feedback_description,
+        default=0.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--chorus_mix",
+        type=float,
+        help=chorus_mix_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--bitcrush_bit_depth",
+        type=int,
+        help=bitcrush_bit_depth_description,
+        default=8,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--clipping_threshold",
+        type=float,
+        help=clipping_threshold_description,
+        default=-6,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--compressor_threshold",
+        type=float,
+        help=compressor_threshold_description,
+        default=0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--compressor_ratio",
+        type=float,
+        help=compressor_ratio_description,
+        default=1,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--compressor_attack",
+        type=float,
+        help=compressor_attack_description,
+        default=1.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--compressor_release",
+        type=float,
+        help=compressor_release_description,
+        default=100,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--delay_seconds",
+        type=float,
+        help=delay_seconds_description,
+        default=0.5,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--delay_feedback",
+        type=float,
+        help=delay_feedback_description,
+        default=0.0,
+        required=False,
+    )
+    batch_infer_parser.add_argument(
+        "--delay_mix",
+        type=float,
+        help=delay_mix_description,
+        default=0.5,
+        required=False,
+    )
+    # Parser for 'tts' mode
+    tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
+    tts_parser.add_argument(
+        "--tts_file", type=str, help="File with a text to be synthesized", required=True
+    )
+    tts_parser.add_argument(
+        "--tts_text", type=str, help="Text to be synthesized", required=True
+    )
+    tts_parser.add_argument(
+        "--tts_voice",
+        type=str,
+        help="Voice to be used for TTS synthesis.",
+        choices=locales,
+        required=True,
+    )
+    tts_parser.add_argument(
+        "--tts_rate",
+        type=int,
+        help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
+        choices=range(-100, 101),
+        default=0,
+    )
+    tts_parser.add_argument(
+        "--pitch",
+        type=int,
+        help=pitch_description,
+        choices=range(-24, 25),
+        default=0,
+    )
+    tts_parser.add_argument(
+        "--index_rate",
+        type=float,
+        help=index_rate_description,
+        choices=[(i / 10) for i in range(11)],
+        default=0.3,
+    )
+    tts_parser.add_argument(
+        "--volume_envelope",
+        type=float,
+        help=volume_envelope_description,
+        choices=[(i / 10) for i in range(11)],
+        default=1,
+    )
+    tts_parser.add_argument(
+        "--protect",
+        type=float,
+        help=protect_description,
+        choices=[(i / 10) for i in range(6)],
+        default=0.33,
+    )
+    tts_parser.add_argument(
+        "--f0_method",
+        type=str,
+        help=f0_method_description,
+        choices=[
+            "crepe",
+            "crepe-tiny",
+            "rmvpe",
+            "fcpe",
+            "swift",
+            "hybrid[crepe+rmvpe]",
+            "hybrid[crepe+fcpe]",
+            "hybrid[rmvpe+fcpe]",
+            "hybrid[crepe+rmvpe+fcpe]",
+        ],
+        default="rmvpe",
+    )
+    tts_parser.add_argument(
+        "--output_tts_path",
+        type=str,
+        help="Full path to save the synthesized TTS audio.",
+        required=True,
+    )
+    tts_parser.add_argument(
+        "--output_rvc_path",
+        type=str,
+        help="Full path to save the voice-converted audio using the synthesized TTS.",
+        required=True,
+    )
+    tts_parser.add_argument(
+        "--pth_path", type=str, help=pth_path_description, required=True
+    )
+    tts_parser.add_argument(
+        "--index_path", type=str, help=index_path_description, required=True
+    )
+    tts_parser.add_argument(
+        "--split_audio",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=split_audio_description,
+        default=False,
+    )
+    tts_parser.add_argument(
+        "--f0_autotune",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=f0_autotune_description,
+        default=False,
+    )
+    tts_parser.add_argument(
+        "--f0_autotune_strength",
+        type=float,
+        help=clean_strength_description,
+        choices=[(i / 10) for i in range(11)],
+        default=1.0,
+    )
+    proposed_pitch_description = "Proposed Pitch adjustment"
+    tts_parser.add_argument(
+        "--proposed_pitch",
+        type=bool,
+        help=proposed_pitch_description,
+        choices=[True, False],
+        default=False,
+    )
+    proposed_pitch_threshold_description = "Proposed Pitch adjustment value"
+    tts_parser.add_argument(
+        "--proposed_pitch_threshold",
+        type=float,
+        help=proposed_pitch_threshold_description,
+        choices=[i for i in range(100, 500)],
+        default=155.0,
+    )
+    tts_parser.add_argument(
+        "--clean_audio",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help=clean_audio_description,
+        default=False,
+    )
+    tts_parser.add_argument(
+        "--clean_strength",
+        type=float,
+        help=clean_strength_description,
+        choices=[(i / 10) for i in range(11)],
+        default=0.7,
+    )
+    tts_parser.add_argument(
+        "--export_format",
+        type=str,
+        help=export_format_description,
+        choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+        default="WAV",
+    )
+    tts_parser.add_argument(
+        "--embedder_model",
+        type=str,
+        help=embedder_model_description,
+        choices=[
+            "contentvec",
+            "spin",
+            "spin-v2",
+            "chinese-hubert-base",
+            "japanese-hubert-base",
+            "korean-hubert-base",
+            "custom",
+        ],
+        default="contentvec",
+    )
+    tts_parser.add_argument(
+        "--embedder_model_custom",
+        type=str,
+        help=embedder_model_custom_description,
+        default=None,
+    )
+    # Parser for 'preprocess' mode
+    preprocess_parser = subparsers.add_parser(
+        "preprocess", help="Preprocess a dataset for training."
+    )
+    preprocess_parser.add_argument(
+        "--model_name", type=str, help="Name of the model to be trained.", required=True
+    )
+    preprocess_parser.add_argument(
+        "--dataset_path", type=str, help="Path to the dataset directory.", required=True
+    )
+    preprocess_parser.add_argument(
+        "--sample_rate",
+        type=int,
+        help="Target sampling rate for the audio data.",
+        choices=[32000, 40000, 48000],
+        required=True,
+    )
+    preprocess_parser.add_argument(
+        "--cpu_cores",
+        type=int,
+        help="Number of CPU cores to use for preprocessing.",
+        choices=range(1, 65),
+    )
+    preprocess_parser.add_argument(
+        "--cut_preprocess",
+        type=str,
+        choices=["Skip", "Simple", "Automatic"],
+        help="Cut the dataset into smaller segments for faster preprocessing.",
+        default="Automatic",
+        required=True,
+    )
+    preprocess_parser.add_argument(
+        "--process_effects",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Disable all filters during preprocessing.",
+        default=False,
+        required=False,
+    )
+    preprocess_parser.add_argument(
+        "--noise_reduction",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Enable noise reduction during preprocessing.",
+        default=False,
+        required=False,
+    )
+    preprocess_parser.add_argument(
+        "--noise_reduction_strength",
+        type=float,
+        help="Strength of the noise reduction filter.",
+        choices=[(i / 10) for i in range(11)],
+        default=0.7,
+        required=False,
+    )
+    preprocess_parser.add_argument(
+        "--chunk_len",
+        type=float,
+        help="Chunk length.",
+        choices=[i * 0.5 for i in range(1, 11)],
+        default=3.0,
+        required=False,
+    )
+    preprocess_parser.add_argument(
+        "--overlap_len",
+        type=float,
+        help="Overlap length.",
+        choices=[0.0, 0.1, 0.2, 0.3, 0.4],
+        default=0.3,
+        required=False,
+    )
+    preprocess_parser.add_argument(
+        "--normalization_mode",
+        type=str,
+        help="Normalization mode.",
+        choices=["none", "pre", "post"],
+        default="none",
+        required=False,
+    )
+    # Parser for 'extract' mode
+    extract_parser = subparsers.add_parser(
+        "extract", help="Extract features from a dataset."
+    )
+    extract_parser.add_argument(
+        "--model_name", type=str, help="Name of the model.", required=True
+    )
+    extract_parser.add_argument(
+        "--f0_method",
+        type=str,
+        help="Pitch extraction method to use.",
+        choices=[
+            "crepe",
+            "crepe-tiny",
+            "rmvpe",
+            "fcpe",
+        ],
+        default="rmvpe",
+    )
+    extract_parser.add_argument(
+        "--cpu_cores",
+        type=int,
+        help="Number of CPU cores to use for feature extraction (optional).",
+        choices=range(1, 65),
+        default=None,
+    )
+    extract_parser.add_argument(
+        "--gpu",
+        type=str,
+        help="GPU device to use for feature extraction (optional).",
+        default="-",
+    )
+    extract_parser.add_argument(
+        "--sample_rate",
+        type=int,
+        help="Target sampling rate for the audio data.",
+        choices=[32000, 40000, 44100, 48000],
+        required=True,
+    )
+    extract_parser.add_argument(
+        "--embedder_model",
+        type=str,
+        help=embedder_model_description,
+        choices=[
+            "contentvec",
+            "spin",
+            "spin-v2",
+            "chinese-hubert-base",
+            "japanese-hubert-base",
+            "korean-hubert-base",
+            "custom",
+        ],
+        default="contentvec",
+    )
+    extract_parser.add_argument(
+        "--embedder_model_custom",
+        type=str,
+        help=embedder_model_custom_description,
+        default=None,
+    )
+    extract_parser.add_argument(
+        "--include_mutes",
+        type=int,
+        help="Number of silent files to include.",
+        choices=range(0, 11),
+        default=2,
+        required=True,
+    )
+    # Parser for 'train' mode
+    train_parser = subparsers.add_parser("train", help="Train an RVC model.")
+    train_parser.add_argument(
+        "--model_name", type=str, help="Name of the model to be trained.", required=True
+    )
+    train_parser.add_argument(
+        "--vocoder",
+        type=str,
+        help="Vocoder name",
+        choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
+        default="HiFi-GAN",
+    )
+    train_parser.add_argument(
+        "--checkpointing",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Enables memory-efficient training.",
+        default=False,
+        required=False,
+    )
+    train_parser.add_argument(
+        "--save_every_epoch",
+        type=int,
+        help="Save the model every specified number of epochs.",
+        choices=range(1, 101),
+        required=True,
+    )
+    train_parser.add_argument(
+        "--save_only_latest",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Save only the latest model checkpoint.",
+        default=False,
+    )
+    train_parser.add_argument(
+        "--save_every_weights",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Save model weights every epoch.",
+        default=True,
+    )
+    train_parser.add_argument(
+        "--total_epoch",
+        type=int,
+        help="Total number of epochs to train for.",
+        choices=range(1, 10001),
+        default=1000,
+    )
+    train_parser.add_argument(
+        "--sample_rate",
+        type=int,
+        help="Sampling rate of the training data.",
+        choices=[32000, 40000, 48000],
+        required=True,
+    )
+    train_parser.add_argument(
+        "--batch_size",
+        type=int,
+        help="Batch size for training.",
+        choices=range(1, 51),
+        default=8,
+    )
+    train_parser.add_argument(
+        "--gpu",
+        type=str,
+        help="GPU device to use for training (e.g., '0').",
+        default="0",
+    )
+    train_parser.add_argument(
+        "--pretrained",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Use a pretrained model for initialization.",
+        default=True,
+    )
+    train_parser.add_argument(
+        "--custom_pretrained",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Use a custom pretrained model.",
+        default=False,
+    )
+    train_parser.add_argument(
+        "--g_pretrained_path",
+        type=str,
+        nargs="?",
+        default=None,
+        help="Path to the pretrained generator model file.",
+    )
+    train_parser.add_argument(
+        "--d_pretrained_path",
+        type=str,
+        nargs="?",
+        default=None,
+        help="Path to the pretrained discriminator model file.",
+    )
+    train_parser.add_argument(
+        "--overtraining_detector",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Enable overtraining detection.",
+        default=False,
+    )
+    train_parser.add_argument(
+        "--overtraining_threshold",
+        type=int,
+        help="Threshold for overtraining detection.",
+        choices=range(1, 101),
+        default=50,
+    )
+    train_parser.add_argument(
+        "--cleanup",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Cleanup previous training attempt.",
+        default=False,
+    )
+    train_parser.add_argument(
+        "--cache_data_in_gpu",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Cache training data in GPU memory.",
+        default=False,
+    )
+    train_parser.add_argument(
+        "--index_algorithm",
+        type=str,
+        choices=["Auto", "Faiss", "KMeans"],
+        help="Choose the method for generating the index file.",
+        default="Auto",
+        required=False,
+    )
+    # Parser for 'index' mode
+    index_parser = subparsers.add_parser(
+        "index", help="Generate an index file for an RVC model."
+    )
+    index_parser.add_argument(
+        "--model_name", type=str, help="Name of the model.", required=True
+    )
+    index_parser.add_argument(
+        "--index_algorithm",
+        type=str,
+        choices=["Auto", "Faiss", "KMeans"],
+        help="Choose the method for generating the index file.",
+        default="Auto",
+        required=False,
+    )
+    # Parser for 'model_information' mode
+    model_information_parser = subparsers.add_parser(
+        "model_information", help="Display information about a trained model."
+    )
+    model_information_parser.add_argument(
+        "--pth_path", type=str, help="Path to the .pth model file.", required=True
+    )
+    # Parser for 'model_blender' mode
+    model_blender_parser = subparsers.add_parser(
+        "model_blender", help="Fuse two RVC models together."
+    )
+    model_blender_parser.add_argument(
+        "--model_name", type=str, help="Name of the new fused model.", required=True
+    )
+    model_blender_parser.add_argument(
+        "--pth_path_1",
+        type=str,
+        help="Path to the first .pth model file.",
+        required=True,
+    )
+    model_blender_parser.add_argument(
+        "--pth_path_2",
+        type=str,
+        help="Path to the second .pth model file.",
+        required=True,
+    )
+    model_blender_parser.add_argument(
+        "--ratio",
+        type=float,
+        help="Ratio for blending the two models (0.0 to 1.0).",
+        choices=[(i / 10) for i in range(11)],
+        default=0.5,
+    )
+    # Parser for 'tensorboard' mode
+    subparsers.add_parser(
+        "tensorboard", help="Launch TensorBoard for monitoring training progress."
+    )
+    # Parser for 'download' mode
+    download_parser = subparsers.add_parser(
+        "download", help="Download a model from a provided link."
+    )
+    download_parser.add_argument(
+        "--model_link", type=str, help="Direct link to the model file.", required=True
+    )
+    # Parser for 'prerequisites' mode
+    prerequisites_parser = subparsers.add_parser(
+        "prerequisites", help="Install prerequisites for RVC."
+    )
+    prerequisites_parser.add_argument(
+        "--pretraineds_hifigan",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        default=True,
+        help="Download pretrained models for RVC v2.",
+    )
+    prerequisites_parser.add_argument(
+        "--models",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        default=True,
+        help="Download additional models.",
+    )
+    prerequisites_parser.add_argument(
+        "--exe",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        default=True,
+        help="Download required executables.",
+    )
+    # Parser for 'audio_analyzer' mode
+    audio_analyzer = subparsers.add_parser(
+        "audio_analyzer", help="Analyze an audio file."
+    )
+    audio_analyzer.add_argument(
+        "--input_path", type=str, help="Path to the input audio file.", required=True
+    )
+    return parser.parse_args()
+def main():
+    if len(sys.argv) == 1:
+        print("Please run the script with '-h' for more information.")
+        sys.exit(1)
+    args = parse_arguments()
+    try:
+        if args.mode == "infer":
+            run_infer_script(
+                pitch=args.pitch,
+                index_rate=args.index_rate,
+                volume_envelope=args.volume_envelope,
+                protect=args.protect,
+                f0_method=args.f0_method,
+                input_path=args.input_path,
+                output_path=args.output_path,
+                pth_path=args.pth_path,
+                index_path=args.index_path,
+                split_audio=args.split_audio,
+                f0_autotune=args.f0_autotune,
+                f0_autotune_strength=args.f0_autotune_strength,
+                proposed_pitch=args.proposed_pitch,
+                proposed_pitch_threshold=args.proposed_pitch_threshold,
+                clean_audio=args.clean_audio,
+                clean_strength=args.clean_strength,
+                export_format=args.export_format,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+                formant_shifting=args.formant_shifting,
+                formant_qfrency=args.formant_qfrency,
+                formant_timbre=args.formant_timbre,
+                sid=args.sid,
+                post_process=args.post_process,
+                reverb=args.reverb,
+                pitch_shift=args.pitch_shift,
+                limiter=args.limiter,
+                gain=args.gain,
+                distortion=args.distortion,
+                chorus=args.chorus,
+                bitcrush=args.bitcrush,
+                clipping=args.clipping,
+                compressor=args.compressor,
+                delay=args.delay,
+                reverb_room_size=args.reverb_room_size,
+                reverb_damping=args.reverb_damping,
+                reverb_wet_gain=args.reverb_wet_gain,
+                reverb_dry_gain=args.reverb_dry_gain,
+                reverb_width=args.reverb_width,
+                reverb_freeze_mode=args.reverb_freeze_mode,
+                pitch_shift_semitones=args.pitch_shift_semitones,
+                limiter_threshold=args.limiter_threshold,
+                limiter_release_time=args.limiter_release_time,
+                gain_db=args.gain_db,
+                distortion_gain=args.distortion_gain,
+                chorus_rate=args.chorus_rate,
+                chorus_depth=args.chorus_depth,
+                chorus_center_delay=args.chorus_center_delay,
+                chorus_feedback=args.chorus_feedback,
+                chorus_mix=args.chorus_mix,
+                bitcrush_bit_depth=args.bitcrush_bit_depth,
+                clipping_threshold=args.clipping_threshold,
+                compressor_threshold=args.compressor_threshold,
+                compressor_ratio=args.compressor_ratio,
+                compressor_attack=args.compressor_attack,
+                compressor_release=args.compressor_release,
+                delay_seconds=args.delay_seconds,
+                delay_feedback=args.delay_feedback,
+                delay_mix=args.delay_mix,
+            )
+        elif args.mode == "batch_infer":
+            run_batch_infer_script(
+                pitch=args.pitch,
+                index_rate=args.index_rate,
+                volume_envelope=args.volume_envelope,
+                protect=args.protect,
+                f0_method=args.f0_method,
+                input_folder=args.input_folder,
+                output_folder=args.output_folder,
+                pth_path=args.pth_path,
+                index_path=args.index_path,
+                split_audio=args.split_audio,
+                f0_autotune=args.f0_autotune,
+                f0_autotune_strength=args.f0_autotune_strength,
+                proposed_pitch=args.proposed_pitch,
+                proposed_pitch_threshold=args.proposed_pitch_threshold,
+                clean_audio=args.clean_audio,
+                clean_strength=args.clean_strength,
+                export_format=args.export_format,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+                formant_shifting=args.formant_shifting,
+                formant_qfrency=args.formant_qfrency,
+                formant_timbre=args.formant_timbre,
+                sid=args.sid,
+                post_process=args.post_process,
+                reverb=args.reverb,
+                pitch_shift=args.pitch_shift,
+                limiter=args.limiter,
+                gain=args.gain,
+                distortion=args.distortion,
+                chorus=args.chorus,
+                bitcrush=args.bitcrush,
+                clipping=args.clipping,
+                compressor=args.compressor,
+                delay=args.delay,
+                reverb_room_size=args.reverb_room_size,
+                reverb_damping=args.reverb_damping,
+                reverb_wet_gain=args.reverb_wet_gain,
+                reverb_dry_gain=args.reverb_dry_gain,
+                reverb_width=args.reverb_width,
+                reverb_freeze_mode=args.reverb_freeze_mode,
+                pitch_shift_semitones=args.pitch_shift_semitones,
+                limiter_threshold=args.limiter_threshold,
+                limiter_release_time=args.limiter_release_time,
+                gain_db=args.gain_db,
+                distortion_gain=args.distortion_gain,
+                chorus_rate=args.chorus_rate,
+                chorus_depth=args.chorus_depth,
+                chorus_center_delay=args.chorus_center_delay,
+                chorus_feedback=args.chorus_feedback,
+                chorus_mix=args.chorus_mix,
+                bitcrush_bit_depth=args.bitcrush_bit_depth,
+                clipping_threshold=args.clipping_threshold,
+                compressor_threshold=args.compressor_threshold,
+                compressor_ratio=args.compressor_ratio,
+                compressor_attack=args.compressor_attack,
+                compressor_release=args.compressor_release,
+                delay_seconds=args.delay_seconds,
+                delay_feedback=args.delay_feedback,
+                delay_mix=args.delay_mix,
+            )
+        elif args.mode == "tts":
+            run_tts_script(
+                tts_file=args.tts_file,
+                tts_text=args.tts_text,
+                tts_voice=args.tts_voice,
+                tts_rate=args.tts_rate,
+                pitch=args.pitch,
+                index_rate=args.index_rate,
+                volume_envelope=args.volume_envelope,
+                protect=args.protect,
+                f0_method=args.f0_method,
+                output_tts_path=args.output_tts_path,
+                output_rvc_path=args.output_rvc_path,
+                pth_path=args.pth_path,
+                index_path=args.index_path,
+                split_audio=args.split_audio,
+                f0_autotune=args.f0_autotune,
+                f0_autotune_strength=args.f0_autotune_strength,
+                proposed_pitch=args.proposed_pitch,
+                proposed_pitch_threshold=args.proposed_pitch_threshold,
+                clean_audio=args.clean_audio,
+                clean_strength=args.clean_strength,
+                export_format=args.export_format,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+            )
+        elif args.mode == "preprocess":
+            run_preprocess_script(
+                model_name=args.model_name,
+                dataset_path=args.dataset_path,
+                sample_rate=args.sample_rate,
+                cpu_cores=args.cpu_cores,
+                cut_preprocess=args.cut_preprocess,
+                process_effects=args.process_effects,
+                noise_reduction=args.noise_reduction,
+                clean_strength=args.noise_reduction_strength,
+                chunk_len=args.chunk_len,
+                overlap_len=args.overlap_len,
+                normalization_mode=args.normalization_mode,
+            )
+        elif args.mode == "extract":
+            run_extract_script(
+                model_name=args.model_name,
+                f0_method=args.f0_method,
+                cpu_cores=args.cpu_cores,
+                gpu=args.gpu,
+                sample_rate=args.sample_rate,
+                embedder_model=args.embedder_model,
+                embedder_model_custom=args.embedder_model_custom,
+                include_mutes=args.include_mutes,
+            )
+        elif args.mode == "train":
+            run_train_script(
+                model_name=args.model_name,
+                save_every_epoch=args.save_every_epoch,
+                save_only_latest=args.save_only_latest,
+                save_every_weights=args.save_every_weights,
+                total_epoch=args.total_epoch,
+                sample_rate=args.sample_rate,
+                batch_size=args.batch_size,
+                gpu=args.gpu,
+                overtraining_detector=args.overtraining_detector,
+                overtraining_threshold=args.overtraining_threshold,
+                pretrained=args.pretrained,
+                custom_pretrained=args.custom_pretrained,
+                cleanup=args.cleanup,
+                index_algorithm=args.index_algorithm,
+                cache_data_in_gpu=args.cache_data_in_gpu,
+                g_pretrained_path=args.g_pretrained_path,
+                d_pretrained_path=args.d_pretrained_path,
+                vocoder=args.vocoder,
+                checkpointing=args.checkpointing,
+            )
+        elif args.mode == "index":
+            run_index_script(
+                model_name=args.model_name,
+                index_algorithm=args.index_algorithm,
+            )
+        elif args.mode == "model_information":
+            run_model_information_script(
+                pth_path=args.pth_path,
+            )
+        elif args.mode == "model_blender":
+            run_model_blender_script(
+                model_name=args.model_name,
+                pth_path_1=args.pth_path_1,
+                pth_path_2=args.pth_path_2,
+                ratio=args.ratio,
+            )
+        elif args.mode == "tensorboard":
+            run_tensorboard_script()
+        elif args.mode == "download":
+            run_download_script(
+                model_link=args.model_link,
+            )
+        elif args.mode == "prerequisites":
+            run_prerequisites_script(
+                pretraineds_hifigan=args.pretraineds_hifigan,
+                models=args.models,
+                exe=args.exe,
+            )
+        elif args.mode == "audio_analyzer":
+            run_audio_analyzer_script(
+                input_path=args.input_path,
+            )
+    except Exception as error:
+        print(f"An error occurred during execution: {error}")
+        import traceback
+        traceback.print_exc()
+if __name__ == "__main__":
+    main()

docker-compose.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+# The top-level `version` key is intentionally omitted: the Compose
+# Specification treats it as obsolete, and the previous value ('1') never
+# matched a file that uses a `services:` section anyway.
+services:
+  applio:
+    # Build the image from the Dockerfile in this directory.
+    build:
+      context: ./
+      dockerfile: Dockerfile
+    ports:
+      # host:container - publish the port exposed by the Dockerfile on the
+      # same, fixed host port. A bare "6969" would bind the container port to
+      # a random ephemeral host port on every start.
+      - "6969:6969"
+    deploy:
+      resources:
+        reservations:
+          # Reserve one NVIDIA GPU for the container.
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

requirements.txt ADDED Viewed

	@@ -0,0 +1,50 @@

+# Python dependencies for the application.
+# A trailing `; sys_platform == '...'` is an environment marker: that line is
+# only installed on the named platform (darwin = macOS, linux, win32 = Windows).
+# Core dependencies
+pip>=23.3; sys_platform == 'darwin'
+wheel; sys_platform == 'darwin'
+PyYAML; sys_platform == 'darwin'
+numpy==1.26.4
+requests>=2.31.0,<2.32.0
+tqdm
+wget
+# Audio processing
+ffmpeg-python>=0.2.0
+faiss-cpu==1.7.3
+librosa==0.11.0
+scipy==1.11.1
+soundfile==0.12.1
+noisereduce
+pedalboard
+stftpitchshift
+soxr
+# Machine learning and deep learning
+omegaconf>=2.0.6; sys_platform == 'darwin'
+numba; sys_platform == 'linux'
+numba==0.61.0; sys_platform == 'darwin' or sys_platform == 'win32'
+# The `+cu128` pins below are CUDA 12.8 builds. They are served from the
+# PyTorch wheel index (https://download.pytorch.org/whl/cu128), so the
+# installer must be pointed at that index for these lines to resolve.
+torch==2.7.1; sys_platform == 'darwin'
+torch==2.7.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
+torchaudio==2.7.1; sys_platform == 'darwin'
+torchaudio==2.7.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
+torchvision==0.22.1; sys_platform == 'darwin'
+torchvision==0.22.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
+torchcrepe==0.0.23
+torchfcpe
+swift_f0
+einops
+transformers==4.44.2
+# Visualization and UI
+matplotlib==3.7.2
+tensorboard
+gradio==5.23.1
+# Miscellaneous utilities
+certifi>=2023.07.22; sys_platform == 'darwin'
+antlr4-python3-runtime==4.8; sys_platform == 'darwin'
+tensorboardX
+edge-tts==7.2.0
+pypresence
+beautifulsoup4
+sounddevice
+webrtcvad

run-applio.sh ADDED Viewed

	@@ -0,0 +1,9 @@

+#!/bin/sh
+printf "\033]0;Applio\007"
+. .venv/bin/activate
+ export PYTORCH_ENABLE_MPS_FALLBACK=1
+ export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
+clear
+python app.py --open

run-install.sh ADDED Viewed

	@@ -0,0 +1,174 @@

+#!/bin/bash
+set -e  # Exit immediately if a command exits with a non-zero status
+printf "\033]0;Installer\007"
+clear
+rm -f *.bat
+# Print a log line to stdout in the form "YYYY-MM-DD HH:MM:SS - <message>".
+#   $1 - the message text
+log_message() {
+    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
+}
+# Function to find a suitable Python version
+find_python() {
+    for py in python3.11 python3 python; do
+        if command -v "$py" > /dev/null 2>&1; then
+            echo "$py"
+            return
+        fi
+    done
+    log_message "No compatible Python installation found. Please install Python 3.11."
+    exit 1
+}
+# Function to install FFmpeg based on the distribution
+# Uses the first package manager found on PATH, checked in the order
+# brew, apt, pacman, dnf. When none is available, falls back to
+# install_ffmpeg_flatpak.
+install_ffmpeg() {
+    if command -v brew > /dev/null; then
+        log_message "Installing FFmpeg using Homebrew on macOS..."
+        brew install ffmpeg
+    elif command -v apt > /dev/null; then
+        log_message "Installing FFmpeg using apt..."
+        # Refresh the package lists first; install only runs if the update succeeded.
+        sudo apt update && sudo apt install -y ffmpeg
+    elif command -v pacman > /dev/null; then
+        log_message "Installing FFmpeg using pacman..."
+        sudo pacman -Syu --noconfirm ffmpeg
+    elif command -v dnf > /dev/null; then
+        log_message "Installing FFmpeg using dnf..."
+        # If the dnf install fails, try the Flatpak route instead.
+        sudo dnf install -y ffmpeg --allowerasing || install_ffmpeg_flatpak
+    else
+        log_message "Unsupported distribution for FFmpeg installation. Trying Flatpak..."
+        install_ffmpeg_flatpak
+    fi
+}
+# Function to install FFmpeg using Flatpak
+install_ffmpeg_flatpak() {
+    if command -v flatpak > /dev/null; then
+        log_message "Installing FFmpeg using Flatpak..."
+        flatpak install --user -y flathub org.freedesktop.Platform.ffmpeg
+    else
+        log_message "Flatpak is not installed. Installing Flatpak..."
+        if command -v apt > /dev/null; then
+            sudo apt install -y flatpak
+        elif command -v pacman > /dev/null; then
+            sudo pacman -Syu --noconfirm flatpak
+        elif command -v dnf > /dev/null; then
+            sudo dnf install -y flatpak
+        elif command -v brew > /dev/null; then
+            brew install flatpak
+        else
+            log_message "Unable to install Flatpak automatically. Please install Flatpak and try again."
+            exit 1
+        fi
+        flatpak install --user -y flathub org.freedesktop.Platform.ffmpeg
+    fi
+}
+# Install the python-ffmpeg package through uv's pip interface.
+install_python_ffmpeg() {
+    local package="python-ffmpeg"
+    log_message "Installing ${package}..."
+    uv pip install "$package"
+}
+# Decide how to proceed depending on whether a .venv directory already exists.
+# Without one, go straight to create_venv. With one, ask the user whether to launch
+# run-applio.sh instead; any answer other than y/Y deletes .venv and reinstalls.
+prepare_install() {
+    if [ ! -d ".venv" ]; then
+        create_venv
+        return
+    fi
+    log_message "Virtual environment found. This implies Applio has been already installed or this is a broken install."
+    # The prompt goes to stderr; the answer is read from stdin.
+    printf "Do you want to execute run-applio.sh? (Y/N): " >&2
+    read -r reply
+    case "$reply" in
+        y|Y)
+            chmod +x run-applio.sh
+            ./run-applio.sh && exit 0
+            ;;
+        *)
+            log_message "Continuing with the installation."
+            rm -rf .venv
+            create_venv
+            ;;
+    esac
+}
+# Function to create the virtual environment and install dependencies
+create_venv() {
+    log_message "Creating virtual environment..."
+    py=$(find_python)
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    uv venv .venv --python 3.11
+    log_message "Activating virtual environment..."
+    source .venv/bin/activate
+    install_ffmpeg
+    install_python_ffmpeg
+    log_message "Installing dependencies..."
+    if [ -f "requirements.txt" ]; then
+        uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 --index-strategy unsafe-best-match
+    else
+        log_message "requirements.txt not found. Please ensure it exists."
+        exit 1
+    fi
+    finish
+}
+# Report a successful installation and terminate the script with status 0.
+finish() {
+    # clear can fail when no usable terminal is available; under `set -e` that would
+    # end the script with an error before the success message is shown.
+    clear || true
+    echo "Applio has been successfully installed. Run the file run-applio.sh to start the web interface!"
+    exit 0
+}
+# Main script execution
+# macOS: make sure Homebrew, Python 3.11 and faiss are installed and export the
+# PyTorch MPS settings. Linux: no extra preparation. Any other system: abort.
+if [ "$(uname)" = "Darwin" ]; then
+    log_message "Detected macOS..."
+    if ! command -v brew >/dev/null 2>&1; then
+        log_message "Homebrew not found. Installing Homebrew..."
+        /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+        # The Homebrew installer does not alter this shell's PATH. Load its
+        # environment from the default prefixes (/opt/homebrew on Apple Silicon,
+        # /usr/local on Intel) so the brew calls below can be resolved.
+        for brew_bin in /opt/homebrew/bin/brew /usr/local/bin/brew; do
+            if [ -x "$brew_bin" ]; then
+                eval "$("$brew_bin" shellenv)"
+                break
+            fi
+        done
+    fi
+    # Add more detailed Python version check
+    log_message "Checking Python versions..."
+    log_message "python3 path: $(which python3 2>/dev/null || echo 'not found')"
+    log_message "python3.11 path: $(which python3.11 2>/dev/null || echo 'not found')"
+    # Reduce "Python X.Y.Z" to "X.Y", preferring python3.11 when it exists.
+    if command -v python3.11 >/dev/null 2>&1; then
+        python_version=$(python3.11 --version | awk '{print $2}' | cut -d'.' -f1,2)
+    else
+        python_version=$(python3 --version | awk '{print $2}' | cut -d'.' -f1,2)
+    fi
+    log_message "Detected Python version: $python_version"
+    if [ "$python_version" = "3.11" ]; then
+        log_message "Found compatible Python 3.11"
+    else
+        log_message "Python version $python_version is not 3.11. Installing Python 3.11 using Homebrew..."
+        brew install python@3.11
+        export PATH="$(brew --prefix)/opt/python@3.11/bin:$PATH"
+        # Verify the installed version
+        log_message "Verifying installed Python version..."
+        python_version=$(python3.11 --version | awk '{print $2}' | cut -d'.' -f1,2)
+        if [ "$python_version" != "3.11" ]; then
+            log_message "Failed to install Python 3.11. Current version: $python_version"
+            exit 1
+        fi
+    fi
+    brew install faiss
+    export PYTORCH_ENABLE_MPS_FALLBACK=1
+    export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
+    export PATH="$(brew --prefix)/bin:$PATH"
+elif [ "$(uname)" != "Linux" ]; then
+    log_message "Unsupported operating system. Are you using Windows?"
+    log_message "If yes, use the batch (.bat) file instead of this one!"
+    exit 1
+fi
+prepare_install

run-tensorboard.sh ADDED Viewed

	@@ -0,0 +1,6 @@

+#!/bin/sh
+printf "\033]0;Tensorboard\007"
+. .venv/bin/activate
+clear
+python core.py tensorboard