Elesh Vaishnav committed on
Commit
51e88fc
·
verified ·
1 Parent(s): 5682687

Upload 13 files

Browse files
Files changed (13) hide show
  1. .gitignore +24 -0
  2. Dockerfile +34 -0
  3. LICENSE +21 -0
  4. Makefile +22 -0
  5. README.md +96 -14
  6. TERMS_OF_USE.md +52 -0
  7. app.py +154 -0
  8. core.py +2423 -0
  9. docker-compose.yaml +16 -0
  10. requirements.txt +50 -0
  11. run-applio.sh +9 -0
  12. run-install.sh +174 -0
  13. run-tensorboard.sh +6 -0
.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.exe
2
+ *.pt
3
+ *.onnx
4
+ *.pyc
5
+ *.pth
6
+ *.index
7
+ *.mp3
8
+ *.flac
9
+ *.ogg
10
+ *.m4a
11
+ *.bin
12
+ *.wav
13
+ *.txt
14
+ *.zip
15
+ *.png
16
+ *.safetensors
17
+
18
+ assets/audios
19
+ assets/datasets
20
+ logs
21
+ rvc/models
22
+ env
23
+ venv
24
+ .venv
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# syntax=docker/dockerfile:1
FROM python:3.10-bullseye

# Port the Gradio server is expected to listen on (app.py's DEFAULT_PORT is 6969).
# NOTE(review): app.py binds to 127.0.0.1 unless --server-name is passed, so the
# published port is only reachable from outside the container when the app is
# started with e.g. `--server-name 0.0.0.0` -- confirm the compose file does this.
EXPOSE 6969

# Every following relative path (COPY, RUN, CMD) is resolved against /app.
WORKDIR /app

# Install ffmpeg. apt-get is used rather than apt because apt's command-line
# interface is not guaranteed to be stable for scripted use. The package lists
# are removed in the same layer so they do not inflate the image.
RUN apt-get update && \
    apt-get install -y -qq ffmpeg && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Copy the whole build context (application sources) into /app.
COPY . .

# Create a virtual environment inside /app and install the Python dependencies:
# pip itself, python-ffmpeg, the CUDA 12.8 builds of torch/torchaudio 2.7.1 (plus
# torchvision), and finally requirements.txt when the build context provides one.
RUN python3 -m venv /app/.venv && \
    . /app/.venv/bin/activate && \
    pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir python-ffmpeg && \
    pip install --no-cache-dir torch==2.7.1 torchvision torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128 && \
    if [ -f "requirements.txt" ]; then pip install --no-cache-dir -r requirements.txt; fi

# Keep the logs directory on a volume so its contents outlive the container.
VOLUME ["/app/logs/"]

# Put the virtual environment first on PATH so `python3` below resolves to it.
ENV PATH="/app/.venv/bin:$PATH"

# Run the app; arguments given to `docker run` replace CMD and are passed to python3.
ENTRYPOINT ["python3"]
CMD ["app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 AI Hispano
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Makefile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# None of these targets produce a file of the same name, so declare them phony;
# otherwise a stray file called e.g. `help` would make the target look up to date.
.PHONY: help run-install run-applio run-tensorboard

# Run each recipe in a single shell invocation. Because only the exit status of
# the whole script is checked in that mode, pass -e so the recipe stops at the
# first failing command instead of carrying on.
.ONESHELL:
.SHELLFLAGS := -ec

# The trailing `## text` on each target line below is what `make help` prints.
help: ## Show this help message
	@grep -hE '^[A-Za-z0-9_ \-]*?:.*##.*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

# The package index is refreshed before installing so apt-get resolves current
# package versions (the refresh previously ran after the install, too late to help).
run-install: ## Install system packages and Python dependencies
	apt-get update
	apt-get -y install build-essential python3-dev ffmpeg
	pip install --upgrade setuptools wheel
	pip install pip==24.1
	pip install -r requirements.txt

run-applio: ## Run Applio with a public share link
	python app.py --share

run-tensorboard: ## Run TensorBoard
	python core.py tensorboard
README.md CHANGED
@@ -1,14 +1,96 @@
1
- ---
2
- title: VoiceConversionWebUI
3
- emoji: 📊
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: VC
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1 align="center">
2
+ <a href="https://applio.org" target="_blank"><img src="https://github.com/IAHispano/Applio/assets/133521603/78e975d8-b07f-47ba-ab23-5a31592f322a" alt="Applio"></a>
3
+ </h1>
4
+
5
+ <p align="center">
6
+ <img alt="Contributors" src="https://img.shields.io/github/contributors/iahispano/applio?style=for-the-badge&color=FFFFFF" />
7
+ <img alt="Release" src="https://img.shields.io/github/release/iahispano/applio?style=for-the-badge&color=FFFFFF" />
8
+ <img alt="Stars" src="https://img.shields.io/github/stars/iahispano/applio?style=for-the-badge&color=FFFFFF" />
9
+ <img alt="Fork" src="https://img.shields.io/github/forks/iahispano/applio?style=for-the-badge&color=FFFFFF" />
10
+ <img alt="Issues" src="https://img.shields.io/github/issues/iahispano/applio?style=for-the-badge&color=FFFFFF" />
11
+ </p>
12
+
13
+ <p align="center">A simple, high-quality voice conversion tool, focused on ease of use and performance.</p>
14
+
15
+ <p align="center">
16
+ <a href="https://applio.org" target="_blank">🌐 Website</a>
17
+
18
+ <a href="https://docs.applio.org" target="_blank">📚 Documentation</a>
19
+
20
+ <a href="https://discord.gg/urxFjYmYYh" target="_blank">☎️ Discord</a>
21
+ </p>
22
+
23
+ <p align="center">
24
+ <a href="https://github.com/IAHispano/Applio-Plugins" target="_blank">🛒 Plugins</a>
25
+
26
+ <a href="https://huggingface.co/IAHispano/Applio/tree/main/Compiled" target="_blank">📦 Compiled</a>
27
+
28
+ <a href="https://applio.org/playground" target="_blank">🎮 Playground</a>
29
+
30
+ <a href="https://colab.research.google.com/github/iahispano/applio/blob/main/assets/Applio.ipynb" target="_blank">🔎 Google Colab (UI)</a>
31
+
32
+ <a href="https://colab.research.google.com/github/iahispano/applio/blob/main/assets/Applio_NoUI.ipynb" target="_blank">🔎 Google Colab (No UI)</a>
33
+ </p>
34
+
35
+ > [!NOTE]
36
+ > Applio will no longer receive frequent updates. Going forward, development will focus mainly on security patches, dependency updates, and occasional feature improvements. This is because the project is already stable and mature with limited room for further improvements. Pull requests are still welcome and will be reviewed.
37
+
38
+ ## Introduction
39
+
40
+ Applio is a powerful voice conversion tool focused on simplicity, quality, and performance. Whether you're an artist, developer, or researcher, Applio offers a straightforward platform for high-quality voice transformations. Its flexible design allows for customization through plugins and configurations, catering to a wide range of projects.
41
+
42
+ ## Terms of Use and Commercial Usage
43
+
44
+ Using Applio responsibly is essential.
45
+
46
+ - Users must respect copyrights, intellectual property, and privacy rights.
47
+ - Applio is intended for lawful and ethical purposes, including personal, academic, and investigative projects.
48
+ - Commercial usage is permitted, provided users adhere to legal and ethical guidelines, secure appropriate rights and permissions, and comply with the [MIT license](./LICENSE).
49
+
50
+ The source code and model weights in this repository are licensed under the permissive [MIT license](./LICENSE), allowing modification, redistribution, and commercial use.
51
+
52
+ However, if you choose to use this official version of Applio (as provided in this repository, without significant modification), you must also comply with our [Terms of Use](./TERMS_OF_USE.md). These terms apply to our integrations, configurations, and default project behavior, and are intended to ensure responsible and ethical use without limiting their use in any way.
53
+
54
+ For commercial use, we recommend contacting us at [support@applio.org](mailto:support@applio.org) to ensure your usage aligns with ethical standards. All audio generated with Applio must comply with applicable copyright laws. If you find Applio helpful, consider supporting its development [through a donation](https://ko-fi.com/iahispano).
55
+
56
+ By using the official version of Applio, you accept full responsibility for complying with both the MIT license and our Terms of Use. Applio and its contributors are not liable for misuse. For full legal details, see the [Terms of Use](./TERMS_OF_USE.md).
57
+
58
+ ## Getting Started
59
+
60
+ ### 1. Installation
61
+
62
+ Run the installation script based on your operating system:
63
+
64
+ - **Windows:** Double-click `run-install.bat`.
65
+ - **Linux/macOS:** Execute `run-install.sh`.
66
+
67
+ ### 2. Running Applio
68
+
69
+ Start Applio using:
70
+
71
+ - **Windows:** Double-click `run-applio.bat`.
72
+ - **Linux/macOS:** Run `run-applio.sh`.
73
+
74
+ This launches the Gradio interface in your default browser.
75
+
76
+ ### 3. Optional: TensorBoard Monitoring
77
+
78
+ To monitor training or visualize data:
79
+
80
+ - **Windows:** Run `run-tensorboard.bat`.
81
+ - **Linux/macOS:** Run `run-tensorboard.sh`.
82
+
83
+ For more detailed instructions, visit the [documentation](https://docs.applio.org).
84
+
85
+ ## References
86
+
87
+ Applio is made possible thanks to these projects and their references:
88
+
89
+ - [gradio-screen-recorder](https://huggingface.co/spaces/gstaff/gradio-screen-recorder) by gstaff
90
+ - [rvc-cli](https://github.com/blaisewf/rvc-cli) by blaisewf
91
+
92
+ ### Contributors
93
+
94
+ <a href="https://github.com/IAHispano/Applio/graphs/contributors" target="_blank">
95
+ <img src="https://contrib.rocks/image?repo=IAHispano/Applio" />
96
+ </a>
TERMS_OF_USE.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Terms of Use
2
+
3
+ ## Responsibilities of the User
4
+
5
+ By using Applio, you agree to the following responsibilities:
6
+
7
+ ### 1. Respect Intellectual Property and Privacy Rights
8
+
9
+ - Ensure that any audio or material processed through Applio is either owned by you or used with explicit permission from the rightful owner.
10
+ - Respect copyrights, intellectual property rights, and privacy rights of all individuals and entities.
11
+
12
+ ### 2. Avoid Harmful or Unethical Use
13
+
14
+ - Do not use Applio to create or distribute content that harms, defames, or infringes upon the rights of others.
15
+ - Avoid any activities that may violate ethical standards, promote hate speech, or facilitate illegal conduct.
16
+
17
+ ### 3. Adhere to Local Laws and Regulations
18
+
19
+ - Familiarize yourself with and comply with the laws and regulations governing the use of AI, voice transformation tools, and generated content in your jurisdiction.
20
+
21
+ ## Disclaimer of Liability
22
+
23
+ Applio and its contributors disclaim all liability for any misuse or unintended consequences arising from the use of this tool.
24
+
25
+ - **No Warranty**: Applio is provided "as is" without any warranty, express or implied.
26
+ - **User Responsibility**: You bear full responsibility for how you choose to use Applio and any outcomes resulting from that use.
27
+ - **No Endorsement**: Applio does not endorse or support any activities or content created with this tool that result in harm, illegal activity, or unethical practices.
28
+
29
+ ## Permitted Use Cases
30
+
31
+ Applio is designed for:
32
+
33
+ - **Personal Projects**: Experimentation and creative endeavors for personal enrichment.
34
+ - **Academic Research**: Advancing scientific understanding and education.
35
+ - **Investigative Purposes**: Analyzing data in lawful and ethical contexts.
36
+ - **Commercial Use**: Creating content for commercial purposes, provided that appropriate rights and permissions are obtained and all legal and ethical standards are adhered to.
37
+
38
+ ## Prohibited Activities
39
+
40
+ The following uses are explicitly prohibited:
41
+
42
+ - **Harmful Applications**: Generating audio to defame, harm, or manipulate others.
43
+ - **Unauthorized Distribution**: Sharing content that violates copyrights or the rights of others.
44
+ - **Deceptive Practices**: Creating content intended to deceive or defraud others.
45
+
46
+ ## Training Data
47
+
48
+ All official models distributed by Applio have been trained on publicly available datasets such as [VCTK](https://huggingface.co/datasets/IAHispano/Applio-Dataset). We strive to maintain transparency and ethical practices in the development and distribution of our tools.
49
+
50
+ ## Amendments
51
+
52
+ Applio reserves the right to modify these terms at any time. Continued use of the tool signifies your acceptance of any updated terms.
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import sys
3
+ import os
4
+ import logging
5
+
6
+ from typing import Any
7
+
8
+ DEFAULT_SERVER_NAME = "127.0.0.1"
9
+ DEFAULT_PORT = 6969
10
+ MAX_PORT_ATTEMPTS = 10
11
+
12
+ # Set up logging
13
+ logging.getLogger("uvicorn").setLevel(logging.WARNING)
14
+ logging.getLogger("httpx").setLevel(logging.WARNING)
15
+
16
+ # Add current directory to sys.path
17
+ now_dir = os.getcwd()
18
+ sys.path.append(now_dir)
19
+
20
+ # Zluda hijack
21
+ import rvc.lib.zluda
22
+
23
+ # Import Tabs
24
+ from tabs.inference.inference import inference_tab
25
+ from tabs.train.train import train_tab
26
+ from tabs.extra.extra import extra_tab
27
+ from tabs.report.report import report_tab
28
+ from tabs.download.download import download_tab
29
+ from tabs.tts.tts import tts_tab
30
+ from tabs.voice_blender.voice_blender import voice_blender_tab
31
+ from tabs.plugins.plugins import plugins_tab
32
+ from tabs.settings.settings import settings_tab
33
+ from tabs.realtime.realtime import realtime_tab
34
+
35
+ # Run prerequisites
36
+ from core import run_prerequisites_script
37
+
38
+ run_prerequisites_script(
39
+ pretraineds_hifigan=True,
40
+ models=True,
41
+ exe=True,
42
+ )
43
+
44
+ # Initialize i18n
45
+ from assets.i18n.i18n import I18nAuto
46
+
47
+ i18n = I18nAuto()
48
+
49
+ # Start Discord presence if enabled
50
+ from tabs.settings.sections.presence import load_config_presence
51
+
52
+ if load_config_presence():
53
+ from assets.discord_presence import RPCManager
54
+
55
+ RPCManager.start_presence()
56
+
57
+ # Check installation
58
+ import assets.installation_checker as installation_checker
59
+
60
+ installation_checker.check_installation()
61
+
62
+ # Load theme
63
+ import assets.themes.loadThemes as loadThemes
64
+
65
+ my_applio = loadThemes.load_theme() or "ParityError/Interstellar"
66
+
67
+ # Define Gradio interface
68
# Build the Gradio UI: a title, a short description, a links row, one tab per
# feature area, and a terms-of-use footer. The footer element is hidden via CSS.
with gr.Blocks(
    theme=my_applio, title="Applio", css="footer{display:none !important}"
) as Applio:
    gr.Markdown("# Applio")
    gr.Markdown(
        i18n(
            "A simple, high-quality voice conversion tool focused on ease of use and performance."
        )
    )
    gr.Markdown(
        i18n(
            "[Support](https://discord.gg/urxFjYmYYh) — [GitHub](https://github.com/IAHispano/Applio)"
        )
    )

    # Each tab's contents are created by the builder function imported from tabs.*;
    # the tab labels go through i18n() so they are translated.
    with gr.Tab(i18n("Inference")):
        inference_tab()

    with gr.Tab(i18n("Training")):
        train_tab()

    with gr.Tab(i18n("TTS")):
        tts_tab()

    with gr.Tab(i18n("Voice Blender")):
        voice_blender_tab()

    with gr.Tab(i18n("Realtime")):
        realtime_tab()

    with gr.Tab(i18n("Plugins")):
        plugins_tab()

    with gr.Tab(i18n("Download")):
        download_tab()

    with gr.Tab(i18n("Report a Bug")):
        report_tab()

    with gr.Tab(i18n("Extra")):
        extra_tab()

    with gr.Tab(i18n("Settings")):
        settings_tab()

    # Terms-of-use footer. The inline style uses the valid CSS declaration
    # `color: #a3a3a3` (the previous `text-color: a3a3a3` is not a CSS property
    # and the value lacked the leading `#`, so it had no effect).
    gr.Markdown(
        """
    <div style="text-align: center; font-size: 0.9em; color: #a3a3a3;">
    By using Applio, you agree to comply with ethical and legal standards, respect intellectual property and privacy rights, avoid harmful or prohibited uses, and accept full responsibility for any outcomes, while Applio disclaims liability and reserves the right to amend these terms.
    </div>
    """
    )
119
+
120
+
121
def launch_gradio(server_name: str, server_port: int) -> None:
    """Launch the Applio Gradio app on the given host and port.

    Args:
        server_name: Host name or address passed to ``Applio.launch``.
        server_port: TCP port passed to ``Applio.launch``.

    The ``share`` and ``inbrowser`` options are switched on when ``--share``
    and ``--open`` respectively appear anywhere in ``sys.argv``.
    """
    cli_args = sys.argv
    launch_options = {
        "favicon_path": "assets/ICON.ico",
        "share": "--share" in cli_args,
        "inbrowser": "--open" in cli_args,
        "server_name": server_name,
        "server_port": server_port,
    }
    Applio.launch(**launch_options)
129
+
130
+
131
def get_value_from_args(key: str, default: Any = None) -> Any:
    """Return the command-line value given for ``key``, or ``default``.

    Two spellings are recognised in ``sys.argv``:

    * ``key value`` -- the argument following ``key`` is returned as-is.
    * ``key=value`` -- the text after the ``=`` is returned.

    The space-separated form takes precedence when both are present. When
    ``key`` is the last argument (so no value follows it) and no ``key=value``
    form exists either, ``default`` is returned.

    Args:
        key: Option name including its dashes, e.g. ``"--port"``.
        default: Value returned when the option was not supplied.

    Returns:
        The raw string from the command line, or ``default`` unchanged.
    """
    argv = sys.argv

    # Space-separated form: "--port 7000".
    if key in argv:
        value_index = argv.index(key) + 1
        if value_index < len(argv):
            return argv[value_index]

    # Inline form: "--port=7000".
    prefix = key + "="
    for argument in argv:
        if argument.startswith(prefix):
            return argument[len(prefix):]

    return default
137
+
138
+
139
if __name__ == "__main__":
    # Command-line overrides; both fall back to the module-level defaults.
    port = int(get_value_from_args("--port", DEFAULT_PORT))
    server = get_value_from_args("--server-name", DEFAULT_SERVER_NAME)

    # Try up to MAX_PORT_ATTEMPTS ports, counting down from the requested one.
    # An OSError from the launch is treated as "port unavailable" and retried;
    # any other error aborts immediately.
    for attempt in range(1, MAX_PORT_ATTEMPTS + 1):
        try:
            launch_gradio(server, port)
            break
        except OSError:
            if attempt < MAX_PORT_ATTEMPTS:
                print(
                    f"Failed to launch on port {port}, trying again on port {port - 1}..."
                )
            else:
                # Last attempt: no retry follows, so do not announce one.
                print(f"Failed to launch on port {port}.")
            port -= 1
        except Exception as error:
            print(f"An error occurred launching Gradio: {error}")
            break
    else:
        # The loop ran out of attempts without a successful launch. Report it
        # and exit non-zero so callers do not mistake this for a clean shutdown.
        print(
            f"Could not launch Applio after {MAX_PORT_ATTEMPTS} attempts; no free port found."
        )
        sys.exit(1)
core.py ADDED
@@ -0,0 +1,2423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import argparse
5
+ import subprocess
6
+ from functools import lru_cache
7
+ from distutils.util import strtobool
8
+
9
+ now_dir = os.getcwd()
10
+ sys.path.append(now_dir)
11
+
12
+ current_script_directory = os.path.dirname(os.path.realpath(__file__))
13
+ logs_path = os.path.join(current_script_directory, "logs")
14
+
15
+ from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
16
+ from rvc.train.process.model_blender import model_blender
17
+ from rvc.train.process.model_information import model_information
18
+ from rvc.lib.tools.analyzer import analyze_audio
19
+ from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
20
+ from rvc.lib.tools.model_download import model_download_pipeline
21
+
22
+ python = sys.executable
23
+
24
+
25
+ # Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
26
@lru_cache(maxsize=1)  # a single cached result is enough: the file is treated as static
def load_voices_data():
    """Return the parsed contents of ``rvc/lib/tools/tts_voices.json``.

    The path is relative, so it is resolved against the current working
    directory. The file is read as UTF-8 and decoded with ``json.load``;
    the result is cached, so the file is only read on the first call.
    """
    voices_file = os.path.join("rvc", "lib", "tools", "tts_voices.json")
    with open(voices_file, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
32
+
33
+
34
# Parsed TTS voice entries, loaded once when this module is imported.
voices_data = load_voices_data()
# Distinct "ShortName" values of those entries; a set is used for
# de-duplication, so the order of the resulting list is unspecified.
locales = list(set(entry["ShortName"] for entry in voices_data))
36
+
37
+
38
@lru_cache(maxsize=None)
def import_voice_converter():
    """Create a ``VoiceConverter`` and return it, reusing it on later calls.

    ``rvc.infer.infer`` is imported inside the function, so the import only
    happens on the first call; ``lru_cache`` then hands back the same
    instance every time.
    """
    from rvc.infer.infer import VoiceConverter as _VoiceConverter

    converter = _VoiceConverter()
    return converter
43
+
44
+
45
@lru_cache(maxsize=1)
def get_config():
    """Create a ``Config`` object and return it, reusing it on later calls.

    ``rvc.configs.config`` is imported inside the function, so the import is
    deferred until the first call; the cached instance is returned afterwards.
    """
    from rvc.configs.config import Config as _Config

    config_instance = _Config()
    return config_instance
50
+
51
+
52
+ # Infer
53
def run_infer_script(
    pitch: int,
    index_rate: float,
    volume_envelope: float,
    protect: float,
    f0_method: str,
    input_path: str,
    output_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    proposed_pitch: bool,
    proposed_pitch_threshold: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert one audio file by calling ``convert_audio`` on the voice converter.

    Every argument is forwarded as a keyword argument. Most keep their name;
    the ones that are renamed on the way are:

    * ``input_path`` -> ``audio_input_path``
    * ``output_path`` -> ``audio_output_path``
    * ``pth_path`` -> sent twice, as ``model_path`` and as ``pth_path``
    * ``reverb_wet_gain`` / ``reverb_dry_gain`` -> ``reverb_wet_level`` / ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    Returns:
        A 2-tuple ``(message, path)``: a success message naming ``input_path``,
        and ``output_path`` with every ``".wav"`` occurrence replaced by
        ``"."`` plus the lower-cased ``export_format``.
    """
    converter = import_voice_converter()
    converter.convert_audio(
        audio_input_path=input_path,
        audio_output_path=output_path,
        model_path=pth_path,
        index_path=index_path,
        volume_envelope=volume_envelope,
        pitch=pitch,
        index_rate=index_rate,
        protect=protect,
        f0_method=f0_method,
        pth_path=pth_path,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        proposed_pitch=proposed_pitch,
        proposed_pitch_threshold=proposed_pitch_threshold,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        post_process=post_process,
        formant_shifting=formant_shifting,
        formant_qfrency=formant_qfrency,
        formant_timbre=formant_timbre,
        reverb=reverb,
        pitch_shift=pitch_shift,
        limiter=limiter,
        gain=gain,
        distortion=distortion,
        chorus=chorus,
        bitcrush=bitcrush,
        clipping=clipping,
        compressor=compressor,
        delay=delay,
        reverb_room_size=reverb_room_size,
        reverb_damping=reverb_damping,
        reverb_wet_level=reverb_wet_gain,
        reverb_dry_level=reverb_dry_gain,
        reverb_width=reverb_width,
        reverb_freeze_mode=reverb_freeze_mode,
        pitch_shift_semitones=pitch_shift_semitones,
        limiter_threshold=limiter_threshold,
        limiter_release=limiter_release_time,
        gain_db=gain_db,
        distortion_gain=distortion_gain,
        chorus_rate=chorus_rate,
        chorus_depth=chorus_depth,
        chorus_delay=chorus_center_delay,
        chorus_feedback=chorus_feedback,
        chorus_mix=chorus_mix,
        bitcrush_bit_depth=bitcrush_bit_depth,
        clipping_threshold=clipping_threshold,
        compressor_threshold=compressor_threshold,
        compressor_ratio=compressor_ratio,
        compressor_attack=compressor_attack,
        compressor_release=compressor_release,
        delay_seconds=delay_seconds,
        delay_feedback=delay_feedback,
        delay_mix=delay_mix,
        sid=sid,
    )

    message = f"File {input_path} inferred successfully."
    exported_path = output_path.replace(".wav", f".{export_format.lower()}")
    return message, exported_path
184
+
185
+
186
+ # Batch infer
187
def run_batch_infer_script(
    pitch: int,
    index_rate: float,
    volume_envelope: float,
    protect: float,
    f0_method: str,
    input_folder: str,
    output_folder: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    proposed_pitch: bool,
    proposed_pitch_threshold: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a folder of audio by calling ``convert_audio_batch`` on the voice converter.

    Every argument is forwarded as a keyword argument. Most keep their name;
    the ones that are renamed on the way are:

    * ``input_folder`` -> ``audio_input_paths``
    * ``output_folder`` -> ``audio_output_path``
    * ``pth_path`` -> sent twice, as ``model_path`` and as ``pth_path``
    * ``reverb_wet_gain`` / ``reverb_dry_gain`` -> ``reverb_wet_level`` / ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    Returns:
        A success message naming ``input_folder``.
    """
    converter = import_voice_converter()
    converter.convert_audio_batch(
        audio_input_paths=input_folder,
        audio_output_path=output_folder,
        model_path=pth_path,
        index_path=index_path,
        pitch=pitch,
        index_rate=index_rate,
        volume_envelope=volume_envelope,
        protect=protect,
        f0_method=f0_method,
        pth_path=pth_path,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        proposed_pitch=proposed_pitch,
        proposed_pitch_threshold=proposed_pitch_threshold,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        post_process=post_process,
        formant_shifting=formant_shifting,
        formant_qfrency=formant_qfrency,
        formant_timbre=formant_timbre,
        reverb=reverb,
        pitch_shift=pitch_shift,
        limiter=limiter,
        gain=gain,
        distortion=distortion,
        chorus=chorus,
        bitcrush=bitcrush,
        clipping=clipping,
        compressor=compressor,
        delay=delay,
        reverb_room_size=reverb_room_size,
        reverb_damping=reverb_damping,
        reverb_wet_level=reverb_wet_gain,
        reverb_dry_level=reverb_dry_gain,
        reverb_width=reverb_width,
        reverb_freeze_mode=reverb_freeze_mode,
        pitch_shift_semitones=pitch_shift_semitones,
        limiter_threshold=limiter_threshold,
        limiter_release=limiter_release_time,
        gain_db=gain_db,
        distortion_gain=distortion_gain,
        chorus_rate=chorus_rate,
        chorus_depth=chorus_depth,
        chorus_delay=chorus_center_delay,
        chorus_feedback=chorus_feedback,
        chorus_mix=chorus_mix,
        bitcrush_bit_depth=bitcrush_bit_depth,
        clipping_threshold=clipping_threshold,
        compressor_threshold=compressor_threshold,
        compressor_ratio=compressor_ratio,
        compressor_attack=compressor_attack,
        compressor_release=compressor_release,
        delay_seconds=delay_seconds,
        delay_feedback=delay_feedback,
        delay_mix=delay_mix,
        sid=sid,
    )

    return f"Files from {input_folder} inferred successfully."
317
+
318
+
319
+ # TTS
320
def run_tts_script(
    tts_file: str,
    tts_text: str,
    tts_voice: str,
    tts_rate: int,
    pitch: int,
    index_rate: float,
    volume_envelope: float,
    protect: float,
    f0_method: str,
    output_tts_path: str,
    output_rvc_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    proposed_pitch: bool,
    proposed_pitch_threshold: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    sid: int = 0,
):
    """Run the TTS helper script, then convert its output with the voice converter.

    The function works in three steps:

    1. If ``output_tts_path`` already exists and is located inside the local
       ``assets`` directory, the stale file is deleted.
    2. ``rvc/lib/tools/tts.py`` is executed in a subprocess with ``tts_file``,
       ``tts_text``, ``tts_voice``, ``tts_rate`` and ``output_tts_path`` as
       positional command-line arguments (all converted with ``str``).
    3. The file at ``output_tts_path`` is passed to ``convert_audio`` of the
       object returned by ``import_voice_converter()`` together with the
       conversion settings; every post-processing / formant option is
       passed as ``None``.

    Args:
        tts_file, tts_text, tts_voice, tts_rate: Forwarded unchanged to the
            TTS script.
        output_tts_path: Where the TTS script is told to write its audio; also
            the input of the conversion step.
        output_rvc_path: Output path handed to the conversion step.
        pth_path: Passed to the converter as ``model_path``.
        index_path, pitch, index_rate, volume_envelope, protect, f0_method,
        split_audio, f0_autotune, f0_autotune_strength, proposed_pitch,
        proposed_pitch_threshold, clean_audio, clean_strength, export_format,
        embedder_model, embedder_model_custom, sid: Forwarded to
            ``convert_audio`` under the same keyword names.

    Returns:
        tuple: ``(message, path)`` where ``path`` is ``output_rvc_path`` with
        every ``".wav"`` occurrence replaced by the lower-cased
        ``export_format`` extension.

    Note:
        The exit status of the TTS subprocess is not inspected.
    """
    tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")

    # Only delete a previous TTS file when it truly lives inside ./assets.
    # A bare ``startswith(assets_root)`` would also match sibling paths such as
    # "assets_backup/..." and delete files outside the intended directory, so
    # the prefix must end on a path separator.
    assets_root = os.path.abspath("assets")
    resolved_tts_path = os.path.abspath(output_tts_path)
    if os.path.exists(output_tts_path) and resolved_tts_path.startswith(
        assets_root + os.sep
    ):
        os.remove(output_tts_path)

    command_tts = [
        *map(
            str,
            [
                python,
                tts_script_path,
                tts_file,
                tts_text,
                tts_voice,
                tts_rate,
                output_tts_path,
            ],
        ),
    ]
    subprocess.run(command_tts)

    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(
        pitch=pitch,
        index_rate=index_rate,
        volume_envelope=volume_envelope,
        protect=protect,
        f0_method=f0_method,
        audio_input_path=output_tts_path,
        audio_output_path=output_rvc_path,
        model_path=pth_path,
        index_path=index_path,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        proposed_pitch=proposed_pitch,
        proposed_pitch_threshold=proposed_pitch_threshold,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        sid=sid,
        # Post-processing and formant options are not used for TTS output.
        formant_shifting=None,
        formant_qfrency=None,
        formant_timbre=None,
        post_process=None,
        reverb=None,
        pitch_shift=None,
        limiter=None,
        gain=None,
        distortion=None,
        chorus=None,
        bitcrush=None,
        clipping=None,
        compressor=None,
        delay=None,
        sliders=None,
    )

    return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
        ".wav", f".{export_format.lower()}"
    )
411
+
412
+
413
+ # Preprocess
414
def run_preprocess_script(
    model_name: str,
    dataset_path: str,
    sample_rate: int,
    cpu_cores: int,
    cut_preprocess: str,
    process_effects: bool,
    noise_reduction: bool,
    clean_strength: float,
    chunk_len: float,
    overlap_len: float,
    normalization_mode: str = "none",
):
    """Launch ``rvc/train/preprocess/preprocess.py`` in a subprocess.

    The script receives, in order: the model directory
    (``logs_path`` joined with ``model_name``), ``dataset_path``,
    ``sample_rate``, ``cpu_cores``, ``cut_preprocess``, ``process_effects``,
    ``noise_reduction``, ``clean_strength``, ``chunk_len``, ``overlap_len`` and
    ``normalization_mode``. Each of these values is converted with ``str``.

    Returns:
        str: A fixed confirmation message containing ``model_name``. The
        subprocess exit status is not inspected.
    """
    script = os.path.join("rvc", "train", "preprocess", "preprocess.py")
    model_dir = os.path.join(logs_path, model_name)

    script_args = (
        model_dir,
        dataset_path,
        sample_rate,
        cpu_cores,
        cut_preprocess,
        process_effects,
        noise_reduction,
        clean_strength,
        chunk_len,
        overlap_len,
        normalization_mode,
    )
    # Interpreter and script path are passed as-is; only the script
    # arguments are stringified.
    command = [python, script] + [str(arg) for arg in script_args]

    subprocess.run(command)
    return f"Model {model_name} preprocessed successfully."
450
+
451
+
452
+ # Extract
453
def run_extract_script(
    model_name: str,
    f0_method: str,
    cpu_cores: int,
    gpu: int,
    sample_rate: int,
    embedder_model: str,
    embedder_model_custom: str = None,
    include_mutes: int = 2,
):
    """Launch ``rvc/train/extract/extract.py`` in a subprocess.

    The script receives, in order: the model directory
    (``logs_path`` joined with ``model_name``), ``f0_method``, ``cpu_cores``,
    ``gpu``, ``sample_rate``, ``embedder_model``, ``embedder_model_custom`` and
    ``include_mutes``. Every value is converted with ``str``, so a missing
    custom embedder is sent as the literal text ``"None"``.

    Returns:
        str: A fixed confirmation message containing ``model_name``. The
        subprocess exit status is not inspected.
    """
    extract_script = os.path.join("rvc", "train", "extract", "extract.py")
    model_dir = os.path.join(logs_path, model_name)

    script_args = (
        model_dir,
        f0_method,
        cpu_cores,
        gpu,
        sample_rate,
        embedder_model,
        embedder_model_custom,
        include_mutes,
    )
    # Interpreter and script path are passed as-is; only the script
    # arguments are stringified.
    command_1 = [python, extract_script] + [str(arg) for arg in script_args]

    subprocess.run(command_1)

    return f"Model {model_name} extracted successfully."
488
+
489
+
490
+ # Train
491
def run_train_script(
    model_name: str,
    save_every_epoch: int,
    save_only_latest: bool,
    save_every_weights: bool,
    total_epoch: int,
    sample_rate: int,
    batch_size: int,
    gpu: int,
    overtraining_detector: bool,
    overtraining_threshold: int,
    pretrained: bool,
    cleanup: bool,
    index_algorithm: str = "Auto",
    cache_data_in_gpu: bool = False,
    custom_pretrained: bool = False,
    g_pretrained_path: str = None,
    d_pretrained_path: str = None,
    vocoder: str = "HiFi-GAN",
    checkpointing: bool = False,
):
    """Launch ``rvc/train/train.py`` in a subprocess, then build the index.

    Pretrained checkpoint selection:
        * ``pretrained`` not equal to ``True``: two empty strings are passed.
        * ``pretrained`` and not ``custom_pretrained``: the pair returned by
          ``pretrained_selector(str(vocoder), int(sample_rate))`` is passed.
        * ``pretrained`` and ``custom_pretrained``: ``g_pretrained_path`` and
          ``d_pretrained_path`` are passed.

    After the training subprocess returns, ``run_index_script`` is called with
    ``model_name`` and ``index_algorithm``.

    Returns:
        str: A fixed confirmation message containing ``model_name``. The
        subprocess exit status is not inspected.

    Raises:
        ValueError: If custom pretrained models are requested but either
            ``g_pretrained_path`` or ``d_pretrained_path`` is ``None``.
    """
    # Default: train without pretrained generator/discriminator weights.
    generator_ckpt, discriminator_ckpt = "", ""

    # The explicit comparisons against True/False are intentional: only values
    # that compare equal to the booleans take the corresponding branch.
    if pretrained == True:
        from rvc.lib.tools.pretrained_selector import pretrained_selector

        if custom_pretrained == False:
            generator_ckpt, discriminator_ckpt = pretrained_selector(
                str(vocoder), int(sample_rate)
            )
        elif g_pretrained_path is None or d_pretrained_path is None:
            raise ValueError(
                "Please provide the path to the pretrained G and D models."
            )
        else:
            generator_ckpt = g_pretrained_path
            discriminator_ckpt = d_pretrained_path

    train_script = os.path.join("rvc", "train", "train.py")
    script_args = (
        model_name,
        save_every_epoch,
        total_epoch,
        generator_ckpt,
        discriminator_ckpt,
        gpu,
        batch_size,
        sample_rate,
        save_only_latest,
        save_every_weights,
        cache_data_in_gpu,
        overtraining_detector,
        overtraining_threshold,
        cleanup,
        vocoder,
        checkpointing,
    )
    # Interpreter and script path are passed as-is; only the script
    # arguments are stringified.
    command = [python, train_script] + [str(arg) for arg in script_args]

    subprocess.run(command)
    run_index_script(model_name, index_algorithm)
    return f"Model {model_name} trained successfully."
556
+
557
+
558
+ # Index
559
def run_index_script(model_name: str, index_algorithm: str):
    """Launch ``rvc/train/process/extract_index.py`` in a subprocess.

    The script receives the model directory (``logs_path`` joined with
    ``model_name``) followed by ``index_algorithm``; neither value is
    converted before being placed on the command line.

    Returns:
        str: A fixed confirmation message containing ``model_name``. The
        subprocess exit status is not inspected.
    """
    script = os.path.join("rvc", "train", "process", "extract_index.py")
    model_dir = os.path.join(logs_path, model_name)

    subprocess.run([python, script, model_dir, index_algorithm])
    return f"Index file for {model_name} generated successfully."
570
+
571
+
572
+ # Model information
573
def run_model_information_script(pth_path: str):
    """Print and return the result of ``model_information`` for ``pth_path``.

    ``model_information`` is invoked exactly once; the same result object is
    printed and then returned, instead of running the lookup a second time.

    Args:
        pth_path: Path handed to ``model_information``.

    Returns:
        Whatever ``model_information(pth_path)`` returns.
    """
    info = model_information(pth_path)
    print(info)
    return info
576
+
577
+
578
+ # Model blender
579
def run_model_blender_script(
    model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
):
    """Call ``model_blender`` and hand back its two results as a tuple.

    Args:
        model_name, pth_path_1, pth_path_2, ratio: Forwarded positionally,
            in this order, to ``model_blender``.

    Returns:
        tuple: The two values produced by ``model_blender``, in the order it
        yields them (unpacked here as a message and the blended model).
    """
    blend_result = model_blender(model_name, pth_path_1, pth_path_2, ratio)
    # Unpacking enforces that exactly two values come back.
    status_message, blended_model = blend_result
    return status_message, blended_model
584
+
585
+
586
+ # Tensorboard
587
def run_tensorboard_script():
    """Delegate to ``launch_tensorboard_pipeline``; nothing is returned."""
    launch_tensorboard_pipeline()
589
+
590
+
591
+ # Download
592
def run_download_script(model_link: str):
    """Pass ``model_link`` to ``model_download_pipeline`` and report success.

    Returns:
        str: A fixed confirmation message; the pipeline's own return value is
        discarded.
    """
    model_download_pipeline(model_link)
    confirmation = "Model downloaded successfully."
    return confirmation
595
+
596
+
597
+ # Prerequisites
598
def run_prerequisites_script(
    pretraineds_hifigan: bool,
    models: bool,
    exe: bool,
):
    """Forward the three flags to ``prequisites_download_pipeline``.

    Args:
        pretraineds_hifigan, models, exe: Passed positionally, in this order,
            to the download pipeline.

    Returns:
        str: A fixed confirmation message; the pipeline's own return value is
        discarded.
    """
    download_flags = (pretraineds_hifigan, models, exe)
    prequisites_download_pipeline(*download_flags)
    return "Prerequisites installed successfully."
609
+
610
+
611
+ # Audio analyzer
612
def run_audio_analyzer_script(
    input_path: str, save_plot_path: str = "logs/audio_analysis.png"
):
    """Run ``analyze_audio`` on a file, print a summary and return the results.

    Args:
        input_path: First argument passed to ``analyze_audio``.
        save_plot_path: Second argument passed to ``analyze_audio``.

    Returns:
        tuple: ``(audio_info, plot_path)`` exactly as unpacked from
        ``analyze_audio``.
    """
    audio_info, plot_path = analyze_audio(input_path, save_plot_path)

    info_line = f"Audio info of {input_path}: {audio_info}"
    status_line = (
        f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}"
    )
    # Both parts go through a single print call, separated by print's default
    # separator.
    print(info_line, status_line)

    return audio_info, plot_path
621
+
622
+
623
+ # Parse arguments
624
+ def parse_arguments():
625
+ parser = argparse.ArgumentParser(
626
+ description="Run the main.py script with specific parameters."
627
+ )
628
+ subparsers = parser.add_subparsers(
629
+ title="subcommands", dest="mode", help="Choose a mode"
630
+ )
631
+
632
+ # Parser for 'infer' mode
633
+ infer_parser = subparsers.add_parser("infer", help="Run inference")
634
+ pitch_description = (
635
+ "Set the pitch of the audio. Higher values result in a higher pitch."
636
+ )
637
+ infer_parser.add_argument(
638
+ "--pitch",
639
+ type=int,
640
+ help=pitch_description,
641
+ choices=range(-24, 25),
642
+ default=0,
643
+ )
644
+ index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
645
+ infer_parser.add_argument(
646
+ "--index_rate",
647
+ type=float,
648
+ help=index_rate_description,
649
+ choices=[i / 100.0 for i in range(0, 101)],
650
+ default=0.3,
651
+ )
652
+ volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
653
+ infer_parser.add_argument(
654
+ "--volume_envelope",
655
+ type=float,
656
+ help=volume_envelope_description,
657
+ choices=[i / 100.0 for i in range(0, 101)],
658
+ default=1,
659
+ )
660
+ protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
661
+ infer_parser.add_argument(
662
+ "--protect",
663
+ type=float,
664
+ help=protect_description,
665
+ choices=[i / 1000.0 for i in range(0, 501)],
666
+ default=0.33,
667
+ )
668
+ f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
669
+ infer_parser.add_argument(
670
+ "--f0_method",
671
+ type=str,
672
+ help=f0_method_description,
673
+ choices=[
674
+ "crepe",
675
+ "crepe-tiny",
676
+ "rmvpe",
677
+ "fcpe",
678
+ "swift",
679
+ "hybrid[crepe+rmvpe]",
680
+ "hybrid[crepe+fcpe]",
681
+ "hybrid[rmvpe+fcpe]",
682
+ "hybrid[crepe+rmvpe+fcpe]",
683
+ ],
684
+ default="rmvpe",
685
+ )
686
+ infer_parser.add_argument(
687
+ "--input_path",
688
+ type=str,
689
+ help="Full path to the input audio file.",
690
+ required=True,
691
+ )
692
+ infer_parser.add_argument(
693
+ "--output_path",
694
+ type=str,
695
+ help="Full path to the output audio file.",
696
+ required=True,
697
+ )
698
+ pth_path_description = "Full path to the RVC model file (.pth)."
699
+ infer_parser.add_argument(
700
+ "--pth_path", type=str, help=pth_path_description, required=True
701
+ )
702
+ index_path_description = "Full path to the index file (.index)."
703
+ infer_parser.add_argument(
704
+ "--index_path", type=str, help=index_path_description, required=True
705
+ )
706
+ split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
707
+ infer_parser.add_argument(
708
+ "--split_audio",
709
+ type=lambda x: bool(strtobool(x)),
710
+ choices=[True, False],
711
+ help=split_audio_description,
712
+ default=False,
713
+ )
714
+ f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
715
+ infer_parser.add_argument(
716
+ "--f0_autotune",
717
+ type=lambda x: bool(strtobool(x)),
718
+ choices=[True, False],
719
+ help=f0_autotune_description,
720
+ default=False,
721
+ )
722
+ f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
723
+ infer_parser.add_argument(
724
+ "--f0_autotune_strength",
725
+ type=float,
726
+ help=f0_autotune_strength_description,
727
+ choices=[(i / 10) for i in range(11)],
728
+ default=1.0,
729
+ )
730
+ proposed_pitch_description = "Proposed Pitch"
731
+ infer_parser.add_argument(
732
+ "--proposed_pitch",
733
+ type=bool,
734
+ help=proposed_pitch_description,
735
+ choices=[True, False],
736
+ default=False,
737
+ )
738
+ proposed_pitch_threshold_description = "Proposed Pitch Threshold"
739
+ infer_parser.add_argument(
740
+ "--proposed_pitch_threshold",
741
+ type=float,
742
+ help=proposed_pitch_threshold_description,
743
+ choices=[i for i in range(50, 1200)],
744
+ default=155.0,
745
+ )
746
+ clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
747
+ infer_parser.add_argument(
748
+ "--clean_audio",
749
+ type=lambda x: bool(strtobool(x)),
750
+ choices=[True, False],
751
+ help=clean_audio_description,
752
+ default=False,
753
+ )
754
+ clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
755
+ infer_parser.add_argument(
756
+ "--clean_strength",
757
+ type=float,
758
+ help=clean_strength_description,
759
+ choices=[(i / 10) for i in range(11)],
760
+ default=0.7,
761
+ )
762
+ export_format_description = "Select the desired output audio format."
763
+ infer_parser.add_argument(
764
+ "--export_format",
765
+ type=str,
766
+ help=export_format_description,
767
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
768
+ default="WAV",
769
+ )
770
+ embedder_model_description = (
771
+ "Choose the model used for generating speaker embeddings."
772
+ )
773
+ infer_parser.add_argument(
774
+ "--embedder_model",
775
+ type=str,
776
+ help=embedder_model_description,
777
+ choices=[
778
+ "contentvec",
779
+ "spin",
780
+ "spin-v2",
781
+ "chinese-hubert-base",
782
+ "japanese-hubert-base",
783
+ "korean-hubert-base",
784
+ "custom",
785
+ ],
786
+ default="contentvec",
787
+ )
788
+ embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
789
+ infer_parser.add_argument(
790
+ "--embedder_model_custom",
791
+ type=str,
792
+ help=embedder_model_custom_description,
793
+ default=None,
794
+ )
795
+ formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
796
+ infer_parser.add_argument(
797
+ "--formant_shifting",
798
+ type=lambda x: bool(strtobool(x)),
799
+ choices=[True, False],
800
+ help=formant_shifting_description,
801
+ default=False,
802
+ required=False,
803
+ )
804
+ formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
805
+ infer_parser.add_argument(
806
+ "--formant_qfrency",
807
+ type=float,
808
+ help=formant_qfrency_description,
809
+ default=1.0,
810
+ required=False,
811
+ )
812
+ formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
813
+ infer_parser.add_argument(
814
+ "--formant_timbre",
815
+ type=float,
816
+ help=formant_timbre_description,
817
+ default=1.0,
818
+ required=False,
819
+ )
820
+ sid_description = "Speaker ID for multi-speaker models."
821
+ infer_parser.add_argument(
822
+ "--sid",
823
+ type=int,
824
+ help=sid_description,
825
+ default=0,
826
+ required=False,
827
+ )
828
+ post_process_description = "Apply post-processing effects to the output audio."
829
+ infer_parser.add_argument(
830
+ "--post_process",
831
+ type=lambda x: bool(strtobool(x)),
832
+ choices=[True, False],
833
+ help=post_process_description,
834
+ default=False,
835
+ required=False,
836
+ )
837
+ reverb_description = "Apply reverb effect to the output audio."
838
+ infer_parser.add_argument(
839
+ "--reverb",
840
+ type=lambda x: bool(strtobool(x)),
841
+ choices=[True, False],
842
+ help=reverb_description,
843
+ default=False,
844
+ required=False,
845
+ )
846
+
847
+ pitch_shift_description = "Apply pitch shifting effect to the output audio."
848
+ infer_parser.add_argument(
849
+ "--pitch_shift",
850
+ type=lambda x: bool(strtobool(x)),
851
+ choices=[True, False],
852
+ help=pitch_shift_description,
853
+ default=False,
854
+ required=False,
855
+ )
856
+
857
+ limiter_description = "Apply limiter effect to the output audio."
858
+ infer_parser.add_argument(
859
+ "--limiter",
860
+ type=lambda x: bool(strtobool(x)),
861
+ choices=[True, False],
862
+ help=limiter_description,
863
+ default=False,
864
+ required=False,
865
+ )
866
+
867
+ gain_description = "Apply gain effect to the output audio."
868
+ infer_parser.add_argument(
869
+ "--gain",
870
+ type=lambda x: bool(strtobool(x)),
871
+ choices=[True, False],
872
+ help=gain_description,
873
+ default=False,
874
+ required=False,
875
+ )
876
+
877
+ distortion_description = "Apply distortion effect to the output audio."
878
+ infer_parser.add_argument(
879
+ "--distortion",
880
+ type=lambda x: bool(strtobool(x)),
881
+ choices=[True, False],
882
+ help=distortion_description,
883
+ default=False,
884
+ required=False,
885
+ )
886
+
887
+ chorus_description = "Apply chorus effect to the output audio."
888
+ infer_parser.add_argument(
889
+ "--chorus",
890
+ type=lambda x: bool(strtobool(x)),
891
+ choices=[True, False],
892
+ help=chorus_description,
893
+ default=False,
894
+ required=False,
895
+ )
896
+
897
+ bitcrush_description = "Apply bitcrush effect to the output audio."
898
+ infer_parser.add_argument(
899
+ "--bitcrush",
900
+ type=lambda x: bool(strtobool(x)),
901
+ choices=[True, False],
902
+ help=bitcrush_description,
903
+ default=False,
904
+ required=False,
905
+ )
906
+
907
+ clipping_description = "Apply clipping effect to the output audio."
908
+ infer_parser.add_argument(
909
+ "--clipping",
910
+ type=lambda x: bool(strtobool(x)),
911
+ choices=[True, False],
912
+ help=clipping_description,
913
+ default=False,
914
+ required=False,
915
+ )
916
+
917
+ compressor_description = "Apply compressor effect to the output audio."
918
+ infer_parser.add_argument(
919
+ "--compressor",
920
+ type=lambda x: bool(strtobool(x)),
921
+ choices=[True, False],
922
+ help=compressor_description,
923
+ default=False,
924
+ required=False,
925
+ )
926
+
927
+ delay_description = "Apply delay effect to the output audio."
928
+ infer_parser.add_argument(
929
+ "--delay",
930
+ type=lambda x: bool(strtobool(x)),
931
+ choices=[True, False],
932
+ help=delay_description,
933
+ default=False,
934
+ required=False,
935
+ )
936
+
937
+ reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
938
+ infer_parser.add_argument(
939
+ "--reverb_room_size",
940
+ type=float,
941
+ help=reverb_room_size_description,
942
+ default=0.5,
943
+ required=False,
944
+ )
945
+
946
+ reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
947
+ infer_parser.add_argument(
948
+ "--reverb_damping",
949
+ type=float,
950
+ help=reverb_damping_description,
951
+ default=0.5,
952
+ required=False,
953
+ )
954
+
955
+ reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
956
+ infer_parser.add_argument(
957
+ "--reverb_wet_gain",
958
+ type=float,
959
+ help=reverb_wet_gain_description,
960
+ default=0.5,
961
+ required=False,
962
+ )
963
+
964
+ reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
965
+ infer_parser.add_argument(
966
+ "--reverb_dry_gain",
967
+ type=float,
968
+ help=reverb_dry_gain_description,
969
+ default=0.5,
970
+ required=False,
971
+ )
972
+
973
+ reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
974
+ infer_parser.add_argument(
975
+ "--reverb_width",
976
+ type=float,
977
+ help=reverb_width_description,
978
+ default=0.5,
979
+ required=False,
980
+ )
981
+
982
+ reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
983
+ infer_parser.add_argument(
984
+ "--reverb_freeze_mode",
985
+ type=float,
986
+ help=reverb_freeze_mode_description,
987
+ default=0.5,
988
+ required=False,
989
+ )
990
+
991
+ pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
992
+ infer_parser.add_argument(
993
+ "--pitch_shift_semitones",
994
+ type=float,
995
+ help=pitch_shift_semitones_description,
996
+ default=0.0,
997
+ required=False,
998
+ )
999
+
1000
+ limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect."
1001
+ infer_parser.add_argument(
1002
+ "--limiter_threshold",
1003
+ type=float,
1004
+ help=limiter_threshold_description,
1005
+ default=-6,
1006
+ required=False,
1007
+ )
1008
+
1009
+ limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
1010
+ infer_parser.add_argument(
1011
+ "--limiter_release_time",
1012
+ type=float,
1013
+ help=limiter_release_time_description,
1014
+ default=0.01,
1015
+ required=False,
1016
+ )
1017
+
1018
+ gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
1019
+ infer_parser.add_argument(
1020
+ "--gain_db",
1021
+ type=float,
1022
+ help=gain_db_description,
1023
+ default=0.0,
1024
+ required=False,
1025
+ )
1026
+
1027
+ distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
1028
+ infer_parser.add_argument(
1029
+ "--distortion_gain",
1030
+ type=float,
1031
+ help=distortion_gain_description,
1032
+ default=25,
1033
+ required=False,
1034
+ )
1035
+
1036
+ chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
1037
+ infer_parser.add_argument(
1038
+ "--chorus_rate",
1039
+ type=float,
1040
+ help=chorus_rate_description,
1041
+ default=1.0,
1042
+ required=False,
1043
+ )
1044
+
1045
+ chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
1046
+ infer_parser.add_argument(
1047
+ "--chorus_depth",
1048
+ type=float,
1049
+ help=chorus_depth_description,
1050
+ default=0.25,
1051
+ required=False,
1052
+ )
1053
+
1054
+ chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
1055
+ infer_parser.add_argument(
1056
+ "--chorus_center_delay",
1057
+ type=float,
1058
+ help=chorus_center_delay_description,
1059
+ default=7,
1060
+ required=False,
1061
+ )
1062
+
1063
+ chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
1064
+ infer_parser.add_argument(
1065
+ "--chorus_feedback",
1066
+ type=float,
1067
+ help=chorus_feedback_description,
1068
+ default=0.0,
1069
+ required=False,
1070
+ )
1071
+
1072
+ chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
1073
+ infer_parser.add_argument(
1074
+ "--chorus_mix",
1075
+ type=float,
1076
+ help=chorus_mix_description,
1077
+ default=0.5,
1078
+ required=False,
1079
+ )
1080
+
1081
+ bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect."
1082
+ infer_parser.add_argument(
1083
+ "--bitcrush_bit_depth",
1084
+ type=int,
1085
+ help=bitcrush_bit_depth_description,
1086
+ default=8,
1087
+ required=False,
1088
+ )
1089
+
1090
+ clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect."
1091
+ infer_parser.add_argument(
1092
+ "--clipping_threshold",
1093
+ type=float,
1094
+ help=clipping_threshold_description,
1095
+ default=-6,
1096
+ required=False,
1097
+ )
1098
+
1099
+ compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect."
1100
+ infer_parser.add_argument(
1101
+ "--compressor_threshold",
1102
+ type=float,
1103
+ help=compressor_threshold_description,
1104
+ default=0,
1105
+ required=False,
1106
+ )
1107
+
1108
+ compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
1109
+ infer_parser.add_argument(
1110
+ "--compressor_ratio",
1111
+ type=float,
1112
+ help=compressor_ratio_description,
1113
+ default=1,
1114
+ required=False,
1115
+ )
1116
+
1117
+ compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect."
1118
+ infer_parser.add_argument(
1119
+ "--compressor_attack",
1120
+ type=float,
1121
+ help=compressor_attack_description,
1122
+ default=1.0,
1123
+ required=False,
1124
+ )
1125
+
1126
+ compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect."
1127
+ infer_parser.add_argument(
1128
+ "--compressor_release",
1129
+ type=float,
1130
+ help=compressor_release_description,
1131
+ default=100,
1132
+ required=False,
1133
+ )
1134
+
1135
+ delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
1136
+ infer_parser.add_argument(
1137
+ "--delay_seconds",
1138
+ type=float,
1139
+ help=delay_seconds_description,
1140
+ default=0.5,
1141
+ required=False,
1142
+ )
1143
+ delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
1144
+ infer_parser.add_argument(
1145
+ "--delay_feedback",
1146
+ type=float,
1147
+ help=delay_feedback_description,
1148
+ default=0.0,
1149
+ required=False,
1150
+ )
1151
+ delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
1152
+ infer_parser.add_argument(
1153
+ "--delay_mix",
1154
+ type=float,
1155
+ help=delay_mix_description,
1156
+ default=0.5,
1157
+ required=False,
1158
+ )
1159
+
1160
+ # Parser for 'batch_infer' mode
1161
+ batch_infer_parser = subparsers.add_parser(
1162
+ "batch_infer",
1163
+ help="Run batch inference",
1164
+ )
1165
+ batch_infer_parser.add_argument(
1166
+ "--pitch",
1167
+ type=int,
1168
+ help=pitch_description,
1169
+ choices=range(-24, 25),
1170
+ default=0,
1171
+ )
1172
+ batch_infer_parser.add_argument(
1173
+ "--index_rate",
1174
+ type=float,
1175
+ help=index_rate_description,
1176
+ choices=[i / 100.0 for i in range(0, 101)],
1177
+ default=0.3,
1178
+ )
1179
+ batch_infer_parser.add_argument(
1180
+ "--volume_envelope",
1181
+ type=float,
1182
+ help=volume_envelope_description,
1183
+ choices=[i / 100.0 for i in range(0, 101)],
1184
+ default=1,
1185
+ )
1186
+ batch_infer_parser.add_argument(
1187
+ "--protect",
1188
+ type=float,
1189
+ help=protect_description,
1190
+ choices=[i / 1000.0 for i in range(0, 501)],
1191
+ default=0.33,
1192
+ )
1193
+ batch_infer_parser.add_argument(
1194
+ "--f0_method",
1195
+ type=str,
1196
+ help=f0_method_description,
1197
+ choices=[
1198
+ "crepe",
1199
+ "crepe-tiny",
1200
+ "rmvpe",
1201
+ "fcpe",
1202
+ "swift",
1203
+ "hybrid[crepe+rmvpe]",
1204
+ "hybrid[crepe+fcpe]",
1205
+ "hybrid[rmvpe+fcpe]",
1206
+ "hybrid[crepe+rmvpe+fcpe]",
1207
+ ],
1208
+ default="rmvpe",
1209
+ )
1210
+ batch_infer_parser.add_argument(
1211
+ "--input_folder",
1212
+ type=str,
1213
+ help="Path to the folder containing input audio files.",
1214
+ required=True,
1215
+ )
1216
+ batch_infer_parser.add_argument(
1217
+ "--output_folder",
1218
+ type=str,
1219
+ help="Path to the folder for saving output audio files.",
1220
+ required=True,
1221
+ )
1222
+ batch_infer_parser.add_argument(
1223
+ "--pth_path", type=str, help=pth_path_description, required=True
1224
+ )
1225
+ batch_infer_parser.add_argument(
1226
+ "--index_path", type=str, help=index_path_description, required=True
1227
+ )
1228
+ batch_infer_parser.add_argument(
1229
+ "--split_audio",
1230
+ type=lambda x: bool(strtobool(x)),
1231
+ choices=[True, False],
1232
+ help=split_audio_description,
1233
+ default=False,
1234
+ )
1235
+ batch_infer_parser.add_argument(
1236
+ "--f0_autotune",
1237
+ type=lambda x: bool(strtobool(x)),
1238
+ choices=[True, False],
1239
+ help=f0_autotune_description,
1240
+ default=False,
1241
+ )
1242
+ batch_infer_parser.add_argument(
1243
+ "--f0_autotune_strength",
1244
+ type=float,
1245
+ help=clean_strength_description,
1246
+ choices=[(i / 10) for i in range(11)],
1247
+ default=1.0,
1248
+ )
1249
+ proposed_pitch_description = "Proposed Pitch adjustment"
1250
+ batch_infer_parser.add_argument(
1251
+ "--proposed_pitch",
1252
+ type=bool,
1253
+ help=proposed_pitch_description,
1254
+ choices=[True, False],
1255
+ default=False,
1256
+ )
1257
+ proposed_pitch_threshold_description = "Proposed Pitch adjustment value"
1258
+ batch_infer_parser.add_argument(
1259
+ "--proposed_pitch_threshold",
1260
+ type=float,
1261
+ help=proposed_pitch_threshold_description,
1262
+ choices=[i for i in range(50, 1200)],
1263
+ default=155.0,
1264
+ )
1265
+ batch_infer_parser.add_argument(
1266
+ "--clean_audio",
1267
+ type=lambda x: bool(strtobool(x)),
1268
+ choices=[True, False],
1269
+ help=clean_audio_description,
1270
+ default=False,
1271
+ )
1272
+ batch_infer_parser.add_argument(
1273
+ "--clean_strength",
1274
+ type=float,
1275
+ help=clean_strength_description,
1276
+ choices=[(i / 10) for i in range(11)],
1277
+ default=0.7,
1278
+ )
1279
+ batch_infer_parser.add_argument(
1280
+ "--export_format",
1281
+ type=str,
1282
+ help=export_format_description,
1283
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
1284
+ default="WAV",
1285
+ )
1286
+ batch_infer_parser.add_argument(
1287
+ "--embedder_model",
1288
+ type=str,
1289
+ help=embedder_model_description,
1290
+ choices=[
1291
+ "contentvec",
1292
+ "spin",
1293
+ "spin-v2",
1294
+ "chinese-hubert-base",
1295
+ "japanese-hubert-base",
1296
+ "korean-hubert-base",
1297
+ "custom",
1298
+ ],
1299
+ default="contentvec",
1300
+ )
1301
+ batch_infer_parser.add_argument(
1302
+ "--embedder_model_custom",
1303
+ type=str,
1304
+ help=embedder_model_custom_description,
1305
+ default=None,
1306
+ )
1307
+ batch_infer_parser.add_argument(
1308
+ "--formant_shifting",
1309
+ type=lambda x: bool(strtobool(x)),
1310
+ choices=[True, False],
1311
+ help=formant_shifting_description,
1312
+ default=False,
1313
+ required=False,
1314
+ )
1315
+ batch_infer_parser.add_argument(
1316
+ "--formant_qfrency",
1317
+ type=float,
1318
+ help=formant_qfrency_description,
1319
+ default=1.0,
1320
+ required=False,
1321
+ )
1322
+ batch_infer_parser.add_argument(
1323
+ "--formant_timbre",
1324
+ type=float,
1325
+ help=formant_timbre_description,
1326
+ default=1.0,
1327
+ required=False,
1328
+ )
1329
+ batch_infer_parser.add_argument(
1330
+ "--sid",
1331
+ type=int,
1332
+ help=sid_description,
1333
+ default=0,
1334
+ required=False,
1335
+ )
1336
+ batch_infer_parser.add_argument(
1337
+ "--post_process",
1338
+ type=lambda x: bool(strtobool(x)),
1339
+ choices=[True, False],
1340
+ help=post_process_description,
1341
+ default=False,
1342
+ required=False,
1343
+ )
1344
+ batch_infer_parser.add_argument(
1345
+ "--reverb",
1346
+ type=lambda x: bool(strtobool(x)),
1347
+ choices=[True, False],
1348
+ help=reverb_description,
1349
+ default=False,
1350
+ required=False,
1351
+ )
1352
+
1353
+ batch_infer_parser.add_argument(
1354
+ "--pitch_shift",
1355
+ type=lambda x: bool(strtobool(x)),
1356
+ choices=[True, False],
1357
+ help=pitch_shift_description,
1358
+ default=False,
1359
+ required=False,
1360
+ )
1361
+
1362
+ batch_infer_parser.add_argument(
1363
+ "--limiter",
1364
+ type=lambda x: bool(strtobool(x)),
1365
+ choices=[True, False],
1366
+ help=limiter_description,
1367
+ default=False,
1368
+ required=False,
1369
+ )
1370
+
1371
+ batch_infer_parser.add_argument(
1372
+ "--gain",
1373
+ type=lambda x: bool(strtobool(x)),
1374
+ choices=[True, False],
1375
+ help=gain_description,
1376
+ default=False,
1377
+ required=False,
1378
+ )
1379
+
1380
+ batch_infer_parser.add_argument(
1381
+ "--distortion",
1382
+ type=lambda x: bool(strtobool(x)),
1383
+ choices=[True, False],
1384
+ help=distortion_description,
1385
+ default=False,
1386
+ required=False,
1387
+ )
1388
+
1389
+ batch_infer_parser.add_argument(
1390
+ "--chorus",
1391
+ type=lambda x: bool(strtobool(x)),
1392
+ choices=[True, False],
1393
+ help=chorus_description,
1394
+ default=False,
1395
+ required=False,
1396
+ )
1397
+
1398
+ batch_infer_parser.add_argument(
1399
+ "--bitcrush",
1400
+ type=lambda x: bool(strtobool(x)),
1401
+ choices=[True, False],
1402
+ help=bitcrush_description,
1403
+ default=False,
1404
+ required=False,
1405
+ )
1406
+
1407
+ batch_infer_parser.add_argument(
1408
+ "--clipping",
1409
+ type=lambda x: bool(strtobool(x)),
1410
+ choices=[True, False],
1411
+ help=clipping_description,
1412
+ default=False,
1413
+ required=False,
1414
+ )
1415
+
1416
+ batch_infer_parser.add_argument(
1417
+ "--compressor",
1418
+ type=lambda x: bool(strtobool(x)),
1419
+ choices=[True, False],
1420
+ help=compressor_description,
1421
+ default=False,
1422
+ required=False,
1423
+ )
1424
+
1425
+ batch_infer_parser.add_argument(
1426
+ "--delay",
1427
+ type=lambda x: bool(strtobool(x)),
1428
+ choices=[True, False],
1429
+ help=delay_description,
1430
+ default=False,
1431
+ required=False,
1432
+ )
1433
+
1434
+ batch_infer_parser.add_argument(
1435
+ "--reverb_room_size",
1436
+ type=float,
1437
+ help=reverb_room_size_description,
1438
+ default=0.5,
1439
+ required=False,
1440
+ )
1441
+
1442
+ batch_infer_parser.add_argument(
1443
+ "--reverb_damping",
1444
+ type=float,
1445
+ help=reverb_damping_description,
1446
+ default=0.5,
1447
+ required=False,
1448
+ )
1449
+
1450
+ batch_infer_parser.add_argument(
1451
+ "--reverb_wet_gain",
1452
+ type=float,
1453
+ help=reverb_wet_gain_description,
1454
+ default=0.5,
1455
+ required=False,
1456
+ )
1457
+
1458
+ batch_infer_parser.add_argument(
1459
+ "--reverb_dry_gain",
1460
+ type=float,
1461
+ help=reverb_dry_gain_description,
1462
+ default=0.5,
1463
+ required=False,
1464
+ )
1465
+
1466
+ batch_infer_parser.add_argument(
1467
+ "--reverb_width",
1468
+ type=float,
1469
+ help=reverb_width_description,
1470
+ default=0.5,
1471
+ required=False,
1472
+ )
1473
+
1474
+ batch_infer_parser.add_argument(
1475
+ "--reverb_freeze_mode",
1476
+ type=float,
1477
+ help=reverb_freeze_mode_description,
1478
+ default=0.5,
1479
+ required=False,
1480
+ )
1481
+
1482
+ batch_infer_parser.add_argument(
1483
+ "--pitch_shift_semitones",
1484
+ type=float,
1485
+ help=pitch_shift_semitones_description,
1486
+ default=0.0,
1487
+ required=False,
1488
+ )
1489
+
1490
+ batch_infer_parser.add_argument(
1491
+ "--limiter_threshold",
1492
+ type=float,
1493
+ help=limiter_threshold_description,
1494
+ default=-6,
1495
+ required=False,
1496
+ )
1497
+
1498
+ batch_infer_parser.add_argument(
1499
+ "--limiter_release_time",
1500
+ type=float,
1501
+ help=limiter_release_time_description,
1502
+ default=0.01,
1503
+ required=False,
1504
+ )
1505
+ batch_infer_parser.add_argument(
1506
+ "--gain_db",
1507
+ type=float,
1508
+ help=gain_db_description,
1509
+ default=0.0,
1510
+ required=False,
1511
+ )
1512
+
1513
+ batch_infer_parser.add_argument(
1514
+ "--distortion_gain",
1515
+ type=float,
1516
+ help=distortion_gain_description,
1517
+ default=25,
1518
+ required=False,
1519
+ )
1520
+
1521
+ batch_infer_parser.add_argument(
1522
+ "--chorus_rate",
1523
+ type=float,
1524
+ help=chorus_rate_description,
1525
+ default=1.0,
1526
+ required=False,
1527
+ )
1528
+
1529
+ batch_infer_parser.add_argument(
1530
+ "--chorus_depth",
1531
+ type=float,
1532
+ help=chorus_depth_description,
1533
+ default=0.25,
1534
+ required=False,
1535
+ )
1536
+ batch_infer_parser.add_argument(
1537
+ "--chorus_center_delay",
1538
+ type=float,
1539
+ help=chorus_center_delay_description,
1540
+ default=7,
1541
+ required=False,
1542
+ )
1543
+
1544
+ batch_infer_parser.add_argument(
1545
+ "--chorus_feedback",
1546
+ type=float,
1547
+ help=chorus_feedback_description,
1548
+ default=0.0,
1549
+ required=False,
1550
+ )
1551
+
1552
+ batch_infer_parser.add_argument(
1553
+ "--chorus_mix",
1554
+ type=float,
1555
+ help=chorus_mix_description,
1556
+ default=0.5,
1557
+ required=False,
1558
+ )
1559
+
1560
+ batch_infer_parser.add_argument(
1561
+ "--bitcrush_bit_depth",
1562
+ type=int,
1563
+ help=bitcrush_bit_depth_description,
1564
+ default=8,
1565
+ required=False,
1566
+ )
1567
+
1568
+ batch_infer_parser.add_argument(
1569
+ "--clipping_threshold",
1570
+ type=float,
1571
+ help=clipping_threshold_description,
1572
+ default=-6,
1573
+ required=False,
1574
+ )
1575
+
1576
+ batch_infer_parser.add_argument(
1577
+ "--compressor_threshold",
1578
+ type=float,
1579
+ help=compressor_threshold_description,
1580
+ default=0,
1581
+ required=False,
1582
+ )
1583
+
1584
+ batch_infer_parser.add_argument(
1585
+ "--compressor_ratio",
1586
+ type=float,
1587
+ help=compressor_ratio_description,
1588
+ default=1,
1589
+ required=False,
1590
+ )
1591
+
1592
+ batch_infer_parser.add_argument(
1593
+ "--compressor_attack",
1594
+ type=float,
1595
+ help=compressor_attack_description,
1596
+ default=1.0,
1597
+ required=False,
1598
+ )
1599
+
1600
+ batch_infer_parser.add_argument(
1601
+ "--compressor_release",
1602
+ type=float,
1603
+ help=compressor_release_description,
1604
+ default=100,
1605
+ required=False,
1606
+ )
1607
+ batch_infer_parser.add_argument(
1608
+ "--delay_seconds",
1609
+ type=float,
1610
+ help=delay_seconds_description,
1611
+ default=0.5,
1612
+ required=False,
1613
+ )
1614
+ batch_infer_parser.add_argument(
1615
+ "--delay_feedback",
1616
+ type=float,
1617
+ help=delay_feedback_description,
1618
+ default=0.0,
1619
+ required=False,
1620
+ )
1621
+ batch_infer_parser.add_argument(
1622
+ "--delay_mix",
1623
+ type=float,
1624
+ help=delay_mix_description,
1625
+ default=0.5,
1626
+ required=False,
1627
+ )
1628
+
1629
+ # Parser for 'tts' mode
1630
+ tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
1631
+ tts_parser.add_argument(
1632
+ "--tts_file", type=str, help="File with a text to be synthesized", required=True
1633
+ )
1634
+ tts_parser.add_argument(
1635
+ "--tts_text", type=str, help="Text to be synthesized", required=True
1636
+ )
1637
+ tts_parser.add_argument(
1638
+ "--tts_voice",
1639
+ type=str,
1640
+ help="Voice to be used for TTS synthesis.",
1641
+ choices=locales,
1642
+ required=True,
1643
+ )
1644
+ tts_parser.add_argument(
1645
+ "--tts_rate",
1646
+ type=int,
1647
+ help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
1648
+ choices=range(-100, 101),
1649
+ default=0,
1650
+ )
1651
+ tts_parser.add_argument(
1652
+ "--pitch",
1653
+ type=int,
1654
+ help=pitch_description,
1655
+ choices=range(-24, 25),
1656
+ default=0,
1657
+ )
1658
+ tts_parser.add_argument(
1659
+ "--index_rate",
1660
+ type=float,
1661
+ help=index_rate_description,
1662
+ choices=[(i / 10) for i in range(11)],
1663
+ default=0.3,
1664
+ )
1665
+ tts_parser.add_argument(
1666
+ "--volume_envelope",
1667
+ type=float,
1668
+ help=volume_envelope_description,
1669
+ choices=[(i / 10) for i in range(11)],
1670
+ default=1,
1671
+ )
1672
+ tts_parser.add_argument(
1673
+ "--protect",
1674
+ type=float,
1675
+ help=protect_description,
1676
+ choices=[(i / 10) for i in range(6)],
1677
+ default=0.33,
1678
+ )
1679
+ tts_parser.add_argument(
1680
+ "--f0_method",
1681
+ type=str,
1682
+ help=f0_method_description,
1683
+ choices=[
1684
+ "crepe",
1685
+ "crepe-tiny",
1686
+ "rmvpe",
1687
+ "fcpe",
1688
+ "swift",
1689
+ "hybrid[crepe+rmvpe]",
1690
+ "hybrid[crepe+fcpe]",
1691
+ "hybrid[rmvpe+fcpe]",
1692
+ "hybrid[crepe+rmvpe+fcpe]",
1693
+ ],
1694
+ default="rmvpe",
1695
+ )
1696
+ tts_parser.add_argument(
1697
+ "--output_tts_path",
1698
+ type=str,
1699
+ help="Full path to save the synthesized TTS audio.",
1700
+ required=True,
1701
+ )
1702
+ tts_parser.add_argument(
1703
+ "--output_rvc_path",
1704
+ type=str,
1705
+ help="Full path to save the voice-converted audio using the synthesized TTS.",
1706
+ required=True,
1707
+ )
1708
+ tts_parser.add_argument(
1709
+ "--pth_path", type=str, help=pth_path_description, required=True
1710
+ )
1711
+ tts_parser.add_argument(
1712
+ "--index_path", type=str, help=index_path_description, required=True
1713
+ )
1714
+ tts_parser.add_argument(
1715
+ "--split_audio",
1716
+ type=lambda x: bool(strtobool(x)),
1717
+ choices=[True, False],
1718
+ help=split_audio_description,
1719
+ default=False,
1720
+ )
1721
+ tts_parser.add_argument(
1722
+ "--f0_autotune",
1723
+ type=lambda x: bool(strtobool(x)),
1724
+ choices=[True, False],
1725
+ help=f0_autotune_description,
1726
+ default=False,
1727
+ )
1728
+ tts_parser.add_argument(
1729
+ "--f0_autotune_strength",
1730
+ type=float,
1731
+ help=clean_strength_description,
1732
+ choices=[(i / 10) for i in range(11)],
1733
+ default=1.0,
1734
+ )
1735
+ proposed_pitch_description = "Proposed Pitch adjustment"
1736
+ tts_parser.add_argument(
1737
+ "--proposed_pitch",
1738
+ type=bool,
1739
+ help=proposed_pitch_description,
1740
+ choices=[True, False],
1741
+ default=False,
1742
+ )
1743
+ proposed_pitch_threshold_description = "Proposed Pitch adjustment value"
1744
+ tts_parser.add_argument(
1745
+ "--proposed_pitch_threshold",
1746
+ type=float,
1747
+ help=proposed_pitch_threshold_description,
1748
+ choices=[i for i in range(100, 500)],
1749
+ default=155.0,
1750
+ )
1751
+ tts_parser.add_argument(
1752
+ "--clean_audio",
1753
+ type=lambda x: bool(strtobool(x)),
1754
+ choices=[True, False],
1755
+ help=clean_audio_description,
1756
+ default=False,
1757
+ )
1758
+ tts_parser.add_argument(
1759
+ "--clean_strength",
1760
+ type=float,
1761
+ help=clean_strength_description,
1762
+ choices=[(i / 10) for i in range(11)],
1763
+ default=0.7,
1764
+ )
1765
+ tts_parser.add_argument(
1766
+ "--export_format",
1767
+ type=str,
1768
+ help=export_format_description,
1769
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
1770
+ default="WAV",
1771
+ )
1772
+ tts_parser.add_argument(
1773
+ "--embedder_model",
1774
+ type=str,
1775
+ help=embedder_model_description,
1776
+ choices=[
1777
+ "contentvec",
1778
+ "spin",
1779
+ "spin-v2",
1780
+ "chinese-hubert-base",
1781
+ "japanese-hubert-base",
1782
+ "korean-hubert-base",
1783
+ "custom",
1784
+ ],
1785
+ default="contentvec",
1786
+ )
1787
+ tts_parser.add_argument(
1788
+ "--embedder_model_custom",
1789
+ type=str,
1790
+ help=embedder_model_custom_description,
1791
+ default=None,
1792
+ )
1793
+
1794
+ # Parser for 'preprocess' mode
1795
+ preprocess_parser = subparsers.add_parser(
1796
+ "preprocess", help="Preprocess a dataset for training."
1797
+ )
1798
+ preprocess_parser.add_argument(
1799
+ "--model_name", type=str, help="Name of the model to be trained.", required=True
1800
+ )
1801
+ preprocess_parser.add_argument(
1802
+ "--dataset_path", type=str, help="Path to the dataset directory.", required=True
1803
+ )
1804
+ preprocess_parser.add_argument(
1805
+ "--sample_rate",
1806
+ type=int,
1807
+ help="Target sampling rate for the audio data.",
1808
+ choices=[32000, 40000, 48000],
1809
+ required=True,
1810
+ )
1811
+ preprocess_parser.add_argument(
1812
+ "--cpu_cores",
1813
+ type=int,
1814
+ help="Number of CPU cores to use for preprocessing.",
1815
+ choices=range(1, 65),
1816
+ )
1817
+ preprocess_parser.add_argument(
1818
+ "--cut_preprocess",
1819
+ type=str,
1820
+ choices=["Skip", "Simple", "Automatic"],
1821
+ help="Cut the dataset into smaller segments for faster preprocessing.",
1822
+ default="Automatic",
1823
+ required=True,
1824
+ )
1825
+ preprocess_parser.add_argument(
1826
+ "--process_effects",
1827
+ type=lambda x: bool(strtobool(x)),
1828
+ choices=[True, False],
1829
+ help="Disable all filters during preprocessing.",
1830
+ default=False,
1831
+ required=False,
1832
+ )
1833
+ preprocess_parser.add_argument(
1834
+ "--noise_reduction",
1835
+ type=lambda x: bool(strtobool(x)),
1836
+ choices=[True, False],
1837
+ help="Enable noise reduction during preprocessing.",
1838
+ default=False,
1839
+ required=False,
1840
+ )
1841
+ preprocess_parser.add_argument(
1842
+ "--noise_reduction_strength",
1843
+ type=float,
1844
+ help="Strength of the noise reduction filter.",
1845
+ choices=[(i / 10) for i in range(11)],
1846
+ default=0.7,
1847
+ required=False,
1848
+ )
1849
+ preprocess_parser.add_argument(
1850
+ "--chunk_len",
1851
+ type=float,
1852
+ help="Chunk length.",
1853
+ choices=[i * 0.5 for i in range(1, 11)],
1854
+ default=3.0,
1855
+ required=False,
1856
+ )
1857
+ preprocess_parser.add_argument(
1858
+ "--overlap_len",
1859
+ type=float,
1860
+ help="Overlap length.",
1861
+ choices=[0.0, 0.1, 0.2, 0.3, 0.4],
1862
+ default=0.3,
1863
+ required=False,
1864
+ )
1865
+ preprocess_parser.add_argument(
1866
+ "--normalization_mode",
1867
+ type=str,
1868
+ help="Normalization mode.",
1869
+ choices=["none", "pre", "post"],
1870
+ default="none",
1871
+ required=False,
1872
+ )
1873
+
1874
+ # Parser for 'extract' mode
1875
+ extract_parser = subparsers.add_parser(
1876
+ "extract", help="Extract features from a dataset."
1877
+ )
1878
+ extract_parser.add_argument(
1879
+ "--model_name", type=str, help="Name of the model.", required=True
1880
+ )
1881
+ extract_parser.add_argument(
1882
+ "--f0_method",
1883
+ type=str,
1884
+ help="Pitch extraction method to use.",
1885
+ choices=[
1886
+ "crepe",
1887
+ "crepe-tiny",
1888
+ "rmvpe",
1889
+ "fcpe",
1890
+ ],
1891
+ default="rmvpe",
1892
+ )
1893
+ extract_parser.add_argument(
1894
+ "--cpu_cores",
1895
+ type=int,
1896
+ help="Number of CPU cores to use for feature extraction (optional).",
1897
+ choices=range(1, 65),
1898
+ default=None,
1899
+ )
1900
+ extract_parser.add_argument(
1901
+ "--gpu",
1902
+ type=str,
1903
+ help="GPU device to use for feature extraction (optional).",
1904
+ default="-",
1905
+ )
1906
+ extract_parser.add_argument(
1907
+ "--sample_rate",
1908
+ type=int,
1909
+ help="Target sampling rate for the audio data.",
1910
+ choices=[32000, 40000, 44100, 48000],
1911
+ required=True,
1912
+ )
1913
+ extract_parser.add_argument(
1914
+ "--embedder_model",
1915
+ type=str,
1916
+ help=embedder_model_description,
1917
+ choices=[
1918
+ "contentvec",
1919
+ "spin",
1920
+ "spin-v2",
1921
+ "chinese-hubert-base",
1922
+ "japanese-hubert-base",
1923
+ "korean-hubert-base",
1924
+ "custom",
1925
+ ],
1926
+ default="contentvec",
1927
+ )
1928
+ extract_parser.add_argument(
1929
+ "--embedder_model_custom",
1930
+ type=str,
1931
+ help=embedder_model_custom_description,
1932
+ default=None,
1933
+ )
1934
+ extract_parser.add_argument(
1935
+ "--include_mutes",
1936
+ type=int,
1937
+ help="Number of silent files to include.",
1938
+ choices=range(0, 11),
1939
+ default=2,
1940
+ required=True,
1941
+ )
1942
+
1943
+ # Parser for 'train' mode
1944
+ train_parser = subparsers.add_parser("train", help="Train an RVC model.")
1945
+ train_parser.add_argument(
1946
+ "--model_name", type=str, help="Name of the model to be trained.", required=True
1947
+ )
1948
+ train_parser.add_argument(
1949
+ "--vocoder",
1950
+ type=str,
1951
+ help="Vocoder name",
1952
+ choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
1953
+ default="HiFi-GAN",
1954
+ )
1955
+ train_parser.add_argument(
1956
+ "--checkpointing",
1957
+ type=lambda x: bool(strtobool(x)),
1958
+ choices=[True, False],
1959
+ help="Enables memory-efficient training.",
1960
+ default=False,
1961
+ required=False,
1962
+ )
1963
+ train_parser.add_argument(
1964
+ "--save_every_epoch",
1965
+ type=int,
1966
+ help="Save the model every specified number of epochs.",
1967
+ choices=range(1, 101),
1968
+ required=True,
1969
+ )
1970
+ train_parser.add_argument(
1971
+ "--save_only_latest",
1972
+ type=lambda x: bool(strtobool(x)),
1973
+ choices=[True, False],
1974
+ help="Save only the latest model checkpoint.",
1975
+ default=False,
1976
+ )
1977
+ train_parser.add_argument(
1978
+ "--save_every_weights",
1979
+ type=lambda x: bool(strtobool(x)),
1980
+ choices=[True, False],
1981
+ help="Save model weights every epoch.",
1982
+ default=True,
1983
+ )
1984
+ train_parser.add_argument(
1985
+ "--total_epoch",
1986
+ type=int,
1987
+ help="Total number of epochs to train for.",
1988
+ choices=range(1, 10001),
1989
+ default=1000,
1990
+ )
1991
+ train_parser.add_argument(
1992
+ "--sample_rate",
1993
+ type=int,
1994
+ help="Sampling rate of the training data.",
1995
+ choices=[32000, 40000, 48000],
1996
+ required=True,
1997
+ )
1998
+ train_parser.add_argument(
1999
+ "--batch_size",
2000
+ type=int,
2001
+ help="Batch size for training.",
2002
+ choices=range(1, 51),
2003
+ default=8,
2004
+ )
2005
+ train_parser.add_argument(
2006
+ "--gpu",
2007
+ type=str,
2008
+ help="GPU device to use for training (e.g., '0').",
2009
+ default="0",
2010
+ )
2011
+ train_parser.add_argument(
2012
+ "--pretrained",
2013
+ type=lambda x: bool(strtobool(x)),
2014
+ choices=[True, False],
2015
+ help="Use a pretrained model for initialization.",
2016
+ default=True,
2017
+ )
2018
+ train_parser.add_argument(
2019
+ "--custom_pretrained",
2020
+ type=lambda x: bool(strtobool(x)),
2021
+ choices=[True, False],
2022
+ help="Use a custom pretrained model.",
2023
+ default=False,
2024
+ )
2025
+ train_parser.add_argument(
2026
+ "--g_pretrained_path",
2027
+ type=str,
2028
+ nargs="?",
2029
+ default=None,
2030
+ help="Path to the pretrained generator model file.",
2031
+ )
2032
+ train_parser.add_argument(
2033
+ "--d_pretrained_path",
2034
+ type=str,
2035
+ nargs="?",
2036
+ default=None,
2037
+ help="Path to the pretrained discriminator model file.",
2038
+ )
2039
+ train_parser.add_argument(
2040
+ "--overtraining_detector",
2041
+ type=lambda x: bool(strtobool(x)),
2042
+ choices=[True, False],
2043
+ help="Enable overtraining detection.",
2044
+ default=False,
2045
+ )
2046
+ train_parser.add_argument(
2047
+ "--overtraining_threshold",
2048
+ type=int,
2049
+ help="Threshold for overtraining detection.",
2050
+ choices=range(1, 101),
2051
+ default=50,
2052
+ )
2053
+ train_parser.add_argument(
2054
+ "--cleanup",
2055
+ type=lambda x: bool(strtobool(x)),
2056
+ choices=[True, False],
2057
+ help="Cleanup previous training attempt.",
2058
+ default=False,
2059
+ )
2060
+ train_parser.add_argument(
2061
+ "--cache_data_in_gpu",
2062
+ type=lambda x: bool(strtobool(x)),
2063
+ choices=[True, False],
2064
+ help="Cache training data in GPU memory.",
2065
+ default=False,
2066
+ )
2067
+ train_parser.add_argument(
2068
+ "--index_algorithm",
2069
+ type=str,
2070
+ choices=["Auto", "Faiss", "KMeans"],
2071
+ help="Choose the method for generating the index file.",
2072
+ default="Auto",
2073
+ required=False,
2074
+ )
2075
+
2076
+ # Parser for 'index' mode
2077
+ index_parser = subparsers.add_parser(
2078
+ "index", help="Generate an index file for an RVC model."
2079
+ )
2080
+ index_parser.add_argument(
2081
+ "--model_name", type=str, help="Name of the model.", required=True
2082
+ )
2083
+ index_parser.add_argument(
2084
+ "--index_algorithm",
2085
+ type=str,
2086
+ choices=["Auto", "Faiss", "KMeans"],
2087
+ help="Choose the method for generating the index file.",
2088
+ default="Auto",
2089
+ required=False,
2090
+ )
2091
+
2092
+ # Parser for 'model_information' mode
2093
+ model_information_parser = subparsers.add_parser(
2094
+ "model_information", help="Display information about a trained model."
2095
+ )
2096
+ model_information_parser.add_argument(
2097
+ "--pth_path", type=str, help="Path to the .pth model file.", required=True
2098
+ )
2099
+
2100
+ # Parser for 'model_blender' mode
2101
+ model_blender_parser = subparsers.add_parser(
2102
+ "model_blender", help="Fuse two RVC models together."
2103
+ )
2104
+ model_blender_parser.add_argument(
2105
+ "--model_name", type=str, help="Name of the new fused model.", required=True
2106
+ )
2107
+ model_blender_parser.add_argument(
2108
+ "--pth_path_1",
2109
+ type=str,
2110
+ help="Path to the first .pth model file.",
2111
+ required=True,
2112
+ )
2113
+ model_blender_parser.add_argument(
2114
+ "--pth_path_2",
2115
+ type=str,
2116
+ help="Path to the second .pth model file.",
2117
+ required=True,
2118
+ )
2119
+ model_blender_parser.add_argument(
2120
+ "--ratio",
2121
+ type=float,
2122
+ help="Ratio for blending the two models (0.0 to 1.0).",
2123
+ choices=[(i / 10) for i in range(11)],
2124
+ default=0.5,
2125
+ )
2126
+
2127
+ # Parser for 'tensorboard' mode
2128
+ subparsers.add_parser(
2129
+ "tensorboard", help="Launch TensorBoard for monitoring training progress."
2130
+ )
2131
+
2132
+ # Parser for 'download' mode
2133
+ download_parser = subparsers.add_parser(
2134
+ "download", help="Download a model from a provided link."
2135
+ )
2136
+ download_parser.add_argument(
2137
+ "--model_link", type=str, help="Direct link to the model file.", required=True
2138
+ )
2139
+
2140
+ # Parser for 'prerequisites' mode
2141
+ prerequisites_parser = subparsers.add_parser(
2142
+ "prerequisites", help="Install prerequisites for RVC."
2143
+ )
2144
+ prerequisites_parser.add_argument(
2145
+ "--pretraineds_hifigan",
2146
+ type=lambda x: bool(strtobool(x)),
2147
+ choices=[True, False],
2148
+ default=True,
2149
+ help="Download pretrained models for RVC v2.",
2150
+ )
2151
+ prerequisites_parser.add_argument(
2152
+ "--models",
2153
+ type=lambda x: bool(strtobool(x)),
2154
+ choices=[True, False],
2155
+ default=True,
2156
+ help="Download additional models.",
2157
+ )
2158
+ prerequisites_parser.add_argument(
2159
+ "--exe",
2160
+ type=lambda x: bool(strtobool(x)),
2161
+ choices=[True, False],
2162
+ default=True,
2163
+ help="Download required executables.",
2164
+ )
2165
+
2166
+ # Parser for 'audio_analyzer' mode
2167
+ audio_analyzer = subparsers.add_parser(
2168
+ "audio_analyzer", help="Analyze an audio file."
2169
+ )
2170
+ audio_analyzer.add_argument(
2171
+ "--input_path", type=str, help="Path to the input audio file.", required=True
2172
+ )
2173
+
2174
+ return parser.parse_args()
2175
+
2176
+
2177
# Options that the "infer" and "batch_infer" sub-commands forward unchanged.
# Each name is both an attribute of the parsed argument namespace and a keyword
# argument of the corresponding run_* function.
_SHARED_INFERENCE_OPTIONS = (
    "pitch",
    "index_rate",
    "volume_envelope",
    "protect",
    "f0_method",
    "pth_path",
    "index_path",
    "split_audio",
    "f0_autotune",
    "f0_autotune_strength",
    "proposed_pitch",
    "proposed_pitch_threshold",
    "clean_audio",
    "clean_strength",
    "export_format",
    "embedder_model",
    "embedder_model_custom",
    "formant_shifting",
    "formant_qfrency",
    "formant_timbre",
    "sid",
    "post_process",
    "reverb",
    "pitch_shift",
    "limiter",
    "gain",
    "distortion",
    "chorus",
    "bitcrush",
    "clipping",
    "compressor",
    "delay",
    "reverb_room_size",
    "reverb_damping",
    "reverb_wet_gain",
    "reverb_dry_gain",
    "reverb_width",
    "reverb_freeze_mode",
    "pitch_shift_semitones",
    "limiter_threshold",
    "limiter_release_time",
    "gain_db",
    "distortion_gain",
    "chorus_rate",
    "chorus_depth",
    "chorus_center_delay",
    "chorus_feedback",
    "chorus_mix",
    "bitcrush_bit_depth",
    "clipping_threshold",
    "compressor_threshold",
    "compressor_ratio",
    "compressor_attack",
    "compressor_release",
    "delay_seconds",
    "delay_feedback",
    "delay_mix",
)


def _shared_inference_kwargs(args):
    """Return the options common to "infer" and "batch_infer" as keyword arguments."""
    return {name: getattr(args, name) for name in _SHARED_INFERENCE_OPTIONS}


def main():
    """Command-line entry point: parse the arguments and run the selected mode.

    The sub-command name is read from ``args.mode`` and the matching ``run_*``
    function is called with the parsed options as keyword arguments.

    Exit status:
        1 when the script is started without any argument (a usage hint is
        printed), and 1 when the selected sub-command raises an exception (the
        message and traceback are printed first). Previously a failing
        sub-command still ended with status 0, so shell scripts and other
        callers could not detect the failure.
    """
    if len(sys.argv) == 1:
        print("Please run the script with '-h' for more information.")
        sys.exit(1)

    args = parse_arguments()

    try:
        if args.mode == "infer":
            run_infer_script(
                input_path=args.input_path,
                output_path=args.output_path,
                **_shared_inference_kwargs(args),
            )
        elif args.mode == "batch_infer":
            run_batch_infer_script(
                input_folder=args.input_folder,
                output_folder=args.output_folder,
                **_shared_inference_kwargs(args),
            )
        elif args.mode == "tts":
            run_tts_script(
                tts_file=args.tts_file,
                tts_text=args.tts_text,
                tts_voice=args.tts_voice,
                tts_rate=args.tts_rate,
                pitch=args.pitch,
                index_rate=args.index_rate,
                volume_envelope=args.volume_envelope,
                protect=args.protect,
                f0_method=args.f0_method,
                output_tts_path=args.output_tts_path,
                output_rvc_path=args.output_rvc_path,
                pth_path=args.pth_path,
                index_path=args.index_path,
                split_audio=args.split_audio,
                f0_autotune=args.f0_autotune,
                f0_autotune_strength=args.f0_autotune_strength,
                proposed_pitch=args.proposed_pitch,
                proposed_pitch_threshold=args.proposed_pitch_threshold,
                clean_audio=args.clean_audio,
                clean_strength=args.clean_strength,
                export_format=args.export_format,
                embedder_model=args.embedder_model,
                embedder_model_custom=args.embedder_model_custom,
            )
        elif args.mode == "preprocess":
            run_preprocess_script(
                model_name=args.model_name,
                dataset_path=args.dataset_path,
                sample_rate=args.sample_rate,
                cpu_cores=args.cpu_cores,
                cut_preprocess=args.cut_preprocess,
                process_effects=args.process_effects,
                noise_reduction=args.noise_reduction,
                # The CLI flag is --noise_reduction_strength; the callee's
                # keyword is clean_strength.
                clean_strength=args.noise_reduction_strength,
                chunk_len=args.chunk_len,
                overlap_len=args.overlap_len,
                normalization_mode=args.normalization_mode,
            )
        elif args.mode == "extract":
            run_extract_script(
                model_name=args.model_name,
                f0_method=args.f0_method,
                cpu_cores=args.cpu_cores,
                gpu=args.gpu,
                sample_rate=args.sample_rate,
                embedder_model=args.embedder_model,
                embedder_model_custom=args.embedder_model_custom,
                include_mutes=args.include_mutes,
            )
        elif args.mode == "train":
            run_train_script(
                model_name=args.model_name,
                save_every_epoch=args.save_every_epoch,
                save_only_latest=args.save_only_latest,
                save_every_weights=args.save_every_weights,
                total_epoch=args.total_epoch,
                sample_rate=args.sample_rate,
                batch_size=args.batch_size,
                gpu=args.gpu,
                overtraining_detector=args.overtraining_detector,
                overtraining_threshold=args.overtraining_threshold,
                pretrained=args.pretrained,
                custom_pretrained=args.custom_pretrained,
                cleanup=args.cleanup,
                index_algorithm=args.index_algorithm,
                cache_data_in_gpu=args.cache_data_in_gpu,
                g_pretrained_path=args.g_pretrained_path,
                d_pretrained_path=args.d_pretrained_path,
                vocoder=args.vocoder,
                checkpointing=args.checkpointing,
            )
        elif args.mode == "index":
            run_index_script(
                model_name=args.model_name,
                index_algorithm=args.index_algorithm,
            )
        elif args.mode == "model_information":
            run_model_information_script(
                pth_path=args.pth_path,
            )
        elif args.mode == "model_blender":
            run_model_blender_script(
                model_name=args.model_name,
                pth_path_1=args.pth_path_1,
                pth_path_2=args.pth_path_2,
                ratio=args.ratio,
            )
        elif args.mode == "tensorboard":
            run_tensorboard_script()
        elif args.mode == "download":
            run_download_script(
                model_link=args.model_link,
            )
        elif args.mode == "prerequisites":
            run_prerequisites_script(
                pretraineds_hifigan=args.pretraineds_hifigan,
                models=args.models,
                exe=args.exe,
            )
        elif args.mode == "audio_analyzer":
            run_audio_analyzer_script(
                input_path=args.input_path,
            )
    except Exception as error:
        print(f"An error occurred during execution: {error}")

        import traceback

        traceback.print_exc()
        # Report the failure to the calling process instead of exiting with 0.
        sys.exit(1)
2420
+
2421
+
2422
+ if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
2423
+ main()
docker-compose.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose file for running Applio in a container with one NVIDIA GPU.
# The top-level "version" key is intentionally omitted: it is obsolete in the
# Compose Specification, and the value '1' never described a file that uses a
# "services" section.

services:
  applio:
    build:
      context: ./
      dockerfile: Dockerfile
    ports:
      # HOST:CONTAINER. A bare "6969" would publish the container port on a
      # random host port, so the web interface would not be reachable at :6969.
      - "6969:6969"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve a single NVIDIA GPU for the service.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
requirements.txt ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ pip>=23.3; sys_platform == 'darwin'
3
+ wheel; sys_platform == 'darwin'
4
+ PyYAML; sys_platform == 'darwin'
5
+ numpy==1.26.4
6
+ requests>=2.31.0,<2.32.0
7
+ tqdm
8
+ wget
9
+
10
+ # Audio processing
11
+ ffmpeg-python>=0.2.0
12
+ faiss-cpu==1.7.3
13
+ librosa==0.11.0
14
+ scipy==1.11.1
15
+ soundfile==0.12.1
16
+ noisereduce
17
+ pedalboard
18
+ stftpitchshift
19
+ soxr
20
+
21
+ # Machine learning and deep learning
22
+ omegaconf>=2.0.6; sys_platform == 'darwin'
23
+ numba; sys_platform == 'linux'
24
+ numba==0.61.0; sys_platform == 'darwin' or sys_platform == 'win32'
25
+ torch==2.7.1; sys_platform == 'darwin'
26
+ torch==2.7.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
27
+ torchaudio==2.7.1; sys_platform == 'darwin'
28
+ torchaudio==2.7.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
29
+ torchvision==0.22.1; sys_platform == 'darwin'
30
+ torchvision==0.22.1+cu128; sys_platform == 'linux' or sys_platform == 'win32'
31
+ torchcrepe==0.0.23
32
+ torchfcpe
33
+ swift_f0
34
+ einops
35
+ transformers==4.44.2
36
+
37
+ # Visualization and UI
38
+ matplotlib==3.7.2
39
+ tensorboard
40
+ gradio==5.23.1
41
+
42
+ # Miscellaneous utilities
43
+ certifi>=2023.07.22; sys_platform == 'darwin'
44
+ antlr4-python3-runtime==4.8; sys_platform == 'darwin'
45
+ tensorboardX
46
+ edge-tts==7.2.0
47
+ pypresence
48
+ beautifulsoup4
49
+ sounddevice
50
+ webrtcvad
run-applio.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
#!/bin/sh
# Launch the Applio web interface from the project's virtual environment.

# Set the terminal window title.
printf "\033]0;Applio\007"

# .venv and app.py live next to this script; switch there so the launcher
# works no matter which directory it is started from.
cd "$(dirname "$0")" || exit 1

if [ ! -f .venv/bin/activate ]; then
    echo "Virtual environment not found at .venv. Run run-install.sh first." >&2
    exit 1
fi
. .venv/bin/activate

# PyTorch settings for Apple Silicon (MPS): fall back to the CPU for operators
# MPS does not implement, and remove the upper limit on MPS memory allocations.
export PYTORCH_ENABLE_MPS_FALLBACK=1
export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

clear
python app.py --open
run-install.sh ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -e # Exit immediately if a command exits with a non-zero status
3
+
4
+ printf "\033]0;Installer\007"
5
+ clear
6
+ rm -f *.bat
7
+
8
# Print a message to stdout, prefixed with the current local date and time.
#   $1 - text to log
log_message() {
    local text="$1"
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$text"
}
13
+
14
# Function to find a suitable Python launcher.
# Prints the name of the first launcher found on PATH (python3.11, then
# python3, then python) to stdout and returns 0.
# If none is found, logs an error to stderr and exits with status 1. When the
# function is called inside $(...), that exit only ends the subshell and
# surfaces to the caller as a non-zero status.
find_python() {
    local candidate
    for candidate in python3.11 python3 python; do
        if command -v "$candidate" > /dev/null 2>&1; then
            echo "$candidate"
            return 0
        fi
    done
    # Send the diagnostic to stderr: callers capture stdout with $(find_python),
    # so a message on stdout would be swallowed and never shown to the user.
    log_message "No compatible Python installation found. Please install Python 3.11." >&2
    exit 1
}
25
+
26
# Function to install FFmpeg with the first available package manager.
# Tries Homebrew, apt, pacman and dnf in that order; if dnf fails, or if no
# known package manager is present, it falls back to install_ffmpeg_flatpak.
# Does nothing when an ffmpeg executable is already on PATH.
install_ffmpeg() {
    # Skip the package manager (and its sudo prompt) if FFmpeg is already available.
    if command -v ffmpeg > /dev/null 2>&1; then
        log_message "FFmpeg is already installed. Skipping installation."
        return 0
    fi

    if command -v brew > /dev/null; then
        log_message "Installing FFmpeg using Homebrew on macOS..."
        brew install ffmpeg
    elif command -v apt > /dev/null; then
        log_message "Installing FFmpeg using apt..."
        sudo apt update && sudo apt install -y ffmpeg
    elif command -v pacman > /dev/null; then
        log_message "Installing FFmpeg using pacman..."
        sudo pacman -Syu --noconfirm ffmpeg
    elif command -v dnf > /dev/null; then
        log_message "Installing FFmpeg using dnf..."
        # ffmpeg may be unavailable from the enabled dnf repositories; fall back to Flatpak.
        sudo dnf install -y ffmpeg --allowerasing || install_ffmpeg_flatpak
    else
        log_message "Unsupported distribution for FFmpeg installation. Trying Flatpak..."
        install_ffmpeg_flatpak
    fi
}
45
+
46
# Function to install FFmpeg using Flatpak.
# Installs Flatpak itself first when it is missing (apt, pacman, dnf or brew),
# makes sure the Flathub remote exists for the user installation, then installs
# the org.freedesktop.Platform.ffmpeg runtime extension for the current user.
# Exits with status 1 when Flatpak cannot be installed automatically.
install_ffmpeg_flatpak() {
    if command -v flatpak > /dev/null; then
        log_message "Installing FFmpeg using Flatpak..."
    else
        log_message "Flatpak is not installed. Installing Flatpak..."
        if command -v apt > /dev/null; then
            sudo apt install -y flatpak
        elif command -v pacman > /dev/null; then
            sudo pacman -Syu --noconfirm flatpak
        elif command -v dnf > /dev/null; then
            sudo dnf install -y flatpak
        elif command -v brew > /dev/null; then
            brew install flatpak
        else
            log_message "Unable to install Flatpak automatically. Please install Flatpak and try again."
            exit 1
        fi
    fi

    # A freshly installed Flatpak (or a user installation that was never set up)
    # has no "flathub" remote, which would make the install below fail.
    flatpak remote-add --user --if-not-exists flathub https://dl.flathub.org/repo/flathub.flatpakrepo
    flatpak install --user -y flathub org.freedesktop.Platform.ffmpeg
}
68
+
69
# Install the python-ffmpeg package with uv.
# NOTE(review): create_venv activates .venv before calling this, so the package
# presumably lands in that environment — confirm if called from elsewhere.
install_python_ffmpeg() {
    local package="python-ffmpeg"
    log_message "Installing ${package}..."
    uv pip install "$package"
}
73
+
74
# Entry point of the installation.
# Without an existing .venv directory it goes straight to create_venv.
# With one, it asks whether to start Applio instead: answering "y" (any case)
# runs run-applio.sh; any other answer deletes .venv and reinstalls from scratch.
prepare_install() {
    local answer

    # Fresh checkout: nothing to ask, just install.
    if [ ! -d ".venv" ]; then
        create_venv
        return
    fi

    log_message "Virtual environment found. This implies Applio has been already installed or this is a broken install."
    # The prompt goes to stderr so it is shown even when stdout is redirected.
    printf "Do you want to execute run-applio.sh? (Y/N): " >&2
    read -r answer
    answer=$(echo "$answer" | tr '[:upper:]' '[:lower:]')

    if [ "$answer" != "y" ]; then
        log_message "Continuing with the installation."
        rm -rf .venv
        create_venv
        return
    fi

    chmod +x run-applio.sh
    ./run-applio.sh && exit 0
}
93
+
94
# Function to create the virtual environment and install dependencies.
# Steps: check that a Python launcher exists, install uv, create .venv with
# Python 3.11, activate it, install FFmpeg and python-ffmpeg, install
# requirements.txt, then call finish (which exits the script).
# Exits with status 1 when uv cannot be found after installation or when
# requirements.txt is missing.
create_venv() {
    log_message "Creating virtual environment..."
    # Pre-flight check only: find_python aborts the script when no Python
    # launcher is on PATH. Its output is not needed because uv selects the
    # interpreter through --python below.
    find_python > /dev/null

    curl -LsSf https://astral.sh/uv/install.sh | sh
    # The installer does not change PATH for this running shell, so add its
    # install locations (current default first, legacy ~/.cargo/bin second).
    export PATH="${XDG_BIN_HOME:-$HOME/.local/bin}:$HOME/.cargo/bin:$PATH"
    # "curl | sh" succeeds even when the download fails, so verify explicitly.
    if ! command -v uv > /dev/null 2>&1; then
        log_message "uv could not be found after installation. Please install uv manually and try again."
        exit 1
    fi

    uv venv .venv --python 3.11

    log_message "Activating virtual environment..."
    source .venv/bin/activate

    install_ffmpeg
    install_python_ffmpeg

    log_message "Installing dependencies..."
    if [ -f "requirements.txt" ]; then
        # The extra index serves the CUDA 12.8 (+cu128) PyTorch wheels pinned in
        # requirements.txt; unsafe-best-match lets uv consider every index for
        # each package instead of stopping at the first index that has it.
        uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu128 --index-strategy unsafe-best-match
    else
        log_message "requirements.txt not found. Please ensure it exists."
        exit 1
    fi

    finish
}
118
+
119
# Clear the terminal, print the success message and end the script with status 0.
finish() {
    clear
    printf '%s\n' "Applio has been successfully installed. Run the file run-applio.sh to start the web interface!"
    exit 0
}
125
+
126
# Main script execution.
# macOS: make sure Homebrew and Python 3.11 are available, install faiss and
# export the PyTorch MPS settings. Linux: no extra preparation. Any other
# system is rejected. Afterwards the installation itself is started.
if [ "$(uname)" = "Darwin" ]; then
    log_message "Detected macOS..."
    if ! command -v brew >/dev/null 2>&1; then
        log_message "Homebrew not found. Installing Homebrew..."
        /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
        # The installer does not change PATH for this running shell. Load
        # Homebrew's environment from its default prefix (Apple Silicon first,
        # then Intel) so the brew commands below can be found.
        for brew_bin in /opt/homebrew/bin/brew /usr/local/bin/brew; do
            if [ -x "$brew_bin" ]; then
                eval "$("$brew_bin" shellenv)"
                break
            fi
        done
    fi

    # Report which Python launchers are visible before deciding what to install.
    log_message "Checking Python versions..."
    log_message "python3 path: $(which python3 2>/dev/null || echo 'not found')"
    log_message "python3.11 path: $(which python3.11 2>/dev/null || echo 'not found')"

    # Reduce "Python X.Y.Z" to "X.Y", preferring an explicit python3.11 launcher.
    if command -v python3.11 >/dev/null 2>&1; then
        python_version=$(python3.11 --version | awk '{print $2}' | cut -d'.' -f1,2)
    else
        python_version=$(python3 --version | awk '{print $2}' | cut -d'.' -f1,2)
    fi

    log_message "Detected Python version: $python_version"

    if [ "$python_version" = "3.11" ]; then
        log_message "Found compatible Python 3.11"
    else
        log_message "Python version $python_version is not 3.11. Installing Python 3.11 using Homebrew..."
        brew install python@3.11
        export PATH="$(brew --prefix)/opt/python@3.11/bin:$PATH"
        # Verify the installed version
        log_message "Verifying installed Python version..."
        python_version=$(python3.11 --version | awk '{print $2}' | cut -d'.' -f1,2)
        if [ "$python_version" != "3.11" ]; then
            log_message "Failed to install Python 3.11. Current version: $python_version"
            exit 1
        fi
    fi

    brew install faiss
    # PyTorch settings for Apple Silicon (MPS): fall back to the CPU for
    # operators MPS does not implement, and remove the upper limit on MPS
    # memory allocations.
    export PYTORCH_ENABLE_MPS_FALLBACK=1
    export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
    export PATH="$(brew --prefix)/bin:$PATH"
elif [ "$(uname)" != "Linux" ]; then
    log_message "Unsupported operating system. Are you using Windows?"
    log_message "If yes, use the batch (.bat) file instead of this one!"
    exit 1
fi

prepare_install
173
+
174
+
run-tensorboard.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/bin/sh
# Start TensorBoard through core.py using the project's virtual environment.

# Set the terminal window title.
printf "\033]0;Tensorboard\007"

# .venv and core.py live next to this script; switch there so the launcher
# works no matter which directory it is started from.
cd "$(dirname "$0")" || exit 1

if [ ! -f .venv/bin/activate ]; then
    echo "Virtual environment not found at .venv. Run run-install.sh first." >&2
    exit 1
fi
. .venv/bin/activate

clear
python core.py tensorboard