medallo commited on
Commit
f32558d
·
verified ·
1 Parent(s): 6368047

Upload 14 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# syntax=docker/dockerfile:1
FROM python:3.11-slim-bookworm AS base

ARG APP_NAME=xtts-finetune-webui
ARG CUDA_VER=cu121
ARG GID=966
ARG UID=966
ARG WHISPER_MODEL="large-v3"

# Environment
ENV APP_NAME=$APP_NAME \
    CUDA_VER=$CUDA_VER \
    WHISPER_MODEL=$WHISPER_MODEL

# User configuration (key=value form; the legacy `ENV key value` syntax is deprecated)
ENV HOME=/app/$APP_NAME
RUN groupadd -r app -g $GID && \
    useradd --no-log-init -m -r -g app app -u $UID

# Prepare file-system
RUN mkdir -p /app/server && chown -R $UID:$GID /app
COPY --chown=$UID:$GID *.py *.sh *.txt *.md /app/server/
ADD --chown=$UID:$GID utils /app/server/utils

# Enter environment and install dependencies
WORKDIR /app/server

USER $UID:$GID

ENV NVIDIA_VISIBLE_DEVICES=all PATH=$PATH:$HOME/.local/bin
# Install nvidia-pyindex & nvidia-cudnn for libcudnn_ops_infer.so.8
# See: https://github.com/SYSTRAN/faster-whisper/issues/516
# The final python3 -c line pre-downloads the Whisper model into the image
# so the first container start does not have to fetch it.
RUN pip3 install --user --no-cache-dir nvidia-pyindex && \
    pip3 install --user --no-cache-dir nvidia-cudnn && \
    pip3 install --user --no-cache-dir torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/$CUDA_VER && \
    pip3 install --user --no-cache-dir -r requirements.txt && \
    python3 -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8')"

# Ports and servername
EXPOSE 5003
ENV GRADIO_ANALYTICS_ENABLED="False"

CMD [ "bash", "start-container.sh"]
README.md CHANGED
@@ -1,14 +1,88 @@
1
- ---
2
- title: Xtts Webui
3
- emoji: 👀
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.6.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: xtts-webui
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # xtts-finetune-webui
2
+
3
+ This webui is a slightly modified copy of the [official webui](https://github.com/coqui-ai/TTS/pull/3296) for finetune xtts.
4
+
5
+ If you are looking for an option for normal XTTS use look here [https://github.com/daswer123/xtts-webui](https://github.com/daswer123/xtts-webui)
6
+
7
+ ## TODO
8
+ - [ ] Add the ability to use via console
9
+
10
+ ## Key features:
11
+
12
+ ### Data processing
13
+
14
+ 1. Updated faster-whisper to 0.10.0 with the ability to select the large-v3 model.
15
+ 2. Changed output folder to output folder inside the main folder.
16
+ 3. If there is already a dataset in the output folder and you want to add new data, you can do so by simply adding new audio, what was there will not be processed again and the new data will be automatically added
17
+ 4. Turn on VAD filter
18
+ 5. After the dataset is created, a file is created that specifies the language of the dataset. This file is read before training so that the language always matches. It is convenient when you restart the interface
19
+
20
+ ### Fine-tuning XTTS Encoder
21
+
22
+ 1. Added the ability to select the base model for XTTS; when re-training, the model no longer needs to be downloaded again.
23
+ 2. Added the ability to select a custom model as the base model during training, which makes it possible to fine-tune an already fine-tuned model.
24
+ 3. Added possibility to get optimized version of the model for 1 click ( step 2.5, put optimized version in output folder).
25
+ 4. You can choose whether to delete training folders after you have optimized the model
26
+ 5. When you optimize the model, the example reference audio is moved to the output folder
27
+ 6. Checking for correctness of the specified language and dataset language
28
+
29
+ ### Inference
30
+
31
+ 1. Added possibility to customize infer settings during model checking.
32
+
33
+ ### Other
34
+
35
+ 1. If you accidentally restart the interface during one of the steps, you can load data to additional buttons
36
+ 2. Removed the display of logs as it was causing problems when restarted
37
+ 3. The finished result is copied to the ready folder, these are fully finished files, you can move them anywhere and use them as a standard model
38
+ 4. Added support for finetune Japanese
39
+
40
+ ## Changes in webui
41
+
42
+ ### 1 - Data processing
43
+
44
+ ![image](https://github.com/daswer123/xtts-finetune-webui/assets/22278673/8f09b829-098b-48f5-9668-832e7319403b)
45
+
46
+ ### 2 - Fine-tuning XTTS Encoder
47
+
48
+ ![image](https://github.com/daswer123/xtts-finetune-webui/assets/22278673/897540d9-3a6b-463c-abb8-261c289cc929)
49
+
50
+ ### 3 - Inference
51
+
52
+ ![image](https://github.com/daswer123/xtts-finetune-webui/assets/22278673/aa05bcd4-8642-4de4-8f2f-bc0f5571af63)
53
+
54
+ ## Google colab
55
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DrewThomasson/xtts-finetune-webui/blob/main/notebook/xtts_finetune_webui.ipynb)
56
+
57
+ ## 🐳 Run in Docker
58
+ ```docker
59
+ docker run -it --gpus all --pull always -p 7860:7860 --platform=linux/amd64 athomasson2/fine_tune_xtts:huggingface python app.py
60
+ ```
61
+
62
+
63
+
64
+ ## Install
65
+
66
+ 1. Make sure you have `Cuda` installed
67
+ 2. `git clone https://github.com/daswer123/xtts-finetune-webui`
68
+ 3. `cd xtts-finetune-webui`
69
+ 4. `pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118`
70
+ 5. `pip install -r requirements.txt`
71
+
72
+ ### If you're using Windows
73
+
74
+ 1. First start `install.bat`
75
+ 2. To start the server start `start.bat`
76
+ 3. Go to the local address `127.0.0.1:5003`
77
+
78
+ ### On Linux
79
+
80
+ 1. Run `bash install.sh`
81
+ 2. To start the server start `start.sh`
82
+ 3. Go to the local address `127.0.0.1:5003`
83
+
84
+ ### On Apple Silicon Mac (python 3.10 env)
85
+ 1. Run `pip install --no-deps -r apple_silicon_requirements.txt`
86
+ 2. To start the server `python xtts_demo.py`
87
+ 3. Go to the local address `127.0.0.1:5003`
88
+ ~
apple_silicon_requirements.txt ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ aiofiles==23.2.1
3
+ aiohttp==3.9.5
4
+ aiosignal==1.3.1
5
+ altair==5.3.0
6
+ annotated-types==0.7.0
7
+ anyascii==0.3.2
8
+ anyio==3.7.1
9
+ async-timeout==4.0.3
10
+ attrs==23.2.0
11
+ audioread==3.0.1
12
+ av==12.2.0
13
+ Babel==2.15.0
14
+ bangla==0.0.2
15
+ blinker==1.8.2
16
+ blis==0.7.11
17
+ bnnumerizer==0.0.2
18
+ bnunicodenormalizer==0.1.7
19
+ catalogue==2.0.10
20
+ certifi==2024.7.4
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ colorama==0.4.6
26
+ coloredlogs==15.0.1
27
+ confection==0.1.5
28
+ contourpy==1.2.1
29
+ coqpit==0.0.17
30
+ coqui-tts==0.24.2
31
+ coqui-tts-trainer==0.1.4
32
+ ctranslate2==4.3.1
33
+ cutlet==0.4.0
34
+ cycler==0.12.1
35
+ cymem==2.0.8
36
+ Cython==3.0.10
37
+ dateparser==1.1.8
38
+ decorator==5.1.1
39
+ dnspython==2.6.1
40
+ docopt==0.6.2
41
+ einops==0.8.0
42
+ email_validator==2.2.0
43
+ encodec==0.1.1
44
+ exceptiongroup==1.2.2
45
+ fastapi==0.103.1
46
+ fastapi-cli==0.0.4
47
+ faster-whisper==1.0.2
48
+ ffmpy==0.3.2
49
+ filelock==3.15.4
50
+ Flask==3.0.3
51
+ flatbuffers==24.3.25
52
+ fonttools==4.53.1
53
+ frozenlist==1.4.1
54
+ fsspec==2024.6.1
55
+ fugashi==1.3.2
56
+ g2pkk==0.1.2
57
+ gradio==4.44.1
58
+ gradio_client==1.3.0
59
+ grpcio==1.64.1
60
+ gruut==2.4.0
61
+ gruut-ipa==0.13.0
62
+ gruut_lang_de==2.0.1
63
+ gruut_lang_en==2.0.1
64
+ gruut_lang_es==2.0.1
65
+ gruut_lang_fr==2.0.2
66
+ h11==0.14.0
67
+ hangul-romanize==0.1.0
68
+ httpcore==1.0.5
69
+ httptools==0.6.1
70
+ httpx==0.27.0
71
+ huggingface-hub==0.23.5
72
+ humanfriendly==10.0
73
+ idna==3.7
74
+ importlib_resources==6.4.0
75
+ inflect==7.3.1
76
+ itsdangerous==2.2.0
77
+ jaconv==0.4.0
78
+ jamo==0.4.1
79
+ jieba==0.42.1
80
+ Jinja2==3.1.4
81
+ joblib==1.4.2
82
+ jsonlines==1.2.0
83
+ jsonschema==4.23.0
84
+ jsonschema-specifications==2023.12.1
85
+ kiwisolver==1.4.5
86
+ langcodes==3.4.0
87
+ language_data==1.2.0
88
+ lazy_loader==0.4
89
+ librosa==0.10.2.post1
90
+ llvmlite==0.43.0
91
+ marisa-trie==1.2.0
92
+ Markdown==3.6
93
+ markdown-it-py==3.0.0
94
+ MarkupSafe==2.1.5
95
+ matplotlib==3.8.4
96
+ mdurl==0.1.2
97
+ mecab-python3==1.0.9
98
+ mojimoji==0.0.13
99
+ more-itertools==10.3.0
100
+ mpmath==1.3.0
101
+ msgpack==1.0.8
102
+ multidict==6.0.5
103
+ murmurhash==1.0.10
104
+ networkx==2.8.8
105
+ nltk==3.8.1
106
+ num2words==0.5.13
107
+ numba==0.60.0
108
+ numpy==1.26.4
109
+ onnxruntime==1.18.1
110
+ orjson==3.10.6
111
+ packaging==24.1
112
+ pandas==1.5.3
113
+ pillow==10.4.0
114
+ platformdirs==4.2.2
115
+ pooch==1.8.2
116
+ preshed==3.0.9
117
+ protobuf==4.25.3
118
+ psutil==6.0.0
119
+ pycparser==2.22
120
+ pydantic==2.3.0
121
+ pydantic_core==2.6.3
122
+ pydub==0.25.1
123
+ pygame==2.6.0
124
+ Pygments==2.18.0
125
+ pynndescent==0.5.13
126
+ pyparsing==3.1.2
127
+ pypinyin==0.51.0
128
+ pysbd==0.3.4
129
+ python-crfsuite==0.9.10
130
+ python-dateutil==2.9.0.post0
131
+ python-dotenv==1.0.1
132
+ python-multipart==0.0.9
133
+ pytz==2024.1
134
+ PyYAML==6.0.1
135
+ referencing==0.35.1
136
+ regex==2024.5.15
137
+ requests==2.32.3
138
+ rich==13.7.1
139
+ rpds-py==0.19.0
140
+ ruff==0.5.2
141
+ safetensors==0.4.3
142
+ scikit-learn==1.5.1
143
+ scipy==1.11.4
144
+ semantic-version==2.10.0
145
+ shellingham==1.5.4
146
+ six==1.16.0
147
+ smart-open==6.4.0
148
+ sniffio==1.3.1
149
+ soundfile==0.12.1
150
+ soxr==0.3.7
151
+ spacy==3.7.4
152
+ spacy-legacy==3.0.12
153
+ spacy-loggers==1.0.5
154
+ srsly==2.4.8
155
+ starlette==0.27.0
156
+ SudachiDict-core==20240409
157
+ SudachiPy==0.6.8
158
+ sympy==1.13.0
159
+ tensorboard==2.17.0
160
+ tensorboard-data-server==0.7.2
161
+ thinc==8.2.5
162
+ threadpoolctl==3.5.0
163
+ tokenizers==0.19.1
164
+ tomlkit==0.12.0
165
+ toolz==0.12.1
166
+ torch==2.3.1
167
+ torchaudio==2.3.1
168
+ tqdm==4.66.4
169
+ trainer==0.0.36
170
+ transformers==4.42.4
171
+ TTS==0.21.3
172
+ typeguard==4.3.0
173
+ typer==0.12.5
174
+ typing_extensions==4.12.2
175
+ tzdata==2024.1
176
+ tzlocal==5.2
177
+ umap-learn==0.5.6
178
+ Unidecode==1.3.8
179
+ unidic-lite==1.0.8
180
+ urllib3==2.2.2
181
+ uvicorn==0.30.1
182
+ uvloop==0.19.0
183
+ wasabi==1.1.3
184
+ watchfiles==0.22.0
185
+ weasel==0.3.4
186
+ websockets==11.0.3
187
+ Werkzeug==3.0.3
188
+ wrapt==1.16.0
189
+ yarl==1.9.4
install.bat ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
REM One-time Windows setup: create a virtualenv, install dependencies, then launch the web UI.

python -m venv venv
call venv/scripts/activate


pip install -r .\requirements.txt
REM Install the CUDA 11.8 builds of torch/torchaudio after requirements.txt so they
REM override any CPU-only torch version pulled in by the dependencies above.
pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118

python xtts_demo.py
install.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Create a Python virtual environment
4
+ python -m venv venv
5
+ # Activate the virtual environment
6
+ source venv/bin/activate
7
+
8
+ # Install other dependencies from requirements.txt
9
+ pip install -r requirements.txt
10
+ pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118
11
+
12
+ python xtts_demo.py
13
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ faster_whisper==1.0.3
2
+ gradio==5.1.0
3
+ spacy==3.7.5
4
+ coqui-tts[languages] == 0.24.2
5
+
6
+ cutlet
7
+ fugashi[unidic-lite]
start-container.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Enable resolution of libcudnn_ops_infer.so.8
4
+ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/app/xtts-finetune-webui/.local/lib/python3.11/site-packages/torch/lib:/app/xtts-finetune-webui/.local/lib/python3.11/site-packages/nvidia/cudnn/lib"
5
+
6
+ python3 xtts_demo.py
start.bat ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
@echo off
REM Start the web UI inside the virtualenv created by install.bat.

call venv/scripts/activate

python xtts_demo.py
start.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Create a Python virtual environment
4
+ python -m venv venv
5
+ # Activate the virtual environment
6
+ source venv/bin/activate
7
+
8
+ python xtts_demo.py
9
+
utils/formatter.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gc
3
+ import torchaudio
4
+ import pandas
5
+ from faster_whisper import WhisperModel
6
+ from glob import glob
7
+
8
+ from tqdm import tqdm
9
+
10
+ from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
11
+ # Add support for JA train
12
+ # from utils.tokenizer import multilingual_cleaners
13
+
14
+ import torch
15
+ import torchaudio
16
+ # torch.set_num_threads(1)
17
+
18
+
19
+ torch.set_num_threads(16)
20
+ import os
21
+
22
+ audio_types = (".wav", ".mp3", ".flac")
23
+
24
def find_latest_best_model(folder_path):
    """Return the most recently created ``best_model.pth`` under *folder_path*.

    Searches recursively; returns ``None`` when no checkpoint exists.
    """
    pattern = os.path.join(folder_path, '**', 'best_model.pth')
    candidates = glob(pattern, recursive=True)
    if not candidates:
        return None
    return max(candidates, key=os.path.getctime)
29
+
30
+
31
def list_audios(basePath, contains=None):
    """Yield paths of audio files (``.wav``/``.mp3``/``.flac``) under *basePath*.

    Thin wrapper over :func:`list_files` restricted to the module's
    ``audio_types`` extensions; *contains* optionally filters by substring.
    """
    yield from list_files(basePath, validExts=audio_types, contains=contains)
34
+
35
def list_files(basePath, validExts=None, contains=None):
    """Recursively yield file paths under *basePath*.

    A file is yielded only if *contains* (when given) appears in its name and
    its extension is one of *validExts* (when given). With both filters unset,
    every file is yielded.
    """
    for currentDir, _subdirs, names in os.walk(basePath):
        for name in names:
            # Substring filter: skip names that do not mention *contains*.
            if contains is not None and contains not in name:
                continue

            # Lower-cased extension; note that for a dot-less name this is the
            # last character (rfind returns -1), matching the original logic.
            ext = name[name.rfind("."):].lower()

            # Yield when no extension filter is set or the extension matches.
            if validExts is None or ext.endswith(validExts):
                yield os.path.join(currentDir, name)
53
+
54
def format_audio_list(audio_files, asr_model, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
    """Transcribe and segment audio files into an XTTS fine-tuning dataset.

    Each input file is transcribed with *asr_model* (a faster-whisper model —
    ``transcribe`` is called with ``vad_filter`` and ``word_timestamps``),
    split into sentence-sized wav clips under ``out_path/wavs/``, and the
    clip/text pairs are appended to pipe-separated ``metadata_train.csv`` and
    ``metadata_eval.csv``. Previously processed files (detected via existing
    metadata) are skipped, so new audio can be added incrementally.

    Returns ``(train_metadata_path, eval_metadata_path, audio_total_size)``
    where ``audio_total_size`` is the summed duration in seconds of the files
    processed in this call.
    """
    audio_total_size = 0
    os.makedirs(out_path, exist_ok=True)

    # Persist the dataset language in lang.txt so later runs (e.g. after a UI
    # restart) can detect a language mismatch.
    lang_file_path = os.path.join(out_path, "lang.txt")
    current_language = None
    if os.path.exists(lang_file_path):
        with open(lang_file_path, 'r', encoding='utf-8') as existing_lang_file:
            current_language = existing_lang_file.read().strip()

    if current_language != target_language:
        # Overwrite with the requested language; the warning flags the mismatch.
        with open(lang_file_path, 'w', encoding='utf-8') as lang_file:
            lang_file.write(target_language + '\n')
        print("Warning, existing language does not match target language. Updated lang.txt with target language.")
    else:
        print("Existing language matches target language")

    metadata = {"audio_file": [], "text": [], "speaker_name": []}
    train_metadata_path = os.path.join(out_path, "metadata_train.csv")
    eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")

    # Load any metadata from a previous run so already-segmented source files
    # can be skipped below.
    existing_metadata = {'train': None, 'eval': None}
    if os.path.exists(train_metadata_path):
        existing_metadata['train'] = pandas.read_csv(train_metadata_path, sep="|")
        print("Existing training metadata found and loaded.")

    if os.path.exists(eval_metadata_path):
        existing_metadata['eval'] = pandas.read_csv(eval_metadata_path, sep="|")
        print("Existing evaluation metadata found and loaded.")

    # Progress reporting: use the Gradio progress tracker when running in the
    # web UI, plain tqdm otherwise.
    if gradio_progress is not None:
        tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
    else:
        tqdm_object = tqdm(audio_files)

    for audio_path in tqdm_object:
        audio_file_name_without_ext, _ = os.path.splitext(os.path.basename(audio_path))
        # Clips are named "wavs/<source>_<index>.wav"; any metadata row with
        # this prefix means the source file was already processed.
        prefix_check = f"wavs/{audio_file_name_without_ext}_"

        skip_processing = False
        for key in ['train', 'eval']:
            if existing_metadata[key] is not None:
                mask = existing_metadata[key]['audio_file'].str.startswith(prefix_check)
                if mask.any():
                    print(f"Segments from {audio_file_name_without_ext} have been previously processed; skipping...")
                    skip_processing = True
                    break

        if skip_processing:
            continue

        wav, sr = torchaudio.load(audio_path)
        # Downmix multi-channel audio to mono by averaging channels.
        if wav.size(0) != 1:
            wav = torch.mean(wav, dim=0, keepdim=True)

        wav = wav.squeeze()
        audio_total_size += (wav.size(-1) / sr)

        segments, _ = asr_model.transcribe(audio_path, vad_filter=True, word_timestamps=True, language=target_language)
        segments = list(segments)
        i = 0
        sentence = ""
        sentence_start = None
        first_word = True
        # Flatten the per-segment word lists into one word stream.
        words_list = []
        for _, segment in enumerate(segments):
            words = list(segment.words)
            words_list.extend(words)

        # Accumulate words into sentences; a sentence ends at terminal
        # punctuation (!, 。, ., ?).
        for word_idx, word in enumerate(words_list):
            if first_word:
                sentence_start = word.start
                if word_idx == 0:
                    # Very first word: pad backwards by *buffer*, clamped to 0.
                    sentence_start = max(sentence_start - buffer, 0)
                else:
                    # Otherwise start no earlier than midway into the gap
                    # after the previous word.
                    previous_word_end = words_list[word_idx - 1].end
                    sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2)

                sentence = word.word
                first_word = False
            else:
                sentence += word.word

            if word.word[-1] in ["!", "。", ".", "?"]:
                # Drop the leading space faster-whisper puts before words,
                # then normalize the text for the target language.
                sentence = sentence[1:]
                sentence = multilingual_cleaners(sentence, target_language)
                audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"

                if word_idx + 1 < len(words_list):
                    next_word_start = words_list[word_idx + 1].start
                else:
                    # Last word: extend to (near) the end of the recording.
                    next_word_start = (wav.shape[0] - 1) / sr

                # End midway to the next word, capped at word end + buffer.
                word_end = min((word.end + next_word_start) / 2, word.end + buffer)

                absolute_path = os.path.join(out_path, audio_file)
                os.makedirs(os.path.dirname(absolute_path), exist_ok=True)
                i += 1
                first_word = True

                audio = wav[int(sr*sentence_start):int(sr *word_end)].unsqueeze(0)
                # Discard clips shorter than 1/3 second; note the metadata row
                # is also skipped via continue, so CSV and files stay in sync.
                if audio.size(-1) >= sr / 3:
                    torchaudio.save(absolute_path, audio, sr)
                else:
                    continue

                metadata["audio_file"].append(audio_file)
                metadata["text"].append(sentence)
                metadata["speaker_name"].append(speaker_name)

        # Flush this file's rows to both CSVs (append mode once they exist),
        # then reset the in-memory buffer for the next file.
        df = pandas.DataFrame(metadata)

        mode = 'w' if not os.path.exists(train_metadata_path) else 'a'
        header = not os.path.exists(train_metadata_path)
        df.to_csv(train_metadata_path, sep="|", index=False, mode=mode, header=header)

        mode = 'w' if not os.path.exists(eval_metadata_path) else 'a'
        header = not os.path.exists(eval_metadata_path)
        df.to_csv(eval_metadata_path, sep="|", index=False, mode=mode, header=header)

        metadata = {"audio_file": [], "text": [], "speaker_name": []}

    if os.path.exists(train_metadata_path) and os.path.exists(eval_metadata_path):
        existing_train_df = existing_metadata['train']
        existing_eval_df = existing_metadata['eval']
    else:
        existing_train_df = pandas.DataFrame(columns=["audio_file", "text", "speaker_name"])
        existing_eval_df = pandas.DataFrame(columns=["audio_file", "text", "speaker_name"])

    new_data_df = pandas.read_csv(train_metadata_path, sep="|")

    # Merge previous and new rows, dropping exact duplicates.
    combined_train_df = pandas.concat([existing_train_df, new_data_df], ignore_index=True).drop_duplicates().reset_index(drop=True)
    # NOTE(review): combined_eval_df is computed but never used below — the
    # final eval split is re-drawn from the shuffled train set instead.
    combined_eval_df = pandas.concat([existing_eval_df, new_data_df], ignore_index=True).drop_duplicates().reset_index(drop=True)

    # Shuffle, then carve off *eval_percentage* of rows as the eval set.
    combined_train_df_shuffled = combined_train_df.sample(frac=1)
    num_val_samples = int(len(combined_train_df_shuffled)* eval_percentage)

    final_eval_set = combined_train_df_shuffled[:num_val_samples]
    final_training_set = combined_train_df_shuffled[num_val_samples:]

    final_training_set.sort_values('audio_file').to_csv(train_metadata_path, sep='|', index=False)
    final_eval_set.sort_values('audio_file').to_csv(eval_metadata_path, sep='|', index=False)

    return train_metadata_path, eval_metadata_path, audio_total_size
utils/gpt_train.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import gc
4
+ from pathlib import Path
5
+
6
+ from trainer import Trainer, TrainerArgs
7
+
8
+ from TTS.config.shared_configs import BaseDatasetConfig
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
11
+ from TTS.utils.manage import ModelManager
12
+ import shutil
13
+
14
+
15
def train_gpt(custom_model,version, language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
    """Fine-tune an XTTS GPT encoder on a prepared dataset.

    Downloads the base XTTS ``version`` checkpoints (or uses *custom_model*,
    a path to a ``.pth`` file, when given), copies the shared config/vocab/
    speaker files into ``<output_path>/ready``, trains for *num_epochs* with
    the given *batch_size* and *grad_acumm* steps, then cleans up the trainer
    log file and frees memory.

    Returns ``(XTTS_SPEAKER_FILE, XTTS_CONFIG_FILE, XTTS_CHECKPOINT,
    TOKENIZER_FILE, trainer_out_path, speaker_ref)`` where *speaker_ref* is
    the training sample with the longest text, used as reference audio.
    """
    # Logging parameters
    RUN_NAME = "GPT_XTTS_FT"
    PROJECT_NAME = "XTTS_trainer"
    DASHBOARD_LOGGER = "tensorboard"
    LOGGER_URI = None

    # print(f"XTTS version = {version}")

    # Set here the path that the checkpoints will be saved. Default: ./run/training/
    OUT_PATH = os.path.join(output_path, "run", "training")

    # Training Parameters
    OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
    START_WITH_EVAL = False  # if True it will start with evaluation
    BATCH_SIZE = batch_size  # set here the batch size
    GRAD_ACUMM_STEPS = grad_acumm  # set here the grad accumulation steps


    # Define here the dataset that you want to use for the fine-tuning.
    config_dataset = BaseDatasetConfig(
        formatter="coqui",
        dataset_name="ft_dataset",
        path=os.path.dirname(train_csv),
        meta_file_train=train_csv,
        meta_file_val=eval_csv,
        language=language,
    )

    # Add here the configs of the datasets
    DATASETS_CONFIG_LIST = [config_dataset]

    # Define the path where the base XTTS files will be downloaded
    CHECKPOINTS_OUT_PATH = os.path.join(Path.cwd(), "base_models",f"{version}")
    os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


    # DVAE files
    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

    # Set the path to the downloaded files
    DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
    MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

    # Download DVAE files if needed
    if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
        print(" > Downloading DVAE files!")
        ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)


    # Download XTTS v2.0 checkpoint if needed
    TOKENIZER_FILE_LINK = f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{version}/vocab.json"
    XTTS_CHECKPOINT_LINK = f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{version}/model.pth"
    XTTS_CONFIG_LINK = f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{version}/config.json"
    XTTS_SPEAKER_LINK = f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"

    # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
    TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
    XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
    XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))  # config.json file
    XTTS_SPEAKER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_SPEAKER_LINK))  # speakers_xtts.pth file

    # Download XTTS v2.0 files if needed
    if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
        print(f" > Downloading XTTS v{version} files!")
        ModelManager._download_model_files(
            [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK,XTTS_SPEAKER_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
        )

    # Transfer these files to the ready folder
    READY_MODEL_PATH = os.path.join(output_path,"ready")
    if not os.path.exists(READY_MODEL_PATH):
        os.makedirs(READY_MODEL_PATH)

    NEW_TOKENIZER_FILE = os.path.join(READY_MODEL_PATH, "vocab.json")
    # NEW_XTTS_CHECKPOINT = os.path.join(READY_MODEL_PATH, "model.pth")
    NEW_XTTS_CONFIG_FILE = os.path.join(READY_MODEL_PATH, "config.json")
    NEW_XTTS_SPEAKER_FILE = os.path.join(READY_MODEL_PATH, "speakers_xtts.pth")

    shutil.copy(TOKENIZER_FILE, NEW_TOKENIZER_FILE)
    # shutil.copy(XTTS_CHECKPOINT, os.path.join(READY_MODEL_PATH, "model.pth"))
    shutil.copy(XTTS_CONFIG_FILE, NEW_XTTS_CONFIG_FILE)
    shutil.copy(XTTS_SPEAKER_FILE, NEW_XTTS_SPEAKER_FILE)

    # Use from ready folder
    TOKENIZER_FILE = NEW_TOKENIZER_FILE  # vocab.json file
    # XTTS_CHECKPOINT = NEW_XTTS_CHECKPOINT  # model.pth file
    XTTS_CONFIG_FILE = NEW_XTTS_CONFIG_FILE  # config.json file
    XTTS_SPEAKER_FILE = NEW_XTTS_SPEAKER_FILE  # speakers_xtts.pth file


    # A custom .pth checkpoint replaces the downloaded base model, enabling
    # fine-tuning of an already fine-tuned model.
    if custom_model != "":
        if os.path.exists(custom_model) and custom_model.endswith('.pth'):
            XTTS_CHECKPOINT = custom_model
            print(f" > Loading custom model: {XTTS_CHECKPOINT}")
        else:
            print(" > Error: The specified custom model is not a valid .pth file path.")

    # Japanese training uses no dataloader workers — presumably to avoid a
    # multiprocessing issue with the JA tokenizer; TODO confirm.
    num_workers = 8
    if language == "ja":
        num_workers = 0
    # Init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=max_audio_length,  # ~11.6 seconds
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # Define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
    # Training parameters config
    config = GPTTrainerConfig(
        epochs=num_epochs,
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
            GPT XTTS training
            """,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=num_workers,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=100,
        save_step=1000,
        save_n_checkpoints=1,
        save_checkpoints=True,
        # target_loss="loss",
        print_eval=False,
        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # It was adjusted accordingly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[],
    )

    # Init the model from config
    model = GPTTrainer.init_from_config(config)

    # Load training samples
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # Init the trainer and 🚀
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()

    # Get the longest text audio file to use as speaker reference
    samples_len = [len(item["text"].split(" ")) for item in train_samples]
    longest_text_idx = samples_len.index(max(samples_len))
    speaker_ref = train_samples[longest_text_idx]["audio_file"]

    trainer_out_path = trainer.output_path

    # Close file handlers and remove them from the logger so the log file can
    # be deleted below (it would otherwise stay locked on Windows).
    for handler in logging.getLogger('trainer').handlers:
        if isinstance(handler, logging.FileHandler):
            handler.close()
            logging.getLogger('trainer').removeHandler(handler)

    # Now you should be able to delete the log file
    log_file = os.path.join(trainer.output_path, f"trainer_{trainer.args.rank}_log.txt")
    os.remove(log_file)

    # Deallocate VRAM and RAM
    del model, trainer, train_samples, eval_samples
    gc.collect()

    return XTTS_SPEAKER_FILE,XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
utils/tokenizer.py ADDED
@@ -0,0 +1,869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import textwrap
4
+ from functools import cached_property
5
+
6
+ import pypinyin
7
+ import torch
8
+ from hangul_romanize import Transliter
9
+ from hangul_romanize.rule import academic
10
+ from num2words import num2words
11
+ from spacy.lang.ar import Arabic
12
+ from spacy.lang.en import English
13
+ from spacy.lang.es import Spanish
14
+ from spacy.lang.ja import Japanese
15
+ from spacy.lang.zh import Chinese
16
+ from tokenizers import Tokenizer
17
+
18
+ from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
19
+
20
+
21
def get_spacy_lang(lang):
    """Return a blank spaCy pipeline for ``lang``.

    Only Chinese, Japanese, Arabic and Spanish get dedicated pipelines;
    every other language code falls back to English, which is sufficient
    for the sentence splitting this module needs.
    """
    pipelines = {
        "zh": Chinese,
        "ja": Japanese,
        "ar": Arabic,
        "es": Spanish,
    }
    # For most languages, English does the job.
    return pipelines.get(lang, English)()
33
+
34
+
35
def split_sentence(text, lang, text_split_length=250):
    """Split ``text`` into chunks of at most ~``text_split_length`` characters.

    Sentences are detected with a spaCy sentencizer and greedily packed into
    chunks; a single sentence longer than the limit is hard-wrapped with
    ``textwrap.wrap``.  When the input is already shorter than the limit (or
    ``text_split_length`` is None) the whole text is returned as one chunk.
    """
    text_splits = []
    if text_split_length is not None and len(text) >= text_split_length:
        # Seed with an empty chunk so the packing loop can always append to [-1].
        text_splits.append("")
        nlp = get_spacy_lang(lang)
        nlp.add_pipe("sentencizer")
        doc = nlp(text)
        for sentence in doc.sents:
            if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
                # if the last sentence + the current sentence is less than the text_split_length
                # then add the current sentence to the last sentence
                text_splits[-1] += " " + str(sentence)
                text_splits[-1] = text_splits[-1].lstrip()
            elif len(str(sentence)) > text_split_length:
                # if the current sentence is greater than the text_split_length
                # hard-wrap it; each wrapped line becomes its own chunk
                for line in textwrap.wrap(
                    str(sentence),
                    width=text_split_length,
                    drop_whitespace=True,
                    break_on_hyphens=False,
                    tabsize=1,
                ):
                    text_splits.append(str(line))
            else:
                text_splits.append(str(sentence))

        if len(text_splits) > 1:
            # Drop the seed chunk if it never received any text.
            if text_splits[0] == "":
                del text_splits[0]
    else:
        text_splits = [text.lstrip()]

    return text_splits
69
+
70
+
71
# Matches one or more whitespace characters; used by collapse_whitespace().
_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations, keyed by
# language code.  Most patterns require a trailing dot ("Mr." not "Mr");
# Russian and Japanese match on word boundaries instead.
_abbreviations = {
    "en": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mrs", "misess"),
            ("mr", "mister"),
            ("dr", "doctor"),
            ("st", "saint"),
            ("co", "company"),
            ("jr", "junior"),
            ("maj", "major"),
            ("gen", "general"),
            ("drs", "doctors"),
            ("rev", "reverend"),
            ("lt", "lieutenant"),
            ("hon", "honorable"),
            ("sgt", "sergeant"),
            ("capt", "captain"),
            ("esq", "esquire"),
            ("ltd", "limited"),
            ("col", "colonel"),
            ("ft", "fort"),
        ]
    ],
    "es": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("sra", "señora"),
            ("sr", "señor"),
            ("dr", "doctor"),
            ("dra", "doctora"),
            ("st", "santo"),
            ("co", "compañía"),
            ("jr", "junior"),
            ("ltd", "limitada"),
        ]
    ],
    "fr": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mme", "madame"),
            ("mr", "monsieur"),
            ("dr", "docteur"),
            ("st", "saint"),
            ("co", "compagnie"),
            ("jr", "junior"),
            ("ltd", "limitée"),
        ]
    ],
    "de": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("fr", "frau"),
            ("dr", "doktor"),
            ("st", "sankt"),
            ("co", "firma"),
            ("jr", "junior"),
        ]
    ],
    "pt": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("sra", "senhora"),
            ("sr", "senhor"),
            ("dr", "doutor"),
            ("dra", "doutora"),
            ("st", "santo"),
            ("co", "companhia"),
            ("jr", "júnior"),
            ("ltd", "limitada"),
        ]
    ],
    "it": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # ("sig.ra", "signora"),
            ("sig", "signore"),
            ("dr", "dottore"),
            ("st", "santo"),
            ("co", "compagnia"),
            ("jr", "junior"),
            ("ltd", "limitata"),
        ]
    ],
    "pl": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("p", "pani"),
            ("m", "pan"),
            ("dr", "doktor"),
            ("sw", "święty"),
            ("jr", "junior"),
        ]
    ],
    "ar": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # There are not many common abbreviations in Arabic as in English.
        ]
    ],
    "zh": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ],
    "cs": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dr", "doktor"),  # doctor
            ("ing", "inženýr"),  # engineer
            ("p", "pan"),  # Could also map to pani for woman but no easy way to do it
            # Other abbreviations would be specialized and not as common.
        ]
    ],
    "ru": [
        # NOTE: word-boundary match (no trailing dot) — Russian abbreviations here are hyphenated.
        (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
        for x in [
            ("г-жа", "госпожа"),  # Mrs.
            ("г-н", "господин"),  # Mr.
            ("д-р", "доктор"),  # doctor
            # Other abbreviations are less common or specialized.
        ]
    ],
    "nl": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dhr", "de heer"),  # Mr.
            ("mevr", "mevrouw"),  # Mrs.
            ("dr", "dokter"),  # doctor
            ("jhr", "jonkheer"),  # young lord or nobleman
            # Dutch uses more abbreviations, but these are the most common ones.
        ]
    ],
    "tr": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("b", "bay"),  # Mr.
            ("byk", "büyük"),  # büyük
            ("dr", "doktor"),  # doctor
            # Add other Turkish abbreviations here if needed.
        ]
    ],
    "hu": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dr", "doktor"),  # doctor
            ("b", "bácsi"),  # Mr.
            ("nőv", "nővér"),  # nurse
            # Add other Hungarian abbreviations here if needed.
        ]
    ],
    "ko": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ],
    "ja": [
        # NOTE: case-sensitive word-boundary match; Japanese has no letter case.
        (re.compile("\\b%s\\b" % x[0]), x[1])
        for x in [
            ("氏", "さん"),  # Mr.
            ("夫人", "おんなのひと"),  # Mrs.
            ("博士", "はかせ"),  # Doctor or PhD
            ("株", "株式会社"),  # Corporation
            ("有", "有限会社"),  # Limited company
            ("大学", "だいがく"),  # University
            ("先生", "せんせい"),  # Teacher/Professor/Master
            ("君", "くん")  # Used at the end of boys' names to express familiarity or affection.
        ]
    ],
}
246
+
247
+
248
def expand_abbreviations_multilingual(text, lang="en"):
    """Expand common abbreviations ("Mr." -> "mister", ...) for ``lang``.

    Languages without an entry in ``_abbreviations`` are returned unchanged
    instead of raising ``KeyError`` (the previous behavior for e.g. "hi").
    """
    for regex, replacement in _abbreviations.get(lang, []):
        text = re.sub(regex, replacement, text)
    return text
252
+
253
+
254
# List of (regular expression, replacement) pairs mapping spoken symbols
# ("&", "@", "%", "#", "$", "£", "°") to words, keyed by language code.
# Replacements are padded with spaces; expand_symbols_multilingual() collapses
# the resulting doubles.
_symbols_multilingual = {
    "en": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " and "),
            ("@", " at "),
            ("%", " percent "),
            ("#", " hash "),
            ("$", " dollar "),
            ("£", " pound "),
            ("°", " degree "),
        ]
    ],
    "es": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " y "),
            ("@", " arroba "),
            ("%", " por ciento "),
            ("#", " numeral "),
            ("$", " dolar "),
            ("£", " libra "),
            ("°", " grados "),
        ]
    ],
    "fr": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " et "),
            ("@", " arobase "),
            ("%", " pour cent "),
            ("#", " dièse "),
            ("$", " dollar "),
            ("£", " livre "),
            ("°", " degrés "),
        ]
    ],
    "de": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " und "),
            ("@", " at "),
            ("%", " prozent "),
            ("#", " raute "),
            ("$", " dollar "),
            ("£", " pfund "),
            ("°", " grad "),
        ]
    ],
    "pt": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " e "),
            ("@", " arroba "),
            ("%", " por cento "),
            ("#", " cardinal "),
            ("$", " dólar "),
            ("£", " libra "),
            ("°", " graus "),
        ]
    ],
    "it": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " e "),
            ("@", " chiocciola "),
            ("%", " per cento "),
            ("#", " cancelletto "),
            ("$", " dollaro "),
            ("£", " sterlina "),
            ("°", " gradi "),
        ]
    ],
    "pl": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " i "),
            ("@", " małpa "),
            ("%", " procent "),
            ("#", " krzyżyk "),
            ("$", " dolar "),
            ("£", " funt "),
            ("°", " stopnie "),
        ]
    ],
    "ar": [
        # Arabic
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " و "),
            ("@", " على "),
            ("%", " في المئة "),
            ("#", " رقم "),
            ("$", " دولار "),
            ("£", " جنيه "),
            ("°", " درجة "),
        ]
    ],
    "zh": [
        # Chinese
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " 和 "),
            ("@", " 在 "),
            ("%", " 百分之 "),
            ("#", " 号 "),
            ("$", " 美元 "),
            ("£", " 英镑 "),
            ("°", " 度 "),
        ]
    ],
    "cs": [
        # Czech
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " a "),
            ("@", " na "),
            ("%", " procento "),
            ("#", " křížek "),
            ("$", " dolar "),
            ("£", " libra "),
            ("°", " stupně "),
        ]
    ],
    "ru": [
        # Russian
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " и "),
            ("@", " собака "),
            ("%", " процентов "),
            ("#", " номер "),
            ("$", " доллар "),
            ("£", " фунт "),
            ("°", " градус "),
        ]
    ],
    "nl": [
        # Dutch
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " en "),
            ("@", " bij "),
            ("%", " procent "),
            ("#", " hekje "),
            ("$", " dollar "),
            ("£", " pond "),
            ("°", " graden "),
        ]
    ],
    "tr": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " ve "),
            ("@", " at "),
            ("%", " yüzde "),
            ("#", " diyez "),
            ("$", " dolar "),
            ("£", " sterlin "),
            ("°", " derece "),
        ]
    ],
    "hu": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " és "),
            ("@", " kukac "),
            ("%", " százalék "),
            ("#", " kettőskereszt "),
            ("$", " dollár "),
            ("£", " font "),
            ("°", " fok "),
        ]
    ],
    "ko": [
        # Korean
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " 그리고 "),
            ("@", " 에 "),
            ("%", " 퍼센트 "),
            ("#", " 번호 "),
            ("$", " 달러 "),
            ("£", " 파운드 "),
            ("°", " 도 "),
        ]
    ],
    "ja": [
        # NOTE: no IGNORECASE flag — Japanese has no letter case.
        (re.compile(r"%s" % re.escape(x[0])), x[1])
        for x in [
            ("&", " と "),
            ("@", " アットマーク "),
            ("%", " パーセント "),
            ("#", " ナンバー "),
            ("$", " ドル "),
            ("£", " ポンド "),
            ("°", " 度"),
        ]
    ],
}
454
+
455
+
456
def expand_symbols_multilingual(text, lang="en"):
    """Replace symbols ("&", "@", "%", ...) with their spoken form for ``lang``.

    Languages without an entry in ``_symbols_multilingual`` are returned
    unchanged (stripped) instead of raising ``KeyError``.
    """
    for regex, replacement in _symbols_multilingual.get(lang, []):
        text = re.sub(regex, replacement, text)
        # Replacements are space-padded; collapse the double spaces they can create.
        text = text.replace("  ", " ")
    return text.strip()
461
+
462
+
463
+ _ordinal_re = {
464
+ "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
465
+ "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
466
+ "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
467
+ "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
468
+ "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
469
+ "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
470
+ "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
471
+ "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
472
+ "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals.
473
+ "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
474
+ "nl": re.compile(r"([0-9]+)(de|ste|e)"),
475
+ "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
476
+ "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
477
+ "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
478
+ "ja": re.compile(r"([0-9]+)(番|回|つ|目|等|位)")
479
+ }
480
+ _number_re = re.compile(r"[0-9]+")
481
+ _currency_re = {
482
+ "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
483
+ "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
484
+ "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
485
+ }
486
+
487
+ _comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
488
+ _dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b")
489
+ _decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
490
+
491
+
492
+ def _remove_commas(m):
493
+ text = m.group(0)
494
+ if "," in text:
495
+ text = text.replace(",", "")
496
+ return text
497
+
498
+
499
+ def _remove_dots(m):
500
+ text = m.group(0)
501
+ if "." in text:
502
+ text = text.replace(".", "")
503
+ return text
504
+
505
+
506
def _expand_decimal_point(m, lang="en"):
    """Spell out a matched decimal number (group 1), e.g. "12.5" -> "twelve point five".

    Decimal commas are normalized to dots first; num2words uses the legacy
    "cz" code for Czech.
    """
    normalized = m.group(1).replace(",", ".")
    target = "cz" if lang == "cs" else lang
    return num2words(float(normalized), lang=target)
509
+
510
+
511
def _expand_currency(m, lang="en", currency="USD"):
    """Spell out a matched currency amount, e.g. "$20" -> "twenty dollars".

    The decimal comma is normalized to a dot and the currency symbol stripped
    before parsing.  NOTE: amounts with dot thousands-separators (e.g.
    "1.000,50") yield a string with two dots and make float() raise; the
    caller (expand_numbers_multilingual) catches this and leaves the text as-is.
    """
    amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
    # num2words uses the legacy "cz" code for Czech.
    full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz")

    # The separator num2words inserts between the whole part and the cents,
    # per language; used below to chop off a ", zero cents" style tail.
    and_equivalents = {
        "en": ", ",
        "es": " con ",
        "fr": " et ",
        "de": " und ",
        "pt": " e ",
        "it": " e ",
        "pl": ", ",
        "cs": ", ",
        "ru": ", ",
        "nl": ", ",
        "ar": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
    }

    if amount.is_integer():
        # Whole amounts: drop everything after the last separator (the cents part).
        last_and = full_amount.rfind(and_equivalents[lang])
        if last_and != -1:
            full_amount = full_amount[:last_and]

    return full_amount
538
+
539
+
540
def _expand_ordinal(m, lang="en"):
    """Spell out a matched ordinal (number in group 1), e.g. "3rd" -> "third"."""
    target = "cz" if lang == "cs" else lang
    return num2words(int(m.group(1)), ordinal=True, lang=target)
542
+
543
+
544
def _expand_number(m, lang="en"):
    """Spell out a matched cardinal integer, e.g. "50" -> "fifty"."""
    target = "cz" if lang == "cs" else lang
    return num2words(int(m.group(0)), lang=target)
546
+
547
+
548
def expand_numbers_multilingual(text, lang="en"):
    """Replace digits in ``text`` with their spoken form for ``lang``.

    Chinese goes through zh_num2words wholesale; all other languages strip
    thousands separators, then expand currencies (best-effort), decimals
    (except Turkish, where num2words decimals are unsupported), ordinals and
    plain cardinals, in that order.
    """
    if lang == "zh":
        return zh_num2words()(text)

    # Strip thousands separators so the number regexes see bare digits.
    if lang in ("en", "ru"):
        text = re.sub(_comma_number_re, _remove_commas, text)
    else:
        text = re.sub(_dot_number_re, _remove_dots, text)
    try:
        text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
        text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
        text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
    except Exception:
        # Best-effort: num2words lacks currency support for some languages
        # (and some amounts fail to parse); leave the text unchanged then.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit pass through.
        pass
    if lang != "tr":
        text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
    text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
    text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text
567
+
568
+
569
def lowercase(text):
    """Return ``text`` with every cased character lowercased."""
    return str.lower(text)
571
+
572
+
573
def collapse_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    return _whitespace_re.sub(" ", text)
575
+
576
+
577
def multilingual_cleaners(text, lang):
    """Full text-cleaning pipeline: strip quotes, lowercase, expand numbers,
    abbreviations and symbols, then collapse whitespace."""
    text = text.replace('"', "")
    if lang == "tr":
        # Map Turkish uppercase dotted/diacritic letters before lower().
        text = text.translate(str.maketrans("İÖÜ", "iöü"))
    text = lowercase(text)
    text = expand_numbers_multilingual(text, lang)
    text = expand_abbreviations_multilingual(text, lang)
    text = expand_symbols_multilingual(text, lang=lang)
    return collapse_whitespace(text)
589
+
590
+
591
def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    return collapse_whitespace(lowercase(text))
596
+
597
+
598
def chinese_transliterate(text):
    """Convert Chinese text to pinyin (TONE3 style: tone digits, neutral tone as 5)."""
    syllables = pypinyin.pinyin(
        text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True
    )
    return "".join(entry[0] for entry in syllables)
602
+
603
+
604
def japanese_cleaners(text, katsu):
    """Romanize Japanese text with the given cutlet instance, then lowercase it."""
    romanized = katsu.romaji(text)
    return lowercase(romanized)
608
+
609
+
610
def korean_transliterate(text):
    """Romanize Hangul text using the academic transliteration rule set."""
    transliter = Transliter(academic)
    return transliter.translit(text)
613
+
614
+
615
# Default BPE vocab file shipped with the repo, resolved relative to this module.
DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json")
616
+
617
+
618
class VoiceBpeTokenizer:
    """XTTS text tokenizer: cleans/romanizes text per language, then BPE-encodes it.

    Wraps a HuggingFace ``tokenizers.Tokenizer`` loaded from a vocab file and
    applies the module's language-specific cleaning pipeline before encoding.
    """

    def __init__(self, vocab_file=None):
        # Underlying BPE model; stays None until a vocab file is provided.
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        # Per-language input character limits; longer inputs only trigger a
        # warning (see check_input_length), they are not truncated here.
        self.char_limits = {
            "en": 250,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
        }

    @cached_property
    def katsu(self):
        """Lazily-constructed cutlet romanizer for Japanese (imported on first use)."""
        import cutlet

        return cutlet.Cutlet()

    def check_input_length(self, txt, lang):
        """Print a warning (do not raise) when ``txt`` exceeds the per-language limit."""
        lang = lang.split("-")[0]  # remove the region
        limit = self.char_limits.get(lang, 250)
        if len(txt) > limit:
            print(
                f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
            )

    def preprocess_text(self, txt, lang):
        """Apply language-specific cleaning/transliteration before encoding.

        Raises:
            NotImplementedError: for languages outside the supported set.
        """
        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
            txt = multilingual_cleaners(txt, lang)
            if lang == "zh":
                txt = chinese_transliterate(txt)
            if lang == "ko":
                txt = korean_transliterate(txt)
        elif lang == "ja":
            txt = japanese_cleaners(txt, self.katsu)
        elif lang == "hi":
            # @manmay will implement this
            txt = basic_cleaners(txt)
        else:
            raise NotImplementedError(f"Language '{lang}' is not supported.")
        return txt

    def encode(self, txt, lang):
        """Return BPE token ids for ``txt``, prefixed with a ``[lang]`` tag.

        Spaces are mapped to the ``[SPACE]`` token before encoding.
        """
        lang = lang.split("-")[0]  # remove the region
        self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        lang = "zh-cn" if lang == "zh" else lang  # the vocab tags Chinese as "zh-cn"
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def decode(self, seq):
        """Invert ``encode``: turn token ids (list or torch tensor) back into text."""
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        # Drop the spaces the BPE decoder inserts, then restore real ones from [SPACE].
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
        txt = txt.replace("[SPACE]", " ")
        txt = txt.replace("[STOP]", "")
        txt = txt.replace("[UNK]", "")
        return txt

    def __len__(self):
        # Size of the BPE vocabulary.
        return self.tokenizer.get_vocab_size()

    def get_number_tokens(self):
        # Highest token id + 1 (may differ from vocab size if ids are sparse).
        return max(self.tokenizer.get_vocab().values()) + 1
695
+
696
+
697
def test_expand_numbers_multilingual():
    """Self-test: (input, expected, lang) triples for expand_numbers_multilingual."""
    test_cases = [
        # English
        ("In 12.5 seconds.", "In twelve point five seconds.", "en"),
        ("There were 50 soldiers.", "There were fifty soldiers.", "en"),
        ("This is a 1st test", "This is a first test", "en"),
        ("That will be $20 sir.", "That will be twenty dollars sir.", "en"),
        ("That will be 20€ sir.", "That will be twenty euro sir.", "en"),
        ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"),
        ("That's 100,000.5.", "That's one hundred thousand point five.", "en"),
        # French
        ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"),
        ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"),
        ("Ceci est un 1er test", "Ceci est un premier test", "fr"),
        ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"),
        ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"),
        ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"),
        ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"),
        # German
        ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"),
        ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"),
        ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"),  # Issue with gender
        ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"),
        ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"),
        ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"),
        # Spanish
        ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"),
        ("Había 50 soldados.", "Había cincuenta soldados.", "es"),
        ("Este es un 1er test", "Este es un primero test", "es"),
        ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"),
        ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"),
        ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"),
        # Italian
        ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"),
        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"),
        ("Questo è un 1° test", "Questo è un primo test", "it"),
        ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"),
        ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"),
        ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"),
        # Portuguese
        ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"),
        ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"),
        ("Este é um 1º teste", "Este é um primeiro teste", "pt"),
        ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"),
        ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"),
        (
            "Isso custará 20,15€ senhor.",
            "Isso custará vinte euros e quinze cêntimos senhor.",
            "pt",
        ),  # "cêntimos" should be "centavos" num2words issue
        # Polish
        ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"),
        ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"),
        ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"),
        ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", "pl"),
        # Arabic
        ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"),
        ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"),
        # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words
        # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
        # Czech
        ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"),
        ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"),
        ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"),
        ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"),
        # Russian
        ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"),
        ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"),
        ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"),
        ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"),
        # Dutch
        ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"),
        ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"),
        ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
        ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
        # Chinese (Simplified)
        ("在12.5秒内", "在十二点五秒内", "zh"),
        ("有50名士兵", "有五十名士兵", "zh"),
        # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
        # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
        # Turkish
        # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
        ("50 asker vardı.", "elli asker vardı.", "tr"),
        ("Bu 1. test", "Bu birinci test", "tr"),
        # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
        # Hungarian
        ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"),
        ("50 katona volt.", "ötven katona volt.", "hu"),
        ("Ez az 1. teszt", "Ez az első teszt", "hu"),
        # Korean
        ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
        ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
        ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
    ]
    for a, b, lang in test_cases:
        out = expand_numbers_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
794
+
795
+
796
def test_abbreviations_multilingual():
    """Self-test: (input, expected, lang) triples for expand_abbreviations_multilingual."""
    test_cases = [
        # English
        ("Hello Mr. Smith.", "Hello mister Smith.", "en"),
        ("Dr. Jones is here.", "doctor Jones is here.", "en"),
        # Spanish
        ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"),
        ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"),
        # French
        ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"),
        ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"),
        # German
        ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"),
        # Portuguese
        ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"),
        ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"),
        # Italian
        ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"),
        # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
        # Polish
        ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"),
        ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"),
        # Czech
        ("P. Novák", "pan Novák", "cs"),
        ("Dr. Vojtěch", "doktor Vojtěch", "cs"),
        # Dutch
        ("Dhr. Jansen", "de heer Jansen", "nl"),
        ("Mevr. de Vries", "mevrouw de Vries", "nl"),
        # Russian
        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"),
        ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"),
        # Turkish
        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"),
        ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"),
        # Hungarian
        ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"),
    ]

    for a, b, lang in test_cases:
        out = expand_abbreviations_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
837
+
838
+
839
def test_symbols_multilingual():
    """Self-test: (input, expected, lang) triples for expand_symbols_multilingual."""
    test_cases = [
        ("I have 14% battery", "I have 14 percent battery", "en"),
        ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
        ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
        ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
        ("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"),
        ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"),
        ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
        ("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
        ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
        ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
        ("Я буду @ дома", "Я буду собака дома", "ru"),
        ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
        ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
        ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
        ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
        ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
    ]

    for a, b, lang in test_cases:
        out = expand_symbols_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
864
+
865
+
866
if __name__ == "__main__":
    # Run the module's self-tests for number, abbreviation and symbol expansion.
    test_expand_numbers_multilingual()
    test_abbreviations_multilingual()
    test_symbols_multilingual()
xtts_demo.py ADDED
@@ -0,0 +1,1213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import shutil
8
+ import glob
9
+
10
+ import gradio as gr
11
+ import librosa.display
12
+ import numpy as np
13
+
14
+ import torch
15
+ import torchaudio
16
+ import traceback
17
+ from utils.formatter import format_audio_list,find_latest_best_model, list_audios
18
+ from utils.gpt_train import train_gpt
19
+
20
+ from faster_whisper import WhisperModel
21
+
22
+ from TTS.tts.configs.xtts_config import XttsConfig
23
+ from TTS.tts.models.xtts import Xtts
24
+
25
+ import requests
26
+
27
def download_file(url, destination):
    """Download *url* to *destination* in 8 KiB streamed chunks.

    Args:
        url: HTTP(S) URL to fetch.
        destination: local file path to write.

    Returns:
        The destination path on success, or None on any failure
        (connection error, HTTP error status, or I/O error).
    """
    try:
        # Stream so large checkpoints are never held fully in memory.
        # The context manager closes the connection (the original leaked it),
        # and the timeout prevents hanging forever on a dead server.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(destination, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
        print(f"Downloaded file to {destination}")
        return destination
    except Exception as e:
        # Best-effort: callers treat None as "download failed".
        print(f"Failed to download the file: {e}")
        return None
39
+
40
+ # Clear logs
41
def remove_log_file(file_path):
    """Delete the log file at *file_path*; silently do nothing if absent."""
    log_path = Path(file_path)
    # is_file() already returns False for a missing path, so one check suffices.
    if log_path.is_file():
        log_path.unlink()
46
+
47
+ # remove_log_file(str(Path.cwd() / "log.out"))
48
+
49
def clear_gpu_cache():
    """Release cached CUDA memory, if a GPU is available; no-op on CPU."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
53
+
54
# Module-level handle to the currently loaded XTTS model.
# None until load_model() succeeds; read by the run_tts* functions.
XTTS_MODEL = None
55
+
56
def create_zip(folder_path, zip_name):
    """Archive *folder_path* as <tempdir>/<zip_name>.zip and return that path."""
    # make_archive wants the base name WITHOUT the .zip suffix and adds it.
    archive_base = os.path.join(tempfile.gettempdir(), zip_name)
    shutil.make_archive(archive_base, "zip", folder_path)
    return archive_base + ".zip"
60
+
61
def get_model_zip(out_path):
    """Zip the optimized model under <out_path>/ready; None if it doesn't exist."""
    ready_dir = os.path.join(out_path, "ready")
    if not os.path.exists(ready_dir):
        return None
    return create_zip(ready_dir, "optimized_model")
66
+
67
def get_dataset_zip(out_path):
    """Zip the prepared dataset under <out_path>/dataset; None if it doesn't exist."""
    dataset_dir = os.path.join(out_path, "dataset")
    if not os.path.exists(dataset_dir):
        return None
    return create_zip(dataset_dir, "dataset")
72
+
73
def load_model(xtts_checkpoint, xtts_config, xtts_vocab, xtts_speaker):
    """Load a fine-tuned XTTS checkpoint into the global XTTS_MODEL.

    Args:
        xtts_checkpoint: path to the model checkpoint file.
        xtts_config: path to the XTTS JSON config.
        xtts_vocab: path to the tokenizer vocab file.
        xtts_speaker: path to the speaker embeddings file.

    Returns:
        A status string for the UI.
    """
    global XTTS_MODEL
    clear_gpu_cache()
    # All three core paths must be set before we attempt to load anything.
    if not (xtts_checkpoint and xtts_config and xtts_vocab):
        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
    config = XttsConfig()
    config.load_json(xtts_config)
    XTTS_MODEL = Xtts.init_from_config(config)
    print("Loading XTTS model! ")
    XTTS_MODEL.load_checkpoint(
        config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        speaker_file_path=xtts_speaker,
        use_deepspeed=False,
    )
    if torch.cuda.is_available():
        XTTS_MODEL.cuda()

    print("Model Loaded!")
    return "Model Loaded!"
88
+
89
def run_tts0(selected_language, lang, tts_text, speaker_audio_file, temperature, length_penalty, repetition_penalty, top_k, top_p, sentence_split, use_config):
    """Synthesize speech using a pre-bundled reference voice.

    The reference clip is resolved to /tmp/Voice/<language-code>/<speaker>.mp3,
    where the folder code comes from load_text_langs(selected_language).

    Returns:
        (status message, path to generated wav, path to reference audio),
        or (error message, None, None) when no model is loaded.
    """
    if XTTS_MODEL is None or not speaker_audio_file:
        return "You need to run the previous step to load the model !!", None, None

    selected_speaker = speaker_audio_file
    selec_languaje = load_text_langs(selected_language)

    # Reference audio shipped with the app, organised per language folder.
    speaker_audio_path = f"/tmp/Voice/{selec_languaje}/{selected_speaker}.mp3"

    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_path,
        gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
        max_ref_length=XTTS_MODEL.config.max_ref_len,
        sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
    )

    # Only the sampling parameters differ between "use the model-config
    # defaults" and "use the UI sliders" — pick them once, call inference once
    # (the original duplicated the whole inference call in both branches).
    if use_config:
        cfg = XTTS_MODEL.config
        sampling = dict(
            temperature=cfg.temperature,
            length_penalty=cfg.length_penalty,
            repetition_penalty=cfg.repetition_penalty,
            top_k=cfg.top_k,
            top_p=cfg.top_p,
            enable_text_splitting=True,
        )
    else:
        sampling = dict(
            temperature=temperature,
            length_penalty=length_penalty,
            repetition_penalty=float(repetition_penalty),
            top_k=top_k,
            top_p=top_p,
            enable_text_splitting=sentence_split,
        )

    out = XTTS_MODEL.inference(
        text=tts_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        **sampling,
    )

    # Persist the waveform to a temp .wav kept on disk (delete=False) so the
    # UI can serve it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out_path = fp.name
    torchaudio.save(out_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)

    return "Speech generated !", out_path, speaker_audio_path
134
+
135
def run_tts(lang, tts_text, speaker_audio_file, temperature, length_penalty, repetition_penalty, top_k, top_p, sentence_split, use_config):
    """Synthesize speech using a user-supplied reference audio file.

    Args:
        lang: XTTS language code for the text.
        tts_text: text to synthesize.
        speaker_audio_file: path to the reference voice clip.
        temperature/length_penalty/repetition_penalty/top_k/top_p: sampling
            controls, used only when use_config is falsy.
        sentence_split: enable text splitting when use_config is falsy.
        use_config: take sampling parameters from the model config instead.

    Returns:
        (status message, path to generated wav, reference audio path),
        or (error message, None, None) when no model is loaded.
    """
    if XTTS_MODEL is None or not speaker_audio_file:
        return "You need to run the previous step to load the model !!", None, None

    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_file,
        gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
        max_ref_length=XTTS_MODEL.config.max_ref_len,
        sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
    )

    # Single inference call; the original duplicated it verbatim in both
    # branches, so only the sampling kwargs are branch-dependent here.
    if use_config:
        cfg = XTTS_MODEL.config
        sampling = dict(
            temperature=cfg.temperature,
            length_penalty=cfg.length_penalty,
            repetition_penalty=cfg.repetition_penalty,
            top_k=cfg.top_k,
            top_p=cfg.top_p,
            enable_text_splitting=True,
        )
    else:
        sampling = dict(
            temperature=temperature,
            length_penalty=length_penalty,
            repetition_penalty=float(repetition_penalty),
            top_k=top_k,
            top_p=top_p,
            enable_text_splitting=sentence_split,
        )

    out = XTTS_MODEL.inference(
        text=tts_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        **sampling,
    )

    # Persist the waveform to a temp .wav kept on disk (delete=False) so the
    # UI can serve it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out_path = fp.name
    torchaudio.save(out_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)

    return "Speech generated !", out_path, speaker_audio_file
174
+
175
+
176
+ # Diccionario de idiomas y sus códigos
177
# Maps UI language display names to internal language/voice-folder codes.
# NOTE: "Voices Legacy" is a pseudo-entry ("show_legacy") that switches the
# UI to the legacy voice list rather than selecting a language, and the
# "Select language" placeholder defaults to Spanish ("es").
leng_and_ids = {
    "Select language": "es",
    "Voices Legacy": "show_legacy",
    "Arabic": "ar",
    "Bulgarian": "bg",
    "Chinese": "zh",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English-1": "en1",
    "English-2": "en2",
    "Finnish": "fi",
    "French": "fr",
    "German": "de",
    "Greek": "el",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Italian": "it",
    "Japanese": "ja",
    "Korean": "ko",
    "Norwegian": "no",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Slovak": "sk",
    "Spanish": "es",
    "Swedish": "sv",
    "Tamil": "ta",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Vietnamese": "vi"
}
212
+
213
# Per-language speaker name lists (each name corresponds to a reference
# .mp3 under /tmp/Voice/<code>/ used by run_tts0).
# Shown when the "Voices Legacy" pseudo-language is selected in the UI.
show_legacy = ['Adam', 'Alice', 'Antoni', 'Aria', 'Arnold', 'Bill', 'Brian', 'Callum', 'Charlie', 'Charlotte', 'Chris', 'Clyde', 'Daniel', 'Dave', 'David_Martin._1', 'Domi', 'Dorothy', 'Drew', 'Elli', 'Emily', 'Eric', 'Ethan', 'Fin', 'Freya', 'George', 'Gigi', 'Giovanni', 'Glinda', 'Grace', 'Harry', 'James', 'Jeremy', 'Jessica', 'Jessie', 'Joseph', 'Josh', 'Laura', 'Liam', 'Lily', 'Matilda', 'Michael', 'Mimi', 'Nicole', 'Patrick', 'Paul', 'Rachel', 'River', 'Roger', 'Sam', 'Sarah', 'Serena', 'Thomas', 'Will']
+
216
+ arabic_names = ['Amr', 'Anas', 'HMIDA', 'Hamid', 'Haytham', 'Haytham_-_Conversation', 'Jafar_-_Deep_Narrator', 'Mo_Wiseman', 'Mona', 'Mourad_Sami', 'Raed', 'Sana', 'Wahab_Arabic']
217
+ bulgarian_names = ['Elena', 'Julian']
218
+ chinese_names = ['Coco_Li', 'Karo_Yang', 'Liang', 'Martin_Li', 'Maya_-_Young__Calm', 'ShanShan_-_Young_Energetic_Female', 'Stacy_-_Sweet_and_Cute_Chinese', 'YT']
219
+ croatian_names = ['Ivan', 'Luka_-_Narration', 'Maja', 'Slobodan']
220
+ czech_names_names = ['Anet', 'Hana_-_CZ', 'Hanka_beta', 'Jan', 'Jan_-_kind__gentle', 'Jiri', 'Ondřej_–_vypravěč', 'Pawel_TV™️_-_High_Quality_', 'Petr_Sovadina', 'Tony']
221
+ danish_names = ['Christian_-_Danish_calm_voice', 'Constantin_Birkedal', 'Mathias_-_Storyteller', 'Peter_-___Readings__Presentations', 'Sissel', 'Thomas_Hansen']
222
+
223
+ dutch_names = ['Arno_Drost', 'Bart', 'Daniel_van_der_Meer_', 'Jaimie_from_the_Netherlands_-_Dutch_Amsterdam_Voiceover_-_Young_Male_Age_30_', 'Richard', 'Serge_de_Beer_Pro1', 'Tijs']
224
+
225
+ finnish_names = ['Christoffer_Satu']
226
+
227
+ french_names = ['Adina_-_French_teenager', 'Adrien_Piret', 'Alexandre_Boutin_-_French_Canadian', 'Audiobooks_Lady', 'Audrey', 'Camille_Martin', 'Christophe_Géradon_Belge', 'Christophe_M', 'Claire', 'Coco_-_French_-_for_E-learning_and_Tutorial', 'Corentin', 'Cyril_-_Narration__Audiobook', 'Darine_-_Narration', 'Dave_-_Pro_Narrative', 'David', 'Denis_Landrieu', 'Emilie_Lacroix', 'Eric', 'Franck_de_France', 'Frédéric_-__French_Narration', 'Gaétan_L-Pro_French_Warm_Calm_Clear_Voice_Reader_conditions', 'Guillaume_-_French_voice_-_Narration_and_Voiceover', 'Guillaume_-_Narration', 'Haseeb_-_Canadian_French', 'Hélène', 'JaySoft', 'Jean_Petit_-_jeune', 'Jeanne_-_Professional_and_captivating_voice', 'Kevin_histoire_V2', 'Laurence_-_Class__Mature', 'Liam_-_Sharp__Pro', 'Louis_Boutin', 'Lucie', 'Lucien', 'Ludovic', 'Léo_-_Quebec_French', 'Léo_Latti', 'Mademoiselle_French_-_For_Conversational', 'Mademoiselle_French_-_for_Institutional_Video', 'Manuel_Formateur_-_Français', 'Martin_Dupont_Aimable', 'Martin_Dupont_Intime', 'Martin_Dupont_Profond', 'Mat', 'Mathieu_-_French_voice_-_Narration', 'Maxime_-_French_Young_Male', 'Maxime_Lavaud_-_French_young_man', 'Maxime_Lavaud_-_French_young_man_', 'Michel', 'Miss_French_-_For_Audiobook', 'Miss_French_Papote', 'Miss_Radio', 'Nicolas_-_Narration', 'Nicolas_Petit', 'Nicolas_Petit_-_Deep_voice_narration_', 'Nicolas_animateur_', 'Olivier_Calm', 'Patrick_-_Québec_Canada', 'Peter_-_Engaging_friendly_young_adult_male_voice', 'Romain_-_Lecture', 'Sam_French', 'SkaraB', 'Sophie_-_Pro_Audiobook', 'Sébastien_-_French_Male', 'Theo_-_Smart_warm_open', 'Ulys_-_Young__Energetic', 'Vincent_FR', 'Voix_Nicolas_Petit_ton_Animateur_Radio', 'Voix_grand_père']
228
+
229
+ german_names = ['Aaron', 'Albert_-_Funny_Cartoon_Character', 'Aleks', 'Alessandro_Devigus', 'Alex_-__Professional_German_Male_Voiceover', 'Amadeus', 'Ana', 'Ana_-_Novel_Audiobook', 'Andi_Brewi_-_Moderator_advertising_spokesperson', 'Andreas_-_Clear_German', 'Andreas_-_Deep_German_Voice', 'Annika', 'Anton_Dark_Magic_-_Thriller_-_True_Crime', 'Antonia_Konstanz_-_German_Native', 'Apollo_-_Documentary__TV_Voice', 'Ava_-_youthful_and_expressive_German_female_voice', 'Bartholomeus_Bösewicht_-_Grim_and_Gruesome', 'Ben', 'Ben_Hoffmann_-_German_Ads__Trailers', 'Carlos_-_der_Spanier', 'Carola_Ferstl_Nachrichten', 'Christian', 'Christian_Ehler', 'Christian_Kinderbuch', 'Clemens_Hartmann_-_The_Berlin_Voice', 'Clemens_Hartmann_2_-_for_Ads__Trailers', 'Clemens_Hartmann_3_-_The_Narrator', 'Cornelia_', 'Daniel_DaFraVe', 'Daniel_DaFraVe_Whisper._ASMR._Meditation._Relaxing', 'David_-_Serious_voice_for_narration_and_stories', 'Der_Beamte', 'Dimawalker_', 'Dimi', 'Dirk', 'Elias_-_Radio_Host__Radio_News_Presenter_Voice', 'Elias_-_Social_Media_Podcasts_Conversations__Discussions', 'Emilia_-_German_narrator', 'Fabian', 'Felix_-_Smooth_German_Chaos', 'Felix_-_Soft_Deep_German_Narration_Voice', 'Felix_Gebhardt_-_authentisch_und_berührend_Podcast_Hörbuch_Radio', 'Finnegan_Fairytale_-_Exciting_Childrens_Stories', 'Flauschi', 'Frederick', 'Frederick_-_Calm_Meditation_Deutsch', 'Frederick_-_Calm_and_Soothing_Meditation', 'Frederick_-_Friendly__helpful_', 'Frederick_-_Old_Gnarly_Narrator', 'German_Daniel', 'German_Michael_-_Loud_Clear__Striking', 'German_Voice', 'Grandpa_Georg_-_Funny_and_Gruff', 'Günther_Goodnight_-_Relaxed_and_Slow', 'Hans_Kraft', 'Heidi_factual_Standard_German_-_with_Swiss_Accent', 'Helmut_Schwarz', 'Herr_Gruber', 'Horvath_aus_Wien', 'Isabell', 'Jan', 'Jean_Art', 'Jesper', 'Johannes_-_Documentary_film', 'Jonas', 'Juan_Schubert', 'Julia', 'Julian_-_German_Explainer_Voice', 'Julius', 'Kris_Klingenberg', 'Kurt_-_Calm', 'Lana_Weiss_-_Meditation', 'Lea', 
'Lena_-_Cute_German_Voice', 'Leo_liest', 'Leo_liest_tief', 'Leon_Stern_-_Fiction__Fantasy_', 'Leonie', 'Lex_Mystery', 'Lorenz', 'Louisa_', 'Luisa', 'Lukas_Harmony', 'Manuel_-_Your_Narrator_and_Storyteller', 'Marc', 'Marc_Weber_-_Non-fiction_books_', 'Marcel__Male__Audiobook__Tutorial__Trainings_GERMAN', 'Marco_-_Gentle_German_ASMR_Narrator', 'Marcus_KvE_–_German_Voice_Over', 'Marie_-_German_Frenchwoman', 'Marko_-_German_Male_Deep_Voice', 'Markus', 'Martin_History', 'Martin_Jung', 'Martin_R._Pro', 'Max_Mustermann_-_Ernst', 'Meine_Lesestimme', 'Michael', 'Mila', 'Nader', 'Narrator_Markus', 'Niander_Wallace_', 'Otto', 'Patrick_-_German_speaker', 'Peter_Hartlapp_-_Voiceactor_Werbesprecher_und_Moderator', 'Peter_Meta_Business_Twin', 'Petra_PeFraVe_Pro_', 'Petra_PeFraVe__-_Funny', 'Phil_-_Fantasy__Thriller', 'Philipp_-_Male_with_standard_accent', 'Prinz_Pricklig_-_Whispering_Sparkling_and_Crisp_', 'Rafi_Biber', 'Reeloverlay', 'Rob', 'Robby_-_Audio_books_Speeches__Stories', 'Robert_dein_freundlicher_Assistent', 'Robert_erklaert_mit_Betonung', 'Robert_hypnotisiert_entspannte_Meditation', 'Samer', 'Sammy_Zimmermanns', 'Sascha_Pro_', 'Sebastian_Thomas', 'Stefan_Rank_der_Erzähler_Radio-Moderator', 'Susi', 'Sympathische_Stimme', 'Thomas_-_The_pragmatist', 'Timo', 'Tom_-_Deep_German_Voice', 'Tom_Magic', 'Tommy_Studio_Voice_2', 'Torsten_-_Raspy_Charmer', 'Tristan_Medersburg_-_Trustworthy_Deepness', 'Vali_-_Young_man_with_a_bass-heavy_voice', 'Vincent_-_Factual', 'Willi_-_Professional_German_Narrator']
230
+
231
+ greek_names = ['Agapi', 'Fatsis_', 'Giassiranis_Dimitrios', 'Kyriakos', 'Niki_-_native_Greek_female_', 'Niki_2_-_native_Greek_female', 'Niki_3_-_native_Greek_female', 'Stefanos_-_Calm_youthful_and_casual', 'Takis_-_native_Greek_male']
232
+
233
+ hindi_names = ['Aaditya_Kapur_-_Calm_Conversational_Hindi_Voice', 'Aakash_Aryan_-_Conversational_Voice', 'Amit_Gupta', 'Anand_-_Storytelling_and_Narration_Hindi', 'Anoop', 'Ayesha_-_Energetic_Hindi_Voice', 'Bobby_', 'Danish_Khan_-_Expressive_Old_Voice', 'Devi_-_Clear_Hindi_pronunciation', 'Faiq_-_Standard_Hindi', 'God', 'Guru_-_Rich_Bass_Hindi_Voice', 'Ishika_Singh_-__Storytelling_and_Narration_Hindi', 'Janvi_-_Expressive_Indian_Voice_', 'Jitu', 'John_-_Confident_and_Deep', 'Kaaya_-_Gentle_Hindi_', 'Kanika_-_Relatable_Hindi_Voice', 'Krishna_-_Energetic_Hindi_Voice', 'Kunal_Agarwal', 'Leo_-_Energetic_Hindi_Voice_', 'Luv_-_Hindi_Storytelling_Voice', 'Manu_-_Smooth_Modulated_Voice', 'Monika_Sogam_-_Hindi_Modulated', 'Muskaan_-_Casual_Hindi_Voice', 'Natasha_-_Energetic_Hindi_Voice', 'Neel_-_Expressive_Narrator', 'Nikita_-_Youthful_Hindi_Voice', 'Nipunn_-_Deep_Hindi_voice', 'Niraj_-_Hindi_Narrator', 'P_K_Anil_-_Clear_Hindi', 'Parmeshwar_परमेश्वर', 'Parveen_-_Hindi', 'Pratima_-_Casual_Hindi_Conversational_Voice', 'Prem_-_Connectable_Hindi_Voice', 'Priya', 'Raju_-_Relatable_Hindi_Voice', 'Ranbir_Merchant_-_Deep_Engaging_Hindi_Voice', 'Ranga_-_Authoritative_and_Deep_Hindi_Voice', 'Reva_-_Familiar_Hindi_Voice', 'Riya_K._Rao_-_Hindi_Conversational_Voice', 'Ruhaan_-_Clean_Hindi_Narration_Voice', 'Saanu_-_Soft_and_Calm', 'Sachin_-_Deep_and_thoughtful', 'Saira_-_Young_Casual_Voice', 'Samads_Realistic_Voice', 'Shakuntala_-_Expressive_Indian_Voice', 'Shrey_-_Deep_Hindi_Voice', 'Sohaib_Jasra_', 'Sonu_Indian_Male', 'Suhaan_-_Delhi_Guy', 'Sweetie', 'Vihan_Ahuja_-_Friendly_Hindi_Voice', 'Yash_A_Malhotra_-_Warm__Friendly_Hindi_Voice', 'Zadok_-_Good_for_character']
234
+
235
+ hungarian_names = ['Magyar_Férfi_-_Hungarian_Male', 'Susanna_Rutkai']
236
+
237
+ indonesian_names = ['Abyasa', 'Andi', 'Andra', 'Bambang__', 'Bee_Ard_-_Clear_Dynamic_Voice', 'Blasto', 'Hendro_Atmoko', 'Jin', 'Mahaputra', 'Meraki_female_Indonesian_voice', 'Miz', 'Pramoedya_Chandra', 'Pratama', 'Putra', 'Suara_narasi', 'Tri_Nugraha_Ramadhani', 'Zephlyn']
238
+
239
+ italian_names = ['Aaron', 'Alessandro', 'Alessio_-_positive_and_professional', 'Andrea_Loco', 'Anna', 'Antonio_Farina_-_Italian_PRO_Talent_-_Audiobook_Narration', 'Carmelo_La_Rosa_-_Italian_Pro_Talent_e-learning_news_webinar_istitutional.', 'Chris_Basetta_-_Audio_Books', 'Chris_Basetta_-_Social_Media', 'Dante_-_Italian_30_years_old', 'Emanuel', 'Eray_Rio·Sae', 'Fabi', 'Francesco', 'Francesco_-_Narrative', 'Francesco_-_Premium', 'Gabriele', 'Germano_Carella', 'GianP_-_Edu_-_Clear__Upbeat', 'GianP_-_Narrative_Storytelling', 'GianP_-_News_Info_and_Documentary', 'GianP_-_Social_Media__Ads', 'Gianluigi_Toso', 'Giovanni_Rossi_-_giovane', 'Giulia_-_sweet_and_soothing', 'Gus_-_Deep_and_Pleasant', 'Kina_-_Cute_happy_girl', 'Leandro_', 'Linda_Fiore', 'Luca', 'Luca_Brasi_Gentile', 'Luca_Brasi_Intimo', 'Luca_Brasi_Profondo', 'Luna', 'Marcello_Lares_-_Soothing_Narrator', 'Marco', 'MarcoTrox_-_Italian_Pro_Voice_Actor_-_Storytelling_Audiobooks_Narration.', 'MarcoTrox_-_Italian_Professional_Voice_Talent', 'Marco_Pro', 'MrVibes', 'Nicola_Lorusso_-_Italian_Pro_-_Storytelling_Audiobooks_Narration.', 'Oceano_-_A_very_young_narrator', 'Pietro_-_Crazy_Character_Narrator', 'RenzoTech_', 'Stefano', 'Stefano_Becciolini_1']
240
+
241
+ japanese_names = ['Asahi_-_Japanese_male', 'Ena_', 'Hinata', 'Hiro_Satake', 'Ichiro', 'Ishibashi_-_Strong_Japanese_Male_Voice', 'Junichi', 'Ken', 'Ken_-_Japanese_male', 'Kozy_Male_Japanese_Narrative_Voice_-_Tokyo_Standard_Accent', 'Morioki', 'Otani', 'Sakura_Suzuki', 'Shoki']
242
+
243
+ korean_names = ['Anna_Kim', 'Bin', 'ChulSu', 'Do_Hyeon', 'Funny_Jackie_Lee', 'HYUK_', 'Hyuk', 'Hyun_Bin', 'Jaedong_Ahn', 'Jina', 'Jung_-_Narrative', 'KKC', 'KKC_-_Guided_Meditation__Narration', 'Kyungduk_Ko', 'Man_Bo', 'Min_ho']
244
+
245
+ norwegian_names = ['Johannes_-_Norwegian_-_Upbeat', 'Mia_Starset']
246
+
247
+ polish_names = ['Adam_-_Polish_narrator', 'Adygeusz', 'Aneta_-_Loud_and_confident_voice', 'Ave_Cezar', 'Bart', 'Bea', 'Damian_PL_', 'Daniel', 'Dawid_PL', 'Ignacius', 'James_-_Narrative__Story', 'Jerzy', 'Krzysztof_PL', 'Lena_Suzuki', 'Maciej', 'Maciek', 'Mark_-_Polish', 'Martin', 'MePolish', 'Mr_Lucas_', 'Oliver_Brown', 'Pawel_Pro_-_Polish', 'Piotrek_Pro', 'Pixi', 'Robert', 'Robert_Rob']
248
+
249
+ portuguese_names = ['Adriano_-_Narrador3', 'Adriano_-_Narrator', 'Adriano_-_Narrator2', 'Alcione', 'Ale_Garcia', 'Ana_-_Brazilian', 'Ana_Dias', 'Andreia_I.', 'Bia_-_Brazilian', 'Brazilian_Dudy', 'Conrado_Bueno', 'Daiane_Candido', 'Daniel_Dan', 'Davi', 'Dhyogo_Azevedo', 'Diego', 'Eddie_Barroso_-_Brazilian', 'Edna_E.', 'FMDAmbrosio', 'FRANCISCO_IA', 'Fabio_Filho', 'Flavio_Francisco_-_Narrative_-_Brazilian_Portuguese', 'Gabby', 'Gilson_Lima', 'Gustavo_Barros', 'Gustavo_Jannuzzi_', 'Gustavo_Sancho', 'Higor_Bourges', 'Hugo_Mendonça', 'João_Pedro', 'Juliana_Barbieri', 'Keren_-_Young_Brazilian_Female', 'Klaus_-__Young_Brazilian_Professional_Narrator', 'Kuhcsal', 'Lax', 'Leonardo_Hamaral', 'Locução_para_Propaganda', 'Luka', 'Marcelo_Costa_Brasileiro', 'Matheus_-_Energic_Young_Voice', 'Michele_-_Brazilian', 'Muhammad_Umm', 'Oliveir4_Music', 'Onildo_F._Rocha', 'Otto_de_La_Luna', 'Papai_Noel_', 'Rafael_Valente_-_Brazilian_Professional_Narrator', 'Rener', 'Roberto_Barbieri', 'Rodrigo_Rodrigues', 'Samuel_-_Jovem_Empreendedor', 'ScheilaSMTy', 'Slany', 'Thiago_Realista', 'Vagner_De_souza', 'Vinicius_Bergamo', 'Wesley_Bessa_', 'Weverton_', 'Will_-_Deep']
250
+
251
+ romanian_names = ['Andrei', 'Antonia', 'Apeiron', 'Ciprian_Pop', 'Corina_Ioana', 'Cristi_Romana', 'Cristina_Amza', 'Jora_Slobod', 'Liviu_Mihai', '_Bogdan_-_Advertising']
252
+
253
+ russian_names = ['Aleksandr_Petrov', 'Andrei_-_Calm_and_Friendly', 'Anna_-_Calm_and_pleasant_', 'Artem_K', 'Artemii_Levkoy', 'Dimitri', 'Dmitry', 'Felix_-_calm_friendly', 'Larisa_Actrisa', 'Marat', 'Mark_Rozenberg', 'Max_-_Clear__Professional', 'Nadia', 'Nikolay', 'Oleg_Krugliak_', 'Oleksandr_Trotsenko', 'Ranger3D.pro', 'Tyler_Soapen', 'Viktoriia_-_clear_resonant_young_female_voice']
254
+
255
+ slovak_names = ['Andrej']
256
+
257
+ spanish_names = ['AF', 'Alberto_Rodriguez', 'Alejandro_-_Mexican_male', 'Alejandro_Aragon', 'Alejandro_Ballesteros', 'Alejandro_Durán', 'Alex_-_Happy_Upbeat_Joyful_Energetic', 'Alex_Comunicando', 'Andrea', 'Andrew_V.', 'Andromeda_Thunders', 'Angie_vendedora_Colombiana', 'Ani_Egea', 'Ani_Egea_-_Expressive', 'Antonio_LV', 'Antonio_ia', 'Apex_-_Fitness_-_Nutrition_-_Coach_-_Energetic_-_Professional', 'ArthisRap_Pro', 'Ashley_Travels-_American_English_Tourist_speaking_Spanish_', 'Bardo_Limon_-_Epic_Promotional_Voice', 'Bebe_Lunita_-_Bebe_hablando', 'Beto_-_Latin_American_Spanish_Argentina', 'Bruno_-_Suspense_-_Thrill_-_Horror_-_Tense', 'Brêchet_Simon', 'CRISTINA_VOICE', 'Carles_Pujol', 'Carlos_-_Podcasting__News', 'Carmelo', 'Carmelo_Crespo', 'Carmelo_Crespo_-_Expressive', 'Carolina_-_Spanish_woman_-_es_ES', 'Christian_Avilés_-_documentales_e-learning_corporativos_y_Redes_Sociales', 'Claudia_Whispers-_Asmr_Spanish_Intimate', 'Cristi_Poot', 'Cristian_Medina', 'Damian_Valdez', 'Dan_Dan', 'Dante_-_Castilian_Spanish', 'Dany_-_Professional_narrator', 'David_Martin._1', 'David_Martin_2', 'Denilson', 'Didak_Leñero__Spanish_Spain', 'Diego_Aguado_-_Spanish_deep_voice', 'Diego_Cárdenas', 'Diego_Galán', 'Dominican_', 'Dosi_Español', 'EDGARD', 'Eduardo_-_Advertising__Commercial_voice_in_Spanish', 'Eduardo_M._-_Mexican_Spanish', 'Eduardo_Román', 'Efrayn', 'Eleguar_-_Latin_American_Spanish', 'Eleguar_-__Deep_Latin_American_Spanish', 'Emiliano_Zamora', 'Emilio_Menal', 'Enrico', 'Enrique_M._Nieto', 'Enrique_Mondragón', 'Erika_-_Raspy_and_Pleasant', 'Eva_Dorado', 'FantasyCraft_Studios', 'Fer', 'Fernanda_olea_1', 'Fernando', 'Fernando_Martinez', 'Firusho', 'Francisco', 'Frankie_San_Juan', 'Gabriela_-_Spanish_from_Mexico_', 'Gabriela_Gonzalez_', 'Gilfoy', 'Ginyin', 'Ginyin_2_-_Webpages_Narrative__Books', 'Grandma_Titina_-_70_year_old_woman', 'Guillermo_Brazález', 'Guillermo_Brazález_-_Dynamic__Cheerful', 'Haroldo_', 'Hernán_Cortés', 'Isabela_-_Spanish_Childrens_Book_Narrator', 
'Jacson_Ander', 'Jaime_Fregoso_-_Professional_Annoucer', 'Jaime_Tu_Locutor_Online', 'Jarpa_Test_-_Francisco', 'Jav_-_Calm_clean_and_profound_voice', 'Javier_España', 'Javier_Madrid', 'Javisanchez', 'JeiJo_', 'Jhenny_-_Warm_Fluid_and_Smooth', 'Jhenny_Antiques_-_Calm_Soft_and_Sweet', 'Jonathan', 'Jorge', 'Jorge_Gaviria_-_Powerful_and_impactful', 'Jorge_Mario_-_Spanish_to_read_books_and_narration', 'Jose_A._del_Rio', 'José_Borda', 'José_Borda_-_Deep', 'José_Borda_-_Expressive', 'Juan', 'Juan_Carlos', 'Juan_Manuel', 'Juan_Manuel_-_Conversational', 'Juan_Pablo', 'Kiko_Hdz', 'Knight_JAVIER-Calm_Gentle', 'Lalo', 'Leo_-_Energetic_Warm_Happy_Upbeat_Inviting_Optimistic', 'Leo_Kid_Spanish-_Character', 'Leonardo', 'Ligia_Elena', 'Ligia_Mendez', 'LoidaBurgos', 'Luis', 'Luis_Guary', 'Luis_R_Casiano', 'Luis_Vega', 'Lumina_-_Clara__Natural', 'Maicolangel', 'Malena_Tango', 'Mariluz_Parras', 'Mariluz_Parras_-_Expressive', 'Martin_Osborne_1', 'Martin_Osborne_2', 'Martin_Osborne_4', 'Martin_Osborne_5', 'Martin_Osborne_6', 'Martin_Osborne_7_', 'Mary', 'María', 'Mauricio', 'Mauro_C', 'Maxi_Araya', 'Maxi_Argames', 'Memo_M_-Professional_Latin_American_Spanish', 'Mia_García-_business_narrations_and_informative', 'Mia_Instructor-_Spanish_E_learning_corporate_Conversational_training', 'Miguel', 'Mikel_-_Adulto_idioma_español', 'Miquel', 'Nina', 'Oliver_Podcasting_Refinada', 'OmarVoice', 'Omgpvoice', 'Omgpvoice_-_Expressive', 'Pablo_Vambe_AI_V2', 'Paloma_S.__-_Spanish_-_Conversational_Comforting_Compelling', 'Pilar_Corral', 'Rafael', 'Regina_Martin', 'Ricardo', 'Rodolfo_Rodriguez_', 'Rosa_-_Spanish_Calm_Old_Woman', 'Rosa_Zambrano_', 'Santiago', 'Santiago_-_calm', 'Sara_Martin_1', 'Sara_Martin_2', 'Sara_Martin_3', 'Screaming_George', 'Serena_AI', 'Sergio_Juvenal', 'Sofi', 'Soy_Luis_Cen', 'Tatiana_Martin', 'Tony_Villa', 'Valeria', 'Victor', 'Víctor_Hinojosa', 'Yinet_-Upbeat_Columbian_Woman', 'Yorman_Andres', 'Zabra_-_Commercial_Announcer', '_Medellin_-_Colombian_Voice', 'paco']
258
+
259
+ swedish_names = ['Adam_Composer_Stockholm', 'Jonas_calm__informative_Swedish_voice', 'Sanna_Hartfield_-_Sassy_Swedish_', 'Sanna_Hartfield_-_Swedish_Conversational', 'Sanna_Hartfield_-_Swedish_Narration']
260
+
261
+ tamil_names = ['Ashwin_-_Relatable_Tamil_Voice', 'Madsri_-_Friendly_Tamil_Voice', 'Madsri_-_Tamil_Narrator', 'Meera_-_Conversational_Tamil_Voice', 'Nila_-_Warm__Expressive_Tamil_Voice', 'Ramaa_–_Energetic_Conversational_Tamil', 'Ramaa_–_Energetic_Tamil_Narrator']
262
+
263
+ turkish_names = ['Adilcan_Demirel', 'Ahmed', 'Ahmet_Evlice', 'Ahmet_Çiçek', 'Arman_Yılmazkurt', 'Belma_-_Dynamic_Playful_Clear_Narrator', 'Burak_Yoglu', 'Burcu_Basyigit', 'Cagatay_A.', 'Calm_Turkish_AudioGuide', 'Cavit_Pancar_-_Epic_Powerful_Historical', 'Cem', 'Cicek_-_Joyful_Dynamic_Storyteller', 'Derin_Roman_-_Epic_Dark_Powerful', 'Doacast_', 'Doga', 'Eda_Atlas', 'Emre', 'Emre_Gökçe', 'Farshid', 'Fatih', 'Fatih_Çetinkaya', 'Furkan_Keser', 'Gokce', 'Gokce_lx', 'Gozde_Arikan', 'Gönül_Filiz', 'Hakan_Turk', 'Halil_', 'Hulya', 'Hurrem_-_Confident_Turkish_Actress', 'Ipek_-_Professional_Confident_Narrator', 'Irem', 'Kamil', 'MUHAMMER_ARABACI', 'Mad_Scientist_-_For_All_Languages', 'Mahidevran_-_Playful_Clear_Powerful_Narrator', 'Mert', 'Mertkan_Erkan', 'Mustafa_Can', 'Onur_Can', 'Onur_Naci_Ozturkler_-_spunkram', 'Ramazan', 'Recep_Arkiş_', 'Rıdvan_Elitez', 'Se_-_Young_Male_Reading', 'Sedat', 'Sencer', 'Seyda_-_Eğlenceli_Anlaşılabilir_Fun_Fluent_Clear', 'Sohbet_Adami_-_Natural_Chat_Friend', 'Sultan_-_Charming_Seductive_Narrator', 'Tarik', 'Tuba_Velidede', 'Tuncay_Saran', 'Valperga', 'Walter_BJ', 'Whispering_Irem', 'Yigit', 'Zafer_', 'bilgehan', 'İbrahim_Halil_Acioglu', 'İbrahim_Khan_İpek']
264
+
265
+ ukrainian_names = ['Anton', 'Danylo_Fedirko', 'Dmytro_UA', 'Oleksii_Safin', 'Olena', 'Volodymyr_Pro']
266
+
267
+ vietnamese_names = ['Actor_Pham_Hung', 'Announcer_Van_Phuc', 'Ca_Dao', 'Kim_Tuyến', 'Ly_Hai', 'MC_Duy_Minh', 'Mai', 'Nhung', 'Sơn', 'Trang', 'Trung_Caha', 'Tuan_TLU']
268
+
269
+ english1_names = ['2B_Impression', 'ANDREA_CUTE_female_voice', 'Aakash_Aryan_-_Conversational_English_Voice', 'Aaron_-_Monotone_tech_narrator', 'Aaron_-_trusted_and_engaged', 'Abandoned_school', 'Abigail_-_arrogant_and_snobbish', 'Abrogail_', 'Ada', 'Adam__-_Newscaster', 'Adina_-_Teen_Girl', 'Aditi', 'Adriano_-_44', 'Aerylla', 'Aiden_-_Happy_Video_Host', 'Ailema_-_calm__Soft', 'Akwasi_-_Young_Ghanaian_man', 'Al', 'Alan', 'Alec_-_Energetic_Confident_and_Charismatic', 'Alex', 'Alex_-_Vibrant_Engaging_and_Lively', 'Alex_the_Performer_-_Commercial_Warm_Inviting_Expressive', 'Alexander_-_Mature_and_confident', 'Alexi', 'Alexite', 'Ali_', 'Alice_-_calm_and_soft_narrator', 'Alisha_-_Soft_and_Engaging', 'Alton', 'Alyx_-_Vibrant_British_Male', 'Amada', 'Amanda_-_a_natural_narrator', 'Amar', 'Amelia', 'Amelia_-_young_and_soft', 'Amilia', 'Amina_-_regal', 'Amritanshu_Professional_voice', 'Amy_-_Clear_and_Concise', 'Amy_-_Smart_Teacher_Narration', 'Amy_-_Witty_College_Girl', 'Andre_LeDoux_-_Romantic_Fancy_Talking_male_', 'Andrew', 'Andrew_-_Old_slow_voice', 'Andrew_-_Smooth_audio_books', 'Andrew_Radio', 'Andy_Berg', 'Angie_-_Upbeat_Book_Narrator_Professional_Videos_Engaging_Conversations_Radio_News_Meditation', 'Anjali', 'Anjina', 'Anna_-_Modern', 'Annie', 'Anthony_-_emotive__expressive', 'Arabella', 'Arayah_-_Mature_and_Professional', 'Archer', 'Aria_-_Sexy_Female_Villain_Voice', 'Armando', 'Armando_realistic', 'Asarte', 'Ash_', 'Asher_Avery_Alex_-_Engaging_and_Real__Storyteller_and_Performer', 'Asmodia_-_earnest', 'Aspexia_-_Grand__Clear', 'Athena_-_Stern_serious_and_powerful', 'Attention_Grabbing_Male_Narrator', 'Aunt_Annie_-_calm_and_professional', 'Aurelia_-_High_Quality_Realistic_Princess_', 'Aurion_-_Wise_Narrator', 'Austin_-_Dramatic_Narration', 'Austin_Boy', 'Austin_the_Cowboy', 'Ava', 'Ayden', 'Ayesha_-_Energetic_Indian_Voice', 'Ayinde_-_young_British_Nigerian', 'Bailey_-_twenty-something_earnest_confident', 'Bails', 'Barry_Bob_Alone', 
'Bateman_-_Deep_Masculine_and_Authoritative', 'Befutig_-_Steady_Robust__Engaging', 'Befutig_Safiza_Uj-alet_-_Resonant_Commanding__Authentic', 'Belinda_-_Curious_and_Soft', 'Bella-_sensual_allurin_beautiful', 'Bella_-_Direct_and_Understanding', 'Belle_-_Clear_Well-Modulated_Expressive', 'Ben', 'Ben_-_British_male_young', 'Benjamin_-_The_Frenchy_Guy', 'Benjamin_S_Powell', 'Benny', 'Bert', 'Beth_-_gentle_and_nurturing', 'Betsy_-_Wise_and_Thoughtful', 'Betty', 'Beyond_Average_Joe', 'Bhavna_-_Insightful_Storyteller', 'Bianca_-_City_girl_', 'Bill_-__A_deep_voice_narrator', 'Bill_Oxley_-_Clear_informative_mature_forthright_and_understandable', 'Blaire_Frost', 'Blkking407_', 'Bob_-_old_man', 'Bogdan_-_Soft_Male_Narration', 'Boi', 'Booney_-_calm_and_cute', 'Brandon_-_Young_Male_American_Voice_Over', 'Brandon_Cole', 'Brandon_VO_Artist_Clone', 'Brayden_-_Conversational_Older_Teen', 'Brenda', 'Brenda_-_Raspy_female_', 'Bria_-_Young_and_Soft', 'Brittney_-_Male_Child_-_Youthful_Raspy_Cute__Excitable_', 'Brittney_-_Social_Media_Voice_-_Fun_Youthful__Informative', 'Brody_-_Serious', 'Broom', 'Bruce_-_vibrant_and_baritone', 'Bryan', 'Bryn_-_Calm_and_Expressive', 'Bud', 'Cal_-_confident_professor', 'Calliope_-_ancient_muse', 'Camelia', 'Cara_-_Expressive_and_Direct', 'Carl_-_Big_Voice', 'Caroline_-_clear_and_confident', 'Carter_-_Caring_and_Rational_British_Male', 'Cassandra', 'Cassandra_-_Confident_and_Vibrant', 'Cassia', 'Catherine_-_Professional_and_Direct', 'Cecile_-_Confident_and_Strict', 'Charlie_-_Posh_and_Royal', 'Charlotte_-_precise', 'Chazza_Hypno', 'Chechi_for_first_video', 'Chelsea_Boddie', 'Chinmay_-_Calm_Energetic__Relatable_', 'Chloe_-_sharp', 'Chris_-_irritable_boss', 'Chris_C_-_Mid_30s_-_Podcast_Reviewer_good_for_shorts', 'Chris___Young_and_Inspired', 'Chrissy_-_Millenial_Female', 'Christian_Rivera', 'Christina_-_Trained_on_over_900_characters_with_emotional_dialogue', 'Christopher_', 'Christopher_-_friendly_guy_next_door', 'Chrisva', 
'Ciro_-_real_intense_twentyish', 'Clara', 'Claw_Benn', 'Cody_-_Energetic_Upbeat_Educator', 'Cody_McAvoy', 'Cole_-_Gritty-Rough-Strong', 'Cooper', 'Cornelis', 'Creator', 'Cristiano', 'Crystal', 'Cyrus', 'Dan', 'Dana', 'Danbee', 'Daniel', 'Daniel_-_American_Game_Show_Host', 'Daniel_-_expressive_and_wise', 'Danny_-_highschool_jockish', 'Daphne_-_alluring_goth', 'Dara_-_loud_and_Intense_', 'Darwin_-_Rich_Mature_Voice', 'Daryl', 'Dath_Ilan', 'David', 'David_-_British_Storyteller', 'David_-_Deep_British', 'David_Bent', 'David_Castlemore_-_Newsreader_and_Educator', 'David_DeWitt', 'David_Eclipse', 'David_Esposito', 'David_Hertel', 'Dean_-_Goody_Two_Shoes', 'Deb_-_emotive_and_expressive', 'Deja', 'DellaRayne', 'DellaRayne_-_Smooth_and_Assertive', 'Demon_Monster', 'Derrick_-_melancholy', 'Desdemona_-_sassy', 'Desmond_-_clear_sincere_angst', 'Dezzy_-_Young_and_Soft', 'Dhyogo_azevedo', 'Diana_-_Meditative_Calm', 'Donny_-_Real_New_Yorker', 'Donny_-_very_deep', 'DrRenetta_Weaver', 'Dr_Lovejoy_-_Pro_Whisper_ASMR_', 'Drake__Warm_Canadian_English', 'Drew', 'Drew_-_Deep_Soothing_Guided_Meditation', 'Duke', 'Durgesh', 'Eamon_-_old_lecturer', 'Ed_Holderness', 'Edward', 'Egbert_-_upbeat_meditations', 'Elisa', 'Elisabeth_-_meditative', 'Elizabeth', 'Elizabeth_-_Wise_and_wistful', 'Elizabeth_-_calm_commanding_classic', 'Ella_-_Old_And_Deep_', 'Ellie', 'Emily_-_Australian_Female', 'Emily_-_relaxed_and_conversational', 'Emily_-_sweet', 'Emma', 'Emma_', 'Emma_-_A_brilliant_young_magician', 'Emma_-_sharp', 'Emma_Taylor', 'Emmeline_-_a_young_clear_and_confident', 'Epiktet_Philosoph_', 'Erdem_-_Educational_and_Instructional', 'Erika', 'Erin_-_Meditation_Guide', 'Ethan', 'Ethan_-_expressive_wise', 'Eustis_', 'Evan_-_showbiz_excited_happy', 'Eve_-_young_Australian_girl', 'Ezreal', 'Ezreal_-_energetic', 'Faith', 'Feeven', 'Female_Romance_Novel_', 'Foxy_-_Futuristic_Robotic_Personal_AGI', 'Francesca', 'Frank', 'Frank-_scary_stories', 'Frank_Johnson', 'Fucia_-_Youthful_and_Confident', 
'Gabriella_-_deep', 'Garrett_Wasny', 'Gault_-_Youngish_excitable_high-strung.', 'General_Joe_-_WWII_Narrator', 'George', 'George_-_Serious_and_Experienced', 'Gerhard_Bakker', 'Gertrude_-_Childrens_Narrator', 'Gijs', 'Gladys', 'Goddess_Freyja_-_A_Mysterious__Magical_Muse', 'Graham_-_Old_and_Wise', 'Greg_Murphy', 'Gregoria', 'Gruhastha_-_Energetic_Enthusiastic__Articulate', 'Guy', 'Hakim_-_Audiobook_English__Arabic_Gulf_Accent', 'Halbert', 'Halley_McClure', 'Hallie_-_soft-spoken_and_subtle', 'Hallie_-_youthful_girl_voice', 'Hamlin_-_Deep_and_Booming', 'Hannah___Confident_Teacher', 'Hardcore_Henry_-_Intense_Storyteller', 'Harold', 'Harry_-_Proper_and_Academic', 'Harry___Sad_Emotional_Reck_', 'Harvey_-_Knowledgeful_Upfront', 'Haven_Sands', 'Helena_-_British_female_gentle_and_smart', 'Hemaka', 'Herbie_-_Lisp_and_whistle_S_sounds', 'Hermes_-_frank_abrupt_messenger', 'Hobbs_-_Casual_Narration', 'Horace_-_intense_deep_elder', 'Huckleberry_-_Southern_Charm', 'Hyde', 'Ian', 'Indian', 'Ingmar_-_Intimately_Mysterious', 'Investigator_Jane', 'Iomedae', 'Iris', 'Isabel_-_emotional__lisp', 'Isabella_', 'Isla_-_Strong_British_accent', 'Isla_Reid', 'Ivan_the_Mighty', 'Ivy_-_Free_Spirit', 'J._Thorn', 'J._Tyson', 'Jack', 'Jack-_Raspy__deep', 'Jacme', 'Jade', 'Jakobi_-_Emotive_and_Intriguing', 'James_-_Deep_', 'James_-_Deep_and_Booming', 'James_-_cool_and_expressive', 'James_-_deep_and_to_the_point', 'James_Fitzgerald', 'Jami_-_Mature_and_Clear', 'Jannice', 'Jason', 'Jason_Jordan', 'Jason_Pike', 'Jasper_-_androgynous_and_rebellious', 'Jasper_-_erudite_and_inquiring', 'Jeff', 'Jeff_-_Australian_Male', 'Jeff_-_Smooth_and_Confidant', 'Jennifer_-_expressive_and_cheerful_narrator', 'Jeremy_-_meditative', 'Jeremy_Clarkson', 'Jeremy_Smith', 'Jerry', 'Jerry_-_Energetic_and_Upbeat', 'Jessica_Anne_Bogart_-_Conversations', 'Jeż', 'Jim', 'JimBob_', 'Joan', 'Jodie_-_Assertive_and_Intelligent', 'Joe_-_American_Male_Narrator', 'Joe_-_professional_British_male_voiceover', 'Joe_02', 'Joey_Reeve', 
'John', 'John_-_Deep', 'John_-_Guided_Meditation__Narration', 'John_-_Ultra_Brutal_Man', 'John_Beamer', 'John_Doe_Gentle', 'John_Domus_Cruo_-_Serious', 'John_Fernandes_-_Energetic__Friendly', 'Johnny_-_Upbeat_Professional_American_Male', 'Johnny_Boy_-_Action_Movie_Narrator', 'Johnny_Kid__-_Serious', 'Johnson_-_American_Male_voice_', 'Jones_-_Articulate_Gruff_Raspy', 'Jordan', 'Jordan_-_Warm_Narrator', 'Josh', 'Josh_-_Quiet_Person', 'Josh_T.', 'Joshua_-_Authoritative_Warm_and_Articulate', 'Joshua_-_Young_Soft_Warm_Male_Voice', 'Judy_-_Aged_and_Confident_Elder', 'Julie_-_expressive_and_energetic_romance_narrator', 'Justin_Time_-_eLearning_Narration', 'Justine_-_Expressive_Teen_Boy', 'Kade_Murdock_2.0', 'Kala', 'Kallen', 'Karen', 'Karma_-_Professional_and_Thoughtful', 'Kasi', 'Kathleen_Julie_-_alto_serious_articulate_focused_and_direct', 'Katy_-_sassy_teen', 'Kayla_-_Nurturing_and_Caring', 'Kelli-_Young_Mature_Southern', 'Kelly', 'Kelly_-_clear_teen_voice', 'Kenneth_-_strange_eccentric_old_gentleman', 'Kenny_-_Volume_2', 'Kevin', 'Khaled_', 'Khushi_-_New_Indian_Voice', 'Kieran_-_newsreader_male', 'Kik', 'Kim_-_Swedish_accent', 'Kim_Selch_-_Pro_Studio_Recording', 'King_Chuku', 'Kingsley_-_dapper_and_deep_narrator', 'Kirsten_-_Elegant_Knowledgeable_and_Reassuring', 'Kirt', 'Kitten_Kaley_Rose', 'Kiwi_-_Holistic_Educator', 'Kristopher_-_Gentle_ASMR_', 'Kuk', 'Kurrayah_-_young_and_friendly', 'Kuthon', 'Kwame', 'Kyana_Cook', 'LIAM_DALE', 'Lalitha_J_-_Tamil_Old_Woman', 'Lamar_Lincoln-_Black_Male', 'Latisha', 'Laurance', 'Lawrence_Mayles', 'Lee__Middle-Aged_Australian_Male', 'Leif_-_husky_male', 'Lena_-_crispt_and_confident', 'Leonardo_', 'Lerato', 'Liam', 'Liam_', 'Lila_-_Intelligent_and_emotive', 'Lily', 'Linus_-_A_young_American_tech_video_narrator', 'Lisa___Stern_and_Assertive_', 'LiveCat', 'LiveChi_', 'Liz', 'Lloyd', 'Lucas_-_motivational_speaker', 'Ludo_-_Storyteller_-_Your_epic_story_narrator', 'Luis_Gabriel', 'Lukas', 'Luna_-_Well_rounded_insightful_charismatic', 
'Lyle', 'MANSHI', 'Magnolia_-_Mature_and_Wellspoken', 'Magpie', 'Marc_--_Smart_Soothing_Man', 'Marco-_hot_male_voice', 'Margot_', 'Mariam', 'Marianne_-_Narrative_Friendly_British_', 'Maribeth_-_A_Southern_Sweetheart_', 'Marie_KC', 'Marilyn_-_confident', 'Marissa_-_Friendly_and_Sociable', 'Marissa_from_ElevenLabs', 'Marjorie_', 'Mark', 'Mark_-_Very_Deep_Confident_Professional', 'Mark_-_Young_and_Calm_', 'Mark_-_clear_and_professional_newscaster', 'Mark_-_confident', 'Mark_-_raspy', 'Markus_-_Mature_and_Chill', 'Marshal_-_Dandy_Brit', 'Marshal_-_Toon_Character', 'Marta_-_Officious', 'Matt_Landon_', 'Matt_Rogo', 'Matt_Washer', 'Matthew_-_American_Male_Narrator', 'Matthew_-_Friendly_Clear_and_Perfect_for_Educational_Content', 'Matthew_MacGyver', 'Max_-_YouTube_Professional', 'Melina_CTC', 'Melissa_-_Female_Soothing_Narrator', 'Melville__Euro-accented_narrator.', 'Melvin_-_soothing_and_gentle', 'Mia', 'Mia_-_confident_and_annoyed', 'Mia_Chou', 'Michael', 'Michael_Anthony', 'Michelle_-_Old_and_Daring', 'Mike', 'Milan_Diekstra', 'Milean_-_bassy_with_plosives', 'Miller', 'Milo_-_Casual_Chill_Relatable_Young_Male', 'Mina', 'Miriam_-_Casual_and_Wry', 'Miss_Brittany_Andrews', 'Mkves_-_Calm_', 'Molly', 'Monika_Sogam', 'Mono', 'Morgan', 'Motivational_Coach_-_Leader', 'Mr._P_-_the_fun_guy', 'Mr_Novella_Main_Voice_-_Kobe_Black_British_Male_Young', 'Mun_W', 'My_Fortress', 'Nakiso', 'Nana-chan', 'Narender_Sharma', 'Narrador-34', 'Nata_Professional', 'Natasha_', 'Natasha_-_Sensual_Hypnosis', 'Nathaniel_C._-_Deep_Rich_Mature_British_voice', 'Naty_Heals_voice', 'Neal_-_Perfect_for_documentaries', 'Neil_-_cheerful_upbeat_youth', 'Nellie_-_soft', 'Newton', 'Niamh', 'Nichalia_Schwartz', 'Nick_Colter', 'Nicki', 'Nigel_-_Mysterious_Intriguing', 'Niladri_Mahapatra', 'Nina_-_nerdy', 'Nipunn_-_deep_captivating', 'Noah_-_scary_story_voice', 'Nora_', 'Nova_-_Wise_and_Tranquil_', 'Old_Joshua', 'Old_man_with_a_soft_voice', 'Olivia', 'Omeo', 'Oscar_-_Older_Narrative_Epic', 
'Osiris_-_Deep_and_commanding_rumble_', 'Oswald_-_intelligent_professor', 'Pace_-_Deep_Menacing_and_Raspy', 'Page', 'Paladin', 'Parki_-_expressive_and_loud_elder', 'Paul_Henry_Smith_-_gentle_patient_clear', 'Paul_J._-_Calm_and_soothing_', 'Paula_Moon_-__Sleepy-time_true_crime_vocal', 'Paxti_-_Young_and_Earnest', 'Penelope_-_relaxed_and_breathy', 'Penfist_-_Military_Broadcaster', 'Penny_-_sweet_story_teller', 'Peter_-_Eastern_European_English', 'Peter_-_Hungarian_accent', 'Phil_-_Author_Non-fiction', 'Phillip', 'Phoebe', 'Pixy', 'Planty_-_raspy_voice', 'Prakash', 'Priya_-_Beautiful_and_melodic_Indian_accent', 'Priyam_-_Deep_Indian_Voice', 'Quasi-Jude-Lw', 'Rabih_Rizk', 'Rainbow', 'Raj', 'Raja_Babu', 'Ran', 'Rasper', 'Raven_Nightshade', 'Raven_Reed', 'Ray_-_Male_Soothing_Narrator', 'Raymond_Baxter', 'Raymond_Elliott', 'ReadingSam', 'Recvoice', 'RedGlassesVoiceovers', 'Remus_-_Fantasy_Professor', 'Rex_-_Throaty_and_World_Weary', 'Richard_-_enthusiastic_young_male', 'Richard_Yu', 'Ricky_The_K', 'Riley_-_loud_and_intense', 'Rinoa_-_Middle_Aged_Lady', 'Robert', 'Robert_-_American_standard_broadcaster', 'Robert_-_Business_Book_Narrator', 'Ronald_Wang', 'Rosalind_-_Classy_British_Actress', 'Rose', 'Rowan_-_gruff_and_raspy', 'Rufus', 'Rupert___Strong_British', 'Russel-_clear_realistic_pleasant', 'Russell', 'Russo_-_Dramatic_Australian_TV', 'Ryan', 'Ryan_-_Calm_Masculine_Teenager', 'Ryan_-_Dynamic_', 'Ryan_-_rough', 'Ryder_-_cool_and_balanced', 'SAVVAS', 'Sahand_RZ', 'Sally_Ford', 'Sam', 'Sam_-_English_Storyteller', 'Samantha', 'Sandy', 'Sanjana_', 'Sara_Jay', 'Sarah', 'Sarah_Lawson', 'Sash', 'Sassy_Aerisita', 'Satyam_1', 'Scar', 'Scott_-_Mature_and_Deep', 'Scott_-_Young_male_Canadian_voice', 'Scott_-_drill_instructor', 'Sean', 'Sean_-_deliberate_low_voice_authoritative_narration', 'Sean_Michael', 'Security', 'Serenity', 'Sexy_American_Female_voice', 'Seán', 'Shannon', 'Shannon_-_High_Quality_American_Male_Voice', 'Shanny_-_Soothing_Calm_American_Woman', 
'Shelley_-_Clear_and_confident_British_female', 'Sheriff_Ben_-_Deep_Gruff_Authoritative', 'Shianne_-_Young_and_Confident', 'Shiv_-_Mature_Deep_Voice', 'Shoobu_-_Old_British_Man', 'Shot_List_voice_Girl', 'Sieu_Muoi', 'Sigrid_-_solemn_raspy_wise', 'Silas_-_stern_british_male', 'Silvia_-_upbeat_british_lady', 'Simeon', 'Simon_J_Kidson', 'Sina_-_Your_Narrator', 'SirEden_', 'Smart_Sara', 'Smarty_Pants_Amy', 'Smokey_McSmoker_-_Deep_and_Motivational', 'SocraGPTs', 'Sofy', 'Sophia', 'Sophia_Florence', 'Southern_Ann', 'Stan', 'Starry', 'Stella_-_Calm', 'Stephanie_P_-_Casual_feminine_great_for_storytelling', 'Stephen_-_Calm_British_Narrator_', 'Steve_Maughan', 'Steven_-_Calm_British_Deep_Soothing', 'Steven_-_Vibrant_Resonant_and_Inspiring', 'Stuart', 'Subirachs', 'Subu', 'Sully', 'Susan', 'Swara_-_Young__Calm_Voice', 'Tamara', 'Tanya-_Upbeat_and_Expressive', 'Tara', 'Tarini_-_Expressive__Cheerful_Narrator', 'Tarnish', 'Taro-_Young_Japanese_Accented_Guy', 'Tatsuya_Suzuki', 'Technical_Narrator_-_Precise_Knowledgeable_Engaging', 'Technical_Southerner', 'Temos_Sevandivasen_-_Resolute_Philosophical_Empathetic', 'Test_Aaron_2', 'Test_Plumb_2', 'Thaddeus_-_ancient_historian', 'Theodor_-_deep_american', 'Theodore-Old_Man__Deep_Husky_Voice', 'Theresa', 'Thomas', 'Tiffany_Kim_-_versatile_and_engaged_narrator', 'TikTok_Male_Voice', 'Todd_-_Universal_Crossover', 'Tom', 'Tom_-_trailer_narrator', 'Tommy_-_Reedy_Annoyed', 'Tony_-_Middle-aged_with_American_accent_', 'Trent_-_quirky', 'Tulipe', 'Twilight_Zone_Guy', 'Tyler_Kurk', 'Tyrone_-_Deep_Strong_Masculine_Narrator', 'Tyson', 'Upbeat_Teacher', 'Vee_-_Soft_Spoken_British_Male', 'Vicki_', 'Victoria_Queen_of_England', 'Vieux', 'Vivian', 'Vivie_2_Upbeat', 'W._Sillyman_Oxley', 'Wade_-_powerful', 'Walker', 'Wally_-_Warm_Deep_Masculine', 'Walter_-_Intelligent_and_Resolute', 'Wanda_-_calm_female', 'Wesley_-_nervous_cowardly_fellow', 'Will_', 'William', 'Winston_-_Distinguished_Erudite_and_Genteel', 'Winston__Authoritative_British_Man', 'Yee', 
'Yoel', 'Young_brit', 'Yousef_-_Passionate_Sympathetic', 'Zara_-_understanding_friend', 'Zashikix', 'Zee_-_Childish', 'Zeus_-_arrogant', 'Zeus_Epic', 'Zoe_-_emphatic_and_pleasant', 'Zon-Kuthon', 'Zuri_-_New_Yorker', '_Luca_-_ calm_soothing_steady', '_Martha_-_Narration', '_Vicky_-_Posh_Voice_With_A_Lipse_', 'adriano_-_41', 'emily_', 'harry_deep_and_warm', 'madeline', 'neuris', 'sebastian_']
270
+
271
+ english2_names = ['19keys', 'ADAM', 'ADAM_v2', 'ALESSANDRO_DEEP', 'ATAKAN_ARISOY', 'A_Top_Narrator_VO_PRO', 'Aaditya_Kapur_-_Calm_Conversational_Voice', 'Aarav_-_Deep_and_wise_Indian', 'Aaron_-_Narration_Voice__A_Voice_thats_One_in_a_Million_NOT_like_a_Million_Others', 'Aaron_Davis_Emerson', 'Aaron_Sage_-_Friendly__Conversational_', 'Abigail', 'Adam_-_Calm_Smart', 'Adam_-_deep_voice_Australian', 'Adam_-_low_rough_and_full', 'Adam_-_old_and_knowing', 'Addie_-_Podcast_Princess', 'Adeline', 'Adi', 'Adriano_-_narrador_37', 'Aelar', 'Agatha', 'Agent_L', 'Ajay', 'Akua', 'Albert_-_Pleasant_deep_voice', 'Albert_-_Strong_German_Accent_', 'Albert_-_deep_slurred_meditations', 'Albert_Banoy', 'Alden_-_Resolute_Gravitas', 'Alex_-_Australian_Male_-_Casual_-_Melbourne_City', 'Alex_-_Business_Book_Narrator', 'Alex_-_Young_American_Male', 'Alex_-_expressive_narrator', 'Alex_Ozwyn', 'Alex_Wright', 'Alexander_-_Deep_Calm_and_Authoritative', 'Alexis_-_chic_and_cosmopolitan', 'Alfie', 'Ali', 'Alice_-_calm__composed', 'Alice_-_young_and_confident', 'Alita', 'Allison_-_millennial', 'Aly_-_Serious_and_Strict', 'Amaniri_-_British_Stalwart_Lass', 'Amelia_-_haughty', 'Amrut_Deshmukh_-_Booklet_Guy', 'Amy_-_Spunky_Cartoon_Girl_Voice', 'Amy_-_mean', 'Ana', 'Ana-Rita', 'Andrea_Wolff_-_clear_youthful_evenly_paced_', 'Andrew_-_tech_wizard', 'Andrews', 'Android_X.Y._Z._-_AI_Robot_of_the_Future', 'Angel', 'Ann_the_neighbor_', 'Anna_-_Cute_Calming_Narrator', 'Anne_Marie', 'Anthony10', 'Antoine', 'Antonio', 'Antonio_-_English_with_Subtle_Italian_Accent', 'Anup_Chugh_', 'Anushri_-_Natural_Young_Indian_Voice', 'Archie_-_English_teen_youth', 'Ardian', 'Ariah', 'Aristocrat', 'Arjun', 'Arnold', 'Arthur', 'Arthur_-_Energetic_American_Male_Narrator', 'Arthur_-_Geeky_Masculine_Deep', 'Arthur_-_Royal_Narrator', 'Arthur_the_anchorman', 'Arun', 'Ash', 'Asher_-_Confident_Aristocratic_Male', 'Ashley_American_Mom', 'Astrid', 'Astro_-_Audiobook_Excellence', 'Athena_-_corporate_supervisor', 
'Aurelius_-_Calming_Deep_Serious_', 'Ava_Said_2', 'B._Hardscrabble_Oxley', 'Baron_Theatricus_-_Dramatic_Elocution', 'Beatrice_-_energetic_older_female_voice', 'Bedlam', 'Belinda', 'Ben_-_Masculine_Authorative', 'Ben_-_Scary_Stories', 'Benjamin', 'Benjamin_-_Deep_Warm_Calming', 'Benjamin_-_strong_and_confident', 'Bert_-_Mystical__Whimsical', 'Bill_Oxley_', 'Biquette_-_sad_and_resigned', 'Blake_-_bassy_and_gruff', 'Bob__-_Young_Deep-voice', 'Brian', 'Brian_-_Broadcast_News_Anchor', 'Brian_-_deep_narrator', 'Brian_Overturf', 'Brittany', 'Brittney', 'Brittney_-_Young_Peppy_Female_-_Social_Media_How_Tos_Explainers', 'Brody_-_Cool_Deep_Chilled', 'Bruce_-_Deep_Warm_Strong', 'Bruce_Actor', 'Brucifer', 'Brutus_-_Profound_Slow-paced_Inspiring', 'Bryan-Deep_Narration', 'Bryan_-_Narration', 'Bryan_-_Professional_Narrator', 'Bubba_Marshal', 'Burak_-_accented_storyteller', 'CAMILO_-_AMERICAN_VOICE_NARRATOR', 'CJ_Murph', 'CS_New_', 'Cal_-_Deep_and_Calming', 'Caleb', 'Cali_-_American_Female_voice_for_Promos', 'Cally_-_Young_and_Sweet', 'Calvin_', 'Cameron_-_deep_and_emotive', 'Can', 'Capt_Lynch_-_Sophisticated_Wise_Calm', 'Carlo_', 'Carlos_', 'CarterSutra', 'Carters_Edge', 'Casey_-_Clean_crisp_female_voice', 'Cecil_-_Profound_and_Precise', 'Cecilia', 'Charles_-_Deep_Hoarseness_Voice', 'Charlie_-_gentle_knowledgeable_old_voice___', 'Charlotte_-_sweet', 'Charmion_-_Soft_and_husky', 'Chloe_-_Girl_Next_Door', 'Chris_-_British_Friendly_Advertising_', 'Christian', 'Christina-_friendly_and_energetic', 'Christine_-_calm_teacher', 'Christopher', 'Christopher_-_Immersive', 'Christopher_-_scientific_mind', 'Chuck', 'Chuck_-_True_Crime', 'Claire', 'Clarice_-_Kind_and_Trustworthy', 'Clover_-_Calm_and_Collected', 'Cody_-_Authoritative__Deep_Motivational_Narration', 'Connor', 'Conny_-_Old_and_Stubborn', 'Consuelo', 'Conversational_Joe_-_A_chatty_casual_voice_British_RP_male', 'Courtney', 'Courtney_-_Soothing_and_Calm', 'Crime_Channel', 'Crystal_-_Pleasant_sultry_Voice_for_Audio_Experience', 
'DJ_Marathon', 'DR_Dean_British', 'Dakota_H', 'Dalia_', 'Damon_-_Deep_and_Strong', 'Dan_-_Young_British_friendly_voice', 'Daniel_Lappisto', 'Daniel_R', 'Danielle_-_Canadian_Narrator', 'Dave', 'Dave_-_Dry_Quirky_Wit', 'David_-_American_Narrator', 'David_-_Epic_Movie_Trailer_', 'David_-_Gentle_Engaging_Soothing', 'David_-_Mature_Engaging_Male_Voice_American_accent', 'David_-_knowledgeable_old_soul_', 'Davy_-_Deep_Pirate_Voice', 'Dean_-_British_RP_Warm_and_Friendly', 'Dean_Jones', 'Delegate_-_Bright_and_Airy', 'Demeter_-_expressive_and_sincere_mother', 'Denis_-_Authoritative_and_Deep_Narrator', 'Denzel', 'Denzel_-_Casual_Narration_', 'Desdemona_-_balanced', 'Dez', 'Dispater_-_Refined_Strong__Authoritative_', 'Divija_-_A_female_voice_young_and_vibrant', 'Djali_Vesela', 'Don_-_Deep_Warm_Realistic', 'Don_Kim', 'Donald_-_American_70_years_old', 'Dorian_-_fast_paced_mediations', 'Dragonia_-_Dragon_Rider', 'Drake', 'Duncan_--_the_Melancholy_Intellectual_', 'Dying_story_teller', 'Ebony', 'Ebsa-_Realistic_Deep_male_voice', 'Ed_-_sweet_and_soft', 'Eddy', 'Edgar_-_nerdy', 'Edmund', 'Edris_-_deep_and_powerful', 'Edward-_muffled_and_distorted', 'Elaine_-_Sweet_and_Lively', 'Elaine_-_emotionally_versatile_narrator', 'Ele_-_Elegant_Youthful', 'Eli_-_American_voice_for_promos_and_explainers', 'Elijah_-_Narrative_Reader', 'Ella_', 'Ella_-_soft_and_sweet', 'Ellie_-_Tender_young_woman', 'Emily', 'Emily__-_pleasant_teen_voice', 'Emma_watson', 'Emms', 'Emre', 'Erecura_-_Walm_and_Nurturing', 'Erin', 'Erin_', 'Eris_-_strong', 'Eugene_-_nerd_and_geek', 'Evan_-_deep_narrator_voice', 'Evan_Byers', 'Evy_-_endearing_textured', 'Extraordinary_Joe_', 'Faheem_Ahmed_', 'Felicity_-_young_and_well-spoken', 'Finn_-_Serious_and_Sincere', 'Florence_-_Mature_Educated', 'Fowler_-_scary_and_authoratative', 'Franklin', 'Frederick_Surrey', 'Gandalf_', 'Garretts_Groove', 'Gemma_-_Refined_Witty_and_Warm', 'Gemma_-_Young_Australian_Female', 'Gene_-_informative_and_trustworthy', 
'German_Petra_-_English_with_hard_accent', 'Giovanni', 'Godfrey_a_National_Treasure', 'Godot_-_Wise_and_Serene', 'Gordon_', 'Grace', 'GrandMaester_Game_of_Thrones', 'Grandma_Margaret_-_Storybook_Narrator', 'Grandpa_Slow_Reading', 'Grandpa_Spuds_Oxley', 'Gravitas_-_The_deep_narrator_voice', 'Gregory_-_British_Nature_Narrator', 'Gwen_-_Calm_and_Pleasant', 'Hades_-_grim_gravitas', 'Hamza', 'Hannah_-_assertive__refined', 'Hannah_-_soft-spoken', 'Harrison_-_Deep_and_Cinematic', 'Harrison_Gale_–_The_Velvet_Voice__deep_resonant_powerful_smooth_rich_storytelling_narrator', 'Haseeb_-_Canadian_Narration', 'Haven_Glass', 'Hector', 'Hector_-_Deep_Narrative', 'Hephaestus_-_steady_and_patient_teacher', 'Hiro', 'Hope_-_natural_conversations', 'Hope_-_upbeat_and_clear', 'Huss', 'Hyznberg_-_Crime_Time_Cool', 'Ian_Cartwell_-_Suspense_Mystery_and_Thriller', 'Igor_Radio', 'Illya_-_Soft_and_neutral', 'Isabel_-_Soft_Spoken_Teen_Youth', 'Isac', 'Isadore_', 'Ivy_-_Female_Childish_-_Young_Innocent__Bubbly', 'Ivys_Allure', 'Jace_Nox_-_Mellow_Gentle_and_Diverse', 'Jack_-_Calm_Monotone_Measured_Speech', 'Jack_the_Pirate', 'Jackson_-_Confident_Charismatic_and_Approachable', 'Jacob_-_Teen_and_Popular', 'Jacob_Dayi', 'Jacqui_Griffin', 'Jada_-_confident_and_direct', 'Jalia_-_soothing_female_voice', 'Jamal_', 'James', 'James_-_British_TV_presenter', 'James_-_classic_narrator', 'James_-_professional_and_authoritative', 'James_Lindsay_Pro', 'Jameson_-_Guided_Meditation__Narration', 'Jamie_-_young_child_voice', 'Jan', 'Janet', 'Jaquon', 'Jarvis_-_Polite_and_Upfront', 'Jason_-_Authoritative_Smooth_and_Approachable', 'Jay_-_Proper_Mancunian', 'Jeevan_-_Expressive_Indian_Voice', 'Jenny', 'Jerry_-_Presenter_Announcer_Event', 'Jerry_Beharry_-_Conversational', 'Jessica', 'Jessica_-_Cali_girl', 'Jessica_-_meditative', 'Jessica_-_smart_coach', 'Jessica_Anne_Bogart_-_A_VO_Professional_now_cloned', 'Jhon_-_casual_and_friendly', 'Jhonny_-_Agradable_reading', 'Joe', 'Joe_-_British_male_in_high_quality', 'Joey', 
'Joey_-_Upbeat_Popular_News_Host', 'Joey_-_Youthful_and_Energetic', 'John2', 'John_-_American_War_Speech', 'John_-_Old_and_kind_', 'John_-_The_Heart_Of_America', 'John_Adams', 'John_Doe_-_Deep', 'John_Fernandes_-_Vibrant_British_Voice', 'John_Martin_-_Funny', 'Johnny_-_deep_and_gruff', 'Johnny_Lefors', 'Jona_-_man_of_the_Desert', 'Jonah_-_sassy_young_male', 'Jonathon', 'Jose_Feliciano_Voice_Clone', 'Joseph_-_Comforting_', 'Joseph_-_Cool_calm_and_great_for_narration', 'Joseph_-_motivational_speaker', 'Joy_Love', 'Judith_-_calm_and_confident', 'Julia_-_soft_and_shy', 'Julian_-_deep_rich_mature_British_voice', 'JuniorDT', 'Justin_-_hyped', 'Kai_Selekwa', 'Kamwe', 'Karan', 'Karan_-_EnglishStandup_Comedian_', 'Karl_C._Shroff_-_Professional_Calm_Voice', 'Karl_Stuke', 'Kass_-_Energetic_Casual_Engaging', 'Kat_Dollar', 'Kavya_-_Energetic_Kids_Voice_', 'Keel_-_confident_dramatic_narrator', 'Kelcey_-_Teen_and_Adventurous', 'Kellan_-_soft_and_gentle', 'Ken_-_African_man_with_heavy_accent', 'Ken_-_Influential_British_Male', 'Kenneth_-_calm_newcaster', 'Kevin_W._Krause_', 'Khemet_-_Deep_and_Powerful', 'Kostiantyn', 'Kyle_-_narrator', 'Lachita', 'Lana-_Robin_Rekia', 'Landon_Bailey', 'Laura_-_emphatic', 'Lauren_-Confident_Quick_Talking_No_Nonsense_Gal', 'Layo_Queen', 'Lee_-_Calm_and_Relaxed', 'Lena_-_emotive_and_expressive', 'Leo', 'Leo_-_Energetic_Indian_Voice', 'Leoni_Vergara_', 'Lily_Wolff_-_Expressive_Clear_Youthful_Calming', 'Lisa_-_pleasant', 'Lisa__-_Pleasant_calm_and_dynamic', 'Long_Storyteller', 'Lowy_-_soothing_gentle_and_warm', 'Lucan_Rook__-_Energetic_Male', 'Lucia_Reid_', 'Lucy_-_British_Storyteller', 'Lucy_-_sweet_and_sensual', 'Lucy_-_yound_anime_girl', 'Luis_-_Relaxed_and_Calm_Narration_-_Pro_Recording', 'Luke_old_and_deep', 'Luminessence_-_Light_Mirror', 'Luna_Spencer', 'Lydia_-_squeaky_', 'MW', 'Maccabaeus_-_Audiobook_Narration', 'Mael_-_deep_raspy_male', 'Mahmood', 'Manohar_-_Gruff_Seasoned_and_Wise', 'Marcus', 'Maria', 'Marie-Alice', 
'Mark_-_Natural_Conversations', 'Mark_-_Robust_Dependable_and_Engaging', 'Mark_-_calm_and_wise_teacher', 'Marques_-_Young_and_Wary', 'Marshal_-_Grumpy_Sourpuss', 'Marshal_-_New_Jersey_Male', 'Martas', 'Mary_-_soft_and_warm', 'Matilda', 'Mats', 'Matt', 'Matt_Snowden', 'Matthew_-_calm_and_peaceful', 'Matthew_Wayne_-_Natural_calm_steady', 'Max_-_fast_friendly_and_direct', 'Maxwell_-_deep_and_dramatic', 'Maya', 'Meera', 'Melissa', 'Mellow_Matt', 'Merlin', 'Merlin_the_Wizard_Protector_of_King_Arthur', 'Mia_-_Clear_Smooth_Professional', 'Mia_-_Old_And_Confident_', 'Micah', 'Michael_', 'Michael_-_A_narrator_with_a_buttery-smooth_deep_voice', 'Michael_-_Confident', 'Michael_-_Excited_and_Ready_to_Speak', 'Michael_Filce', 'Michela', 'Middle_age_Southern_Male', 'Mike_Adams_-_All_things_space', 'Mike_G', 'Milo', 'Mine', 'Minerva_-_Fantasy_Professor', 'Mira_Gold_-_Dystopian', 'Miranda', 'Mirilene', 'Misti_-_English_Technology_Virtual_Training_Teacher', 'Mistress_Regina', 'Modavian_-_Dignified_Experienced_Authoritative', 'Moe', 'Mohammed', 'Mohanapriya', 'Monotone_Mike', 'Mora_of_Maragall_-_Resilient_Compassionate_Inspiring', 'Mouse', 'Mr_Clem', 'Mr_President_-_Strong_Fast_and_Impactful', 'Mrs_Novella_Main_Voice_-_Althea_Female_Young_European', 'Mwika_Kayange', 'Myriam_-_sweet_Teen_Girl', 'NEW_AMREEN', 'Nadya', 'Naina_-_Sophisticated_Indian_Girl', 'Nala_-_African_Female', 'Narrador_-_documentarios', 'Natalie_-_Posh_British', 'Natasha_-_Gentle_Meditation_', 'Natasha_-_Valley_girl', 'Nate_the_Great_-_American_Male', 'Neha', 'Neil', 'Neil_-_calm_and_deep', 'Neville', 'Nia_-_Black_Female', 'Niall_-_dramatic_male', 'Nichalia_Schwartz_-_Gentle_Kind_Sweet_GenAm', 'Nicholas_-_Raspy_Mature', 'Nickrad_', 'Nicola', 'Nicoletta', 'Nicolette_-_Strong_and_Stern', 'Nicolette_-_Young_Woman_Clear_Accented', 'Nigel_-_classic', 'Nigel_J.', 'Noah_-_The_stoic_narrator', 'Nolan_-_Emotive_and_Smooth', 'Northern_Irish_Peter', 'Northern_Terry', 'ONeal', 'Ocean_•_Monotonous_Voice', 'Oi', 'Okole', 
'Old_Osirion_Woman_-_Timeless_Mystical_Nurturing', 'Older_British_gangster_-_Gravelly_and_Rough', 'Oliver_-_Documentary_Narration', 'Olivia-_sweet_and_soft', 'Omari_African_Voice_VERY_foreign_sounding.', 'Ophelia_Rose', 'Opsy', 'Oscar', 'Outstanding_for_Side_Character', 'Oxley_-_Evil_Character', 'Pablo_Marshal', 'Panda_Montana', 'Patino_-_Columbian_Spanish', 'Patrick_International', 'Patsy_Dahling', 'Paul_Martin', 'Pedro_Costa', 'Penelope', 'Persephone_-_lively', 'Peter', 'Peter_-_annoyingly_pitchy_and_enforcing', 'Peter_Owen_-_non-fiction_audiobooks_and_factual_VO', 'Philemon_-_serious_old_scientist', 'Pilar_-_Young_and_Cheerful', 'Piper', 'Pirate_Marshal', 'Pop', 'Pratheep_Tharan', 'Pro_Narrator_-_Convincing_story_teller', 'Prometheus', 'Queen_Rosamund_-_British_Older_Woman', 'Rachel', 'Rachel_M_-_Pro_British_Radio_Presenter_', 'Rachel__McGrath', 'Raju_-_Relatable_Indian_Voice', 'Rakhat_Eje_', 'Ralf_Eisend', 'Rama_-_wise_and_philosophical_sage', 'Randell-_Glone_Rekia', 'Randolph_-_Trustworthy_and_wise', 'Red_-_Dynamic_Expressive_and_Invigorating', 'Reginald_-_intense_villain', 'Researcher_-_Nerdy_and_Hesitant', 'Rhett_Sutton', 'Rhys_--_Sexy_British_Twink', 'Rich_Baritone_American_Radio_Announcer', 'Richard-2', 'Rike_Fischer', 'Road_Dawg__', 'Rob', 'Rob_-_confident_and_formal', 'Robert-__British_Narrator', 'Roberto_Riva', 'Ron_', 'Ron_-_Older_American_Story_Teller', 'Ruhaan_-_Clean_narration_voice', 'Rupert_-_British_60_years_old', 'Russell_-_Dramatic_British_TV', 'Ruth_-_grandmother_storyteller', 'Ryan_-_subtle_accent_and_deep_timbre', 'Ryan_Kurk', 'Ryan_Quin', 'Sagar_-_Voice_of_India', 'Sahara_-_Soothing_Meditation-Hypnosis-Romance', 'Sally', 'Sam_-_Chill_Southern_California_Male', 'Sam_-_Slight_Welsh_Accent', 'Sam_Bragg', 'Samantha_', 'Samantha_Narrations', 'Sanjay_-_profound_and_deep_', 'Sanna_Hartfield_Beta_1.0', 'Saphira_-_Teen_and_Nerdy', 'Sara', 'Sarcini_-_Snarky_Quick-witted_Unapologetic', 'Sasha_-_Soothing_and_Chill', 'Sayn_Awal', 'Scary_Story_', 
'Scoobie_-_American_Male_enthusiastic_sharp_smart', 'Scot_Combs_Narration', 'Scott', 'Scott_Woodworth', 'Sean_John_-_Top_Quality', 'Sebastion-Young_uncertain._', 'Selena_-_Introspective_Intuitive', 'Seth_-_Vibrant_Engaging_and_Genuine', 'Sevan_Bomar_-_Black_Motivational_Speaker', 'Sexy_Female_Villain_Voice', 'Sgt_Hayes_-_Authority_Deep_Masculine', 'Shannon_B_-_Sad_Emo_Teenage_Girl', 'Shannon_B_-_Warm_Southern_Woman', 'Shayne_-_Narrator_RJ_Voice', 'Sheba', 'Shelby', 'Shelby_-_Erratic_and_Confident', 'Shells', 'Sheps_Rocky', 'Shrey_-_Deep__Engaging_', 'Simba', 'Sir_Linus_Warmheart', 'Sita_2', 'Soft_Daria_-_Meditation', 'Soft_Demure_Garden_Voice', 'Soft_young_male_voice', 'Sohaib_Jasra', 'Soothing_Narrator', 'Sophia_-_Female_UK_Accent_-_Audiobooks_E-learning_Courses_Adverts', 'Southern_Stewart', 'Sparrow_Lee_', 'Stanley', 'Stanley_', 'Starina_Jr_Pro', 'Stephen', 'Stephen_-_Narrator', 'Steve_-_Australian_Male_', 'Steve_-_British_-_Clean_Smooth_Professional', 'Steve_V', 'Steven', 'Steven_-_Business_Book_Narrator', 'Stuart_-_Energetic_and_enthusiastic', 'Summer', 'Sylvia_-_confident_sensible_wise', 'THE_PROTOTYPE_LIVE_aka_Ana_Daugherty', 'Tara_-_Conversational_Expressive_Voice', 'Tarun_C._Dhanraj_-_Rich_Warm_and_friendly', 'Tass', 'Tatiana', 'Taylor_Andrew_Commercial-Driven', 'Tere', 'Terry_Blackburn_', 'Tessa', 'Thalia_-_Mysteriously_Captivating', 'Thalias_Engine_-_Mysteriously_Captivating', 'The_Great_Conversationalist_', 'Theodore_-_Oldschool_Cool', 'Thomas_-_Measured_Clear_Informative_', 'Thomas_Candia', 'Thomas_Fischer_-_Authentic_German_Accent', 'Tiffany', 'Tim_Rooney', 'Tira_Shabbar_-_Spirited_Irreverent_Young-at-Heart', 'Tommy_-_Teen_Cool__Nonchalant_', 'Tony_-_King_of_New_York', 'Tony_-_middle_aged_male_Northern_English_native_accent', 'Tyler_', 'Tyrell', 'Tyrone', 'UK_Teen_-_Black_man_Marquess_Germain', 'Val_3.0', 'Valentino', 'Valentyna_-_Soft_and_calm', 'Veda', 'Very_Vlad_-_Soviet_Comrade', 'Victor_-_the_motivational_speaker', 'Victoria_-_classy_and_mature', 
'Victorian_-_a_lady_of_quality', 'Victorino_-_Deep', 'Vidhi_-_Young__Bold', 'Vidura', 'Vikrant_-_Indian_', 'Vincent_Sparks_-_Deep_American_Voice', 'Vivian_-_knowledgeable_voice__', 'Von_-_Perfect_Storytelling_Clean_Realistic', 'W._Storytime_Oxley', 'Whimsy_-_Kids_Cartoon_Character', 'Whispering_Joe_-_a_storytelling_whisper_ASMR_British_RP_male', 'Wildebeest', 'Will_-_Young_Australian_Male', 'William_Shanks', 'Xanthippe_Abelló_-_Exuberant_Inquisitive_Unconventional', 'Yagiz', 'Yaisa', 'Yash_A_Malhotra_-_Warm__Friendly', 'Yomiee', 'Young_Jamal', 'Yuan_-_emotional_artist_poem_romantic_sensible', 'Zach_-_Storyteller_Narrator_Audiobooks_Podcasts', 'Zakirah_-_Chill_and_Calm', 'Zara_-_Soft_and_Serene_Indian_Voice', 'Zoe', 'Zoe_-_crisp_and_strong', '_Ethan_-_Calm_Intense_and_Compelling', '_Haseeb_-_Canadian_Presentation', '_Louis_Bloom', 'glenda-_soft_and_friendly', 'wise-woman']
272
+
273
+
274
# Loads the dropdown name choices for the selected language.
# NOTE(review): a superseded draft of load_names (a long if/elif chain that had
# been kept here inside a triple-quoted string as dead code) was removed; the
# dict-based load_names defined below is the live implementation.
285
+
286
def load_names(selected_language):
    """Return a Gradio dropdown update with the voice names for a language.

    The dropdown is populated with the module-level name list that matches
    ``selected_language``; the first entry becomes the selected value.  An
    unknown language yields an empty dropdown with no selection.
    """
    print(f"Idioma seleccionado: {selected_language}")

    # One list of voice names per supported language / tab.
    name_lists = {
        "Voices Legacy": show_legacy,
        "Arabic": arabic_names,
        "Bulgarian": bulgarian_names,
        "Chinese": chinese_names,
        "Croatian": croatian_names,
        "Czech": czech_names_names,  # variable is really named like this at module level
        "Danish": danish_names,
        "Dutch": dutch_names,
        "Finnish": finnish_names,
        "French": french_names,
        "German": german_names,
        "Greek": greek_names,
        "Hindi": hindi_names,
        "Hungarian": hungarian_names,
        "Indonesian": indonesian_names,
        "Italian": italian_names,
        "Japanese": japanese_names,
        "Korean": korean_names,
        "Norwegian": norwegian_names,
        "Polish": polish_names,
        "Portuguese": portuguese_names,
        "Romanian": romanian_names,
        "Russian": russian_names,
        "Slovak": slovak_names,
        "Spanish": spanish_names,
        "Swedish": swedish_names,
        "Tamil": tamil_names,
        "Turkish": turkish_names,
        "Ukrainian": ukrainian_names,
        "Vietnamese": vietnamese_names,
        "English-1": english1_names,
        "English-2": english2_names,
    }

    names = name_lists.get(selected_language, [])
    default = names[0] if names else None
    return gr.update(choices=names, value=default)
328
+
329
def load_text(selected_name, selected_language):
    """Return the reference transcript for a voice in a given language.

    Args:
        selected_name: Voice name; used as the ``.txt`` file stem.
        selected_language: Display language name (e.g. ``"Arabic"``).

    Returns:
        The transcript text, ``""`` when the language has no directory
        mapping, or a Spanish "file not found" message when the transcript
        file is missing (callers display this string directly in the UI).
    """
    # FIX(consistency): this function used to duplicate the entire
    # language -> directory dict owned by load_text_langs; delegate to it
    # so the mapping lives in exactly one place.
    dir_idioma = load_text_langs(selected_language)
    if not dir_idioma:
        return ""  # language without a known directory

    ruta_archivo = f"/tmp/Voice/{dir_idioma}/{selected_name}.txt"

    try:
        with open(ruta_archivo, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        # Surface the missing path to the UI instead of raising.
        return f"Archivo no encontrado: {ruta_archivo}"
377
+
378
+
379
def load_text_langs(selected_language):
    """Map a display language name to its on-disk directory code.

    Returns the directory name under ``/tmp/Voice`` (e.g. ``"ar"`` for
    ``"Arabic"``), or ``None`` when the language is unknown.
    """
    # Single source of truth for language -> directory codes; the keys must
    # match the language choices shown in the UI (and leng_and_ids).
    lang_dir_map = {
        "Voices Legacy": "show_legacy",
        "Arabic": "ar",
        "Bulgarian": "bg",
        "Chinese": "zh",
        "Croatian": "hr",
        "Czech": "cs",
        "Danish": "da",
        "Dutch": "nl",
        "English-1": "en1",
        "English-2": "en2",
        "Finnish": "fi",
        "French": "fr",
        "German": "de",
        "Greek": "el",
        "Hindi": "hi",
        "Hungarian": "hu",
        "Indonesian": "id",
        "Italian": "it",
        "Japanese": "ja",
        "Korean": "ko",
        "Norwegian": "no",
        "Polish": "pl",
        "Portuguese": "pt",
        "Romanian": "ro",
        "Russian": "ru",
        "Slovak": "sk",
        "Spanish": "es",
        "Swedish": "sv",
        "Tamil": "ta",
        "Turkish": "tr",
        "Ukrainian": "uk",
        "Vietnamese": "vi",
    }
    return lang_dir_map.get(selected_language)
418
+
419
# Loads the reference transcript and the reference audio path together.
def update_reference_info(speaker_reference_audio, selected_language):
    """Resolve the transcript text and mp3 path for a reference speaker."""
    # Transcript shown next to the audio player.
    text_info = load_text(speaker_reference_audio, selected_language)

    # Audio file lives beside the .txt under the language's directory.
    lang_dir = load_text_langs(selected_language)
    audio_path = f"/tmp/Voice/{lang_dir}/{speaker_reference_audio}.mp3"

    return text_info, audio_path
430
+
431
def load_params_tts(out_path, version):
    """Locate fine-tuned XTTS artifacts under ``<out_path>/ready``.

    Parameters:
        out_path: training output folder containing a ``ready`` subfolder.
        version: accepted for interface compatibility; currently unused.

    Returns a 6-tuple ``(status, model, config, vocab, speaker, reference)``.
    On failure the status message comes first and the remaining slots are
    empty strings, matching the six Gradio outputs wired to this callback.
    """
    out_path = Path(out_path)

    ready_model_path = out_path / "ready"

    vocab_path = ready_model_path / "vocab.json"
    config_path = ready_model_path / "config.json"
    speaker_path = ready_model_path / "speakers_xtts.pth"
    reference_path = ready_model_path / "reference.wav"

    model_path = ready_model_path / "model.pth"

    if not model_path.exists():
        # Fall back to the unoptimized checkpoint before giving up.
        model_path = ready_model_path / "unoptimize_model.pth"
        if not model_path.exists():
            # BUG FIX: this used to return only four values while the success
            # path (and the six wired Gradio outputs) expect six.
            return "Params for TTS not found", "", "", "", "", ""

    return "Params for TTS loaded", model_path, config_path, vocab_path, speaker_path, reference_path
455
+
456
def upload_audio(audio, current_path):
    """Resolve an uploaded speaker-reference audio to a usable file path.

    Falls back to *current_path* whenever the new input cannot be used.
    """
    if audio is None:
        return current_path

    upload_dir = "speaker_reference_audio"
    os.makedirs(upload_dir, exist_ok=True)

    # Plain string: treat it as a filesystem path that must already exist.
    if isinstance(audio, str):
        if os.path.exists(audio):
            return audio
        print(f"Error: File not found at {audio}")
        return current_path

    # File-like upload object: persist its bytes under upload_dir.
    if hasattr(audio, "name"):
        destination = os.path.join(upload_dir, audio.name)
        try:
            with open(destination, "wb") as out_file:
                out_file.write(audio.read())
        except Exception as exc:
            print(f"Error saving uploaded audio: {exc}")
            return current_path
        return destination

    print("The reference audio input format is not recognized")
    return current_path
483
+
484
+
485
# ---------------------------------------------------------------------------
# Script entry point: parse CLI options, then build and launch the Gradio UI.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="""XTTS fine-tuning demo\n\n"""
        """
        Example runs:
        python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
        """,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--whisper_model",
        type=str,
        help="Name of the whisper model selected by default (Optional) Choices are: ['large-v3','large-v2', 'large', 'medium', 'small'] Default Value: 'large-v3'",
        default="large-v3",
    )
    parser.add_argument(
        "--audio_folder_path",
        type=str,
        help="Path to the folder with audio files (optional)",
        default="",
    )
    parser.add_argument(
        "--share",
        action="store_true",
        default=False,
        help="Enable sharing of the Gradio interface via public link.",
    )
    parser.add_argument(
        "--port",
        type=int,
        help="Port to run the gradio demo. Default: 5003",
        default=5003,
    )
    parser.add_argument(
        "--out_path",
        type=str,
        help="Output path (where data and checkpoints will be saved) Default: output/",
        default=str(Path.cwd() / "train_models"),
    )

    parser.add_argument(
        "--num_epochs",
        type=int,
        help="Number of epochs to train. Default: 6",
        default=6,
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        help="Batch size. Default: 2",
        default=2,
    )
    parser.add_argument(
        "--grad_acumm",
        type=int,
        help="Grad accumulation steps. Default: 1",
        default=1,
    )
    parser.add_argument(
        "--max_audio_length",
        type=int,
        help="Max permitted audio size in seconds. Default: 11",
        default=11,
    )

    args = parser.parse_args()

    # ISO language code -> display name; used by the dataset/inference
    # dropdowns as (label, value) pairs via zip(values, keys).
    language_names = {
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian",
        "pt": "Portuguese",
        "pl": "Polish",
        "tr": "Turkish",
        "ru": "Russian",
        "nl": "Dutch",
        "cs": "Czech",
        "ar": "Arabic",
        "zh": "Chinese",
        "hu": "Hungarian",
        "ko": "Korean",
        "ja": "Japanese",
    }

    with gr.Blocks(title=os.environ.get("APP_NAME", "Gradio")) as demo:
        # --- Tab 1: dataset creation from raw audio ------------------------
        with gr.Tab("1 - Data processing"):
            out_path = gr.Textbox(
                label="Output path (where data and checkpoints will be saved):",
                value=args.out_path,
            )
            # upload_file = gr.Audio(
            #     sources="upload",
            #     label="Select here the audio files that you want to use for XTTS trainining !",
            #     type="filepath",
            # )
            upload_file = gr.File(
                file_count="multiple",
                label="Select here the audio files that you want to use for XTTS trainining (Supported formats: wav, mp3, and flac)",
            )

            audio_folder_path = gr.Textbox(
                label="Path to the folder with audio files (optional):",
                value=args.audio_folder_path,
            )

            whisper_model = gr.Dropdown(
                label="Whisper Model",
                value=args.whisper_model,
                choices=[
                    "large-v3",
                    "large-v2",
                    "large",
                    "medium",
                    "small"
                ],
            )

            lang = gr.Dropdown(
                label="Dataset Language",
                value="en",
                choices=list(zip(language_names.values(), language_names.keys()))
            )
            progress_data = gr.Label(
                label="Progress:"
            )
            # demo.load(read_logs, None, logs, every=1)

            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
617
+ def preprocess_dataset(audio_path, audio_folder_path, language, whisper_model, out_path, train_csv, eval_csv, progress=gr.Progress(track_tqdm=True)):
618
+ clear_gpu_cache()
619
+
620
+ train_csv = ""
621
+ eval_csv = ""
622
+
623
+ out_path = os.path.join(out_path, "dataset")
624
+ os.makedirs(out_path, exist_ok=True)
625
+
626
+ if audio_folder_path:
627
+ audio_files = list(list_audios(audio_folder_path))
628
+ else:
629
+ audio_files = audio_path
630
+
631
+ if not audio_files:
632
+ return "No audio files found! Please provide files via Gradio or specify a folder path.", "", ""
633
+ else:
634
+ try:
635
+ # Loading Whisper
636
+ device = "cuda" if torch.cuda.is_available() else "cpu"
637
+
638
+ # Detect compute type
639
+ if torch.cuda.is_available():
640
+ compute_type = "float16"
641
+ else:
642
+ compute_type = "float32"
643
+
644
+ asr_model = WhisperModel(whisper_model, device=device, compute_type=compute_type)
645
+ train_meta, eval_meta, audio_total_size = format_audio_list(audio_files, asr_model=asr_model, target_language=language, out_path=out_path, gradio_progress=progress)
646
+ except:
647
+ traceback.print_exc()
648
+ error = traceback.format_exc()
649
+ return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", ""
650
+
651
+ # clear_gpu_cache()
652
+
653
+ # if audio total len is less than 2 minutes raise an error
654
+ if audio_total_size < 120:
655
+ message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
656
+ print(message)
657
+ return message, "", ""
658
+
659
+ print("Dataset Processed!")
660
+ return "Dataset Processed!", train_meta, eval_meta
661
+
662
+
663
        # --- Tab 2: GPT-encoder fine-tuning controls -----------------------
        with gr.Tab("2 - XTTS Encoder"):
            load_params_btn = gr.Button(value="Load Params from output folder")
            version = gr.Dropdown(
                label="XTTS base version",
                value="v2.0.2",
                choices=[
                    "v2.0.3",
                    "v2.0.2",
                    "v2.0.1",
                    "v2.0.0",
                    "main"
                ],
            )
            train_csv = gr.Textbox(
                label="Train CSV:",
            )
            eval_csv = gr.Textbox(
                label="Eval CSV:",
            )
            custom_model = gr.Textbox(
                label="(Optional) Custom model.pth file , leave blank if you want to use the base file.",
                value="",
            )
            num_epochs = gr.Slider(
                label="Number of epochs:",
                minimum=1,
                maximum=100,
                step=1,
                value=args.num_epochs,
            )
            batch_size = gr.Slider(
                label="Batch size:",
                minimum=2,
                maximum=512,
                step=1,
                value=args.batch_size,
            )
            grad_acumm = gr.Slider(
                label="Grad accumulation steps:",
                minimum=2,
                maximum=128,
                step=1,
                value=args.grad_acumm,
            )
            max_audio_length = gr.Slider(
                label="Max permitted audio size in seconds:",
                minimum=2,
                maximum=20,
                step=1,
                value=args.max_audio_length,
            )
            clear_train_data = gr.Dropdown(
                label="Clear train data, you will delete selected folder, after optimizing",
                value="none",
                choices=[
                    "none",
                    "run",
                    "dataset",
                    "all"
                ])

            progress_train = gr.Label(
                label="Progress:"
            )

            # demo.load(read_logs, None, logs_tts_train, every=1)
            train_btn = gr.Button(value="Step 2 - Run the training")
            optimize_model_btn = gr.Button(value="Step 2.5 - Optimize the model")

            # NOTE(review): these imports shadow the module-level ones; kept
            # in place to avoid behavior drift, but they belong at file top.
            import os
            import shutil
            from pathlib import Path
            import traceback
737
+ def train_model(custom_model, version, language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
738
+ clear_gpu_cache()
739
+
740
+ # Check if `custom_model` is a URL and download it if true.
741
+ if custom_model.startswith("http"):
742
+ print("Downloading custom model from URL...")
743
+ custom_model = download_file(custom_model, "custom_model.pth")
744
+ if not custom_model:
745
+ return "Failed to download the custom model.", "", "", "", ""
746
+
747
+ run_dir = Path(output_path) / "run"
748
+
749
+ # Remove train dir
750
+ if run_dir.exists():
751
+ shutil.rmtree(run_dir)
752
+
753
+ # Check if the dataset language matches the language you specified
754
+ lang_file_path = Path(output_path) / "dataset" / "lang.txt"
755
+
756
+ # Check if lang.txt already exists and contains a different language
757
+ current_language = None
758
+ if lang_file_path.exists():
759
+ with open(lang_file_path, 'r', encoding='utf-8') as existing_lang_file:
760
+ current_language = existing_lang_file.read().strip()
761
+ if current_language != language:
762
+ print("The language that was prepared for the dataset does not match the specified language. Change the language to the one specified in the dataset")
763
+ language = current_language
764
+
765
+ if not train_csv or not eval_csv:
766
+ return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
767
+ try:
768
+ # convert seconds to waveform frames
769
+ max_audio_length = int(max_audio_length * 22050)
770
+ speaker_xtts_path, config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(custom_model, version, language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
771
+ except:
772
+ traceback.print_exc()
773
+ error = traceback.format_exc()
774
+ return f"The training was interrupted due to an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", ""
775
+
776
+ ready_dir = Path(output_path) / "ready"
777
+
778
+ ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
779
+
780
+ shutil.copy(ft_xtts_checkpoint, ready_dir / "unoptimize_model.pth")
781
+
782
+ ft_xtts_checkpoint = os.path.join(ready_dir, "unoptimize_model.pth")
783
+
784
+ # Move reference audio to output folder and rename it
785
+ speaker_reference_path = Path(speaker_wav)
786
+ speaker_reference_new_path = ready_dir / "reference.wav"
787
+ shutil.copy(speaker_reference_path, speaker_reference_new_path)
788
+
789
+ print("Model training done!")
790
+ return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_xtts_path, speaker_reference_new_path
791
+
792
+ def optimize_model(out_path, clear_train_data):
793
+ # print(out_path)
794
+ out_path = Path(out_path) # Ensure that out_path is a Path object.
795
+
796
+ ready_dir = out_path / "ready"
797
+ run_dir = out_path / "run"
798
+ dataset_dir = out_path / "dataset"
799
+
800
+ # Clear specified training data directories.
801
+ if clear_train_data in {"run", "all"} and run_dir.exists():
802
+ try:
803
+ shutil.rmtree(run_dir)
804
+ except PermissionError as e:
805
+ print(f"An error occurred while deleting {run_dir}: {e}")
806
+
807
+ if clear_train_data in {"dataset", "all"} and dataset_dir.exists():
808
+ try:
809
+ shutil.rmtree(dataset_dir)
810
+ except PermissionError as e:
811
+ print(f"An error occurred while deleting {dataset_dir}: {e}")
812
+
813
+ # Get full path to model
814
+ model_path = ready_dir / "unoptimize_model.pth"
815
+
816
+ if not model_path.is_file():
817
+ return "Unoptimized model not found in ready folder", ""
818
+
819
+ # Load the checkpoint and remove unnecessary parts.
820
+ checkpoint = torch.load(model_path, map_location=torch.device("cpu"))
821
+ del checkpoint["optimizer"]
822
+
823
+ for key in list(checkpoint["model"].keys()):
824
+ if "dvae" in key:
825
+ del checkpoint["model"][key]
826
+
827
+ # Make sure out_path is a Path object or convert it to Path
828
+ os.remove(model_path)
829
+
830
+ # Save the optimized model.
831
+ optimized_model_file_name="model.pth"
832
+ optimized_model=ready_dir/optimized_model_file_name
833
+
834
+ torch.save(checkpoint, optimized_model)
835
+ ft_xtts_checkpoint=str(optimized_model)
836
+
837
+ clear_gpu_cache()
838
+
839
+ return f"Model optimized and saved at {ft_xtts_checkpoint}!", ft_xtts_checkpoint
840
+
841
+ def load_params(out_path):
842
+ path_output = Path(out_path)
843
+
844
+ dataset_path = path_output / "dataset"
845
+
846
+ if not dataset_path.exists():
847
+ return "The output folder does not exist!", "", ""
848
+
849
+ eval_train = dataset_path / "metadata_train.csv"
850
+ eval_csv = dataset_path / "metadata_eval.csv"
851
+
852
+ # Write the target language to lang.txt in the output directory
853
+ lang_file_path = dataset_path / "lang.txt"
854
+
855
+ # Check if lang.txt already exists and contains a different language
856
+ current_language = None
857
+ if os.path.exists(lang_file_path):
858
+ with open(lang_file_path, 'r', encoding='utf-8') as existing_lang_file:
859
+ current_language = existing_lang_file.read().strip()
860
+
861
+ clear_gpu_cache()
862
+
863
+ print(current_language)
864
+ return "The data has been updated", eval_train, eval_csv, current_language
865
+
866
+
867
        # --- Tab 3: inference with a fine-tuned model ----------------------
        with gr.Tab("3 - Inference"):
            with gr.Row():
                with gr.Column() as col1:
                    load_params_tts_btn = gr.Button(value="Load params for TTS from output folder")
                    xtts_checkpoint = gr.Textbox(
                        label="XTTS checkpoint path:",
                        value="",
                    )
                    xtts_config = gr.Textbox(
                        label="XTTS config path:",
                        value="",
                    )

                    xtts_vocab = gr.Textbox(
                        label="XTTS vocab path:",
                        value="",
                    )
                    xtts_speaker = gr.Textbox(
                        label="XTTS speaker path:",
                        value="",
                    )
                    progress_load = gr.Label(
                        label="Progress:"
                    )
                    load_btn = gr.Button(value="Step 3 - Load XTTS model")

                with gr.Column() as col2:
                    speaker_reference_audio = gr.Textbox(
                        label="Speaker Reference Audio:",  # More descriptive label
                        value="",
                        interactive=True,  # Allow users to edit path manually
                    )
                    speaker_audio_upload = gr.Audio(
                        label="Upload Speaker Audio (wav, mp3, flac)",
                        type="filepath",  # Just keep type="filepath"
                    )
                    tts_language = gr.Dropdown(
                        label="Language",
                        value="en",
                        choices=list(zip(language_names.values(), language_names.keys()))
                    )
                    tts_text = gr.Textbox(
                        label="Input Text.",
                        value="This model sounds really good and above all, it's reasonably fast.",
                    )
                    # Sampling knobs; ignored when use_config is checked.
                    with gr.Accordion("Advanced settings", open=False) as acr:
                        temperature = gr.Slider(
                            label="temperature",
                            minimum=0,
                            maximum=1,
                            step=0.05,
                            value=0.75,
                        )
                        length_penalty = gr.Slider(
                            label="length_penalty",
                            minimum=-10.0,
                            maximum=10.0,
                            step=0.5,
                            value=1,
                        )
                        repetition_penalty = gr.Slider(
                            label="repetition penalty",
                            minimum=1,
                            maximum=10,
                            step=0.5,
                            value=5,
                        )
                        top_k = gr.Slider(
                            label="top_k",
                            minimum=1,
                            maximum=100,
                            step=1,
                            value=50,
                        )
                        top_p = gr.Slider(
                            label="top_p",
                            minimum=0,
                            maximum=1,
                            step=0.05,
                            value=0.85,
                        )
                        sentence_split = gr.Checkbox(
                            label="Enable text splitting",
                            value=True,
                        )
                        use_config = gr.Checkbox(
                            label="Use Inference settings from config, if disabled use the settings above",
                            value=False,
                        )
                    tts_btn = gr.Button(value="Step 4 - Inference")


                with gr.Column() as col3:
                    progress_gen = gr.Label(
                        label="Progress:"
                    )
                    tts_output_audio = gr.Audio(label="Generated Audio.")
                    reference_audio = gr.Audio(label="Reference audio used.")


        # --- Tab 4: inference using the bundled catalog of reference voices -
        with gr.Tab("2161 Voices"):
            with gr.Row():
                with gr.Column() as col1:
                    #load_params_tts_btn = gr.Button(value="Load params for TTS from output folder")
                    xtts_checkpoint0 = gr.Textbox(
                        label="XTTS checkpoint path:",
                        value="/content/xtts-webui/model/model.pth",
                    )
                    xtts_config0 = gr.Textbox(
                        label="XTTS config path:",
                        value="/content/xtts-webui/model/config.json",
                    )

                    xtts_vocab0 = gr.Textbox(
                        label="XTTS vocab path:",
                        value="/content/xtts-webui/model/vocab.json",
                    )
                    xtts_speaker0 = gr.Textbox(
                        label="XTTS speaker path:",
                        value="/content/xtts-webui/model/speakers_xtts.pth",
                    )
                    progress_load0 = gr.Label(
                        label="Progress:"
                    )
                    load_btn0 = gr.Button(value="Load model")

                with gr.Column() as col2:

                    # Language selection dropdown for the reference voices
                    selected_language0 = gr.Dropdown(list(leng_and_ids.keys()), value="Select language", label="Language reference audio")
                    speaker_reference_audio0 = gr.Dropdown(interactive=True, allow_custom_value=True, label="Speaker reference audio:")
                    text_output0 = gr.Textbox(label="Audio reference information")

                    tts_language0 = gr.Dropdown(
                        label="Language",
                        value="en",
                        choices=list(zip(language_names.values(), language_names.keys()))
                    )
                    tts_text0 = gr.Textbox(
                        label="Input Text.",
                        value="This model sounds really good and above all, it's reasonably fast.",
                    )
                    with gr.Accordion("Advanced settings", open=False) as acr:
                        temperature0 = gr.Slider(
                            label="temperature",
                            minimum=0,
                            maximum=1,
                            step=0.05,
                            value=0.75,
                        )
                        length_penalty0 = gr.Slider(
                            label="length_penalty",
                            minimum=-10.0,
                            maximum=10.0,
                            step=0.5,
                            value=1,
                        )
                        repetition_penalty0 = gr.Slider(
                            label="repetition penalty",
                            minimum=1,
                            maximum=10,
                            step=0.5,
                            value=5,
                        )
                        top_k0 = gr.Slider(
                            label="top_k",
                            minimum=1,
                            maximum=100,
                            step=1,
                            value=50,
                        )
                        top_p0 = gr.Slider(
                            label="top_p",
                            minimum=0,
                            maximum=1,
                            step=0.05,
                            value=0.85,
                        )
                        sentence_split0 = gr.Checkbox(
                            label="Enable text splitting",
                            value=True,
                        )
                        use_config0 = gr.Checkbox(
                            label="Use Inference settings from config, if disabled use the settings above",
                            value=False,
                        )
                    tts_btn0 = gr.Button(value="Generate")


                    # Repopulate the voice list when the language changes.
                    selected_language0.change(load_names, inputs=selected_language0, outputs=speaker_reference_audio0)
                    #speaker_reference_audio0.change(load_text, inputs=[speaker_reference_audio0, selected_language0], outputs=text_output0)


                with gr.Column() as col3:
                    progress_gen0 = gr.Label(
                        label="Progress:"
                    )
                    tts_output_audio0 = gr.Audio(label="Generated Audio.")
                    reference_audio0 = gr.Audio(label="Reference audio used.")
                    # Show the transcript and audio of the selected reference voice.
                    speaker_reference_audio0.change(update_reference_info, inputs=[speaker_reference_audio0, selected_language0], outputs=[text_output0, reference_audio0])
1067
+
1068
+
1069
+
1070
+
1071
+
1072
        # --- Event wiring ---------------------------------------------------
        prompt_compute_btn.click(
            fn=preprocess_dataset,
            inputs=[
                upload_file,
                audio_folder_path,
                lang,
                whisper_model,
                out_path,
                train_csv,
                eval_csv
            ],
            outputs=[
                progress_data,
                train_csv,
                eval_csv,
            ],
        )


        load_params_btn.click(
            fn=load_params,
            inputs=[out_path],
            outputs=[
                progress_train,
                train_csv,
                eval_csv,
                lang
            ]
        )


        train_btn.click(
            fn=train_model,
            inputs=[
                custom_model,
                version,
                lang,
                train_csv,
                eval_csv,
                num_epochs,
                batch_size,
                grad_acumm,
                out_path,
                max_audio_length,
            ],
            outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint,xtts_speaker, speaker_reference_audio],
        )

        # Feeds the optimized checkpoint path into the "2161 Voices" tab.
        optimize_model_btn.click(
            fn=optimize_model,
            inputs=[
                out_path,
                clear_train_data
            ],
            outputs=[progress_train,xtts_checkpoint0],
        )


        load_btn0.click(
            fn=load_model,
            inputs=[
                xtts_checkpoint0,
                xtts_config0,
                xtts_vocab0,
                xtts_speaker0
            ],
            outputs=[progress_load0],
        )

        tts_btn0.click(
            fn=run_tts0,
            inputs=[
                selected_language0,
                tts_language0,
                tts_text0,
                speaker_reference_audio0,
                temperature0,
                length_penalty0,
                repetition_penalty0,
                top_k0,
                top_p0,
                sentence_split0,
                use_config0
            ],
            outputs=[progress_gen0, tts_output_audio0,reference_audio0],
        )

        load_btn.click(
            fn=load_model,
            inputs=[
                xtts_checkpoint,
                xtts_config,
                xtts_vocab,
                xtts_speaker
            ],
            outputs=[progress_load],
        )


        tts_btn.click(
            fn=run_tts,
            inputs=[
                tts_language,
                tts_text,
                speaker_reference_audio,
                temperature,
                length_penalty,
                repetition_penalty,
                top_k,
                top_p,
                sentence_split,
                use_config
            ],
            outputs=[progress_gen, tts_output_audio,reference_audio],
        )

        load_params_tts_btn.click(
            fn=load_params_tts,
            inputs=[
                out_path,
                version
            ],
            outputs=[progress_load,xtts_checkpoint,xtts_config,xtts_vocab,xtts_speaker,speaker_reference_audio],
        )

        speaker_audio_upload.upload(
            upload_audio,
            inputs=[speaker_audio_upload, speaker_reference_audio],
            outputs=speaker_reference_audio,
        )


    demo.launch(
        share=args.share,
        debug=False,
        server_port=args.port,
        # inweb=True,
        # server_name="localhost"
    )