Spaces:
Runtime error
Runtime error
Commit ·
899cf32
0
Parent(s):
Initial commit
Browse files- .gitignore +169 -0
- LICENSE +21 -0
- README.md +31 -0
- models/.gitkeep +0 -0
- notebooks/.gitkeep +0 -0
- scripts/cloning_inference.py +30 -0
- scripts/inference_config.json +7 -0
- scripts/input/hank.mp3 +0 -0
- scripts/input/homer.mp3 +0 -0
- scripts/output/.gitkeep +0 -0
- scripts/train.py +69 -0
- scripts/training_config.json +9 -0
- setup.py +106 -0
- src/deep_voice_cloning/__init__.py +0 -0
- src/deep_voice_cloning/cloning/__init__.py +0 -0
- src/deep_voice_cloning/cloning/config.json +7 -0
- src/deep_voice_cloning/cloning/model.py +54 -0
- src/deep_voice_cloning/data/__init__.py +0 -0
- src/deep_voice_cloning/data/collator.py +45 -0
- src/deep_voice_cloning/data/dataset.py +63 -0
- src/deep_voice_cloning/transcriber/__init__.py +0 -0
- src/deep_voice_cloning/transcriber/config.json +7 -0
- src/deep_voice_cloning/transcriber/model.py +22 -0
.gitignore
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Initially taken from Github's Python gitignore file
|
| 2 |
+
|
| 3 |
+
# Byte-compiled / optimized / DLL files
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*$py.class
|
| 7 |
+
|
| 8 |
+
# C extensions
|
| 9 |
+
*.so
|
| 10 |
+
|
| 11 |
+
# tests and logs
|
| 12 |
+
tests/fixtures/cached_*_text.txt
|
| 13 |
+
logs/
|
| 14 |
+
lightning_logs/
|
| 15 |
+
lang_code_data/
|
| 16 |
+
|
| 17 |
+
# Distribution / packaging
|
| 18 |
+
.Python
|
| 19 |
+
build/
|
| 20 |
+
develop-eggs/
|
| 21 |
+
dist/
|
| 22 |
+
downloads/
|
| 23 |
+
eggs/
|
| 24 |
+
.eggs/
|
| 25 |
+
lib/
|
| 26 |
+
lib64/
|
| 27 |
+
parts/
|
| 28 |
+
sdist/
|
| 29 |
+
var/
|
| 30 |
+
wheels/
|
| 31 |
+
*.egg-info/
|
| 32 |
+
.installed.cfg
|
| 33 |
+
*.egg
|
| 34 |
+
MANIFEST
|
| 35 |
+
|
| 36 |
+
# PyInstaller
|
| 37 |
+
# Usually these files are written by a python script from a template
|
| 38 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 39 |
+
*.manifest
|
| 40 |
+
*.spec
|
| 41 |
+
|
| 42 |
+
# Installer logs
|
| 43 |
+
pip-log.txt
|
| 44 |
+
pip-delete-this-directory.txt
|
| 45 |
+
|
| 46 |
+
# Unit test / coverage reports
|
| 47 |
+
htmlcov/
|
| 48 |
+
.tox/
|
| 49 |
+
.nox/
|
| 50 |
+
.coverage
|
| 51 |
+
.coverage.*
|
| 52 |
+
.cache
|
| 53 |
+
nosetests.xml
|
| 54 |
+
coverage.xml
|
| 55 |
+
*.cover
|
| 56 |
+
.hypothesis/
|
| 57 |
+
.pytest_cache/
|
| 58 |
+
|
| 59 |
+
# Translations
|
| 60 |
+
*.mo
|
| 61 |
+
*.pot
|
| 62 |
+
|
| 63 |
+
# Django stuff:
|
| 64 |
+
*.log
|
| 65 |
+
local_settings.py
|
| 66 |
+
db.sqlite3
|
| 67 |
+
|
| 68 |
+
# Flask stuff:
|
| 69 |
+
instance/
|
| 70 |
+
.webassets-cache
|
| 71 |
+
|
| 72 |
+
# Scrapy stuff:
|
| 73 |
+
.scrapy
|
| 74 |
+
|
| 75 |
+
# Sphinx documentation
|
| 76 |
+
docs/_build/
|
| 77 |
+
|
| 78 |
+
# PyBuilder
|
| 79 |
+
target/
|
| 80 |
+
|
| 81 |
+
# Jupyter Notebook
|
| 82 |
+
.ipynb_checkpoints
|
| 83 |
+
|
| 84 |
+
# IPython
|
| 85 |
+
profile_default/
|
| 86 |
+
ipython_config.py
|
| 87 |
+
|
| 88 |
+
# pyenv
|
| 89 |
+
.python-version
|
| 90 |
+
|
| 91 |
+
# celery beat schedule file
|
| 92 |
+
celerybeat-schedule
|
| 93 |
+
|
| 94 |
+
# SageMath parsed files
|
| 95 |
+
*.sage.py
|
| 96 |
+
|
| 97 |
+
# Environments
|
| 98 |
+
.env
|
| 99 |
+
.venv
|
| 100 |
+
env/
|
| 101 |
+
venv/
|
| 102 |
+
ENV/
|
| 103 |
+
env.bak/
|
| 104 |
+
venv.bak/
|
| 105 |
+
|
| 106 |
+
# Spyder project settings
|
| 107 |
+
.spyderproject
|
| 108 |
+
.spyproject
|
| 109 |
+
|
| 110 |
+
# Rope project settings
|
| 111 |
+
.ropeproject
|
| 112 |
+
|
| 113 |
+
# mkdocs documentation
|
| 114 |
+
/site
|
| 115 |
+
|
| 116 |
+
# mypy
|
| 117 |
+
.mypy_cache/
|
| 118 |
+
.dmypy.json
|
| 119 |
+
dmypy.json
|
| 120 |
+
|
| 121 |
+
# Pyre type checker
|
| 122 |
+
.pyre/
|
| 123 |
+
|
| 124 |
+
# vscode
|
| 125 |
+
.vs
|
| 126 |
+
.vscode
|
| 127 |
+
|
| 128 |
+
# Pycharm
|
| 129 |
+
.idea
|
| 130 |
+
|
| 131 |
+
# TF code
|
| 132 |
+
tensorflow_code
|
| 133 |
+
|
| 134 |
+
# Models
|
| 135 |
+
proc_data
|
| 136 |
+
|
| 137 |
+
# examples
|
| 138 |
+
runs
|
| 139 |
+
/runs_old
|
| 140 |
+
/wandb
|
| 141 |
+
/examples/runs
|
| 142 |
+
/examples/**/*.args
|
| 143 |
+
/examples/rag/sweep
|
| 144 |
+
|
| 145 |
+
# data
|
| 146 |
+
/data
|
| 147 |
+
serialization_dir
|
| 148 |
+
|
| 149 |
+
# emacs
|
| 150 |
+
*.*~
|
| 151 |
+
debug.env
|
| 152 |
+
|
| 153 |
+
# vim
|
| 154 |
+
.*.swp
|
| 155 |
+
|
| 156 |
+
#ctags
|
| 157 |
+
tags
|
| 158 |
+
|
| 159 |
+
# pre-commit
|
| 160 |
+
.pre-commit*
|
| 161 |
+
|
| 162 |
+
# .lock
|
| 163 |
+
*.lock
|
| 164 |
+
|
| 165 |
+
# DS_Store (MacOS)
|
| 166 |
+
.DS_Store
|
| 167 |
+
|
| 168 |
+
# ruff
|
| 169 |
+
.ruff_cache
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2023 Konstantin Verner
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Few-Shot Voice Cloning
|
| 2 |
+
|
| 3 |
+
This repository is an implementation of the pipeline for few-shot voice cloning based on the SpeechT5 architecture introduced in [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205).
|
| 4 |
+
It is able to clone a voice from 15-30 seconds of audio recording in English (other languages are planned).
|
| 5 |
+
|
| 6 |
+
# Getting Started
|
| 7 |
+
|
| 8 |
+
Clone repository
|
| 9 |
+
```angular2html
|
| 10 |
+
git clone https://github.com/konverner/deep-voice-cloning.git
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
Install the modules
|
| 14 |
+
```angular2html
|
| 15 |
+
pip install .
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
Run training, specifying arguments either in the config file `training_config.json` or on the command line, for example
|
| 19 |
+
```angular2html
|
| 20 |
+
python scripts/train.py --audio_path scripts/input/hank.mp3 --output_dir /content/deep-voice-cloning/models
|
| 21 |
+
```
|
| 22 |
+
The resulting model will be saved in the `output_dir` directory. It will be used in the next step.
|
| 23 |
+
|
| 24 |
+
Run inference, specifying arguments either in the config file `inference_config.json` or on the command line, for example
|
| 25 |
+
```angular2html
|
| 26 |
+
python scripts/cloning_inference.py --model_path "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank"\
|
| 27 |
+
--input_text 'do the things, not because they are easy, but because they are hard'\
|
| 28 |
+
--output_path "scripts/output/do_the_things.wav"
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
The resulting audio file will be saved at `output_path`.
|
models/.gitkeep
ADDED
|
File without changes
|
notebooks/.gitkeep
ADDED
|
File without changes
|
scripts/cloning_inference.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
|
| 7 |
+
from deep_voice_cloning.cloning.model import CloningModel
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
    # Every option defaults to None so that "not given on the command line"
    # can be told apart from an explicit value when merging with the config.
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--model_path", "Path to model directory"),
        ("--input_text", "Text to be synthesized"),
        ("--output_path", "Path to output audio file"),
    ):
        cli.add_argument(flag, type=str, default=None, help=description)
    args = cli.parse_args()

    # Defaults live in the JSON file that sits next to this script.
    config_file = os.path.join(os.path.dirname(__file__), "inference_config.json")
    with open(config_file) as f:
        config = json.load(f)

    # Command-line values, when present, take precedence over the JSON defaults.
    # The argparse destinations carry the same names as the config keys.
    config.update({key: value for key, value in vars(args).items() if value is not None})

    # Build the model from the merged config and synthesize the text.
    cloning_model = CloningModel(config)
    waveform_array = cloning_model.forward(config["input_text"])

    # The waveform is written with a fixed 16 kHz sample rate.
    sf.write(config['output_path'], waveform_array, samplerate=16000)
|
scripts/inference_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_path": "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill",
|
| 3 |
+
"speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
|
| 4 |
+
"vocoder_name": "microsoft/speecht5_hifigan",
|
| 5 |
+
"input_text": "do the things, not because they are easy, but because they are hard",
|
| 6 |
+
"output_path": "/content/deep-voice-cloning/scripts/output/do_the_things.wav"
|
| 7 |
+
}
|
scripts/input/hank.mp3
ADDED
|
Binary file (526 kB). View file
|
|
|
scripts/input/homer.mp3
ADDED
|
Binary file (913 kB). View file
|
|
|
scripts/output/.gitkeep
ADDED
|
File without changes
|
scripts/train.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
|
| 7 |
+
|
| 8 |
+
from deep_voice_cloning.cloning.model import CloningModel
|
| 9 |
+
from deep_voice_cloning.transcriber.model import TranscriberModel
|
| 10 |
+
from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding
|
| 11 |
+
from deep_voice_cloning.data.dataset import get_cloning_dataset
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
if __name__ == "__main__":
    # Every option defaults to None so that "not given on the command line"
    # can be told apart from an explicit value when merging with the config.
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--lang", "Language of speech samples"),
        ("--audio_path", "Path to training audio file"),
        ("--output_dir", "Path to output directory for trained model"),
    ):
        cli.add_argument(flag, type=str, default=None, help=description)
    args = cli.parse_args()

    # Defaults live in the JSON file that sits next to this script.
    config_file = os.path.join(os.path.dirname(__file__), "training_config.json")
    with open(config_file) as f:
        training_config = json.load(f)

    # Command-line values, when present, take precedence over the JSON defaults.
    # The argparse destinations carry the same names as the config keys.
    training_config.update({key: value for key, value in vars(args).items() if value is not None})

    lang = training_config['lang']
    transcriber_model = TranscriberModel(lang=lang)
    cloning_model = CloningModel(lang=lang)

    # The transcriber labels the audio; the cloning model supplies the processor
    # and speaker encoder used to turn each chunk into training features.
    dataset = get_cloning_dataset(training_config['audio_path'], transcriber_model, cloning_model)
    data_collator = TTSDataCollatorWithPadding(processor=cloning_model.processor, model=cloning_model.model)

    # Mixed precision is enabled only when the transcriber reports a CUDA device.
    use_fp16 = transcriber_model.device == torch.device("cuda")

    training_args = Seq2SeqTrainingArguments(
        output_dir=training_config["output_dir"],
        per_device_train_batch_size=training_config['batch_size'],
        gradient_accumulation_steps=2,
        overwrite_output_dir=True,
        learning_rate=training_config['learning_rate'],
        warmup_steps=training_config['warmup_steps'],
        max_steps=training_config['max_steps'],
        gradient_checkpointing=True,
        fp16=use_fp16,
        evaluation_strategy="steps",
        per_device_eval_batch_size=8,
        save_strategy="no",
        eval_steps=100,
        logging_steps=20,
        load_best_model_at_end=False,
        greater_is_better=False,
        label_names=["labels"],
    )

    # NOTE: the same dataset is used for training and evaluation.
    trainer = Seq2SeqTrainer(
        args=training_args,
        model=cloning_model.model,
        train_dataset=dataset,
        eval_dataset=dataset,
        data_collator=data_collator,
        tokenizer=cloning_model.processor.tokenizer,
    )

    trainer.train()

    # Save under "<output_dir>/<base model id with '/' -> '_'>_<audio file stem>".
    base_model_tag = cloning_model.config['model_path'].replace('/', '_')
    speaker_tag = training_config['audio_path'].split('/')[-1].split('.')[0]
    cloning_model.save_pretrained(training_config["output_dir"] + '/' + base_model_tag + '_' + speaker_tag)
|
scripts/training_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_path": "/content/deep-voice-cloning/scripts/input/hank_hill.mp3",
|
| 3 |
+
"output_dir": "/content/deep-voice-cloning/models",
|
| 4 |
+
"lang": "en",
|
| 5 |
+
"batch_size": 2,
|
| 6 |
+
"learning_rate": 1e-4,
|
| 7 |
+
"max_steps": 1500,
|
| 8 |
+
"warmup_steps": 250
|
| 9 |
+
}
|
setup.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from setuptools import find_packages, setup
|
| 4 |
+
|
| 5 |
+
# The README doubles as the long description shown on the package index.
README_TEXT = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")

MAINTAINER = "Konstantin Verner"
MAINTAINER_EMAIL = "konst.verner@gmail.com"
# Repository location as given in the README's "Getting Started" section.
PROJECT_URL = "https://github.com/konverner/deep-voice-cloning"

# Runtime dependencies of the package (mostly exact pins).
REQUIRED_PKGS = [
    "accelerate==0.21.0",
    "aiohttp==3.8.4",
    "aiosignal==1.3.1",
    "appdirs==1.4.4",
    "async-timeout==4.0.2",
    "attrs==23.1.0",
    "audioread==3.0.0",
    "certifi==2023.5.7",
    "cffi==1.15.1",
    "charset-normalizer==3.2.0",
    "colorama==0.4.6",
    "datasets==2.13.1",
    "decorator>=4.0.2",
    "dill==0.3.6",
    "filelock==3.12.2",
    "frozenlist==1.4.0",
    "fsspec==2023.6.0",
    "huggingface-hub==0.16.4",
    "HyperPyYAML==1.2.1",
    "idna==3.4",
    "Jinja2==3.1.2",
    "joblib==1.3.1",
    "lazy_loader==0.3",
    "librosa==0.10.0.post2",
    "llvmlite==0.40.1",
    "MarkupSafe==2.1.3",
    "mpmath==1.3.0",
    "msgpack==1.0.5",
    "multidict==6.0.4",
    "multiprocess==0.70.14",
    "networkx==3.1",
    "numba==0.57.1",
    "numpy>=1.22",
    "packaging==23.1",
    "pandas>=1.5.3",
    "pooch==1.6.0",
    "psutil==5.9.5",
    "pyarrow>=3.0.0",
    "pycparser==2.21",
    "python-dateutil==2.8.2",
    "pytz==2023.3",
    "PyYAML==6.0",
    "ruamel.yaml==0.17.28",
    "ruamel.yaml.clib==0.2.7",
    "safetensors==0.3.1",
    "scikit-learn==1.3.0",
    "scipy==1.11.1",
    "sentencepiece==0.1.99",
    "six==1.16.0",
    "soundfile==0.12.1",
    "soxr==0.3.5",
    "speechbrain==0.5.14",
    "sympy==1.12",
    "threadpoolctl==3.2.0",
    "tokenizers==0.13.3",
    "torch==2.0.1",
    "torchaudio==2.0.2",
    "tqdm==4.65.0",
    "transformers==4.30.2",
    "typing_extensions==4.7.1",
    "tzdata==2023.3",
    "urllib3==2.0.3",
    "xxhash==3.2.0",
    "yarl==1.9.2",
]

setup(
    name="deep_voice_cloning",
    version="0.1.0",
    description="Few-Shot Voice Cloning",
    long_description=README_TEXT,
    long_description_content_type="text/markdown",
    maintainer=MAINTAINER,
    maintainer_email=MAINTAINER_EMAIL,
    url=PROJECT_URL,
    download_url="",
    license="MIT",
    # Packages live under src/ (the "src layout").
    package_dir={"": "src"},
    packages=find_packages("src"),
    include_package_data=True,
    # Ship the per-package config.json files alongside the code.
    package_data={"": ["*.json"]},
    install_requires=REQUIRED_PKGS,
    classifiers=[
        "Development Status :: 1 - Planning",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        # The registered trove classifier is "... :: MIT License"; the shorter
        # "... :: MIT" is not a valid classifier.
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    keywords="asr, machine learning, fewshot learning, transformers",
    # The packages open their bundled config.json via os.path.dirname(__file__),
    # so they must be installed as plain files rather than inside a zip.
    zip_safe=False,
)
|
src/deep_voice_cloning/__init__.py
ADDED
|
File without changes
|
src/deep_voice_cloning/cloning/__init__.py
ADDED
|
File without changes
|
src/deep_voice_cloning/cloning/config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"en": {
|
| 3 |
+
"model_path": "microsoft/speecht5_tts",
|
| 4 |
+
"vocoder_name": "microsoft/speecht5_hifigan",
|
| 5 |
+
"speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
|
| 6 |
+
}
|
| 7 |
+
}
|
src/deep_voice_cloning/cloning/model.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from typing import Dict
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
from speechbrain.pretrained import EncoderClassifier
|
| 8 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class CloningModel:
    """SpeechT5 text-to-speech wrapper that carries the speaker embedding of a cloned voice.

    Bundles the SpeechT5 processor and acoustic model, the HiFi-GAN vocoder and
    a SpeechBrain speaker encoder, all of which are named by ``config``.
    """

    def __init__(self, config: Dict[str, str] = None, lang: str = 'en'):
        """Load all sub-models.

        Args:
            config: Flat mapping with the keys ``model_path``, ``vocoder_name``
                and ``speaker_model_name``. When given, ``model_path`` must also
                contain a ``speaker_embedding.pt`` file (as written by
                ``save_pretrained``). When ``None``, the entry for ``lang`` in
                the ``config.json`` next to this module is used and no speaker
                embedding is loaded.
            lang: Language key into the bundled ``config.json``; only read when
                ``config`` is ``None``.
        """
        super(CloningModel, self).__init__()
        if config is None:
            # Fresh base model: the embedding is produced later by
            # create_speaker_embedding().
            self.speaker_embedding = None
            with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
                self.config = json.load(f)[lang]
        else:
            self.config = config
            embedding_path = os.path.join(self.config['model_path'], "speaker_embedding.pt")
            # map_location keeps loading independent of the device the tensor
            # was saved from; forward() moves it to the active device anyway.
            self.speaker_embedding = torch.load(embedding_path, map_location="cpu")[0]
        self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
        self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
        self.to(self.device)

    def to(self, device: torch.device):
        """Move the acoustic model and the vocoder to ``device``.

        ``self.device`` is updated as well so that ``forward`` places its inputs
        on the same device as the models. The speaker encoder is not moved.

        Returns:
            ``self``, to allow call chaining.
        """
        self.model = self.model.to(device)
        self.vocoder = self.vocoder.to(device)
        self.device = torch.device(device)
        return self

    def save_pretrained(self, save_directory: str):
        """Save the acoustic model, the processor and the current speaker embedding.

        The embedding is written to ``speaker_embedding.pt`` inside
        ``save_directory``, where ``__init__`` looks for it.
        """
        self.model.save_pretrained(save_directory)
        self.processor.save_pretrained(save_directory)
        torch.save(self.speaker_embedding, os.path.join(save_directory, "speaker_embedding.pt"))

    def forward(self, text: str) -> np.ndarray:
        """Synthesize ``text`` in the cloned voice.

        Args:
            text: Text to be spoken.

        Returns:
            The waveform produced by the vocoder as a NumPy array.

        Raises:
            RuntimeError: If no speaker embedding is available yet.
        """
        if self.speaker_embedding is None:
            raise RuntimeError(
                "No speaker embedding available: load a fine-tuned model directory "
                "or call create_speaker_embedding() first."
            )
        speaker_embedding = self.speaker_embedding
        # create_speaker_embedding() stores the un-squeezed encoder output
        # (normalized over dim=2, hence 3-D), whereas __init__ stores element
        # [0] of the saved tensor. Reduce the former to the same layout.
        if speaker_embedding.dim() == 3:
            speaker_embedding = speaker_embedding[0]

        # tokenize text
        inputs = self.processor(text=text, return_tensors="pt")
        # generate spectrogram using backbone model
        spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
                                                 speaker_embedding.to(self.device))
        # decode spectrogram into waveform using vocoder
        with torch.no_grad():
            waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
        return waveform_array

    def create_speaker_embedding(self, waveform: torch.Tensor) -> torch.Tensor:
        """Compute a normalized speaker embedding for ``waveform``.

        The un-squeezed embedding is also stored on ``self.speaker_embedding``,
        replacing any previous one, so the most recent call decides what
        ``save_pretrained`` writes.

        Returns:
            The embedding with all singleton dimensions squeezed out.
        """
        with torch.no_grad():
            speaker_embeddings = self.speaker_model.encode_batch(waveform)
            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
            self.speaker_embedding = speaker_embeddings
            speaker_embeddings = speaker_embeddings.squeeze()
        return speaker_embeddings
|
src/deep_voice_cloning/data/__init__.py
ADDED
|
File without changes
|
src/deep_voice_cloning/data/collator.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from typing import Any, Dict, List, Union
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class TTSDataCollatorWithPadding:
    """Collate prepared TTS examples into one padded training batch."""

    def __init__(self, model, processor):
        # model: only ``model.config.reduction_factor`` is read.
        # processor: only ``processor.pad`` is called.
        self.model = model
        self.processor = processor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and assemble the batch.

        Each feature must provide ``input_ids``, ``labels`` and
        ``speaker_embeddings``. The returned batch holds the padded inputs from
        ``processor.pad``, ``labels`` with padded positions set to -100, and the
        stacked ``speaker_embeddings``.
        """
        token_inputs, spectrogram_targets, xvectors = [], [], []
        for item in features:
            token_inputs.append({"input_ids": item["input_ids"]})
            spectrogram_targets.append({"input_values": item["labels"]})
            xvectors.append(item["speaker_embeddings"])

        # collate the inputs and targets into a batch
        batch = self.processor.pad(
            input_ids=token_inputs,
            labels=spectrogram_targets,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        padded_positions = batch.decoder_attention_mask.unsqueeze(-1).ne(1)
        batch["labels"] = batch["labels"].masked_fill(padded_positions, -100)

        # not used during fine-tuning
        del batch["decoder_attention_mask"]

        # round down target lengths to multiple of reduction factor
        reduction = self.model.config.reduction_factor
        if reduction > 1:
            frame_counts = [len(target["input_values"]) for target in spectrogram_targets]
            longest = max(count - count % reduction for count in frame_counts)
            batch["labels"] = batch["labels"][:, :longest]

        # add the speaker embeddings
        batch["speaker_embeddings"] = torch.tensor(xvectors)

        return batch
|
src/deep_voice_cloning/data/dataset.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Any
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import librosa
|
| 5 |
+
import numpy as np
|
| 6 |
+
from datasets import Dataset
|
| 7 |
+
|
| 8 |
+
from ..cloning.model import CloningModel
|
| 9 |
+
from ..transcriber.model import TranscriberModel
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
    """
    Convert one transcribed audio chunk into training features.

    ``example`` must provide ``example["audio"]["array"]`` (the waveform) and
    ``example["normalized_text"]`` (its transcript). The result is the output of
    ``model.processor`` with un-batched ``input_ids`` and ``labels`` plus a
    ``speaker_embeddings`` entry computed by ``model.create_speaker_embedding``.
    """
    waveform = example["audio"]["array"]

    # feature extraction and tokenization
    features = model.processor(
        text=example["normalized_text"],
        audio_target=waveform,
        sampling_rate=16000,
        return_attention_mask=False,
    )

    # strip off the batch dimension (input_ids only carry one when the
    # processor returned them batched)
    token_ids = features['input_ids']
    if torch.tensor(token_ids).dim() > 1:
        features['input_ids'] = token_ids[0]

    features["labels"] = features["labels"][0]

    # speaker embedding of this chunk, stored as a NumPy array
    xvector = model.create_speaker_embedding(torch.tensor(waveform))
    features["speaker_embeddings"] = xvector.numpy()

    return features
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_cloning_dataset(input_audio_path: str,
                        transcriber_model: TranscriberModel,
                        cloning_model: CloningModel,
                        sampling_rate: int = 16000,
                        window_size_secs: int = 5) -> Dataset:
    """
    Build a training dataset from a single audio file.

    The file is loaded at ``sampling_rate``, cut into consecutive windows of
    ``window_size_secs`` seconds (the last one may be shorter), and every window
    is transcribed with ``transcriber_model``. Each (audio, transcript) pair is
    then turned into model features by ``prepare_dataset``.
    """
    waveform, _ = librosa.load(input_audio_path, sr=sampling_rate)

    # Cutting at index 0 makes np.split emit an empty leading piece; drop it.
    samples_per_window = window_size_secs * sampling_rate
    cut_points = range(0, len(waveform), samples_per_window)
    chunks = np.split(waveform, cut_points)[1:]

    transcripts = [
        transcriber_model.forward(chunk, sampling_rate=sampling_rate)
        for chunk in chunks
    ]

    rows = [
        {'audio': {'array': chunk}, 'normalized_text': transcript}
        for chunk, transcript in zip(chunks, transcripts)
    ]
    dataset = Dataset.from_list(rows)

    # Replace the raw columns with the processed training features.
    dataset = dataset.map(
        prepare_dataset,
        fn_kwargs={'model': cloning_model},
        remove_columns=dataset.column_names,
    )

    return dataset
|
src/deep_voice_cloning/transcriber/__init__.py
ADDED
|
File without changes
|
src/deep_voice_cloning/transcriber/config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"language_model_names": {
|
| 3 |
+
"en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
|
| 4 |
+
"fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
|
| 5 |
+
"de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
|
| 6 |
+
}
|
| 7 |
+
}
|
src/deep_voice_cloning/transcriber/model.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TranscriberModel:
    """Speech-to-text helper built on a pretrained Wav2Vec2 CTC checkpoint."""

    def __init__(self, lang: str = 'en'):
        """Load the processor and model registered for ``lang``.

        Args:
            lang: Key into ``language_model_names`` in the ``config.json`` next
                to this module; an unknown key raises ``KeyError``.
        """
        with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
            config = json.load(f)
        model_name = config['language_model_names'][lang]
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Place the model on the device it advertises; previously the attribute
        # was set but the model was never moved there.
        self.model = self.model.to(self.device)

    def forward(self, speech_array: np.ndarray, sampling_rate: int = 16000) -> list:
        """Transcribe ``speech_array``.

        Args:
            speech_array: Waveform to transcribe.
            sampling_rate: Sampling rate of ``speech_array``, passed to the processor.

        Returns:
            The output of ``processor.batch_decode``: a list of decoded strings,
            not a bare string.
        """
        model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
        # Inputs must live on the same device as the model.
        input_values = model_input.input_values.to(self.device)
        attention_mask = model_input.attention_mask.to(self.device)
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits
        # Greedy CTC decoding: most likely token per frame.
        predicted_ids = torch.argmax(logits, dim=-1)
        return self.processor.batch_decode(predicted_ids)
|