Spaces:
Running
Running
# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer_preprocess_embeds.py (Neural Latent Distillation)
# ==================================================================================================
#
# DESCRIPTION
#   This script bridges the gap between speaker identification and speech synthesis.
#   It uses a pre-trained speaker encoder to generate latent speaker embeddings
#   for every utterance in the synthesis dataset. These embeddings provide the
#   high-dimensional "identity signals" that allow the synthesizer to adapt its
#   output to a specific target voice.
#
# AUTHORS
#   - Amey Thakur (https://github.com/Amey-Thakur)
#   - Mega Satish (https://github.com/msatmod)
#
# CREDITS
#   Original Real-Time Voice Cloning methodology by CorentinJ
#   Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
#   Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
#   Video Demo: https://youtu.be/i3wnBcbHDbs
#   Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
#   Released under the MIT License
#   Release Date: 2021-02-06
# ==================================================================================================
import argparse
from pathlib import Path

from synthesizer.preprocess import create_embeddings
from utils.argutils import print_args

if __name__ == "__main__":
    # CLI entry point: derive one latent speaker embedding per utterance in a
    # preprocessed synthesizer dataset, using a pre-trained speaker encoder.
    parser = argparse.ArgumentParser(
        description="Embedding Distiller: Derives latent speaker vectors for synthesis training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # --- PATH & MODEL PARAMETERS ---
    parser.add_argument("synthesizer_root", type=Path,
                        help="Root directory of the synthesizer training data (generated by synthesizer_preprocess_audio.py).")
    # BUGFIX: the default must itself be a Path, not a str -- argparse applies
    # `type=` only to values parsed from the command line, never to `default=`,
    # so a string default would reach create_embeddings() unconverted.
    parser.add_argument("-e", "--encoder_model_fpath", type=Path,
                        default=Path("saved_models/default/encoder.pt"),
                        help="Path to the pre-trained neural encoder (.pt).")

    # --- COMPUTE ORCHESTRATION ---
    parser.add_argument("-n", "--n_processes", type=int, default=4,
                        help="Degree of parallelism. Note: High parallelism may exhaust GPU VRAM during distillation.")
    args = parser.parse_args()

    # --- EXECUTION ---
    # Echo the resolved arguments, then hand everything to the embedding
    # generator (argument names match create_embeddings' parameters).
    print_args(args, parser)
    print("π€π» Scholarly Partnership: Amey Thakur & Mega Satish")
    print("π Distilling acoustic identities into neural embeddings...")
    # Execute the embedding generation engine.
    create_embeddings(**vars(args))