# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer_preprocess_embeds.py (Neural Latent Distillation)
# ==================================================================================================
#
# 📝 DESCRIPTION
# This script bridges the gap between Speaker Identification and Speech Synthesis.
# It utilizes a pre-trained Speaker Encoder to generate latent speaker embeddings
# for every utterance in the synthesis dataset. These embeddings provide the
# high-dimensional "identity signals" that allow the Synthesizer to adapt its
# output to a specific target voice.
#
# 👤 AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🤝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# 🔗 PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# 📜 LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================

import argparse
from pathlib import Path

from synthesizer.preprocess import create_embeddings
from utils.argutils import print_args

if __name__ == "__main__":
    # --- INTERFACE COMMANDS ---
    # CLI entry point: derive one speaker embedding per utterance in the
    # synthesizer dataset, using a pre-trained speaker encoder checkpoint.
    parser = argparse.ArgumentParser(
        description="Embedding Distiller: Derives latent speaker vectors for synthesis training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # --- PATH & MODEL PARAMETERS ---
    parser.add_argument("synthesizer_root", type=Path,
                        help="Root directory of the synthesizer training data "
                             "(generated by synthesizer_preprocess_audio.py).")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path,
                        default="saved_models/default/encoder.pt",
                        # NOTE: this literal was split across a line break in the
                        # original file (a SyntaxError); rejoined here.
                        help="Path to the pre-trained neural encoder (.pt).")

    # --- COMPUTE ORCHESTRATION ---
    parser.add_argument("-n", "--n_processes", type=int, default=4,
                        help="Degree of parallelism. Note: High parallelism may "
                             "exhaust GPU VRAM during distillation.")
    args = parser.parse_args()

    # --- EXECUTION ---
    # Echo the resolved arguments, then hand off to the embedding generator.
    print_args(args, parser)
    print("🤝🏻 Scholarly Partnership: Amey Thakur & Mega Satish")
    print("🚀 Distilling acoustic identities into neural embeddings...")

    # Execute the embedding generation engine. vars(args) maps argparse dest
    # names directly onto create_embeddings keyword parameters.
    create_embeddings(**vars(args))