# ==================================================================================================
# DEEPFAKE AUDIO - encoder_train.py (Neural Identity Orchestration)
# ==================================================================================================
#
# 📝 DESCRIPTION
#    This script manages the training lifecycle of the Speaker Encoder. It optimizes a
#    d-vector based neural network to minimize the GE2E (Generalized End-to-End) loss.
#    The goal is to maximize the similarity between embeddings of the same speaker
#    while minimizing similarity between different speakers, enabling high-fidelity
#    zero-shot voice cloning.
#
# 👤 AUTHORS
#    - Amey Thakur (https://github.com/Amey-Thakur)
#    - Mega Satish (https://github.com/msatmod)
#
# 🤝🏻 CREDITS
#    Original Real-Time Voice Cloning methodology by CorentinJ
#    Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# 🔗 PROJECT LINKS
#    Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
#    Video Demo: https://youtu.be/i3wnBcbHDbs
#    Research:   https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# 📜 LICENSE
#    Released under the MIT License
#    Release Date: 2021-02-06
# ==================================================================================================

import argparse
from pathlib import Path

from encoder.train import train
from utils.argutils import print_args

if __name__ == "__main__":
    # --- INTERFACE COMMANDS ---
    # ArgumentDefaultsHelpFormatter makes `--help` show each option's default value.
    parser = argparse.ArgumentParser(
        description="Encoder Training Hub: Optimizing identity embeddings from preprocessed data.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # --- SESSION DEFINITION ---
    # NOTE: the original file had this help string broken across a raw newline
    # (a syntax error); it is rejoined into a single valid literal here.
    parser.add_argument("run_id", type=str, help=
        "Identifier for this training experiment. Models and logs will be organized under this ID.")
    parser.add_argument("clean_data_root", type=Path, help=
        "Root path to the mel-spectrograms generated by encoder_preprocess.py.")

    # --- STORAGE & TELEMETRY ---
    # argparse applies `type=Path` to the string default as well, so models_dir is a Path.
    parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", help=
        "Parent directory for serialized weights, backups, and diagnostic plots.")
    parser.add_argument("-v", "--vis_every", type=int, default=10, help=
        "Iteration frequency for updating training curves and loss metrics.")
    parser.add_argument("-u", "--umap_every", type=int, default=100, help=
        "Frequency of UMAP projections to visualize speaker cluster separation.")
    parser.add_argument("-s", "--save_every", type=int, default=500, help=
        "Step interval for materializing model weights (.pt) on disk.")
    parser.add_argument("-b", "--backup_every", type=int, default=7500, help=
        "Interval for creating immutable rolling backups of the model state.")
    parser.add_argument("-f", "--force_restart", action="store_true", help=
        "Bypass existing checkpoints and initialize weights from distribution (restart from scratch).")

    # --- VISUALIZATION SERVER ---
    parser.add_argument("--visdom_server", type=str, default="http://localhost", help=
        "Remote address of the Visdom dashboard server.")
    parser.add_argument("--no_visdom", action="store_true", help=
        "Inhibit rich visual telemetry (not recommended for production monitoring).")
    args = parser.parse_args()

    # --- EXECUTION ---
    # Echo the resolved configuration so runs are reproducible from logs.
    print_args(args, parser)
    print("🤝🏻 Scholarly Partnership: Amey Thakur & Mega Satish")
    print("🚀 Initiating Neural Training Pipeline - Monitoring d-vector clusters...")

    # Delegate to the internal training engine; the argparse attribute names
    # are expected to match `train`'s keyword parameters exactly.
    train(**vars(args))