# ==================================================================================================
# DEEPFAKE AUDIO - encoder_train.py (Neural Identity Orchestration)
# ==================================================================================================
#
# DESCRIPTION
# This script manages the training lifecycle of the Speaker Encoder. It optimizes a
# d-vector based neural network to minimize the GE2E (Generalized End-to-End) loss.
# The goal is to maximize the similarity between embeddings of the same speaker
# while minimizing similarity between different speakers, enabling high-fidelity
# zero-shot voice cloning.
#
# AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from utils.argutils import print_args
from encoder.train import train
from pathlib import Path
import argparse
if __name__ == "__main__":
    # --- COMMAND-LINE INTERFACE ---
    # Declares every knob of a Speaker Encoder training run, then hands the
    # parsed namespace straight to encoder.train.train().
    cli = argparse.ArgumentParser(
        description="Encoder Training Hub: Optimizing identity embeddings from preprocessed data.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Positional arguments: experiment identity and preprocessed-data location.
    cli.add_argument("run_id", type=str,
                     help="Identifier for this training experiment. Models and logs will be organized under this ID.")
    cli.add_argument("clean_data_root", type=Path,
                     help="Root path to the mel-spectrograms generated by encoder_preprocess.py.")

    # Where serialized artifacts land on disk.
    cli.add_argument("-m", "--models_dir", type=Path, default="saved_models",
                     help="Parent directory for serialized weights, backups, and diagnostic plots.")

    # Periodic-action intervals, declared as (short, long, default, help)
    # tuples and registered in one pass — all share type=int.
    for short_opt, long_opt, every, caption in (
            ("-v", "--vis_every", 10,
             "Iteration frequency for updating training curves and loss metrics."),
            ("-u", "--umap_every", 100,
             "Frequency of UMAP projections to visualize speaker cluster separation."),
            ("-s", "--save_every", 500,
             "Step interval for materializing model weights (.pt) on disk."),
            ("-b", "--backup_every", 7500,
             "Interval for creating immutable rolling backups of the model state.")):
        cli.add_argument(short_opt, long_opt, type=int, default=every, help=caption)

    # Boolean switch: discard any existing checkpoint for this run_id.
    cli.add_argument("-f", "--force_restart", action="store_true",
                     help="Bypass existing checkpoints and initialize weights from distribution (restart from scratch).")

    # --- VISUALIZATION SERVER ---
    cli.add_argument("--visdom_server", type=str, default="http://localhost",
                     help="Remote address of the Visdom dashboard server.")
    cli.add_argument("--no_visdom", action="store_true",
                     help="Inhibit rich visual telemetry (not recommended for production monitoring).")

    parsed = cli.parse_args()

    # --- EXECUTION ---
    print_args(parsed, cli)
    print("π€π» Scholarly Partnership: Amey Thakur & Mega Satish")
    print("π Initiating Neural Training Pipeline - Monitoring d-vector clusters...")
    # Delegate to the internal training engine; every CLI flag maps 1:1 onto
    # a keyword parameter of encoder.train.train().
    train(**vars(parsed))