# Deepfake-Audio / Source Code / synthesizer_preprocess_embeds.py
# (Repository page metadata — retained as a comment so the file stays valid Python.)
# ameythakur's picture
# Deepfake-Audio — 1d8403e verified
# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer_preprocess_embeds.py (Neural Latent Distillation)
# ==================================================================================================
#
# πŸ“ DESCRIPTION
# This script bridges the gap between Speaker Identification and Speech Synthesis.
# It utilizes a pre-trained Speaker Encoder to generate latent speaker embeddings
# for every utterance in the synthesis dataset. These embeddings provide the
# high-dimensional "identity signals" that allow the Synthesizer to adapt its
# output to a specific target voice.
#
# πŸ‘€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🀝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# πŸ”— PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# πŸ“œ LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from synthesizer.preprocess import create_embeddings
from utils.argutils import print_args
from pathlib import Path
import argparse
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Command-line interface: collect the paths and parallelism settings
    # needed to distill speaker embeddings for the synthesizer dataset.
    # ------------------------------------------------------------------
    arg_parser = argparse.ArgumentParser(
        description="Embedding Distiller: Derives latent speaker vectors for synthesis training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Where the synthesizer training data lives (positional, required).
    arg_parser.add_argument(
        "synthesizer_root",
        type=Path,
        help="Root directory of the synthesizer training data (generated by synthesizer_preprocess_audio.py).",
    )
    # Which pre-trained speaker encoder checkpoint to load.
    arg_parser.add_argument(
        "-e",
        "--encoder_model_fpath",
        type=Path,
        default="saved_models/default/encoder.pt",
        help="Path to the pre-trained neural encoder (.pt).",
    )
    # How many worker processes to run concurrently.
    arg_parser.add_argument(
        "-n",
        "--n_processes",
        type=int,
        default=4,
        help="Degree of parallelism. Note: High parallelism may exhaust GPU VRAM during distillation.",
    )

    parsed = arg_parser.parse_args()

    # Echo the resolved configuration, then hand off to the embedding engine.
    print_args(parsed, arg_parser)
    print("🤝🏻 Scholarly Partnership: Amey Thakur & Mega Satish")
    print("🚀 Distilling acoustic identities into neural embeddings...")
    create_embeddings(**vars(parsed))