# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer_preprocess_embeds.py (Neural Latent Distillation)
# ==================================================================================================
#
# π DESCRIPTION
# This script bridges the gap between Speaker Identification and Speech Synthesis.
# It utilizes a pre-trained Speaker Encoder to generate latent speaker embeddings
# for every utterance in the synthesis dataset. These embeddings provide the
# high-dimensional "identity signals" that allow the Synthesizer to adapt its
# output to a specific target voice.
#
# π€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# π€π» CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# π PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# π LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from synthesizer.preprocess import create_embeddings
from utils.argutils import print_args
from pathlib import Path
import argparse
if __name__ == "__main__":
    # Command-line front end: collect the dataset root, encoder checkpoint
    # path, and parallelism degree, then hand everything to create_embeddings().
    # --- INTERFACE COMMANDS ---
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Embedding Distiller: Derives latent speaker vectors for synthesis training.",
    )

    # --- PATH & MODEL PARAMETERS ---
    # Positional: where synthesizer_preprocess_audio.py wrote its output.
    cli.add_argument(
        "synthesizer_root",
        type=Path,
        help="Root directory of the synthesizer training data (generated by synthesizer_preprocess_audio.py).",
    )
    # Optional: checkpoint of the speaker encoder used to derive embeddings.
    cli.add_argument(
        "-e",
        "--encoder_model_fpath",
        type=Path,
        default="saved_models/default/encoder.pt",
        help="Path to the pre-trained neural encoder (.pt).",
    )

    # --- COMPUTE ORCHESTRATION ---
    cli.add_argument(
        "-n",
        "--n_processes",
        type=int,
        default=4,
        help="Degree of parallelism. Note: High parallelism may exhaust GPU VRAM during distillation.",
    )

    parsed = cli.parse_args()

    # --- EXECUTION ---
    # Echo the effective configuration, then run the embedding generator.
    print_args(parsed, cli)
    print("π€π» Scholarly Partnership: Amey Thakur & Mega Satish")
    print("π Distilling acoustic identities into neural embeddings...")
    create_embeddings(**vars(parsed))
|