File size: 2,706 Bytes
1d8403e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer_preprocess_embeds.py (Neural Latent Distillation)
# ==================================================================================================
# 
# πŸ“ DESCRIPTION
# This script bridges the gap between speaker identification and speech synthesis.
# It utilizes a pre-trained Speaker Encoder to generate latent speaker embeddings 
# for every utterance in the synthesis dataset. These embeddings provide the 
# high-dimensional "identity signals" that allow the Synthesizer to adapt its 
# output to a specific target voice.
#
# πŸ‘€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🀝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# πŸ”— PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# πŸ“œ LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================

from synthesizer.preprocess import create_embeddings
from utils.argutils import print_args
from pathlib import Path
import argparse

if __name__ == "__main__":
    # Build the command-line interface for the embedding-generation step.
    cli = argparse.ArgumentParser(
        description="Embedding Distiller: Derives latent speaker vectors for synthesis training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required positional argument: where the preprocessed synthesizer data lives.
    cli.add_argument(
        "synthesizer_root",
        type=Path,
        help="Root directory of the synthesizer training data (generated by synthesizer_preprocess_audio.py).",
    )
    # Optional: location of the pre-trained speaker encoder checkpoint.
    cli.add_argument(
        "-e",
        "--encoder_model_fpath",
        type=Path,
        default="saved_models/default/encoder.pt",
        help="Path to the pre-trained neural encoder (.pt).",
    )
    # Optional: worker count for parallel embedding extraction.
    cli.add_argument(
        "-n",
        "--n_processes",
        type=int,
        default=4,
        help="Degree of parallelism. Note: High parallelism may exhaust GPU VRAM during distillation.",
    )

    parsed = cli.parse_args()

    # Echo the resolved configuration, then kick off embedding generation.
    print_args(parsed, cli)
    print("🤝🏻 Scholarly Partnership: Amey Thakur & Mega Satish")
    print("🚀 Distilling acoustic identities into neural embeddings...")

    create_embeddings(**vars(parsed))