Spaces:
Running
Running
File size: 3,919 Bytes
1d8403e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | # ==================================================================================================
# DEEPFAKE AUDIO - vocoder_preprocess.py (Spectrogram Realignment for Waveform Synthesis)
# ==================================================================================================
#
# DESCRIPTION
# This script prepares the Ground-Truth Aligned (GTA) mel-spectrograms required for
# training the Waveform Vocoder (WaveRNN). By passing training audio through the
# synthesizer and capturing the resulting spectrograms, we ensure that the vocoder
# learns to reconstruct audio from the specific spectral artifacts produced by
# the synthesis engine, rather than just perfect ground truth data.
#
# AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
import argparse
import os
from pathlib import Path
# --- SYNTHETIC PIPELINE MODULES ---
from synthesizer.hparams import hparams
from synthesizer.synthesize import run_synthesis
from utils.argutils import print_args
if __name__ == "__main__":
    # --- COMMAND-LINE INTERFACE ---
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
        """Help formatter combining default-value display with a raw (unwrapped) description."""

    cli = argparse.ArgumentParser(
        formatter_class=MyFormatter,
        description="GTA Pre-processor: Generates synthesizer-aligned spectrograms for vocoder training.",
    )

    # --- PATHS ---
    cli.add_argument(
        "datasets_root",
        type=Path,
        help="Root directory of the SV2TTS data structure.",
    )
    cli.add_argument(
        "-s", "--syn_model_fpath",
        type=Path,
        default="saved_models/default/synthesizer.pt",
        help="Path to the trained synthesizer weights used for GTA generation.",
    )
    # SUPPRESS keeps `in_dir` / `out_dir` off the namespace unless the flag is
    # given, which lets the fallback below derive them from `datasets_root`.
    cli.add_argument(
        "-i", "--in_dir",
        type=Path,
        default=argparse.SUPPRESS,
        help="Input directory (synthesizer-level data). Defaults to <datasets_root>/SV2TTS/synthesizer/.",
    )
    cli.add_argument(
        "-o", "--out_dir",
        type=Path,
        default=argparse.SUPPRESS,
        help="Output directory for GTA features. Defaults to <datasets_root>/SV2TTS/vocoder/.",
    )

    # --- COMPUTE OPTIONS ---
    cli.add_argument(
        "--hparams",
        default="",
        help="Acoustic hyperparameter overrides.",
    )
    cli.add_argument(
        "--cpu",
        action="store_true",
        help="Enforce CPU-only processing for GTA synthesis.",
    )

    args = cli.parse_args()

    # --- RUN ---
    # Arguments are echoed before the fallback paths are filled in, so an
    # omitted in_dir / out_dir is not yet present on the namespace here.
    print_args(args, cli)
    print("π€π» Scholarly Partnership: Amey Thakur & Mega Satish")
    print("π Extracting Ground-Truth Aligned mel-spectrograms...")

    synth_hparams = hparams.parse(args.hparams)

    # Fill in any directory the user did not pass, relative to <datasets_root>/SV2TTS.
    sv2tts_root = args.datasets_root / "SV2TTS"
    for attr_name, subdir in (("in_dir", "synthesizer"), ("out_dir", "vocoder")):
        if attr_name not in vars(args):
            setattr(args, attr_name, sv2tts_root / subdir)

    # With --cpu, mask CUDA devices via the environment before synthesis starts.
    if args.cpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # Hand off to the synthesizer to produce the GTA spectrograms.
    run_synthesis(args.in_dir, args.out_dir, args.syn_model_fpath, synth_hparams)
|