# Deepfake-Audio / Source Code / synthesizer_preprocess_audio.py
# Uploaded by ameythakur (Deepfake-Audio) - 1d8403e, verified
# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer_preprocess_audio.py (Acoustic Feature Alignment)
# ==================================================================================================
#
# 📝 DESCRIPTION
# This script orchestrates the ground-truth audio preprocessing for the Synthesizer.
# It transforms raw speech utterances into time-aligned Mel-Spectrograms, which are
# essential for training the Tacotron 2-based synthesis architecture. This process
# ensures that the synthesizer can learn the relationship between textual tokens
# and acoustic spectral features.
#
# 👤 AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🀝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# 🔗 PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# 📜 LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from synthesizer.preprocess import preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse
def parse_arguments(argv=None):
    """Parse and validate the command-line arguments of the synthesizer audio pre-processor.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets argparse
            read the process command line.

    Returns:
        A ``(args, parser)`` tuple: the populated namespace (``out_dir`` is always set,
        falling back to ``<datasets_root>/SV2TTS/synthesizer``) and the parser that
        produced it. Nothing is created on disk by this function.

    Raises:
        SystemExit: raised by argparse (exit status 2) on malformed arguments or when
            ``datasets_root`` does not exist.
    """
    # --- INTERFACE COMMANDS ---
    parser = argparse.ArgumentParser(
        description="Synthesizer Audio Pre-processor: Transforms speech data into training-ready spectrograms.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # --- PATH SPECIFICATIONS ---
    parser.add_argument("datasets_root", type=Path,
                        help="Root directory containing the LibriSpeech/TTS datasets.")
    # SUPPRESS keeps the attribute off the namespace when -o is omitted, so the default
    # can be derived from datasets_root after parsing.
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS,
                        help="Destination for the synthesized spectrograms and embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")

    # --- PERFORMANCE & OPTIMIZATION ---
    parser.add_argument("-n", "--n_processes", type=int, default=4,
                        help="Degree of parallelism for the preprocessing pipeline.")
    parser.add_argument("-s", "--skip_existing", action="store_true",
                        help="Bypass utterances that have already been materialized on disk.")
    parser.add_argument("--hparams", type=str, default="",
                        help="Acoustic hyperparameter overrides (comma-separated).")
    parser.add_argument("--no_alignments", action="store_true",
                        help="Fallback mode for datasets lacking temporal alignment metadata.")

    # --- DATASET MANAGEMENT ---
    parser.add_argument("--datasets_name", type=str, default="LibriSpeech",
                        help="Target dataset identifier to prioritize.")
    parser.add_argument("--subfolders", type=str, default="train-clean-100,train-clean-360",
                        help="Specific sub-corpora to target within the dataset root.")

    args = parser.parse_args(argv)

    # --- ARCHITECTURAL ORCHESTRATION ---
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    # Verify workspace integrity. An explicit parser error is used instead of `assert`
    # so the check still runs under `python -O` and the user sees the offending path.
    if not args.datasets_root.exists():
        parser.error(f"datasets_root not found: {args.datasets_root}")

    return args, parser


if __name__ == "__main__":
    args, parser = parse_arguments()

    # Create the output directory only after the dataset root has been validated.
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # --- EXECUTION ---
    print_args(args, parser)
    print("🤝🏻 Scholarly Partnership: Amey Thakur & Mega Satish")
    print("🚀 Initiating Mel-Spectrogram Extraction Pipeline...")

    # Replace the raw override string with the result of hparams.parse, then forward
    # every namespace entry to preprocess_dataset as a keyword argument.
    args.hparams = hparams.parse(args.hparams)
    preprocess_dataset(**vars(args))