# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer_preprocess_audio.py (Acoustic Feature Alignment)
# ==================================================================================================
#
# 📝 DESCRIPTION
# This script orchestrates the ground-truth audio preprocessing for the Synthesizer.
# It transforms raw speech utterances into time-aligned Mel-Spectrograms, which are
# essential for training the Tacotron 2-based synthesis architecture. This process
# ensures that the synthesizer can learn the relationship between textual tokens
# and acoustic spectral features.
#
# 👤 AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🤝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# 🔗 PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# 📜 LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================

from synthesizer.preprocess import preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse


if __name__ == "__main__":
    # --- INTERFACE COMMANDS ---
    parser = argparse.ArgumentParser(
        description="Synthesizer Audio Pre-processor: Transforms speech data into training-ready spectrograms.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # --- PATH SPECIFICATIONS ---
    parser.add_argument("datasets_root", type=Path,
                        help="Root directory containing the LibriSpeech/TTS datasets.")
    # default=argparse.SUPPRESS keeps "out_dir" off the namespace when the flag is
    # omitted, so the default can be derived from datasets_root after parsing.
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS,
                        help="Destination for the synthesized spectrograms and embeds. "
                             "Defaults to <datasets_root>/SV2TTS/synthesizer/")

    # --- PERFORMANCE & OPTIMIZATION ---
    parser.add_argument("-n", "--n_processes", type=int, default=4,
                        help="Degree of parallelism for the preprocessing pipeline.")
    parser.add_argument("-s", "--skip_existing", action="store_true",
                        help="Bypass utterances that have already been materialized on disk.")
    parser.add_argument("--hparams", type=str, default="",
                        help="Acoustic hyperparameter overrides (comma-separated).")
    parser.add_argument("--no_alignments", action="store_true",
                        help="Fallback mode for datasets lacking temporal alignment metadata.")

    # --- DATASET MANAGEMENT ---
    parser.add_argument("--datasets_name", type=str, default="LibriSpeech",
                        help="Target dataset identifier to prioritize.")
    parser.add_argument("--subfolders", type=str, default="train-clean-100,train-clean-360",
                        help="Specific sub-corpora to target within the dataset root.")

    args = parser.parse_args()

    # --- ARCHITECTURAL ORCHESTRATION ---
    # The attribute is absent (not None) when -o/--out_dir was not supplied.
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    # Verify workspace integrity. An explicit check is used instead of `assert`
    # because assertions are stripped when Python runs with -O; parser.error()
    # prints the usage line plus this message and exits with status 2.
    if not args.datasets_root.exists():
        parser.error("Fatal: datasets_root not found. 🤝🏻 Verify pathing.")
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # --- EXECUTION ---
    print_args(args, parser)
    print("🤝🏻 Scholarly Partnership: Amey Thakur & Mega Satish")
    print("🚀 Initiating Mel-Spectrogram Extraction Pipeline...")

    # Replace the raw override string with the result of hparams.parse() before
    # forwarding it to the preprocessing routine.
    args.hparams = hparams.parse(args.hparams)
    # Every parsed argument is forwarded as a keyword argument, so the argument
    # names above presumably mirror preprocess_dataset's parameters — verify
    # against its signature before renaming or adding any option.
    preprocess_dataset(**vars(args))