Spaces:
Running
Running
| # ================================================================================================== | |
| # DEEPFAKE AUDIO - synthesizer_preprocess_audio.py (Acoustic Feature Alignment) | |
| # ================================================================================================== | |
| # | |
| # DESCRIPTION | |
| # This script orchestrates the ground-truth audio preprocessing for the Synthesizer. | |
| # It transforms raw speech utterances into time-aligned Mel-Spectrograms, which are | |
| # essential for training the Tacotron 2-based synthesis architecture. This process | |
| # ensures that the synthesizer can learn the relationship between textual tokens | |
| # and acoustic spectral features. | |
| # | |
| # AUTHORS | |
| # - Amey Thakur (https://github.com/Amey-Thakur) | |
| # - Mega Satish (https://github.com/msatmod) | |
| # | |
| # CREDITS | |
| # Original Real-Time Voice Cloning methodology by CorentinJ | |
| # Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning | |
| # | |
| # PROJECT LINKS | |
| # Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO | |
| # Video Demo: https://youtu.be/i3wnBcbHDbs | |
| # Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb | |
| # | |
| # LICENSE | |
| # Released under the MIT License | |
| # Release Date: 2021-02-06 | |
| # ================================================================================================== | |
| from synthesizer.preprocess import preprocess_dataset | |
| from synthesizer.hparams import hparams | |
| from utils.argutils import print_args | |
| from pathlib import Path | |
| import argparse | |
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the synthesizer audio preprocessing script.

    Returns:
        An ``ArgumentParser`` with one positional argument (``datasets_root``)
        and the optional flags declared below. ``--out_dir`` uses
        ``argparse.SUPPRESS`` as its default, so the attribute is absent from
        the parsed namespace unless the user passes it explicitly.
    """
    parser = argparse.ArgumentParser(
        description="Synthesizer Audio Pre-processor: Transforms speech data into training-ready spectrograms.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # --- PATH SPECIFICATIONS ---
    parser.add_argument("datasets_root", type=Path,
                        help="Root directory containing the LibriSpeech/TTS datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS,
                        help="Destination for the synthesized spectrograms and embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")

    # --- PERFORMANCE & OPTIMIZATION ---
    parser.add_argument("-n", "--n_processes", type=int, default=4,
                        help="Degree of parallelism for the preprocessing pipeline.")
    parser.add_argument("-s", "--skip_existing", action="store_true",
                        help="Bypass utterances that have already been materialized on disk.")
    parser.add_argument("--hparams", type=str, default="",
                        help="Acoustic hyperparameter overrides (comma-separated).")
    parser.add_argument("--no_alignments", action="store_true",
                        help="Fallback mode for datasets lacking temporal alignment metadata.")

    # --- DATASET MANAGEMENT ---
    parser.add_argument("--datasets_name", type=str, default="LibriSpeech",
                        help="Target dataset identifier to prioritize.")
    parser.add_argument("--subfolders", type=str, default="train-clean-100,train-clean-360",
                        help="Specific sub-corpora to target within the dataset root.")
    return parser


def prepare_workspace(args: argparse.Namespace) -> None:
    """Validate the dataset root, resolve the output directory and create it.

    Args:
        args: Parsed namespace from ``build_parser()``. ``args.out_dir`` is
            filled in with ``<datasets_root>/SV2TTS/synthesizer`` when the
            user did not supply ``--out_dir``.

    Raises:
        FileNotFoundError: If ``args.datasets_root`` does not exist.
    """
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would let the mkdir below silently create the output
    # tree beneath a dataset root that does not exist.
    if not args.datasets_root.exists():
        raise FileNotFoundError("Fatal: datasets_root not found. π€π» Verify pathing.")

    # `--out_dir` defaults to argparse.SUPPRESS, so the attribute is simply
    # missing when the flag was not given.
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    args.out_dir.mkdir(exist_ok=True, parents=True)


def main(argv=None) -> None:
    """Parse the command line and launch the dataset preprocessing.

    Args:
        argv: Optional list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``, which is what the script entry point uses.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # --- ARCHITECTURAL ORCHESTRATION ---
    prepare_workspace(args)

    # --- EXECUTION ---
    print_args(args, parser)
    print("π€π» Scholarly Partnership: Amey Thakur & Mega Satish")
    print("π Initiating Mel-Spectrogram Extraction Pipeline...")

    # Replace the raw override string with whatever `hparams.parse` returns,
    # then forward every parsed option to the preprocessing routine as a
    # keyword argument.
    args.hparams = hparams.parse(args.hparams)
    preprocess_dataset(**vars(args))


if __name__ == "__main__":
    main()