# ==================================================================================================
# DEEPFAKE AUDIO - encoder/preprocess.py (Acoustic Feature Extraction Engine)
# ==================================================================================================
#
# DESCRIPTION
# This module implements the internal orchestration for dataset preprocessing.
# It provides highly-parallelized functions to traverse raw speech corpora
# (LibriSpeech, VoxCeleb), extract speaker-specific metadata, and materialize
# normalized Mel-Spectrograms onto the disk for high-throughput training.
#
# AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from datetime import datetime
from functools import partial
from multiprocessing import Pool
from pathlib import Path
import numpy as np
from tqdm import tqdm
# --- INTERNAL SIGNAL UTILITIES ---
from encoder import audio
from encoder.config import librispeech_datasets, anglophone_nationalites
from encoder.params_data import *
# File extensions (without the leading dot) that are globbed for, recursively,
# under each speaker directory when collecting utterances to preprocess.
_AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")
class DatasetLog:
    """
    Experimental Ledger: Records the metadata and parameter state of a
    preprocessing run for reproducibility.

    A text file named ``Log_<name>.txt`` (with "/" in the name replaced by
    "_") is created under ``root`` on construction and stays open until
    ``finalize`` is called.
    """

    def __init__(self, root, name):
        """
        Open the log file and write the run header and parameter dump.

        :param root: directory in which the log file is created
        :param name: dataset name; used in the file name and the header line
        """
        log_fpath = Path(root, "Log_%s.txt" % name.replace("/", "_"))
        # The log lines contain non-ASCII characters; an explicit UTF-8
        # encoding avoids UnicodeEncodeError on platforms whose default
        # text encoding cannot represent them.
        self.text_file = open(log_fpath, "w", encoding="utf-8")
        self.sample_data = dict()
        try:
            start_time = datetime.now().strftime("%A %d %B %Y at %H:%M")
            self.write_line("π Initiating Preprocessing Cycle for: %s on %s" % (name, start_time))
            self.write_line("-----")
            self._log_params()
        except BaseException:
            # Do not leak the file handle if the header cannot be written.
            self.text_file.close()
            raise

    def _log_params(self):
        """Archives the effective hyperparameters in the log."""
        from encoder import params_data
        self.write_line("Acoustic Hyperparameters:")
        # Every attribute of the module that is not a dunder is logged.
        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
            value = getattr(params_data, param_name)
            self.write_line("\t%s: %s" % (param_name, value))
        self.write_line("-----")

    def write_line(self, line):
        """Write ``line`` to the log file followed by a newline."""
        self.text_file.write("%s\n" % line)

    def add_sample(self, **kwargs):
        """Accumulates statistical distributions of processed samples.

        Each keyword names a metric; its value is appended to that metric's
        list of observations.
        """
        for param_name, value in kwargs.items():
            self.sample_data.setdefault(param_name, []).append(value)

    def finalize(self):
        """Computes and writes dataset-wide statistics before closing."""
        try:
            self.write_line("Neural Statistical Profile:")
            for param_name, values in self.sample_data.items():
                self.write_line("\t%s:" % param_name)
                self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
                self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
            self.write_line("-----")
            end_time = datetime.now().strftime("%A %d %B %Y at %H:%M")
            self.write_line("π€π» Cycle Completed on %s" % end_time)
        finally:
            # Always release the handle, even if a statistic cannot be computed.
            self.text_file.close()
def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
    """
    Locate a corpus directory and open the log for its preprocessing run.

    Returns ``(dataset_root, DatasetLog)`` when ``datasets_root/dataset_name``
    exists. Otherwise prints a notice and returns ``(None, None)`` so the
    caller can skip the corpus.
    """
    corpus_dir = datasets_root / dataset_name
    if corpus_dir.exists():
        return corpus_dir, DatasetLog(out_dir, dataset_name)

    # Missing corpus: report it and signal "skip" to the caller.
    print("β οΈ Scholarly Alert: Dataset %s not found. Skipping." % corpus_dir)
    return None, None
def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
    """
    Utterance Orchestration: Processes all vocal samples for a single speaker.
    Extracts Mel-Spectrograms and archives source mappings.

    :param speaker_dir: directory searched recursively for audio files
    :param datasets_root: root that ``speaker_dir`` lives under; the relative
        path parts are joined with "_" to name the speaker's output directory
    :param out_dir: directory in which the speaker's output directory is created
    :param skip_existing: when True, utterances already listed in the speaker's
        ``_sources.txt`` are skipped and new entries are appended; when False,
        the index is rewritten from scratch
    :return: one ``len(wav) / sampling_rate`` value per utterance written
        (presumably the duration in seconds, confirm against params_data)
    """
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # Resilience: Load existing sources index. It is only consulted when
    # skipping, so it is only read in that case.
    existing_fnames = set()
    if skip_existing and sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except (OSError, UnicodeDecodeError):
            # Best effort: an unreadable index just means nothing is skipped.
            existing_fnames = set()

    # Process utterances recursively
    audio_durs = []
    with sources_fpath.open("a" if skip_existing else "w") as sources_file:
        for extension in _AUDIO_EXTENSIONS:
            for in_fpath in speaker_dir.glob("**/*.%s" % extension):
                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
                # Swap only the trailing extension. A plain str.replace would
                # also rewrite ".<ext>" occurring earlier in the flattened
                # name and would miss a differently-cased suffix, making the
                # index entry disagree with the file np.save creates.
                out_fname = out_fname[:-len(extension)] + "npy"
                if skip_existing and out_fname in existing_fnames:
                    continue

                # Signal Normalization
                wav = audio.preprocess_wav(in_fpath)
                if len(wav) == 0:
                    continue

                # Spectral Analysis
                frames = audio.wav_to_mel_spectrogram(wav)
                if len(frames) < partials_n_frames:
                    # Too few frames to be kept; nothing is written.
                    continue

                # Materialization
                out_fpath = speaker_out_dir.joinpath(out_fname)
                np.save(out_fpath, frames)
                sources_file.write("%s,%s\n" % (out_fname, in_fpath))
                audio_durs.append(len(wav) / sampling_rate)

    return audio_durs
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger,
                             n_processes=4):
    """
    Degree of parallelism management for speaker-level processing.

    :param speaker_dirs: list of speaker directories, one task each
    :param dataset_name: label shown in the console message and progress bar
    :param datasets_root: forwarded to ``_preprocess_speaker``
    :param out_dir: forwarded to ``_preprocess_speaker``
    :param skip_existing: forwarded to ``_preprocess_speaker``
    :param logger: receives every per-utterance value via
        ``add_sample(duration=...)`` and is finalized once all speakers are done
    :param n_processes: number of worker processes in the pool (default 4,
        the previously hard-coded value)
    """
    print("π€π» Processing %d speakers from: %s" % (len(speaker_dirs), dataset_name))

    work_fn = partial(_preprocess_speaker, datasets_root=datasets_root, out_dir=out_dir, skip_existing=skip_existing)
    with Pool(n_processes) as pool:
        # imap yields results in input order as they become available, which
        # lets the progress bar advance speaker by speaker.
        tasks = pool.imap(work_fn, speaker_dirs)
        for sample_durs in tqdm(tasks, dataset_name, len(speaker_dirs), unit="speakers"):
            for sample_dur in sample_durs:
                logger.add_sample(duration=sample_dur)

    logger.finalize()
def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocess every LibriSpeech subset listed under
    ``librispeech_datasets["train"]["other"]``.

    Subsets whose directory is missing under ``datasets_root`` are skipped.
    Each direct child of a subset directory is treated as one speaker.
    """
    for subset_name in librispeech_datasets["train"]["other"]:
        subset_root, subset_log = _init_preprocess_dataset(subset_name, datasets_root, out_dir)
        if subset_root:
            _preprocess_speaker_dirs(
                [*subset_root.glob("*")],
                subset_name,
                datasets_root,
                out_dir,
                skip_existing,
                subset_log,
            )
def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Pipeline for VoxCeleb1: Celebrity voices with multi-national diversity.

    Reads the tab-separated ``vox1_meta.csv`` in the dataset root, keeps the
    speakers whose nationality (4th column, lower-cased) is listed in
    ``anglophone_nationalites``, and preprocesses the matching directories
    under ``wav/``. Returns without doing anything if the dataset directory
    is missing.
    """
    dataset_name = "VoxCeleb1"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root:
        return

    # Meta-Guided Filtering for Anglophone Speakers
    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
        # The first line is the header and carries no speaker.
        metadata = [line.split("\t") for line in metafile][1:]
    # Rows with fewer than four fields (e.g. a trailing blank line) are ignored
    # instead of raising IndexError.
    nationalities = {row[0]: row[3] for row in metadata if len(row) > 3}
    # A set keeps the directory filter below linear in the number of
    # directories. The field is stripped so a line-ending newline cannot
    # prevent a match when nationality happens to be the last column.
    keep_speaker_ids = {
        s_id for s_id, nat in nationalities.items()
        if nat.strip().lower() in anglophone_nationalites
    }

    speaker_dirs = [d for d in dataset_root.joinpath("wav").glob("*") if d.name in keep_speaker_ids]
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocess the VoxCeleb2 corpus.

    Each direct child of ``<dataset_root>/dev/aac`` is treated as one speaker.
    Nothing is done when the dataset directory is missing.
    """
    corpus = "VoxCeleb2"
    corpus_root, run_log = _init_preprocess_dataset(corpus, datasets_root, out_dir)
    if corpus_root:
        aac_root = corpus_root.joinpath("dev", "aac")
        _preprocess_speaker_dirs(
            list(aac_root.glob("*")),
            corpus,
            datasets_root,
            out_dir,
            skip_existing,
            run_log,
        )
|