Spaces:
Running
Running
File size: 11,405 Bytes
1d8403e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | # ==================================================================================================
# DEEPFAKE AUDIO - synthesizer/preprocess.py (Corpus Orchestration & Normalization)
# ==================================================================================================
#
# π DESCRIPTION
# This module implements the data preprocessing pipeline for the synthesizer.
# It handles the ingestion of raw speech datasets (LibriSpeech, LibriTTS),
# audio-text alignment check, noise reduction via logMMSE, silence-based
# temporal segmentation, and the serialization of Mel-Spectrograms and
# speaker embeddings.
#
# π€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# π€π» CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# π PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# π LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain
from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
no_alignments: bool, datasets_name: str, subfolders: str):
"""
Mass Corpus Processing:
Orchestrates the conversion of a raw dataset into neural-ready formats.
"""
dataset_root = datasets_root.joinpath(datasets_name)
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
assert all(input_dir.exists() for input_dir in input_dirs)
# Physical Layer Initialization
out_dir.joinpath("mels").mkdir(exist_ok=True)
out_dir.joinpath("audio").mkdir(exist_ok=True)
# Telemetry: Tracking the metadata stream
metadata_fpath = out_dir.joinpath("train.txt")
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
# Distributed Execution: Multiprocessing for high-throughput I/O
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, no_alignments=no_alignments)
job = Pool(n_processes).imap(func, speaker_dirs)
for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
metadata_file.close()
# Analytical Validation: Dataset statistics report
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
mel_frames = sum([int(m[4]) for m in metadata])
timesteps = sum([int(m[3]) for m in metadata])
sample_rate = hparams.sample_rate
hours = (timesteps / sample_rate) / 3600
print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
(len(metadata), mel_frames, timesteps, hours))
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
"""Identifies and processes all vocal samples for a specific individual."""
metadata = []
for book_dir in speaker_dir.glob("*"):
if no_alignments:
# Flexible format ingestion: WAV, FLAC, MP3
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpaths = book_dir.glob(extension)
for wav_fpath in wav_fpaths:
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# Linguistic Linkage: Locating transcriptions
text_fpath = wav_fpath.with_suffix(".txt")
if not text_fpath.exists():
text_fpath = wav_fpath.with_suffix(".normalized.txt")
assert text_fpath.exists()
with text_fpath.open("r") as text_file:
text = "".join([line for line in text_file])
text = text.replace("\"", "").strip()
metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
skip_existing, hparams))
else:
# Alignment-guided segmentation (LibriSpeech support)
try:
alignments_fpath = next(book_dir.glob("*.alignment.txt"))
with alignments_fpath.open("r") as alignments_file:
alignments = [line.rstrip().split(" ") for line in alignments_file]
except StopIteration:
continue
for wav_fname, words, end_times in alignments:
wav_fpath = book_dir.joinpath(wav_fname + ".flac")
assert wav_fpath.exists()
words = words.replace("\"", "").split(",")
end_times = list(map(float, end_times.replace("\"", "").split(",")))
# Sub-utterance Fractionation
wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
for i, (wav, text) in enumerate(zip(wavs, texts)):
sub_basename = "%s_%02d" % (wav_fname, i)
metadata.append(process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))
return [m for m in metadata if m is not None]
def split_on_silences(wav_fpath, words, end_times, hparams):
"""
Spatio-Temporal Segmentation:
Fractures long utterances into smaller chunks based on acoustic silences.
Includes noise profiling and reduction via logMMSE.
"""
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
words = np.array(words)
start_times = np.array([0.0] + end_times[:-1])
end_times = np.array(end_times)
# Silence Detection Mask
mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
mask[0] = mask[-1] = True
breaks = np.where(mask)[0]
# Acoustic Enhancement: Noise reduction
silence_times = [[start_times[i], end_times[i]] for i in breaks]
silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
if len(noisy_wav) > hparams.sample_rate * 0.02:
profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile, eta=0)
# Segment Re-aggregation: Ensures mini-utterances meet minimum duration
segments = list(zip(breaks[:-1], breaks[1:]))
segment_durations = [start_times[end] - end_times[start] for start, end in segments]
i = 0
while i < len(segments) and len(segments) > 1:
if segment_durations[i] < hparams.utterance_min_duration:
left_duration = float("inf") if i == 0 else segment_durations[i - 1]
right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
joined_duration = segment_durations[i] + min(left_duration, right_duration)
if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
i += 1
continue
j = i - 1 if left_duration <= right_duration else i
segments[j] = (segments[j][0], segments[j + 1][1])
segment_durations[j] = joined_duration
del segments[j + 1], segment_durations[j + 1]
else:
i += 1
# Materialization: Final split segments
segment_times = [[end_times[start], start_times[end]] for start, end in segments]
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments]
return wavs, texts
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams):
"""
Unit Feature Extraction:
Extracts and serializes features for a single utterance.
Applies VAD-based silence trimming before Mel-Spectrogram generation.
"""
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# VAD-based refinement
if hparams.trim_silence:
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
# Validity filtering
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
return None
# Feature Distillation: Mel-Spectrogram
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_frames = mel_spectrogram.shape[1]
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
return None
# Persistence Layer: Serialization as .npy arrays
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
np.save(wav_fpath, wav, allow_pickle=False)
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
def embed_utterance(fpaths, encoder_model_fpath):
"""Neural Projection: Generates a fixed-dimensional speaker embedding."""
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
wav_fpath, embed_fpath = fpaths
wav = np.load(wav_fpath)
wav = encoder.preprocess_wav(wav)
embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
"""Identity Ingestion: Orchestrates high-speed embedding generation for all samples."""
wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists()
embed_dir = synthesizer_root.joinpath("embeds")
embed_dir.mkdir(exist_ok=True)
with metadata_fpath.open("r") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
# Distributed Embedding Loop
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|