ameythakur's picture
Deepfake-Audio
1d8403e verified
# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer/preprocess.py (Corpus Orchestration & Normalization)
# ==================================================================================================
#
# πŸ“ DESCRIPTION
# This module implements the data preprocessing pipeline for the synthesizer.
# It handles the ingestion of raw speech datasets (LibriSpeech, LibriTTS),
# audio-text alignment check, noise reduction via logMMSE, silence-based
# temporal segmentation, and the serialization of Mel-Spectrograms and
# speaker embeddings.
#
# πŸ‘€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🀝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# πŸ”— PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# πŸ“œ LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain
from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
no_alignments: bool, datasets_name: str, subfolders: str):
"""
Mass Corpus Processing:
Orchestrates the conversion of a raw dataset into neural-ready formats.
"""
dataset_root = datasets_root.joinpath(datasets_name)
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
assert all(input_dir.exists() for input_dir in input_dirs)
# Physical Layer Initialization
out_dir.joinpath("mels").mkdir(exist_ok=True)
out_dir.joinpath("audio").mkdir(exist_ok=True)
# Telemetry: Tracking the metadata stream
metadata_fpath = out_dir.joinpath("train.txt")
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
# Distributed Execution: Multiprocessing for high-throughput I/O
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, no_alignments=no_alignments)
job = Pool(n_processes).imap(func, speaker_dirs)
for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
metadata_file.close()
# Analytical Validation: Dataset statistics report
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
mel_frames = sum([int(m[4]) for m in metadata])
timesteps = sum([int(m[3]) for m in metadata])
sample_rate = hparams.sample_rate
hours = (timesteps / sample_rate) / 3600
print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
(len(metadata), mel_frames, timesteps, hours))
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
"""Identifies and processes all vocal samples for a specific individual."""
metadata = []
for book_dir in speaker_dir.glob("*"):
if no_alignments:
# Flexible format ingestion: WAV, FLAC, MP3
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpaths = book_dir.glob(extension)
for wav_fpath in wav_fpaths:
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# Linguistic Linkage: Locating transcriptions
text_fpath = wav_fpath.with_suffix(".txt")
if not text_fpath.exists():
text_fpath = wav_fpath.with_suffix(".normalized.txt")
assert text_fpath.exists()
with text_fpath.open("r") as text_file:
text = "".join([line for line in text_file])
text = text.replace("\"", "").strip()
metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
skip_existing, hparams))
else:
# Alignment-guided segmentation (LibriSpeech support)
try:
alignments_fpath = next(book_dir.glob("*.alignment.txt"))
with alignments_fpath.open("r") as alignments_file:
alignments = [line.rstrip().split(" ") for line in alignments_file]
except StopIteration:
continue
for wav_fname, words, end_times in alignments:
wav_fpath = book_dir.joinpath(wav_fname + ".flac")
assert wav_fpath.exists()
words = words.replace("\"", "").split(",")
end_times = list(map(float, end_times.replace("\"", "").split(",")))
# Sub-utterance Fractionation
wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
for i, (wav, text) in enumerate(zip(wavs, texts)):
sub_basename = "%s_%02d" % (wav_fname, i)
metadata.append(process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))
return [m for m in metadata if m is not None]
def split_on_silences(wav_fpath, words, end_times, hparams):
"""
Spatio-Temporal Segmentation:
Fractures long utterances into smaller chunks based on acoustic silences.
Includes noise profiling and reduction via logMMSE.
"""
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
words = np.array(words)
start_times = np.array([0.0] + end_times[:-1])
end_times = np.array(end_times)
# Silence Detection Mask
mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
mask[0] = mask[-1] = True
breaks = np.where(mask)[0]
# Acoustic Enhancement: Noise reduction
silence_times = [[start_times[i], end_times[i]] for i in breaks]
silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
if len(noisy_wav) > hparams.sample_rate * 0.02:
profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile, eta=0)
# Segment Re-aggregation: Ensures mini-utterances meet minimum duration
segments = list(zip(breaks[:-1], breaks[1:]))
segment_durations = [start_times[end] - end_times[start] for start, end in segments]
i = 0
while i < len(segments) and len(segments) > 1:
if segment_durations[i] < hparams.utterance_min_duration:
left_duration = float("inf") if i == 0 else segment_durations[i - 1]
right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
joined_duration = segment_durations[i] + min(left_duration, right_duration)
if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
i += 1
continue
j = i - 1 if left_duration <= right_duration else i
segments[j] = (segments[j][0], segments[j + 1][1])
segment_durations[j] = joined_duration
del segments[j + 1], segment_durations[j + 1]
else:
i += 1
# Materialization: Final split segments
segment_times = [[end_times[start], start_times[end]] for start, end in segments]
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments]
return wavs, texts
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams):
"""
Unit Feature Extraction:
Extracts and serializes features for a single utterance.
Applies VAD-based silence trimming before Mel-Spectrogram generation.
"""
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# VAD-based refinement
if hparams.trim_silence:
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
# Validity filtering
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
return None
# Feature Distillation: Mel-Spectrogram
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_frames = mel_spectrogram.shape[1]
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
return None
# Persistence Layer: Serialization as .npy arrays
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
np.save(wav_fpath, wav, allow_pickle=False)
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
def embed_utterance(fpaths, encoder_model_fpath):
"""Neural Projection: Generates a fixed-dimensional speaker embedding."""
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
wav_fpath, embed_fpath = fpaths
wav = np.load(wav_fpath)
wav = encoder.preprocess_wav(wav)
embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
"""Identity Ingestion: Orchestrates high-speed embedding generation for all samples."""
wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists()
embed_dir = synthesizer_root.joinpath("embeds")
embed_dir.mkdir(exist_ok=True)
with metadata_fpath.open("r") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
# Distributed Embedding Loop
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))