Spaces:

ameythakur
/

Deepfake-Audio

Running

App Files Files Community

Deepfake-Audio / Source Code /vocoder /vocoder_dataset.py

ameythakur

Deepfake-Audio

1d8403e verified about 2 months ago

raw

history blame contribute delete

4.8 kB

	# ==================================================================================================
	# DEEPFAKE AUDIO - vocoder/vocoder_dataset.py (Neural Audio Loader)
	# ==================================================================================================
	#
	# 📝 DESCRIPTION
	# This module implements the data loading pipeline for vocoder training.
	# It handles the pairing of Mel-Spectrograms with their corresponding raw
	# audio waveforms, performing real-time quantization (Mu-Law or linear)
	# and windowed sampling for WaveRNN optimization.
	#
	# 👤 AUTHORS
	# - Amey Thakur (https://github.com/Amey-Thakur)
	# - Mega Satish (https://github.com/msatmod)
	#
	# 🤝🏻 CREDITS
	# Original Real-Time Voice Cloning methodology by CorentinJ
	# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
	#
	# 🔗 PROJECT LINKS
	# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
	# Video Demo: https://youtu.be/i3wnBcbHDbs
	# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
	#
	# 📜 LICENSE
	# Released under the MIT License
	# Release Date: 2021-02-06
	# ==================================================================================================

	from torch.utils.data import Dataset
	from pathlib import Path
	from vocoder import audio
	import vocoder.hparams as hp
	import numpy as np
	import torch

	class VocoderDataset(Dataset):
	"""
	Asset Orchestrator:
	Encapsulates the logic for loading and preprocessing synthesized speech data.
	"""
	def __init__(self, metadata_fpath: Path, mel_dir: Path, wav_dir: Path):
	print("Dataset Ingestion: Syncing with metadata and signal directories...")

	with metadata_fpath.open("r") as metadata_file:
	metadata = [line.split("\|") for line in metadata_file]

	# Binary Filtering: Only ingest samples marked for training
	gta_fnames = [x[1] for x in metadata if int(x[4])]
	gta_fpaths = [mel_dir.joinpath(fname) for fname in gta_fnames]
	wav_fnames = [x[0] for x in metadata if int(x[4])]
	wav_fpaths = [wav_dir.joinpath(fname) for fname in wav_fnames]
	self.samples_fpaths = list(zip(gta_fpaths, wav_fpaths))

	print("Status: Found %d training samples" % len(self.samples_fpaths))

	def __getitem__(self, index):
	"""Atomic Fetcher: Retrieves and quantizes a single Mel-Audio pair."""
	mel_path, wav_path = self.samples_fpaths[index]

	# Feature Mapping: Load Mel-Spectrogram and scale to unit interval
	mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value

	# Audio Conditioning: Load waveform and apply spectral shaping
	wav = np.load(wav_path)
	if hp.apply_preemphasis:
	wav = audio.pre_emphasis(wav)
	wav = np.clip(wav, -1, 1)

	# Structural Sync: Match audio length to spectrogram frame count
	r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav)
	wav = np.pad(wav, (0, r_pad), mode='constant')
	wav = wav[:mel.shape[1] * hp.hop_length]

	# Categorical Quantization: Map amplitudes to discrete neural labels
	if hp.voc_mode == 'RAW':
	if hp.mu_law:
	quant = audio.encode_mu_law(wav, mu=2 ** hp.bits)
	else:
	quant = audio.float_2_label(wav, bits=hp.bits)
	elif hp.voc_mode == 'MOL':
	quant = audio.float_2_label(wav, bits=16)

	return mel.astype(np.float32), quant.astype(np.int64)

	def __len__(self):
	return len(self.samples_fpaths)

	def collate_vocoder(batch):
	"""
	Neural Packager:
	Batch-wise sampler that extracts random synchronized windows from long audio sequences.
	"""
	mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad
	max_offsets = [x[0].shape[-1] -2 - (mel_win + 2 * hp.voc_pad) for x in batch]
	mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
	sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets]

	mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)]
	labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)]

	mels = np.stack(mels).astype(np.float32)
	labels = np.stack(labels).astype(np.int64)

	mels = torch.tensor(mels)
	labels = torch.tensor(labels).long()

	# Temporal Offset: x and y are shifted by one sample for autoregressive modeling
	x = labels[:, :hp.voc_seq_len]
	y = labels[:, 1:]

	bits = 16 if hp.voc_mode == 'MOL' else hp.bits
	x = audio.label_2_float(x.float(), bits)

	if hp.voc_mode == 'MOL':
	y = audio.label_2_float(y.float(), bits)

	return x, y, mels