Spaces:

ameythakur
/

Deepfake-Audio

Running

App Files Files Community

Deepfake-Audio / Source Code /encoder /preprocess.py

ameythakur

Deepfake-Audio

1d8403e verified about 2 months ago

raw

history blame contribute delete

8.5 kB

	# ==================================================================================================
	# DEEPFAKE AUDIO - encoder/preprocess.py (Acoustic Feature Extraction Engine)
	# ==================================================================================================
	#
	# 📝 DESCRIPTION
	# This module implements the internal orchestration for dataset preprocessing.
	# It provides highly-parallelized functions to traverse raw speech corpora
	# (LibriSpeech, VoxCeleb), extract speaker-specific metadata, and materialize
	# normalized Mel-Spectrograms onto the disk for high-throughput training.
	#
	# 👤 AUTHORS
	# - Amey Thakur (https://github.com/Amey-Thakur)
	# - Mega Satish (https://github.com/msatmod)
	#
	# 🤝🏻 CREDITS
	# Original Real-Time Voice Cloning methodology by CorentinJ
	# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
	#
	# 🔗 PROJECT LINKS
	# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
	# Video Demo: https://youtu.be/i3wnBcbHDbs
	# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
	#
	# 📜 LICENSE
	# Released under the MIT License
	# Release Date: 2021-02-06
	# ==================================================================================================

	from datetime import datetime
	from functools import partial
	from multiprocessing import Pool
	from pathlib import Path
	import numpy as np
	from tqdm import tqdm

	# --- INTERNAL SIGNAL UTILITIES ---
	from encoder import audio
	from encoder.config import librispeech_datasets, anglophone_nationalites
	from encoder.params_data import *

	# Audio file extensions (written without the leading dot) that
	# _preprocess_speaker iterates over when globbing a speaker directory.
	_AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")

	class DatasetLog:
	    """
	    Experimental Ledger: Records the metadata and parameter state of a
	    preprocessing run for reproducibility.

	    The log is a plain text file named ``Log_<name>.txt`` created inside
	    ``root``. It is opened on construction and closed by ``finalize()``.
	    """

	    def __init__(self, root, name):
	        """
	        Opens the log file and writes the run header and hyperparameters.

	        Args:
	            root: Directory in which the log file is created.
	            name: Dataset name; any "/" is replaced by "_" in the file name.
	        """
	        log_fpath = Path(root, "Log_%s.txt" % name.replace("/", "_"))
	        # Explicit UTF-8: the log lines contain emoji, which the platform
	        # default encoding (e.g. cp1252 on Windows) cannot encode.
	        self.text_file = open(log_fpath, "w", encoding="utf-8")
	        self.sample_data = dict()

	        try:
	            start_time = datetime.now().strftime("%A %d %B %Y at %H:%M")
	            self.write_line("🚀 Initiating Preprocessing Cycle for: %s on %s" % (name, start_time))
	            self.write_line("-----")
	            self._log_params()
	        except BaseException:
	            # Do not leak the file handle if the header cannot be written.
	            self.text_file.close()
	            raise

	    def _log_params(self):
	        """Archives the effective hyperparameters in the log."""
	        from encoder import params_data
	        self.write_line("Acoustic Hyperparameters:")
	        # Every public (non-dunder) attribute of the params module is dumped.
	        for param_name in dir(params_data):
	            if param_name.startswith("__"):
	                continue
	            value = getattr(params_data, param_name)
	            self.write_line("\t%s: %s" % (param_name, value))
	        self.write_line("-----")

	    def write_line(self, line):
	        """Writes ``line`` to the log file followed by a newline."""
	        self.text_file.write("%s\n" % line)

	    def add_sample(self, **kwargs):
	        """
	        Accumulates statistical distributions of processed samples.

	        Each keyword names a statistic; its value is appended to the list of
	        values collected so far under that name.
	        """
	        for param_name, value in kwargs.items():
	            self.sample_data.setdefault(param_name, []).append(value)

	    def finalize(self):
	        """
	        Computes and writes dataset-wide statistics before closing.

	        For every collected statistic the min, max, mean and median are
	        written. The file is closed even if writing the summary fails.
	        """
	        try:
	            self.write_line("Neural Statistical Profile:")
	            for param_name, values in self.sample_data.items():
	                self.write_line("\t%s:" % param_name)
	                self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
	                self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
	            self.write_line("-----")
	            end_time = datetime.now().strftime("%A %d %B %Y at %H:%M")
	            self.write_line("🤝🏻 Cycle Completed on %s" % end_time)
	        finally:
	            self.text_file.close()

	def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
	    """
	    Integrity check and log initialization for a specific corpus.

	    Returns the dataset directory together with a freshly opened DatasetLog
	    (created in ``out_dir``) when ``datasets_root/dataset_name`` exists.
	    Otherwise prints a notice and returns ``(None, None)``.
	    """
	    corpus_dir = datasets_root / dataset_name
	    if corpus_dir.exists():
	        return corpus_dir, DatasetLog(out_dir, dataset_name)

	    # Missing corpus: tell the user and let the caller skip it.
	    print("⚠️ Scholarly Alert: Dataset %s not found. Skipping." % corpus_dir)
	    return None, None

	def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
	    """
	    Utterance Orchestration: Processes all vocal samples for a single speaker.
	    Extracts Mel-Spectrograms and archives source mappings.

	    Every file under ``speaker_dir`` (searched recursively) whose extension is
	    listed in ``_AUDIO_EXTENSIONS`` is passed through ``audio.preprocess_wav``
	    and ``audio.wav_to_mel_spectrogram``. The resulting frames are saved as a
	    ``.npy`` file in a per-speaker directory under ``out_dir``, and a line
	    ``<npy name>,<source path>`` is recorded in that directory's
	    ``_sources.txt``.

	    Args:
	        speaker_dir: Directory holding the speaker's audio files.
	        datasets_root: Root that ``speaker_dir`` lives under; the relative
	            path parts are joined with "_" to form the speaker name.
	        out_dir: Directory in which the per-speaker output directory is made.
	        skip_existing: When True, utterances already listed in
	            ``_sources.txt`` are skipped and the index is appended to;
	            when False the index is rewritten from scratch.

	    Returns:
	        A list with the duration (``len(wav) / sampling_rate``) of every
	        utterance saved during this call.
	    """
	    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
	    speaker_out_dir = out_dir.joinpath(speaker_name)
	    speaker_out_dir.mkdir(exist_ok=True)
	    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

	    # Resilience: load the existing sources index. It is only consulted when
	    # skipping, so it is only read in that case.
	    existing_fnames = set()
	    if skip_existing and sources_fpath.exists():
	        try:
	            with sources_fpath.open("r") as sources_file:
	                existing_fnames = {line.split(",")[0] for line in sources_file}
	        except (OSError, UnicodeError):
	            # Best effort: an unreadable index just means nothing is skipped.
	            existing_fnames = set()

	    # Process utterances recursively. The context manager guarantees the
	    # index is flushed and closed even if an utterance fails to process.
	    audio_durs = []
	    with sources_fpath.open("a" if skip_existing else "w") as sources_file:
	        for extension in _AUDIO_EXTENSIONS:
	            suffix = ".%s" % extension
	            # "**/*<suffix>" walks every sub-directory of the speaker folder.
	            for in_fpath in speaker_dir.glob("**/*%s" % suffix):
	                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
	                # Swap only the trailing extension, never a match mid-name.
	                out_fname = out_fname[:-len(suffix)] + ".npy"

	                if skip_existing and out_fname in existing_fnames:
	                    continue

	                # Signal Normalization
	                wav = audio.preprocess_wav(in_fpath)
	                if len(wav) == 0:
	                    continue

	                # Spectral Analysis: utterances with fewer than
	                # partials_n_frames frames are discarded.
	                frames = audio.wav_to_mel_spectrogram(wav)
	                if len(frames) < partials_n_frames:
	                    continue

	                # Materialization
	                out_fpath = speaker_out_dir.joinpath(out_fname)
	                np.save(out_fpath, frames)
	                sources_file.write("%s,%s\n" % (out_fname, in_fpath))
	                audio_durs.append(len(wav) / sampling_rate)

	    return audio_durs

	def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger):
	    """
	    Degree of parallelism management for speaker-level processing.

	    Maps ``_preprocess_speaker`` over ``speaker_dirs`` with a pool of four
	    worker processes, records every returned duration in ``logger`` under
	    the ``duration`` key, and finalizes the logger once all speakers are done.
	    """
	    n_speakers = len(speaker_dirs)
	    print("🤝🏻 Processing %d speakers from: %s" % (n_speakers, dataset_name))

	    # Bind the arguments shared by every speaker; only the directory varies.
	    process_one = partial(
	        _preprocess_speaker,
	        datasets_root=datasets_root,
	        out_dir=out_dir,
	        skip_existing=skip_existing,
	    )

	    with Pool(4) as workers:
	        results = workers.imap(process_one, speaker_dirs)
	        progress = tqdm(results, dataset_name, n_speakers, unit="speakers")
	        for speaker_durations in progress:
	            for duration in speaker_durations:
	                logger.add_sample(duration=duration)

	    logger.finalize()

	def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
	    """
	    Pipeline for LibriSpeech: A massive corpus of read English speech.

	    Walks every dataset listed under ``librispeech_datasets["train"]["other"]``;
	    datasets that are missing on disk are skipped, and each entry directly
	    under a present dataset root is handed on as a speaker directory.
	    """
	    for dataset_name in librispeech_datasets["train"]["other"]:
	        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
	        if dataset_root:
	            # Every direct child of the dataset root is passed on as a speaker.
	            speaker_dirs = [*dataset_root.glob("*")]
	            _preprocess_speaker_dirs(
	                speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger
	            )

	def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
	    """
	    Pipeline for VoxCeleb1: Celebrity voices with multi-national diversity.

	    Reads ``vox1_meta.csv`` (tab-separated, first line skipped as a header)
	    from the dataset root and keeps only the speakers whose lower-cased
	    fourth column appears in ``anglophone_nationalites``. The matching
	    directories under ``wav`` are then preprocessed. Returns early when the
	    dataset is not present.
	    """
	    dataset_name = "VoxCeleb1"
	    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
	    if not dataset_root:
	        return

	    # Meta-Guided Filtering for Anglophone Speakers
	    meta_fpath = dataset_root.joinpath("vox1_meta.csv")
	    with meta_fpath.open("r") as metafile:
	        rows = [line.split("\t") for line in metafile]

	    # Column 0 is the speaker id, column 3 the value compared against the
	    # anglophone list; rows[0] is the header line.
	    nationalities = {row[0]: row[3] for row in rows[1:]}
	    kept_ids = {
	        speaker_id
	        for speaker_id, nationality in nationalities.items()
	        if nationality.lower() in anglophone_nationalites
	    }

	    wav_root = dataset_root.joinpath("wav")
	    speaker_dirs = [entry for entry in wav_root.glob("*") if entry.name in kept_ids]
	    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)

	def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
	    """
	    Pipeline for VoxCeleb2: Broadscale celebrity speech data.

	    Treats every entry directly under ``<dataset root>/dev/aac`` as a speaker
	    directory and preprocesses them. Returns early when the dataset is not
	    present.
	    """
	    dataset_name = "VoxCeleb2"
	    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
	    if not dataset_root:
	        return

	    aac_root = dataset_root.joinpath("dev", "aac")
	    speaker_dirs = [*aac_root.glob("*")]
	    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)