Spaces:

ameythakur
/

Deepfake-Audio

Running

App Files Files Community

Deepfake-Audio / Source Code /encoder /data_objects /speaker.py

ameythakur

Deepfake-Audio

1d8403e verified about 2 months ago

raw

history blame contribute delete

2.84 kB

	# ==================================================================================================
	# DEEPFAKE AUDIO - encoder/data_objects/speaker.py (Categorical Identity Representation)
	# ==================================================================================================
	#
	# 📝 DESCRIPTION
	# This module implements the 'Speaker' abstraction, encapsulating all linguistic
	# and acoustic metadata for a single individual. It manages the retrieval
	# and segmented sampling of utterances, acting as a gateway to the serialized
	# Mel-Spectrograms used in neural distillation.
	#
	# 👤 AUTHORS
	# - Amey Thakur (https://github.com/Amey-Thakur)
	# - Mega Satish (https://github.com/msatmod)
	#
	# 🤝🏻 CREDITS
	# Original Real-Time Voice Cloning methodology by CorentinJ
	# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
	#
	# 🔗 PROJECT LINKS
	# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
	# Video Demo: https://youtu.be/i3wnBcbHDbs
	# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
	#
	# 📜 LICENSE
	# Released under the MIT License
	# Release Date: 2021-02-06
	# ==================================================================================================

	from encoder.data_objects.random_cycler import RandomCycler
	from encoder.data_objects.utterance import Utterance
	from pathlib import Path

	class Speaker:
	    """
	    Categorical Data Container:
	    Represents one speaker directory and lazily exposes its utterances.

	    The directory at ``root`` must contain a ``_sources.txt`` index whose lines
	    have the form ``frames_fname,wave_fpath``. The index is not read at
	    construction time; it is loaded on the first call to ``random_partial``.

	    Attributes:
	        root: Directory holding the speaker's data.
	        name: Final path component of ``root``.
	        utterances: ``None`` until loaded, then a list of ``Utterance`` objects.
	        utterance_cycler: ``None`` until loaded, then a ``RandomCycler`` built
	            over ``utterances``.
	    """

	    def __init__(self, root: Path):
	        self.root = root
	        self.name = root.name
	        # Both are populated on demand by _load_utterances().
	        self.utterances = None
	        self.utterance_cycler = None

	    def _read_sources(self):
	        """
	        Parse ``_sources.txt`` into a dict mapping frames_fname -> wave_fpath.

	        Line terminators are stripped so the stored wave path does not end in a
	        newline, and blank lines are skipped instead of aborting the load. If a
	        frames file name appears more than once, the last entry wins.

	        Raises:
	            ValueError: if a non-blank line does not contain exactly two
	                comma-separated fields.
	        """
	        index_fpath = self.root.joinpath("_sources.txt")
	        sources = {}
	        with index_fpath.open("r") as sources_file:
	            for line_no, raw_line in enumerate(sources_file, start=1):
	                line = raw_line.rstrip("\r\n")
	                if not line.strip():
	                    # Tolerate empty lines (e.g. a trailing one at end of file).
	                    continue
	                fields = line.split(",")
	                if len(fields) != 2:
	                    raise ValueError(
	                        "%s:%d: expected 'frames_fname,wave_fpath', got %r"
	                        % (index_fpath, line_no, line)
	                    )
	                frames_fname, wave_fpath = fields
	                sources[frames_fname] = wave_fpath
	        return sources

	    def _load_utterances(self):
	        """Lazy-loading of utterance metadata from the serialized index (_sources.txt)."""
	        # Identity Mapping: frames_fname -> original_wave_fpath
	        sources = self._read_sources()
	        self.utterances = [
	            Utterance(self.root.joinpath(frames_fname), wave_fpath)
	            for frames_fname, wave_fpath in sources.items()
	        ]
	        self.utterance_cycler = RandomCycler(self.utterances)

	    def random_partial(self, count, n_frames):
	        """
	        Samples <count> utterances and takes one partial segment from each.

	        Loads the utterance index on first use. The selection policy is whatever
	        ``RandomCycler.sample`` implements (not visible from this module).

	        Args:
	            count: Number of utterances requested from the cycler.
	            n_frames: Forwarded unchanged to ``Utterance.random_partial``.

	        Returns:
	            A list with one tuple per sampled utterance: the utterance itself
	            followed by the elements of the tuple returned by
	            ``Utterance.random_partial(n_frames)`` -- presumably
	            (utterance, frames, crop_range); confirm against Utterance.
	        """
	        if self.utterances is None:
	            self._load_utterances()

	        # Stochastic selection of utterances
	        utterances = self.utterance_cycler.sample(count)

	        # Spatio-temporal cropping: prepend the utterance to its partial tuple
	        return [(u,) + u.random_partial(n_frames) for u in utterances]