Spaces:
Running
Running
| # ================================================================================================== | |
| # DEEPFAKE AUDIO - encoder/data_objects/speaker.py (Categorical Identity Representation) | |
| # ================================================================================================== | |
| # | |
| # π DESCRIPTION | |
| # This module implements the 'Speaker' abstraction, encapulating all linguistic | |
| # and acoustic metadata for a single individual. It manages the retrieval | |
| # and segmented sampling of utterances, acting as a gateway to the serialized | |
| # Mel-Spectrograms used in neural distillation. | |
| # | |
| # π€ AUTHORS | |
| # - Amey Thakur (https://github.com/Amey-Thakur) | |
| # - Mega Satish (https://github.com/msatmod) | |
| # | |
| # π€π» CREDITS | |
| # Original Real-Time Voice Cloning methodology by CorentinJ | |
| # Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning | |
| # | |
| # π PROJECT LINKS | |
| # Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO | |
| # Video Demo: https://youtu.be/i3wnBcbHDbs | |
| # Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb | |
| # | |
| # π LICENSE | |
| # Released under the MIT License | |
| # Release Date: 2021-02-06 | |
| # ================================================================================================== | |
| from encoder.data_objects.random_cycler import RandomCycler | |
| from encoder.data_objects.utterance import Utterance | |
| from pathlib import Path | |
| class Speaker: | |
| """ | |
| Categorical Data Container: | |
| Aggregates all speech samples associated with a unique institutional speaker ID. | |
| """ | |
| def __init__(self, root: Path): | |
| self.root = root | |
| self.name = root.name | |
| self.utterances = None | |
| self.utterance_cycler = None | |
| def _load_utterances(self): | |
| """Lazy-loading of utterance metadata from the serialized index (_sources.txt).""" | |
| with self.root.joinpath("_sources.txt").open("r") as sources_file: | |
| sources = [l.split(",") for l in sources_file] | |
| # Identity Mapping: frames_fname -> original_wave_fpath | |
| sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} | |
| self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] | |
| self.utterance_cycler = RandomCycler(self.utterances) | |
| def random_partial(self, count, n_frames): | |
| """ | |
| Samples a batch of <count> unique partial utterances. | |
| Ensures diverse temporal coverage within the speaker's available vocal range. | |
| """ | |
| if self.utterances is None: | |
| self._load_utterances() | |
| # Stochastic selection of utterances | |
| utterances = self.utterance_cycler.sample(count) | |
| # Spatio-temporal cropping: (utterance, frames, crop_range) | |
| a = [(u,) + u.random_partial(n_frames) for u in utterances] | |
| return a | |