import random

import torch
from torch.utils.data import Dataset

from TTS.encoder.utils.generic_utils import AugmentWAV


class EncoderDataset(Dataset):
    def __init__(
        self,
        config,
        ap,
        meta_data,
        voice_len=1.6,
        num_classes_in_batch=64,
        num_utter_per_class=10,
        verbose=False,
        augmentation_config=None,
        use_torch_spec=None,
    ):
| | """ |
| | Args: |
| | ap (TTS.tts.utils.AudioProcessor): audio processor object. |
| | meta_data (list): list of dataset instances. |
| | seq_len (int): voice segment length in seconds. |
| | verbose (bool): print diagnostic information. |
| | """ |
        super().__init__()
        self.config = config
        self.items = meta_data
        self.sample_rate = ap.sample_rate
        # segment length in samples
        self.seq_len = int(voice_len * self.sample_rate)
        self.num_utter_per_class = num_utter_per_class
        self.ap = ap
        self.verbose = verbose
        self.use_torch_spec = use_torch_spec
        self.classes, self.items = self.__parse_items()

        # stable class-name -> integer-id mapping (classes come sorted from __parse_items)
        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
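
        # Data augmentation setup: AugmentWAV handles the "additive" (noise) and
        # "rir" (reverb) cases; an optional "gaussian" config is stored below but
        # is not applied in this dataset's collate_fn.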
        self.augmentator = None
        self.gaussian_augmentation_config = None
        self.data_augmentation_p = 0  # no augmentation unless the config sets "p"
        if augmentation_config:
            self.data_augmentation_p = augmentation_config["p"]
            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
                self.augmentator = AugmentWAV(ap, augmentation_config)

            if "gaussian" in augmentation_config:
                self.gaussian_augmentation_config = augmentation_config["gaussian"]

        if self.verbose:
            print("\n > DataLoader initialization")
            print(f" | > Classes per Batch: {num_classes_in_batch}")
            print(f" | > Number of instances: {len(self.items)}")
            print(f" | > Sequence length: {self.seq_len}")
            print(f" | > Num Classes: {len(self.classes)}")
            print(f" | > Classes: {self.classes}")

    def load_wav(self, filename):
        # decode the file at the audio processor's configured sample rate
        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
        return audio

    def __parse_items(self):
        # first pass: group utterance paths by class
        class_to_utters = {}
        for item in self.items:
            path_ = item["audio_file"]
            class_name = item[self.config.class_name_key]
            class_to_utters.setdefault(class_name, []).append(path_)
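
        # drop classes that have fewer than num_utter_per_class utterances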
        class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}

        # sort so the class -> id mapping is deterministic across runs
        classes = sorted(class_to_utters.keys())

        # second pass: keep only items whose class survived the filter and whose
        # audio is long enough for one training segment
        new_items = []
        for item in self.items:
            path_ = item["audio_file"]
            # use the same lookup key as the first pass so both passes agree
            class_name = item[self.config.class_name_key]
            if class_name not in classes:
                continue
            if self.load_wav(path_).shape[0] - self.seq_len <= 0:
                continue

            new_items.append({"wav_file_path": path_, "class_name": class_name})

        return classes, new_items
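
    # Note: after __parse_items, self.items holds dicts of the form
    # {"wav_file_path": ..., "class_name": ...}, and self.classes is the sorted
    # list of classes that survived the num_utter_per_class filter.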

    def __len__(self):
        return len(self.items)

    def get_num_classes(self):
        return len(self.classes)

    def get_class_list(self):
        return self.classes

    def set_classes(self, classes):
        self.classes = classes
        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

    def get_map_classid_to_classname(self):
        return {c_id: c_n for c_n, c_id in self.classname_to_classid.items()}

    def __getitem__(self, idx):
        return self.items[idx]

    def collate_fn(self, batch):
        labels = []
        feats = []
        for item in batch:
            utter_path = item["wav_file_path"]
            class_name = item["class_name"]

            # map the class name to its integer id
            class_id = self.classname_to_classid[class_name]

            # load the wav and crop a random fixed-length segment;
            # clips shorter than seq_len were already filtered out in __parse_items
            wav = self.load_wav(utter_path)
            offset = random.randint(0, wav.shape[0] - self.seq_len)
            wav = wav[offset : offset + self.seq_len]

            # waveform-level augmentation (additive noise / RIR)
            if self.augmentator is not None and self.data_augmentation_p:
                if random.random() < self.data_augmentation_p:
                    wav = self.augmentator.apply_one(wav)

            if not self.use_torch_spec:
                # compute the mel spectrogram on the CPU here
                mel = self.ap.melspectrogram(wav)
                feats.append(torch.FloatTensor(mel))
            else:
                # hand the raw waveform to the model, which computes its own spectrogram
                feats.append(torch.FloatTensor(wav))

            labels.append(class_id)

        feats = torch.stack(feats)
        labels = torch.LongTensor(labels)

        return feats, labels
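

# Usage sketch (illustrative, not part of the module): wiring the dataset to a
# plain DataLoader. `config`, `ap`, and `meta_data` are assumed to come from the
# usual Coqui TTS setup; the real trainer may instead use a dedicated batch
# sampler so each batch holds num_classes_in_batch distinct classes.
#
#     from torch.utils.data import DataLoader
#
#     dataset = EncoderDataset(config, ap, meta_data, voice_len=1.6,
#                              num_classes_in_batch=64, num_utter_per_class=10)
#     loader = DataLoader(
#         dataset,
#         batch_size=64 * 10,  # num_classes_in_batch * num_utter_per_class
#         collate_fn=dataset.collate_fn,
#     )
#     feats, labels = next(iter(loader))
#     # use_torch_spec=False -> feats: (batch, n_mels, frames) mel spectrograms
#     # use_torch_spec=True  -> feats: (batch, seq_len) raw waveforms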