| | import os |
| | from glob import glob |
| | import torchaudio |
| | from torch.utils.data import Dataset |
| | import pandas as pd |
| | from PIL import Image |
| | import pickle |
| | from copy import deepcopy |
| | from glob import glob |
| | import random |
| | from sklearn.model_selection import train_test_split |
| | import json |
| | import os |
| | import numpy as np |
| | import librosa |
| | import torch |
| | import soundfile as sf |
| | import pandas as pd |
| | import random |
| |
|
class EARS(Dataset):
    """
    EARS dataset yielding fixed-length (3 s) audio crops.

    Each item is a dict with:
        audio_tensor: torch.Tensor of shape (1, 3 * sample_rate),
            per-example standardized waveform
        filename: str, audio path relative to `root`
        sid: str, speaker id (first path component, e.g. "p103")
        prompt: str or None, QA prompt (present only when train_mapper=True)
        answer: str or None, QA answer (present only when train_mapper=True)
    """

    def __init__(self, root, data_path, meta_path, utterance_path, prompts_path,
                 sample_rate, train_mapper=False, split="train"):
        """
        Args:
            root: directory audio filenames are resolved against.
            data_path: JSON list of segments ({"filename", "start", "end"}).
            meta_path: JSON with metadata (loaded onto self.meta; not used here).
            utterance_path: JSON with utterance info (loaded onto self.utterance).
            prompts_path: JSON mapping speaker id -> list of [prompt, answer] pairs.
            sample_rate: target sample rate for returned audio.
            train_mapper: if True, expand each segment into up to 10 randomly
                sampled prompt/answer items for that segment's speaker.
            split: "train" shuffles the item list in place.
        """
        super().__init__()
        self.root = root

        with open(data_path, "r") as f:
            self.data = json.load(f)
        with open(meta_path, "r") as f:
            self.meta = json.load(f)
        with open(utterance_path, "r") as f:
            self.utterance = json.load(f)
        with open(prompts_path, "r") as f:
            self.prompts = json.load(f)

        self.new_data = []
        if train_mapper:
            for d in self.data:
                file_name = d["filename"]
                sid = file_name.split("/")[0]
                # BUG FIX: random.sample raises ValueError when a speaker has
                # fewer than 10 prompt pairs; cap the sample size instead.
                pool = self.prompts[sid]
                for qa in random.sample(pool, min(10, len(pool))):
                    self.new_data.append({"filename": file_name,
                                          "start": d["start"],
                                          "end": d["end"],
                                          "prompt": qa[0],
                                          "answer": qa[1]})
        else:
            self.new_data = self.data
        if split == "train":
            random.shuffle(self.new_data)

        self.sample_rate = sample_rate

    def __len__(self):
        """Number of items (segments, or segment x prompt pairs when train_mapper)."""
        return len(self.new_data)

    def __getitem__(self, idx):
        """Load one segment, crop/pad it to exactly 3 s, and standardize it."""
        entry = self.new_data[idx]
        filename = entry["filename"]
        sid = filename.split("/")[0]
        audio_path = os.path.join(self.root, filename)

        audio, sample_rate = torchaudio.load(audio_path)
        start_sample, end_sample = entry["start"], entry["end"]

        # NOTE(review): resampling happens BEFORE slicing, so "start"/"end"
        # are assumed to already be expressed in self.sample_rate units; if
        # they are in the file's native rate this indexing is off — TODO
        # confirm against the producer of data_path.
        if sample_rate != self.sample_rate:
            audio = torchaudio.transforms.Resample(sample_rate, self.sample_rate)(audio)

        total_samples = end_sample - start_sample
        num_samples_3s = 3 * self.sample_rate

        if total_samples >= num_samples_3s:
            # Random 3 s crop within the segment (randint is inclusive, so
            # the crop never extends past end_sample).
            start_offset = random.randint(start_sample, end_sample - num_samples_3s)
            audio = audio[:, start_offset:start_offset + num_samples_3s]
        else:
            audio = audio[:, start_sample:end_sample]
            # BUG FIX: pad by the ACTUAL slice length, not total_samples —
            # if end_sample overruns the file the slice comes back shorter
            # than total_samples and the old pad left the tensor under 3 s.
            pad_size = num_samples_3s - audio.shape[-1]
            if pad_size > 0:
                audio = torch.nn.functional.pad(audio, (0, pad_size))

        # Per-example standardization; epsilon guards silent (zero-std) clips.
        mean = torch.mean(audio)
        std = torch.std(audio)
        audio = (audio - mean) / (std + 1e-8)

        return {
            "audio_tensor": audio,
            "filename": filename,
            "sid": sid,
            "prompt": entry.get("prompt", None),
            "answer": entry.get("answer", None),
        }