| import os |
| from glob import glob |
| import torchaudio |
| from torch.utils.data import Dataset |
| import pandas as pd |
| from PIL import Image |
| import pickle |
| from copy import deepcopy |
| from glob import glob |
| import random |
| from sklearn.model_selection import train_test_split |
| import json |
| import os |
| import numpy as np |
| import librosa |
| import torch |
| import soundfile as sf |
| import pandas as pd |
| import random |
|
|
class EARS(Dataset):
    """
    EARS dataset of audio segments, cropped or zero-padded to exactly
    3 seconds at ``sample_rate``.

    Each item is a dict:
        audio_tensor: torch.Tensor of shape (C, 3 * sample_rate),
            mean/std-normalized waveform (C is the channel count of the
            source file; presumably mono — TODO confirm).
        filename: str, path of the audio file relative to ``root``.
        sid: str, speaker id (first path component, e.g. "p103").
        prompt: str or None, question text (only when built with
            ``train_mapper=True``; None otherwise unless present in data).
        answer: str or None, answer text (same condition as prompt).

    Args:
        root: directory containing the audio files.
        data_path: JSON file — list of dicts with "filename", "start", "end"
            (start/end are sample indices into the waveform).
        meta_path: JSON file with per-speaker metadata (loaded, unused here).
        utterance_path: JSON file with utterance info (loaded, unused here).
        prompts_path: JSON file mapping speaker id -> list of
            [prompt, answer] pairs.
        sample_rate: target sampling rate in Hz.
        train_mapper: if True, expand each data entry into up to 10
            randomly sampled (prompt, answer) pairs for its speaker.
        split: "train" shuffles the expanded entries in place.
    """
    def __init__(self, root, data_path, meta_path, utterance_path, prompts_path, sample_rate, train_mapper=False, split="train"):
        super().__init__()
        self.root = root

        with open(f"{data_path}", "r") as f:
            self.data = json.load(f)

        with open(f"{meta_path}", "r") as f:
            self.meta = json.load(f)

        with open(f"{utterance_path}", "r") as f:
            self.utterance = json.load(f)

        with open(f"{prompts_path}", "r") as f:
            self.prompts = json.load(f)

        self.new_data = []
        if train_mapper:
            for d in self.data:
                file_name = d["filename"]
                sid = file_name.split("/")[0]
                # Sample up to 10 QA pairs per segment; cap at the number
                # available so speakers with <10 prompts don't raise
                # ValueError from random.sample.
                k = min(10, len(self.prompts[sid]))
                temp = random.sample(self.prompts[sid], k)
                for qa in temp:
                    self.new_data.append({"filename": file_name,
                                          "start": d["start"],
                                          "end": d["end"],
                                          "prompt": qa[0],
                                          "answer": qa[1]})
        else:
            self.new_data = self.data
        if split == "train":
            random.shuffle(self.new_data)

        self.sample_rate = sample_rate

    def __len__(self):
        # One item per (segment, prompt) pair when train_mapper was used,
        # otherwise one item per raw data entry.
        return len(self.new_data)

    def __getitem__(self, idx):
        entry = self.new_data[idx]
        filename = entry["filename"]
        # Speaker id is the first path component, e.g. "p103/emo_...".
        sid = filename.split("/")[0]
        audio_path = os.path.join(self.root, filename)

        audio, sample_rate = torchaudio.load(audio_path)
        start_sample, end_sample = entry["start"], entry["end"]

        if sample_rate != self.sample_rate:
            # NOTE(review): start/end indices are applied AFTER resampling,
            # which is only correct if the JSON stores them at the target
            # rate (self.sample_rate), not the file's native rate — confirm
            # against the data-preparation pipeline.
            audio = torchaudio.transforms.Resample(sample_rate, self.sample_rate)(audio)

        total_samples = end_sample - start_sample
        num_samples_3s = 3 * self.sample_rate

        if total_samples >= num_samples_3s:
            # Segment is long enough: take a random 3-second crop that
            # stays inside [start_sample, end_sample].
            start_offset = random.randint(start_sample, end_sample - num_samples_3s)
            end_offset = start_offset + num_samples_3s
            audio = audio[:, start_offset:end_offset]
        else:
            # Segment is shorter than 3 s: take it all and zero-pad the
            # tail up to exactly 3 seconds.
            pad_size = num_samples_3s - total_samples
            audio = audio[:, start_sample:end_sample]
            audio = torch.nn.functional.pad(audio, (0, pad_size))

        # Per-example normalization over all channels/samples; epsilon
        # guards against division by zero on silent (all-equal) audio.
        mean = torch.mean(audio)
        std = torch.std(audio)
        audio = (audio - mean) / (std + 1e-8)

        return {
            "audio_tensor": audio,
            "filename": filename,
            "sid": sid,
            # .get(...) so entries built without train_mapper (no QA
            # fields) still yield a complete dict.
            "prompt": entry.get("prompt", None),
            "answer": entry.get("answer", None),
        }