File size: 3,597 Bytes
f55a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
from glob import glob
import torchaudio
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
import pickle
from copy import deepcopy
from glob import glob
import random
from sklearn.model_selection import train_test_split
import json
import os
import numpy as np 
import librosa
import torch
import soundfile as sf
import pandas as pd
import random

class EARS(Dataset):
    """
    EARS dataset yielding random 3-second crops from utterance segments.

    Each item is built from a segment record (``start``/``end`` sample
    indices into a source audio file under ``root``).  Segments longer than
    3 s are randomly cropped to 3 s; shorter ones are taken whole and
    zero-padded on the right.  The waveform is mean/std normalized.

    Each item is a dict with:
        audio_tensor: torch.Tensor of shape (1, 3 * sample_rate)
        filename: str, audio path relative to ``root``
        sid: str, speaker id (first path component of filename, e.g. "p103")
        prompt: str or None (present only when built with train_mapper=True)
        answer: str or None (present only when built with train_mapper=True)
    """

    # Max number of QA pairs sampled per segment when train_mapper is on.
    _PROMPTS_PER_SEGMENT = 10

    def __init__(self, root, data_path, meta_path, utterance_path, prompts_path,
                 sample_rate, train_mapper=False, split="train"):
        """
        Args:
            root: directory containing the audio files referenced by the data.
            data_path: JSON list of {"filename", "start", "end"} segment records.
            meta_path: JSON with metadata (loaded and stored; not used here).
            utterance_path: JSON with utterance info (loaded and stored; not used here).
            prompts_path: JSON mapping speaker id -> list of [prompt, answer] pairs.
            sample_rate: target sample rate; audio is resampled to this on load.
            train_mapper: if True, expand each segment into up to
                ``_PROMPTS_PER_SEGMENT`` (segment, QA-pair) items sampled from
                the speaker's prompt pool; otherwise items are the raw segments.
            split: "train" shuffles the item list in place.
        """
        super().__init__()
        self.root = root

        with open(data_path, "r") as f:
            self.data = json.load(f)
        with open(meta_path, "r") as f:
            self.meta = json.load(f)
        with open(utterance_path, "r") as f:
            self.utterance = json.load(f)
        with open(prompts_path, "r") as f:
            self.prompts = json.load(f)

        self.new_data = []
        if train_mapper:
            for d in self.data:
                file_name = d["filename"]
                sid = file_name.split("/")[0]
                pool = self.prompts[sid]
                # random.sample raises ValueError when k > len(pool), so cap
                # at the pool size for speakers with few prompts.
                k = min(self._PROMPTS_PER_SEGMENT, len(pool))
                for qa in random.sample(pool, k):
                    self.new_data.append({"filename": file_name,
                                          "start": d["start"],
                                          "end": d["end"],
                                          "prompt": qa[0],
                                          "answer": qa[1]})
        else:
            self.new_data = self.data

        if split == "train":
            random.shuffle(self.new_data)

        self.sample_rate = sample_rate
        # Lazily-built cache of Resample transforms keyed by source sample
        # rate, so __getitem__ doesn't rebuild the transform on every call.
        self._resamplers = {}

    def __len__(self):
        return len(self.new_data)

    def __getitem__(self, idx):
        entry = self.new_data[idx]
        filename = entry["filename"]
        sid = filename.split("/")[0]
        audio_path = os.path.join(self.root, filename)

        # Load audio; file_sr is the file's native sample rate.
        audio, file_sr = torchaudio.load(audio_path)
        start_sample, end_sample = entry["start"], entry["end"]

        # Resample to the target rate if needed (cached transform).
        # NOTE(review): start/end indices are applied AFTER resampling, which
        # assumes they are expressed at the target rate — confirm against the
        # metadata producer.
        if file_sr != self.sample_rate:
            if file_sr not in self._resamplers:
                self._resamplers[file_sr] = torchaudio.transforms.Resample(
                    file_sr, self.sample_rate)
            audio = self._resamplers[file_sr](audio)

        total_samples = end_sample - start_sample
        num_samples_3s = 3 * self.sample_rate  # 3 seconds worth of samples

        if total_samples >= num_samples_3s:
            # Pick a random 3 s window fully contained in the segment.
            start_offset = random.randint(start_sample, end_sample - num_samples_3s)
            audio = audio[:, start_offset:start_offset + num_samples_3s]
        else:
            # Segment shorter than 3 s: take it whole and right-pad with zeros.
            audio = audio[:, start_sample:end_sample]
            audio = torch.nn.functional.pad(audio,
                                            (0, num_samples_3s - total_samples))

        # Per-example mean/std normalization; epsilon guards against
        # divide-by-zero on silent (constant) clips.
        mean = torch.mean(audio)
        std = torch.std(audio)
        audio = (audio - mean) / (std + 1e-8)

        return {
            "audio_tensor": audio,
            "filename": filename,
            "sid": sid,
            "prompt": entry.get("prompt"),
            "answer": entry.get("answer"),
        }