from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
import soundfile as sf
from simple_parsing import Serializable
from torch.utils.data import Dataset as TorchDataset

from audiozen.acoustics.io import load_audio


@dataclass
class DatasetArgs(Serializable):
    """Configuration for the on-the-fly two-speaker LibriSpeech mixing dataset."""

    librispeech_dir: str  # root directory of the LibriSpeech corpus
    librispeech_metadata_fpath: str  # CSV with (at least) "speaker_ID" and "origin_path" columns
    duration: float = 6.0  # clip length in seconds loaded from each utterance
    sr: int = 24000  # sample rate in Hz
    num_samples: int = 40000  # virtual dataset length (pairs are drawn randomly per item)


class Dataset(TorchDataset):
    """On-the-fly two-speaker mixture dataset.

    Each item mixes two randomly drawn utterances from *different* speakers,
    delaying the second speaker by a random offset of 1-5 seconds, and returns
    the peak-normalized mixture together with the identically scaled sources.
    """

    def __init__(self, args: DatasetArgs):
        super().__init__()

        librispeech_dir = Path(args.librispeech_dir).expanduser().resolve()
        librispeech_metadata_fpath = Path(args.librispeech_metadata_fpath).expanduser().resolve()

        self.librispeech_metadata = pd.read_csv(librispeech_metadata_fpath, engine="python")

        self.librispeech_dir = librispeech_dir
        self.num_samples = args.num_samples
        self.duration = args.duration
        self.sr = args.sr

    def __len__(self) -> int:
        # Virtual length: items are generated randomly, not indexed from disk.
        return self.num_samples

    def __getitem__(self, index):
        """Return ``(mix_y, spk_y, file_id)`` for a randomly drawn speaker pair.

        ``index`` is ignored — sampling is random (the dataset length is virtual).

        Returns:
            mix_y: 1-D mixture waveform, peak-normalized to [-1, 1].
            spk_y: array of shape (2, num_frames) with the two sources scaled
                by the same factor as the mixture.
            file_id: identifier "<spk1>_<spk2>_<delay>" WITHOUT a file
                extension (the original appended ".wav" here AND at write
                time, producing "X.wav.wav" files — fixed).
        """
        spk_1_info = self.librispeech_metadata.sample(n=1).iloc[0]
        spk_1_id = spk_1_info["speaker_ID"]

        # Re-draw until the second speaker differs from the first.
        spk_2_info = self.librispeech_metadata.sample(n=1).iloc[0]
        spk_2_id = spk_2_info["speaker_ID"]
        while spk_1_id == spk_2_id:
            spk_2_info = self.librispeech_metadata.sample(n=1).iloc[0]
            spk_2_id = spk_2_info["speaker_ID"]

        spk_1_path = self.librispeech_dir / spk_1_info["origin_path"]
        spk_2_path = self.librispeech_dir / spk_2_info["origin_path"]

        spk_1_y, _ = load_audio(spk_1_path, duration=self.duration, sr=self.sr)
        spk_2_y, _ = load_audio(spk_2_path, duration=self.duration, sr=self.sr)

        # Randomly select a delay from 1 s to 5 s inclusive.
        # BUG FIX: np.random.randint has an exclusive upper bound, so the
        # original randint(1, 5) could never produce the documented 5 s delay.
        delay_size = np.random.randint(1, 6)
        delay_frames = int(delay_size * self.sr)

        # Shift the second speaker right by the delay, keeping the length fixed:
        # drop the tail, then left-pad with zeros.
        spk_2_y = spk_2_y[:-delay_frames]
        spk_2_y = np.pad(spk_2_y, (delay_frames, 0), mode="constant")
        assert spk_1_y.shape == spk_2_y.shape

        # Mix the two utterances with the delay applied.
        mix_y = spk_1_y + spk_2_y

        # Peak-normalize the mixture and apply the same gain to the sources so
        # mixture and references stay consistent. Guard against an all-zero
        # (silent) mixture, which would otherwise divide by zero.
        max_val = np.max(np.abs(mix_y))
        if max_val > 0:
            mix_y /= max_val
            spk_1_y /= max_val
            spk_2_y /= max_val

        file_id = f"{spk_1_id}_{spk_2_id}_{delay_size}"

        spk_y = np.stack([spk_1_y, spk_2_y], axis=0)

        return mix_y, spk_y, file_id


if __name__ == "__main__":
    dist_dir = Path("./tmp")

    # BUG FIX: Dataset.__init__ takes a single DatasetArgs instance; the
    # original passed the fields as bare keyword arguments, which raises
    # TypeError at construction time.
    dataset = Dataset(
        DatasetArgs(
            librispeech_dir="/nfs/xhao/data/LibriSpeech",
            librispeech_metadata_fpath="/home/xhao/proj/audiozen/recipes/librimix_sot/local/metadata/LibriSpeech/train-clean-100-24K.csv",
            duration=6.0,
            sr=24000,
            num_samples=500,
        )
    )

    # Create the output directories once, outside the loop.
    for subdir in ("mix", "spk_1", "spk_2"):
        (dist_dir / subdir).mkdir(parents=True, exist_ok=True)

    for i in range(len(dataset)):
        mix_y, spk_y, file_id = dataset[i]
        sf.write(dist_dir / "mix" / f"{file_id}.wav", mix_y, samplerate=24000)
        sf.write(dist_dir / "spk_1" / f"{file_id}.wav", spk_y[0], samplerate=24000)
        sf.write(dist_dir / "spk_2" / f"{file_id}.wav", spk_y[1], samplerate=24000)