| | from dataclasses import dataclass |
| | from pathlib import Path |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import soundfile as sf |
| | from simple_parsing import Serializable |
| | from torch.utils.data import Dataset as TorchDataset |
| |
|
| | from audiozen.acoustics.io import load_audio |
| |
|
| |
|
@dataclass
class DatasetArgs(Serializable):
    """Configuration for the on-the-fly LibriSpeech two-speaker mixture ``Dataset``."""

    # Root directory of the LibriSpeech corpus; "~" is expanded by Dataset.__init__.
    librispeech_dir: str
    # CSV metadata file; __getitem__ reads the "speaker_ID" and "origin_path" columns.
    librispeech_metadata_fpath: str
    # Length (in seconds) loaded from each utterance.
    duration: float = 6.0
    # Target sample rate in Hz.
    sr: int = 24000
    # Number of random mixtures generated per epoch (reported by __len__).
    num_samples: int = 40000
| |
|
| |
|
class Dataset(TorchDataset):
    """On-the-fly two-speaker mixture dataset built from LibriSpeech.

    Each item pairs two utterances from *different* speakers, delays the
    second speaker by a random whole number of seconds, sums them, and
    peak-normalizes mixture and sources by the mixture's maximum amplitude.
    """

    def __init__(self, args: DatasetArgs):
        """Load the metadata CSV and store the sampling configuration.

        Args:
            args: dataset configuration (see ``DatasetArgs``).
        """
        super().__init__()

        librispeech_dir = Path(args.librispeech_dir).expanduser().resolve()
        librispeech_metadata_fpath = Path(args.librispeech_metadata_fpath).expanduser().resolve()

        # The CSV must provide at least "speaker_ID" and "origin_path"
        # columns; both are read per item in __getitem__.
        self.librispeech_metadata = pd.read_csv(librispeech_metadata_fpath, engine="python")
        self.librispeech_dir = librispeech_dir
        self.num_samples = args.num_samples
        self.duration = args.duration
        self.sr = args.sr

    def __len__(self) -> int:
        # Mixtures are synthesized on the fly, so the epoch length is just
        # the configured sample budget, not the metadata size.
        return self.num_samples

    def __getitem__(self, index):
        """Synthesize one random two-speaker mixture.

        NOTE: ``index`` is intentionally ignored — every call draws a fresh
        random speaker pair, so repeated access to the same index yields
        different data (the dataset is non-deterministic by design).

        Returns:
            mix_y: 1-D mixture waveform, peak-normalized.
            spk_y: array of shape (2, T) with the two source waveforms,
                scaled by the same factor as the mixture.
            file_id: "<spk1>_<spk2>_<delay>.wav" identifier string.
        """
        # Draw two utterances and resample the second until the speakers differ.
        spk_1_info = self.librispeech_metadata.sample(n=1).iloc[0]
        spk_1_id = spk_1_info["speaker_ID"]
        spk_2_info = self.librispeech_metadata.sample(n=1).iloc[0]
        spk_2_id = spk_2_info["speaker_ID"]
        while spk_1_id == spk_2_id:
            spk_2_info = self.librispeech_metadata.sample(n=1).iloc[0]
            spk_2_id = spk_2_info["speaker_ID"]

        spk_1_path = self.librispeech_dir / spk_1_info["origin_path"]
        spk_2_path = self.librispeech_dir / spk_2_info["origin_path"]

        spk_1_y, _ = load_audio(spk_1_path, duration=self.duration, sr=self.sr)
        spk_2_y, _ = load_audio(spk_2_path, duration=self.duration, sr=self.sr)

        # Delay the second speaker by a random 1–4 whole seconds
        # (np.random.randint's upper bound is exclusive).
        delay_size = np.random.randint(1, 5)
        delay_frames = int(delay_size * self.sr)

        # Shift right: drop the tail and zero-pad the front so the total
        # length matches the first speaker's clip.
        spk_2_y = spk_2_y[:-delay_frames]
        spk_2_y = np.pad(spk_2_y, (delay_frames, 0), mode="constant")

        assert spk_1_y.shape == spk_2_y.shape

        mix_y = spk_1_y + spk_2_y

        # Peak-normalize by the mixture maximum; guard against max_val == 0
        # (two silent clips), which previously produced a division by zero
        # and NaN-filled outputs.
        max_val = np.max(np.abs(mix_y))
        if max_val > 0:
            mix_y /= max_val
            spk_1_y /= max_val
            spk_2_y /= max_val

        file_id = f"{spk_1_id}_{spk_2_id}_{delay_size}.wav"

        # Stack sources as (2, T) so downstream code can index per speaker.
        spk_y = np.stack([spk_1_y, spk_2_y], axis=0)

        return mix_y, spk_y, file_id
| |
|
| |
|
| | if __name__ == "__main__": |
| | dist_dir = Path("./tmp") |
| | dataset = Dataset( |
| | librispeech_dir="/nfs/xhao/data/LibriSpeech", |
| | librispeech_metadata_fpath="/home/xhao/proj/audiozen/recipes/librimix_sot/local/metadata/LibriSpeech/train-clean-100-24K.csv", |
| | duration=6.0, |
| | sr=24000, |
| | num_samples=500, |
| | ) |
| |
|
| | for i in range(len(dataset)): |
| | mix_y, spk_y, file_id = dataset[i] |
| |
|
| | (dist_dir / "mix").mkdir(parents=True, exist_ok=True) |
| | (dist_dir / "spk_1").mkdir(parents=True, exist_ok=True) |
| | (dist_dir / "spk_2").mkdir(parents=True, exist_ok=True) |
| | sf.write(dist_dir / "mix" / f"{file_id}.wav", mix_y, samplerate=24000) |
| | sf.write(dist_dir / "spk_1" / f"{file_id}.wav", spk_y[0], samplerate=24000) |
| | sf.write(dist_dir / "spk_2" / f"{file_id}.wav", spk_y[1], samplerate=24000) |
| |
|