# dataloader.py — builds random two-speaker LibriSpeech mixtures on the fly.
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
import soundfile as sf
from simple_parsing import Serializable
from torch.utils.data import Dataset as TorchDataset
from audiozen.acoustics.io import load_audio
@dataclass
class DatasetArgs(Serializable):
    """Configuration for the on-the-fly LibriSpeech mixture `Dataset`."""

    librispeech_dir: str  # root directory of the LibriSpeech corpus
    librispeech_metadata_fpath: str  # CSV metadata; must have `speaker_ID` and `origin_path` columns
    duration: float = 6.0  # clip length in seconds passed to load_audio
    sr: int = 24000  # target sample rate in Hz
    num_samples: int = 40000  # virtual dataset length, i.e. mixtures drawn per epoch
class Dataset(TorchDataset):
    """Virtual two-speaker mixture dataset built from LibriSpeech utterances.

    Every ``__getitem__`` call ignores its index and instead draws two random
    utterances from *different* speakers, delays the second by a random whole
    number of seconds, sums the pair, and peak-normalizes the mixture (sources
    are scaled by the same factor so ``mix == spk_1 + spk_2`` still holds).
    """

    # NOTE: annotation quoted so the class is importable without DatasetArgs
    # in scope; typing-wise it is equivalent to the bare name.
    def __init__(self, args: "DatasetArgs"):
        super().__init__()

        librispeech_dir = Path(args.librispeech_dir).expanduser().resolve()
        librispeech_metadata_fpath = Path(args.librispeech_metadata_fpath).expanduser().resolve()

        self.librispeech_metadata = pd.read_csv(librispeech_metadata_fpath, engine="python")

        self.librispeech_dir = librispeech_dir
        self.num_samples = args.num_samples
        self.duration = args.duration
        self.sr = args.sr

    def __len__(self) -> int:
        # The dataset is virtual: each index yields a freshly sampled mixture,
        # so the length is simply the configured number of draws per epoch.
        return self.num_samples

    def _sample_two_speakers(self):
        """Return metadata rows for two utterances from distinct speakers."""
        spk_1_info = self.librispeech_metadata.sample(n=1).iloc[0]
        spk_2_info = self.librispeech_metadata.sample(n=1).iloc[0]
        # Re-draw the second utterance until it comes from a different speaker.
        while spk_1_info["speaker_ID"] == spk_2_info["speaker_ID"]:
            spk_2_info = self.librispeech_metadata.sample(n=1).iloc[0]
        return spk_1_info, spk_2_info

    def __getitem__(self, index):
        """Build one mixture; returns ``(mix_y, spk_y, file_id)`` where
        ``spk_y`` stacks the two delayed/normalized sources on axis 0 and
        ``file_id`` is ``"{spk1}_{spk2}_{delay}.wav"``. ``index`` is ignored."""
        spk_1_info, spk_2_info = self._sample_two_speakers()
        spk_1_id = spk_1_info["speaker_ID"]
        spk_2_id = spk_2_info["speaker_ID"]

        spk_1_y, _ = load_audio(self.librispeech_dir / spk_1_info["origin_path"], duration=self.duration, sr=self.sr)
        spk_2_y, _ = load_audio(self.librispeech_dir / spk_2_info["origin_path"], duration=self.duration, sr=self.sr)

        # Randomly select a delay size from 1s to 5s (inclusive).
        # BUG FIX: np.random.randint's upper bound is exclusive, so the
        # original randint(1, 5) could never produce the documented 5s delay.
        delay_size = np.random.randint(1, 6)
        delay_frames = int(delay_size * self.sr)

        # Shift the second utterance right by the delay: trim its tail and
        # zero-pad the front so both signals keep the same length.
        spk_2_y = np.pad(spk_2_y[:-delay_frames], (delay_frames, 0), mode="constant")
        assert spk_1_y.shape == spk_2_y.shape

        # Mix the two files with the delay size
        mix_y = spk_1_y + spk_2_y

        # Peak-normalize the mixture; scale both sources by the same factor so
        # the mixture remains the exact sum of the returned sources.
        max_val = np.max(np.abs(mix_y))
        if max_val > 0:  # guard: an all-silent pair would otherwise divide by zero
            mix_y /= max_val
            spk_1_y /= max_val
            spk_2_y /= max_val

        file_id = f"{spk_1_id}_{spk_2_id}_{delay_size}.wav"
        spk_y = np.stack([spk_1_y, spk_2_y], axis=0)
        return mix_y, spk_y, file_id
if __name__ == "__main__":
    dist_dir = Path("./tmp")

    # BUG FIX: Dataset.__init__ accepts a single DatasetArgs instance, not
    # individual keyword arguments — the original call raised a TypeError.
    dataset = Dataset(
        DatasetArgs(
            librispeech_dir="/nfs/xhao/data/LibriSpeech",
            librispeech_metadata_fpath="/home/xhao/proj/audiozen/recipes/librimix_sot/local/metadata/LibriSpeech/train-clean-100-24K.csv",
            duration=6.0,
            sr=24000,
            num_samples=500,
        )
    )

    # Create the output directories once, not on every loop iteration.
    for subdir in ("mix", "spk_1", "spk_2"):
        (dist_dir / subdir).mkdir(parents=True, exist_ok=True)

    for i in range(len(dataset)):
        mix_y, spk_y, file_id = dataset[i]
        # file_id already ends with ".wav" (see Dataset.__getitem__), so use it
        # as-is — the original appended a second ".wav" ("*.wav.wav").
        sf.write(dist_dir / "mix" / file_id, mix_y, samplerate=24000)
        sf.write(dist_dir / "spk_1" / file_id, spk_y[0], samplerate=24000)
        sf.write(dist_dir / "spk_2" / file_id, spk_y[1], samplerate=24000)