"""Pack a folder of audio files into WebDataset tar shards.

Every file matching the chosen extension under the input folder is loaded
with torchaudio and written to a sharded tar archive as one sample holding
the waveform tensor and a small JSON metadata record.
"""

import argparse
import sys
from pathlib import Path

import torchaudio
import webdataset as wds


def _parse_args(argv=None):
    """Parse command-line options; the defaults reproduce the original hard-coded run."""
    parser = argparse.ArgumentParser(description=__doc__)
    # An NFS copy of the same data was previously used instead of these
    # defaults: /nfs/xhao/data/LibriSpeech/train-clean-100-24K/ and
    # /nfs/xhao/data/LibriSpeech/train-clean-100-24K-%03d.tar
    parser.add_argument(
        "--input-folder",
        type=Path,
        default=Path("/home/xhao/data/LibriSpeech/LibriSpeech/train-clean-100-24K/"),
        help="Folder searched recursively for audio files.",
    )
    parser.add_argument(
        "--output-pattern",
        default="/home/xhao/data/LibriSpeech/LibriSpeech/train-clean-100-24K-%03d.tar",
        help="printf-style shard path pattern handed to ShardWriter.",
    )
    parser.add_argument("--ext", default="flac", help="Audio file extension, without the dot.")
    parser.add_argument(
        "--sr",
        type=int,
        default=24000,
        help="Expected sample rate; files that differ only trigger a warning.",
    )
    parser.add_argument(
        "--maxsize",
        type=float,
        default=3e9,
        help="maxsize value handed to ShardWriter.",
    )
    return parser.parse_args(argv)


def _build_sample(index, fpath, tensor, sample_rate):
    """Return the sample dict written to the shard for one audio file."""
    return {
        "__key__": f"{index:08d}",
        "audio.pth": tensor,
        "json": {
            "fpath": str(fpath),
            "sr": sample_rate,
            "duration": tensor.shape[-1] / sample_rate,
        },
    }


def main(argv=None):
    """Find the audio files and write them, in sorted path order, to tar shards.

    Args:
        argv: Optional argument list; ``None`` makes argparse read ``sys.argv``.

    Raises:
        SystemExit: If no file with the requested extension is found.
    """
    args = _parse_args(argv)
    input_folder = args.input_folder
    output_pattern = args.output_pattern
    ext = args.ext

    print(f"Writing {input_folder} to {output_pattern}")
    print(f"Finding files with extension {ext}...")

    # Sorting keeps the index -> file mapping (and so the keys) reproducible.
    fpath_list = sorted(input_folder.rglob(f"*.{ext}"))
    print(f"Found {len(fpath_list)} files.")
    print(fpath_list[:2])

    if not fpath_list:
        # Stop before the writer is created so no empty shard is left behind.
        raise SystemExit(f"No *.{ext} files found under {input_folder}")

    # The context manager closes the current shard even if a load or write fails.
    with wds.ShardWriter(
        output_pattern, maxsize=args.maxsize, verbose=True, start_shard=1
    ) as sink:
        for i, fpath in enumerate(fpath_list):
            # Use a distinct name so the expected rate is not overwritten.
            tensor, sample_rate = torchaudio.load(fpath)
            if sample_rate != args.sr:
                print(
                    f"Warning: {fpath} has sample rate {sample_rate}, expected {args.sr}",
                    file=sys.stderr,
                )
            sink.write(_build_sample(i, fpath, tensor, sample_rate))


if __name__ == "__main__":
    main()