| import argparse | |
| from pathlib import Path | |
| import torchaudio | |
| import webdataset as wds | |
def parse_args(argv=None):
    """Parse the command-line options of the shard-packing script.

    Every option defaults to the value that used to be hard-coded, so running
    the script without arguments behaves as before.

    Args:
        argv: Argument strings to parse, or ``None`` to let argparse read the
            process command line.

    Returns:
        An ``argparse.Namespace`` with ``input_folder`` (``Path``),
        ``output_pattern`` (``str``), ``ext`` (``str``) and ``maxsize``
        (``float``).
    """
    parser = argparse.ArgumentParser(
        description="Pack a folder of audio files into WebDataset tar shards."
    )
    parser.add_argument(
        "--input-folder",
        type=Path,
        default=Path("/home/xhao/data/LibriSpeech/LibriSpeech/train-clean-100-24K/"),
        help="Folder that is searched recursively for audio files.",
    )
    parser.add_argument(
        "--output-pattern",
        default="/home/xhao/data/LibriSpeech/LibriSpeech/train-clean-100-24K-%03d.tar",
        help="Shard path pattern containing a printf-style integer placeholder.",
    )
    parser.add_argument(
        "--ext",
        default="flac",
        help="Extension of the audio files to collect.",
    )
    parser.add_argument(
        "--maxsize",
        type=float,
        default=3e9,
        help="Maximum shard size, forwarded to ShardWriter as maxsize.",
    )
    return parser.parse_args(argv)


def find_audio_files(input_folder, ext):
    """Return every ``*.<ext>`` path below *input_folder*, sorted.

    Args:
        input_folder: Root folder to search recursively.
        ext: File extension to match; a leading dot is tolerated.

    Returns:
        A sorted list of ``Path`` objects (empty when nothing matches).
    """
    # Accept both "flac" and ".flac" so the glob never becomes "*..flac".
    suffix = ext.lstrip(".")
    return sorted(Path(input_folder).rglob(f"*.{suffix}"))


def make_sample(index, fpath, waveform, sample_rate):
    """Build the sample dict that is written to a shard for one audio file.

    Args:
        index: Position of the file in the sorted file list; becomes the
            zero-padded ``__key__``.
        fpath: Path of the source audio file.
        waveform: Loaded audio; only ``waveform.shape[-1]`` is read here and
            the object itself is stored under ``"audio.pth"``.
        sample_rate: Sample rate reported by the loader for this file.

    Returns:
        A dict with the keys ``"__key__"``, ``"audio.pth"`` and ``"json"``.
    """
    return {
        "__key__": f"{index:08d}",
        "audio.pth": waveform,
        "json": {
            "fpath": str(fpath),
            "sr": sample_rate,
            # Length of the last axis divided by the sample rate.
            "duration": waveform.shape[-1] / sample_rate,
        },
    }


def main(argv=None):
    """Collect the audio files and write them into tar shards.

    Args:
        argv: Argument strings, or ``None`` to use the process command line.

    Raises:
        SystemExit: If no file with the requested extension is found.
    """
    args = parse_args(argv)

    print(f"Writing {args.input_folder} to {args.output_pattern}")
    print(f"Finding files with extension {args.ext}...")
    fpath_list = find_audio_files(args.input_folder, args.ext)
    print(f"Found {len(fpath_list)} files.")
    print(fpath_list[:2])

    if not fpath_list:
        # Stop before a writer is opened so no empty shard is left behind.
        raise SystemExit(f"No '{args.ext}' files found under {args.input_folder}")

    # The context manager closes the writer even when loading or writing a
    # sample raises part-way through the file list.
    with wds.ShardWriter(
        args.output_pattern,
        maxsize=args.maxsize,
        verbose=True,
        start_shard=1,
    ) as sink:
        for index, fpath in enumerate(fpath_list):
            # The sample rate comes from the file itself; the former
            # hard-coded 24000 was overwritten here and never used.
            waveform, sample_rate = torchaudio.load(fpath)
            sink.write(make_sample(index, fpath, waveform, sample_rate))


if __name__ == "__main__":
    main()