# Source: arse__ar_ss / local / write2webdataset.py
# Uploaded by haoxiangsnr ("Add files using upload-large-folder tool"),
# commit 1002053 (verified).
import argparse
from pathlib import Path
import torchaudio
import webdataset as wds
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the shard-writing script.

    Every default equals the value that used to be hard-coded in the script,
    so running without arguments behaves as before.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``input_folder`` (Path), ``output_pattern`` (str),
        ``ext`` (str), ``sr`` (int) and ``maxsize`` (float).
    """
    parser = argparse.ArgumentParser(
        description="Pack the audio files found under a folder into WebDataset tar shards.",
    )
    # Previously used NFS location: /nfs/xhao/data/LibriSpeech/train-clean-100-24K/
    parser.add_argument(
        "--input-folder",
        type=Path,
        default=Path("/home/xhao/data/LibriSpeech/LibriSpeech/train-clean-100-24K/"),
        help="Folder that is searched recursively for audio files.",
    )
    # Previously used NFS pattern: /nfs/xhao/data/LibriSpeech/train-clean-100-24K-%03d.tar
    parser.add_argument(
        "--output-pattern",
        default="/home/xhao/data/LibriSpeech/LibriSpeech/train-clean-100-24K-%03d.tar",
        help="printf-style shard path pattern (must contain an integer placeholder such as %%03d).",
    )
    parser.add_argument("--ext", default="flac", help="Audio file extension, without the dot.")
    parser.add_argument(
        "--sr",
        type=int,
        default=24000,
        help="Expected sample rate; files with a different rate trigger a warning.",
    )
    parser.add_argument(
        "--maxsize",
        type=float,
        default=3e9,
        help="Value passed to ShardWriter as maxsize.",
    )
    return parser.parse_args(argv)


def write_shards(input_folder, output_pattern, ext="flac", sr=24000, maxsize=3e9) -> int:
    """Write every ``*.ext`` file below ``input_folder`` into tar shards.

    Files are processed in sorted path order. Each sample is keyed by its
    zero-padded index and carries the loaded tensor under ``"audio.pth"`` plus
    a ``"json"`` entry with the file path, the sample rate reported by
    ``torchaudio.load`` and the duration derived from the two.

    Args:
        input_folder: ``Path`` searched recursively for audio files.
        output_pattern: Shard path pattern handed to ``wds.ShardWriter``.
        ext: File extension to look for, without the leading dot.
        sr: Expected sample rate. It is only used for a sanity warning; the
            rate stored in the shard is always the one read from the file.
        maxsize: Forwarded to ``wds.ShardWriter`` as ``maxsize``.

    Returns:
        The number of files written.

    Raises:
        SystemExit: If no matching file is found, so that no empty shard is
            left behind.
    """
    print(f"Writing {input_folder} to {output_pattern}")

    print(f"Finding files with extension {ext}...")
    fpath_list = sorted(input_folder.rglob(f"*.{ext}"))
    print(f"Found {len(fpath_list)} files.")
    print(fpath_list[:2])

    if not fpath_list:
        # Bail out before the writer is created: an empty run has nothing to store.
        raise SystemExit(f"No *.{ext} files found under {input_folder}")

    # The context manager closes the writer even when loading or writing fails.
    with wds.ShardWriter(output_pattern, maxsize=maxsize, verbose=True, start_shard=1) as sink:
        for i, fpath in enumerate(fpath_list):
            # Use a separate name so the expected rate `sr` is not overwritten.
            tensor, file_sr = torchaudio.load(fpath)
            if file_sr != sr:
                print(f"Warning: {fpath} has sample rate {file_sr}, expected {sr}")

            sink.write(
                {
                    "__key__": f"{i:08d}",
                    "audio.pth": tensor,
                    "json": {
                        "fpath": str(fpath),
                        "sr": file_sr,
                        # The last tensor dimension is taken as the number of samples.
                        "duration": tensor.shape[-1] / file_sr,
                    },
                }
            )

    return len(fpath_list)


def main(argv=None) -> None:
    """Script entry point: parse the options and write the shards."""
    args = parse_args(argv)
    write_shards(
        input_folder=args.input_folder,
        output_pattern=args.output_pattern,
        ext=args.ext,
        sr=args.sr,
        maxsize=args.maxsize,
    )


if __name__ == "__main__":
    main()