#!/usr/bin/env bash
#
# Data preparation script for VibeToken training.
#
# Downloads ImageNet-1k from the Hugging Face Hub into DATA_DIR and then
# converts the download to WebDataset format.
#
# Environment:
#   DATA_DIR  Directory where datasets are stored (defaults to ./data).
#
# Requirements:
#   - huggingface-cli and python available on PATH.
#   - The hf_transfer package, because HF_HUB_ENABLE_HF_TRANSFER=1 is set below.
#   - NOTE(review): ILSVRC/imagenet-1k appears to be a gated dataset, so a prior
#     `huggingface-cli login` is presumably required -- confirm.
#
# Usage:
#   export DATA_DIR=/mnt/fastssd/datasets  # optional, defaults to ./data
#   bash setup.sh

# Stop at the first failing step so that a failed or partial download is never
# handed to the conversion step.
set -euo pipefail

DATA_DIR="${DATA_DIR:-./data}"
echo "Using DATA_DIR=${DATA_DIR}"

# Fail early, with a clear message, if a required tool is missing.
for tool in huggingface-cli python; do
  if ! command -v "${tool}" >/dev/null 2>&1; then
    printf 'error: required command not found: %s\n' "${tool}" >&2
    exit 1
  fi
done

mkdir -p -- "${DATA_DIR}"

# Download ImageNet-1k via HuggingFace
export HF_HUB_ENABLE_HF_TRANSFER=1
huggingface-cli download ILSVRC/imagenet-1k \
  --repo-type dataset \
  --local-dir "${DATA_DIR}/imagenet-1k"

# Convert to WebDataset format.
# The converter path is relative to the current working directory, so this
# script is presumably meant to be run from the repository root.
python data/convert_imagenet_to_wds.py \
  --input_dir "${DATA_DIR}/imagenet-1k" \
  --output_dir "${DATA_DIR}/imagenet_wds"