#!/usr/bin/env bash
# Run preprocess.py over the chunk3 directory of the SlimPajama-627B train
# split, with --output pointing at pre_slimp_chunk3.
#
# Usage: run from the directory that contains preprocess.py (the script is
# referenced by a relative path).
set -euo pipefail

# Hugging Face home directory. It has to be exported by a plain top-level
# statement (not as a stage of a pipeline, which would run in a subshell) so
# that the python child process below actually inherits it.
export HF_HOME=/mnt/jfzn/HF/

python preprocess.py \
  --dataset /mnt/jfzn/data/SlimPajama-627B/train/chunk3 \
  --name slimp \
  --split train \
  --output /mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk3 \
  --tokenizer /mnt/jfzn/msj/models--fla-hub--gla-1.3B-100B

# Earlier recipe, kept for reference: clone the dataset repository with
# git-lfs and run preprocess.py on the clone.
# NOTE(review): the last line below ends in a continuation backslash, so
# further arguments may have followed it — confirm before reusing.
# git lfs install
# git clone https://huggingface.co/datasets/cerebras/SlimPajama-627B --depth 1
# python preprocess.py \
# --dataset SlimPajama-627B \
# --split train \
# --context_length 2048 \
# --output /mnt/jfzn/msj/pre_slimp \