# msj19's picture
# Add files using upload-large-folder tool
# c39435c verified
# Run preprocess.py over one SlimPajama-627B training chunk.
#
# Optional environment:
#   SLIMP_CHUNK  name of the chunk directory under train/ (default: chunk3).
#                The same value is used to name the output directory, so the
#                input and output paths cannot drift apart when switching
#                chunks.
#
# HF_HOME is always set to the shared location below and exported so the
# python process inherits it.
export HF_HOME=/mnt/jfzn/HF/

slimp_chunk=${SLIMP_CHUNK:-chunk3}
slimp_data_root=/mnt/jfzn/data/SlimPajama-627B
slimp_dataset_dir=${slimp_data_root}/train/${slimp_chunk}
slimp_output_dir=${slimp_data_root}/pre_slimp_${slimp_chunk}

# Expansions are quoted so a chunk name containing spaces or glob characters
# is still passed as a single argument.
python preprocess.py \
  --dataset "${slimp_dataset_dir}" \
  --name slimp \
  --split train \
  --output "${slimp_output_dir}" \
  --tokenizer /mnt/jfzn/msj/models--fla-hub--gla-1.3B-100B
# git lfs install
# git clone https://huggingface.co/datasets/cerebras/SlimPajama-627B --depth 1
# python preprocess.py \
# --dataset SlimPajama-627B \
# --split train \
# --context_length 2048 \
# --output /mnt/jfzn/msj/pre_slimp \