#!/usr/bin/env bash
# Launch preprocess.py on chunk3 of the SlimPajama-627B train split.
#
# Usage: run from the directory that contains preprocess.py.
# All paths below are site-specific absolute paths under /mnt/jfzn.
set -euo pipefail

# HF_HOME is the cache root read by the Hugging Face libraries; it must be
# exported on its own line so the python command is not parsed as further
# arguments to `export`.
export HF_HOME=/mnt/jfzn/HF/

# Shared root of the input chunk and the output directory.
data_root=/mnt/jfzn/data/SlimPajama-627B

python preprocess.py \
  --dataset "${data_root}/train/chunk3" \
  --name slimp \
  --split train \
  --output "${data_root}/pre_slimp_chunk3" \
  --tokenizer /mnt/jfzn/msj/models--fla-hub--gla-1.3B-100B

# Alternative recipe, kept for reference and not executed: clone the dataset
# from the Hugging Face Hub and preprocess the whole train split.
# git lfs install
# git clone https://huggingface.co/datasets/cerebras/SlimPajama-627B --depth 1
# python preprocess.py \
#   --dataset SlimPajama-627B \
#   --split train \
#   --context_length 2048 \
#   --output /mnt/jfzn/msj/pre_slimp