File size: 506 Bytes
c39435c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Run preprocess.py over one chunk of the SlimPajama-627B train split.
#
# Usage: <this script> [CHUNK]
#   CHUNK  name of the chunk directory under ${data_root}/train
#          (default: chunk3, the value that used to be hard-coded here).
#
# The chunk name feeds both the input directory and the output directory
# suffix, so the two paths cannot drift apart when switching chunks.
chunk="${1:-chunk3}"
data_root=/mnt/jfzn/data/SlimPajama-627B

# Exported so the python child process inherits it.
# NOTE(review): presumably the Hugging Face cache root -- confirm.
export HF_HOME=/mnt/jfzn/HF/

# The script's exit status is that of this python invocation.
python preprocess.py \
  --dataset "${data_root}/train/${chunk}" \
  --name slimp \
  --split train \
  --output "${data_root}/pre_slimp_${chunk}" \
  --tokenizer /mnt/jfzn/msj/models--fla-hub--gla-1.3B-100B

# git lfs install
# git clone https://huggingface.co/datasets/cerebras/SlimPajama-627B --depth 1
# python preprocess.py \
#   --dataset SlimPajama-627B \
#   --split train \
#   --context_length 2048 \
#   --output /mnt/jfzn/msj/pre_slimp