File size: 1,660 Bytes
72c0672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
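# Handy commands, kept here for reference (all commented out).

# Run the precompress script:
# bash precompress_run.sh

# Merge JSONL files from <input_dir> into <output_dir>:
# bash precompress_merge_jsonls.sh <input_dir> <output_dir>

# Upload <output_dir> to HDFS:
# hdfs dfs -put <output_dir> hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/<output_dir>


# Examples:
# hdfs dfs -put ocp_subsampled_50G_m1 hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/ocp_subsampled_50G_m1_debug

# Search the per-GPU/per-process logs for lines containing "Error":
# grep "Error" gpu*_process*_total2.log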

#### AC with BPE

# Run offline_compress_m1_bpe.py once per tokenizer, always with the same model
# checkpoint. For a tokenizer file "<stem>.json" the output is written next to
# it as "<stem>_ac_map.json".
#
# The runs are sequential and independent: no exit status is checked here, so
# (unless the caller enables errexit) a failed run does not stop the later ones.
ac_artifacts_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts
ac_model_path="${ac_artifacts_dir}/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000"

# Tokenizer file stems (without the ".json" suffix), processed in this order.
# A plain word list is used instead of an array so the loop also works under
# POSIX sh.
for ac_stem in \
  superbpe_python500k_vocab65K_from16K_stage2 \
  superbpe_python500k_vocab49K_from16K_stage2 \
  spm_byte_vocab65536_python500k_pretok; do
  # Keep a space before every trailing backslash: without it the continuation
  # only works by accident of the next line's leading indentation.
  python offline_compress_m1_bpe.py \
    --tokenizer_path "${ac_artifacts_dir}/${ac_stem}.json" \
    --output_file "${ac_artifacts_dir}/${ac_stem}_ac_map.json" \
    --model_path "${ac_model_path}"
done