File size: 3,334 Bytes
72c0672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# tokenizer_paths=(
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab400K_from250K_stage2.json
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab400K_stage1.json
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from160K_stage2.json
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from320K_stage2.json
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from480K_stage2.json
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from640K_stage2.json
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from80K_stage2.json
#     /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_stage1.json
# )
# for tokenizer_path in "${tokenizer_paths[@]}"
# do
#     python ../scripts/huffman_count_freq.py --tokenizer_path $tokenizer_path --input_data_path subsample_python.jsonl --output_freq_path "${tokenizer_path%.json}_huffman_freq.json"
#     echo "${tokenizer_path%.json}_huffman_freq.json"
# done

# Tokenizer JSON files iterated over by the loop below. Every file lives in
# the same artifacts directory and shares the "superbpe_opencoder300k_"
# filename prefix, so only the distinguishing variant suffix is listed here.
# The order of the variants determines the order of tokenizer_paths.
artifacts_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts
tokenizer_variants=(
    vocab32K_from16K_stage2
    vocab32K_stage1
    vocab49K_from16K_stage2
    vocab49K_stage1
    vocab65K_from16K_stage2
    vocab65K_stage1
    vocab800K_from640K_stage2
    vocab2M_from160K_stage2
    vocab2M_stage1
    vocab400K_from250K_stage2
    vocab400K_stage1
    vocab4M_from160K_stage2
    vocab4M_stage1
    vocab800K_from160K_stage2
    vocab800K_from320K_stage2
    vocab800K_from480K_stage2
    vocab800K_from80K_stage2
    vocab800K_stage1
)
tokenizer_paths=()
for tokenizer_variant in "${tokenizer_variants[@]}"; do
    tokenizer_paths+=("${artifacts_dir}/superbpe_opencoder300k_${tokenizer_variant}.json")
done
# Run ../scripts/huffman_count_freq.py once per entry of tokenizer_paths,
# using subsample_opencoder.jsonl as the input data. Both the script and the
# input file are given as relative paths, so they are resolved against the
# current working directory.
#
# The output path is the tokenizer path with its ".json" suffix replaced by
# "_huffman_freq.json". It is printed to stdout only when the command
# succeeds; a failure is reported on stderr and the loop moves on to the
# next tokenizer rather than aborting the whole batch.
for tokenizer_path in "${tokenizer_paths[@]}"; do
    output_freq_path="${tokenizer_path%.json}_huffman_freq.json"
    # Expansions are quoted so a path containing whitespace or glob
    # characters is passed through as a single, literal argument.
    if python ../scripts/huffman_count_freq.py \
        --tokenizer_path "$tokenizer_path" \
        --input_data_path subsample_opencoder.jsonl \
        --output_freq_path "$output_freq_path"
    then
        printf '%s\n' "$output_freq_path"
    else
        status=$?
        printf 'huffman_count_freq.py failed (exit %s) for %s\n' \
            "$status" "$tokenizer_path" >&2
    fi
done