File size: 2,552 Bytes
72c0672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# python3 offline_compress_m1.py \
#     --input_dir data/m1 \
#     --output_dir test_data/m1 \
#     --model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000

# Launch offline_compress_m1_dynamicwindow_multiprocess.py over every JSONL chunk.
#
# One background python3 process is started per (chunk, process id) pair.
# Chunk N (1-based) is pinned to GPU N-1 through CUDA_VISIBLE_DEVICES, and
# each process sends stdout+stderr to its own log file in the current
# directory: gpu<GPU>_process<ID>_total<total_jobs>.log
#
# The jobs are backgrounded and nothing in this section waits for them, so
# the launcher returns as soon as all processes have been started.

total_jsonls=8   # number of input chunks (ocp.chunk.1 .. ocp.chunk.N), one GPU index each
total_jobs=1     # processes started per chunk; passed on as --num_processes

# total_jsonls=8
# total_jobs=2

# Long paths are named once here so the command below stays readable.
input_dir=/mnt/hdfs/user/linzheng/data/ocpython_subsampled_50G
# NOTE(review): the directory name says "outputwindow_16" but the command
# below passes --output_window_size 24 -- confirm which one is intended.
output_dir=ocpython_subsampled_50G_outputwindow_16_entropy87
entropy_model_path=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_python/checkpoints/0000200000
compression_model_path=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000

# --firstbyte_prob_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/python500k_unigram_prob.json \
# Plain counting loops (instead of `seq`) keep the bounds inside the shell:
# a zero count simply launches nothing, and no external tool is needed.
JSONL_IDX=1
while [ "$JSONL_IDX" -le "$total_jsonls" ]; do
    # Chunks are numbered from 1, GPU indices from 0.
    GPU_IDX=$(( JSONL_IDX - 1 ))

    index=0
    while [ "$index" -lt "$total_jobs" ]; do
        # Include chunk and GPU so the launch lines can be told apart.
        echo "Starting job $index (chunk $JSONL_IDX, GPU $GPU_IDX)..."

        CUDA_VISIBLE_DEVICES="${GPU_IDX}" python3 offline_compress_m1_dynamicwindow_multiprocess.py \
            --input_file "${input_dir}/ocp.chunk.${JSONL_IDX}.jsonl" \
            --output_dir "${output_dir}" \
            --entropy_model_path "${entropy_model_path}" \
            --compression_model_path "${compression_model_path}" \
            --data_batch_size 512 --output_window_size 24 --max_window_size 64 \
            --max_entropy_batch_size 128 --max_compression_batch_size 8192 \
            --num_workers 1 --process_id "$index" --num_processes "$total_jobs" \
            --base_global_quantile 0.87 --base_monotonic_quantile 0.87 \
            --chunk_size 512 > "gpu${GPU_IDX}_process${index}_total${total_jobs}.log" 2>&1 &

        index=$(( index + 1 ))
    done

    JSONL_IDX=$(( JSONL_IDX + 1 ))
done



# for JSONL_IDX in $(seq 1 $total_jsonls); do
#     for index in $(seq 0 $((total_jobs - 1))); do
#         echo "Starting job $index..."

#         GPU_IDX=$(( JSONL_IDX - 1 ))
#         CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_compress_m1_outputwindow_v3.py \
#             --input_file /mnt/hdfs/user/linzheng/data/ocpython_subsampled_50G/ocp.chunk.${JSONL_IDX}.jsonl \
#             --output_dir ocpython_subsampled_50G_outputwindow_24 \
#             --model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_python/checkpoints/0000200000 \
#             --data_batch_size 512 --output_window_size 24 --max_m1_batch_size 4096 --max_window_size 64 \
#             --num_workers 1 --process_id $index --num_processes $total_jobs \
#             --output_window_size 32 > gpu${GPU_IDX}_process${index}_total${total_jobs}.log 2>&1 &
#     done
# done