# Byte-lingua-code / precompress_pipeline.sh
# 2ira's picture
# offline_compression_graph_code
# 72c0672 verified
# bash precompress_run.sh
# bash precompress_merge_jsonls.sh <input_dir> <output_dir>
# hdfs dfs -put <output_dir> hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/<output_dir>
# examples:
# hdfs dfs -put ocp_subsampled_50G_m1 hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/ocp_subsampled_50G_m1_debug
# grep "Error" gpu*_process*_total2.log
#### AC with BPE
# Run offline_compress_m1_bpe.py once per tokenizer, all against the same
# model checkpoint. For each tokenizer base name <name>, the run reads
# <artifacts_dir>/<name>.json and writes <artifacts_dir>/<name>_ac_map.json.
#
# Why a loop: the three invocations differ only in the tokenizer base name.
# Writing each argument on its own indented, quoted line also avoids the
# "file.json\" + newline pitfall: bash deletes a backslash-newline pair
# outright, so without whitespace before the backslash the next flag gets
# glued onto the previous value (e.g. "..._ac_map.json--model_path").
#
# A failing run does not stop the remaining runs (same as running the
# commands back to back).
artifacts_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts
m1_checkpoint="${artifacts_dir}/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000"

for tokenizer_name in \
  superbpe_python500k_vocab65K_from16K_stage2 \
  superbpe_python500k_vocab49K_from16K_stage2 \
  spm_byte_vocab65536_python500k_pretok; do
  python offline_compress_m1_bpe.py \
    --tokenizer_path "${artifacts_dir}/${tokenizer_name}.json" \
    --output_file "${artifacts_dir}/${tokenizer_name}_ac_map.json" \
    --model_path "${m1_checkpoint}"
done