#!/usr/bin/env bash
#
# Runs offline_compress_m1_bpe.py once per tokenizer file, always against the
# same m1 checkpoint. Each run writes its result next to the tokenizer file,
# as NAME_ac_map.json for a tokenizer called NAME.json.
#
# Usage: run from the directory that contains offline_compress_m1_bpe.py.
# A failed run does not stop the remaining ones; the script's exit status is
# that of the last run.
#
# Related commands, kept here for reference only (this script does not run
# them):
#   bash precompress_run.sh
#   bash precompress_merge_jsonls.sh
#
#   hdfs dfs -put LOCAL_DIR hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/
#   example:
#   hdfs dfs -put ocp_subsampled_50G_m1 hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/ocp_subsampled_50G_m1_debug
#
#   grep "Error" gpu*_process*_total2.log

#### AC with BPE
# NOTE(review): "AC" presumably stands for arithmetic coding -- confirm
# against offline_compress_m1_bpe.py.

# Directory holding the tokenizer files and the m1 checkpoints.
readonly ARTIFACTS_DIR=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts

# Checkpoint passed as --model_path to every run.
readonly M1_CHECKPOINT="${ARTIFACTS_DIR}/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000"

# Tokenizer files (relative to ARTIFACTS_DIR) to process, in order.
readonly -a TOKENIZERS=(
  superbpe_python500k_vocab65K_from16K_stage2.json
  superbpe_python500k_vocab49K_from16K_stage2.json
  spm_byte_vocab65536_python500k_pretok.json
)

#######################################
# Derives the output path for a tokenizer file.
# Arguments: $1 - path to the tokenizer .json file
# Outputs:   the same path with a trailing ".json" replaced by
#            "_ac_map.json" (a path without that suffix just gets
#            "_ac_map.json" appended)
#######################################
ac_map_path() {
  local tokenizer_path=$1
  printf '%s\n' "${tokenizer_path%.json}_ac_map.json"
}

#######################################
# Runs offline_compress_m1_bpe.py for one tokenizer.
# Globals:   M1_CHECKPOINT (read)
# Arguments: $1 - path to the tokenizer .json file
# Returns:   the exit status of the python invocation
#######################################
build_ac_map() {
  local tokenizer_path=$1
  local output_file
  output_file=$(ac_map_path "$tokenizer_path")

  # Every value is passed as its own quoted word so that the output path and
  # the following --model_path flag can never merge into one argument.
  python offline_compress_m1_bpe.py \
    --tokenizer_path "$tokenizer_path" \
    --output_file "$output_file" \
    --model_path "$M1_CHECKPOINT"
}

for tokenizer in "${TOKENIZERS[@]}"; do
  build_ac_map "${ARTIFACTS_DIR}/${tokenizer}"
done