#!/usr/bin/env bash
#
# Runs offline_compress_m1_bpe.py once for each tokenizer listed in
# `tokenizers`, always with the same model checkpoint. For a tokenizer file
# <name>.json the script passes <name>_ac_map.json (same directory) as
# --output_file.
#
# Workflow notes kept from the original header:
#   bash precompress_run.sh
#   bash precompress_merge_jsonls.sh <input_dir> <output_dir>
#   hdfs dfs -put <output_dir> hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/<output_dir>
# examples:
#   hdfs dfs -put ocp_subsampled_50G_m1 hdfs://harunava/home/byte_malia_gcp_aiic/user/linzheng/data/ocp_subsampled_50G_m1_debug
#   grep "Error" gpu*_process*_total2.log

# -u catches typos in variable names. -e is deliberately NOT set: a failure
# for one tokenizer must not prevent the remaining tokenizers from running.
set -u

#### AC with BPE

# Directory holding both the tokenizer JSON files and the generated maps.
readonly ARTIFACTS_DIR="/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts"

# Checkpoint passed as --model_path to every run.
readonly MODEL_PATH="${ARTIFACTS_DIR}/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000"

# Tokenizer base names (without the .json extension), processed in order.
tokenizers=(
  "superbpe_python500k_vocab65K_from16K_stage2"
  "superbpe_python500k_vocab49K_from16K_stage2"
  "spm_byte_vocab65536_python500k_pretok"
)

#######################################
# Invokes offline_compress_m1_bpe.py for every entry in `tokenizers`.
# Globals:   ARTIFACTS_DIR (read), MODEL_PATH (read), tokenizers (read)
# Arguments: none
# Outputs:   one diagnostic line on stderr per failed run
# Returns:   0 if every run succeeded, 1 if at least one run failed
#######################################
main() {
  local name
  local failed=0

  for name in "${tokenizers[@]}"; do
    # Each continuation backslash is preceded by a space and is the last
    # character on its line; otherwise adjacent arguments would be fused.
    if ! python offline_compress_m1_bpe.py \
      --tokenizer_path "${ARTIFACTS_DIR}/${name}.json" \
      --output_file "${ARTIFACTS_DIR}/${name}_ac_map.json" \
      --model_path "${MODEL_PATH}"; then
      printf 'error: offline_compress_m1_bpe.py failed for tokenizer %s\n' \
        "${name}" >&2
      failed=1
    fi
  done

  return "${failed}"
}

main "$@"