2ira
/

Byte-lingua-code

Model card Files Files and versions

Byte-lingua-code / superbpe /run_huffman.sh

2ira's picture

offline_compression_graph_code

72c0672 verified 4 months ago

history blame contribute delete

3.33 kB

	# tokenizer_paths=(
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab400K_from250K_stage2.json
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab400K_stage1.json
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from160K_stage2.json
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from320K_stage2.json
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from480K_stage2.json
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from640K_stage2.json
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_from80K_stage2.json
	# /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_python500k_vocab800K_stage1.json
	# )
	# for tokenizer_path in "${tokenizer_paths[@]}"
	# do
	# python ../scripts/huffman_count_freq.py --tokenizer_path $tokenizer_path --input_data_path subsample_python.jsonl --output_freq_path "${tokenizer_path%.json}_huffman_freq.json"
	# echo "${tokenizer_path%.json}_huffman_freq.json"
	# done

	# Run ../scripts/huffman_count_freq.py once for every tokenizer JSON listed
	# below, using subsample_opencoder.jsonl as the input data. For each tokenizer
	# the output path is the tokenizer path with its ".json" suffix replaced by
	# "_huffman_freq.json", so results are written next to the tokenizer file.
	#
	# NOTE: the helper script and the input data are referenced by relative
	# paths, so they resolve against the directory this script is launched from.
	# NOTE(review): judging by its name, the helper presumably counts token
	# frequencies for Huffman coding -- confirm against huffman_count_freq.py.
	tokenizer_paths=(
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab32K_from16K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab32K_stage1.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab49K_from16K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab49K_stage1.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab65K_from16K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab65K_stage1.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab800K_from640K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab2M_from160K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab2M_stage1.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab400K_from250K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab400K_stage1.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab4M_from160K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab4M_stage1.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab800K_from160K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab800K_from320K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab800K_from480K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab800K_from80K_stage2.json
	/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/superbpe_opencoder300k_vocab800K_stage1.json
	)
	# Number of tokenizers whose frequency run exited non-zero.
	huffman_failures=0
	for tokenizer_path in "${tokenizer_paths[@]}"
	do
	  # Derive the output path once so the command and the log line cannot drift.
	  output_freq_path="${tokenizer_path%.json}_huffman_freq.json"
	  # Quote every expansion so a path containing spaces or glob characters is
	  # passed to python as a single, literal argument.
	  if python ../scripts/huffman_count_freq.py \
	    --tokenizer_path "$tokenizer_path" \
	    --input_data_path subsample_opencoder.jsonl \
	    --output_freq_path "$output_freq_path"
	  then
	    # Only announce the output path when the run actually succeeded.
	    echo "$output_freq_path"
	  else
	    # Keep going with the remaining tokenizers, but make the failure visible.
	    echo "huffman_count_freq.py failed for ${tokenizer_path}" >&2
	    huffman_failures=$((huffman_failures + 1))
	  fi
	done
	if [ "$huffman_failures" -gt 0 ]
	then
	  echo "${huffman_failures} of ${#tokenizer_paths[@]} tokenizer runs failed" >&2
	fi