| import json |
| from tokenizers import Tokenizer |
| import argparse |
| from tqdm import tqdm |
def calculate_compression_ratio(jsonl_file_path, tokenizer):
    """Compute a tokenizer's compression ratio (UTF-8 bytes per token) over a JSONL corpus.

    Each line of the input file must be a JSON object with a "text" field.
    Records whose "text" value is not a string are skipped. Every document is
    round-tripped (encode then decode) to verify the tokenizer is lossless.

    Args:
        jsonl_file_path: Path to a JSONL file, one JSON object per line.
        tokenizer: Tokenizer-like object exposing ``encode(text).ids``
            and ``decode(ids)`` (e.g. a ``tokenizers.Tokenizer``).

    Returns:
        Total UTF-8 byte length divided by total token count, or 0 when the
        corpus produced no tokens.

    Raises:
        ValueError: If decoding the encoded ids does not reproduce the
            original document exactly (lossy round-trip).
    """
    total_utf8_length = 0
    total_tokenized_length = 0

    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            document = json.loads(line)["text"]
            if not isinstance(document, str):
                continue  # skip records whose "text" is not a string
            token_ids = tokenizer.encode(document).ids
            decoded = tokenizer.decode(token_ids)
            if document != decoded:
                # A lossy round-trip would make the ratio meaningless; fail
                # loudly with both versions instead of bare debug prints
                # followed by an unreachable assert.
                raise ValueError(
                    f"Tokenizer round-trip mismatch: {[document]!r} != {[decoded]!r}"
                )
            total_utf8_length += len(document.encode('utf-8'))
            total_tokenized_length += len(token_ids)

    # Guard the actual divisor: the original checked total_utf8_length but
    # divided by total_tokenized_length, which could still be zero (e.g. a
    # file containing only non-string "text" records).
    if total_tokenized_length == 0:
        compression_ratio = 0
    else:
        compression_ratio = total_utf8_length / total_tokenized_length

    print(f"Total UTF-8 length: {total_utf8_length}")
    print(f"Total tokenized length: {total_tokenized_length}")
    print(f"Compression ratio: {compression_ratio:.4f}")
    return compression_ratio
|
|
if __name__ == "__main__":
    # CLI entry point: measure a tokenizer's compression over a JSONL corpus.
    cli = argparse.ArgumentParser()
    cli.add_argument("--eval_data_path", type=str, default="")
    cli.add_argument("--tokenizer_path", type=str, default="")
    options = cli.parse_args()
    loaded_tokenizer = Tokenizer.from_file(options.tokenizer_path)
    calculate_compression_ratio(options.eval_data_path, loaded_tokenizer)
|
|