import json
from tokenizers import Tokenizer
import argparse
from tqdm import tqdm
def calculate_compression_ratio(jsonl_file_path, tokenizer):
    """Compute the byte-level compression ratio of a tokenizer over a JSONL corpus.

    Each line of the input file must be a JSON object with a ``"text"`` field.
    For every string document, the UTF-8 byte length and the tokenized length
    are accumulated; the reported ratio is total bytes / total tokens.

    Args:
        jsonl_file_path: Path to a JSONL file with one ``{"text": ...}`` object
            per line.
        tokenizer: Object exposing ``encode(text).ids`` and ``decode(ids)``
            (e.g. a HuggingFace ``tokenizers.Tokenizer``).

    Returns:
        float | int: The compression ratio (UTF-8 bytes per token), or 0 if
        the corpus contributed no tokens.

    Raises:
        ValueError: If a document does not round-trip through encode/decode.
    """
    # The progress bar is purely cosmetic; fall back to plain iteration so the
    # measurement also runs in environments without tqdm installed.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    total_utf8_length = 0
    total_tokenized_length = 0
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            document = json.loads(line)["text"]
            if not isinstance(document, str):
                continue  # skip records whose "text" is not a string
            token_ids = tokenizer.encode(document).ids
            decoded = tokenizer.decode(token_ids)
            if document != decoded:
                # A lossy round-trip invalidates the byte/token comparison.
                print("######### [DEBUG] ##########")
                print([document])
                print([decoded])
                raise ValueError("tokenizer round-trip mismatch")
            total_utf8_length += len(document.encode('utf-8'))
            total_tokenized_length += len(token_ids)
    # BUG FIX: guard the actual divisor (the token count), not the byte
    # count, to avoid ZeroDivisionError when no tokens were produced.
    if total_tokenized_length == 0:
        compression_ratio = 0
    else:
        compression_ratio = total_utf8_length / total_tokenized_length
    print(f"Total UTF-8 length: {total_utf8_length}")
    print(f"Total tokenized length: {total_tokenized_length}")
    print(f"Compression ratio: {compression_ratio:.4f}")
    # Return the ratio so callers can consume it programmatically
    # (previously it was only printed).
    return compression_ratio
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Measure a tokenizer's compression ratio over a JSONL corpus."
    )
    # required=True replaces default="": a missing path now fails with a clear
    # argparse usage error instead of a confusing downstream file-open crash.
    parser.add_argument(
        "--eval_data_path", type=str, required=True,
        help='Path to a JSONL file with one {"text": ...} object per line.',
    )
    parser.add_argument(
        "--tokenizer_path", type=str, required=True,
        help="Path to a HuggingFace tokenizers JSON file.",
    )
    args = parser.parse_args()

    tokenizer = Tokenizer.from_file(args.tokenizer_path)
    calculate_compression_ratio(args.eval_data_path, tokenizer)