import argparse
import json

from tokenizers import Tokenizer
from tqdm import tqdm


def calculate_compression_ratio(jsonl_file_path, tokenizer):
    """Compute and print the UTF-8-bytes-per-token compression ratio of a corpus.

    Reads a JSONL file (one JSON object per line, each with a ``"text"`` field),
    tokenizes every string document with *tokenizer*, and prints the total
    UTF-8 byte length, total token count, and their ratio (bytes per token).

    Args:
        jsonl_file_path: Path to the JSONL evaluation corpus.
        tokenizer: A HuggingFace ``tokenizers.Tokenizer`` (or any object with
            compatible ``encode(...).ids`` and ``decode(ids)`` methods).

    Raises:
        ValueError: If decoding the token ids does not round-trip back to the
            original document text (i.e. the tokenizer is lossy on this input).
    """
    total_utf8_length = 0
    total_tokenized_length = 0
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            document = json.loads(line)["text"]
            # Skip records whose "text" is not a string (e.g. null); the
            # original code silently ignored them the same way.
            if not isinstance(document, str):
                continue
            utf8_length = len(document.encode('utf-8'))
            tokenized = tokenizer.encode(document).ids
            decoded = tokenizer.decode(tokenized)
            if document != decoded:
                # Dump both versions wrapped in lists so whitespace and
                # control characters are visible via repr before failing.
                print("######### [DEBUG] ##########")
                print([document])
                print([decoded])
                raise ValueError("tokenizer round-trip mismatch: decoded text differs from input")
            total_utf8_length += utf8_length
            total_tokenized_length += len(tokenized)
    # BUG FIX: guard on the divisor (token count), not the byte count —
    # the original checked total_utf8_length but divided by
    # total_tokenized_length, which could still be zero.
    if total_tokenized_length == 0:
        compression_ratio = 0
    else:
        compression_ratio = total_utf8_length / total_tokenized_length
    print(f"Total UTF-8 length: {total_utf8_length}")
    print(f"Total tokenized length: {total_tokenized_length}")
    print(f"Compression ratio: {compression_ratio:.4f}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_data_path", type=str, default="")
    parser.add_argument("--tokenizer_path", type=str, default="")
    args = parser.parse_args()
    tokenizer = Tokenizer.from_file(args.tokenizer_path)
    calculate_compression_ratio(args.eval_data_path, tokenizer)