| import json |
| from tokenizers import Tokenizer |
| import argparse |
| from tqdm import tqdm |
def calculate_compression_ratio(jsonl_file_path, tokenizer):
    """Compute a tokenizer's compression ratio (UTF-8 bytes per token) over a JSONL corpus.

    Each line of the input file must be a JSON object with a "text" field.
    Records whose "text" value is not a string are skipped. Every document is
    round-tripped (encode then decode) to verify the tokenizer is lossless.

    Args:
        jsonl_file_path: Path to a JSONL file, one JSON object per line.
        tokenizer: Tokenizer-like object exposing ``encode(text).ids``
            and ``decode(ids)`` (e.g. a ``tokenizers.Tokenizer``).

    Returns:
        Total UTF-8 byte length divided by total token count, or 0 when the
        corpus produced no tokens.

    Raises:
        ValueError: If decoding the encoded ids does not reproduce the
            original document exactly (lossy round-trip).
    """
    total_utf8_length = 0
    total_tokenized_length = 0

    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            document = json.loads(line)["text"]
            if not isinstance(document, str):
                continue  # skip records whose "text" is not a string
            token_ids = tokenizer.encode(document).ids
            decoded = tokenizer.decode(token_ids)
            if document != decoded:
                # A lossy round-trip would make the ratio meaningless; fail
                # loudly with both versions instead of bare debug prints
                # followed by an unreachable assert.
                raise ValueError(
                    f"Tokenizer round-trip mismatch: {[document]!r} != {[decoded]!r}"
                )
            total_utf8_length += len(document.encode('utf-8'))
            total_tokenized_length += len(token_ids)

    # Guard the actual divisor: the original checked total_utf8_length but
    # divided by total_tokenized_length, which could still be zero (e.g. a
    # file containing only non-string "text" records).
    if total_tokenized_length == 0:
        compression_ratio = 0
    else:
        compression_ratio = total_utf8_length / total_tokenized_length

    print(f"Total UTF-8 length: {total_utf8_length}")
    print(f"Total tokenized length: {total_tokenized_length}")
    print(f"Compression ratio: {compression_ratio:.4f}")
    return compression_ratio
|
|
if __name__ == "__main__":
    # CLI entry point: measure a tokenizer's compression over a JSONL corpus.
    cli = argparse.ArgumentParser()
    cli.add_argument("--eval_data_path", type=str, default="")
    cli.add_argument("--tokenizer_path", type=str, default="")
    options = cli.parse_args()
    loaded_tokenizer = Tokenizer.from_file(options.tokenizer_path)
    calculate_compression_ratio(options.eval_data_path, loaded_tokenizer)
|
|