File size: 1,641 Bytes
72c0672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
from tokenizers import Tokenizer
import argparse
from tqdm import tqdm
def calculate_compression_ratio(jsonl_file_path, tokenizer):
    """Compute the bytes-per-token compression ratio of a tokenizer on a JSONL corpus.

    Reads each line of *jsonl_file_path* as a JSON object, tokenizes its
    "text" field, and accumulates total UTF-8 byte length versus total
    token count. Every document is round-tripped through encode/decode to
    verify the tokenizer is lossless on this data.

    Args:
        jsonl_file_path: Path to a JSONL file; each line is a JSON object
            with a "text" key.
        tokenizer: Object exposing ``encode(str).ids`` and
            ``decode(list[int]) -> str`` (e.g. a ``tokenizers.Tokenizer``).

    Returns:
        float | int: UTF-8 bytes per token, or 0 when the corpus
        contributed no tokens.

    Raises:
        ValueError: If decoding the encoded document does not reproduce
            the original text (lossy tokenization).
    """
    total_utf8_length = 0
    total_tokenized_length = 0

    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            document = json.loads(line)["text"]
            # Non-string "text" values (e.g. null) are skipped silently.
            if isinstance(document, str):
                utf8_length = len(document.encode('utf-8'))
                tokenized = tokenizer.encode(document).ids
                decoded = tokenizer.decode(tokenized)
                if document != decoded:
                    print("######### [DEBUG] ##########")
                    # Printed inside a list so whitespace/escapes are visible.
                    print([document])
                    print([decoded])
                    raise ValueError("tokenizer round-trip mismatch: decoded text differs from original")
                tokenized_length = len(tokenized)

                total_utf8_length += utf8_length
                total_tokenized_length += tokenized_length

    # Guard the actual divisor: the original checked total_utf8_length,
    # which still allowed ZeroDivisionError when no tokens were produced.
    if total_tokenized_length == 0:
        compression_ratio = 0
    else:
        compression_ratio = total_utf8_length / total_tokenized_length

    print(f"Total UTF-8 length: {total_utf8_length}")
    print(f"Total tokenized length: {total_tokenized_length}")
    print(f"Compression ratio: {compression_ratio:.4f}")
    # Return the ratio so callers can use it programmatically
    # (backward compatible: previous callers ignored the None return).
    return compression_ratio

if __name__ == "__main__":
    # CLI entry point: load a tokenizer from disk and report its
    # compression ratio on the given evaluation dataset.
    cli = argparse.ArgumentParser()
    cli.add_argument("--eval_data_path", type=str, default="")
    cli.add_argument("--tokenizer_path", type=str, default="")
    opts = cli.parse_args()

    loaded_tokenizer = Tokenizer.from_file(opts.tokenizer_path)
    calculate_compression_ratio(opts.eval_data_path, loaded_tokenizer)