Mini-LLM / data /raw /verify_compression_ratio.py
vishurizz11's picture
Duplicate from Ashx098/Mini-LLM
c5c7ae3
raw
history blame contribute delete
401 Bytes
"""Check the bytes-per-token compression ratio of the trained tokenizer.

Reads the corpus file, encodes it with the tokenizer loaded from
``TOKENIZER_FILE``, and prints how many UTF-8 bytes one token covers on
average.
"""
import os

TOKENIZER_FILE = "tokenizer/hf/tokenizer.json"
CORPUS_FILE = "tokenizer/corpus.txt"


def compression_ratio(text, token_ids):
    """Return the average number of UTF-8 bytes represented by one token.

    Args:
        text: The decoded corpus text that was tokenized.
        token_ids: The sequence of token ids produced for ``text``.

    Returns:
        ``len(text.encode("utf-8")) / len(token_ids)`` as a float.

    Raises:
        ValueError: If ``token_ids`` is empty, since the ratio is undefined.
    """
    num_bytes = len(text.encode("utf-8"))
    num_tokens = len(token_ids)
    if num_tokens == 0:
        raise ValueError(
            "Cannot compute a compression ratio: the tokenizer produced no tokens"
        )
    return num_bytes / num_tokens


def main():
    """Load the tokenizer and corpus, then print the compression ratio."""
    # Imported here so that compression_ratio() stays importable (and
    # testable) without loading the transformers package.
    from transformers import PreTrainedTokenizerFast

    tok = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_FILE)

    # Decode explicitly as UTF-8: the byte count below is measured in UTF-8,
    # so relying on the platform's locale encoding could skew the ratio or
    # fail to decode the file at all.
    with open(CORPUS_FILE, "r", encoding="utf-8") as f:
        text = f.read()

    # NOTE(review): encode() runs with its default arguments; if the tokenizer
    # adds special tokens they are included in the count (negligible for a
    # large corpus) -- confirm this is intended.
    ratio = compression_ratio(text, tok.encode(text))
    print("Compression ratio:", ratio)
    # Expected ratio is around 3.5 to 4.5 for a good tokenizer


if __name__ == "__main__":
    main()