# File size: 401 Bytes | commit f4e346e
"""Measure the byte-per-token compression ratio of a trained tokenizer.

Loads the HF-format tokenizer, encodes the training corpus, and prints
bytes per token.  A good subword tokenizer typically lands around
3.5-4.5 bytes per token (see note at the bottom).
"""
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer/hf/tokenizer.json")

# Read the corpus as UTF-8 explicitly: the byte count below is UTF-8,
# so decoding with the platform's locale default would be inconsistent
# (and may fail outright on non-ASCII text).
with open("tokenizer/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

num_bytes = len(text.encode("utf-8"))
num_tokens = len(tok.encode(text))

# Guard against an empty corpus, which would otherwise divide by zero.
if num_tokens == 0:
    raise SystemExit("Corpus produced no tokens; is tokenizer/corpus.txt empty?")

ratio = num_bytes / num_tokens
print("Compression ratio:", ratio)
# Expected ratio is around 3.5 to 4.5 for a good tokenizer