File size: 401 Bytes
f4e346e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
"""Measure a tokenizer's compression ratio (UTF-8 bytes per token) on a corpus.

Loads the trained HuggingFace fast tokenizer, encodes the corpus, and prints
bytes/tokens. Higher is better (fewer tokens per byte of text).
"""

import os  # NOTE(review): unused in this script; kept in case something external relies on it

from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer/hf/tokenizer.json")

# Read with an explicit encoding so the UTF-8 byte count below reflects the
# bytes actually decoded, regardless of the platform's default encoding.
with open("tokenizer/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

num_bytes = len(text.encode("utf-8"))
num_tokens = len(tok.encode(text))

# Guard: an empty corpus (or one that tokenizes to nothing) would otherwise
# raise ZeroDivisionError with a far less helpful message.
if num_tokens == 0:
    raise ValueError("Corpus produced zero tokens; cannot compute compression ratio.")

ratio = num_bytes / num_tokens
print("Compression ratio:", ratio)
# Expected ratio is around 3.5 to 4.5 for a good tokenizer