import tiktoken
from constants.tokens import special_tokens

# Module-private tokenizer instance; cl100k_base is the tiktoken encoding
# used by OpenAI's GPT-3.5/GPT-4-era models.
_tokenizer = tiktoken.get_encoding("cl100k_base")


# Tokenize keys and values using the internal tokenizer.
def tokenize_dataset(data):
    inputs = []
    outputs = []
    for key, value in data.items():
        inp_tokens = _tokenizer.encode(key)
        out_tokens = _tokenizer.encode(value)
        inputs.append(inp_tokens)
        outputs.append(out_tokens)
    return inputs, outputs
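
# For example, tokenize_dataset({"q": "a", "x": "y"}) returns two parallel
# lists: inputs[i] is the token-ID list for the i-th key and outputs[i] is
# the token-ID list for the matching value.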


# Build a dense vocabulary over the observed token IDs, shifting them past
# the special-token range so the indices [0, len(special_tokens)) stay free
# for the special tokens themselves.
def build_vocab(inputs, outputs):
    offset = len(special_tokens)
    all_ids = {tok for seq in inputs + outputs for tok in seq}
    vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
    vocab.update(special_tokens)
    inv_vocab = {v: k for k, v in vocab.items()}
    return vocab, inv_vocab
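

# Minimal usage sketch. Running it requires `tiktoken` and the project's
# `constants.tokens` module; it also assumes `special_tokens` assigns the
# IDs 0..len(special_tokens)-1, which build_vocab's offset relies on but
# does not check.
if __name__ == "__main__":
    data = {"What is 2+2?": "4", "Capital of France?": "Paris"}
    inputs, outputs = tokenize_dataset(data)
    vocab, inv_vocab = build_vocab(inputs, outputs)
    print(f"pairs: {len(inputs)}")
    print(f"vocab size: {len(vocab)} ({len(special_tokens)} special tokens)")
    # Sanity check: the inverse mapping loses entries if any vocab values
    # collide, so every entry should round-trip back to its original key.
    assert all(inv_vocab[v] == k for k, v in vocab.items())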