rellow/src/services/tokenizer.py
Rafael Camargo
chore: improve comments and remove unnecessary blank lines
94aa96e
import tiktoken
from constants.tokens import special_tokens
# Module-private tokenizer shared by the functions below.
# cl100k_base is the byte-pair encoding used by tiktoken for
# GPT-3.5/GPT-4-era models; created once at import time so every
# call reuses the same encoder instance.
_tokenizer = tiktoken.get_encoding("cl100k_base")
# Tokenize keys and values using the internal tokenizer.
def tokenize_dataset(data):
    """Encode each key/value pair of *data* into token-ID sequences.

    Args:
        data: Mapping of input text -> target text.

    Returns:
        Tuple ``(inputs, outputs)`` of parallel lists: ``inputs[i]`` is
        the token-ID list for the i-th key and ``outputs[i]`` the
        token-ID list for the corresponding value. An empty mapping
        yields ``([], [])``.
    """
    # Comprehensions replace the manual append loop (ruff PERF401).
    # Iterating the mapping and its .values() separately is safe here:
    # dict iteration order is consistent between the two passes.
    inputs = [_tokenizer.encode(key) for key in data]
    outputs = [_tokenizer.encode(value) for value in data.values()]
    return inputs, outputs
# Build vocabulary mapping from token IDs and add special tokens.
def build_vocab(inputs, outputs):
    """Build forward and inverse vocabulary mappings over all token IDs.

    Every distinct token ID appearing in *inputs* or *outputs* is
    remapped to a contiguous index starting at ``len(special_tokens)``,
    leaving the low indices reserved for the special tokens, which are
    then merged in under their own keys.

    Args:
        inputs: List of token-ID sequences (encoded keys).
        outputs: List of token-ID sequences (encoded values).

    Returns:
        Tuple ``(vocab, inv_vocab)`` where ``vocab`` maps token -> index
        and ``inv_vocab`` is the inverse (index -> token) mapping.
    """
    offset = len(special_tokens)
    # Set comprehension instead of set(generator) (ruff C401).
    all_ids = {tok for seq in inputs + outputs for tok in seq}
    vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
    # dict.update accepts a mapping directly — the original wrapped
    # special_tokens in a redundant dict comprehension (ruff C416).
    # NOTE(review): inv_vocab assumes special_tokens values fall in
    # range(offset) so they cannot collide with remapped indices — TODO confirm.
    vocab.update(special_tokens)
    inv_vocab = {v: k for k, v in vocab.items()}
    return vocab, inv_vocab