import tiktoken

from constants.tokens import special_tokens

# Private tokenizer instance (internal use only).
_tokenizer = tiktoken.get_encoding("cl100k_base")


# Tokenize keys and values using the internal tokenizer.
def tokenize_dataset(data):
    inputs = []
    outputs = []
    for key, value in data.items():
        inp_tokens = _tokenizer.encode(key)
        out_tokens = _tokenizer.encode(value)
        inputs.append(inp_tokens)
        outputs.append(out_tokens)
    return inputs, outputs


# Build a vocabulary mapping from token IDs, reserving the low IDs
# for special tokens.
def build_vocab(inputs, outputs):
    offset = len(special_tokens)
    all_ids = set(tok for seq in inputs + outputs for tok in seq)
    vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
    vocab.update(special_tokens)
    inv_vocab = {v: k for k, v in vocab.items()}
    return vocab, inv_vocab
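
# Minimal usage sketch. Assumptions not in the module above: the `data`
# dict is hypothetical, and `constants.tokens.special_tokens` maps
# special-token names to the IDs 0..len(special_tokens)-1, which is what
# the `offset` logic in build_vocab implies.
if __name__ == "__main__":
    data = {"hello": "world", "foo": "bar"}
    inputs, outputs = tokenize_dataset(data)
    vocab, inv_vocab = build_vocab(inputs, outputs)
    # Remap raw tiktoken IDs into the compact vocabulary space.
    remapped_inputs = [[vocab[tok] for tok in seq] for seq in inputs]
    print(remapped_inputs)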