import tiktoken
from constants.tokens import special_tokens
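# NOTE: assumed shape of `special_tokens` (a hypothetical sketch, not the
# actual contents of constants/tokens.py): a dict mapping special-token
# strings to reserved low IDs, e.g. {"<pad>": 0, "<bos>": 1, "<eos>": 2}.
# build_vocab below relies on these IDs being smaller than len(special_tokens).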

# Module-private tokenizer instance shared by the helpers below.
_tokenizer = tiktoken.get_encoding("cl100k_base")

# Tokenize keys and values using the internal tokenizer.
def tokenize_dataset(data):
  inputs = []
  outputs = []
  for key, value in data.items():
    inp_tokens = _tokenizer.encode(key)
    out_tokens = _tokenizer.encode(value)
    inputs.append(inp_tokens)
    outputs.append(out_tokens)
  return inputs, outputs

# Build vocabulary mapping from token IDs and add special tokens.
def build_vocab(inputs, outputs):
  offset = len(special_tokens)
  # Collect every distinct token ID seen across inputs and outputs.
  all_ids = {tok for seq in inputs + outputs for tok in seq}
  # Remap raw token IDs to a dense range starting just after the special tokens.
  vocab = {tok: i + offset for i, tok in enumerate(sorted(all_ids))}
  # Special tokens keep their predefined IDs (assumed to lie in 0..offset-1).
  vocab.update(special_tokens)
  inv_vocab = {v: k for k, v in vocab.items()}
  return vocab, inv_vocab
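
# Usage sketch with hypothetical data (the dataset and variable names below
# are illustrative only, not part of the module's API):
if __name__ == "__main__":
  dataset = {"What is 2 + 2?": "4", "Capital of France?": "Paris"}
  inputs, outputs = tokenize_dataset(dataset)
  vocab, inv_vocab = build_vocab(inputs, outputs)
  # Remap raw tiktoken IDs into the dense vocabulary index space.
  encoded_inputs = [[vocab[tok] for tok in seq] for seq in inputs]
  print(f"vocab size: {len(vocab)}")
  print(f"first encoded input: {encoded_inputs[0]}")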