Spaces:
Sleeping
Sleeping
File size: 874 Bytes
bb2fa48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | """ Collection of tokenizers for text data. """
import string
class CharacterLevelTokenizer:
""" A simple character-level tokenizer for text data. """
def __init__(self):
""" Initializes the tokenizer by creating a vocabulary of unique characters """
self.vocab = sorted(set(string.ascii_letters + string.digits + string.punctuation + " \n"))
self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}
def encode(self, str_input):
""" encoder: take a string, output a list of integers """
return [self.token_to_id[token] for token in str_input]
def decode(self, token_ids):
""" decoder: take a list of integers, output a string """
return ''.join([self.id_to_token[token_id] for token_id in token_ids])
|