# Character-level dataset preparation (train/val split over a text corpus).
import torch

# Load the raw training corpus. Path is relative to the working directory;
# assumes the file exists and is UTF-8 encoded.
with open('data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every unique character that occurs in the text,
# in sorted order so token ids are deterministic across runs.
# (sorted() accepts any iterable — the intermediate list() was redundant.)
chars = sorted(set(text))
vocab_size = len(chars)

# Bidirectional character <-> integer mappings.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encode a string into a list of integer token ids (one per character)."""
    return [stoi[c] for c in s]


def decode(l):
    """Decode a list of integer token ids back into a string."""
    return ''.join(itos[i] for i in l)


# Train/validation split: first 90% of the encoded corpus is train, rest val.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
def get_train_data():
    """Return the training split (first 90% of the encoded corpus)."""
    return train_data
def get_val_data():
    """Return the validation split (last 10% of the encoded corpus)."""
    return val_data
def get_data():
    """Return the full encoded corpus as a 1-D long tensor."""
    return data
def get_encoder():
    """Return the encoder callable (str -> list[int] of token ids)."""
    return encode
def get_decoder():
    """Return the decoder callable (list[int] of token ids -> str)."""
    return decode
def get_vocab_size():
    """Return the number of unique characters in the corpus."""
    return vocab_size