File size: 2,679 Bytes
63b0b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9aaf95c
63b0b0b
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from itertools import chain


# Hyperparameters for the data pipeline. The commented-out model-side values
# (d_model, num_heads, ...) are presumably defined with the transformer model
# elsewhere — only seq_length and batch_size are live here.
# # Hyperparameters
# d_model = 512  # Dimension of the embeddings and the token representations
seq_length = 10  # Length of the input and output sequences
# vocab_size = 1000  # Size of the vocabulary (recomputed below from the dataset's actual vocab)
batch_size = 32  # Batch size for training
# num_heads = 8  # Number of heads in multi-head attention
# dim_feedforward = 2048  # Dimension of feedforward network in encoder and decoder


# Assuming `transformer_model` and hyperparameters are defined as before

class TextDataset(Dataset):
    """Fixed-length token-index sequences cut from a whitespace-tokenized text.

    Each item is a 1-D LongTensor of ``seq_length + 1`` token indices — the
    input tokens plus one extra trailing token so a caller can form shifted
    (input, target) pairs for language modeling.
    """

    def __init__(self, text, vocab=None, seq_length=10):
        """Tokenize ``text`` and index it against a (possibly new) vocabulary.

        Args:
            text: Raw corpus; tokenized by simple whitespace split.
            vocab: Optional pre-built token -> index mapping (e.g. from a
                training split). NOTE(review): tokens absent from a supplied
                vocab raise KeyError during indexing below — confirm callers
                pass a covering vocab.
            seq_length: Number of input tokens per item (the item tensor holds
                one additional token; see __getitem__).
        """
        # Tokenization - simple split by whitespace.
        self.tokens = text.split()

        if vocab:
            self.vocab = vocab
        else:
            # Build a fresh vocabulary; the special tokens claim ids 0 and 1.
            self.vocab = {'<pad>': 0, '<eos>': 1}
            # Counter preserves first-seen order, so ids are assigned in
            # corpus order of first appearance.
            for token in Counter(self.tokens):
                self.vocab[token] = len(self.vocab)

        # Inverse mapping from indices back to tokens (for decoding output).
        self.index2token = {index: token for token, index in self.vocab.items()}

        # Index the whole corpus once, up front.
        self.indexed_tokens = [self.vocab[token] for token in self.tokens]

        self.seq_length = seq_length

    def __len__(self):
        # Each item consumes seq_length tokens PLUS one trailing token (see
        # __getitem__), so reserve that extra token when counting items.
        # Using len(...) // seq_length instead would let the final item come
        # up one token short whenever the corpus length is a multiple of
        # seq_length, yielding ragged tensors that break DataLoader batching.
        return (len(self.indexed_tokens) - 1) // self.seq_length

    def __getitem__(self, idx):
        # Slice seq_length + 1 consecutive indices; the extra token lets the
        # caller build input/target pairs shifted by one position.
        start_idx = idx * self.seq_length
        end_idx = start_idx + self.seq_length + 1
        sequence = self.indexed_tokens[start_idx:end_idx]
        return torch.tensor(sequence, dtype=torch.long)

# Load the training corpus. NOTE(review): 'tiny-shakespeare.txt' must exist in
# the current working directory; there is no fallback if it is missing.
with open('tiny-shakespeare.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Build the dataset (constructs its own vocabulary, including <pad> and <eos>).
dataset = TextDataset(text)
#vocab_size = len(list(set(dataset.tokens)))
# Derive vocab_size from the dataset's vocab so the model's embedding covers
# the two special tokens as well as every corpus token.
vocab_size = len(dataset.vocab)  # Update vocab_size for the transformer model
train_data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Function to generate padding mask for sequences
def create_padding_mask(seq, pad_token_id=None):
    """Return a boolean mask marking padding positions in ``seq``.

    Args:
        seq: LongTensor of token indices, shape (batch, seq_len).
            NOTE(review): the transpose suggests the consumer wants a
            (seq_len, batch) mask — confirm against the transformer's
            expected mask layout.
        pad_token_id: Index of the padding token. Defaults to the id of
            '<pad>' in the module-level ``dataset`` vocabulary, preserving
            the original behavior.

    Returns:
        BoolTensor of shape (seq_len, batch); True where ``seq`` is padding.
    """
    if pad_token_id is None:
        pad_token_id = dataset.vocab['<pad>']
    return (seq == pad_token_id).transpose(0, 1)

# Training loop remains the same as before