import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from itertools import chain

# Hyperparameters
d_model = 512           # Dimension of the embeddings and the token representations
seq_length = 10         # Length of the input and output sequences
vocab_size = 1000       # Size of the vocabulary (updated below from the dataset)
batch_size = 32         # Batch size for training
num_heads = 8           # Number of heads in multi-head attention
dim_feedforward = 2048  # Dimension of the feedforward network in encoder and decoder

# Assuming `transformer_model` and hyperparameters are defined as before
class TextDataset(Dataset):
    def __init__(self, text, vocab=None, seq_length=seq_length):
        # Tokenization - simple split by whitespace
        self.tokens = text.split()

        # If a vocabulary is provided, use it, otherwise create a new one
        if vocab:
            self.vocab = vocab
        else:
            # Build vocabulary from the unique tokens, with added <pad> and <unk> tokens
            self.vocab = {'<pad>': 0, '<unk>': 1}
            token_counts = Counter(self.tokens)
            for token, _ in token_counts.items():
                self.vocab[token] = len(self.vocab)

        # Inverse mapping from indices to tokens
        self.index2token = {index: token for token, index in self.vocab.items()}

        # Convert tokens to indices
        self.indexed_tokens = [self.vocab[token] for token in self.tokens]

        # Sequence length
        self.seq_length = seq_length

    def __len__(self):
        # Number of tokens divided by the sequence length gives the number of sequences;
        # reserve one token so the last sequence still has a next-token target
        return (len(self.indexed_tokens) - 1) // self.seq_length

    def __getitem__(self, idx):
        # Slice the indexed_tokens to get a sequence
        start_idx = idx * self.seq_length
        end_idx = start_idx + self.seq_length + 1  # +1 so the slice also contains the next-token target
        sequence = self.indexed_tokens[start_idx:end_idx]

        # Convert to torch tensor
        return torch.tensor(sequence, dtype=torch.long)

# Load the text from a file
with open('tiny-shakespeare.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Create the dataset
dataset = TextDataset(text)

# vocab_size = len(list(set(dataset.tokens)))
vocab_size = len(dataset.vocab)  # Update vocab_size for the transformer model

train_data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Function to generate the padding mask for sequences
def create_padding_mask(seq):
    # seq: (seq_len, batch); returns a (batch, seq_len) boolean mask, True at padding positions
    return (seq == dataset.vocab['<pad>']).transpose(0, 1)

# Training loop remains the same as before
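
# --- Minimal sketch of such a training loop (illustrative, not the original's exact code) ---
# Assumptions, not taken from the code above: `transformer_model` accepts token
# indices of shape (seq_len, batch) plus an additive causal mask and a boolean
# key-padding mask, and returns logits of shape (seq_len, batch, vocab_size).
# `num_epochs`, `criterion`, and `optimizer` are hypothetical names.
num_epochs = 5
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab['<pad>'])
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=1e-4)

for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_data_loader:
        # batch: (batch_size, seq_length + 1) -> teacher-forcing input/target pairs,
        # then transpose to (seq_length, batch_size) as assumed by the model
        src = batch[:, :-1].transpose(0, 1)
        tgt = batch[:, 1:].transpose(0, 1)

        # Boolean padding mask of shape (batch_size, seq_length)
        src_padding_mask = create_padding_mask(src)

        # Causal mask so each position attends only to earlier positions
        causal_mask = nn.Transformer.generate_square_subsequent_mask(src.size(0))

        optimizer.zero_grad()
        logits = transformer_model(src, mask=causal_mask, src_key_padding_mask=src_padding_mask)

        # Flatten (seq_length, batch_size, vocab_size) vs. (seq_length, batch_size) for the loss
        loss = criterion(logits.reshape(-1, vocab_size), tgt.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: mean loss {total_loss / len(train_data_loader):.4f}")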