# Word-level dataset and dataloader utilities for a tiny-shakespeare transformer.
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from itertools import chain
# Hyperparameters
# d_model = 512 # Dimension of the embeddings and the token representations
seq_length = 10 # Number of tokens per training sequence
batch_size = 32 # Batch size for training
# Other transformer hyperparameters (d_model, num_heads, dim_feedforward,
# vocab_size) are expected to be defined alongside the model itself.
class TextDataset(Dataset):
    """Split whitespace-tokenized text into fixed-length index sequences.

    Each item is a 1-D LongTensor of ``seq_length + 1`` token indices so a
    caller can use the first ``seq_length`` entries as the input and the
    last ``seq_length`` as the shifted prediction target.
    """
    def __init__(self, text, vocab=None, seq_length=seq_length):
        # Tokenization - simple split by whitespace
        self.tokens = text.split()
        # Reuse a provided vocabulary (e.g. from a training split),
        # otherwise build one from this text.
        if vocab:
            self.vocab = vocab
        else:
            # Reserve indices 0/1 for the special <pad> and <eos> tokens.
            self.vocab = {'<pad>': 0, '<eos>': 1}
            # Counter preserves first-seen order, so indices are deterministic.
            for token in Counter(self.tokens):
                self.vocab[token] = len(self.vocab)
        # Inverse mapping from indices to tokens
        self.index2token = {index: token for token, index in self.vocab.items()}
        # Convert tokens to indices
        self.indexed_tokens = [self.vocab[token] for token in self.tokens]
        # Sequence length
        self.seq_length = seq_length
    def __len__(self):
        # Each item consumes seq_length + 1 tokens (see __getitem__), so one
        # extra token must be reserved.  The previous ``len // seq_length``
        # could hand the final index a short slice, which breaks the default
        # DataLoader collation with ragged tensors.
        return max(0, (len(self.indexed_tokens) - 1) // self.seq_length)
    def __getitem__(self, idx):
        # Slice seq_length + 1 consecutive indices: input plus the one-step
        # shifted target token.  (No <eos> token is appended here.)
        start_idx = idx * self.seq_length
        end_idx = start_idx + self.seq_length + 1
        sequence = self.indexed_tokens[start_idx:end_idx]
        # Convert to torch tensor
        return torch.tensor(sequence, dtype=torch.long)
# Load the raw training corpus; assumes tiny-shakespeare.txt sits in the
# working directory — TODO confirm path relative to the launch location.
with open('tiny-shakespeare.txt', 'r', encoding='utf-8') as file:
    text = file.read()
# Create the dataset (builds its own vocabulary from the corpus)
dataset = TextDataset(text)
#vocab_size = len(list(set(dataset.tokens)))
# NOTE(review): unlike the commented-out line above, len(dataset.vocab) also
# counts the <pad>/<eos> specials — that is what the embedding layer needs.
vocab_size = len(dataset.vocab) # Update vocab_size for the transformer model
# Shuffle per epoch; each batch is (batch_size, seq_length + 1) long tensors.
train_data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Function to generate padding mask for sequences
def create_padding_mask(seq, pad_idx=None):
    """Return a boolean mask that is True wherever *seq* holds the pad token.

    Args:
        seq: 2-D LongTensor of token indices.
        pad_idx: Index of the padding token.  Defaults to the module-level
            ``dataset`` vocabulary's '<pad>' entry, preserving the original
            behavior for existing callers.

    Returns:
        BoolTensor with dims 0 and 1 of *seq* swapped.
    """
    if pad_idx is None:
        # Fall back to the global dataset's vocab (original behavior).
        pad_idx = dataset.vocab['<pad>']
    return (seq == pad_idx).transpose(0, 1)
# Training loop remains the same as before