import random

import torch
import torch.nn as nn
import torch.nn.functional as F

# Kept for backward compatibility with code that imports it; Seq2Seq.forward
# now allocates on the device the encoder actually ran on.
DEVICE = 'cpu'


class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder.

    Args:
        vocab_size: Number of entries in the source embedding table.
        embed_dim: Size of each source token embedding.
        enc_hid_dim: Hidden size of each GRU direction.
        dec_hid_dim: Hidden size of the decoder; the final encoder states
            are projected to this size.
        dropout_p: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, embed_dim, enc_hid_dim, dec_hid_dim,
                 dropout_p=0.3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.GRU(embed_dim, enc_hid_dim, batch_first=True,
                          bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Encode a batch of token indices.

        Args:
            x: Token indices, (batch, src_seq_len).

        Returns:
            A pair ``(outputs, hidden)`` where ``outputs`` is
            (batch, src_seq_len, enc_hid_dim * 2) and ``hidden`` is
            (1, batch, dec_hid_dim).
        """
        embedded = self.dropout(self.embed(x))
        outputs, hidden = self.rnn(embedded)
        # hidden --> (2, batch, enc_hid_dim): index 0 is the forward
        # direction's final state, index 1 the backward direction's.
        both_directions = torch.cat((hidden[0], hidden[1]), dim=1)
        # Project to the decoder size and restore the leading layer axis that
        # nn.GRU expects for an initial hidden state.
        hidden = self.fc(both_directions).unsqueeze(0)
        return outputs, hidden


class Attention(nn.Module):
    """Additive attention over the encoder outputs (ReLU-activated energy).

    Args:
        enc_hid_dim: Hidden size of each encoder direction.
        dec_hid_dim: Hidden size of the decoder.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.value = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, encoder_outputs, dec_hidden):
        """Compute the attention-weighted context vector.

        Args:
            encoder_outputs: (batch, seq_len, enc_hid_dim * 2).
            dec_hidden: Current decoder state, (1, batch, dec_hid_dim).

        Returns:
            Context tensor of shape (batch, 1, enc_hid_dim * 2).
        """
        seq_len = encoder_outputs.shape[1]
        # Copy the decoder state along the time axis so it can be paired with
        # every encoder position: (batch, seq_len, dec_hid_dim).
        dec_hidden = dec_hidden.permute(1, 0, 2).repeat(1, seq_len, 1)
        concatenated = torch.cat((dec_hidden, encoder_outputs), dim=2)
        energy = F.relu(self.attn(concatenated))
        # attention --> (batch, seq_len, 1), normalised over seq_len
        attention = F.softmax(self.value(energy), dim=1)
        # attn_weights --> (batch, 1, seq_len)
        attn_weights = attention.permute(0, 2, 1)
        # context --> (batch, 1, enc_hid_dim * 2)
        context = torch.bmm(attn_weights, encoder_outputs)
        return context


class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        embed_dim: Size of each target token embedding.
        vocab_size: Number of entries in the target vocabulary.
        enc_hid_dim: Hidden size of each encoder direction.
        dec_hid_dim: Hidden size of the decoder GRU.
        attn: Module called as ``attn(encoder_outputs, hidden)`` that returns
            a (batch, 1, enc_hid_dim * 2) context tensor.
    """

    def __init__(self, embed_dim, vocab_size, enc_hid_dim, dec_hid_dim, attn):
        super().__init__()
        self.attention = attn
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + embed_dim, dec_hid_dim,
                          batch_first=True)
        self.fc = nn.Linear(dec_hid_dim, vocab_size)

    def forward(self, token, encoder_outputs, hidden):
        """Run one decoding step.

        Args:
            token: Current input token indices, (batch, 1).
            encoder_outputs: (batch, seq_len, enc_hid_dim * 2).
            hidden: Previous decoder state, (1, batch, dec_hid_dim).

        Returns:
            A pair ``(predictions, dec_hidden)`` where ``predictions`` is
            (batch, vocab_size) unnormalised scores and ``dec_hidden`` is
            (1, batch, dec_hid_dim).
        """
        embedding = self.embed(token)
        # context --> (batch, 1, enc_hid_dim * 2)
        context = self.attention(encoder_outputs, hidden)
        rnn_input = torch.cat((context, embedding), dim=2)
        dec_outputs, dec_hidden = self.rnn(rnn_input, hidden)
        # Drop the length-1 time axis before projecting to the vocabulary.
        predictions = self.fc(dec_outputs.squeeze(1))
        return predictions, dec_hidden


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes with optional teacher forcing.

    Args:
        encoder: Module returning ``(encoder_outputs, hidden)`` for a source
            batch.
        decoder: Module called as ``decoder(token, encoder_outputs, hidden)``
            that returns ``(predictions, hidden)`` and exposes
            ``fc.out_features`` as the target vocabulary size.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode ``target.shape[1] - 1`` steps conditioned on ``source``.

        Args:
            source: Source token indices, (batch, src_seq_len).
            target: Target token indices, (batch, target_len); column 0 is
                used as the first decoder input (the SOS token).
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the model's own argmax prediction.

        Returns:
            Tensor of shape (batch, target_len, target_vocab_size) holding the
            decoder scores per step. Position 0 is left as zeros because no
            prediction is made for the SOS position.
        """
        batch_size, target_len = target.shape[0], target.shape[1]
        target_vocab_size = self.decoder.fc.out_features

        encoder_outputs, hidden = self.encoder(source)

        # Allocate on the device the encoder ran on, so the result sits next
        # to the model rather than on a hard-coded global device.
        outputs = torch.zeros(batch_size, target_len, target_vocab_size,
                              device=encoder_outputs.device)

        # Grab the SOS token from the *target*: the decoder embeds target
        # vocabulary indices, so a source index may be wrong or out of range.
        x = target[:, 0]
        for time_step in range(1, target_len):
            predictions, hidden = self.decoder(x.unsqueeze(1),
                                               encoder_outputs, hidden)
            outputs[:, time_step, :] = predictions
            use_teacher = random.random() < teacher_forcing_ratio
            if use_teacher:
                x = target[:, time_step]
            else:
                x = predictions.argmax(dim=1)
        return outputs