Initial commit

Browse files

Files changed (4) hide show

main.py +288 -0
my_checkpoint.pth.tar +3 -0
translation.pkl +3 -0
utils.py +112 -0

main.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import torch
+import torch.nn as nn
+import torch.optim as optim
+import spacy
+from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
+from torch.utils.tensorboard import SummaryWriter
+from torchtext.datasets import Multi30k
+from torchtext.data import Field, BucketIterator
+from tqdm import tqdm
+"""
+To install the spaCy pipelines loaded below, run:
+python -m spacy download en_core_web_sm
+python -m spacy download de_core_news_sm
+"""
# Load the spaCy pipelines whose tokenizers back tokenize_ger / tokenize_eng.
# German is loaded first, then English, exactly as before.
spacy_ger, spacy_eng = (
    spacy.load(pipeline_name)
    for pipeline_name in ("de_core_news_sm", "en_core_web_sm")
)
def tokenize_ger(text):
    """Return the token strings that the German spaCy tokenizer yields for ``text``."""
    german_doc = spacy_ger.tokenizer(text)
    return [token.text for token in german_doc]
def tokenize_eng(text):
    """Return the token strings that the English spaCy tokenizer yields for ``text``."""
    english_doc = spacy_eng.tokenizer(text)
    return [token.text for token in english_doc]
# torchtext fields: both languages are lower-cased and wrapped in <sos>/<eos>;
# they differ only in the tokenizer they use.
_field_options = {"lower": True, "init_token": "<sos>", "eos_token": "<eos>"}
german = Field(tokenize=tokenize_ger, **_field_options)
english = Field(tokenize=tokenize_eng, **_field_options)

# Multi30k German -> English splits, with the dataset directory given by `path`.
train_data, valid_data, test_data = Multi30k.splits(
    path="/data/multi30k",
    exts=(".de", ".en"),
    fields=(german, english),
)

# Vocabularies come from the training split only: at most 10000 entries each,
# keeping tokens that occur at least twice.
for _field in (german, english):
    _field.build_vocab(train_data, max_size=10000, min_freq=2)
class Transformer(nn.Module):
    """Sequence-to-sequence model built around ``nn.Transformer``.

    Token ids are embedded with learned word and position embeddings, passed
    through ``nn.Transformer`` and projected onto the target vocabulary.
    Tensors are sequence-first: ``forward`` unpacks both inputs as
    ``(seq_len, batch)``.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        """
        Initialize Transformer model.
        Args:
            embedding_size (int): Size of word embeddings (``d_model``).
            src_vocab_size (int): Size of source vocabulary.
            trg_vocab_size (int): Size of target vocabulary.
            src_pad_idx (int): Padding index for source language.
            num_heads (int): Number of attention heads.
            num_encoder_layers (int): Number of encoder layers.
            num_decoder_layers (int): Number of decoder layers.
            forward_expansion (int): Passed unchanged as ``dim_feedforward``,
                i.e. it is the absolute width of the feed-forward layer, not a
                multiplier of ``embedding_size``.
            dropout (float): Dropout probability.
            max_len (int): Maximum sequence length (number of position embeddings).
            device (torch.device): Stored as ``self.device`` for callers that
                read it; ``forward`` follows the device of its inputs instead.
        """
        super().__init__()
        # NOTE: sub-module creation order is kept as-is so parameter ordering
        # (and therefore saved optimizer state) stays compatible.
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
        self.device = device
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """
        Create mask to ignore padded elements in source sequence.
        Args:
            src (torch.Tensor): Source token ids, sequence-first.
        Returns:
            torch.Tensor: Boolean tensor with the first two axes of ``src``
                swapped; True where the token equals ``src_pad_idx``.
        """
        # The comparison result already lives on src's device.
        return src.transpose(0, 1) == self.src_pad_idx

    @staticmethod
    def _positions(seq_length, batch_size, position_embedding, device, side):
        """Build a (seq_length, batch_size) grid of position indices on ``device``."""
        max_len = position_embedding.num_embeddings
        if seq_length > max_len:
            raise ValueError(
                f"{side} sequence length {seq_length} exceeds max_len {max_len}"
            )
        return (
            torch.arange(seq_length, device=device)
            .unsqueeze(1)
            .expand(seq_length, batch_size)
        )

    def forward(self, src, trg):
        """
        Forward pass of the Transformer model.
        Args:
            src (torch.Tensor): Source token ids of shape (src_len, batch).
            trg (torch.Tensor): Target token ids of shape (trg_len, batch).
        Returns:
            torch.Tensor: Output of ``fc_out`` applied to the transformer
                output; the last dimension is the target vocabulary size.
        Raises:
            ValueError: If either sequence is longer than ``max_len``.
        """
        src_seq_length, src_batch = src.shape
        trg_seq_length, trg_batch = trg.shape

        src_positions = self._positions(
            src_seq_length, src_batch, self.src_position_embedding, src.device, "source"
        )
        trg_positions = self._positions(
            trg_seq_length, trg_batch, self.trg_position_embedding, trg.device, "target"
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Causal mask so each target position only attends to earlier positions.
        trg_mask = self.transformer.generate_square_subsequent_mask(
            trg_seq_length
        ).to(trg.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        return self.fc_out(out)
# Everything needed to train the Seq2Seq model is set up below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True

# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100
# Passed straight through as nn.Transformer's dim_feedforward by the model.
forward_expansion = 4
# The source language is German, so the source padding index must come from
# the German vocabulary (it was previously read from the English one).
src_pad_idx = german.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("/runs/loss_plot")
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# Padding positions in the English target do not contribute to the loss.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    # map_location lets a checkpoint written on a GPU host load on a CPU-only one.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )


def _training_state():
    """Collect the model and optimizer state dicts into a checkpoint dict."""
    return {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }


sentence = "ein pferd geht unter einer brücke neben einem boot."

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        save_checkpoint(_training_state())

    # Translate a fixed example each epoch as a qualitative progress check.
    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )
    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()

    losses = []
    for batch_idx, batch in enumerate(tqdm(train_iterator, leave=True)):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop: the decoder input is the target without its last token.
        output = model(inp_data, target[:-1, :])

        # Output is (trg_len, batch_size, output_dim) but CrossEntropyLoss wants
        # (N, classes) against (N,), so flatten the first two axes. The target
        # is shifted by one so each position predicts the next token (this also
        # drops the start token).
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        loss_value = loss.item()
        losses.append(loss_value)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Log the plain float rather than the live tensor.
        writer.add_scalar("Training loss", loss_value, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

# Checkpoints inside the loop are written before each epoch trains, so save once
# more here or the final epoch's weights are lost.
if save_model:
    save_checkpoint(_training_state())

# The loop leaves the model in train mode; switch to eval so dropout is off
# while scoring.
model.eval()
# Running on entire test data takes a while, so only a slice is scored.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")

my_checkpoint.pth.tar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:945a255991fa7679e4cf4b0fa787e9d4d23b98874a7fcfa1f00dc0d023059533
+size 236096282

translation.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a77f07362f194f052ca54bb0b0ba19627e1a43bed421e50efb9215788117b97c
+size 78710346

utils.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import torch
+import spacy
+from torchtext.data.metrics import bleu_score
+import sys
# Cache for the German spaCy pipeline so it is loaded at most once per process.
_GERMAN_NLP = None


def _get_german_nlp():
    """Load the German spaCy pipeline on first use and reuse it afterwards."""
    global _GERMAN_NLP
    if _GERMAN_NLP is None:
        import spacy

        _GERMAN_NLP = spacy.load("de_core_news_sm")
    return _GERMAN_NLP


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """
    Translate a sentence from German to English using the provided model.

    Decoding is greedy: at every step the highest-scoring token is appended
    until "<eos>" is produced or ``max_length`` tokens have been generated.
    The model's train/eval mode is not changed here; callers should put the
    model in eval mode first.

    Args:
        model (nn.Module): The translation model, called as ``model(src, trg)``.
        sentence (str or list): The input German sentence as a string or list of tokens.
        german (torchtext.data.Field): German Field (supplies init/eos tokens and vocab).
        english (torchtext.data.Field): English Field (supplies the output vocab).
        device (torch.device): Device to run the model on.
        max_length (int, optional): Maximum number of generated tokens. Defaults to 50.
    Returns:
        list: The translated English sentence as a list of tokens, without the
            start token and including "<eos>" when the model produced it.
    """
    # Lower-case everything (which is what our vocab is). spaCy is only needed,
    # and only loaded, when the input is a raw string.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in _get_german_nlp()(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in the beginning and end respectively
    tokens = [german.init_token, *tokens, german.eos_token]

    # Convert each German token to its vocabulary index; shape (src_len, 1).
    source_indices = [german.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.tensor(
        source_indices, dtype=torch.long, device=device
    ).unsqueeze(1)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]
    for _ in range(max_length):
        trg_tensor = torch.tensor(
            outputs, dtype=torch.long, device=device
        ).unsqueeze(1)
        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)
        # Only the prediction for the last target position is new.
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)
        if best_guess == eos_idx:
            break

    # Map indices back to tokens, dropping the start token.
    return [english.vocab.itos[idx] for idx in outputs[1:]]
def bleu(data, model, german, english, device):
    """
    Calculate the BLEU score for the translation model.

    Every example's ``src`` tokens are translated with ``translate_sentence``
    and compared against its ``trg`` tokens as the single reference.

    Args:
        data (torchtext.datasets): Iterable of examples exposing ``src`` and ``trg``.
        model (nn.Module): The translation model.
        german (torchtext.data.Field): German Field object for tokenization.
        english (torchtext.data.Field): English Field object for tokenization.
        device (torch.device): Device to run the model on.
    Returns:
        float: The BLEU score.
    """
    targets = []
    outputs = []
    for example in data:
        fields = vars(example)
        src = fields["src"]
        trg = fields["trg"]
        prediction = translate_sentence(model, src, german, english, device)
        # Strip the trailing <eos> only when it is really there: a translation
        # cut off at max_length ends in a real word that must be kept.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]
        targets.append([trg])
        outputs.append(prediction)
    return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """
    Save model checkpoint to file.

    When ``filename`` is a path, the checkpoint is written to a sibling
    ``<filename>.tmp`` file and then moved over the target, so an interrupted
    save does not leave a truncated checkpoint in place of the previous one.

    Args:
        state (dict): Dictionary containing model state and optimizer state.
        filename (str, optional): File path to save the checkpoint. Defaults to
            "my_checkpoint.pth.tar". Anything else ``torch.save`` accepts
            (e.g. a file-like object) is written to directly.
    """
    import os

    print("=> Saving checkpoint")
    if isinstance(filename, os.PathLike):
        filename = os.fspath(filename)
    if not isinstance(filename, str):
        # File-like objects (or bytes paths) cannot be renamed; write directly.
        torch.save(state, filename)
        return

    tmp_path = f"{filename}.tmp"
    try:
        torch.save(state, tmp_path)
        os.replace(tmp_path, filename)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up a partial file left behind by a failed save.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def load_checkpoint(checkpoint, model, optimizer=None):
    """
    Restore model (and optionally optimizer) state from a checkpoint dict.

    Args:
        checkpoint (dict): Already-loaded checkpoint with a "state_dict" entry
            and, when an optimizer is supplied, an "optimizer" entry.
        model (nn.Module): The translation model to load weights into.
        optimizer (torch.optim.Optimizer, optional): Optimizer whose state is
            restored. Pass None (the default) to load weights only, e.g. for
            inference.
    Raises:
        KeyError: If a required entry is missing from ``checkpoint``.
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint["optimizer"])