Spaces:

ML701G7
/

taim-gan

Runtime error

App Files Files Community

Dmmc committed on Dec 5, 2022

Commit

c8ddb9b

1 Parent(s): aec1df6

three-model version

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +2 -0
app.py +117 -0
requirements.txt +5 -0
src/__init__.py +2 -0
src/__pycache__/__init__.cpython-39.pyc +0 -0
src/__pycache__/config.cpython-39.pyc +0 -0
src/config.py +47 -0
src/data/.gitkeep +0 -0
src/data/__init__.py +5 -0
src/data/__pycache__/__init__.cpython-39.pyc +0 -0
src/data/__pycache__/collate.cpython-39.pyc +0 -0
src/data/__pycache__/datasets.cpython-39.pyc +0 -0
src/data/__pycache__/tokenizer.cpython-39.pyc +0 -0
src/data/collate.py +43 -0
src/data/datasets.py +387 -0
src/data/stubs/bird.jpg +0 -0
src/data/stubs/pigeon.jpg +0 -0
src/data/stubs/rohit.jpeg +0 -0
src/data/tokenizer.py +23 -0
src/features/.gitkeep +0 -0
src/features/__init__.py +0 -0
src/features/build_features.py +0 -0
src/models/.gitkeep +0 -0
src/models/__init__.py +4 -0
src/models/__pycache__/__init__.cpython-39.pyc +0 -0
src/models/__pycache__/losses.cpython-39.pyc +0 -0
src/models/__pycache__/train_model.cpython-39.pyc +0 -0
src/models/__pycache__/utils.cpython-39.pyc +0 -0
src/models/losses.py +344 -0
src/models/modules/__init__.py +12 -0
src/models/modules/__pycache__/__init__.cpython-39.pyc +0 -0
src/models/modules/__pycache__/acm.cpython-39.pyc +0 -0
src/models/modules/__pycache__/attention.cpython-39.pyc +0 -0
src/models/modules/__pycache__/cond_augment.cpython-39.pyc +0 -0
src/models/modules/__pycache__/conv_utils.cpython-39.pyc +0 -0
src/models/modules/__pycache__/discriminator.cpython-39.pyc +0 -0
src/models/modules/__pycache__/downsample.cpython-39.pyc +0 -0
src/models/modules/__pycache__/generator.cpython-39.pyc +0 -0
src/models/modules/__pycache__/image_encoder.cpython-39.pyc +0 -0
src/models/modules/__pycache__/residual.cpython-39.pyc +0 -0
src/models/modules/__pycache__/text_encoder.cpython-39.pyc +0 -0
src/models/modules/__pycache__/upsample.cpython-39.pyc +0 -0
src/models/modules/acm.py +37 -0
src/models/modules/attention.py +88 -0
src/models/modules/cond_augment.py +57 -0
src/models/modules/conv_utils.py +78 -0
src/models/modules/discriminator.py +144 -0
src/models/modules/downsample.py +14 -0
src/models/modules/generator.py +300 -0
src/models/modules/image_encoder.py +138 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/*
2	+ .idea/*

app.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import numpy as np # this should come first to mitigate mlk-service bug
+from src.models.utils import get_image_arr, load_model
+from src.data import TAIMGANTokenizer
+from torchvision import transforms
+from src.config import config_dict
+from pathlib import Path
+from enum import IntEnum, auto
+from PIL import Image
+import gradio as gr
+import torch
+from src.models.modules import (
+    VGGEncoder,
+    InceptionEncoder,
+    TextEncoder,
+    Generator
+)
+##########
+# PARAMS #
+##########
+IMG_CHANS = 3  # RGB channels for image
+IMG_HW = 256  # height and width of images
+HIDDEN_DIM = 128  # hidden dimensions of lstm cell in one direction
+C = 2 * HIDDEN_DIM  # length of embeddings
+Ng = config_dict["Ng"]
+cond_dim = config_dict["condition_dim"]
+z_dim = config_dict["noise_dim"]
+###############
+# LOAD MODELS #
+###############
+models = {
+    "COCO": {
+        "dir": "weights/coco"
+    },
+    "Bird": {
+        "dir": "weights/bird"
+    },
+    "UTKFace": {
+        "dir": "weights/utkface"
+    }
+}
+for model_name in models:
+    # create tokenizer
+    models[model_name]["tokenizer"] = TAIMGANTokenizer(captions_path=f"{models[model_name]['dir']}/captions.pickle")
+    vocab_size = len(models[model_name]["tokenizer"].word_to_ix)
+    # instantiate models
+    models[model_name]["generator"] = Generator(Ng=Ng, D=C, conditioning_dim=cond_dim, noise_dim=z_dim).eval()
+    models[model_name]["lstm"] = TextEncoder(vocab_size=vocab_size, emb_dim=C, hidden_dim=HIDDEN_DIM).eval()
+    models[model_name]["vgg"] = VGGEncoder().eval()
+    models[model_name]["inception"] = InceptionEncoder(D=C).eval()
+    # load models
+    load_model(
+        generator=models[model_name]["generator"],
+        discriminator=None,
+        image_encoder=models[model_name]["inception"],
+        text_encoder=models[model_name]["lstm"],
+        output_dir=Path(models[model_name]["dir"]),
+        device=torch.device("cpu")
+    )
+def change_image_with_text(image: Image, text: str, model_name: str) -> Image:
+    """
+    Create an image modified by text from the original image
+    and save it with _modified postfix
+    :param gr.Image image: Path to the image
+    :param str text: Desired caption
+    """
+    global models
+    tokenizer = models[model_name]["tokenizer"]
+    G = models[model_name]["generator"]
+    lstm = models[model_name]["lstm"]
+    inception = models[model_name]["inception"]
+    vgg = models[model_name]["vgg"]
+    # generate some noise
+    noise = torch.rand(z_dim).unsqueeze(0)
+    # transform input text and get masks with embeddings
+    tokens = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
+    mask = (tokens == tokenizer.pad_token_id)
+    word_embs, sent_embs = lstm(tokens)
+    # open the image and transform it to the tensor
+    image = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Resize((IMG_HW, IMG_HW)),
+        transforms.Normalize(
+            mean=(0.5, 0.5, 0.5),
+            std=(0.5, 0.5, 0.5)
+        )
+    ])(image).unsqueeze(0)
+    # obtain visual features of the image
+    vgg_features = vgg(image)
+    local_features, global_features = inception(image)
+    # generate new image from the old one
+    fake_image, _, _ = G(noise, sent_embs, word_embs, global_features,
+                         local_features, vgg_features, mask)
+    # denormalize the image
+    fake_image = Image.fromarray(get_image_arr(fake_image)[0])
+    # return image in gradio format
+    return fake_image
+##########
+# GRADIO #
+##########
+demo = gr.Interface(
+    fn=change_image_with_text,
+    inputs=[gr.Image(type="pil"), "text", gr.inputs.Dropdown(list(models.keys()))],
+    outputs=gr.Image(type="pil")
+)
+demo.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Pillow
+torch
+torchvision
+torchaudio
+nltk

src/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Config file for the project."""
2	+ from .config import config_dict, update_config

src/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (260 Bytes). View file

src/__pycache__/config.cpython-39.pyc ADDED Viewed

Binary file (1.17 kB). View file

src/config.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""Configurations for the project."""
+from pathlib import Path
+from typing import Any, Dict
+import torch
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+repo_path = Path(__file__).parent.parent.absolute()
+output_path = repo_path / "models"
+config_dict = {
+    "Ng": 32,
+    "D": 256,
+    "condition_dim": 100,
+    "noise_dim": 100,
+    "lr_config": {
+        "disc_lr": 2e-4,
+        "gen_lr": 2e-4,
+        "img_encoder_lr": 3e-3,
+        "text_encoder_lr": 3e-3,
+    },
+    "batch_size": 64,
+    "device": device,
+    "epochs": 200,
+    "output_dir": output_path,
+    "snapshot": 5,
+    "const_dict": {
+        "smooth_val_gen": 0.999,
+        "lambda1": 1,
+        "lambda2": 1,
+        "lambda3": 1,
+        "lambda4": 1,
+        "gamma1": 4,
+        "gamma2": 5,
+        "gamma3": 10,
+    },
+}
+def update_config(cfg_dict: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
+    """
+    Function to update the configuration dictionary.
+    """
+    for key, value in kwargs.items():
+        cfg_dict[key] = value
+    return cfg_dict

src/data/.gitkeep ADDED Viewed

File without changes

src/data/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Dataset and custom collate function to load"""
+from .collate import custom_collate
+from .datasets import TextImageDataset
+from .tokenizer import TAIMGANTokenizer

src/data/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (372 Bytes). View file

src/data/__pycache__/collate.cpython-39.pyc ADDED Viewed

Binary file (1.3 kB). View file

src/data/__pycache__/datasets.cpython-39.pyc ADDED Viewed

Binary file (11.8 kB). View file

src/data/__pycache__/tokenizer.cpython-39.pyc ADDED Viewed

Binary file (1.55 kB). View file

src/data/collate.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""Custom collate function for the data loader."""
+from typing import Any, List
+import torch
+from torch.nn.utils.rnn import pad_sequence
+def custom_collate(batch: List[Any], device: Any) -> Any:
+    """
+    Custom collate function to be used in the data loader.
+    :param batch: list, with length equal to number of batches.
+    :return: processed batch of data [add padding to text, stack tensors in batch]
+    """
+    img, correct_capt, curr_class, word_labels = zip(*batch)
+    batched_img = torch.stack(img, dim=0).to(
+        device
+    )  # shape: (batch_size, 3, height, width)
+    correct_capt_len = torch.tensor(
+        [len(capt) for capt in correct_capt], dtype=torch.int64
+    ).unsqueeze(
+        1
+    )  # shape: (batch_size, 1)
+    batched_correct_capt = pad_sequence(
+        correct_capt, batch_first=True, padding_value=0
+    ).to(
+        device
+    )  # shape: (batch_size, max_seq_len)
+    batched_curr_class = torch.stack(curr_class, dim=0).to(
+        device
+    )  # shape: (batch_size, 1)
+    batched_word_labels = pad_sequence(
+        word_labels, batch_first=True, padding_value=0
+    ).to(
+        device
+    )  # shape: (batch_size, max_seq_len)
+    return (
+        batched_img,
+        batched_correct_capt,
+        correct_capt_len,
+        batched_curr_class,
+        batched_word_labels,
+    )

src/data/datasets.py ADDED Viewed

	@@ -0,0 +1,387 @@

+"""Pytorch Dataset classes for the datasets used in the project."""
+import os
+import pickle
+from collections import defaultdict
+from typing import Any
+import nltk
+import numpy as np
+import pandas as pd
+import torch
+import torchvision.transforms.functional as F
+from nltk.tokenize import RegexpTokenizer
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+class TextImageDataset(Dataset):  # type: ignore
+    """Custom PyTorch Dataset class to load Image and Text data."""
+    # pylint: disable=too-many-instance-attributes
+    # pylint: disable=too-many-locals
+    # pylint: disable=too-many-function-args
+    def __init__(
+        self, data_path: str, split: str, num_captions: int, transform: Any = None
+    ):
+        """
+        :param data_path: Path to the data directory. [i.e. can be './birds/', or './coco/]
+        :param split: 'train' or 'test' split
+        :param num_captions: number of captions present per image.
+        [For birds, this is 10, for coco, this is 5]
+        :param transform: PyTorch transform to apply to the images.
+        """
+        self.transform = transform
+        self.bound_box_map = None
+        self.file_names = self.load_filenames(data_path, split)
+        self.data_path = data_path
+        self.num_captions_per_image = num_captions
+        (
+            self.captions,
+            self.ix_to_word,
+            self.word_to_ix,
+            self.vocab_len,
+        ) = self.get_capt_and_vocab(data_path, split)
+        self.normalize = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+            ]
+        )
+        self.class_ids = self.get_class_id(data_path, split, len(self.file_names))
+        if self.data_path.endswith("birds/"):
+            self.bound_box_map = self.get_bound_box(data_path)
+        elif self.data_path.endswith("coco/"):
+            pass
+        else:
+            raise ValueError(
+                "Invalid data path. Please ensure the data [CUB/COCO] is stored in correct folders."
+            )
+    def __len__(self) -> int:
+        """Return the length of the dataset."""
+        return len(self.file_names)
+    def __getitem__(self, idx: int) -> Any:
+        """
+        Return the item at index idx.
+        :param idx: index of the item to return
+        :return img_tensor: image tensor
+        :return correct_caption: correct caption for the image [list of word indices]
+        :return curr_class_id: class id of the image
+        :return word_labels: POS_tagged word labels [1 for noun and adjective, 0 else]
+        """
+        file_name = self.file_names[idx]
+        curr_class_id = self.class_ids[idx]
+        if self.bound_box_map is not None:
+            bbox = self.bound_box_map[file_name]
+            images_dir = os.path.join(self.data_path, "CUB_200_2011/images")
+        else:
+            bbox = None
+            images_dir = os.path.join(self.data_path, "images")
+        img_path = os.path.join(images_dir, file_name + ".jpg")
+        img_tensor = self.get_image(img_path, bbox, self.transform)
+        rand_sent_idx = np.random.randint(0, self.num_captions_per_image)
+        rand_sent_idx = idx * self.num_captions_per_image + rand_sent_idx
+        correct_caption = torch.tensor(self.captions[rand_sent_idx], dtype=torch.int64)
+        num_words = len(correct_caption)
+        capt_token_list = []
+        for i in range(num_words):
+            capt_token_list.append(self.ix_to_word[correct_caption[i].item()])
+        pos_tag_list = nltk.tag.pos_tag(capt_token_list)
+        word_labels = []
+        for pos_tag in pos_tag_list:
+            if (
+                "NN" in pos_tag[1] or "JJ" in pos_tag[1]
+            ):  # check for Nouns and Adjective only
+                word_labels.append(1)
+            else:
+                word_labels.append(0)
+        word_labels = torch.tensor(word_labels).float()  # type: ignore
+        curr_class_id = torch.tensor(curr_class_id, dtype=torch.int64).unsqueeze(0)
+        return (
+            img_tensor,
+            correct_caption,
+            curr_class_id,
+            word_labels,
+        )
+    def get_capt_and_vocab(self, data_dir: str, split: str) -> Any:
+        """
+        Helper function to get the captions, vocab dict for each image.
+        :param data_dir: path to the data directory [i.e. './birds/' or './coco/']
+        :param split: 'train' or 'test' split
+        :return captions: list of all captions for each image
+        :return ix_to_word: dictionary mapping index to word
+        :return word_to_ix: dictionary mapping word to index
+        :return num_words: number of unique words in the vocabulary
+        """
+        captions_ckpt_path = os.path.join(data_dir, "stubs/captions.pickle")
+        if os.path.exists(
+            captions_ckpt_path
+        ):  # check if previously processed captions exist
+            with open(captions_ckpt_path, "rb") as ckpt_file:
+                captions = pickle.load(ckpt_file)
+                train_captions, test_captions = captions[0], captions[1]
+                ix_to_word, word_to_ix = captions[2], captions[3]
+                num_words = len(ix_to_word)
+                del captions
+                if split == "train":
+                    return train_captions, ix_to_word, word_to_ix, num_words
+                return test_captions, ix_to_word, word_to_ix, num_words
+        else:  # if not, process the captions and save them
+            train_files = self.load_filenames(data_dir, "train")
+            test_files = self.load_filenames(data_dir, "test")
+            train_captions_tokenized = self.get_tokenized_captions(
+                data_dir, train_files
+            )
+            test_captions_tokenized = self.get_tokenized_captions(
+                data_dir, test_files
+            )  # we need both train and test captions to build the vocab
+            (
+                train_captions,
+                test_captions,
+                ix_to_word,
+                word_to_ix,
+                num_words,
+            ) = self.build_vocab(  # type: ignore
+                train_captions_tokenized, test_captions_tokenized, split
+            )
+            vocab_list = [train_captions, test_captions, ix_to_word, word_to_ix]
+            with open(captions_ckpt_path, "wb") as ckpt_file:
+                pickle.dump(vocab_list, ckpt_file)
+            if split == "train":
+                return train_captions, ix_to_word, word_to_ix, num_words
+            if split == "test":
+                return test_captions, ix_to_word, word_to_ix, num_words
+            raise ValueError("Invalid split. Please use 'train' or 'test'")
+    def build_vocab(
+        self, tokenized_captions_train: list, tokenized_captions_test: list  # type: ignore
+    ) -> Any:
+        """
+        Helper function which builds the vocab dicts.
+        :param tokenized_captions_train: list containing all the
+        train tokenized captions in the dataset. This is list of lists.
+        :param tokenized_captions_test: list containing all the
+        test tokenized captions in the dataset. This is list of lists.
+        :return train_captions_int: list of all captions in training,
+        where each word is replaced by its index in the vocab
+        :return test_captions_int: list of all captions in test,
+        where each word is replaced by its index in the vocab
+        :return ix_to_word: dictionary mapping index to word
+        :return word_to_ix: dictionary mapping word to index
+        :return num_words: number of unique words in the vocabulary
+        """
+        vocab = defaultdict(int)  # type: ignore
+        total_captions = tokenized_captions_train + tokenized_captions_test
+        for caption in total_captions:
+            for word in caption:
+                vocab[word] += 1
+        # sort vocab dict by frequency in descending order
+        vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)  # type: ignore
+        ix_to_word = {}
+        word_to_ix = {}
+        ix_to_word[0] = "<end>"
+        word_to_ix["<end>"] = 0
+        word_idx = 1
+        for word, _ in vocab:
+            word_to_ix[word] = word_idx
+            ix_to_word[word_idx] = word
+            word_idx += 1
+        train_captions_int = []  # we want to convert words to indices in vocab.
+        for caption in tokenized_captions_train:
+            curr_caption_int = []
+            for word in caption:
+                curr_caption_int.append(word_to_ix[word])
+            train_captions_int.append(curr_caption_int)
+        test_captions_int = []
+        for caption in tokenized_captions_test:
+            curr_caption_int = []
+            for word in caption:
+                curr_caption_int.append(word_to_ix[word])
+            test_captions_int.append(curr_caption_int)
+        return (
+            train_captions_int,
+            test_captions_int,
+            ix_to_word,
+            word_to_ix,
+            len(ix_to_word),
+        )
+    def get_tokenized_captions(self, data_dir: str, filenames: list) -> Any:  # type: ignore
+        """
+        Helper function to tokenize and return captions for each image in filenames.
+        :param data_dir: path to the data directory [i.e. './birds/' or './coco/']
+        :param filenames: list of all filenames corresponding to the split
+        :return tokenized_captions: list of all tokenized captions for all files in filenames.
+        [this returns a list, where each element is again a list of tokens/words]
+        """
+        all_captions = []
+        for filename in filenames:
+            caption_path = os.path.join(data_dir, "text", filename + ".txt")
+            with open(caption_path, "r", encoding="utf8") as txt_file:
+                captions = txt_file.readlines()
+                count = 0
+                for caption in captions:
+                    if len(caption) == 0:
+                        continue
+                    caption = caption.replace("\ufffd\ufffd", " ")
+                    tokenizer = RegexpTokenizer(r"\w+")
+                    tokens = tokenizer.tokenize(
+                        caption.lower()
+                    )  # splits current caption/line to list of words/tokens
+                    if len(tokens) == 0:
+                        continue
+                    tokens = [
+                        t.encode("ascii", "ignore").decode("ascii") for t in tokens
+                    ]
+                    tokens = [t for t in tokens if len(t) > 0]
+                    all_captions.append(tokens)
+                    count += 1
+                    if count == self.num_captions_per_image:
+                        break
+                    if count < self.num_captions_per_image:
+                        raise ValueError(
+                            f"Number of captions for {filename} is only {count},\
+                                which is less than {self.num_captions_per_image}."
+                        )
+        return all_captions
+    def get_image(self, img_path: str, bbox: list, transform: Any) -> Any:  # type: ignore
+        """
+        Helper function to load and transform an image.
+        :param img_path: path to the image
+        :param bbox: bounding box coordinates [x, y, width, height]
+        :param transform: PyTorch transform to apply to the image
+        :return img_tensor: transformed image tensor
+        """
+        img = Image.open(img_path).convert("RGB")
+        width, height = img.size
+        if bbox is not None:
+            r_val = int(np.maximum(bbox[2], bbox[3]) * 0.75)
+            center_x = int((2 * bbox[0] + bbox[2]) / 2)
+            center_y = int((2 * bbox[1] + bbox[3]) / 2)
+            y1_coord = np.maximum(0, center_y - r_val)
+            y2_coord = np.minimum(height, center_y + r_val)
+            x1_coord = np.maximum(0, center_x - r_val)
+            x2_coord = np.minimum(width, center_x + r_val)
+            img = img.crop(
+                [x1_coord, y1_coord, x2_coord, y2_coord]
+            )  # This preprocessing steps seems to follow from
+            # Stackgan: Text to photo-realistic image synthesis
+        if transform is not None:
+            img_tensor = transform(img)  # this scales to 304x304, i.e. 256 x (76/64).
+            x_val = np.random.randint(0, 48)  # 304 - 256 = 48
+            y_val = np.random.randint(0, 48)
+            flip = np.random.rand() > 0.5
+            # crop
+            img_tensor = img_tensor.crop(
+                [x_val, y_val, x_val + 256, y_val + 256]
+            )  # this crops to 256x256
+            if flip:
+                img_tensor = F.hflip(img_tensor)
+        img_tensor = self.normalize(img_tensor)
+        return img_tensor
+    def load_filenames(self, data_dir: str, split: str) -> Any:
+        """
+        Helper function to get list of all image filenames.
+        :param data_dir: path to the data directory [i.e. './birds/' or './coco/']
+        :param split: 'train' or 'test' split
+        :return filenames: list of all image filenames
+        """
+        filepath = f"{data_dir}{split}/filenames.pickle"
+        if os.path.isfile(filepath):
+            with open(filepath, "rb") as pick_file:
+                filenames = pickle.load(pick_file)
+        else:
+            raise ValueError(
+                "Invalid split. Please use 'train' or 'test',\
+                     or make sure the filenames.pickle file exists."
+            )
+        return filenames
+    def get_class_id(self, data_dir: str, split: str, total_elems: int) -> Any:
+        """
+        Helper function to get list of all image class ids.
+        :param data_dir: path to the data directory [i.e. './birds/' or './coco/']
+        :param split: 'train' or 'test' split
+        :param total_elems: total number of elements in the dataset
+        :return class_ids: list of all image class ids
+        """
+        filepath = f"{data_dir}{split}/class_info.pickle"
+        if os.path.isfile(filepath):
+            with open(filepath, "rb") as class_file:
+                class_ids = pickle.load(class_file, encoding="latin1")
+        else:
+            class_ids = np.arange(total_elems)
+        return class_ids
+    def get_bound_box(self, data_path: str) -> Any:
+        """
+        Helper function to get the bounding box for birds dataset.
+        :param data_path: path to birds data directory [i.e. './data/birds/']
+        :return imageToBox: dictionary mapping image name to bounding box coordinates
+        """
+        bbox_path = os.path.join(data_path, "CUB_200_2011/bounding_boxes.txt")
+        df_bounding_boxes = pd.read_csv(
+            bbox_path, delim_whitespace=True, header=None
+        ).astype(int)
+        filepath = os.path.join(data_path, "CUB_200_2011/images.txt")
+        df_filenames = pd.read_csv(filepath, delim_whitespace=True, header=None)
+        filenames = df_filenames[
+            1
+        ].tolist()  # df_filenames[0] just contains the index or ID.
+        img_to_box = {  # type: ignore
+            img_file[:-4]: [] for img_file in filenames
+        }  # remove the .jpg extension from the names
+        num_imgs = len(filenames)
+        for i in range(0, num_imgs):
+            bbox = df_bounding_boxes.iloc[i][1:].tolist()
+            key = filenames[i][:-4]
+            img_to_box[key] = bbox
+        return img_to_box

src/data/stubs/bird.jpg ADDED Viewed

src/data/stubs/pigeon.jpg ADDED Viewed

src/data/stubs/rohit.jpeg ADDED Viewed

src/data/tokenizer.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import pickle
+import re
+from typing import List
+class TAIMGANTokenizer:
+    def __init__(self, captions_path):
+        with open(captions_path, "rb") as ckpt_file:
+            captions = pickle.load(ckpt_file)
+            self.ix_to_word = captions[2]
+            self.word_to_ix = captions[3]
+        self.token_regex = r'\w+'
+        self.pad_token_id = self.word_to_ix["<end>"]
+        self.pad_repr = "[PAD]"
+    def encode(self, text: str) -> List[int]:
+        return [self.word_to_ix.get(word, self.pad_token_id)
+                for word in re.findall(self.token_regex, text.lower())]
+    def decode(self, tokens: List[int]) -> str:
+        return ' '.join([self.ix_to_word[token]
+                         if token != self.pad_token_id else self.pad_repr
+                         for token in tokens])

src/features/.gitkeep ADDED Viewed

File without changes

src/features/__init__.py ADDED Viewed

File without changes

src/features/build_features.py ADDED Viewed

File without changes

src/models/.gitkeep ADDED Viewed

File without changes

src/models/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+"""Helper functions for training loop."""
+from .losses import discriminator_loss, generator_loss, kl_loss
+from .train_model import train
+from .utils import copy_gen_params, define_optimizers, load_params, prepare_labels

src/models/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (461 Bytes). View file

src/models/__pycache__/losses.cpython-39.pyc ADDED Viewed

Binary file (8.36 kB). View file

src/models/__pycache__/train_model.cpython-39.pyc ADDED Viewed

Binary file (3.82 kB). View file

src/models/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (8.76 kB). View file

src/models/losses.py ADDED Viewed

	@@ -0,0 +1,344 @@

+"""Module containing the loss functions for the GANs."""
+from typing import Any, Dict
+import torch
+from torch import nn
+# pylint: disable=too-many-arguments
+# pylint: disable=too-many-locals
+def generator_loss(
+    logits: Dict[str, Dict[str, torch.Tensor]],
+    local_fake_incept_feat: torch.Tensor,
+    global_fake_incept_feat: torch.Tensor,
+    real_labels: torch.Tensor,
+    words_emb: torch.Tensor,
+    sent_emb: torch.Tensor,
+    match_labels: torch.Tensor,
+    cap_lens: torch.Tensor,
+    class_ids: torch.Tensor,
+    real_vgg_feat: torch.Tensor,
+    fake_vgg_feat: torch.Tensor,
+    const_dict: Dict[str, float],
+) -> Any:
+    """Calculate the loss for the generator.
+    Args:
+        logits: Dictionary with fake/real and word-level/uncond/cond logits
+        local_fake_incept_feat: The local inception features for the fake images.
+        global_fake_incept_feat: The global inception features for the fake images.
+        real_labels: Label for "real" image as predicted by discriminator,
+        this is a tensor of ones. [shape: (batch_size, 1)].
+        word_labels: POS tagged word labels for the captions. [shape: (batch_size, L)]
+        words_emb: The embeddings for all the words in the captions.
+        shape: (batch_size, embedding_size, max_caption_length)
+        sent_emb: The embeddings for the sentences.
+        shape: (batch_size, embedding_size)
+        match_labels: Tensor of shape: (batch_size, 1).
+        This is of the form torch.tensor([0, 1, 2, ..., batch-1])
+        cap_lens: The length of the 'actual' captions in the batch [without padding]
+        shape: (batch_size, 1)
+        class_ids: The class ids for the instance. shape: (batch_size, 1)
+        real_vgg_feat: The vgg features for the real images. shape: (batch_size, 128, 128, 128)
+        fake_vgg_feat: The vgg features for the fake images. shape: (batch_size, 128, 128, 128)
+        const_dict: The dictionary containing the constants.
+    """
+    lambda1 = const_dict["lambda1"]
+    total_error_g = 0.0
+    cond_logits = logits["fake"]["cond"]
+    cond_err_g = nn.BCEWithLogitsLoss()(cond_logits, real_labels)
+    uncond_logits = logits["fake"]["uncond"]
+    uncond_err_g = nn.BCEWithLogitsLoss()(uncond_logits, real_labels)
+    # add up the conditional and unconditional losses
+    loss_g = cond_err_g + uncond_err_g
+    total_error_g += loss_g
+    # DAMSM Loss from attnGAN.
+    loss_damsm = damsm_loss(
+        local_fake_incept_feat,
+        global_fake_incept_feat,
+        words_emb,
+        sent_emb,
+        match_labels,
+        cap_lens,
+        class_ids,
+        const_dict,
+    )
+    total_error_g += loss_damsm
+    loss_per = 0.5 * nn.MSELoss()(real_vgg_feat, fake_vgg_feat)  # perceptual loss
+    total_error_g += lambda1 * loss_per
+    return total_error_g
+def damsm_loss(
+    local_incept_feat: torch.Tensor,
+    global_incept_feat: torch.Tensor,
+    words_emb: torch.Tensor,
+    sent_emb: torch.Tensor,
+    match_labels: torch.Tensor,
+    cap_lens: torch.Tensor,
+    class_ids: torch.Tensor,
+    const_dict: Dict[str, float],
+) -> Any:
+    """Calculate the DAMSM loss from the attnGAN paper.
+    Args:
+        local_incept_feat: The local inception features. [shape: (batch, D, 17, 17)]
+        global_incept_feat: The global inception features. [shape: (batch, D)]
+        words_emb: The embeddings for all the words in the captions.
+        shape: (batch, D, max_caption_length)
+        sent_emb: The embeddings for the sentences. shape: (batch_size, D)
+        match_labels: Tensor of shape: (batch_size, 1).
+        This is of the form torch.tensor([0, 1, 2, ..., batch-1])
+        cap_lens: The length of the 'actual' captions in the batch [without padding]
+        shape: (batch_size, 1)
+        class_ids: The class ids for the instance. shape: (batch, 1)
+        const_dict: The dictionary containing the constants.
+    """
+    batch_size = match_labels.size(0)
+    # Mask mis-match samples, that come from the same class as the real sample
+    masks = []
+    match_scores = []
+    gamma1 = const_dict["gamma1"]
+    gamma2 = const_dict["gamma2"]
+    gamma3 = const_dict["gamma3"]
+    lambda3 = const_dict["lambda3"]
+    for i in range(batch_size):
+        mask = (class_ids == class_ids[i]).int()
+        # This ensures that "correct class" index is not included in the mask.
+        mask[i] = 0
+        masks.append(mask.reshape(1, -1))  # shape: (1, batch)
+        numb_words = int(cap_lens[i])
+        # shape: (1, D, L), this picks the caption at ith batch index.
+        query_words = words_emb[i, :, :numb_words].unsqueeze(0)
+        # shape: (batch, D, L), this expands the same caption for all batch indices.
+        query_words = query_words.repeat(batch_size, 1, 1)
+        c_i = compute_region_context_vector(
+            local_incept_feat, query_words, gamma1
+        )  # Taken from attnGAN paper. shape: (batch, D, L)
+        query_words = query_words.transpose(1, 2)  # shape: (batch, L, D)
+        c_i = c_i.transpose(1, 2)  # shape: (batch, L, D)
+        query_words = query_words.reshape(
+            batch_size * numb_words, -1
+        )  # shape: (batch * L, D)
+        c_i = c_i.reshape(batch_size * numb_words, -1)  # shape: (batch * L, D)
+        r_i = compute_relevance(
+            c_i, query_words
+        )  # cosine similarity, or R(c_i, e_i) from attnGAN paper. shape: (batch * L, 1)
+        r_i = r_i.view(batch_size, numb_words)  # shape: (batch, L)
+        r_i = torch.exp(r_i * gamma2)  # shape: (batch, L)
+        r_i = r_i.sum(dim=1, keepdim=True)  # shape: (batch, 1)
+        r_i = torch.log(
+            r_i
+        )  # This is image-text matching score b/w whole image and caption, shape: (batch, 1)
+        match_scores.append(r_i)
+    masks = torch.cat(masks, dim=0).bool()  # type: ignore
+    match_scores = torch.cat(match_scores, dim=1)  # type: ignore
+    # This corresponds to P(D|Q) from attnGAN.
+    match_scores = gamma3 * match_scores  # type: ignore
+    match_scores.data.masked_fill_(  # type: ignore
+        masks, -float("inf")
+    )  # mask out the scores for mis-matched samples
+    match_scores_t = match_scores.transpose(  # type: ignore
+        0, 1
+    )  # This corresponds to P(Q|D) from attnGAN.
+    # This corresponds to L1_w from attnGAN.
+    l1_w = nn.CrossEntropyLoss()(match_scores, match_labels)
+    # This corresponds to L2_w from attnGAN.
+    l2_w = nn.CrossEntropyLoss()(match_scores_t, match_labels)
+    incept_feat_norm = torch.linalg.norm(global_incept_feat, dim=1)
+    sent_emb_norm = torch.linalg.norm(sent_emb, dim=1)
+    # shape: (batch, batch)
+    global_match_score = global_incept_feat @ (sent_emb.T)
+    global_match_score = (
+        global_match_score / torch.outer(incept_feat_norm, sent_emb_norm)
+    ).clamp(min=1e-8)
+    global_match_score = gamma3 * global_match_score
+    # mask out the scores for mis-matched samples
+    global_match_score.data.masked_fill_(masks, -float("inf"))  # type: ignore
+    global_match_t = global_match_score.T  # shape: (batch, batch)
+    # This corresponds to L1_s from attnGAN.
+    l1_s = nn.CrossEntropyLoss()(global_match_score, match_labels)
+    # This corresponds to L2_s from attnGAN.
+    l2_s = nn.CrossEntropyLoss()(global_match_t, match_labels)
+    loss_damsm = lambda3 * (l1_w + l2_w + l1_s + l2_s)
+    return loss_damsm
+def compute_relevance(c_i: torch.Tensor, query_words: torch.Tensor) -> Any:
+    """Computes the cosine similarity between the region context vector and the query words.
+    Args:
+        c_i: The region context vector. shape: (batch * L, D)
+        query_words: The query words. shape: (batch * L, D)
+    """
+    prod = c_i * query_words  # shape: (batch * L, D)
+    numr = torch.sum(prod, dim=1)  # shape: (batch * L, 1)
+    norm_c = torch.linalg.norm(c_i, ord=2, dim=1)
+    norm_q = torch.linalg.norm(query_words, ord=2, dim=1)
+    denr = norm_c * norm_q
+    r_i = (numr / denr).clamp(min=1e-8).squeeze()  # shape: (batch * L, 1)
+    return r_i
+def compute_region_context_vector(
+    local_incept_feat: torch.Tensor, query_words: torch.Tensor, gamma1: float
+) -> Any:
+    """Compute the region context vector (c_i) from attnGAN paper.
+    Args:
+        local_incept_feat: The local inception features. [shape: (batch, D, 17, 17)]
+        query_words: The embeddings for all the words in the captions. shape: (batch, D, L)
+        gamma1: The gamma1 value from attnGAN paper.
+    """
+    batch, L = query_words.size(0), query_words.size(2)  # pylint: disable=invalid-name
+    feat_height, feat_width = local_incept_feat.size(2), local_incept_feat.size(3)
+    N = feat_height * feat_width  # pylint: disable=invalid-name
+    # Reshape the local inception features to (batch, D, N)
+    local_incept_feat = local_incept_feat.view(batch, -1, N)
+    # shape: (batch, N, D)
+    incept_feat_t = local_incept_feat.transpose(1, 2)
+    sim_matrix = incept_feat_t @ query_words  # shape: (batch, N, L)
+    sim_matrix = sim_matrix.view(batch * N, L)  # shape: (batch * N, L)
+    sim_matrix = nn.Softmax(dim=1)(sim_matrix)  # shape: (batch * N, L)
+    sim_matrix = sim_matrix.view(batch, N, L)  # shape: (batch, N, L)
+    sim_matrix = torch.transpose(sim_matrix, 1, 2)  # shape: (batch, L, N)
+    sim_matrix = sim_matrix.reshape(batch * L, N)  # shape: (batch * L, N)
+    alpha_j = gamma1 * sim_matrix  # shape: (batch * L, N)
+    alpha_j = nn.Softmax(dim=1)(alpha_j)  # shape: (batch * L, N)
+    alpha_j = alpha_j.view(batch, L, N)  # shape: (batch, L, N)
+    alpha_j_t = torch.transpose(alpha_j, 1, 2)  # shape: (batch, N, L)
+    c_i = (
+        local_incept_feat @ alpha_j_t
+    )  # shape: (batch, D, L) [summing over N dimension in paper, so we multiply like this]
+    return c_i
+def discriminator_loss(
+    logits: Dict[str, Dict[str, torch.Tensor]],
+    labels: Dict[str, Dict[str, torch.Tensor]],
+) -> Any:
+    """
+    Calculate discriminator objective
+    :param dict[str, dict[str, torch.Tensor]] logits:
+        Dictionary with fake/real and word-level/uncond/cond logits
+        Example:
+        logits = {
+            "fake": {
+                "word_level": torch.Tensor (BxL)
+                "uncond": torch.Tensor (Bx1)
+                "cond": torch.Tensor (Bx1)
+            },
+            "real": {
+                "word_level": torch.Tensor (BxL)
+                "uncond": torch.Tensor (Bx1)
+                "cond": torch.Tensor (Bx1)
+            },
+        }
+    :param dict[str, dict[str, torch.Tensor]] labels:
+        Dictionary with fake/real and word-level/image labels
+        Example:
+        labels = {
+            "fake": {
+                "word_level": torch.Tensor (BxL)
+                "image": torch.Tensor (Bx1)
+            },
+            "real": {
+                "word_level": torch.Tensor (BxL)
+                "image": torch.Tensor (Bx1)
+            },
+        }
+    :param float lambda_4: Hyperparameter for word loss in paper
+    :return: Discriminator objective loss
+    :rtype: Any
+    """
+    # define main loss functions for logit losses
+    tot_loss = 0.0
+    bce_logits = nn.BCEWithLogitsLoss()
+    bce = nn.BCELoss()
+    # calculate word-level loss
+    word_loss = bce(logits["real"]["word_level"], labels["real"]["word_level"])
+    # calculate unconditional adversarial loss
+    uncond_loss = bce_logits(logits["real"]["uncond"], labels["real"]["image"])
+    # calculate conditional adversarial loss
+    cond_loss = bce_logits(logits["real"]["cond"], labels["real"]["image"])
+    tot_loss = (uncond_loss + cond_loss) / 2.0
+    fake_uncond_loss = bce_logits(logits["fake"]["uncond"], labels["fake"]["image"])
+    fake_cond_loss = bce_logits(logits["fake"]["cond"], labels["fake"]["image"])
+    tot_loss += (fake_uncond_loss + fake_cond_loss) / 3.0
+    tot_loss += word_loss
+    return tot_loss
+def kl_loss(mu_tensor: torch.Tensor, logvar: torch.Tensor) -> Any:
+    """
+    Calculate KL loss
+    :param torch.Tensor mu_tensor: Mean of latent distribution
+    :param torch.Tensor logvar: Log variance of latent distribution
+    :return: KL loss [-0.5 * (1 + log(sigma) - mu^2 - sigma^2)]
+    :rtype: Any
+    """
+    return torch.mean(-0.5 * (1 + 0.5 * logvar - mu_tensor.pow(2) - torch.exp(logvar)))

src/models/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""All the modules used in creation of Generator and Discriminator"""
+from .acm import ACM
+from .attention import ChannelWiseAttention, SpatialAttention
+from .cond_augment import CondAugmentation
+from .conv_utils import calc_out_conv, conv1d, conv2d
+from .discriminator import Discriminator, WordLevelLogits
+from .downsample import down_sample
+from .generator import Generator
+from .image_encoder import InceptionEncoder, VGGEncoder
+from .residual import ResidualBlock
+from .text_encoder import TextEncoder
+from .upsample import img_up_block, up_sample

src/models/modules/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (891 Bytes). View file

src/models/modules/__pycache__/acm.cpython-39.pyc ADDED Viewed

Binary file (1.66 kB). View file

src/models/modules/__pycache__/attention.cpython-39.pyc ADDED Viewed

Binary file (3.38 kB). View file

src/models/modules/__pycache__/cond_augment.cpython-39.pyc ADDED Viewed

Binary file (2.52 kB). View file

src/models/modules/__pycache__/conv_utils.cpython-39.pyc ADDED Viewed

Binary file (2.37 kB). View file

src/models/modules/__pycache__/discriminator.cpython-39.pyc ADDED Viewed

Binary file (5.1 kB). View file

src/models/modules/__pycache__/downsample.cpython-39.pyc ADDED Viewed

Binary file (598 Bytes). View file

src/models/modules/__pycache__/generator.cpython-39.pyc ADDED Viewed

Binary file (9.03 kB). View file

src/models/modules/__pycache__/image_encoder.cpython-39.pyc ADDED Viewed

Binary file (4.27 kB). View file

src/models/modules/__pycache__/residual.cpython-39.pyc ADDED Viewed

Binary file (1.31 kB). View file

src/models/modules/__pycache__/text_encoder.cpython-39.pyc ADDED Viewed

Binary file (1.92 kB). View file

src/models/modules/__pycache__/upsample.cpython-39.pyc ADDED Viewed

Binary file (983 Bytes). View file

src/models/modules/acm.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""ACM and its variations"""
+from typing import Any
+import torch
+from torch import nn
+from .conv_utils import conv2d
+class ACM(nn.Module):
+    """Affine Combination Module from ManiGAN"""
+    def __init__(self, img_chans: int, text_chans: int, inner_dim: int = 64) -> None:
+        """
+        Initialize the convolutional layers
+        :param int img_chans: Channels in visual input
+        :param int text_chans: Channels of textual input
+        :param int inner_dim: Hyperparameters for inner dimensionality of features
+        """
+        super().__init__()
+        self.conv = conv2d(in_channels=img_chans, out_channels=inner_dim)
+        self.weights = conv2d(in_channels=inner_dim, out_channels=text_chans)
+        self.biases = conv2d(in_channels=inner_dim, out_channels=text_chans)
+    def forward(self, text: torch.Tensor, img: torch.Tensor) -> Any:
+        """
+        Propagate the textual and visual input through the ACM module
+        :param torch.Tensor text: Textual input (can be hidden features)
+        :param torch.Tensor img: Image input
+        :return: Affine combination of text and image
+        :rtype: torch.Tensor
+        """
+        img_features = self.conv(img)
+        return text * self.weights(img_features) + self.biases(img_features)

src/models/modules/attention.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""Attention modules"""
+from typing import Any, Optional
+import torch
+from torch import nn
+from src.models.modules.conv_utils import conv1d
+class ChannelWiseAttention(nn.Module):
+    """ChannelWise attention adapted from ControlGAN"""
+    def __init__(self, fm_size: int, text_d: int) -> None:
+        """
+        Initialize the Channel-Wise attention module
+        :param int fm_size:
+            Height and width of feature map on k-th iteration of forward-pass.
+            In paper, it's H_k * W_k
+        :param int text_d: Dimensionality of sentence. From paper, it's D
+        """
+        super().__init__()
+        # perception layer
+        self.text_conv = conv1d(text_d, fm_size)
+        # attention across channel dimension
+        self.softmax = nn.Softmax(2)
+    def forward(self, v_k: torch.Tensor, w_text: torch.Tensor) -> Any:
+        """
+        Apply attention to visual features taking into account features of words
+        :param torch.Tensor v_k: Visual context
+        :param torch.Tensor w_text: Textual features
+        :return: Fused hidden visual features and word features
+        :rtype: Any
+        """
+        w_hat = self.text_conv(w_text)
+        m_k = v_k @ w_hat
+        a_k = self.softmax(m_k)
+        w_hat = torch.transpose(w_hat, 1, 2)
+        return a_k @ w_hat
+class SpatialAttention(nn.Module):
+    """Spatial attention module for attending textual context to visual features"""
+    def __init__(self, d: int, d_hat: int) -> None:
+        """
+        Set up softmax and conv layers
+        :param int d: Initial embedding size for textual features. D from paper
+        :param int d_hat: Height of image feature map. D_hat from paper
+        """
+        super().__init__()
+        self.softmax = nn.Softmax(2)
+        self.conv = conv1d(d, d_hat)
+    def forward(
+        self,
+        text_context: torch.Tensor,
+        image: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> Any:
+        """
+        Project image features into the latent space
+        of textual features and apply attention
+        :param torch.Tensor text_context: D x T tensor of hidden textual features
+        :param torch.Tensor image: D_hat x N visual features
+        :param Optional[torch.Tensor] mask:
+            Boolean tensor for masking the padded words. BxL
+        :return: Word features attended by visual features
+        :rtype: Any
+        """
+        # number of features on image feature map H * W
+        feature_num = image.size(2)
+        # number of words in caption
+        len_caption = text_context.size(2)
+        text_context = self.conv(text_context)
+        image = torch.transpose(image, 1, 2)
+        s_i_j = image @ text_context
+        if mask is not None:
+            # duplicating mask and aligning dims with s_i_j
+            mask = mask.repeat(1, feature_num).view(-1, feature_num, len_caption)
+            s_i_j[mask] = -float("inf")
+        b_i_j = self.softmax(s_i_j)
+        c_i_j = b_i_j @ torch.transpose(text_context, 1, 2)
+        return torch.transpose(c_i_j, 1, 2)

src/models/modules/cond_augment.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""Conditioning Augmentation Module"""
+from typing import Any
+import torch
+from torch import nn
+class CondAugmentation(nn.Module):
+    """Conditioning Augmentation Module"""
+    def __init__(self, D: int, conditioning_dim: int):
+        """
+        :param D: Dimension of the text embedding space [D from AttnGAN paper]
+        :param conditioning_dim: Dimension of the conditioning space
+        """
+        super().__init__()
+        self.cond_dim = conditioning_dim
+        self.cond_augment = nn.Linear(D, conditioning_dim * 4, bias=True)
+        self.glu = nn.GLU(dim=1)
+    def encode(self, text_embedding: torch.Tensor) -> Any:
+        """
+        This function encodes the text embedding into the conditioning space
+        :param text_embedding: Text embedding
+        :return: Conditioning embedding
+        """
+        x_tensor = self.glu(self.cond_augment(text_embedding))
+        mu_tensor = x_tensor[:, : self.cond_dim]
+        logvar = x_tensor[:, self.cond_dim :]
+        return mu_tensor, logvar
+    def sample(self, mu_tensor: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
+        """
+        This function samples from the Gaussian distribution
+        :param mu: Mean of the Gaussian distribution
+        :param logvar: Log variance of the Gaussian distribution
+        :return: Sample from the Gaussian distribution
+        """
+        std = torch.exp(0.5 * logvar)
+        eps = torch.randn_like(
+            std
+        )  # check if this should add requires_grad = True to this tensor?
+        return mu_tensor + eps * std
+    def forward(self, text_embedding: torch.Tensor) -> Any:
+        """
+        This function encodes the text embedding into the conditioning space,
+        and samples from the Gaussian distribution.
+        :param text_embedding: Text embedding
+        :return c_hat: Conditioning embedding (C^ from StackGAN++ paper)
+        :return mu: Mean of the Gaussian distribution
+        :return logvar: Log variance of the Gaussian distribution
+        """
+        mu_tensor, logvar = self.encode(text_embedding)
+        c_hat = self.sample(mu_tensor, logvar)
+        return c_hat, mu_tensor, logvar

src/models/modules/conv_utils.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""Frequently used convolution modules"""
+from torch import nn
+from typing import Tuple
+def conv2d(
+    in_channels: int,
+    out_channels: int,
+    kernel_size: int = 3,
+    stride: int = 1,
+    padding: int = 1,
+) -> nn.Conv2d:
+    """
+    Template convolution which is typically used throughout the project
+    :param int in_channels: Number of input channels
+    :param int out_channels: Number of output channels
+    :param int kernel_size: Size of sliding kernel
+    :param int stride: How many steps kernel does when sliding
+    :param int padding: How many dimensions to pad
+    :return: Convolution layer with parameters
+    :rtype: nn.Conv2d
+    """
+    return nn.Conv2d(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+    )
+def conv1d(
+    in_channels: int,
+    out_channels: int,
+    kernel_size: int = 1,
+    stride: int = 1,
+    padding: int = 0,
+) -> nn.Conv1d:
+    """
+    Template 1d convolution which is typically used throughout the project
+    :param int in_channels: Number of input channels
+    :param int out_channels: Number of output channels
+    :param int kernel_size: Size of sliding kernel
+    :param int stride: How many steps kernel does when sliding
+    :param int padding: How many dimensions to pad
+    :return: Convolution layer with parameters
+    :rtype: nn.Conv2d
+    """
+    return nn.Conv1d(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+    )
+def calc_out_conv(
+    h_in: int, w_in: int, kernel_size: int = 3, stride: int = 1, padding: int = 0
+) -> Tuple[int, int]:
+    """
+    Calculate the dimensionalities of images propagated through conv layers
+    :param h_in: Height of the image
+    :param w_in: Width of the image
+    :param kernel_size: Size of sliding kernel
+    :param stride: How many steps kernel does when sliding
+    :param padding: How many dimensions to pad
+    :return: Height and width of image through convolution
+    :rtype: tuple[int, int]
+    """
+    h_out = int((h_in + 2 * padding - kernel_size) / stride + 1)
+    w_out = int((w_in + 2 * padding - kernel_size) / stride + 1)
+    return h_out, w_out

src/models/modules/discriminator.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""Discriminator providing word-level feedback"""
+from typing import Any
+import torch
+from torch import nn
+from src.models.modules.conv_utils import conv1d, conv2d
+from src.models.modules.image_encoder import InceptionEncoder
+class WordLevelLogits(nn.Module):
+    """API for converting regional feature maps into logits for multi-class classification"""
+    def __init__(self) -> None:
+        """
+        Instantiate the module with softmax on channel dimension
+        """
+        super().__init__()
+        self.softmax = nn.Softmax(dim=1)
+        # layer for flattening the feature maps
+        self.flat = nn.Flatten(start_dim=2)
+        # change dism of of textual embs to correlate with chans of inception
+        self.chan_reduction = conv1d(256, 128)
+    def forward(
+        self, visual_features: torch.Tensor, word_embs: torch.Tensor, mask: torch.Tensor
+    ) -> Any:
+        """
+        Fuse two types of features together to get output for feeding into the classification loss
+        :param torch.Tensor visual_features:
+            Feature maps of an image after being processed by Inception encoder. Bx128x17x17
+        :param torch.Tensor word_embs:
+            Word-level embeddings from the text encoder Bx256xL
+        :return: Logits for each word in the picture. BxL
+        :rtype: Any
+        """
+        # make textual and visual features have the same amount of channels
+        word_embs = self.chan_reduction(word_embs)
+        # flattening the feature maps
+        visual_features = self.flat(visual_features)
+        word_embs = torch.transpose(word_embs, 1, 2)
+        word_region_correlations = word_embs @ visual_features
+        # normalize across L dimension
+        m_norm_l = nn.functional.normalize(word_region_correlations, dim=1)
+        # normalize across H*W dimension
+        m_norm_hw = nn.functional.normalize(m_norm_l, dim=2)
+        m_norm_hw = torch.transpose(m_norm_hw, 1, 2)
+        weighted_img_feats = visual_features @ m_norm_hw
+        weighted_img_feats = torch.sum(weighted_img_feats, dim=1)
+        weighted_img_feats[mask] = -float("inf")
+        deltas = self.softmax(weighted_img_feats)
+        return deltas
+class UnconditionalLogits(nn.Module):
+    """Head for retrieving logits from an image"""
+    def __init__(self) -> None:
+        """Initialize modules that reduce the features down to a set of logits"""
+        super().__init__()
+        self.conv = nn.Conv2d(128, 1, kernel_size=17)
+        # flattening BxLx1x1 into Bx1
+        self.flat = nn.Flatten()
+    def forward(self, visual_features: torch.Tensor) -> Any:
+        """
+        Compute logits for unconditioned adversarial loss
+        :param visual_features: Local features from Inception network. Bx128x17x17
+        :return: Logits for unconditioned adversarial loss. Bx1
+        :rtype: Any
+        """
+        # reduce channels and feature maps for visual features
+        visual_features = self.conv(visual_features)
+        # flatten Bx1x1x1 into Bx1
+        logits = self.flat(visual_features)
+        return logits
+class ConditionalLogits(nn.Module):
+    """Logits extractor for conditioned adversarial loss"""
+    def __init__(self) -> None:
+        super().__init__()
+        # layer for forming the feature maps out of textual info
+        self.text_to_fm = conv1d(256, 17 * 17)
+        # fitting the size of text channels to the size of visual channels
+        self.chan_aligner = conv2d(1, 128)
+        # for reduced textual + visual features down to 1x1 feature map
+        self.joint_conv = nn.Conv2d(2 * 128, 1, kernel_size=17)
+        # converting Bx1x1x1 into Bx1
+        self.flat = nn.Flatten()
+    def forward(self, visual_features: torch.Tensor, sent_embs: torch.Tensor) -> Any:
+        """
+        Compute logits for conditional adversarial loss
+        :param torch.Tensor visual_features: Features from Inception encoder. Bx128x17x17
+        :param torch.Tensor sent_embs: Sentence embeddings from text encoder. Bx256
+        :return: Logits for conditional adversarial loss. BxL
+        :rtype: Any
+        """
+        # make text and visual features have the same sizes of feature maps
+        # Bx256 -> Bx256x1 -> Bx289x1
+        sent_embs = sent_embs.view(-1, 256, 1)
+        sent_embs = self.text_to_fm(sent_embs)
+        # transform textual info into shape of visual feature maps
+        # Bx289x1 -> Bx1x17x17
+        sent_embs = sent_embs.view(-1, 1, 17, 17)
+        # propagate text embs through 1d conv to
+        # align dims with visual feature maps
+        sent_embs = self.chan_aligner(sent_embs)
+        # unite textual and visual features across the dim of channels
+        cross_features = torch.cat((visual_features, sent_embs), dim=1)
+        # reduce dims down to length of caption and form raw logits
+        cross_features = self.joint_conv(cross_features)
+        # form logits from Bx1x1x1 into Bx1
+        logits = self.flat(cross_features)
+        return logits
+class Discriminator(nn.Module):
+    """Simple CNN-based discriminator"""
+    def __init__(self) -> None:
+        """Use a pretrained InceptionNet to extract features"""
+        super().__init__()
+        self.encoder = InceptionEncoder(D=128)
+        # define different logit extractors for different losses
+        self.logits_word_level = WordLevelLogits()
+        self.logits_uncond = UnconditionalLogits()
+        self.logits_cond = ConditionalLogits()
+    def forward(self, images: torch.Tensor) -> Any:
+        """
+        Retrieves image features encoded by the image encoder
+        :param torch.Tensor images: Images to be analyzed. Bx3x256x256
+        :return: image features encoded by image encoder. Bx128x17x17
+        """
+        # only taking the local features from inception
+        # Bx3x256x256 -> Bx128x17x17
+        img_features, _ = self.encoder(images)
+        return img_features

src/models/modules/downsample.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""downsample module."""
+from torch import nn
+def down_sample(in_planes: int, out_planes: int) -> nn.Module:
+    """UpSample module."""
+    return nn.Sequential(
+        nn.Conv2d(
+            in_planes, out_planes, kernel_size=4, stride=2, padding=1, bias=False
+        ),
+        nn.BatchNorm2d(out_planes),
+        nn.LeakyReLU(0.2, inplace=True),
+    )

src/models/modules/generator.py ADDED Viewed

	@@ -0,0 +1,300 @@

+"""Generator Module"""
+from typing import Any, Optional
+import torch
+from torch import nn
+from src.models.modules.acm import ACM
+from src.models.modules.attention import ChannelWiseAttention, SpatialAttention
+from src.models.modules.cond_augment import CondAugmentation
+from src.models.modules.downsample import down_sample
+from src.models.modules.residual import ResidualBlock
+from src.models.modules.upsample import img_up_block, up_sample
+class InitStageG(nn.Module):
+    """Initial Stage Generator Module"""
+    # pylint: disable=too-many-instance-attributes
+    # pylint: disable=too-many-arguments
+    # pylint: disable=invalid-name
+    # pylint: disable=too-many-locals
+    def __init__(
+        self, Ng: int, Ng_init: int, conditioning_dim: int, D: int, noise_dim: int
+    ):
+        """
+        :param Ng: Number of channels.
+        :param Ng_init: Initial value of Ng, this is output channel of first image upsample.
+        :param conditioning_dim: Dimension of the conditioning space
+        :param D: Dimension of the text embedding space [D from AttnGAN paper]
+        :param noise_dim: Dimension of the noise space
+        """
+        super().__init__()
+        self.gf_dim = Ng
+        self.gf_init = Ng_init
+        self.in_dim = noise_dim + conditioning_dim + D
+        self.text_dim = D
+        self.define_module()
+    def define_module(self) -> None:
+        """Defines FC, Upsample, Residual, ACM, Attention modules"""
+        nz, ng = self.in_dim, self.gf_dim
+        self.fully_connect = nn.Sequential(
+            nn.Linear(nz, ng * 4 * 4 * 2, bias=False),
+            nn.BatchNorm1d(ng * 4 * 4 * 2),
+            nn.GLU(dim=1),  # we start from 4 x 4 feat_map and return hidden_64.
+        )
+        self.upsample1 = up_sample(ng, ng // 2)
+        self.upsample2 = up_sample(ng // 2, ng // 4)
+        self.upsample3 = up_sample(ng // 4, ng // 8)
+        self.upsample4 = up_sample(
+            ng // 8 * 3, ng // 16
+        )  # multiply channel by 3 because concat spatial and channel att
+        self.residual = self._make_layer(ResidualBlock, ng // 8 * 3)
+        self.acm_module = ACM(self.gf_init, ng // 8 * 3)
+        self.spatial_att = SpatialAttention(self.text_dim, ng // 8)
+        self.channel_att = ChannelWiseAttention(
+            32 * 32, self.text_dim
+        )  # 32 x 32 is the feature map size
+    def _make_layer(self, block: Any, channel_num: int) -> nn.Module:
+        layers = []
+        for _ in range(2):  # number of residual blocks hardcoded to 2
+            layers.append(block(channel_num))
+        return nn.Sequential(*layers)
+    def forward(
+        self,
+        noise: torch.Tensor,
+        condition: torch.Tensor,
+        global_inception: torch.Tensor,
+        local_upsampled_inception: torch.Tensor,
+        word_embeddings: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> Any:
+        """
+        :param noise: Noise tensor
+        :param condition: Condition tensor (c^ from stackGAN++ paper)
+        :param global_inception: Global inception feature
+        :param local_upsampled_inception: Local inception feature, upsampled to 32 x 32
+        :param word_embeddings: Word embeddings [shape: D x L or D x T]
+        :param mask: Mask for padding tokens
+        :return: Hidden Image feature map Tensor of 64 x 64 size
+        """
+        noise_concat = torch.cat((noise, condition), 1)
+        inception_concat = torch.cat((noise_concat, global_inception), 1)
+        hidden = self.fully_connect(inception_concat)
+        hidden = hidden.view(-1, self.gf_dim, 4, 4)  # convert to 4x4 image feature map
+        hidden = self.upsample1(hidden)
+        hidden = self.upsample2(hidden)
+        hidden_32 = self.upsample3(hidden)  # shape: (batch_size, gf_dim // 8, 32, 32)
+        hidden_32_view = hidden_32.view(
+            hidden_32.shape[0], -1, hidden_32.shape[2] * hidden_32.shape[3]
+        )  # this reshaping is done as attention module expects this shape.
+        spatial_att_feat = self.spatial_att(
+            word_embeddings, hidden_32_view, mask
+        )  # spatial att shape: (batch, D^, 32 * 32)
+        channel_att_feat = self.channel_att(
+            spatial_att_feat, word_embeddings
+        )  # channel att shape: (batch, D^, 32 * 32), or (batch, C, Hk* Wk) from controlGAN paper
+        spatial_att_feat = spatial_att_feat.view(
+            word_embeddings.shape[0], -1, hidden_32.shape[2], hidden_32.shape[3]
+        )  # reshape to (batch, D^, 32, 32)
+        channel_att_feat = channel_att_feat.view(
+            word_embeddings.shape[0], -1, hidden_32.shape[2], hidden_32.shape[3]
+        )  # reshape to (batch, D^, 32, 32)
+        spatial_concat = torch.cat(
+            (hidden_32, spatial_att_feat), 1
+        )  # concat spatial attention feature with hidden_32
+        attn_concat = torch.cat(
+            (spatial_concat, channel_att_feat), 1
+        )  # concat channel and spatial attention feature
+        hidden_32 = self.acm_module(attn_concat, local_upsampled_inception)
+        hidden_32 = self.residual(hidden_32)
+        hidden_64 = self.upsample4(hidden_32)
+        return hidden_64
+class NextStageG(nn.Module):
+    """Next Stage Generator Module: word attention, ACM, residual blocks, two upsamples."""
+    # pylint: disable=too-many-instance-attributes
+    # pylint: disable=too-many-arguments
+    # pylint: disable=invalid-name
+    # pylint: disable=too-many-locals
+    def __init__(self, Ng: int, Ng_init: int, D: int, image_size: int):
+        """
+        Stores the hyper-parameters and builds every sub-module.
+        :param Ng: Number of channels.
+        :param Ng_init: Initial value of Ng.
+        :param D: Dimension of the text embedding space [D from AttnGAN paper]
+        :param image_size: Size of the output image from previous generator stage.
+        """
+        super().__init__()
+        self.gf_dim, self.gf_init = Ng, Ng_init
+        self.text_dim, self.img_size = D, image_size
+        self.define_module()
+    def define_module(self) -> None:
+        """Creates the attention, residual, upsampling and ACM sub-modules."""
+        channels = self.gf_dim
+        # Channel width used after forward() concatenates hidden_feat with both attention maps.
+        # NOTE(review): assumes each attention map also carries `channels` channels - confirm.
+        fused = channels * 3
+        pixels = self.img_size * self.img_size
+        self.spatial_att = SpatialAttention(self.text_dim, channels)
+        self.channel_att = ChannelWiseAttention(pixels, self.text_dim)
+        self.residual = self._make_layer(ResidualBlock, fused)
+        self.upsample = up_sample(fused, channels)
+        self.acm_module = ACM(self.gf_init, fused)
+        self.upsample2 = up_sample(channels, channels)
+    def _make_layer(self, block: Any, channel_num: int) -> nn.Module:
+        """Stacks two `block(channel_num)` instances (the depth is hardcoded to 2)."""
+        return nn.Sequential(*(block(channel_num) for _ in range(2)))
+    def forward(
+        self,
+        hidden_feat: Any,
+        word_embeddings: torch.Tensor,
+        vgg64_feat: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> Any:
+        """
+        :param hidden_feat: Hidden feature from previous generator stage [i.e. hidden_64]
+        :param word_embeddings: Word embeddings
+        :param vgg64_feat: VGG feature map of size 64 x 64
+        :param mask: Mask for the padding tokens
+        :return: Image feature map of size 256 x 256
+        """
+        height, width = hidden_feat.shape[2], hidden_feat.shape[3]
+        # The attention modules take the spatial grid flattened into one axis.
+        flat_hidden = hidden_feat.view(hidden_feat.shape[0], -1, height * width)
+        spatial = self.spatial_att(word_embeddings, flat_hidden, mask)
+        channel = self.channel_att(spatial, word_embeddings)
+        # Restore the spatial grid; the batch size is read from the word embeddings.
+        grid_shape = (word_embeddings.shape[0], -1, height, width)
+        spatial_map = spatial.view(*grid_shape)
+        channel_map = channel.view(*grid_shape)
+        # Channel-wise concat: previous hidden feature, spatial attention, channel attention.
+        fused = torch.cat((hidden_feat, spatial_map, channel_map), 1)
+        refined = self.residual(self.acm_module(fused, vgg64_feat))
+        return self.upsample2(self.upsample(refined))
+class GetImageG(nn.Module):
+    """Turns an image feature map into the final 3-channel fake image."""
+    def __init__(self, Ng: int):
+        """
+        :param Ng: Number of channels.
+        """
+        super().__init__()
+        # Bias-free 3x3 convolution down to 3 channels, followed by tanh.
+        to_rgb = nn.Conv2d(Ng, 3, kernel_size=3, stride=1, padding=1, bias=False)
+        self.img = nn.Sequential(to_rgb, nn.Tanh())
+    def forward(self, hidden_feat: torch.Tensor) -> Any:
+        """
+        :param hidden_feat: Image feature map
+        :return: Final fake image
+        """
+        fake_image = self.img(hidden_feat)
+        return fake_image
+class Generator(nn.Module):
+    """Generator Module: chains both generator stages, the VGG branch and the image head."""
+    # pylint: disable=too-many-instance-attributes
+    # pylint: disable=too-many-arguments
+    # pylint: disable=invalid-name
+    # pylint: disable=too-many-locals
+    def __init__(self, Ng: int, D: int, conditioning_dim: int, noise_dim: int):
+        """
+        Builds every sub-network used by forward().
+        :param Ng: Number of channels. [Taken from StackGAN++ paper]
+        :param D: Dimension of the text embedding space
+        :param conditioning_dim: Dimension of the conditioning space
+        :param noise_dim: Dimension of the noise space
+        """
+        super().__init__()
+        first_stage_channels = Ng * 16
+        vgg_channels = D // 2
+        self.cond_augment = CondAugmentation(D, conditioning_dim)
+        self.hidden_net1 = InitStageG(
+            first_stage_channels, Ng, conditioning_dim, D, noise_dim
+        )
+        # The inception encoder returns D channels (default in paper: 256).
+        self.inception_img_upsample = img_up_block(D, Ng)
+        self.hidden_net2 = NextStageG(Ng, Ng, D, 64)
+        self.generate_img = GetImageG(Ng)
+        self.acm_module = ACM(Ng, Ng)
+        self.vgg_downsample = down_sample(vgg_channels, Ng)
+        self.upsample1 = up_sample(Ng, Ng)
+        self.upsample2 = up_sample(Ng, Ng)
+    def forward(
+        self,
+        noise: torch.Tensor,
+        sentence_embeddings: torch.Tensor,
+        word_embeddings: torch.Tensor,
+        global_inception_feat: torch.Tensor,
+        local_inception_feat: torch.Tensor,
+        vgg_feat: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> Any:
+        """
+        :param noise: Noise vector [shape: (batch, noise_dim)]
+        :param sentence_embeddings: Sentence embeddings [shape: (batch, D)]
+        :param word_embeddings: Word embeddings [shape: D x L, where L is length of sentence]
+        :param global_inception_feat: Global Inception feature map [shape: (batch, D)]
+        :param local_inception_feat: Local Inception feature map [shape: (batch, D, 17, 17)]
+        :param vgg_feat: VGG feature map [shape: (batch, D // 2 = 128, 128, 128)]
+        :param mask: Mask for the padding tokens
+        :return: Tuple of (final fake image, mu and logvar returned by the conditioning module)
+        """
+        cond_code, mean, log_variance = self.cond_augment(sentence_embeddings)
+        # Stage 1: produces the hidden feature consumed by the second stage.
+        local_upsampled = self.inception_img_upsample(local_inception_feat)
+        stage1_hidden = self.hidden_net1(
+            noise, cond_code, global_inception_feat, local_upsampled, word_embeddings, mask
+        )
+        # Stage 2: refine with word attention, conditioned on the down-sampled VGG features.
+        vgg_small = self.vgg_downsample(vgg_feat)
+        stage2_hidden = self.hidden_net2(stage1_hidden, word_embeddings, vgg_small, mask)
+        # Upsample the VGG branch twice before the last ACM fusion with the stage-2 output.
+        vgg_large = self.upsample2(self.upsample1(vgg_small))
+        fused = self.acm_module(stage2_hidden, vgg_large)
+        return self.generate_img(fused), mean, log_variance

src/models/modules/image_encoder.py ADDED Viewed

	@@ -0,0 +1,138 @@

+"""Image Encoder Module"""
+from typing import Any
+import torch
+from torch import nn
+from src.models.modules.conv_utils import conv2d
+# build inception v3 image encoder
+class InceptionEncoder(nn.Module):
+    """Image Encoder Module adapted from AttnGAN"""
+    def __init__(self, D: int):
+        """
+        Loads inception_v3 from the pytorch/vision:v0.10.0 hub entry with
+        pretrained weights, freezes its parameters and adds two trainable
+        projection layers.
+        :param D: Dimension of the text embedding space [D from AttnGAN paper]
+        """
+        super().__init__()
+        self.text_emb_dim = D
+        model = torch.hub.load(
+            "pytorch/vision:v0.10.0", "inception_v3", pretrained=True
+        )
+        # Freeze the backbone; only the layers created in define_module are trainable.
+        for param in model.parameters():
+            param.requires_grad = False
+        self.define_module(model)
+        self.init_trainable_weights()
+    def define_module(self, model: nn.Module) -> None:
+        """
+        This function defines the modules of the image encoder.
+        Note: four helper layers (`cust_*`) are attached to `model` itself.
+        :param model: Pretrained Inception V3 model
+        """
+        model.cust_upsample = nn.Upsample(size=(299, 299), mode="bilinear")
+        model.cust_maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
+        model.cust_maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
+        model.cust_avgpool = nn.AvgPool2d(kernel_size=8)
+        # Layers applied in order to produce the local feature map.
+        local_layer_names = (
+            "cust_upsample",
+            "Conv2d_1a_3x3",
+            "Conv2d_2a_3x3",
+            "Conv2d_2b_3x3",
+            "cust_maxpool1",
+            "Conv2d_3b_1x1",
+            "Conv2d_4a_3x3",
+            "cust_maxpool2",
+            "Mixed_5b",
+            "Mixed_5c",
+            "Mixed_5d",
+            "Mixed_6a",
+            "Mixed_6b",
+            "Mixed_6c",
+            "Mixed_6d",
+            "Mixed_6e",
+        )
+        # Layers applied on top of the local features to obtain the global code.
+        global_layer_names = ("Mixed_7a", "Mixed_7b", "Mixed_7c", "cust_avgpool")
+        self.feature_extractor = nn.Sequential(
+            *(getattr(model, name) for name in local_layer_names)
+        )
+        self.feature_extractor2 = nn.Sequential(
+            *(getattr(model, name) for name in global_layer_names)
+        )
+        # Trainable projections into the text embedding space.
+        self.emb_features = conv2d(
+            768, self.text_emb_dim, kernel_size=1, stride=1, padding=0
+        )
+        self.emb_cnn_code = nn.Linear(2048, self.text_emb_dim)
+    def init_trainable_weights(self) -> None:
+        """
+        This function initializes the trainable weights of the image encoder
+        uniformly in [-0.1, 0.1].
+        """
+        initrange = 0.1
+        self.emb_features.weight.data.uniform_(-initrange, initrange)
+        self.emb_cnn_code.weight.data.uniform_(-initrange, initrange)
+    def forward(self, image_tensor: torch.Tensor) -> Any:
+        """
+        :param image_tensor: Input image
+        :return: features: local feature matrix (v from attnGAN paper) [shape: (batch, D, 17, 17)]
+        :return: cnn_code: global image feature (v^ from attnGAN paper) [shape: (batch, D)]
+        """
+        # Local features, e.g. 768 x 17 x 17 per image.
+        local_features = self.feature_extractor(image_tensor)
+        # Pooled features flattened to one vector per image (e.g. 2048 values).
+        pooled = self.feature_extractor2(local_features)
+        pooled = pooled.view(pooled.size(0), -1)
+        # Global image features.
+        cnn_code = self.emb_cnn_code(pooled)
+        # `local_features` was already consumed above, so it cannot be None here;
+        # the former `is not None` guard was dead code and has been removed.
+        features = self.emb_features(local_features)
+        return features, cnn_code
+class VGGEncoder(nn.Module):
+    """Pre Trained VGG Encoder Module"""
+    def __init__(self) -> None:
+        """
+        Initialize pre-trained VGG model with frozen parameters
+        """
+        super().__init__()
+        # Key (as a string) of the `features` sub-module whose output forward() returns.
+        self.select = "8"
+        self.model = torch.hub.load("pytorch/vision:v0.10.0", "vgg16", pretrained=True)
+        # Freeze the backbone. The attribute must be spelled `requires_grad`:
+        # a misspelled name only attaches an unused Python attribute to the
+        # tensor and leaves the parameter trainable.
+        for param in self.model.parameters():
+            param.requires_grad = False
+        self.vgg_modules = self.model.features._modules
+    def forward(self, image_tensor: torch.Tensor) -> Any:
+        """
+        Runs the VGG feature layers in order and stops at the selected one.
+        :param image_tensor: Input image tensor [shape: (batch, 3, 256, 256)]
+        :return: VGG features [shape: (batch, 128, 128, 128)], or None when no
+            layer is registered under the key stored in `self.select`
+        """
+        for name, layer in self.vgg_modules.items():
+            image_tensor = layer(image_tensor)
+            if name == self.select:
+                return image_tensor
+        return None