Spaces:
Running
Running
| """ | |
| Title: Object detection with Vision Transformers | |
| Author: [Karan V. Dave](https://www.linkedin.com/in/karan-dave-811413164/) | |
| Date created: 2022/03/27 | |
| Last modified: 2023/11/20 | |
| Description: A simple Keras implementation of object detection using Vision Transformers. | |
| Accelerator: GPU | |
| """ | |
| """ | |
| ## Introduction | |
| The | |
| [Vision Transformer (ViT)](https://arxiv.org/abs/2010.11929) | |
| paper by Alexey Dosovitskiy et al. | |
| demonstrates that a pure transformer applied directly to sequences of image | |
| patches can perform well on image classification tasks. | |
| In this Keras example, we implement an object detection ViT | |
| and we train it on the | |
| [Caltech 101 dataset](http://www.vision.caltech.edu/datasets/) | |
| to detect an airplane in the given image. | |
| """ | |
| """ | |
| ## Imports and setup | |
| """ | |
| import os | |
| os.environ["KERAS_BACKEND"] = "jax" # @param ["tensorflow", "jax", "torch"] | |
| import numpy as np | |
| import keras | |
| from keras import layers | |
| from keras import ops | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import cv2 | |
| import os | |
| import scipy.io | |
| import shutil | |
| """ | |
| ## Prepare dataset | |
| We use the [Caltech 101 Dataset](https://data.caltech.edu/records/mzrjq-6wc02). | |
| """ | |
# Directories (relative to the current working directory) that hold the
# airplane images and their annotation files once the archives are unpacked.
path_images = "./101_ObjectCategories/airplanes/"
path_annot = "./Annotations/Airplanes_Side_2/"

# Download the Caltech-101 zip and let Keras extract it.
# NOTE(review): the previous comment described cache_dir as "current
# directory", but "/" is the filesystem root -- confirm the intended location.
path_to_downloaded_file = keras.utils.get_file(
    origin="https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
    fname="caltech_101_zipped",
    cache_dir="/",
    archive_format="zip",
    extract=True,
)
download_base_dir = os.path.dirname(path_to_downloaded_file)

# The zip contains two nested tar archives; unpack both into the current
# working directory so that path_images / path_annot resolve.
for nested_archive in ("101_ObjectCategories.tar.gz", "Annotations.tar"):
    shutil.unpack_archive(
        os.path.join(download_base_dir, "caltech-101", nested_archive), "."
    )
# Sorted file names (not full paths) of the images and of the annotations.
# Images and annotations are paired purely by their position after sorting.
image_paths = sorted(
    name
    for name in os.listdir(path_images)
    if os.path.isfile(os.path.join(path_images, name))
)
annot_paths = sorted(
    name
    for name in os.listdir(path_annot)
    if os.path.isfile(os.path.join(path_annot, name))
)

image_size = 224  # every image is resized to image_size x image_size

images, targets = [], []

# Walk over the annotations, load the image at the same index, and store the
# resized pixels together with the bounding box scaled by the original size.
for index, annot_name in enumerate(annot_paths):
    # The code reads "box_coord" as (top y, bottom y, left x, right x).
    box = scipy.io.loadmat(os.path.join(path_annot, annot_name))["box_coord"][0]
    y_min, y_max = box[0], box[1]
    x_min, x_max = box[2], box[3]

    picture = keras.utils.load_img(os.path.join(path_images, image_paths[index]))
    # Size taken BEFORE resizing, so the box becomes a fraction of the
    # original width/height.
    width, height = picture.size[:2]

    resized = picture.resize((image_size, image_size))
    images.append(keras.utils.img_to_array(resized))

    targets.append(
        (
            float(x_min) / width,
            float(y_min) / height,
            float(x_max) / width,
            float(y_max) / height,
        )
    )
# Keep the first 80% of the samples for training and the remainder for
# testing; `images` and `targets` are filled in lockstep, so one split index
# applies to both lists.
split_index = int(len(images) * 0.8)

x_train = np.asarray(images[:split_index])
y_train = np.asarray(targets[:split_index])

x_test = np.asarray(images[split_index:])
y_test = np.asarray(targets[split_index:])
| """ | |
| ## Implement multilayer-perceptron (MLP) | |
| We use the code from the Keras example | |
| [Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/) | |
| as a reference. | |
| """ | |
def mlp(x, hidden_units, dropout_rate):
    """Stack Dense(GELU) + Dropout pairs on top of ``x``.

    Args:
        x: Input tensor the stack is applied to.
        hidden_units: Iterable with the number of units of each Dense layer,
            in the order the layers are applied.
        dropout_rate: Rate passed to the Dropout layer following each Dense.

    Returns:
        The output of the last Dropout layer (``x`` itself if
        ``hidden_units`` is empty).
    """
    output = x
    for width in hidden_units:
        dense = layers.Dense(width, activation=keras.activations.gelu)
        output = dense(output)
        output = layers.Dropout(dropout_rate)(output)
    return output
| """ | |
| ## Implement the patch creation layer | |
| """ | |
class Patches(layers.Layer):
    """Split a batch of images into flattened square patches.

    Args:
        patch_size: Side length of each square patch.
        **kwargs: Forwarded to ``keras.layers.Layer`` (e.g. ``name``,
            ``dtype``, ``trainable``). Accepting them is required for
            ``from_config(get_config())`` to work, because the base
            ``get_config`` includes those keys.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return patches reshaped to (batch, num_patches, patch_values).

        ``images`` is indexed as (batch, height, width, channels).
        ``num_patches`` is ``(height // patch_size) * (width // patch_size)``
        and ``patch_values`` is ``patch_size * patch_size * channels``.
        """
        input_shape = ops.shape(images)
        batch_size = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        channels = input_shape[3]
        num_patches_h = height // self.patch_size
        num_patches_w = width // self.patch_size
        patches = keras.ops.image.extract_patches(images, size=self.patch_size)
        # Collapse the patch grid into a single sequence axis.
        patches = ops.reshape(
            patches,
            (
                batch_size,
                num_patches_h * num_patches_w,
                self.patch_size * self.patch_size * channels,
            ),
        )
        return patches

    def get_config(self):
        """Return the layer config, including ``patch_size``."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config
| """ | |
| ## Display patches for an input image | |
| """ | |
patch_size = 32  # Size of the patches to be extracted from the input images

# Show the first training image on its own figure.
sample_image = x_train[0]
plt.figure(figsize=(4, 4))
plt.imshow(sample_image.astype("uint8"))
plt.axis("off")

# Run the patch layer on a batch containing only that image.
patches = Patches(patch_size)(np.expand_dims(sample_image, axis=0))
print(f"Image size: {image_size} X {image_size}")
print(f"Patch size: {patch_size} X {patch_size}")
print(f"{patches.shape[1]} patches per image \n{patches.shape[-1]} elements per patch")

# Lay the patches out on an n x n grid of subplots.
n = int(np.sqrt(patches.shape[1]))
plt.figure(figsize=(4, 4))
for position, flat_patch in enumerate(patches[0], start=1):
    ax = plt.subplot(n, n, position)
    # Restore the flattened patch to (patch_size, patch_size, 3) for display.
    patch_pixels = ops.convert_to_numpy(
        ops.reshape(flat_patch, (patch_size, patch_size, 3))
    )
    plt.imshow(patch_pixels.astype("uint8"))
    plt.axis("off")
| """ | |
| ## Implement the patch encoding layer | |
| The `PatchEncoder` layer linearly transforms a patch by projecting it into a | |
| vector of size `projection_dim`. It also adds a learnable position | |
| embedding to the projected vector. | |
| """ | |
class PatchEncoder(layers.Layer):
    """Project patches to ``projection_dim`` and add a position embedding.

    Args:
        num_patches: Number of patches per image; also the number of
            positions in the embedding table.
        projection_dim: Size of the projected patch vectors and of the
            position embeddings.
        **kwargs: Forwarded to ``keras.layers.Layer`` (e.g. ``name``,
            ``dtype``, ``trainable``) so that the layer can be rebuilt from
            its own config.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Kept on the instance so get_config() does not depend on globals.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def get_config(self):
        """Return the layer config with exactly the constructor arguments.

        Only values owned by this instance are serialized; the model-level
        hyper-parameters (number of heads, MLP sizes, ...) do not belong to
        this layer and cannot be passed back to ``__init__``.
        """
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

    def call(self, patch):
        """Return the projected patches plus their position embeddings."""
        # Position indices 0..num_patches-1 with a leading axis of size 1.
        positions = ops.expand_dims(
            ops.arange(start=0, stop=self.num_patches, step=1), axis=0
        )
        projected_patches = self.projection(patch)
        encoded = projected_patches + self.position_embedding(positions)
        return encoded
| """ | |
| ## Build the ViT model | |
| The ViT model has multiple Transformer blocks. | |
| The `MultiHeadAttention` layer is used for self-attention, | |
| applied to the sequence of image patches. The encoded patches (skip connection) | |
| and self-attention layer outputs are normalized and fed into a | |
| multilayer perceptron (MLP). | |
| The model outputs four dimensions representing | |
| the bounding box coordinates of an object. | |
| """ | |
def create_vit_object_detector(
    input_shape,
    patch_size,
    num_patches,
    projection_dim,
    num_heads,
    transformer_units,
    transformer_layers,
    mlp_head_units,
):
    """Build the ViT-based bounding-box regression model.

    Args:
        input_shape: Shape of one input image, passed to ``keras.Input``.
        patch_size: Patch side length handed to ``Patches``.
        num_patches: Number of patches per image, handed to ``PatchEncoder``.
        projection_dim: Patch embedding size; also used as the attention
            ``key_dim``.
        num_heads: Number of attention heads per Transformer block.
        transformer_units: Dense sizes of the MLP inside each block.
        transformer_layers: Number of Transformer blocks.
        mlp_head_units: Dense sizes of the MLP head before the output layer.

    Returns:
        A ``keras.Model`` mapping an image to four values (the bounding box).
    """
    image_input = keras.Input(shape=input_shape)

    # Image -> sequence of flattened patches -> sequence of patch embeddings.
    patch_sequence = Patches(patch_size)(image_input)
    tokens = PatchEncoder(num_patches, projection_dim)(patch_sequence)

    # Transformer blocks: normalize, self-attend, add the residual, then
    # normalize, apply the MLP and add the second residual.
    for _block in range(transformer_layers):
        normed_tokens = layers.LayerNormalization(epsilon=1e-6)(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(normed_tokens, normed_tokens)
        after_attention = layers.Add()([attended, tokens])

        block_hidden = layers.LayerNormalization(epsilon=1e-6)(after_attention)
        block_hidden = mlp(
            block_hidden, hidden_units=transformer_units, dropout_rate=0.1
        )
        tokens = layers.Add()([block_hidden, after_attention])

    # Normalize, then flatten all patch embeddings into one vector per image.
    flat_features = layers.LayerNormalization(epsilon=1e-6)(tokens)
    flat_features = layers.Flatten()(flat_features)
    flat_features = layers.Dropout(0.3)(flat_features)

    # MLP head followed by four linear outputs: the bounding box.
    head_features = mlp(
        flat_features, hidden_units=mlp_head_units, dropout_rate=0.3
    )
    bounding_box = layers.Dense(4)(head_features)

    return keras.Model(inputs=image_input, outputs=bounding_box)
| """ | |
| ## Run the experiment | |
| """ | |
def run_experiment(model, learning_rate, weight_decay, batch_size, num_epochs):
    """Compile ``model`` and fit it on the module-level ``x_train``/``y_train``.

    The model is compiled with AdamW and a mean-squared-error loss. 10% of
    the training data is held out for validation; the weights with the best
    ``val_loss`` are written to ``vit_object_detector.weights.h5`` and
    training stops early after 10 epochs without ``val_loss`` improvement.

    Args:
        model: The Keras model to train.
        learning_rate: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
        batch_size: Batch size passed to ``model.fit``.
        num_epochs: Maximum number of epochs.

    Returns:
        The value returned by ``model.fit``.
    """
    model.compile(
        optimizer=keras.optimizers.AdamW(
            learning_rate=learning_rate, weight_decay=weight_decay
        ),
        loss=keras.losses.MeanSquaredError(),
    )

    checkpoint_filepath = "vit_object_detector.weights.h5"
    training_callbacks = [
        keras.callbacks.ModelCheckpoint(
            checkpoint_filepath,
            monitor="val_loss",
            save_best_only=True,
            save_weights_only=True,
        ),
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=10),
    ]

    return model.fit(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_split=0.1,
        callbacks=training_callbacks,
    )
# --- Hyper-parameters -------------------------------------------------------
input_shape = (image_size, image_size, 3)  # input image shape
learning_rate = 0.001
weight_decay = 0.0001
batch_size = 32
num_epochs = 100
# Patches per image: the image is tiled by patch_size x patch_size squares.
num_patches = (image_size // patch_size) ** 2
projection_dim = 64
num_heads = 4
# Dense sizes of the MLP inside each Transformer block.
transformer_units = [
    projection_dim * 2,
    projection_dim,
]
transformer_layers = 4
mlp_head_units = [2048, 1024, 512, 64, 32]  # Size of the dense layers

# --- Build and train --------------------------------------------------------
vit_object_detector = create_vit_object_detector(
    input_shape=input_shape,
    patch_size=patch_size,
    num_patches=num_patches,
    projection_dim=projection_dim,
    num_heads=num_heads,
    transformer_units=transformer_units,
    transformer_layers=transformer_layers,
    mlp_head_units=mlp_head_units,
)

# `history` is read as a module-level global by plot_history().
history = run_experiment(
    model=vit_object_detector,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    batch_size=batch_size,
    num_epochs=num_epochs,
)
def plot_history(item):
    """Plot the training and validation curves of ``item`` over the epochs.

    Reads the module-level ``history`` object and plots both
    ``history.history[item]`` and ``history.history["val_" + item]``.

    Args:
        item: Name of the logged quantity, e.g. ``"loss"``.
    """
    for series_name in (item, "val_" + item):
        plt.plot(history.history[series_name], label=series_name)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_history("loss")
| """ | |
| ## Evaluate the model | |
| """ | |
| import matplotlib.patches as patches | |
| # Save the trained model to "vit_object_detector.keras"; the path is relative, so the file lands in the current working directory. | |
| vit_object_detector.save("vit_object_detector.keras") | |
def bounding_box_intersection_over_union(box_predicted, box_truth):
    """Return the intersection over union (IoU) of two bounding boxes.

    Each box is a sequence ``(top_left_x, top_left_y, bottom_right_x,
    bottom_right_y)``. Widths and heights are measured inclusively
    (``bottom_right - top_left + 1``).

    Inverted or degenerate boxes (bottom-right before top-left) are treated
    as having zero area instead of a negative one, and ``0.0`` is returned
    when the union is empty, so the function never divides by zero.

    Args:
        box_predicted: Predicted box.
        box_truth: Ground-truth box.

    Returns:
        The IoU as a float.
    """

    def _extent(low, high):
        # Inclusive length of [low, high], clamped so it is never negative.
        return max(0, high - low + 1)

    # Area of the overlap rectangle (0 when the boxes do not overlap).
    intersection_area = _extent(
        max(box_predicted[0], box_truth[0]), min(box_predicted[2], box_truth[2])
    ) * _extent(
        max(box_predicted[1], box_truth[1]), min(box_predicted[3], box_truth[3])
    )

    # Areas of the predicted and ground-truth boxes.
    box_predicted_area = _extent(box_predicted[0], box_predicted[2]) * _extent(
        box_predicted[1], box_predicted[3]
    )
    box_truth_area = _extent(box_truth[0], box_truth[2]) * _extent(
        box_truth[1], box_truth[3]
    )

    # Union = sum of both areas minus the intersection counted twice.
    union_area = box_predicted_area + box_truth_area - intersection_area
    if union_area <= 0:
        return 0.0
    return intersection_area / float(union_area)
def _draw_box(ax, box, label):
    """Outline ``box`` (x1, y1, x2, y2) in red on ``ax`` and set its x-label."""
    x1, y1, x2, y2 = box
    ax.add_patch(
        patches.Rectangle(
            (x1, y1),
            x2 - x1,
            y2 - y1,
            facecolor="none",
            edgecolor="red",
            linewidth=1,
        )
    )
    ax.set_xlabel(label)


mean_iou = 0
# Compare results for 10 images in the test set
for input_image, target in zip(x_test[:10], y_test[:10]):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 15))
    im = input_image

    # Display the image twice: left for the prediction, right for the target.
    ax1.imshow(im.astype("uint8"))
    ax2.imshow(im.astype("uint8"))

    # Bring the image to the model's input size and add the batch axis.
    input_image = cv2.resize(
        input_image, (image_size, image_size), interpolation=cv2.INTER_AREA
    )
    input_image = np.expand_dims(input_image, axis=0)
    preds = vit_object_detector.predict(input_image)[0]

    # Both boxes are relative coordinates; scale them to pixels of `im`.
    (h, w) = im.shape[0:2]
    box_predicted = [
        int(preds[0] * w),
        int(preds[1] * h),
        int(preds[2] * w),
        int(preds[3] * h),
    ]
    box_truth = (
        int(target[0] * w),
        int(target[1] * h),
        int(target[2] * w),
        int(target[3] * h),
    )

    # Compute the IoU once and reuse it for the running sum and the label.
    iou = bounding_box_intersection_over_union(box_predicted, box_truth)
    mean_iou += iou

    _draw_box(ax1, box_predicted, "Predicted: " + ", ".join(map(str, box_predicted)))
    _draw_box(
        ax2,
        box_truth,
        "Target: " + ", ".join(map(str, box_truth)) + "\n" + "IoU: " + str(iou),
    )

print("mean_iou: " + str(mean_iou / len(x_test[:10])))
plt.show()
| """ | |
| This example demonstrates that a pure Transformer can be trained | |
| to predict the bounding boxes of an object in a given image, | |
| thus extending the use of Transformers to object detection tasks. | |
| The model can be improved further by tuning hyper-parameters and pre-training. | |
| """ | |