done, maybe
Browse files- __pycache__/model.cpython-310.pyc +0 -0
- app.py +18 -3
- custom_transformer/__pycache__/embedding.cpython-310.pyc +0 -0
- custom_transformer/__pycache__/encoder.cpython-310.pyc +0 -0
- custom_transformer/__pycache__/vit.cpython-310.pyc +0 -0
- custom_transformer/embedding.py +82 -0
- custom_transformer/encoder.py +97 -0
- custom_transformer/vit.py +43 -0
- examples/angular_leaf_spot_example.jpg +0 -0
- examples/bean_rust_example.jpg +0 -0
- examples/healthy_example.jpg +0 -0
- main.py +0 -4
__pycache__/model.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/model.cpython-310.pyc and b/__pycache__/model.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
from PIL import Image
|
| 3 |
-
|
| 4 |
|
| 5 |
import torch
|
| 6 |
|
| 7 |
from model import ClassifierModel
|
| 8 |
|
|
|
|
|
|
|
| 9 |
class GradioApp:
|
| 10 |
|
| 11 |
def __init__(self) -> None:
|
|
@@ -19,6 +21,7 @@ class GradioApp:
|
|
| 19 |
|
| 20 |
def predict(self, img_file: str, model_name: str) -> Dict[str, float]:
|
| 21 |
|
|
|
|
| 22 |
if isinstance(self.models[model_name], str):
|
| 23 |
self.models[model_name] = torch.load(self.models[model_name], map_location='cpu')
|
| 24 |
|
|
@@ -29,10 +32,22 @@ class GradioApp:
|
|
| 29 |
|
| 30 |
def launch(self):
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
demo = gr.Interface(
|
| 33 |
fn=self.predict,
|
| 34 |
-
inputs=[
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
)
|
| 37 |
demo.launch()
|
| 38 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from PIL import Image
|
| 3 |
+
import os
|
| 4 |
|
| 5 |
import torch
|
| 6 |
|
| 7 |
from model import ClassifierModel
|
| 8 |
|
| 9 |
+
from typing import List, Dict, Union
|
| 10 |
+
|
| 11 |
class GradioApp:
|
| 12 |
|
| 13 |
def __init__(self) -> None:
|
|
|
|
| 21 |
|
| 22 |
def predict(self, img_file: str, model_name: str) -> Dict[str, float]:
|
| 23 |
|
| 24 |
+
# Lazy loading of models
|
| 25 |
if isinstance(self.models[model_name], str):
|
| 26 |
self.models[model_name] = torch.load(self.models[model_name], map_location='cpu')
|
| 27 |
|
|
|
|
| 32 |
|
| 33 |
def launch(self):
    """Build and start the Gradio demo for the leaf-disease classifier.

    Wires ``self.predict`` to an image-upload + model-choice input, a
    top-3 label output, and the example images bundled under ``examples/``.
    """
    dataset_url = 'https://www.kaggle.com/datasets/marquis03/bean-leaf-lesions-classification/data'
    github_repo_url = 'https://github.com/i4ata/TransformerClassification'
    # One example row per file in examples/ — assumes the directory exists
    # relative to the current working directory; TODO confirm when deployed
    examples_list = [['examples/' + example] for example in os.listdir('examples')]

    demo = gr.Interface(
        fn=self.predict,
        inputs=[
            gr.Image(type='filepath', label='Input image to classify'),
            gr.Radio(choices=('Custom', 'Pretrained'), label='Available models')
        ],
        outputs=gr.Label(num_top_classes=3, label='Model predictions'),
        title='Plants Diseases Classification',
        description=f'This model performs classification on images of leaves that are either healthy, \
have bean rust, or have an angular leaf spot. A vision transformer neural network architecture is used. \
The dataset can be downloaded from [Kaggle]({dataset_url}) and the source code is on [GitHub]({github_repo_url}).',
        examples=examples_list
    )
    demo.launch()
|
| 53 |
|
custom_transformer/__pycache__/embedding.cpython-310.pyc
ADDED
|
Binary file (3.07 kB). View file
|
|
|
custom_transformer/__pycache__/encoder.cpython-310.pyc
ADDED
|
Binary file (4.55 kB). View file
|
|
|
custom_transformer/__pycache__/vit.cpython-310.pyc
ADDED
|
Binary file (1.87 kB). View file
|
|
|
custom_transformer/embedding.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
import math
|
| 5 |
+
|
| 6 |
+
# Use that for fancy colored prints
|
| 7 |
+
from termcolor import colored
|
| 8 |
+
|
| 9 |
+
DEBUG = False

class PatchEmbedding(nn.Module):
    """Turn an image into a sequence of linearly-embedded patches.

    A single Conv2d whose kernel size equals its stride performs the
    per-patch linear projection in one pass over the image.
    """

    def __init__(self, in_channels: int = 3, embedding_dim: int = 768, patch_size: int = 16) -> None:
        super().__init__()

        # Strided convolution == linear projection of each non-overlapping patch
        self.linear_projection = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embedding_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

        # Merge the two spatial axes into a single "patch" axis
        self.flatten = nn.Flatten(start_dim=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map [batch, channels, H, W] to [batch, n_patches, embedding_dim]."""
        if DEBUG: print(f'Patch embedding input shape: {x.shape} [batch_size, in_channels, image_height, image_width]')

        # Project every patch at once with the strided convolution
        x = self.linear_projection(x)
        if DEBUG: print(f'Linearly projected input: {x.shape} [batch_size, embedding_dim, sqrt(n_patches), sqrt(n_patches)]')

        # Collapse the spatial grid into one patch dimension
        x = self.flatten(x)
        if DEBUG: print(f'Flattening of last 2 dimensions of linear projection: {x.shape} [batch_size, embedding_dim, n_patches]')

        # Put patches before embedding channels
        x = x.mT
        if DEBUG: print(f'Transpose last 2 dimensions: {x.shape} [batch_size, n_patches, embedding_dim]')

        return x
|
| 41 |
+
|
| 42 |
+
class Embedding(nn.Module):
    """Full ViT input embedding: patches + class token + positional embedding."""

    def __init__(self, image_size: int = 224, in_channels: int = 3, embedding_dim: int = 768, patch_size: int = 16) -> None:
        super().__init__()

        # The image area must be an exact multiple of the patch area
        assert (image_size * image_size) % (patch_size * patch_size) == 0

        self.n_patches = (image_size * image_size) // (patch_size * patch_size)
        if DEBUG: print(f'Total number of patches: {self.n_patches}, i.e. {int(math.sqrt(self.n_patches))} x {int(math.sqrt(self.n_patches))}')

        # Per-patch linear projection (defined above)
        self.patch_embedding = PatchEmbedding(in_channels=in_channels, embedding_dim=embedding_dim, patch_size=patch_size)

        # Learnable class token x0, broadcast over the batch in forward()
        self.class_token = nn.Parameter(torch.randn(1, 1, embedding_dim))

        # Learnable positional embedding, one slot per patch plus the class token
        self.position_embedding = nn.Parameter(torch.randn(1, self.n_patches + 1, embedding_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map [batch, channels, H, W] to [batch, n_patches + 1, embedding_dim]."""
        if DEBUG: print(f'Embedding input shape: {x.shape}: [batch_size, in_channels, height, width]')

        x = self.patch_embedding(x)
        if DEBUG: print(f'Patch embedding output: {x.shape}: [batch_size, n_patches, embedding_dim]')

        # Prepend one class token per batch element
        batch_token = self.class_token.expand(len(x), -1, -1)
        x = torch.cat((batch_token, x), dim=1)
        if DEBUG: print(f'Class token prepended: {x.shape}: [batch_size, n_patches + 1, embedding_dim]')

        x = x + self.position_embedding
        if DEBUG: print(f'Positional embedding added: {x.shape}: [batch_size, n_patches + 1, embedding_dim]')

        return x
|
| 76 |
+
|
| 77 |
+
if __name__ == '__main__':
    # Smoke test: push a random image batch through the embedding with debug prints on
    DEBUG = True
    batch = torch.rand(5, 3, 224, 224)
    module = Embedding()
    result = module(batch)
    print(result)
|
custom_transformer/encoder.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
DEBUG = False

class MultiHeadSelfAttention(nn.Module):
    """Multi-head (self-)attention over a patch sequence.

    The embedding dimension is split across `num_heads` heads; each head
    performs scaled dot-product attention independently, and the head
    outputs are concatenated and linearly projected back to `embedding_dim`.
    """

    def __init__(self, embedding_dim: int = 768, num_heads: int = 12) -> None:
        super().__init__()

        self.num_heads = num_heads
        # Per-head width; assumes embedding_dim is divisible by num_heads
        self.head_dim = embedding_dim // num_heads

        # Separate linear projections for queries, keys, values, and output
        self.q_w, self.k_w, self.v_w, self.out_w = (nn.Linear(embedding_dim, embedding_dim) for _ in range(4))

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        """Attend; q, k, v: [batch_size, n_patches, embedding_dim] -> same shape."""
        if DEBUG: print(f'MSA Input shape (Q, K, V): {q.shape}: [batch_size, n_patches, embedding_dim]')

        # Linear projections for Q, K, V, splitting the last dim into heads
        if DEBUG: print(f'Linear projection for Q, K, V: {q.shape} [batch_size, n_patches, embedding_dim]')
        q = self.q_w(q).view(*q.shape[:-1], self.num_heads, self.head_dim)
        k = self.k_w(k).view(*k.shape[:-1], self.num_heads, self.head_dim)
        # BUG FIX: values were projected with self.q_w — they must use self.v_w
        v = self.v_w(v).view(*v.shape[:-1], self.num_heads, self.head_dim)
        if DEBUG: print(f'Splitting the last dimension once for each head: {q.shape} [batch_size, n_patches, num_heads, head_dim]')

        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
        if DEBUG: print(f'Swap patches and head to have the head come first: {q.shape} [batch_size, num_heads, n_patches, head_dim]')

        # Scaled dot-product attention scores
        attention_scores = torch.matmul(q, k.mT) / (self.head_dim ** .5)
        if DEBUG: print(f'Compute attention scores for each head (scaled dot product): {attention_scores.shape} [batch_size, num_heads, n_patches, n_patches]')

        attention_weights = torch.softmax(attention_scores, dim=-1)
        if DEBUG: print(f'Softmax of attention scores: {attention_weights.shape} [batch_size, num_heads, n_patches, n_patches]')

        weighted_sum = torch.matmul(attention_weights, v)
        if DEBUG: print(f'Weighted sum of Values: {weighted_sum.shape} [batch_size, num_heads, n_patches, head_dim]')

        weighted_sum = weighted_sum.transpose(1, 2).contiguous()
        if DEBUG: print(f'Swap again the patches and the heads: {weighted_sum.shape} [batch_size, n_patches, num_heads, head_dim]')

        weighted_sum = weighted_sum.view(*weighted_sum.shape[:-2], -1)
        if DEBUG: print(f'Recover the original dimensions by merging the last 2: {weighted_sum.shape} [batch_size, n_patches, embedding_dim]')

        output = self.out_w(weighted_sum)
        if DEBUG: print(f'(Output) Linear projection of the weighted sum: {output.shape} [batch_size, n_patches, embedding_dim]')

        return output
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class MSABlock(nn.Module):
    """Pre-norm self-attention block: LayerNorm, then multi-head attention."""

    def __init__(self, embedding_dim: int = 768, num_heads: int = 12) -> None:
        super().__init__()
        self.msa = MultiHeadSelfAttention(embedding_dim=embedding_dim, num_heads=num_heads)
        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize first, then self-attend (query = key = value)
        normalized = self.layer_norm(x)
        return self.msa(normalized, normalized, normalized)
|
| 63 |
+
|
| 64 |
+
class MLPBlock(nn.Module):
    """Pre-norm feed-forward block: LayerNorm -> Linear -> GELU -> Linear."""

    def __init__(self, embedding_dim: int = 768, hidden_size: int = 3072) -> None:
        super().__init__()
        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
        # Expand to hidden_size, apply GELU, project back to embedding_dim
        self.mlp = nn.Sequential(
            nn.Linear(in_features=embedding_dim, out_features=hidden_size),
            nn.GELU(),
            nn.Linear(in_features=hidden_size, out_features=embedding_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normalized = self.layer_norm(x)
        return self.mlp(normalized)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class TransformerEncoderBlock(nn.Module):
    """One ViT encoder layer: residual MSA block followed by a residual MLP block."""

    def __init__(self, embedding_dim: int = 768, hidden_size: int = 3072, num_heads: int = 12) -> None:
        super().__init__()
        self.msa = MSABlock(embedding_dim=embedding_dim, num_heads=num_heads)
        self.mlp = MLPBlock(embedding_dim=embedding_dim, hidden_size=hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual (skip) connections around both sub-blocks
        x = x + self.msa(x)
        x = x + self.mlp(x)
        return x
|
| 90 |
+
|
| 91 |
+
if __name__ == '__main__':

    # Smoke test: run a random token batch through the attention module
    # with debug prints switched on
    DEBUG = True
    sample = torch.rand(5, 197, 768)
    attention = MultiHeadSelfAttention()
    result = attention(sample, sample, sample)
    print(result.shape)
|
custom_transformer/vit.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
import sys
|
| 5 |
+
sys.path.append('..')
|
| 6 |
+
from custom_transformer.embedding import Embedding
|
| 7 |
+
from custom_transformer.encoder import TransformerEncoderBlock
|
| 8 |
+
|
| 9 |
+
class ViT(nn.Module):
    """Vision Transformer classifier.

    Pipeline: patch + positional embedding -> stack of transformer encoder
    blocks -> classification head applied to the class token.
    """

    def __init__(self,
                 image_size: int = 224,
                 in_channels: int = 3,
                 patch_size: int = 16,
                 num_transformer_layers: int = 12,
                 embedding_dim: int = 768,
                 mlp_size: int = 3072,
                 num_heads: int = 12,
                 num_classes: int = 3) -> None:

        super().__init__()

        # Patch + class-token + positional embedding
        self.embedding = Embedding(image_size=image_size, in_channels=in_channels, embedding_dim=embedding_dim, patch_size=patch_size)

        # Stack of identical encoder layers applied in sequence
        encoder_layers = [
            TransformerEncoderBlock(embedding_dim=embedding_dim, hidden_size=mlp_size, num_heads=num_heads)
            for _ in range(num_transformer_layers)
        ]
        self.transformer_encoders = nn.Sequential(*encoder_layers)

        # Classification head on top of the class token
        self.classifier = nn.Sequential(
            nn.LayerNorm(normalized_shape=embedding_dim),
            nn.Linear(in_features=embedding_dim, out_features=num_classes),
        )

    def forward(self, x):
        tokens = self.embedding(x)
        encoded = self.transformer_encoders(tokens)
        # Only the class token (sequence position 0) feeds the classifier
        return self.classifier(encoded[:, 0])
|
| 39 |
+
|
| 40 |
+
if __name__ == '__main__':
    # Smoke test: classify a random batch of 500x500 images
    batch = torch.rand(5, 3, 500, 500)
    model = ViT(image_size=500, patch_size=50)
    print(model(batch).shape)
|
examples/angular_leaf_spot_example.jpg
ADDED
|
examples/bean_rust_example.jpg
ADDED
|
examples/healthy_example.jpg
ADDED
|
main.py
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
|
| 3 |
-
a = torch.load('models/pretrained_vit.pth', map_location='cpu')
|
| 4 |
-
print(a)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|