Spaces:

hp1318
/

First_ViT_APP

Sleeping

App Files Files Community

hp1318 committed on Oct 14, 2024

Commit

a34c6f1

verified ·

1 Parent(s): 98b596e

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -12

app.py CHANGED Viewed

@@ -1,41 +1,77 @@
 import torch
-import torchvision.models as models  # Replace with your ViT model if needed
 import torchvision.transforms as transforms
 from PIL import Image
 import gradio as gr
-# CIFAR-10 class names
 classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
            'dog', 'frog', 'horse', 'ship', 'truck']
-# Define the model architecture (replace with your ViT if needed)
-model = models.resnet18(num_classes=10)  # Use your custom model here
-# Load the model weights
 model.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu')))
-model.eval()  # Set the model to evaluation mode
-# Define image transformations
 transform = transforms.Compose([
     transforms.Resize((32, 32)),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
 ])
-# Define the prediction function
 def predict(image):
-    image = transform(image).unsqueeze(0)  # Add batch dimension
     with torch.no_grad():
         output = model(image)
         _, predicted = torch.max(output, 1)
         return classes[predicted.item()]
-# Create Gradio interface
 interface = gr.Interface(fn=predict,
                          inputs=gr.Image(type="pil"),
                          outputs="label",
-                         title="CIFAR-10 Image Classification")
-# Launch the app
 interface.launch()

 import torch
+import torch.nn as nn
 import torchvision.transforms as transforms
 from PIL import Image
 import gradio as gr
 classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
            'dog', 'frog', 'horse', 'ship', 'truck']
 # Converts an image batch into a sequence of patch embeddings. A convolution
 # whose kernel size equals its stride projects each non-overlapping
 # patch_size x patch_size patch to an embed_dim-dimensional vector.
+class PatchEmbedding(nn.Module):
     # in_channels: number of channels in the input image (default 3).
     # patch_size:  side length of each square patch; used as both the kernel
     #              size and the stride of the convolution.
     # embed_dim:   number of output channels, i.e. the length of each patch vector.
+    def __init__(self, in_channels=3, patch_size=4, embed_dim=64):
+        super().__init__()
+        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
     # assumes x is (batch, in_channels, H, W) -- predict() supplies (1, 3, 32, 32);
     # confirm before reusing with other inputs.
     # Returns a tensor of shape (batch, num_patches, embed_dim).
+    def forward(self, x):
+        x = self.proj(x)  # -> (batch, embed_dim, H // patch_size, W // patch_size)
+        x = x.flatten(2).transpose(1, 2)  # merge the spatial grid, then put the patch axis before the embedding axis
+        return x
 # Self-attention wrapper around nn.MultiheadAttention that accepts and
 # returns batch-first tensors.
+class MultiHeadSelfAttention(nn.Module):
     # embed_dim and num_heads are passed straight to nn.MultiheadAttention.
     # batch_first is not set, so the wrapped module expects (seq, batch, embed) input.
+    def __init__(self, embed_dim, num_heads):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
     # assumes x is (batch, seq, embed_dim), which is what ViT.forward passes in.
     # Returns the attention output in the same (batch, seq, embed_dim) layout;
     # the attention weights are discarded.
+    def forward(self, x):
+        x = x.permute(1, 0, 2)  # batch-first -> sequence-first for nn.MultiheadAttention
+        attn_output, _ = self.attention(x, x, x)  # query = key = value: self-attention
+        return attn_output.permute(1, 0, 2)  # back to batch-first
 # Small attention-based image classifier: patch embedding, a stack of
 # self-attention layers with residual connections, mean pooling over the
 # patches, and a linear classification head.
 # NOTE(review): as written there is no positional embedding, class token,
 # LayerNorm or feed-forward sub-layer. The attribute names below form the
 # state_dict keys that load_state_dict() matches against model.pth, so
 # renaming or restructuring them would presumably break checkpoint loading.
+class ViT(nn.Module):
     # num_classes: size of the output logits vector (default 10).
     # embed_dim:   patch embedding width, shared by every layer.
     # num_heads:   attention heads per layer.
     # num_layers:  number of MultiHeadSelfAttention layers in the stack.
+    def __init__(self, num_classes=10, embed_dim=64, num_heads=4, num_layers=2):
+        super().__init__()
+        self.patch_embed = PatchEmbedding(embed_dim=embed_dim)
+        self.transformer_layers = nn.ModuleList([
+            MultiHeadSelfAttention(embed_dim, num_heads) for _ in range(num_layers)
+        ])
+        self.classifier = nn.Linear(embed_dim, num_classes)
     # assumes x is an image batch of shape (batch, 3, H, W) -- TODO confirm for
     # callers other than predict().
     # Returns unnormalised class scores of shape (batch, num_classes).
+    def forward(self, x):
+        x = self.patch_embed(x)
+        for layer in self.transformer_layers:
+            x = layer(x) + x  # residual connection around each attention layer
+        x = x.mean(dim=1)  # average over the patch axis -> (batch, embed_dim)
+        return self.classifier(x)
 # Build the network with its default hyper-parameters and restore the trained
 # weights from model.pth, mapping every tensor onto the CPU.
 # NOTE(review): torch.load unpickles the file; only load checkpoints from a
 # trusted source.
+model = ViT()
 model.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu')))
+model.eval()  # inference mode (disables dropout-style training behaviour)
 # Preprocessing applied to every input image: resize to 32x32, convert to a
 # float tensor in [0, 1], then shift/scale each of the 3 channels to [-1, 1].
 # presumably this mirrors the preprocessing used at training time -- confirm.
 transform = transforms.Compose([
     transforms.Resize((32, 32)),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
 ])
 # Classify one image and return the predicted class name.
 # image: the value supplied by the gr.Image(type="pil") input of the interface,
 #        i.e. a PIL image.
 # Returns the entry of `classes` with the highest score.
 # NOTE(review): assumes a 3-channel image (Normalize and PatchEmbedding are
 # both set up for 3 channels) and a non-None input -- confirm what Gradio
 # passes when the form is submitted without an image.
 def predict(image):
+    image = transform(image).unsqueeze(0)  # preprocess, then add a leading batch dimension
     with torch.no_grad():  # no gradients are needed for inference
         output = model(image)
         _, predicted = torch.max(output, 1)  # keep the index of the max score along dim 1, drop the value
         return classes[predicted.item()]
 # Gradio UI: one PIL image input wired to predict(), with the returned class
 # name shown in a label output.
 interface = gr.Interface(fn=predict,
                          inputs=gr.Image(type="pil"),
                          outputs="label",
+                         title="CIFAR-10 Image Classification with ViT")
 # Runs at module level, so the app starts as soon as this file is executed.
 interface.launch()