# First_ViT_APP / app.py
# (Hugging Face Spaces page header — author hp1318, commit a34c6f1 "Update app.py" —
# was captured along with the source; kept here as a comment so the file is valid Python.)
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
import gradio as gr
# CIFAR-10 label names, indexed by the integer class id the model predicts.
classes = (
    "airplane automobile bird cat deer "
    "dog frog horse ship truck"
).split()
class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and linearly embed each.

    Uses the standard ViT trick: a Conv2d whose kernel_size equals its
    stride projects every patch_size x patch_size patch to an embed_dim
    vector in one convolution pass.
    """

    def __init__(self, in_channels=3, patch_size=4, embed_dim=64):
        super().__init__()
        # Attribute name `proj` is part of the checkpoint's state-dict keys.
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        # (B, C, H, W) -> (B, embed_dim, H/p, W/p)
        patch_grid = self.proj(x)
        # Collapse the spatial grid and move channels last:
        # (B, embed_dim, N) -> (B, N, embed_dim)
        return patch_grid.flatten(2).transpose(1, 2)
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention over a batch-first token sequence.

    Improvement: the original permuted (B, N, E) -> (N, B, E) before the
    attention call and back afterwards because nn.MultiheadAttention
    defaults to seq-first layout. Passing batch_first=True makes both
    permutes unnecessary. batch_first changes only the expected tensor
    layout, not the module's parameters, so state-dict keys — and hence
    the saved checkpoint — are unaffected.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Attribute name `attention` is part of the checkpoint's state-dict keys.
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

    def forward(self, x):
        # Self-attention: query = key = value = x, all shaped (B, N, E).
        attn_output, _ = self.attention(x, x, x)
        return attn_output
class ViT(nn.Module):
    """Minimal Vision Transformer classifier for CIFAR-10.

    Pipeline: patch-embed the image, run a stack of residual self-attention
    layers, mean-pool the patch tokens, and apply a linear classifier.

    NOTE(review): this is a deliberately stripped-down ViT — no positional
    embedding, CLS token, LayerNorm, or MLP sub-block. Adding them would
    change the state-dict and invalidate the saved checkpoint, so the
    architecture is left exactly as trained.
    """

    def __init__(self, num_classes=10, embed_dim=64, num_heads=4, num_layers=2):
        super().__init__()
        # Attribute names below are part of the checkpoint's state-dict keys.
        self.patch_embed = PatchEmbedding(embed_dim=embed_dim)
        self.transformer_layers = nn.ModuleList(
            MultiHeadSelfAttention(embed_dim, num_heads)
            for _ in range(num_layers)
        )
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        tokens = self.patch_embed(x)           # (B, N, E)
        for block in self.transformer_layers:
            tokens = block(tokens) + tokens    # residual connection
        pooled = tokens.mean(dim=1)            # average over patch tokens
        return self.classifier(pooled)         # (B, num_classes)
# Preprocessing must mirror training: resize to CIFAR's 32x32, convert to a
# tensor in [0, 1], then scale each channel to [-1, 1] via mean=std=0.5.
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Load the trained weights for CPU-only inference.
model = ViT()
state_dict = torch.load('model.pth', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)
model.eval()  # inference mode: fixed dropout/normalization statistics
def predict(image):
    """Classify a PIL image into one of the 10 CIFAR-10 classes.

    Args:
        image: PIL.Image.Image supplied by the Gradio image widget.

    Returns:
        The predicted class name as a string (e.g. 'cat').
    """
    # Robustness fix: Gradio's type="pil" can deliver RGBA (PNG with alpha)
    # or grayscale images; Normalize above expects exactly 3 channels, so
    # force an RGB conversion before the transform pipeline.
    image = image.convert('RGB')
    batch = transform(image).unsqueeze(0)  # add batch dim: (1, 3, 32, 32)
    with torch.no_grad():
        output = model(batch)
        _, predicted = torch.max(output, 1)  # index of the top logit
    return classes[predicted.item()]
# Wire the classifier into a Gradio UI: one image in, one label out.
image_input = gr.Image(type="pil")
interface = gr.Interface(
    fn=predict,
    inputs=image_input,
    outputs="label",
    title="CIFAR-10 Image Classification with ViT",
)
interface.launch()