Create app.py
app.py ADDED
import torch
import torch.nn as nn
from torchvision import transforms
import gradio as gr
from PIL import Image

# ------------------------
# Configuration (must match your trained model)
cfg = {
    "image_size": 32,
    "patch_size": 4,
    "in_channels": 3,
    "num_classes": 100,
    "emb_dim": 192,
    "num_heads": 6,
    "depth": 8,
    "mlp_ratio": 4.0,
    "drop": 0.1,
}

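# Note: emb_dim must be divisible by num_heads (here 192 / 6 = 32 dims per
# head), since Attention below splits the embedding evenly across heads.
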
# CIFAR-100 class names
classes = [
    'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle',
    'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel',
    'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock',
    'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur',
    'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster',
    'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion',
    'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse',
    'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear',
    'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine',
    'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea',
    'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider',
    'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank',
    'telephone', 'television', 'tiger', 'tractor', 'train', 'trout', 'tulip',
    'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm'
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ------------------------
# Model definition
class PatchEmbed(nn.Module):
    def __init__(self, img_size=32, patch_size=4, in_chans=3, embed_dim=192):
        super().__init__()
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)
        x = x.flatten(2).transpose(1, 2)
        return x

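# Shape walk-through with the config above: a (B, 3, 32, 32) image becomes
# (B, 192, 8, 8) after the strided conv, then (B, 64, 192) after
# flatten/transpose, i.e. 64 patch tokens of dimension 192.
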
class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, drop=0.):
        super().__init__()
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

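# The qkv projection packs queries, keys, and values into a single matmul:
# the reshape/permute yields q, k, v each of shape (B, num_heads, N, head_dim),
# and attention weights of shape (B, num_heads, N, N) before heads are
# re-merged into (B, N, C).
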
class _StochasticDepth(nn.Module):
    def __init__(self, p):
        super().__init__()
        self.p = p

    def forward(self, x):
        if not self.training or self.p == 0.:
            return x
        keep = torch.rand(x.shape[0], 1, 1, device=x.device) >= self.p
        return x * keep / (1 - self.p)

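# Stochastic depth (Huang et al., 2016): during training, each sample's
# residual branch is dropped with probability p; dividing survivors by
# (1 - p) keeps the expected activation equal to the inference-time value.
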
class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., drop=0., drop_path=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads=num_heads, attn_drop=drop, proj_drop=drop)
        self.drop_path = nn.Identity() if drop_path == 0. else _StochasticDepth(drop_path)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, int(dim * mlp_ratio), drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

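# This is a standard pre-norm transformer block: LayerNorm is applied before
# attention and MLP, with a residual connection around each sublayer.
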
class ViT(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.patch_embed = PatchEmbed(cfg["image_size"], cfg["patch_size"], cfg["in_channels"], cfg["emb_dim"])
        n_patches = self.patch_embed.n_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, cfg["emb_dim"]))
        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + n_patches, cfg["emb_dim"]))
        self.pos_drop = nn.Dropout(p=cfg["drop"])
        # Linearly increasing drop-path rates across depth, from 0 to 0.1
        dpr = [x.item() for x in torch.linspace(0, 0.1, cfg["depth"])]
        self.blocks = nn.ModuleList([
            Block(cfg["emb_dim"], cfg["num_heads"], cfg["mlp_ratio"], cfg["drop"], dpr[i])
            for i in range(cfg["depth"])
        ])
        self.norm = nn.LayerNorm(cfg["emb_dim"])
        self.head = nn.Linear(cfg["emb_dim"], cfg["num_classes"])
        nn.init.trunc_normal_(self.pos_embed, std=.02)
        nn.init.trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)

    def forward(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        cls = x[:, 0]
        out = self.head(cls)
        return out

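# With the config above, the token sequence has length 65 (1 CLS token +
# 64 patches), so pos_embed has shape (1, 65, 192) and the head maps the
# final CLS embedding to 100 class logits.
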
# ------------------------
# Load model weights
model = ViT(cfg).to(device)
model.load_state_dict(torch.load("best_vit_cifar100.pt", map_location=device))
model.eval()

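# Note: newer PyTorch releases also support torch.load(..., weights_only=True),
# which is safer when deserializing a bare state_dict like the checkpoint above.
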
# ------------------------
# Image preprocessing
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),  # CIFAR-100 stats
])

def predict(img: Image.Image):
    img = img.convert("RGB")  # guard against RGBA/grayscale uploads
    x = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        out = model(x)
    pred = out.argmax(1).item()
    return classes[pred]

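# gr.Label also accepts a dict of class confidences, so a variant of predict
# that surfaces the top-1 softmax probability could look like the sketch
# below (not wired into the interface):
def predict_with_confidence(img: Image.Image):
    x = transform(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        probs = model(x).softmax(dim=1)[0]
    top = probs.argmax().item()
    return {classes[top]: probs[top].item()}
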
# ------------------------
# Gradio interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="pil"),
    outputs=gr.Label(num_top_classes=1),
    title="ViT CIFAR-100 Classifier",
    description="Upload an image; it is resized to 32x32 and classified into one of the 100 CIFAR-100 classes."
)

iface.launch()