import torch
import torch.nn as nn
from torchvision import transforms
import gradio as gr
from PIL import Image

# Configuration
cfg = {
    "image_size": 32,
    "patch_size": 4,
    "in_channels": 3,
    "num_classes": 100,
    "emb_dim": 192,
    "num_heads": 6,
    "depth": 6,
    "mlp_ratio": 4.0,
    "drop": 0.1,
    "drop_path": 0.1
}
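
# With these settings, ConvPatchEmbed below produces a 16x16 grid of tokens
# (32 / 2 = 16 per side), so the transformer sees 1 + 256 tokens of width 192.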

# CIFAR-100 class names
classes = [
    'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 
    'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel', 
    'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock', 
    'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur', 
    'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 
    'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion', 
    'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse', 
    'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', 
    'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine', 
    'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea', 
    'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 
    'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 
    'telephone', 'television', 'tiger', 'tractor', 'train', 'trout', 'tulip', 
    'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm'
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ConvPatchEmbed(nn.Module):
    def __init__(self, img_size=32, in_chans=3, embed_dim=192):
        super().__init__()
        # 32x32 -> 32x32 -> 16x16 -> 16x16
        self.proj = nn.Sequential(
            nn.Conv2d(in_chans, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1, bias=False),  # 32 -> 16
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),

            nn.Conv2d(128, embed_dim, kernel_size=3, stride=1, padding=1, bias=False),  # stays 16x16
            nn.BatchNorm2d(embed_dim),
            nn.ReLU(inplace=True),
        )

        grid_size = (img_size // 2, img_size // 2)  # (16,16)
        self.grid_size = grid_size
        self.num_patches = grid_size[0] * grid_size[1]

    def forward(self, x):
        x = self.proj(x)                  # (B, E, H=16, W=16)
        x = x.flatten(2).transpose(1, 2)  # (B, N=H*W, E)
        return x
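
# Shape check (illustrative, not executed by the app): a (B, 3, 32, 32) batch
# becomes a (B, 256, 192) token sequence, i.e. a 16x16 grid of patch embeddings:
#   pe = ConvPatchEmbed()
#   pe(torch.randn(2, 3, 32, 32)).shape  # torch.Size([2, 256, 192])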

class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, drop=0.):
        super().__init__()
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=True, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        # (B, N, 3C) -> (3, B, heads, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
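
# Sanity check (illustrative): self-attention preserves the (B, N, C) token shape,
# here with 1 cls token + 256 patch tokens:
#   attn = Attention(dim=192, num_heads=6)
#   attn(torch.randn(2, 257, 192)).shape  # torch.Size([2, 257, 192])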

# Simple Stochastic Depth
class StochasticDepth(nn.Module):
    def __init__(self, p):
        super().__init__()
        self.p = float(p)

    def forward(self, x):
        if not self.training or self.p == 0.0:
            return x
        keep_prob = 1.0 - self.p
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()
        return x / keep_prob * random_tensor
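
# Behavior note (illustrative): in training mode, each sample's residual branch
# is dropped with probability p and survivors are scaled by 1/(1-p), so the
# expectation matches eval mode, where the module acts as an identity:
#   sd = StochasticDepth(0.5); sd.train()
#   sd(torch.ones(4, 1))  # each row is either 0.0 or 2.0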

class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., drop=0., attn_drop=0., drop_path=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads=num_heads, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = StochasticDepth(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, int(dim * mlp_ratio), drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

class ViT(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        img_size = cfg["image_size"]

        self.patch_embed = ConvPatchEmbed(
            img_size=img_size,
            in_chans=cfg["in_channels"],
            embed_dim=cfg["emb_dim"]
        )
        n_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, cfg["emb_dim"]))
        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + n_patches, cfg["emb_dim"]))
        self.pos_drop = nn.Dropout(p=cfg["drop"])

        # stochastic depth decay rule
        dpr = torch.linspace(0, cfg["drop_path"], cfg["depth"]).tolist()
        self.blocks = nn.ModuleList([
            Block(
                dim=cfg["emb_dim"],
                num_heads=cfg["num_heads"],
                mlp_ratio=cfg["mlp_ratio"],
                drop=cfg["drop"],
                drop_path=dpr[i]
            )
            for i in range(cfg["depth"])
        ])
        self.norm = nn.LayerNorm(cfg["emb_dim"])
        self.head = nn.Linear(cfg["emb_dim"], cfg["num_classes"])

        # init
        nn.init.trunc_normal_(self.pos_embed, std=.02)
        nn.init.trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            if getattr(m, "bias", None) is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)                   # (B, N, E)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)     # (B, 1+N, E)
        x = x + self.pos_embed
        x = self.pos_drop(x)

        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)
        cls = x[:, 0]
        out = self.head(cls)
        return out
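
# End-to-end check (illustrative): the model maps a batch of images to
# (B, num_classes) logits:
#   model = ViT(cfg)
#   model(torch.randn(2, 3, 32, 32)).shape  # torch.Size([2, 100])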


# Load the trained weights; the checkpoint stores the state dict under "model_state"
checkpoint = torch.load("Revised_best_ViT_CIFAR100_baseline_checkpoint.pth", map_location=device)

model = ViT(cfg).to(device)
model.load_state_dict(checkpoint["model_state"])
model.eval()


# Image preprocessing
transform = transforms.Compose([
    transforms.Resize((32,32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))  # CIFAR-100 stats
])

def predict(img: Image.Image):
    img = img.convert("RGB")  # guard against grayscale/RGBA uploads
    img_t = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        out = model(img_t)
        probs = torch.softmax(out, dim=1)[0]
        top5 = probs.topk(5)
    return {classes[int(i)]: float(probs[i]) for i in top5.indices}
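
# Example usage (assumed local file): predict(Image.open("example.jpg"))
# returns a dict such as {'apple': 0.73, 'pear': 0.11, ...} mapping the top-5
# class names to probabilities, which gr.Label renders directly.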

# Gradio interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="pil"),
    outputs=gr.Label(num_top_classes=5, label="Top-5 Predictions"),
    title="Hybrid ViT+CNN CIFAR-100 Classifier",
    description="Upload a 32x32 image, and the model predicts the CIFAR-100 class.",
    examples=["_20230926_on_kangaroos.jpg",
             "complex-aerial-view-city.jpg",
             "apples-101-about-1440x810.webp",
             "detect(1).jpg",
             "Arabian-dromedary-camel-calf.webp",
             "1_9527341a-93b9-4566-9eb3-3bfe92cfed5f.webp"]
)

iface.launch()