Image Classification
Files changed (2)
  1. main.py +120 -0
  2. model.py +78 -0
main.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
from model import MiniViT

# Standard transformation: convert PIL images to float tensors in [0, 1].
# CIFAR-10 images come out as 3x32x32 tensors.
transform = transforms.Compose([transforms.ToTensor()])


def get_dataloader(train, shuffle, root='./data', batch_size=4):
    """Download (if needed) one CIFAR-10 split and wrap it in a DataLoader.

    Args:
        train: True for the 50k training split, False for the 10k test split.
        shuffle: whether the DataLoader shuffles each epoch.
        root: directory where the dataset is stored/downloaded.
        batch_size: mini-batch size.

    Returns:
        A torch.utils.data.DataLoader over the requested split.
    """
    dataset = torchvision.datasets.CIFAR10(root=root,
                                           train=train,
                                           download=True,
                                           transform=transform)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       shuffle=shuffle)


def inspect_first_sample(loader):
    """Print the shape and label of the first image of the first batch."""
    images, labels = next(iter(loader))
    print("----Data Inspection---")
    print(f"Image shape: {images[0].shape}")
    print(f"Label : {labels[0].item()}")


def train(model, trainloader, num_epochs=20, lr=0.001):
    """Train *model* in place with Adam + cross-entropy.

    Prints the running loss every 2000 mini-batches.

    Args:
        model: the nn.Module to optimize.
        trainloader: DataLoader yielding (inputs, labels) batches.
        num_epochs: number of full passes over the training data.
        lr: Adam learning rate.
    """
    # CrossEntropyLoss is the standard choice for multi-class classification.
    criterion = nn.CrossEntropyLoss()
    # Adam tunes model.parameters() with learning rate lr.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Make sure dropout/normalization layers are in training mode
    # (matters if evaluate() was called earlier on the same model).
    model.train()

    print("\n--- Starting Training ---")
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            # The 5 core steps of a training iteration:
            optimizer.zero_grad()               # 1. clear stale gradients
            outputs = model(inputs)             # 2. forward pass
            loss = criterion(outputs, labels)   # 3. measure the error
            loss.backward()                     # 4. compute gradients
            optimizer.step()                    # 5. update the weights

            running_loss += loss.item()
            if i % 2000 == 1999:  # report every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0
    print('--- Finished Training ---')


def evaluate(model, testloader):
    """Return the top-1 accuracy (%) of *model* over *testloader*."""
    print("\n--- Starting Evaluation ---")
    # Evaluation mode disables dropout etc.
    model.eval()

    correct = 0
    total = 0
    # No gradients are needed for evaluation: saves memory and compute.
    with torch.no_grad():
        for images, labels in testloader:
            outputs = model(images)
            # The predicted class is the index of the highest logit.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total


if __name__ == "__main__":
    # Guarding the script body keeps importing this module side-effect free.
    trainloader = get_dataloader(train=True, shuffle=True)
    inspect_first_sample(trainloader)

    model = MiniViT()
    # Train for 20 full cycles through the data.
    train(model, trainloader, num_epochs=20)

    # IMPORTANT: use the held-out test split, unshuffled, for evaluation.
    testloader = get_dataloader(train=False, shuffle=False)
    accuracy = evaluate(model, testloader)
    print(f'Accuracy of the network on the 10000 test images: {accuracy:.2f} %')
model.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Add this import to the top of your file
2
+ import torch
3
+ from torch import nn
4
+
5
+
6
+ # --- MODEL ARCHITECTURE ---
7
+ class MiniViT(nn.Module):
8
+ def __init__(self, patch_size=4, hidden_dim=128, num_heads=4, num_layers=2, num_classes=10):
9
+ super().__init__()
10
+
11
+ # --- 1. Patching and Embedding ---
12
+ self.patch_size = patch_size
13
+ # An image is 32x32 with 3 color channels.
14
+ # Patch dimension is 4 * 4 * 3 = 48
15
+ patch_dim = 3 * patch_size * patch_size
16
+ num_patches = (32 // patch_size) ** 2
17
+
18
+ # This layer projects the flattened patches into the hidden_dim
19
+ self.patch_embedding = nn.Linear(patch_dim, hidden_dim)
20
+
21
+ # --- 2. CLS Token and Positional Embedding ---
22
+ # A special token that will be used for classification
23
+ self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))
24
+
25
+ # A learnable embedding to give the model spatial information
26
+ self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, hidden_dim))
27
+
28
+ # --- 3. Transformer Encoder ---
29
+ # This is the main workhorse of the model
30
+ encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, batch_first=True)
31
+ self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
32
+
33
+ # --- 4. Classifier Head ---
34
+ # This takes the processed CLS token and makes the final prediction
35
+ self.classifier = nn.Linear(hidden_dim, num_classes)
36
+
37
+ def forward(self, x):
38
+ # x has shape [batch_size, 3, 32, 32]
39
+
40
+ # 1. Patching
41
+ # Reshape the image into a sequence of flattened patches
42
+ patches = x.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size)
43
+ patches = patches.contiguous().view(x.size(0), -1, 3 * self.patch_size * self.patch_size)
44
+ # Patches now have shape [batch_size, num_patches, patch_dim]
45
+
46
+ # 2. Embedding
47
+ # Project patches to the hidden dimension
48
+ x = self.patch_embedding(patches) # [batch_size, num_patches, hidden_dim]
49
+
50
+ # 3. Prepend CLS token and add Positional Embedding
51
+ # Expand CLS token for the whole batch and add it to the front
52
+ cls_tokens = self.cls_token.expand(x.size(0), -1, -1)
53
+ x = torch.cat((cls_tokens, x), dim=1) # [batch_size, num_patches + 1, hidden_dim]
54
+
55
+ # Add the positional information
56
+ x = x + self.pos_embedding
57
+
58
+ # 4. Pass through Transformer Encoder
59
+ x = self.transformer_encoder(x) # [batch_size, num_patches + 1, hidden_dim]
60
+
61
+ # 5. Get the CLS token output and classify
62
+ cls_output = x[:, 0] # Get the output of the first token (CLS)
63
+ output = self.classifier(cls_output)
64
+
65
+ return output
66
+
67
+
68
# --- Demo / smoke test ---
# Guarded with __main__ so that importing this module (e.g. main.py's
# `from model import MiniViT`) does NOT build a model, print, or run a
# forward pass as an import side effect.
if __name__ == "__main__":
    model = MiniViT()
    print("\n--- Model Architecture ---")
    print(model)

    # Sanity-check the forward pass with a single random image.
    dummy_image = torch.randn(1, 3, 32, 32)
    prediction = model(dummy_image)
    print("\n--- Dummy Prediction Test ---")
    print(f"Output shape: {prediction.shape}")  # Should be [1, 10]