Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

config.json +49 -0
model.safetensors +3 -0
train.py +110 -0
vit_mnist.pth +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,49 @@

+{
+  "architectures": [
+    "ViTForImageClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "encoder_stride": 16,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 128,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6",
+    "7": "LABEL_7",
+    "8": "LABEL_8",
+    "9": "LABEL_9"
+  },
+  "image_size": 28,
+  "initializer_range": 0.02,
+  "intermediate_size": 256,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2,
+    "LABEL_3": 3,
+    "LABEL_4": 4,
+    "LABEL_5": 5,
+    "LABEL_6": 6,
+    "LABEL_7": 7,
+    "LABEL_8": 8,
+    "LABEL_9": 9
+  },
+  "layer_norm_eps": 1e-12,
+  "model_type": "vit",
+  "num_attention_heads": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 4,
+  "patch_size": 7,
+  "pooler_act": "tanh",
+  "pooler_output_size": 128,
+  "problem_type": "single_label_classification",
+  "qkv_bias": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce9f6e60324eef57eb3014c9c3e5fbcc77f948c9a4a614f21bb56f10ad7a2ce2
+size 2218808

train.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import torch
+from torch.utils.data import DataLoader
+from torchvision import transforms, datasets
+from transformers import ViTModel, ViTConfig, ViTForImageClassification
+import torch.nn as nn
+import torch.optim as optim
+from tqdm import tqdm
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Hyperparameters shared by the data pipeline, the model config, and the loops below.
+IMAGE_SIZE = 28  # side length, in pixels, the images are resized to (MNIST image size)
+PATCH_SIZE = 7   # side length of each ViT patch; 28 / 7 = 4 patches per side
+NUM_CLASSES = 10  # passed to ViTConfig as num_labels
+BATCH_SIZE = 128  # batch size for both the train and the test DataLoader
+EPOCHS = 5  # number of full passes over train_loader performed by train()
+LR = 2e-4  # learning rate handed to AdamW
+# Preprocessing applied to every sample: resize to IMAGE_SIZE x IMAGE_SIZE,
+# convert to a tensor, then normalize the single channel with mean 0.5 / std 0.5.
+# NOTE(review): MNIST digits are presumably already 28x28, which would make the
+# Resize a no-op safeguard - confirm before removing it.
+transform = transforms.Compose([
+    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
+    transforms.ToTensor(),
+    transforms.Normalize((0.5,), (0.5,))
+])
+# Load the MNIST train/test splits (downloaded into ./data if missing) and wrap
+# them in DataLoaders. Only the training loader shuffles.
+train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
+test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
+train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
+# Describe a small ViT classifier with a custom config. The model is built from
+# this config alone (no from_pretrained call), so it is trained from scratch.
+# num_channels is not set here; train()/evaluate() repeat the grayscale channel
+# three times before calling the model.
+configuration = ViTConfig(
+    image_size=IMAGE_SIZE,
+    patch_size=PATCH_SIZE,
+    num_labels=NUM_CLASSES,
+    hidden_size=128,
+    num_hidden_layers=4,
+    num_attention_heads=4,
+    intermediate_size=256,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    initializer_range=0.02
+)
+model = ViTForImageClassification(configuration).to(device)
+# Alternatively, you can also load a pretrained ViT and fine-tune it:
+# model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=10)
+# Optimizer over all model parameters.
+optimizer = optim.AdamW(model.parameters(), lr=LR)
+# NOTE(review): `criterion` is not referenced anywhere in the visible script;
+# train() takes the loss from `outputs.loss` (computed by the model when
+# `labels=` is passed). Kept as-is in case something outside this file uses it.
+criterion = nn.CrossEntropyLoss()
+# Training loop
+def train():
+    """Train the module-level ``model`` for ``EPOCHS`` passes over ``train_loader``.
+
+    After each epoch, prints the mean per-batch loss and the accuracy measured
+    on the training batches as they were seen (i.e. with the model still in
+    training mode). Uses the module-level ``optimizer`` and ``device``.
+    """
+    model.train()
+    for epoch in range(EPOCHS):
+        # Running statistics, reset at the start of every epoch.
+        total_loss = 0
+        correct = 0
+        total = 0
+        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
+        for batch, targets in progress:
+            targets = targets.to(device)
+            # The model is called with three input channels, so the single
+            # grayscale channel is tiled along the channel dimension.
+            batch = batch.to(device).repeat(1, 3, 1, 1)
+            # Passing labels makes the model return the loss next to the logits.
+            result = model(batch, labels=targets)
+            batch_loss = result.loss
+            optimizer.zero_grad()
+            batch_loss.backward()
+            optimizer.step()
+            total_loss += batch_loss.item()
+            predicted = result.logits.argmax(dim=-1)
+            correct += (predicted == targets).sum().item()
+            total += targets.size(0)
+        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, Accuracy: {correct/total:.4f}")
+# Evaluation loop
+def evaluate():
+    """Print the accuracy of the module-level ``model`` over ``test_loader``.
+
+    Switches the model to evaluation mode and runs every test batch with
+    gradient tracking disabled. The model is left in evaluation mode.
+    """
+    model.eval()
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for batch, targets in test_loader:
+            targets = targets.to(device)
+            # Same grayscale -> three-channel tiling as in train().
+            batch = batch.to(device).repeat(1, 3, 1, 1)
+            predicted = model(batch).logits.argmax(dim=-1)
+            correct += (predicted == targets).sum().item()
+            total += targets.size(0)
+    print(f"Test Accuracy: {correct / total:.4f}")
+# Script entry point: train, report test accuracy, then persist the model twice.
+if __name__ == "__main__":
+    train()
+    evaluate()
+    # Save in Hugging Face format into the current working directory.
+    model.save_pretrained(".")
+    # NOTE(review): this pickles the entire module object, not just a
+    # state_dict. Loading it back presumably requires the same `transformers`
+    # class definitions to be importable and, on recent PyTorch versions, an
+    # explicit `torch.load(..., weights_only=False)` - confirm what consumers
+    # of vit_mnist.pth expect before switching to `model.state_dict()`.
+    torch.save(model, "vit_mnist.pth")

vit_mnist.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5eb7d550d17e6e7f76658bd8e70a65b3e9e451f5bef9deb4ada7cb5be5c7350
+size 2254631