Spaces:

scymz2
/

MNISTFormer

Sleeping

App Files Files Community

mochuan zhan commited on Nov 11, 2024

Commit

78245fb

1 Parent(s): a121f59

Initial commit from desktop

Browse files

Files changed (3) hide show

app.py +122 -0
requirements.txt +4 -0
vit_model.pth +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import gradio as gr
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+import torch.nn as nn
+# 如果你的模型结构与标准的torchvision模型不同，请确保在此处定义或导入你的模型结构
+# 例如，如果你有一个model.py文件：
+# from model import ViTModel
+# 示例：定义一个简单的ViT模型结构（请根据你的实际模型调整）
+class ViT(nn.Module):
+    def __init__(self, image_size=28, patch_size=7, num_classes=10, dim=128, depth=6, heads=8, mlp_dim=256, dropout=0.1):
+        super(ViT, self).__init__()
+        assert image_size % patch_size == 0, 'image dimensions must be divisible by the patch size'
+        num_patches = (image_size // patch_size) ** 2
+        patch_dim = 1 * patch_size ** 2
+        # 定义线性层将图像分块并映射到嵌入空间
+        self.patch_embedding = nn.Linear(patch_dim, dim)
+        # 位置编码
+        # nn.Parameter是Pytorch中的一个类，用于将一个张量注册为模型的参数
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
+        # Dropout层
+        self.dropout = nn.Dropout(dropout)
+        # Transformer编码器
+        # 当 batch_first=True 时，输入和输出张量的形状为 (batch_size, seq_length, feature_dim)。当 batch_first=False 时，输入和输出张量的形状为 (seq_length, batch_size, feature_dim)。
+        self.transformer = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                d_model=dim,
+                nhead=heads,
+                dim_feedforward=mlp_dim
+                # batch_first=True
+            ),
+            num_layers=depth
+        )
+        # 分类头
+        # nn.Identity()是一个空的层，它不执行任何操作，只是返回输入
+        # self.to_cls_token = nn.Identity()
+        # self.mlp_head = nn.Linear(dim, num_classes)
+        self.mlp_head = nn.Sequential(
+            nn.LayerNorm(dim),
+            nn.Linear(dim, num_classes)
+        )
+    def forward(self, x):
+        # x shape: [batch_size, 1, 28, 28]
+        batch_size = x.size(0)
+        x = x.view(batch_size, -1, 7*7)  # 将图像划分为7x7的Patch
+        x = self.patch_embedding(x)  # [batch_size, num_patches, dim]
+        x += self.pos_embedding  # 添加位置编码
+        x = self.dropout(x)  # 应用Dropout
+        x = x.permute(1, 0, 2)  # Transformer期望的输入形状：[seq_len, batch_size, embedding_dim]
+        x = self.transformer(x)  # [序列长度, batch_size, dim]
+        x = x.permute(1, 0, 2)  # 转回原来的形状：[batch_size, seq_len, dim]
+        x = x.mean(dim=1)  # 对所有Patch取平均，x.mean(dim=1) 这一步是对所有 Patch 的特征向量取平均值，从而得到一个代表整个图像的全局特征向量。
+        x = self.mlp_head(x)  # [batch_size, num_classes]
+        return x
+# 加载模型
+model = ViT(num_classes=10)  # 确保num_classes与你的MNIST任务一致
+model_path = "vit_mnist.pth"  # 模型权重文件名
+model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+model.eval()
+# 定义图像预处理
+transform = transforms.Compose([
+    transforms.Resize((224, 224)),          # 适应ViT的输入大小
+    transforms.ToTensor(),
+    transforms.Normalize((0.5,), (0.5,))    # 根据训练时的归一化参数调整
+])
+# 定义预测函数
+def classify_image(image):
+    # 如果输入是灰度图，将其转换为RGB
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    # 预处理图像
+    img = transform(image).unsqueeze(0)  # 添加批次维度
+    # 模型预测
+    with torch.no_grad():
+        outputs = model(img)
+    # 获取预测结果
+    _, predicted = torch.max(outputs, 1)
+    return str(predicted.item())
+# # 创建Gradio界面
+# iface = gr.Interface(
+#     fn=classify_image,
+#     inputs=gr.Image(shape=(28, 28), image_mode='L', source="upload", tool="editor"),
+#     outputs=gr.Label(num_top_classes=1),
+#     title="MNIST Classification with ViT",
+#     description="上传一张28x28的灰度图像，模型将预测其所属的数字类别。"
+# )
+iface = gr.Interface(
+    fn=classify_image,
+    inputs=gr.Image(
+        source="canvas",
+        tool="editor",
+        type="pil",
+        invert_colors=True,
+        shape=(224, 224),
+        image_mode="L",
+        label="Draw a digit"
+    ),
+    outputs=gr.Label(num_top_classes=1),
+    title="MNIST Digit Classification with ViT",
+    description="使用鼠标手绘一个数字，模型将预测其所属的类别。"
+)
+iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+torch
+torchvision
+gradio
+Pillow

vit_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:223c6c32c2a9d4c274b09c35ef089b358ee7cf1729b9d939fca898db5765dcdb
+size 3248655