Airin-chan committed on
Commit
e5a23d1
·
verified ·
1 Parent(s): 982b3d5

Upload 3 files

Browse files
Files changed (3) hide show
  1. MiniVIT.py +87 -0
  2. Untitled29.ipynb +113 -0
  3. VIT_Encoder.pth +3 -0
MiniVIT.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
class Patch_Embedding(nn.Module):
    """Cut an image batch into non-overlapping square patches and embed them.

    The projection is a single ``nn.Conv2d`` whose kernel size and stride both
    equal ``patch_size``, so every output location is computed from exactly
    one patch of the input.

    Args:
        img_size: Side length (pixels) of the square input images.
        patch_size: Side length (pixels) of each square patch.
        embed_dim: Number of output features per patch.
        in_channels: Number of input image channels. Defaults to 3, the value
            that was previously hard-coded.

    Attributes:
        n_patch: ``(img_size // patch_size) ** 2``. When ``img_size`` is not a
            multiple of ``patch_size`` the floor division means the trailing
            rows/columns that do not fill a whole patch are not counted (the
            un-padded convolution does not produce outputs for them either).
    """

    def __init__(self, img_size, patch_size, embed_dim, in_channels=3):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.in_channels = in_channels
        self.n_patch = (img_size // patch_size) ** 2
        # Attribute name is part of the state_dict keys; do not rename.
        self.projection_layers = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Embed a batch of images.

        Args:
            x: Tensor of shape ``(batch, in_channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, patches, embed_dim)`` where ``patches``
            is the number of spatial positions produced by the convolution.

        Raises:
            ValueError: If the projected tensor is not 4-D (for example when
                an un-batched ``(C, H, W)`` image is passed). Without this
                check the flatten/transpose below would silently return a
                wrongly shaped result.
        """
        x = self.projection_layers(x)
        if x.dim() != 4:
            raise ValueError(
                "Patch_Embedding expects a batched (B, C, H, W) input; "
                f"projection produced a {x.dim()}-D tensor"
            )
        # (B, D, H', W') -> (B, D, H'*W') -> (B, H'*W', D)
        return x.flatten(2).transpose(1, 2)
19
+
20
class Positional_Encoding(nn.Module):
    """Prepend a trainable class token and add trainable position embeddings.

    Args:
        n_patch: Number of patch tokens per sample (the class token is extra).
        embedd_dim: Feature width of every token.

    Both parameters are drawn from a normal distribution with mean 0 and
    standard deviation 0.02.
    """

    def __init__(self, n_patch, embedd_dim):
        super().__init__()
        self.n_patch = n_patch
        self.embedd_dim = embedd_dim

        # The position table is created before the class token; keeping that
        # order keeps seeded initialisation and state_dict ordering stable.
        table_shape = (1, n_patch + 1, embedd_dim)
        token_shape = (1, 1, embedd_dim)
        self.positional_encoding = nn.Parameter(
            torch.normal(0, 0.02, size=table_shape)
        )
        self.cls_token = nn.Parameter(torch.normal(0, 0.02, size=token_shape))

    def forward(self, x):
        """Return ``x`` with a leading class token and positions added.

        Args:
            x: Tensor of shape ``(batch, n_patch, embedd_dim)``.

        Returns:
            Tensor of shape ``(batch, n_patch + 1, embedd_dim)``.
        """
        n_samples = x.shape[0]
        # One copy of the class token per sample (a broadcast view, no copy).
        cls_rows = self.cls_token.expand(n_samples, 1, self.embedd_dim)
        tokens = torch.cat((cls_rows, x), dim=1)
        return tokens + self.positional_encoding
34
+
35
class BlockTransformers(nn.Module):
    """One transformer encoder block: self-attention followed by an MLP.

    Args:
        d_Model: Token feature width.
        d_ff: Hidden width of the feed-forward network.
        n_head: Number of attention heads.
        dropout: Dropout probability applied to the attention output and to
            the feed-forward output. Defaults to 0.1, the value that was
            previously hard-coded.
        residual_from_input: Selects what the attention output is added to.
            ``False`` (default) adds it to the *layer-normalised* input, which
            is exactly what this block has always computed. ``True`` adds it
            to the un-normalised block input, i.e. the usual pre-norm
            residual connection.

    NOTE(review): the original code saved the block input in a ``residural``
    variable and then never used it for the attention branch, which looks
    like an oversight. The default is kept as-is because weights saved from
    this architecture (presumably including ``VIT_Encoder.pth``) depend on
    the legacy arithmetic; switch the flag on only for models trained with it.
    """

    def __init__(self, d_Model, d_ff, n_head, dropout=0.1,
                 residual_from_input=False):
        super().__init__()
        # Sub-module names and creation order are part of the state_dict
        # layout and of seeded initialisation; keep both unchanged.
        self.MHA = nn.MultiheadAttention(
            embed_dim=d_Model, num_heads=n_head, batch_first=True
        )
        self.FFN = nn.Sequential(
            nn.Linear(d_Model, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_Model),
        )
        self.drop_out = nn.Dropout(p=dropout)
        self.drop_out2 = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_Model)
        self.layer_norm2 = nn.LayerNorm(d_Model)
        self.residual_from_input = residual_from_input

    def forward(self, x):
        """Apply the block.

        Args:
            x: Tensor of shape ``(batch, tokens, d_Model)``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # --- self-attention branch -------------------------------------
        normed = self.layer_norm(x)
        attention, _ = self.MHA(normed, normed, normed)
        attention = self.drop_out(attention)
        skip = x if self.residual_from_input else normed
        x = skip + attention

        # --- feed-forward branch (standard pre-norm residual) ----------
        ffn = self.layer_norm2(x)
        ffn = self.FFN(ffn)
        ffn = self.drop_out2(ffn)
        return x + ffn
62
+
63
class MiniVisualTransformers(nn.Module):
    """Small Vision-Transformer encoder: patch embedding, positions, blocks.

    Every argument defaults to the value that used to be hard-coded, so
    ``MiniVisualTransformers()`` builds the same architecture (and the same
    state_dict layout) as before.

    Args:
        img_size: Side length of the square input images. Default 144.
        patch_size: Side length of each square patch. Default 32.
        embed_dim: Token feature width. Default 64.
        d_ff: Hidden width of each block's feed-forward network. Default 256.
        n_head: Attention heads per block. Default 4.
        n_block: Number of stacked transformer blocks. Default 4.
    """

    def __init__(self, img_size=144, patch_size=32, embed_dim=64,
                 d_ff=256, n_head=4, n_block=4):
        super().__init__()
        # Attribute names below are state_dict key prefixes; do not rename.
        self.Patch_Embedding = Patch_Embedding(
            img_size=img_size, patch_size=patch_size, embed_dim=embed_dim
        )
        self.Positional_Encoding = Positional_Encoding(
            n_patch=self.Patch_Embedding.n_patch,
            embedd_dim=self.Patch_Embedding.embed_dim,
        )
        self.BT = nn.ModuleList(
            [
                BlockTransformers(d_Model=embed_dim, d_ff=d_ff, n_head=n_head)
                for _ in range(n_block)
            ]
        )

    def forward(self, x):
        """Encode a batch of images into a token sequence.

        Args:
            x: Image batch; with the default configuration the notebook in
                this commit feeds a ``(1, 3, 144, 144)`` tensor.

        Returns:
            Tensor of shape ``(batch, n_patch + 1, embed_dim)``; index 0 on
            the token axis is the class token.
        """
        x = self.Patch_Embedding(x)
        x = self.Positional_Encoding(x)
        for block in self.BT:
            x = block(x)
        return x
76
+
77
class Classifier(nn.Module):
    """Image classifier: encoder followed by a linear head on the class token.

    Args:
        n_class: Number of output classes (width of the logits).
        encoder: Optional already-built encoder to use instead of a fresh
            ``MiniVisualTransformers()``, e.g. one whose weights were loaded
            from a checkpoint. It must expose ``Patch_Embedding.embed_dim``
            and return ``(batch, tokens, embed_dim)`` tensors. Defaults to
            ``None``, which builds the default encoder exactly as before.
    """

    def __init__(self, n_class, encoder=None):
        super().__init__()
        self.MiniVIT = MiniVisualTransformers() if encoder is None else encoder
        # Read the head's input width from the encoder instead of repeating
        # the literal 64, so the two can never drift apart.
        embed_dim = self.MiniVIT.Patch_Embedding.embed_dim
        self.linear = nn.Linear(embed_dim, n_class)

    def forward(self, x):
        """Return class logits of shape ``(batch, n_class)``."""
        tokens = self.MiniVIT(x)
        cls_features = tokens[:, 0, :]  # token 0 is the class token
        return self.linear(cls_features)
Untitled29.ipynb ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "markdown",
19
+ "source": [
20
+ "Follow these steps to use SuperMiniVIT\n",
21
+ "----------------------------------------"
22
+ ],
23
+ "metadata": {
24
+ "id": "kFgTthdhoCFa"
25
+ }
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "metadata": {
31
+ "id": "_mE-O3tynzyy"
32
+ },
33
+ "outputs": [],
34
+ "source": [
35
+ "import torch\n",
36
+ "from MiniVIT import *"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "source": [
42
+ "model = MiniVisualTransformers()"
43
+ ],
44
+ "metadata": {
45
+ "id": "pCLQjtDxoPI0"
46
+ },
47
+ "execution_count": 4,
48
+ "outputs": []
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "source": [
53
+ "checkpoint = torch.load('VIT_Encoder.pth')\n",
54
+ "model.load_state_dict(checkpoint)"
55
+ ],
56
+ "metadata": {
57
+ "colab": {
58
+ "base_uri": "https://localhost:8080/",
59
+ "height": 0
60
+ },
61
+ "id": "R-EJhGpbolbl",
62
+ "outputId": "3dc9d1aa-49eb-4516-b16c-2305677bec63"
63
+ },
64
+ "execution_count": 6,
65
+ "outputs": [
66
+ {
67
+ "output_type": "execute_result",
68
+ "data": {
69
+ "text/plain": [
70
+ "<All keys matched successfully>"
71
+ ]
72
+ },
73
+ "metadata": {},
74
+ "execution_count": 6
75
+ }
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "source": [
81
+ "model(torch.randn(1,3,144,144))"
82
+ ],
83
+ "metadata": {
84
+ "colab": {
85
+ "base_uri": "https://localhost:8080/",
86
+ "height": 0
87
+ },
88
+ "id": "QFXaAujfouIi",
89
+ "outputId": "6a7ef31e-ef73-4f08-b68d-cd635174057c"
90
+ },
91
+ "execution_count": 7,
92
+ "outputs": [
93
+ {
94
+ "output_type": "execute_result",
95
+ "data": {
96
+ "text/plain": [
97
+ "tensor([[[-0.1631, 1.8401, 1.4494, ..., -1.2810, -2.9000, 0.1974],\n",
98
+ " [-0.7426, 1.0433, 0.3615, ..., -1.4665, -0.2818, 2.2017],\n",
99
+ " [ 1.3605, 1.0501, 0.9630, ..., 2.9057, 1.3372, -2.2445],\n",
100
+ " ...,\n",
101
+ " [-1.6320, 0.7411, -0.3816, ..., -1.9780, 1.6325, -0.0490],\n",
102
+ " [-1.2490, -0.6153, 0.8643, ..., -0.8104, 1.2853, -0.2412],\n",
103
+ " [-1.7517, -1.4150, -0.2602, ..., -2.3606, -0.3698, 1.9745]]],\n",
104
+ " grad_fn=<AddBackward0>)"
105
+ ]
106
+ },
107
+ "metadata": {},
108
+ "execution_count": 7
109
+ }
110
+ ]
111
+ }
112
+ ]
113
+ }
VIT_Encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c1552e3aab08ca59f0d7edd37f5d1675bfffdb40a8832662615aaf99e19b632
3
+ size 1608943