TenFate committed on
Commit 3bd22db · verified · 1 Parent(s): 4fd3e38

Upload visual.py

Files changed (1)
  1. visual.py +180 -0
visual.py ADDED
@@ -0,0 +1,180 @@
import torch
from torch import nn
from argparse import Namespace
import torch.nn.functional as F
from transformers.activations import ACT2FN
import math
from torch.nn import LayerNorm


def standard_attention(query_layer, key_layer, value_layer, scaling_attention_score=True):
    if scaling_attention_score:
        query_layer = query_layer / math.sqrt(query_layer.shape[-1])
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

    attention_probs = F.softmax(attention_scores, dim=-1)

    context_layer = torch.matmul(attention_probs, value_layer)
    return context_layer


def attention_fn_default(query_layer, key_layer, value_layer, scaling_attention_score=True):
    if int(torch.__version__.split('.')[0]) >= 2 and scaling_attention_score:
        # PyTorch 2.0 attention uses a lot of memory if attention_mask is a float tensor, and has a NaN bug if attention_mask is None.
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_layer, key_layer, value_layer,
            attn_mask=None,
            dropout_p=0.,
            is_causal=False
        )
        return attn_output
    else:
        return standard_attention(
            query_layer, key_layer, value_layer, scaling_attention_score=scaling_attention_score
        )
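

# attention_fn_default routes to the fused F.scaled_dot_product_attention kernel when the
# installed PyTorch major version is >= 2, and otherwise falls back to the explicit
# matmul/softmax path in standard_attention above. Both branches compute
# softmax(q @ k^T / sqrt(d)) @ v with no mask, no dropout and no causal masking.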


class PatchEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.proj = nn.Conv2d(config.in_channels, config.hidden_size, kernel_size=config.patch_size,
                              stride=config.patch_size)
        self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size))
        self.position_embedding = nn.Embedding(config.num_positions, config.hidden_size)

    def forward(self, images: "tensor(B, C, H, W)") -> "tensor(B, L, D)":
        x = self.proj(images)
        x = x.flatten(2).transpose(1, 2)
        cls_token = self.cls_embedding.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_token, x), dim=1)
        x += self.position_embedding.weight.unsqueeze(0)
        return x
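

# PatchEmbedding maps a (B, C, H, W) image to a (B, 1 + (H // patch_size) * (W // patch_size), hidden_size)
# sequence: non-overlapping patches are projected by the strided Conv2d, a learned CLS token
# is prepended, and a learned absolute position embedding is added.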


class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads
        head_dim = config.hidden_size // config.num_heads
        self.scale = head_dim ** -0.5
        self.query_key_value = nn.Linear(config.hidden_size, config.hidden_size * 3)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.output_dropout = torch.nn.Dropout(config.dropout_prob)

    def forward(self, x: "tensor(B, L, D)") -> "tensor(B, L, D)":
        B, L, _ = x.shape
        qkv = self.query_key_value(x)
        qkv = qkv.reshape(B, L, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)  # 3, B, H, L, D
        q, k, v = qkv[0], qkv[1], qkv[2]

        out = attention_fn_default(
            q, k, v
        )
        output = self.dense(out.transpose(1, 2).reshape(B, L, -1))
        output = self.output_dropout(output)
        return output

    def attention(self, q, k, v):
        attn_weights = torch.matmul(q * self.scale, k.transpose(-2, -1))
        attn_weights = attn_weights.softmax(dim=-1)
        output = torch.matmul(attn_weights, v)
        return output
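

# Note: forward always goes through attention_fn_default, which applies the 1/sqrt(head_dim)
# scaling itself; the attention() helper above, with its explicit self.scale factor, is not
# called anywhere in this module's forward path.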


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.activation_fn(x)
        x = self.fc2(x)
        return x


class TransformerLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.input_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = Attention(config)
        self.mlp = MLP(config)
        self.post_attention_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        attention_input = hidden_states
        attention_output = self.input_layernorm(self.attention(attention_input))
        hidden_states = attention_input + attention_output
        mlp_input = hidden_states

        # https://github.com/THUDM/GLM-4/issues/350
        mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)).to(mlp_input.device)
        output = mlp_input + mlp_output
        return output
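

# Note the norm placement: each LayerNorm is applied to the sublayer output before the
# residual add (x + LN(Attn(x)), then x + LN(MLP(x))), rather than the pre-norm ordering
# used in standard ViT blocks.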


class Transformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList([TransformerLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states):
        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states)
        return hidden_states


class GLU(nn.Module):
    def __init__(self, config, in_features):
        super().__init__()
        self.linear_proj = nn.Linear(in_features, config.hidden_size, bias=False)
        self.norm1 = nn.LayerNorm(config.hidden_size)
        self.act1 = nn.GELU()
        self.act2 = nn.functional.silu
        self.dense_h_to_4h = nn.Linear(config.hidden_size, config.ffn_hidden_size, bias=False)
        self.gate_proj = nn.Linear(config.hidden_size, config.ffn_hidden_size, bias=False)
        self.dense_4h_to_h = nn.Linear(config.ffn_hidden_size, config.hidden_size, bias=False)

    def forward(self, x):
        x = self.linear_proj(x)
        x = self.act1(self.norm1(x))
        x = self.act2(self.gate_proj(x)) * self.dense_h_to_4h(x)
        x = self.dense_4h_to_h(x)
        return x
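

# GLU is a SwiGLU-style projection block: the input is mapped to hidden_size, normalized and
# passed through GELU, then SiLU(gate_proj(x)) gates dense_h_to_4h(x) elementwise before
# dense_4h_to_h projects back to hidden_size.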


class EVA2CLIPModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        vision_config = Namespace(**config.vision_config)
        self.patch_embedding = PatchEmbedding(vision_config)
        self.transformer = Transformer(vision_config)
        self.linear_proj = GLU(config, in_features=config.hidden_size)
        self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2,
                              stride=2)
        self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.scaling_factor = vision_config.scaling_factor

    def forward(self, images: "tensor(B, C, H, W)") -> "tensor(B, L, D)":
        x = self.patch_embedding(images)
        x = self.transformer(x)
        x = x[:, 1:]

        b, s, h = x.shape
        grid_size = int(s ** 0.5)
        x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2)
        x = self.conv(x)

        x = x.flatten(2).transpose(1, 2)
        x = self.linear_proj(x)

        # https://github.com/THUDM/GLM-4/issues/350
        boi = self.boi.expand(x.shape[0], -1, -1).to(x.device)
        eoi = self.eoi.expand(x.shape[0], -1, -1).to(x.device)
        x = torch.cat((boi, x, eoi), dim=1)
        x = x / self.scaling_factor
        return x
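

For reference, a minimal smoke-test sketch of how this module can be exercised end to end, assuming the file is importable as visual. The config values below are illustrative assumptions chosen only so the shapes line up; they are not taken from this repository's actual configuration.

from argparse import Namespace
import torch
from visual import EVA2CLIPModel

# Hypothetical config values for illustration only; a real checkpoint defines these
# (hidden_size, ffn_hidden_size, vision_config, ...) in its own config file.
config = Namespace(
    hidden_size=64,                              # language-model hidden size (assumed)
    ffn_hidden_size=256,                         # projector intermediate size (assumed)
    vision_config=dict(
        in_channels=3,
        hidden_size=32,                          # vision hidden size (assumed)
        patch_size=14,
        num_positions=(224 // 14) ** 2 + 1,      # 16x16 patches + CLS token
        num_heads=4,
        dropout_prob=0.0,
        hidden_act="gelu",
        intermediate_size=128,
        layer_norm_eps=1e-6,
        num_hidden_layers=2,
        scaling_factor=8.0,
    ),
)

model = EVA2CLIPModel(config).eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # (1, 66, 64): BOI + 8x8 conv-merged patch tokens + EOI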