nev8r committed
Commit 62f28ff · verified · 1 Parent(s): 4ba1101

Upload VerMind model
chat_template.jinja ADDED
@@ -0,0 +1,74 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' -%}
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+ {%- else -%}
+ {{- '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+ {%- if message.content is string %}
+ {%- set content = message.content %}
+ {%- else %}
+ {%- set content = '' %}
+ {%- endif %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,231 @@
+ {
+   "architectures": [
+     "VerMindVLM"
+   ],
+   "aux_loss_alpha": 0.01,
+   "bos_token_id": 1,
+   "dropout": 0.0,
+   "dtype": "float32",
+   "eos_token_id": 2,
+   "flash_attn": true,
+   "hidden_act": "silu",
+   "hidden_size": 768,
+   "image_ids": [
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34,
+     34
+   ],
+   "image_special_token": "<image>",
+   "inference_rope_scaling": false,
+   "intermediate_size": 2048,
+   "max_position_embeddings": 32768,
+   "model_type": "vermind-v",
+   "n_routed_experts": 4,
+   "n_shared_experts": 1,
+   "norm_topk_prob": true,
+   "num_attention_heads": 8,
+   "num_experts_per_tok": 2,
+   "num_hidden_layers": 16,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "scoring_func": "softmax",
+   "seq_aux": true,
+   "transformers_version": "4.57.6",
+   "use_moe": false,
+   "vocab_size": 6400
+ }
configuration_vermind.py ADDED
@@ -0,0 +1,79 @@
+ # coding=utf-8
+ """
+ Configuration file for VerMind model - Standalone Version
+ """
+
+ from transformers import PretrainedConfig, AutoConfig
+
+
+ class VerMindConfig(PretrainedConfig):
+     """Configuration class for VerMind model"""
+     model_type = "vermind"
+
+     def __init__(
+         self,
+         dropout: float = 0.0,
+         bos_token_id: int = 1,
+         eos_token_id: int = 2,
+         hidden_act: str = 'silu',
+         hidden_size: int = 768,
+         intermediate_size: int = None,
+         max_position_embeddings: int = 32768,
+         num_attention_heads: int = 8,
+         num_hidden_layers: int = 16,
+         num_key_value_heads: int = 2,
+         vocab_size: int = 6400,
+         rms_norm_eps: float = 1e-05,
+         rope_theta: float = 1000000.0,
+         inference_rope_scaling: bool = False,
+         flash_attn: bool = True,
+         use_moe: bool = False,
+         num_experts_per_tok: int = 2,
+         n_routed_experts: int = 4,
+         n_shared_experts: int = 1,
+         scoring_func: str = 'softmax',
+         aux_loss_alpha: float = 0.01,
+         seq_aux: bool = True,
+         norm_topk_prob: bool = True,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.dropout = dropout
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         self.hidden_act = hidden_act
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.num_key_value_heads = num_key_value_heads
+         self.vocab_size = vocab_size
+         self.rms_norm_eps = rms_norm_eps
+         self.rope_theta = rope_theta
+         self.inference_rope_scaling = inference_rope_scaling
+
+         self.rope_scaling = {
+             "beta_fast": 32,
+             "beta_slow": 1,
+             "factor": 16,
+             "original_max_position_embeddings": 2048,
+             "attention_factor": 1.0,
+             "type": "yarn"
+         } if self.inference_rope_scaling else None
+         self.flash_attn = flash_attn
+
+         self.use_moe = use_moe
+         self.num_experts_per_tok = num_experts_per_tok
+         self.n_routed_experts = n_routed_experts
+         self.n_shared_experts = n_shared_experts
+         self.scoring_func = scoring_func
+         self.aux_loss_alpha = aux_loss_alpha
+         self.seq_aux = seq_aux
+         self.norm_topk_prob = norm_topk_prob
+
+
+ # Register the config class
+ AutoConfig.register("vermind", VerMindConfig)
+
+ __all__ = ["VerMindConfig"]
configuration_vermind_v.py ADDED
@@ -0,0 +1,32 @@
+ # coding=utf-8
+ """
+ Configuration file for VerMind-V model - Standalone Version
+ """
+
+ from typing import List
+ from transformers import AutoConfig
+
+ from .configuration_vermind import VerMindConfig
+
+
+ class VLMConfig(VerMindConfig):
+     """Configuration class for VerMind-V (Vision-Language) model"""
+     model_type = "vermind-v"
+
+     def __init__(
+         self,
+         image_special_token: str = '<image>',
+         image_ids: List = None,
+         **kwargs,
+     ):
+         if image_ids is None:
+             image_ids = [34] * 196  # SigLIP 14x14 = 196 tokens, no pooling
+         self.image_special_token = image_special_token
+         self.image_ids = image_ids
+         super().__init__(**kwargs)
+
+
+ # Register the config class
+ AutoConfig.register("vermind-v", VLMConfig)
+
+ __all__ = ["VLMConfig"]
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.57.6"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c9a6c8ce7b30883286b8cda1a1f5c638f5cd970d5f780ae9ac15e309dfd06a5
+ size 812097376
modeling_vermind.py ADDED
@@ -0,0 +1,318 @@
+ # coding=utf-8
+ """
+ Model file for VerMind model - Standalone Version
+ Contains complete implementation without external dependencies
+ """
+
+ import math
+ from typing import Optional, Tuple, List, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import PreTrainedModel, GenerationMixin, AutoModelForCausalLM
+ from transformers.activations import ACT2FN
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from .configuration_vermind import VerMindConfig
+
+
+ # ==================== Base Module Functions ====================
+
+ def precompute_freqs_cis(dim: int, end: int = int(32 * 1024), rope_base: float = 1e6,
+                          rope_scaling: Optional[dict] = None):
+     """Precompute rotary position embedding frequencies"""
+     freqs, attn_factor = 1.0 / (rope_base ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)), 1.0
+     if rope_scaling is not None:
+         orig_max, factor, beta_fast, beta_slow, attn_factor = (
+             rope_scaling.get("original_max_position_embeddings", 2048),
+             rope_scaling.get("factor", 16),
+             rope_scaling.get("beta_fast", 32.0),
+             rope_scaling.get("beta_slow", 1.0),
+             rope_scaling.get("attention_factor", 1.0)
+         )
+         if end / orig_max > 1.0:
+             inv_dim = lambda b: (dim * math.log(orig_max / (b * 2 * math.pi))) / (2 * math.log(rope_base))
+             low, high = max(math.floor(inv_dim(beta_fast)), 0), min(math.ceil(inv_dim(beta_slow)), dim // 2 - 1)
+             ramp = torch.clamp((torch.arange(dim // 2, device=freqs.device).float() - low) / max(high - low, 0.001), 0, 1)
+             freqs = freqs * (1 - ramp + ramp / factor)
+
+     t = torch.arange(end, device=freqs.device)
+     freqs = torch.outer(t, freqs).float()
+     freqs_cos = torch.cat([torch.cos(freqs), torch.cos(freqs)], dim=-1) * attn_factor
+     freqs_sin = torch.cat([torch.sin(freqs), torch.sin(freqs)], dim=-1) * attn_factor
+     return freqs_cos, freqs_sin
+
+
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+     """Apply rotary position embeddings to queries and keys"""
+     def rotate_half(x):
+         return torch.cat((-x[..., x.shape[-1] // 2:], x[..., : x.shape[-1] // 2]), dim=-1)
+
+     if position_ids is not None:
+         if position_ids.dim() == 1:
+             pos_ids = position_ids
+             cos_selected = cos[pos_ids]
+             sin_selected = sin[pos_ids]
+             cos_selected = cos_selected.unsqueeze(0).unsqueeze(2)
+             sin_selected = sin_selected.unsqueeze(0).unsqueeze(2)
+         else:
+             cos_selected = cos[position_ids]
+             sin_selected = sin[position_ids]
+             cos_selected = cos_selected.unsqueeze(2)
+             sin_selected = sin_selected.unsqueeze(2)
+
+         q_embed = (q * cos_selected) + (rotate_half(q) * sin_selected)
+         k_embed = (k * cos_selected) + (rotate_half(k) * sin_selected)
+     else:
+         seq_len = q.shape[1]
+         cos_s = cos[:seq_len]
+         sin_s = sin[:seq_len]
+         cos_s = cos_s.unsqueeze(0).unsqueeze(2)
+         sin_s = sin_s.unsqueeze(0).unsqueeze(2)
+         q_embed = (q * cos_s) + (rotate_half(q) * sin_s)
+         k_embed = (k * cos_s) + (rotate_half(k) * sin_s)
+
+     return q_embed, k_embed
+
+
+ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+     """Repeat key/value heads for GQA"""
+     bs, slen, num_key_value_heads, head_dim = x.shape
+     if n_rep == 1:
+         return x
+     return x[:, :, :, None, :].expand(bs, slen, num_key_value_heads, n_rep, head_dim).reshape(
+         bs, slen, num_key_value_heads * n_rep, head_dim
+     )
+
+
+ # ==================== Module Classes ====================
+
+ class RMSNorm(nn.Module):
+     """Root Mean Square Layer Normalization"""
+     def __init__(self, dim: int, eps: float = 1e-5):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def _norm(self, x):
+         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+     def forward(self, x):
+         return self.weight * self._norm(x.float()).type_as(x)
+
+
+ class FeedForward(nn.Module):
+     """SwiGLU Feed-Forward Network"""
+     def __init__(self, config: VerMindConfig):
+         super().__init__()
+         if config.intermediate_size is None:
+             intermediate_size = int(config.hidden_size * 8 / 3)
+             config.intermediate_size = 64 * ((intermediate_size + 64 - 1) // 64)
+         self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+         self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+         self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+         self.dropout = nn.Dropout(config.dropout)
+         self.act_fn = ACT2FN[config.hidden_act]
+
+     def forward(self, x):
+         return self.dropout(self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)))
+
+
+ class Attention(nn.Module):
+     """Grouped Query Attention with RoPE"""
+     def __init__(self, args: VerMindConfig):
+         super().__init__()
+         self.num_key_value_heads = args.num_attention_heads if args.num_key_value_heads is None else args.num_key_value_heads
+         assert args.num_attention_heads % self.num_key_value_heads == 0
+         self.n_local_heads = args.num_attention_heads
+         self.n_local_kv_heads = self.num_key_value_heads
+         self.n_rep = self.n_local_heads // self.n_local_kv_heads
+         self.head_dim = args.hidden_size // args.num_attention_heads
+         self.q_proj = nn.Linear(args.hidden_size, args.num_attention_heads * self.head_dim, bias=False)
+         self.k_proj = nn.Linear(args.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.v_proj = nn.Linear(args.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.o_proj = nn.Linear(args.num_attention_heads * self.head_dim, args.hidden_size, bias=False)
+         self.attn_dropout = nn.Dropout(args.dropout)
+         self.resid_dropout = nn.Dropout(args.dropout)
+         self.dropout = args.dropout
+         self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and args.flash_attn
+
+     def forward(self, x, position_embeddings, past_key_value=None, use_cache=False,
+                 attention_mask=None, position_ids=None, cu_seqlens=None):
+         bsz, seq_len, _ = x.shape
+         xq, xk, xv = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+         xq = xq.view(bsz, seq_len, self.n_local_heads, self.head_dim)
+         xk = xk.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
+         xv = xv.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
+
+         cos, sin = position_embeddings
+         xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin, position_ids=position_ids)
+
+         if past_key_value is not None:
+             xk = torch.cat([past_key_value[0], xk], dim=1)
+             xv = torch.cat([past_key_value[1], xv], dim=1)
+         past_kv = (xk, xv) if use_cache else None
+
+         xq, xk, xv = xq.transpose(1, 2), repeat_kv(xk, self.n_rep).transpose(1, 2), repeat_kv(xv, self.n_rep).transpose(1, 2)
+
+         is_2d_mask = attention_mask is not None and attention_mask.dim() == 3
+         use_flash = self.flash and (seq_len > 1) and (past_key_value is None)
+
+         if use_flash and (attention_mask is None or (not is_2d_mask and torch.all(attention_mask == 1))):
+             output = F.scaled_dot_product_attention(
+                 xq, xk, xv,
+                 dropout_p=self.dropout if self.training else 0.0,
+                 is_causal=True
+             )
+         else:
+             scores = (xq @ xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
+             if not is_2d_mask:
+                 scores[:, :, :, -seq_len:] += torch.triu(torch.full((seq_len, seq_len), float("-inf"), device=scores.device), diagonal=1)
+             if attention_mask is not None:
+                 if is_2d_mask:
+                     attention_mask = attention_mask[:, 0, :] if attention_mask.dim() == 3 else attention_mask
+                 extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+                 extended_attention_mask = (1.0 - extended_attention_mask.float()) * -1e9
+                 scores = scores + extended_attention_mask
+             scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+             scores = self.attn_dropout(scores)
+             output = scores @ xv
+
+         output = output.transpose(1, 2).reshape(bsz, seq_len, -1)
+         output = self.resid_dropout(self.o_proj(output))
+         return output, past_kv
+
+
+ # ==================== Main Model Classes ====================
+
+ class VerMindBlock(nn.Module):
+     """Transformer Decoder Block"""
+     def __init__(self, layer_id: int, config: VerMindConfig):
+         super().__init__()
+         self.num_attention_heads = config.num_attention_heads
+         self.hidden_size = config.hidden_size
+         self.head_dim = config.hidden_size // config.num_attention_heads
+         self.self_attn = Attention(config)
+         self.layer_id = layer_id
+         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.mlp = FeedForward(config)
+
+     def forward(self, hidden_states, position_embeddings, past_key_value=None, use_cache=False,
+                 attention_mask=None, position_ids=None, cu_seqlens=None):
+         residual = hidden_states
+         hidden_states, present_key_value = self.self_attn(
+             self.input_layernorm(hidden_states),
+             position_embeddings,
+             past_key_value,
+             use_cache,
+             attention_mask,
+             position_ids=position_ids,
+             cu_seqlens=cu_seqlens
+         )
+         hidden_states += residual
+         hidden_states = hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))
+         return hidden_states, present_key_value
+
+
+ class VerMindModel(nn.Module):
+     """VerMind Model (Transformer backbone)"""
+     def __init__(self, config: VerMindConfig):
+         super().__init__()
+         self.config = config
+         self.vocab_size = config.vocab_size
+         self.num_hidden_layers = config.num_hidden_layers
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.dropout)
+         self.layers = nn.ModuleList([VerMindBlock(l, config) for l in range(self.num_hidden_layers)])
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+         freqs_cos, freqs_sin = precompute_freqs_cis(
+             dim=config.hidden_size // config.num_attention_heads,
+             end=config.max_position_embeddings,
+             rope_base=config.rope_theta,
+             rope_scaling=config.rope_scaling
+         )
+         self.register_buffer("freqs_cos", freqs_cos, persistent=False)
+         self.register_buffer("freqs_sin", freqs_sin, persistent=False)
+
+     def forward(self, input_ids=None, attention_mask=None, past_key_values=None,
+                 use_cache=False, position_ids=None, cu_seqlens=None, **kwargs):
+         if past_key_values is not None and hasattr(past_key_values, 'layers'):
+             past_key_values = None
+         past_key_values = past_key_values or [None] * len(self.layers)
+         start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0
+
+         hidden_states = self.dropout(self.embed_tokens(input_ids))
+         position_embeddings = (self.freqs_cos, self.freqs_sin)
+
+         presents = []
+         for layer_idx, (layer, past_key_value) in enumerate(zip(self.layers, past_key_values)):
+             hidden_states, present = layer(
+                 hidden_states,
+                 position_embeddings,
+                 past_key_value=past_key_value,
+                 use_cache=use_cache,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+                 cu_seqlens=cu_seqlens
+             )
+             presents.append(present)
+
+         hidden_states = self.norm(hidden_states)
+         aux_loss = 0
+         return hidden_states, presents, aux_loss
+
+
+ class VerMindForCausalLM(PreTrainedModel, GenerationMixin):
+     """VerMind Causal Language Model"""
+     config_class = VerMindConfig
+
+     def __init__(self, config: VerMindConfig = None):
+         self.config = config or VerMindConfig()
+         super().__init__(self.config)
+         self.model = VerMindModel(self.config)
+         self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
+         self.model.embed_tokens.weight = self.lm_head.weight
+
+     def forward(self, input_ids=None, attention_mask=None, labels=None,
+                 past_key_values=None, use_cache=False, logits_to_keep=0,
+                 position_ids=None, cu_seqlens=None, **args):
+         hidden_states, past_key_values, aux_loss = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             position_ids=position_ids,
+             cu_seqlens=cu_seqlens,
+             **args
+         )
+
+         is_varlen = cu_seqlens is not None
+         if is_varlen:
+             logits = self.lm_head(hidden_states)
+         else:
+             slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+             logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+         loss = None
+         if labels is not None:
+             if is_varlen:
+                 shift_logits = logits[:-1, :].contiguous()
+                 shift_labels = labels[1:].contiguous()
+                 loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100)
+             else:
+                 shift_logits = logits[..., :-1, :].contiguous()
+                 shift_labels = labels[..., 1:].contiguous()
+                 loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=-100)
+
+         output = CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=past_key_values, hidden_states=hidden_states)
+         output.aux_loss = aux_loss
+         return output
+
+
+ # Register the model class
+ AutoModelForCausalLM.register(VerMindForCausalLM.config_class, VerMindForCausalLM)
+
+ __all__ = ["VerMindForCausalLM", "VerMindModel", "VerMindBlock", "Attention", "FeedForward", "RMSNorm"]
modeling_vermind_v.py ADDED
@@ -0,0 +1,200 @@
+ # coding=utf-8
+ """
+ Model file for VerMind-V (VLM) model - Standalone Version
+ Contains complete VLM implementation without external dependencies
+ """
+
+ import os
+ import warnings
+ from typing import Optional, Tuple, List, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from .modeling_vermind import VerMindForCausalLM, VerMindModel, RMSNorm, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv
+ from .configuration_vermind_v import VLMConfig
+
+ warnings.filterwarnings('ignore')
+
+
+ class VisionProj(nn.Module):
+     """Vision Projection Layer - Projects vision features to language model space"""
+     def __init__(self, ve_hidden_size=768, hidden_size=512):
+         super().__init__()
+         self.ve_hidden_size = ve_hidden_size
+         self.hidden_size = hidden_size
+         intermediate_size = min(ve_hidden_size, hidden_size)
+         self.proj = nn.Sequential(
+             nn.LayerNorm(ve_hidden_size),
+             nn.Linear(ve_hidden_size, intermediate_size),
+             nn.GELU(),
+             nn.Linear(intermediate_size, hidden_size)
+         )
+
+     def forward(self, image_encoders):
+         return self.proj(image_encoders)
+
+
+ class VerMindVLM(VerMindForCausalLM):
+     """VerMind Vision-Language Model"""
+     config_class = VLMConfig
+
+     def __init__(self, params: VLMConfig = None, vision_model_path="google/siglip-base-patch16-224"):
+         # Build our own model structure instead of running the parent __init__
+         self.params = params or VLMConfig()
+         # Call nn.Module.__init__ directly to avoid building the backbone twice
+         nn.Module.__init__(self)
+         self.config = self.params
+
+         # Build the model components
+         self.model = VerMindVLMModel(self.params)
+         self.lm_head = nn.Linear(self.params.hidden_size, self.params.vocab_size, bias=False)
+         self.model.embed_tokens.weight = self.lm_head.weight
+
+         # Vision components
+         self.vision_encoder, self.processor = self.__class__.get_vision_model(vision_model_path)
+         self.vision_proj = VisionProj(ve_hidden_size=768, hidden_size=self.params.hidden_size)
+
+     @staticmethod
+     def get_vision_model(model_path: str):
+         """Load vision encoder (SigLIP)"""
+         from transformers import logging as hf_logging
+         from transformers import SiglipVisionModel, SiglipProcessor
+         hf_logging.set_verbosity_error()
+
+         if not os.path.exists(model_path) and "/" not in model_path:
+             return None, None
+
+         print(f"[VerMind-V] Loading Vision Encoder: {model_path}...")
+         try:
+             vision_model = SiglipVisionModel.from_pretrained(model_path)
+             processor = SiglipProcessor.from_pretrained(model_path)
+         except Exception as e:
+             print(f"Error loading SigLIP Vision: {e}")
+             return None, None
+
+         for param in vision_model.parameters():
+             param.requires_grad = False
+         return vision_model.eval(), processor
+
+     @staticmethod
+     def image2tensor(image, processor):
+         """Convert PIL image to tensor"""
+         if image.mode in ['RGBA', 'LA']:
+             image = image.convert('RGB')
+         inputs = processor(images=image, return_tensors="pt")['pixel_values']
+         return inputs
+
+     @staticmethod
+     def get_image_embeddings(image_tensors, vision_model):
+         """Extract image features from vision encoder"""
+         outputs = vision_model(pixel_values=image_tensors)
+         return outputs.last_hidden_state
+
+     def count_vision_proj(self, tokens, h, vision_tensors=None, seqlen=512):
+         """Insert vision projections into hidden states at image token positions"""
+         def find_indices(tokens, image_ids):
+             image_ids_tensor = torch.tensor(image_ids).to(tokens.device)
+             len_image_ids = len(image_ids)
+             if len_image_ids > tokens.size(1):
+                 return None
+             tokens_view = tokens.unfold(1, len_image_ids, 1)
+             matches = (tokens_view == image_ids_tensor).all(dim=2)
+             return {
+                 batch_idx: [(idx.item(), idx.item() + len_image_ids - 1) for idx in
+                             matches[batch_idx].nonzero(as_tuple=True)[0]]
+                 for batch_idx in range(tokens.size(0)) if matches[batch_idx].any()
+             } or None
+
+         image_indices = find_indices(tokens, self.params.image_ids)
+
+         if vision_tensors is not None and image_indices:
+             vision_proj = self.vision_proj(vision_tensors)
+             if len(vision_proj.shape) == 3:
+                 vision_proj = vision_proj.unsqueeze(0)
+
+             new_h = []
+             for i in range(h.size(0)):
+                 if i in image_indices:
+                     h_i = h[i]
+                     img_idx = 0
+                     for start_idx, end_idx in image_indices[i]:
+                         if vision_proj.dim() == 4:
+                             current_vision_embeds = vision_proj[0, i]
+                         else:
+                             current_vision_embeds = vision_proj[i]
+
+                         if img_idx < 1:
+                             h_i = torch.cat((h_i[:start_idx], current_vision_embeds, h_i[end_idx + 1:]), dim=0)[:seqlen]
+                             img_idx += 1
+                     new_h.append(h_i)
+                 else:
+                     new_h.append(h[i])
+             return torch.stack(new_h, dim=0)
+         return h
+
+     def forward(self, input_ids=None, attention_mask=None, labels=None,
+                 past_key_values=None, use_cache=False, logits_to_keep=0,
+                 pixel_values=None, **args):
+         batch_size, seq_length = input_ids.shape
+         if hasattr(past_key_values, 'layers'):
+             past_key_values = None
+         past_key_values = past_key_values or [None] * len(self.model.layers)
+         start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0
+
+         hidden_states = self.model.dropout(self.model.embed_tokens(input_ids))
+
+         if pixel_values is not None and start_pos == 0:
+             if len(pixel_values.shape) == 5:
+                 pixel_values = pixel_values[:, 0, :, :, :]
+             vision_tensors = VerMindVLM.get_image_embeddings(pixel_values, self.vision_encoder)
+             hidden_states = self.count_vision_proj(
+                 tokens=input_ids,
+                 h=hidden_states,
+                 vision_tensors=vision_tensors,
+                 seqlen=input_ids.shape[1]
+             )
+
+         position_embeddings = (
+             self.model.freqs_cos[start_pos:start_pos + seq_length],
+             self.model.freqs_sin[start_pos:start_pos + seq_length]
+         )
+
+         presents = []
+         for layer_idx, (layer, past_key_value) in enumerate(zip(self.model.layers, past_key_values)):
+             hidden_states, present = layer(
+                 hidden_states,
+                 position_embeddings,
+                 past_key_value=past_key_value,
+                 use_cache=use_cache,
+                 attention_mask=attention_mask
+             )
+             presents.append(present)
+
+         hidden_states = self.model.norm(hidden_states)
+
+         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+         logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+         loss = None
+         if labels is not None:
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=-100)
+
+         output = CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=presents, hidden_states=hidden_states)
+         return output
+
+
+ class VerMindVLMModel(VerMindModel):
+     """VerMind-V Model (extends VerMindModel for VLM)"""
+     pass  # Inherits everything from VerMindModel
+
+
+ # Register the model class
+ AutoModelForCausalLM.register(VerMindVLM.config_class, VerMindVLM)
+
+ __all__ = ["VerMindVLM", "VisionProj", "VerMindVLMModel"]
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|im_start|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "bos_token": "<|im_start|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "extra_special_tokens": {},
+   "legacy": true,
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<|endoftext|>"
+ }