hbpkillerX committed on
Commit
32d3d5b
·
verified ·
1 Parent(s): ca5da25

Complete model upload with all necessary files

Browse files
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CustomLlamaForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_custom_llama.CustomLlamaConfig",
7
+ "AutoModelForCausalLM": "modeling_custom_llama.CustomLlamaForCausalLM"
8
+ },
9
+ "d_head": 64,
10
+ "d_mlp_proj": 2560,
11
+ "d_model": 960,
12
+ "dtype": "float32",
13
+ "initializer_range": 0.02,
14
+ "model_type": "custom_llama",
15
+ "n_attn_heads": 15,
16
+ "n_kv_heads": 5,
17
+ "n_layers": 16,
18
+ "pad_token_id": 0,
19
+ "rms_norm_eps": 1e-05,
20
+ "rope_theta": 100000.0,
21
+ "tie_word_embeddings": false,
22
+ "transformers_version": "4.56.1",
23
+ "vocab_size": 49152
24
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c1a12cab395be35c66733a2e57c17f5290540b46e8ecb6581df574f2415b49b
3
+ size 1006775160
modeling_custom_llama.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modeling_custom_llama.py
2
+ # Note: We are adapting your original code to fit the transformers library structure.
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.nn as nn
10
+
11
+ # Import the necessary base classes from transformers
12
+ from transformers.configuration_utils import PretrainedConfig
13
+ from transformers.modeling_utils import PreTrainedModel
14
+ from transformers.utils import logging
15
+
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
# Step 2a: Create a Config class that inherits from PretrainedConfig
# This is crucial for saving/loading the model's architecture.
class CustomLlamaConfig(PretrainedConfig):
    """Hyper-parameters for the custom Llama-style causal LM.

    ``model_type`` ties this config to the ``"custom_llama"`` key used by
    the transformers Auto* registries and by saved ``config.json`` files.
    """

    model_type = "custom_llama"

    def __init__(
        self,
        vocab_size: int = 32000,
        d_model: int = 960,
        d_head: int = 64,
        d_mlp_proj: int = 2560,
        n_kv_heads: int = 5,
        n_attn_heads: int = 15,
        n_layers: int = 16,
        rms_norm_eps: float = 1e-5,
        rope_theta: float = 100000.0,
        initializer_range: float = 0.02,
        # `pad_token_id` is forwarded to PretrainedConfig (the standard HF
        # attribute) instead of being kept as a bespoke `padding_idx`,
        # which avoids any clash with kwargs.
        pad_token_id: Optional[int] = None,
        tie_word_embeddings: bool = False,
        **kwargs
    ):
        # Model-shape hyper-parameters. PretrainedConfig serializes every
        # attribute assigned here into config.json automatically.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.d_head = d_head
        self.d_mlp_proj = d_mlp_proj
        self.n_kv_heads = n_kv_heads
        self.n_attn_heads = n_attn_heads
        self.n_layers = n_layers
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.initializer_range = initializer_range

        # Standard token / weight-tying options go through the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )
59
+
60
# Helper module: rotary position embeddings (RoPE).
class Rotary(nn.Module):
    """Lazily builds and memoizes cos/sin rotation tables for RoPE.

    ``forward`` returns tables of shape ``(seq_len, d_head)``; they are
    recomputed only when the observed sequence length changes.
    """

    def __init__(self, config):
        super().__init__()
        exponents = torch.arange(0, config.d_head, 2).float() / config.d_head
        # Per-frequency inverse wavelengths; not saved in checkpoints.
        self.register_buffer('inv_freq', 1.0 / (config.rope_theta ** exponents), persistent=False)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x, seq_dim=1):
        """Return (cos, sin) tables covering ``x``'s sequence dimension."""
        seq_len = x.size(seq_dim)
        if seq_len != self.seq_len_cached:
            positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
            freqs = torch.outer(positions, self.inv_freq)
            table = torch.cat((freqs, freqs), dim=-1)
            self.cos_cached = table.cos()
            self.sin_cached = table.sin()
            self.seq_len_cached = seq_len
        # NOTE(review): the cache keys only on seq_len — if `x` changes
        # device between calls, stale-device tables are returned; confirm
        # the model stays on a single device.
        return self.cos_cached, self.sin_cached
83
+
84
+
85
class GroupedQueryAttention(nn.Module):
    """Causal self-attention with grouped-query KV heads (GQA).

    Queries use ``n_attn_heads`` heads while keys/values use the smaller
    ``n_kv_heads``; KV heads are replicated to the query head count before
    attention. Rotary position embeddings are applied to q and k.
    """

    def __init__(self, config):
        super(GroupedQueryAttention, self).__init__()
        self.q_proj = nn.Linear(config.d_model, config.n_attn_heads * config.d_head, bias=False)
        self.k_proj = nn.Linear(config.d_model, config.n_kv_heads * config.d_head, bias=False)
        self.v_proj = nn.Linear(config.d_model, config.n_kv_heads * config.d_head, bias=False)
        self.o_proj = nn.Linear(config.d_model, config.d_model, bias=False)

        self.config = config
        self.attn_scale = config.d_head ** -0.5

        # Prefer the fused SDPA kernel when this PyTorch build provides it.
        self.use_flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')

    @staticmethod
    def _rotate_half(x):
        """Rotate the last dim: (x1, x2) -> (-x2, x1), as used by RoPE."""
        half = x.shape[-1] // 2
        x1, x2 = x[..., :half], x[..., half:]
        return torch.cat([-x2, x1], dim=-1)

    def _apply_rotary_pos_emb(self, q, k, cos, sin):
        """Apply rotary embeddings to q and k (cos/sin broadcast over heads)."""
        return q * cos + self._rotate_half(q) * sin, k * cos + self._rotate_half(k) * sin

    def forward(self, x, cos, sin):
        """Causally attend over ``x`` of shape (batch, seq_len, d_model)."""
        b_size, seq_len, _ = x.shape
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Shape to (b_size, n_heads or n_kv_heads, seq_len, d_head)
        q = q.view(b_size, seq_len, -1, self.config.d_head).transpose(1, 2)
        k = k.view(b_size, seq_len, -1, self.config.d_head).transpose(1, 2)
        v = v.view(b_size, seq_len, -1, self.config.d_head).transpose(1, 2)

        q, k = self._apply_rotary_pos_emb(q, k, cos, sin)

        # BUG FIX: expand KV heads to the query head count *before* either
        # attention path. F.scaled_dot_product_attention does NOT broadcast
        # mismatched head counts by default (that requires enable_gqa=True,
        # available only on PyTorch >= 2.5), so the previous flash path
        # raised a shape error whenever n_attn_heads != n_kv_heads.
        n_rep = self.config.n_attn_heads // self.config.n_kv_heads
        if n_rep > 1:
            k = k.repeat_interleave(n_rep, dim=1)
            v = v.repeat_interleave(n_rep, dim=1)

        if self.use_flash:
            out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        else:
            qk_scaled = q @ k.transpose(-2, -1) * self.attn_scale

            # Additive causal mask: 0 on/below the diagonal, -inf above.
            attn_bias = torch.zeros(1, 1, seq_len, seq_len, device=q.device, dtype=q.dtype)
            temp_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device).tril(diagonal=0)
            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))

            attn = F.softmax(qk_scaled + attn_bias, dim=-1)
            out = attn @ v

        out = out.transpose(1, 2).contiguous().view(b_size, seq_len, -1)
        return self.o_proj(out)
142
+
143
+
144
class GatedMlp(nn.Module):
    """SwiGLU-style feed-forward block: down(silu(gate(x)) * up(x))."""

    def __init__(self, config):
        super().__init__()
        self.up_proj = nn.Linear(config.d_model, config.d_mlp_proj, bias=False)
        self.gate_proj = nn.Linear(config.d_model, config.d_mlp_proj, bias=False)
        self.down_proj = nn.Linear(config.d_mlp_proj, config.d_model, bias=False)
        self.silu = nn.SiLU()

    def forward(self, x):
        """Project up with a SiLU gate, then project back to d_model."""
        gated = self.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
158
+
159
+
160
class DecoderLayer(nn.Module):
    """One pre-norm transformer decoder block: attention, then gated MLP,
    each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.self_attn = GroupedQueryAttention(config)
        self.mlp = GatedMlp(config)
        self.input_layernorm = nn.RMSNorm(config.d_model, config.rms_norm_eps)
        self.post_attention_layernorm = nn.RMSNorm(config.d_model, config.rms_norm_eps)

    def forward(self, x, cos, sin):
        """Apply both sub-layers; cos/sin are the shared RoPE tables."""
        hidden = x + self.self_attn(self.input_layernorm(x), cos, sin)
        hidden = hidden + self.mlp(self.post_attention_layernorm(hidden))
        return hidden
174
+
175
+
176
# Step 2b: Create the main Model class that inherits from PreTrainedModel
# We'll rename it to follow HF conventions: `...ForCausalLM`
class CustomLlamaForCausalLM(PreTrainedModel):
    """Decoder-only causal language model built from DecoderLayer blocks.

    ``forward`` returns a plain ``(loss, logits)`` tuple; ``loss`` is
    ``None`` unless ``labels`` is supplied.
    """

    # Link this model to its config class
    config_class = CustomLlamaConfig

    def __init__(self, config: CustomLlamaConfig):
        super().__init__(config)
        self.config = config

        self.embed_tokens = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.d_model,
            # `nn.Embedding` expects a parameter named `padding_idx`; its
            # value comes from the standard `config.pad_token_id`.
            padding_idx=config.pad_token_id
        )
        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.n_layers)])
        self.norm = nn.modules.normalization.RMSNorm(config.d_model, config.rms_norm_eps)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.rotary_emb = Rotary(config)

        self.post_init()

    def _init_weights(self, module):
        """Custom weight init; called by `post_init` for every submodule."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            # Keep the padding row at zero so pad tokens contribute nothing.
            if self.config.pad_token_id is not None:
                module.weight.data[self.config.pad_token_id].zero_()

    def forward(
        self,
        input_ids: torch.LongTensor,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple:
        """Run the decoder stack.

        Args:
            input_ids: (batch, seq_len) token ids.
            labels: optional (batch, seq_len) targets; when given, a
                shifted next-token cross-entropy loss is computed.

        Returns:
            Tuple ``(loss, logits)`` — ``loss`` is None without labels.
        """
        x = self.embed_tokens(input_ids)
        cos, sin = self.rotary_emb(x, seq_dim=1)
        for layer in self.layers:
            x = layer(x, cos, sin)
        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        return (loss, logits)

    @torch.no_grad()
    def generate(self, idx, temperature=1.0, top_k=None, max_new_tokens=128):
        """Autoregressively sample `max_new_tokens` tokens appended to `idx`.

        BUG FIX: `forward` returns the 2-tuple `(loss, logits)`. The
        previous `logits, _, _ = self(idx)` unpacked three values (a
        ValueError at runtime), and even as a two-way unpack it would have
        bound the loss — not the logits — to `logits`.
        """
        for _ in range(max_new_tokens):
            _, logits = self(idx)
            # Focus on the final position's distribution, scaled by temperature.
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # Zero out (via -inf) everything below the k-th best logit.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            probs = F.softmax(logits, dim=-1)

            idx_next = torch.multinomial(probs, num_samples=1)

            idx = torch.cat((idx, idx_next), dim=1)

        return idx

    def using_flash_attention(self):
        """Report whether the attention layers use the SDPA (flash) path."""
        return self.layers[0].self_attn.use_flash
264
+
265
# Step 2d: Register your custom classes with the Auto-classes.
# This is what allows `AutoModelForCausalLM.from_pretrained` to find your model.
# Registration runs at import time, so merely importing this module makes
# the "custom_llama" model_type resolvable through the Auto* factories.
from transformers import AutoConfig, AutoModelForCausalLM

AutoConfig.register("custom_llama", CustomLlamaConfig)
AutoModelForCausalLM.register(CustomLlamaConfig, CustomLlamaForCausalLM)
special_tokens_map.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
+ ],
21
+ "bos_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": "<|endoftext|>",
36
+ "unk_token": {
37
+ "content": "<|endoftext|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false
42
+ }
43
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|endoftext|>",
143
+ "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
+ ],
160
+ "bos_token": "<|endoftext|>",
161
+ "clean_up_tokenization_spaces": false,
162
+ "eos_token": "<|endoftext|>",
163
+ "extra_special_tokens": {},
164
+ "model_max_length": 8192,
165
+ "pad_token": "<|endoftext|>",
166
+ "tokenizer_class": "GPT2Tokenizer",
167
+ "unk_token": "<|endoftext|>",
168
+ "vocab_size": 49152
169
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff