NotShrirang committed on
Commit
f9db966
·
verified ·
1 Parent(s): 0c6578d

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "TinyGPT2ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_tinygpt2.TinyGPT2HFConfig",
7
+ "AutoModelForCausalLM": "modeling_tinygpt2.TinyGPT2ForCausalLM"
8
+ },
9
+ "block_size": 512,
10
+ "bos_token_id": null,
11
+ "dropout": 0.1,
12
+ "dtype": "float32",
13
+ "eos_token_id": 50256,
14
+ "gqa_kv_head": 4,
15
+ "hidden_size": 2048,
16
+ "model_type": "tinygpt2",
17
+ "n_embd": 768,
18
+ "n_head": 12,
19
+ "n_layer": 12,
20
+ "pad_token_id": 50257,
21
+ "transformers_version": "5.5.4",
22
+ "vocab_size": 50304
23
+ }
configuration_tinygpt2.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HuggingFace-compatible configuration for TinyGPT2 models."""
2
+
3
+ from transformers import PretrainedConfig
4
+
5
+
6
class TinyGPT2HFConfig(PretrainedConfig):
    """Hyperparameter container for TinyGPT2 checkpoints.

    Args:
        vocab_size: Number of rows in the token embedding / output projection.
        block_size: Nominal context length of the model.
        n_embd: Width of the residual stream (embedding dimension).
        n_head: Number of query heads per attention layer.
        n_layer: Number of transformer blocks.
        gqa_kv_head: Number of key/value head groups shared by the query
            heads (grouped-query attention).
        hidden_size: Inner width of each block's feed-forward network.
            NOTE: unlike most HuggingFace configs, this is *not* the model
            width; the model width is ``n_embd``.
        dropout: Dropout probability handed to the blocks.
        pad_token_id: Padding token id, forwarded to ``PretrainedConfig``.
        eos_token_id: End-of-sequence token id, forwarded to
            ``PretrainedConfig``.
        bos_token_id: Beginning-of-sequence token id (``None`` when the
            tokenizer has none), forwarded to ``PretrainedConfig``.
        **kwargs: Any remaining options, passed through to
            ``PretrainedConfig.__init__`` untouched.
    """

    model_type = "tinygpt2"

    def __init__(
        self,
        vocab_size=50304,
        block_size=512,
        n_embd=768,
        n_head=12,
        n_layer=12,
        gqa_kv_head=4,
        hidden_size=2048,
        dropout=0.1,
        pad_token_id=50257,
        eos_token_id=50256,
        bos_token_id=None,
        **kwargs,
    ):
        # Model-specific fields are stored before the base initialiser runs.

        # Vocabulary and context window.
        self.vocab_size = vocab_size
        self.block_size = block_size

        # Transformer stack geometry.
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.n_head = n_head
        self.gqa_kv_head = gqa_kv_head
        self.hidden_size = hidden_size

        # Regularisation.
        self.dropout = dropout

        # Special-token ids are owned by the base class.
        special_tokens = {
            "pad_token_id": pad_token_id,
            "eos_token_id": eos_token_id,
            "bos_token_id": bos_token_id,
        }
        super().__init__(**special_tokens, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 50256,
4
+ "output_attentions": false,
5
+ "output_hidden_states": false,
6
+ "pad_token_id": 50257,
7
+ "transformers_version": "5.5.4"
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60dd608d7c9fe3fec24c2339f8b446181228fcc5d6e7bcff1f26912a702983b5
3
+ size 381512120
modeling_tinygpt2.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HuggingFace-compatible model definition for TinyGPT2.
2
+
3
+ This file is self-contained so it works when downloaded from the HuggingFace Hub
4
+ with `trust_remote_code=True`.
5
+ """
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from transformers import PreTrainedModel, GenerationMixin
11
+ from transformers.modeling_outputs import CausalLMOutputWithPast
12
+
13
+ from configuration_tinygpt2 import TinyGPT2HFConfig
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Layers (self-contained copies so this file works standalone on HF Hub)
18
+ # ---------------------------------------------------------------------------
19
+
20
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned gain.

    Args:
        dim: Size of the last axis of the inputs; one gain value is learned
            per feature and initialised to 1.
        eps: Constant added to the mean square before the square root so the
            division stays finite for all-zero inputs.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Return ``weight * x / sqrt(mean(x**2, last axis) + eps)``."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        denom = (mean_square + self.eps).sqrt()
        return self.weight * (x / denom)
29
+
30
+
31
def precompute_freqs_cis(dim, seq_len, theta=10000.0):
    """Build the complex rotation table used for rotary position embeddings.

    Args:
        dim: Per-head feature size; one frequency is produced per pair of
            features (``dim // 2`` for even ``dim``).
        seq_len: Number of positions to tabulate, starting at position 0.
        theta: Base of the geometric frequency progression.

    Returns:
        A complex tensor with one row per position and one column per
        frequency, where entry ``[t, i]`` is the unit complex number with
        angle ``t * theta ** (-2i / dim)``.
    """
    exponents = torch.arange(0, dim, 2).float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(seq_len, dtype=torch.float)
    angles = torch.outer(positions, inv_freq)
    unit_magnitude = torch.ones_like(angles)
    return torch.polar(unit_magnitude, angles)
36
+
37
+
38
def apply_rotary_emb(x, freqs_cis):
    """Rotate consecutive feature pairs of ``x`` by per-position phases.

    Args:
        x: Tensor laid out as (B, T, H, D); the last axis is read as D/2
            (real, imaginary) pairs.
        freqs_cis: Complex rotation table whose first ``T`` rows are used,
            one row per position in ``x``.

    Returns:
        A tensor with the same shape and dtype as ``x``.
    """
    seq_len = x.shape[1]
    # Arithmetic is done in float32 and cast back to the input dtype at the end.
    pairs = x.float().reshape(*x.shape[:-1], -1, 2)
    as_complex = torch.view_as_complex(pairs)
    # Singleton batch and head axes let the table broadcast over B and H.
    rotation = freqs_cis[:seq_len].view(1, seq_len, 1, -1)
    rotated = as_complex * rotation
    return torch.view_as_real(rotated).flatten(-2).type_as(x)
44
+
45
+
46
class GroupedQueryAttention(nn.Module):
    """Self-attention in which several query heads share each key/value head.

    Args:
        n_embd: Model width; must split evenly into ``n_head`` heads.
        n_head: Number of query heads.
        n_query_groups: Number of key/value heads. Each one is shared by
            ``n_head // n_query_groups`` query heads.
        dropout: Probability for the ``nn.Dropout`` module stored on
            ``self.dropout``.
    """

    def __init__(self, n_embd, n_head, n_query_groups, dropout=0.1):
        super().__init__()
        assert n_head % n_query_groups == 0
        self.n_head = n_head
        self.n_query_groups = n_query_groups
        self.head_dim = n_embd // n_head

        self.q_proj = nn.Linear(n_embd, n_embd, bias=False)
        self.k_proj = nn.Linear(n_embd, n_query_groups * self.head_dim, bias=False)
        self.v_proj = nn.Linear(n_embd, n_query_groups * self.head_dim, bias=False)
        self.out_proj = nn.Linear(n_embd, n_embd, bias=False)
        # NOTE(review): this module is constructed but forward() never calls
        # it. It is kept so the module layout stays unchanged.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, freqs_cis, is_causal=True, kv_cache=None):
        """Attend over ``x`` (and any cached keys/values).

        Args:
            x: Input of shape (B, T, C).
            freqs_cis: Rotary table rows for the T positions held in ``x``.
            is_causal: When true, a position may only attend to itself and
                to earlier positions.
            kv_cache: Optional ``(k, v)`` pair from earlier calls, each laid
                out as (B, S_past, G, D) with rotary phases already applied
                to the keys.

        Returns:
            ``(output, (k, v))`` where ``output`` has shape (B, T, C) and the
            second element is the key/value cache extended with this call's
            T positions (still un-expanded, G heads).
        """
        B, T, C = x.shape
        H, G, D = self.n_head, self.n_query_groups, self.head_dim

        q = self.q_proj(x).view(B, T, H, D)
        k = self.k_proj(x).view(B, T, G, D)
        v = self.v_proj(x).view(B, T, G, D)

        q = apply_rotary_emb(q, freqs_cis)
        k = apply_rotary_emb(k, freqs_cis)

        if kv_cache is not None:
            k_past, v_past = kv_cache
            k = torch.cat([k_past, k], dim=1)
            v = torch.cat([v_past, v], dim=1)

        new_kv_cache = (k, v)
        total_len = k.shape[1]  # cached positions + the T new ones

        # Repeat each key/value head for the query heads that share it.
        k = k[:, :, :, None, :].expand(B, -1, G, H // G, D).reshape(B, -1, H, D)
        v = v[:, :, :, None, :].expand(B, -1, G, H // G, D).reshape(B, -1, H, D)

        q, k, v = (t.transpose(1, 2) for t in (q, k, v))

        attn_mask = None
        use_causal = False
        if is_causal:
            if kv_cache is None:
                # Square (T x T) case: the built-in causal flag is exact.
                use_causal = True
            elif T > 1:
                # Several new tokens on top of a cache: query i sits at
                # absolute position (total_len - T + i) and must not see the
                # new tokens after it. The built-in flag aligns its triangle
                # to the top-left corner for non-square shapes, so build a
                # bottom-right-aligned mask explicitly (True = may attend).
                attn_mask = torch.ones(
                    T, total_len, dtype=torch.bool, device=x.device
                ).tril(diagonal=total_len - T)
            # With a cache and T == 1 the single query is the latest position
            # and may attend to everything, so no mask is needed.

        attn_output = F.scaled_dot_product_attention(
            q, k, v, attn_mask=attn_mask, is_causal=use_causal
        )
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(attn_output), new_kv_cache
87
+
88
+
89
class TinyGPT2Block(nn.Module):
    """One pre-norm transformer layer: attention, then a feed-forward network.

    Both sub-layers normalise their input with RMSNorm and add their output
    back onto the residual stream.

    Args:
        config: Object exposing ``n_embd``, ``n_head``, ``gqa_kv_head``,
            ``hidden_size`` (feed-forward inner width) and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        inner_width = config.hidden_size
        drop_prob = config.dropout

        self.ln1 = RMSNorm(width)
        self.attn = GroupedQueryAttention(
            width, config.n_head, config.gqa_kv_head, drop_prob
        )
        self.ln2 = RMSNorm(width)
        self.ffwd = nn.Sequential(
            nn.Linear(width, inner_width),
            nn.GELU(),
            nn.Linear(inner_width, width),
            nn.Dropout(drop_prob),
        )

    def forward(self, x, freqs_cis, is_causal=True, kv_cache=None):
        """Run the layer and return ``(hidden_states, new_kv_cache)``.

        ``freqs_cis``, ``is_causal`` and ``kv_cache`` are handed to the
        attention sub-layer unchanged; ``new_kv_cache`` is whatever that
        sub-layer returns as its updated cache.
        """
        attn_out, new_kv_cache = self.attn(
            self.ln1(x), freqs_cis, is_causal, kv_cache
        )
        after_attn = x + attn_out
        after_ffwd = after_attn + self.ffwd(self.ln2(after_attn))
        return after_ffwd, new_kv_cache
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # HuggingFace PreTrainedModel wrapper
118
+ # ---------------------------------------------------------------------------
119
+
120
class TinyGPT2ForCausalLM(PreTrainedModel, GenerationMixin):
    """TinyGPT2 decoder with a language-modelling head, wrapped for HuggingFace.

    The model is a token embedding, ``config.n_layer`` ``TinyGPT2Block``
    layers, a final RMSNorm and a bias-free output projection whose weight
    is shared with the token embedding. Positions are encoded with rotary
    embeddings from a table precomputed for ``2 * config.block_size``
    positions.

    The key/value cache is a plain sequence with one ``(k, v)`` tuple per
    layer, each tensor laid out as (batch, cached_len, kv_heads, head_dim).
    """

    _tied_weights_keys = {"lm_head.weight": "token_embedding.weight"}
    config_class = TinyGPT2HFConfig

    def __init__(self, config: TinyGPT2HFConfig):
        super().__init__(config)
        self.config = config

        self.token_embedding = nn.Embedding(config.vocab_size, config.n_embd)
        self.blocks = nn.ModuleList(
            [TinyGPT2Block(config) for _ in range(config.n_layer)]
        )
        self.ln_f = RMSNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying: the embedding and the output projection share one
        # parameter tensor.
        self.token_embedding.weight = self.lm_head.weight

        # Precompute RoPE frequencies for twice the nominal context length.
        self.register_buffer(
            "freqs_cis",
            precompute_freqs_cis(
                config.n_embd // config.n_head, config.block_size * 2
            ),
        )

        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.token_embedding = value

    def get_output_embeddings(self):
        """Return the output projection (language-modelling head)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the output projection (language-modelling head)."""
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        past_key_values=None,
        labels=None,
        use_cache=False,
        **kwargs,
    ):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            input_ids: Token ids of shape (batch, seq_len). Required.
            attention_mask: Accepted for API compatibility only; it is not
                used, so padded positions are attended to like any other.
            past_key_values: Optional per-layer ``(k, v)`` cache returned by
                a previous call. When non-empty, ``input_ids`` is taken to
                continue from the cached length.
            labels: Optional target ids with the same shape as
                ``input_ids``. They are shifted internally so position ``t``
                predicts token ``t + 1``. Targets equal to ``-100`` or to
                ``config.pad_token_id`` do not contribute to the loss.
            use_cache: When true, the updated cache is returned.
            **kwargs: Ignored.

        Returns:
            ``CausalLMOutputWithPast`` with ``logits`` of shape
            (batch, seq_len, vocab_size), ``loss`` (``None`` without
            labels) and ``past_key_values`` (``None`` unless ``use_cache``).

        Raises:
            ValueError: If ``input_ids`` is missing, or if the requested
                positions run past the precomputed rotary table.
        """
        if input_ids is None:
            raise ValueError("TinyGPT2ForCausalLM.forward requires `input_ids`.")

        _, T = input_ids.shape

        x = self.token_embedding(input_ids)

        has_cache = past_key_values is not None and len(past_key_values) > 0
        # New tokens start right after the cached keys of the first layer.
        start_pos = past_key_values[0][0].shape[1] if has_cache else 0
        end_pos = start_pos + T

        max_positions = self.freqs_cis.shape[0]
        if end_pos > max_positions:
            # Without this check the slice below would silently come back
            # short and fail later with an unrelated shape error.
            raise ValueError(
                f"Sequence reaches position {end_pos}, but rotary embeddings "
                f"were precomputed for only {max_positions} positions."
            )
        freqs_cis = self.freqs_cis[start_pos:end_pos]

        new_kv_caches = []
        for i, block in enumerate(self.blocks):
            kv_cache = past_key_values[i] if has_cache else None
            x, new_cache = block(x, freqs_cis, is_causal=True, kv_cache=kv_cache)
            new_kv_caches.append(new_cache)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Fold padding targets into the conventional -100 sentinel so
            # both pad-id labels and -100 labels are skipped.
            pad_id = self.config.pad_token_id
            if pad_id is not None:
                shift_labels = shift_labels.masked_fill(
                    shift_labels == pad_id, -100
                )
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=new_kv_caches if use_cache else None,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        """Assemble the forward() arguments for one generation step.

        Once a non-empty cache exists only the most recent token is fed to
        the model. Caching is always requested; other keyword arguments are
        dropped.
        """
        if past_key_values is not None and len(past_key_values) > 0:
            input_ids = input_ids[:, -1:]
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": True,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Select cache rows along the batch axis according to ``beam_idx``."""
        return tuple(
            (k.index_select(0, beam_idx), v.index_select(0, beam_idx))
            for k, v in past_key_values
        )