harshit36 committed
Commit e63dd1f · verified · 1 Parent(s): 4f31c83

Upload 6 files

config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "architectures": [
+     "NovaForCausalLM"
+   ],
+   "block_size": 256,
+   "model_type": "nova",
+   "n_embd": 640,
+   "n_head": 8,
+   "n_layer": 4,
+   "torch_dtype": "float32",
+   "transformers_version": "4.55.4",
+   "vocab_size": 6000
+ }
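
Since `model_type` "nova" is a custom architecture shipped in this repo rather than in `transformers`, and the config carries no `auto_map` entry, the Auto classes will not resolve it out of the box. A minimal sketch of one way to wire it up, assuming `nova_modelling.py` from this commit is importable and the uploaded files sit in a local `./nova` directory (the path is illustrative, not part of the commit):

```python
from transformers import AutoConfig, AutoModelForCausalLM
from nova_modelling import NovaConfig, NovaForCausalLM  # classes defined later in this commit

# Register the custom "nova" model type so the Auto classes can resolve it
AutoConfig.register("nova", NovaConfig)
AutoModelForCausalLM.register(NovaConfig, NovaForCausalLM)

config = AutoConfig.from_pretrained("./nova")           # block_size=256, n_embd=640, n_head=8, n_layer=4, vocab_size=6000
model = AutoModelForCausalLM.from_pretrained("./nova")  # loads model.safetensors from the same directory
```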
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2b769ee0e5b1f8532e9abcf402a6e59ebe154b0c4d538dfd0069e3854350d9c
+ size 66712216
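
The LFS pointer advertises a ~66.7 MB payload; with `torch_dtype` float32 (4 bytes per value) that is roughly 16.7M stored values, which lines up with the trainable parameters plus the persistent `tril` and `sinusoidal_encoding` buffers registered in `nova_modelling.py`. A small sanity-check sketch, under the same local-import assumption as above:

```python
from nova_modelling import NovaConfig, NovaForCausalLM

model = NovaForCausalLM(NovaConfig())
n_values = sum(t.numel() for t in model.state_dict().values())  # parameters + persistent buffers
print(n_values, n_values * 4)  # ~16.7M values, ~66.7 MB in float32 (plus a small safetensors header)
```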
nova_modelling.py ADDED
@@ -0,0 +1,164 @@
+ import math
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional
+ from transformers import PreTrainedModel, PretrainedConfig
+
+ class Heads(nn.Module):
+     def __init__(self, feature_embed, head_size, block_size):
+         super().__init__()
+
+         self.q = nn.Linear(feature_embed, head_size, bias=False)
+         self.k = nn.Linear(feature_embed, head_size, bias=False)
+         self.v = nn.Linear(feature_embed, head_size, bias=False)
+         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+         self.dropout = nn.Dropout(0.15)
+
+     def forward(self, x):
+         B, T, C = x.shape
+         k = self.k(x)
+         q = self.q(x)
+         v = self.v(x)
+
+         weighted = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
+         weighted = weighted.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+         weighted = functional.softmax(weighted, dim=-1)
+         weighted = self.dropout(weighted)
+         return weighted @ v
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, head_size, n_heads, feature_embed, block_size):
+         super().__init__()
+
+         self.multiple_heads = nn.ModuleList(Heads(feature_embed, head_size, block_size) for _ in range(n_heads))
+         self.linear = nn.Linear(head_size * n_heads, feature_embed)
+         self.dropout = nn.Dropout(0.1)
+
+     def forward(self, x):
+         out = torch.cat([head(x) for head in self.multiple_heads], dim=-1)
+         out = self.linear(out)
+         return self.dropout(out)
+
+ class Decoder(nn.Module):
+     def __init__(self, feature_embed, n_heads, block_size):
+         super().__init__()
+
+         head_size = feature_embed // n_heads
+         self.multihead = MultiHeadAttention(head_size, n_heads, feature_embed, block_size=block_size)
+         self.layerNorm = nn.LayerNorm(feature_embed)
+
+     def forward(self, x):
+         y = self.multihead(x)
+         return self.layerNorm(x + y)
+
+ class NOVA(nn.Module):
+     def __init__(self, vocab_size, block_size=256, feature_embed=640, n_layers=4, n_heads=8):
+         super().__init__()
+
+         self.vocab_size = vocab_size
+         self.block_size = block_size
+         self.feature_embed = feature_embed
+         self.n_layers = n_layers
+         self.n_heads = n_heads
+
+         self.vector_embedding = nn.Embedding(vocab_size, feature_embed)
+         self.learnable_position = nn.Embedding(block_size, feature_embed)  # learnable positional encoding
+
+         # Sinusoidal positional encoding
+         sinusoid = torch.zeros(block_size, feature_embed)
+         position = torch.arange(0, block_size, dtype=torch.float32).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, feature_embed, 2).float() * (-math.log(10000.0) / feature_embed))
+         sinusoid[:, 0::2] = torch.sin(position * div_term)
+         sinusoid[:, 1::2] = torch.cos(position * div_term)
+         self.register_buffer('sinusoidal_encoding', sinusoid)  # not trainable
+
+         # initialising the decoder stack
+         self.decoder_block = nn.Sequential(*[
+             Decoder(feature_embed, n_heads=n_heads, block_size=self.block_size) for _ in range(n_layers)
+         ])
+         self.linear_head = nn.Linear(feature_embed, vocab_size)
+         self.layer_norm = nn.LayerNorm(feature_embed)
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.01)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         if isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.01)
+
+     def forward(self, indx, target=None):
+         B, T = indx.shape
+
+         token_embedding = self.vector_embedding(indx)  # [B, T, C]
+
+         # Positional encoding (hybrid: learned + sinusoidal)
+         learned = self.learnable_position(torch.arange(T, device=indx.device))  # [T, C]
+         sinusoidal = self.sinusoidal_encoding[:T]  # [T, C]
+         positional_encoding = learned + sinusoidal  # [T, C]
+         positional_encoding = positional_encoding.unsqueeze(0).expand(B, -1, -1)  # [B, T, C]
+
+         x = token_embedding + positional_encoding  # [B, T, C]
+         x = self.decoder_block(x)  # [B, T, C]
+         x = self.layer_norm(x)  # [B, T, C]
+         logits = self.linear_head(x)  # [B, T, vocab_size]
+
+         if target is None:
+             return logits, None
+
+         # Shift logits and targets for causal language modeling
+         logits = logits[:, :-1, :]  # [B, T-1, vocab_size]
+         target = target[:, 1:]  # [B, T-1]
+
+         # Flatten for loss
+         logits = logits.contiguous().view(-1, logits.size(-1))  # [B*(T-1), vocab_size]
+         target = target.contiguous().view(-1)  # [B*(T-1)]
+
+         loss = functional.cross_entropy(logits, target, ignore_index=-100)
+
+         return logits, loss
+
+     @torch.no_grad()
+     def generate(self, index, max_tokens=512):
+         for _ in range(max_tokens):
+             index_cond = index[:, -self.block_size:]
+             logits, loss = self.forward(index_cond)
+             logits = logits[:, -1, :]
+             probs = torch.softmax(logits, dim=-1)
+
+             next_index = torch.multinomial(probs, num_samples=1)
+             # if next_index == self.eos_id:
+             #     break
+             index = torch.cat((index, next_index), dim=1)
+         return index
+
+ class NovaConfig(PretrainedConfig):
+     model_type = "nova"
+
+     def __init__(self, vocab_size=6000, block_size=256, feature_embed=640, n_layers=4, n_heads=8, **kwargs):
+         super().__init__(**kwargs)
+         self.vocab_size = vocab_size
+         self.block_size = block_size
+         self.n_embd = feature_embed
+         self.n_layer = n_layers
+         self.n_head = n_heads
+
+ class NovaForCausalLM(PreTrainedModel):
+     config_class = NovaConfig
+
+     def __init__(self, config: NovaConfig):
+         super().__init__(config)
+         self.vocab_size = config.vocab_size
+         self.block_size = config.block_size
+         self.model = NOVA(vocab_size=self.vocab_size, block_size=self.block_size,
+                           feature_embed=config.n_embd, n_layers=config.n_layer, n_heads=config.n_head)
+         self.post_init()  # important for HF compatibility
+
+     def forward(self, input_ids, labels=None):
+         return self.model(input_ids, labels)
+
+     def generate(self, input_ids, max_length=256):
+         return self.model.generate(input_ids, max_length)
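
Putting the pieces together, a minimal end-to-end sketch under the same assumptions as above (all six uploaded files together in a local `./nova` directory; the prompt text is arbitrary):

```python
from transformers import PreTrainedTokenizerFast
from nova_modelling import NovaForCausalLM

tokenizer = PreTrainedTokenizerFast.from_pretrained("./nova")
model = NovaForCausalLM.from_pretrained("./nova")  # NovaConfig is picked up via config_class
model.eval()

input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_length=64)  # the custom sampling loop above: 64 new tokens
print(tokenizer.decode(output_ids[0].tolist()))
```

Note that `NovaForCausalLM.forward` returns a plain `(logits, loss)` tuple rather than a `CausalLMOutput`, and `generate` here is the custom sampling loop defined in `NOVA`, not the standard `transformers` generation utilities.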
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "tokenizer_class": "PreTrainedTokenizerFast"
+ }
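
Of the five registered special tokens, only `<pad>` is also wired into the tokenizer attributes (via `pad_token` here and in special_tokens_map.json); `<s>`, `</s>`, `<unk>` and `<mask>` exist as added tokens but are not mapped to `bos_token`, `eos_token`, `unk_token` or `mask_token`. A quick check, under the same local-directory assumption as above:

```python
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("./nova")
print(tokenizer.pad_token, tokenizer.pad_token_id)  # "<pad>", 1 (id from added_tokens_decoder)
print(tokenizer.eos_token)                          # None: </s> is registered but not set as eos_token
print(tokenizer.convert_tokens_to_ids(["<s>", "</s>", "<unk>", "<mask>"]))  # expected [0, 2, 3, 4]
```

Relatedly, the EOS early-stop in `NOVA.generate` is commented out, so generation always produces the full `max_tokens` continuation.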