vedaco committed
Commit b3a9dc5 · verified · 1 Parent(s): 8eda8fc

Update model.py

Files changed (1):
  model.py  +59 -228
model.py CHANGED
@@ -2,106 +2,13 @@ import tensorflow as tf
 from tensorflow import keras
 from tensorflow.keras import layers
 import numpy as np
-from typing import Optional
-
-class PositionalEncoding(layers.Layer):
-    """Positional encoding layer for transformer"""
-
-    def __init__(self, max_length: int, d_model: int, **kwargs):
-        super().__init__(**kwargs)
-        self.max_length = max_length
-        self.d_model = d_model
-
-        # Create positional encoding matrix
-        position = np.arange(max_length)[:, np.newaxis]
-        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
-
-        pe = np.zeros((max_length, d_model))
-        pe[:, 0::2] = np.sin(position * div_term)
-        pe[:, 1::2] = np.cos(position * div_term)
-
-        self.positional_encoding = tf.constant(pe, dtype=tf.float32)
-
-    def call(self, x):
-        seq_length = tf.shape(x)[1]
-        return x + self.positional_encoding[:seq_length, :]
-
-    def get_config(self):
-        config = super().get_config()
-        config.update({
-            'max_length': self.max_length,
-            'd_model': self.d_model
-        })
-        return config
-
-
-class TransformerBlock(layers.Layer):
-    """Transformer decoder block"""
-
-    def __init__(self, d_model: int, num_heads: int, ff_dim: int,
-                 dropout_rate: float = 0.1, **kwargs):
-        super().__init__(**kwargs)
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.ff_dim = ff_dim
-        self.dropout_rate = dropout_rate
-
-        self.attention = layers.MultiHeadAttention(
-            num_heads=num_heads,
-            key_dim=d_model // num_heads,
-            dropout=dropout_rate
-        )
-        self.ffn = keras.Sequential([
-            layers.Dense(ff_dim, activation='gelu'),
-            layers.Dropout(dropout_rate),
-            layers.Dense(d_model),
-            layers.Dropout(dropout_rate)
-        ])
-        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
-        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
-        self.dropout = layers.Dropout(dropout_rate)
-
-    def call(self, x, training=False, mask=None):
-        # Causal self-attention
-        attn_output = self.attention(
-            query=x,
-            value=x,
-            key=x,
-            attention_mask=mask,
-            training=training
-        )
-        attn_output = self.dropout(attn_output, training=training)
-        out1 = self.layernorm1(x + attn_output)
-
-        # Feed forward network
-        ffn_output = self.ffn(out1, training=training)
-        return self.layernorm2(out1 + ffn_output)
-
-    def get_config(self):
-        config = super().get_config()
-        config.update({
-            'd_model': self.d_model,
-            'num_heads': self.num_heads,
-            'ff_dim': self.ff_dim,
-            'dropout_rate': self.dropout_rate
-        })
-        return config
-

 class VedaProgrammingLLM(keras.Model):
     """Veda Programming Language Model"""

-    def __init__(
-        self,
-        vocab_size: int,
-        max_length: int = 512,
-        d_model: int = 256,
-        num_heads: int = 8,
-        num_layers: int = 6,
-        ff_dim: int = 1024,
-        dropout_rate: float = 0.1,
-        **kwargs
-    ):
+    def __init__(self, vocab_size: int, max_length: int = 256,
+                 d_model: int = 128, num_heads: int = 4,
+                 num_layers: int = 2, ff_dim: int = 256, **kwargs):
         super().__init__(**kwargs)

         self.vocab_size = vocab_size
@@ -110,117 +17,85 @@ class VedaProgrammingLLM(keras.Model):
         self.num_heads = num_heads
         self.num_layers = num_layers
         self.ff_dim = ff_dim
-        self.dropout_rate = dropout_rate

-        # Embedding layers
-        self.token_embedding = layers.Embedding(
-            input_dim=vocab_size,
-            output_dim=d_model
-        )
-        self.positional_encoding = PositionalEncoding(max_length, d_model)
-        self.dropout = layers.Dropout(dropout_rate)
+        # Embeddings
+        self.token_embedding = layers.Embedding(vocab_size, d_model)
+        self.pos_embedding = layers.Embedding(max_length, d_model)
+        self.dropout = layers.Dropout(0.1)

-        # Transformer blocks
-        self.transformer_blocks = [
-            TransformerBlock(d_model, num_heads, ff_dim, dropout_rate)
-            for _ in range(num_layers)
-        ]
+        # Transformer layers
+        self.transformer_blocks = []
+        for _ in range(num_layers):
+            self.transformer_blocks.append({
+                'attn': layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads),
+                'ffn': keras.Sequential([
+                    layers.Dense(ff_dim, activation='relu'),
+                    layers.Dense(d_model)
+                ]),
+                'ln1': layers.LayerNormalization(),
+                'ln2': layers.LayerNormalization(),
+                'dropout1': layers.Dropout(0.1),
+                'dropout2': layers.Dropout(0.1)
+            })

-        # Output layer
+        self.final_ln = layers.LayerNormalization()
         self.output_layer = layers.Dense(vocab_size)

-    def _create_causal_mask(self, seq_length):
-        """Create causal attention mask"""
-        mask = tf.linalg.band_part(
-            tf.ones((seq_length, seq_length)), -1, 0
-        )
-        return mask
-
     def call(self, inputs, training=False):
-        seq_length = tf.shape(inputs)[1]
+        seq_len = tf.shape(inputs)[1]

         # Create causal mask
-        mask = self._create_causal_mask(seq_length)
+        mask = self._create_causal_mask(seq_len)

         # Embeddings
-        x = self.token_embedding(inputs)
-        x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x = self.positional_encoding(x)
+        positions = tf.range(seq_len)
+        x = self.token_embedding(inputs) + self.pos_embedding(positions)
         x = self.dropout(x, training=training)

         # Transformer blocks
-        for transformer_block in self.transformer_blocks:
-            x = transformer_block(x, training=training, mask=mask)
+        for block in self.transformer_blocks:
+            # Self attention
+            attn_out = block['attn'](x, x, attention_mask=mask, training=training)
+            attn_out = block['dropout1'](attn_out, training=training)
+            x = block['ln1'](x + attn_out)
+
+            # FFN
+            ffn_out = block['ffn'](x)
+            ffn_out = block['dropout2'](ffn_out, training=training)
+            x = block['ln2'](x + ffn_out)

-        # Output projection
-        logits = self.output_layer(x)
-        return logits
+        x = self.final_ln(x)
+        return self.output_layer(x)
+
+    def _create_causal_mask(self, seq_len):
+        """Create causal attention mask"""
+        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
+        return mask

-    def generate(
-        self,
-        prompt_tokens: list,
-        max_new_tokens: int = 100,
-        temperature: float = 0.7,
-        top_k: int = 50,
-        top_p: float = 0.9
-    ):
-        """Generate code given a prompt"""
+    def generate(self, prompt_tokens: list, max_new_tokens: int = 50,
+                 temperature: float = 0.8, top_k: int = 40):
+        """Generate code"""
         generated = list(prompt_tokens)

         for _ in range(max_new_tokens):
-            # Truncate if too long
             context = generated[-self.max_length:]
+            input_tensor = tf.constant([context], dtype=tf.int32)

-            # Get predictions
-            input_tensor = tf.expand_dims(context, 0)
             logits = self(input_tensor, training=False)
-            next_token_logits = logits[0, -1, :] / temperature
+            next_logits = logits[0, -1, :] / temperature

-            # Apply top-k filtering
+            # Top-k sampling
             if top_k > 0:
-                top_k_logits, top_k_indices = tf.math.top_k(
-                    next_token_logits, k=min(top_k, self.vocab_size)
-                )
-                # Create mask for non-top-k tokens
-                indices_to_remove = tf.less(
-                    next_token_logits,
-                    top_k_logits[-1]
-                )
-                next_token_logits = tf.where(
-                    indices_to_remove,
-                    tf.ones_like(next_token_logits) * float('-inf'),
-                    next_token_logits
-                )
-
-            # Apply top-p (nucleus) filtering
-            if top_p < 1.0:
-                sorted_logits = tf.sort(next_token_logits, direction='DESCENDING')
-                sorted_probs = tf.nn.softmax(sorted_logits)
-                cumulative_probs = tf.cumsum(sorted_probs)
-
-                # Find cutoff
-                sorted_indices_to_remove = cumulative_probs > top_p
-                sorted_indices_to_remove = tf.concat([
-                    [False],
-                    sorted_indices_to_remove[:-1]
-                ], axis=0)
-
-                sorted_logits = tf.where(
-                    sorted_indices_to_remove,
-                    tf.ones_like(sorted_logits) * float('-inf'),
-                    sorted_logits
-                )
+                top_k_logits, top_k_indices = tf.math.top_k(next_logits, k=min(top_k, self.vocab_size))
+                probs = tf.nn.softmax(top_k_logits)
+                idx = tf.random.categorical(tf.expand_dims(tf.math.log(probs + 1e-10), 0), 1)[0, 0]
+                next_token = top_k_indices[idx].numpy()
+            else:
+                probs = tf.nn.softmax(next_logits)
+                next_token = tf.random.categorical(tf.expand_dims(tf.math.log(probs + 1e-10), 0), 1)[0, 0].numpy()

-            # Sample from distribution
-            probs = tf.nn.softmax(next_token_logits)
-            next_token = tf.random.categorical(
-                tf.expand_dims(next_token_logits, 0),
-                num_samples=1
-            )[0, 0]
+            generated.append(int(next_token))

-            generated.append(int(next_token.numpy()))
-
-            # Stop if end token
             if next_token == 3:  # END token
                 break

@@ -233,49 +108,5 @@ class VedaProgrammingLLM(keras.Model):
             'd_model': self.d_model,
             'num_heads': self.num_heads,
             'num_layers': self.num_layers,
-            'ff_dim': self.ff_dim,
-            'dropout_rate': self.dropout_rate
-        }
-
-    @classmethod
-    def from_config(cls, config):
-        return cls(**config)
-
-
-def create_veda_model(
-    vocab_size: int,
-    max_length: int = 512,
-    model_size: str = "small"
-) -> VedaProgrammingLLM:
-    """Factory function to create Veda Programming model"""
-
-    configs = {
-        "small": {
-            "d_model": 256,
-            "num_heads": 4,
-            "num_layers": 4,
-            "ff_dim": 512
-        },
-        "medium": {
-            "d_model": 512,
-            "num_heads": 8,
-            "num_layers": 6,
-            "ff_dim": 1024
-        },
-        "large": {
-            "d_model": 768,
-            "num_heads": 12,
-            "num_layers": 12,
-            "ff_dim": 2048
-        }
-    }
-
-    config = configs.get(model_size, configs["small"])
-
-    model = VedaProgrammingLLM(
-        vocab_size=vocab_size,
-        max_length=max_length,
-        **config
-    )
-
-    return model
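A note on the relocated `_create_causal_mask`: `tf.linalg.band_part(x, -1, 0)` keeps all sub-diagonals and no super-diagonals, i.e. a lower-triangular matrix, and Keras' `MultiHeadAttention` treats nonzero mask entries as "may attend", so position i can only see positions 0..i. A minimal sketch checking that behavior:

import tensorflow as tf

# Lower-triangular causal mask, as built in model.py: row i has ones
# only in columns 0..i, so token i cannot attend to later tokens.
mask = tf.linalg.band_part(tf.ones((4, 4)), -1, 0)
print(mask.numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]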
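And a minimal end-to-end sketch of the updated class. The vocabulary size and prompt token ids below are illustrative assumptions, and it assumes the lines elided from the diff end `generate` by returning the `generated` list; the constructor defaults, sampling parameters, and END-token id 3 come from the code above.

import tensorflow as tf
from model import VedaProgrammingLLM

VOCAB_SIZE = 8000  # assumption: the real value comes from the Veda tokenizer

model = VedaProgrammingLLM(vocab_size=VOCAB_SIZE, max_length=256,
                           d_model=128, num_heads=4, num_layers=2,
                           ff_dim=256)

# Run one dummy batch through the model so all layer weights are built.
_ = model(tf.zeros((1, 16), dtype=tf.int32))

# Sample a continuation from hypothetical prompt token ids; generation
# stops early if the END token (id 3) is drawn.
tokens = model.generate([1, 42, 7], max_new_tokens=50,
                        temperature=0.8, top_k=40)
print(tokens)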