vedaco committed on
Commit 5b1197f · verified · 1 Parent(s): dbb535a

Create model.py

Files changed (1)
  1. model.py +281 -0
model.py ADDED
@@ -0,0 +1,281 @@
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from typing import Optional

class PositionalEncoding(layers.Layer):
    """Positional encoding layer for transformer"""

    def __init__(self, max_length: int, d_model: int, **kwargs):
        super().__init__(**kwargs)
        self.max_length = max_length
        self.d_model = d_model

        # Create positional encoding matrix
        position = np.arange(max_length)[:, np.newaxis]
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))

        pe = np.zeros((max_length, d_model))
        pe[:, 0::2] = np.sin(position * div_term)
        pe[:, 1::2] = np.cos(position * div_term)

        self.positional_encoding = tf.constant(pe, dtype=tf.float32)

    def call(self, x):
        seq_length = tf.shape(x)[1]
        return x + self.positional_encoding[:seq_length, :]

    def get_config(self):
        config = super().get_config()
        config.update({
            'max_length': self.max_length,
            'd_model': self.d_model
        })
        return config


class TransformerBlock(layers.Layer):
    """Transformer decoder block"""

    def __init__(self, d_model: int, num_heads: int, ff_dim: int,
                 dropout_rate: float = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            dropout=dropout_rate
        )
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation='gelu'),
            layers.Dropout(dropout_rate),
            layers.Dense(d_model),
            layers.Dropout(dropout_rate)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, x, training=False, mask=None):
        # Causal self-attention
        attn_output = self.attention(
            query=x,
            value=x,
            key=x,
            attention_mask=mask,
            training=training
        )
        attn_output = self.dropout(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        # Feed forward network
        ffn_output = self.ffn(out1, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'dropout_rate': self.dropout_rate
        })
        return config


class VedaProgrammingLLM(keras.Model):
    """Veda Programming Language Model"""

    def __init__(
        self,
        vocab_size: int,
        max_length: int = 512,
        d_model: int = 256,
        num_heads: int = 8,
        num_layers: int = 6,
        ff_dim: int = 1024,
        dropout_rate: float = 0.1,
        **kwargs
    ):
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.max_length = max_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        # Embedding layers
        self.token_embedding = layers.Embedding(
            input_dim=vocab_size,
            output_dim=d_model
        )
        self.positional_encoding = PositionalEncoding(max_length, d_model)
        self.dropout = layers.Dropout(dropout_rate)

        # Transformer blocks
        self.transformer_blocks = [
            TransformerBlock(d_model, num_heads, ff_dim, dropout_rate)
            for _ in range(num_layers)
        ]

        # Output layer
        self.output_layer = layers.Dense(vocab_size)

    def _create_causal_mask(self, seq_length):
        """Create causal attention mask"""
        mask = tf.linalg.band_part(
            tf.ones((seq_length, seq_length)), -1, 0
        )
        return mask

    def call(self, inputs, training=False):
        seq_length = tf.shape(inputs)[1]

        # Create causal mask
        mask = self._create_causal_mask(seq_length)

        # Embeddings
        x = self.token_embedding(inputs)
        x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.positional_encoding(x)
        x = self.dropout(x, training=training)

        # Transformer blocks
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, training=training, mask=mask)

        # Output projection
        logits = self.output_layer(x)
        return logits

    def generate(
        self,
        prompt_tokens: list,
        max_new_tokens: int = 100,
        temperature: float = 0.7,
        top_k: int = 50,
        top_p: float = 0.9
    ):
        """Generate code given a prompt"""
        generated = list(prompt_tokens)

        for _ in range(max_new_tokens):
            # Truncate if too long
            context = generated[-self.max_length:]

            # Get predictions
            input_tensor = tf.expand_dims(context, 0)
            logits = self(input_tensor, training=False)
            next_token_logits = logits[0, -1, :] / temperature

            # Apply top-k filtering
            if top_k > 0:
                top_k_logits, top_k_indices = tf.math.top_k(
                    next_token_logits, k=min(top_k, self.vocab_size)
                )
                # Create mask for non-top-k tokens
                indices_to_remove = tf.less(
                    next_token_logits,
                    top_k_logits[-1]
                )
                next_token_logits = tf.where(
                    indices_to_remove,
                    tf.ones_like(next_token_logits) * float('-inf'),
                    next_token_logits
                )

            # Apply top-p (nucleus) filtering
            if top_p < 1.0:
                sorted_indices = tf.argsort(next_token_logits, direction='DESCENDING')
                sorted_logits = tf.gather(next_token_logits, sorted_indices)
                sorted_probs = tf.nn.softmax(sorted_logits)
                cumulative_probs = tf.cumsum(sorted_probs)

                # Find cutoff (always keep the most probable token)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove = tf.concat([
                    [False],
                    sorted_indices_to_remove[:-1]
                ], axis=0)

                # Scatter the cutoff mask back to the original vocabulary order
                # so the nucleus filtering actually affects the sampled logits
                indices_to_remove = tf.scatter_nd(
                    tf.expand_dims(sorted_indices, 1),
                    tf.cast(sorted_indices_to_remove, tf.int32),
                    tf.shape(next_token_logits)
                )
                next_token_logits = tf.where(
                    tf.cast(indices_to_remove, tf.bool),
                    tf.ones_like(next_token_logits) * float('-inf'),
                    next_token_logits
                )

            # Sample from the filtered distribution
+ next_token = tf.random.categorical(
217
+ tf.expand_dims(next_token_logits, 0),
218
+ num_samples=1
219
+ )[0, 0]
220
+
221
+ generated.append(int(next_token.numpy()))
222
+
223
+ # Stop if end token
224
+ if next_token == 3: # END token
225
+ break
226
+
227
+ return generated
228
+
229
+ def get_config(self):
230
+ return {
231
+ 'vocab_size': self.vocab_size,
232
+ 'max_length': self.max_length,
233
+ 'd_model': self.d_model,
234
+ 'num_heads': self.num_heads,
235
+ 'num_layers': self.num_layers,
236
+ 'ff_dim': self.ff_dim,
237
+ 'dropout_rate': self.dropout_rate
238
+ }
239
+
240
+ @classmethod
241
+ def from_config(cls, config):
242
+ return cls(**config)
243
+
244
+
245
+ def create_veda_model(
246
+ vocab_size: int,
247
+ max_length: int = 512,
248
+ model_size: str = "small"
249
+ ) -> VedaProgrammingLLM:
250
+ """Factory function to create Veda Programming model"""
251
+
252
+ configs = {
253
+ "small": {
254
+ "d_model": 256,
255
+ "num_heads": 4,
256
+ "num_layers": 4,
257
+ "ff_dim": 512
258
+ },
259
+ "medium": {
260
+ "d_model": 512,
261
+ "num_heads": 8,
262
+ "num_layers": 6,
263
+ "ff_dim": 1024
264
+ },
265
+ "large": {
266
+ "d_model": 768,
267
+ "num_heads": 12,
268
+ "num_layers": 12,
269
+ "ff_dim": 2048
270
+ }
271
+ }
272
+
273
+ config = configs.get(model_size, configs["small"])
274
+
275
+ model = VedaProgrammingLLM(
276
+ vocab_size=vocab_size,
277
+ max_length=max_length,
278
+ **config
279
+ )
280
+
281
+ return model
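
For reference, a minimal usage sketch of the factory function and sampling loop above. It is not part of this commit; the vocabulary size and prompt token IDs are placeholder assumptions, and in practice both would come from the project's tokenizer.

# Usage sketch (not part of model.py). vocab_size and the prompt token IDs
# below are placeholder values for illustration only.
from model import create_veda_model

model = create_veda_model(vocab_size=8000, max_length=512, model_size="small")

# Hypothetical prompt token IDs; a real prompt would be encoded by the tokenizer.
prompt_tokens = [1, 42, 7]
completion = model.generate(prompt_tokens, max_new_tokens=20, temperature=0.8)
print(completion)  # list of token IDs, stopping early if the END token (id 3) is sampled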