vedaco committed on
Commit b84ae93 · verified · 1 Parent(s): c351467

Update model.py

Files changed (1): model.py +123 -49
model.py CHANGED
@@ -1,14 +1,23 @@
+"""Veda Programming LLM Model - Fixed Version"""
+
 import tensorflow as tf
 from tensorflow import keras
 from tensorflow.keras import layers
 import numpy as np
 
 class VedaProgrammingLLM(keras.Model):
-    """Veda Programming Language Model"""
+    """Veda Programming Language Model with all generation features"""
 
-    def __init__(self, vocab_size: int, max_length: int = 256,
-                 d_model: int = 128, num_heads: int = 4,
-                 num_layers: int = 2, ff_dim: int = 256, **kwargs):
+    def __init__(
+        self,
+        vocab_size: int,
+        max_length: int = 256,
+        d_model: int = 256,
+        num_heads: int = 8,
+        num_layers: int = 4,
+        ff_dim: int = 512,
+        **kwargs
+    ):
         super().__init__(**kwargs)
 
         self.vocab_size = vocab_size
@@ -24,79 +33,144 @@ class VedaProgrammingLLM(keras.Model):
         self.dropout = layers.Dropout(0.1)
 
         # Transformer layers
-        self.transformer_blocks = []
+        self.attn_layers = []
+        self.ffn_layers = []
+        self.ln1_layers = []
+        self.ln2_layers = []
+
         for _ in range(num_layers):
-            self.transformer_blocks.append({
-                'attn': layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads),
-                'ffn': keras.Sequential([
-                    layers.Dense(ff_dim, activation='relu'),
-                    layers.Dense(d_model)
-                ]),
-                'ln1': layers.LayerNormalization(),
-                'ln2': layers.LayerNormalization(),
-                'dropout1': layers.Dropout(0.1),
-                'dropout2': layers.Dropout(0.1)
-            })
+            self.attn_layers.append(
+                layers.MultiHeadAttention(
+                    num_heads=num_heads,
+                    key_dim=d_model // num_heads,
+                    dropout=0.1
+                )
+            )
+            self.ffn_layers.append(
+                keras.Sequential([
+                    layers.Dense(ff_dim, activation='gelu'),
+                    layers.Dropout(0.1),
+                    layers.Dense(d_model),
+                    layers.Dropout(0.1)
+                ])
+            )
+            self.ln1_layers.append(layers.LayerNormalization(epsilon=1e-6))
+            self.ln2_layers.append(layers.LayerNormalization(epsilon=1e-6))
 
-        self.final_ln = layers.LayerNormalization()
+        self.final_ln = layers.LayerNormalization(epsilon=1e-6)
         self.output_layer = layers.Dense(vocab_size)
 
     def call(self, inputs, training=False):
         seq_len = tf.shape(inputs)[1]
 
-        # Create causal mask
-        mask = self._create_causal_mask(seq_len)
+        # Causal mask
+        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
 
         # Embeddings
        positions = tf.range(seq_len)
-        x = self.token_embedding(inputs) + self.pos_embedding(positions)
+        x = self.token_embedding(inputs)
+        x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+        x = x + self.pos_embedding(positions)
         x = self.dropout(x, training=training)
 
         # Transformer blocks
-        for block in self.transformer_blocks:
-            # Self attention
-            attn_out = block['attn'](x, x, attention_mask=mask, training=training)
-            attn_out = block['dropout1'](attn_out, training=training)
-            x = block['ln1'](x + attn_out)
-
-            # FFN
-            ffn_out = block['ffn'](x)
-            ffn_out = block['dropout2'](ffn_out, training=training)
-            x = block['ln2'](x + ffn_out)
+        for i in range(self.num_layers):
+            attn_out = self.attn_layers[i](x, x, attention_mask=mask, training=training)
+            x = self.ln1_layers[i](x + attn_out)
+            ffn_out = self.ffn_layers[i](x, training=training)
+            x = self.ln2_layers[i](x + ffn_out)
 
         x = self.final_ln(x)
         return self.output_layer(x)
 
-    def _create_causal_mask(self, seq_len):
-        """Create causal attention mask"""
-        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
-        return mask
-
-    def generate(self, prompt_tokens: list, max_new_tokens: int = 50,
-                 temperature: float = 0.8, top_k: int = 40):
-        """Generate code"""
+    def generate(
+        self,
+        prompt_tokens: list,
+        max_new_tokens: int = 100,
+        temperature: float = 0.7,
+        top_k: int = 50,
+        top_p: float = 0.9,
+        repetition_penalty: float = 1.2,  # NOW INCLUDED
+        stop_tokens: list = None
+    ) -> list:
+        """Generate code with all sampling features"""
+
         generated = list(prompt_tokens)
 
-        for _ in range(max_new_tokens):
+        for step in range(max_new_tokens):
+            # Use last max_length tokens
             context = generated[-self.max_length:]
             input_tensor = tf.constant([context], dtype=tf.int32)
 
+            # Get logits
             logits = self(input_tensor, training=False)
-            next_logits = logits[0, -1, :] / temperature
+            next_logits = logits[0, -1, :].numpy().astype(np.float64)
+
+            # Apply repetition penalty
+            if repetition_penalty != 1.0:
+                for token_id in set(generated[-50:]):
+                    if 0 <= token_id < len(next_logits):
+                        if next_logits[token_id] > 0:
+                            next_logits[token_id] /= repetition_penalty
+                        else:
+                            next_logits[token_id] *= repetition_penalty
 
-            # Top-k sampling
-            if top_k > 0:
-                top_k_logits, top_k_indices = tf.math.top_k(next_logits, k=min(top_k, self.vocab_size))
-                probs = tf.nn.softmax(top_k_logits)
-                idx = tf.random.categorical(tf.expand_dims(tf.math.log(probs + 1e-10), 0), 1)[0, 0]
-                next_token = top_k_indices[idx].numpy()
+            # Apply temperature
+            next_logits = next_logits / max(temperature, 0.1)
+
+            # Apply top-k filtering
+            if top_k > 0 and top_k < len(next_logits):
+                indices_to_remove = next_logits < np.partition(next_logits, -top_k)[-top_k]
+                next_logits[indices_to_remove] = -np.inf
+
+            # Apply top-p (nucleus) filtering
+            if top_p < 1.0:
+                sorted_indices = np.argsort(next_logits)[::-1]
+                sorted_logits = next_logits[sorted_indices]
+
+                # Compute softmax
+                max_logit = np.max(sorted_logits[sorted_logits > -np.inf])
+                exp_logits = np.exp(sorted_logits - max_logit)
+                probs = exp_logits / (np.sum(exp_logits) + 1e-10)
+
+                cumulative_probs = np.cumsum(probs)
+
+                # Remove tokens above threshold
+                sorted_indices_to_remove = cumulative_probs > top_p
+                sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+                sorted_indices_to_remove[0] = False
+
+                indices_to_remove = sorted_indices[sorted_indices_to_remove]
+                next_logits[indices_to_remove] = -np.inf
+
+            # Convert to probabilities
+            max_logit = np.max(next_logits[next_logits > -np.inf]) if np.any(next_logits > -np.inf) else 0
+            exp_logits = np.exp(next_logits - max_logit)
+            exp_logits[next_logits == -np.inf] = 0
+            probs = exp_logits / (np.sum(exp_logits) + 1e-10)
+
+            # Ensure valid distribution
+            probs = np.clip(probs, 0, 1)
+            prob_sum = np.sum(probs)
+            if prob_sum > 0:
+                probs = probs / prob_sum
             else:
-                probs = tf.nn.softmax(next_logits)
-                next_token = tf.random.categorical(tf.expand_dims(tf.math.log(probs + 1e-10), 0), 1)[0, 0].numpy()
+                # Fallback to uniform
+                probs = np.ones_like(probs) / len(probs)
+
+            # Sample
+            try:
+                next_token = np.random.choice(len(probs), p=probs)
+            except ValueError:
+                next_token = np.argmax(probs)
 
             generated.append(int(next_token))
 
-            if next_token == 3:  # END token
+            # Stop conditions
+            if next_token == 0:  # PAD
+                break
+
+            if stop_tokens and next_token in stop_tokens:
                 break
 
         return generated
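
For reference, a minimal usage sketch of the updated API (not part of the commit). The vocabulary size and token ids below are placeholders, since the project's tokenizer is not shown in this diff:

import tensorflow as tf
from model import VedaProgrammingLLM

# Build the model and create its weights with a dummy forward pass.
model = VedaProgrammingLLM(vocab_size=8000)   # placeholder vocab size
_ = model(tf.zeros((1, 1), dtype=tf.int32))

# Token ids would normally come from the project's tokenizer.
prompt = [1, 42, 7]                           # placeholder token ids
tokens = model.generate(
    prompt,
    max_new_tokens=50,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
    stop_tokens=[3],                          # e.g. the old END token id
)

Generation always stops on token 0 (PAD); stop_tokens adds further ids such as an end-of-sequence marker.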