File size: 9,664 Bytes
dbb535a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
from typing import List, Tuple, Optional
from model import VedaProgrammingLLM, create_veda_model
from tokenizer import VedaTokenizer

class VedaTrainer:
    """Trainer class for Veda Programming LLM.

    Bundles the steps visible in this file: reading code samples from a text
    file, tokenizing them into fixed-length training windows, building and
    compiling the model, fitting it, saving the artifacts, and generating
    text from a prompt.
    """

    def __init__(
        self,
        data_path: str = "programming.txt",
        vocab_size: int = 10000,
        max_length: int = 256,
        batch_size: int = 32,
        model_size: str = "small"
    ):
        """Store the training configuration and create the tokenizer.

        Args:
            data_path: Text file holding the training code. ``load_data``
                creates it with built-in sample code when it does not exist.
            vocab_size: Passed to ``VedaTokenizer`` as ``vocab_size``.
            max_length: Number of tokens in each model input sequence.
            batch_size: Batch size used for the training dataset.
            model_size: Forwarded to ``create_veda_model`` as ``model_size``.
        """
        self.data_path = data_path
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.model_size = model_size

        self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
        # Stays None until build_model() (or train()) assigns a model.
        self.model: Optional[VedaProgrammingLLM] = None

    def load_data(self) -> List[str]:
        """Load programming data from file and split it into code samples.

        If ``self.data_path`` does not exist it is first created with the
        built-in sample code. The file content is split wherever a blank
        (empty or whitespace-only) line occurs; each run of consecutive
        non-blank lines becomes one sample.

        Returns:
            The non-empty samples, each stripped of surrounding whitespace.
        """
        if not os.path.exists(self.data_path):
            print(f"Creating sample {self.data_path}...")
            self._create_sample_data()

        with open(self.data_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Group consecutive non-blank lines; a blank line closes the group.
        samples = []
        current_lines = []
        for line in content.split('\n'):
            if line.strip():
                current_lines.append(line)
            elif current_lines:
                samples.append('\n'.join(current_lines))
                current_lines = []
        if current_lines:
            samples.append('\n'.join(current_lines))

        # NOTE: strip() also removes the leading indentation of a sample's
        # first line (e.g. a method that follows a whitespace-only line).
        samples = [sample.strip() for sample in samples]
        print(f"Loaded {len(samples)} code samples")
        return samples

    def _create_sample_data(self):
        """Write the built-in sample Python code to ``self.data_path``."""
        sample_code = '''
def hello_world():
    print("Hello, World!")
    return True

def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

def factorial(n):
    if n == 0:
        return 1
    return n * factorial(n-1)

class Calculator:
    def __init__(self):
        self.result = 0
    
    def add(self, a, b):
        self.result = a + b
        return self.result
    
    def subtract(self, a, b):
        self.result = a - b
        return self.result
    
    def multiply(self, a, b):
        self.result = a * b
        return self.result
    
    def divide(self, a, b):
        if b != 0:
            self.result = a / b
        return self.result

def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr

def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1

def quicksort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quicksort(left) + middle + quicksort(right)

class LinkedList:
    def __init__(self):
        self.head = None
    
    def append(self, data):
        new_node = Node(data)
        if not self.head:
            self.head = new_node
            return
        current = self.head
        while current.next:
            current = current.next
        current.next = new_node

def merge_sort(arr):
    if len(arr) <= 1:
        return arr
    mid = len(arr) // 2
    left = merge_sort(arr[:mid])
    right = merge_sort(arr[mid:])
    return merge(left, right)

def is_palindrome(s):
    s = s.lower().replace(" ", "")
    return s == s[::-1]

def count_words(text):
    words = text.split()
    return len(words)

async def fetch_data(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.json()

def read_file(filename):
    with open(filename, 'r') as f:
        return f.read()

def write_file(filename, content):
    with open(filename, 'w') as f:
        f.write(content)
'''
        with open(self.data_path, 'w', encoding='utf-8') as f:
            f.write(sample_code)
        print(f"Created sample {self.data_path}")

    def prepare_dataset(self, samples: List[str]) -> "tf.data.Dataset":
        """Prepare TensorFlow dataset for training.

        Fits the tokenizer on ``samples``, concatenates the encoded samples
        into one token stream and cuts it into windows of ``max_length + 1``
        tokens that advance by half of ``max_length``. Each window yields an
        input (all but the last token) and a target (all but the first).

        Args:
            samples: Code samples, e.g. as returned by ``load_data``.

        Returns:
            A shuffled, batched and prefetched dataset of ``(X, y)`` pairs.

        Raises:
            ValueError: If ``samples`` is empty, or if the encoded sequences
                cannot be stacked into a 2-D array.
        """
        if not samples:
            # Without this guard the failure surfaces later as an opaque
            # IndexError when slicing an empty 1-D array.
            raise ValueError("Cannot prepare a dataset from an empty list of samples.")

        # Fit tokenizer
        self.tokenizer.fit(samples)

        # Encode all samples into a single token stream
        all_tokens = []
        for sample in samples:
            all_tokens.extend(self.tokenizer.encode(sample))

        # One extra token per window so input and target can be shifted by one.
        window = self.max_length + 1
        # Clamp the stride: max_length // 2 is 0 for max_length == 1 and
        # range() rejects a zero step.
        stride = max(1, self.max_length // 2)
        # The range bound guarantees every slice holds exactly `window` tokens.
        sequences = [
            all_tokens[start:start + window]
            for start in range(0, len(all_tokens) - self.max_length, stride)
        ]

        if not sequences:
            # Not enough tokens for a single window: encode each sample on
            # its own. Presumably the tokenizer pads/truncates to max_length
            # so all rows share one length -- TODO confirm in VedaTokenizer.
            sequences = [
                self.tokenizer.encode(sample, max_length=window)
                for sample in samples
            ]

        print(f"Created {len(sequences)} training sequences")

        # Convert to numpy arrays
        token_matrix = np.array(sequences)
        if token_matrix.ndim != 2:
            raise ValueError(
                "Encoded sequences do not share a common length and cannot "
                "be stacked into a 2-D array."
            )

        # Split into input and target (target is the input shifted by one)
        X = token_matrix[:, :-1]
        y = token_matrix[:, 1:]

        # Create dataset
        dataset = tf.data.Dataset.from_tensor_slices((X, y))
        dataset = dataset.shuffle(buffer_size=len(token_matrix))
        dataset = dataset.batch(self.batch_size)
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset

    def build_model(self):
        """Build and compile the Veda Programming model.

        The model is created by ``create_veda_model`` using the tokenizer's
        ``vocabulary_size``, compiled with Adam (learning rate 1e-4) and
        sparse categorical cross-entropy on logits, then called once on a
        dummy batch so ``summary()`` can be printed.

        Returns:
            The compiled model, also stored in ``self.model``.
        """
        self.model = create_veda_model(
            vocab_size=self.tokenizer.vocabulary_size,
            max_length=self.max_length,
            model_size=self.model_size
        )

        # Compile model
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=1e-4),
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy']
        )

        # Build model with dummy input
        dummy_input = tf.zeros((1, self.max_length), dtype=tf.int32)
        self.model(dummy_input)

        self.model.summary()
        return self.model

    def train(
        self,
        epochs: int = 10,
        save_path: str = "veda_model"
    ):
        """Train the model and save weights, tokenizer and config.

        Runs ``load_data``, ``prepare_dataset`` and ``build_model``, fits the
        model with checkpointing, early stopping and learning-rate reduction
        (all monitoring the training loss), then writes ``model_weights.h5``,
        ``tokenizer.json`` and ``config.json`` into ``save_path``.

        Args:
            epochs: Maximum number of training epochs.
            save_path: Directory for checkpoints and final artifacts; created
                if missing.

        Returns:
            The history object returned by ``model.fit``.
        """
        # Load and prepare data
        samples = self.load_data()
        dataset = self.prepare_dataset(samples)

        # Build model
        self.build_model()

        # Create save directory before anything needs to write into it
        os.makedirs(save_path, exist_ok=True)

        # Callbacks
        callbacks = [
            keras.callbacks.ModelCheckpoint(
                filepath=os.path.join(save_path, "model_checkpoint.keras"),
                save_best_only=True,
                monitor='loss'
            ),
            keras.callbacks.EarlyStopping(
                monitor='loss',
                patience=5,
                restore_best_weights=True
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor='loss',
                factor=0.5,
                patience=2
            )
        ]

        # Train
        history = self.model.fit(
            dataset,
            epochs=epochs,
            callbacks=callbacks
        )

        # Save final model and tokenizer
        # NOTE(review): Keras 3 only accepts weight file names ending in
        # ".weights.h5"; the name is kept because loaders outside this file
        # may depend on it -- confirm against the installed Keras version.
        self.model.save_weights(os.path.join(save_path, "model_weights.h5"))
        self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))

        # Save model config
        config = self.model.get_config()
        config['tokenizer_vocab_size'] = self.tokenizer.vocabulary_size

        import json
        with open(os.path.join(save_path, "config.json"), 'w', encoding='utf-8') as f:
            json.dump(config, f)

        print(f"Model saved to {save_path}")
        return history

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 100,
        temperature: float = 0.7
    ) -> str:
        """Generate code from prompt.

        Args:
            prompt: Text to encode and hand to the model as the start.
            max_new_tokens: Forwarded to ``model.generate``.
            temperature: Forwarded to ``model.generate``.

        Returns:
            The decoded text for the tokens returned by ``model.generate``.

        Raises:
            ValueError: If no model has been built or loaded yet.
        """
        if self.model is None:
            raise ValueError("Model not loaded. Train or load a model first.")

        # Encode prompt
        prompt_tokens = self.tokenizer.encode(prompt)

        # Generate
        generated_tokens = self.model.generate(
            prompt_tokens,
            max_new_tokens=max_new_tokens,
            temperature=temperature
        )

        # Decode
        return self.tokenizer.decode(generated_tokens)


def main():
    """Train a small Veda model on programming.txt and print a sample completion."""
    settings = {
        "data_path": "programming.txt",
        "vocab_size": 10000,
        "max_length": 256,
        "batch_size": 16,
        "model_size": "small",
    }
    trainer = VedaTrainer(**settings)

    # Fit the model; artifacts are written under the "veda_model" directory.
    trainer.train(epochs=20, save_path="veda_model")

    # Smoke-test generation with a short prompt.
    generated = trainer.generate("def calculate", max_new_tokens=50)
    print(f"\nGenerated code:\n{generated}")


# Run training only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()