Spaces:

vedaco
/

veda-programming

Sleeping

App Files Files Community

vedaco committed on Jan 9

Commit

85f3c75

verified ·

1 Parent(s): b3a9dc5

Update train.py

Browse files

Files changed (1) hide show

train.py +79 -227

train.py CHANGED Viewed

@@ -2,61 +2,11 @@ import tensorflow as tf
 from tensorflow import keras
 import numpy as np
 import os
-from typing import List, Tuple, Optional
-from model import VedaProgrammingLLM, create_veda_model
 from tokenizer import VedaTokenizer
-class VedaTrainer:
-    """Trainer class for Veda Programming LLM"""
-    def __init__(
-        self,
-        data_path: str = "programming.txt",
-        vocab_size: int = 10000,
-        max_length: int = 256,
-        batch_size: int = 32,
-        model_size: str = "small"
-    ):
-        self.data_path = data_path
-        self.vocab_size = vocab_size
-        self.max_length = max_length
-        self.batch_size = batch_size
-        self.model_size = model_size
-        self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
-        self.model: Optional[VedaProgrammingLLM] = None
-    def load_data(self) -> List[str]:
-        """Load programming data from file"""
-        if not os.path.exists(self.data_path):
-            print(f"Creating sample {self.data_path}...")
-            self._create_sample_data()
-        with open(self.data_path, 'r', encoding='utf-8') as f:
-            content = f.read()
-        # Split into code samples (by double newlines or function definitions)
-        samples = []
-        current_sample = []
-        for line in content.split('\n'):
-            if line.strip() == '' and current_sample:
-                samples.append('\n'.join(current_sample))
-                current_sample = []
-            else:
-                current_sample.append(line)
-        if current_sample:
-            samples.append('\n'.join(current_sample))
-        # Filter empty samples
-        samples = [s.strip() for s in samples if s.strip()]
-        print(f"Loaded {len(samples)} code samples")
-        return samples
-    def _create_sample_data(self):
-        """Create sample programming data"""
-        sample_code = '''
 def hello_world():
     print("Hello, World!")
     return True
@@ -76,21 +26,13 @@ class Calculator:
         self.result = 0
     def add(self, a, b):
-        self.result = a + b
-        return self.result
     def subtract(self, a, b):
-        self.result = a - b
-        return self.result
     def multiply(self, a, b):
-        self.result = a * b
-        return self.result
-    def divide(self, a, b):
-        if b != 0:
-            self.result = a / b
-        return self.result
 def bubble_sort(arr):
     n = len(arr)
@@ -112,228 +54,138 @@ def binary_search(arr, target):
             right = mid - 1
     return -1
-def quicksort(arr):
-    if len(arr) <= 1:
-        return arr
-    pivot = arr[len(arr) // 2]
-    left = [x for x in arr if x < pivot]
-    middle = [x for x in arr if x == pivot]
-    right = [x for x in arr if x > pivot]
-    return quicksort(left) + middle + quicksort(right)
-class LinkedList:
-    def __init__(self):
-        self.head = None
-    def append(self, data):
-        new_node = Node(data)
-        if not self.head:
-            self.head = new_node
-            return
-        current = self.head
-        while current.next:
-            current = current.next
-        current.next = new_node
-def merge_sort(arr):
-    if len(arr) <= 1:
-        return arr
-    mid = len(arr) // 2
-    left = merge_sort(arr[:mid])
-    right = merge_sort(arr[mid:])
-    return merge(left, right)
 def is_palindrome(s):
-    s = s.lower().replace(" ", "")
     return s == s[::-1]
-def count_words(text):
-    words = text.split()
-    return len(words)
-async def fetch_data(url):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url) as response:
-            return await response.json()
-def read_file(filename):
-    with open(filename, 'r') as f:
-        return f.read()
-def write_file(filename, content):
-    with open(filename, 'w') as f:
-        f.write(content)
 '''
-        with open(self.data_path, 'w', encoding='utf-8') as f:
-            f.write(sample_code)
-        print(f"Created sample {self.data_path}")
-    def prepare_dataset(self, samples: List[str]) -> tf.data.Dataset:
-        """Prepare TensorFlow dataset for training"""
-        # Fit tokenizer
         self.tokenizer.fit(samples)
-        # Encode all samples
         all_tokens = []
         for sample in samples:
             tokens = self.tokenizer.encode(sample)
             all_tokens.extend(tokens)
-        # Create sequences
         sequences = []
-        for i in range(0, len(all_tokens) - self.max_length, self.max_length // 2):
             seq = all_tokens[i:i + self.max_length + 1]
             if len(seq) == self.max_length + 1:
                 sequences.append(seq)
-        if not sequences:
-            # Create padded sequences if not enough data
             for sample in samples:
                 tokens = self.tokenizer.encode(sample, max_length=self.max_length + 1)
                 sequences.append(tokens)
-        print(f"Created {len(sequences)} training sequences")
-        # Convert to numpy arrays
-        sequences = np.array(sequences)
-        # Split into input and target
         X = sequences[:, :-1]
         y = sequences[:, 1:]
-        # Create dataset
         dataset = tf.data.Dataset.from_tensor_slices((X, y))
-        dataset = dataset.shuffle(buffer_size=len(sequences))
-        dataset = dataset.batch(self.batch_size)
-        dataset = dataset.prefetch(tf.data.AUTOTUNE)
         return dataset
     def build_model(self):
-        """Build the Veda Programming model"""
-        self.model = create_veda_model(
             vocab_size=self.tokenizer.vocabulary_size,
             max_length=self.max_length,
-            model_size=self.model_size
         )
-        # Compile model
-        optimizer = keras.optimizers.Adam(learning_rate=1e-4)
-        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
         self.model.compile(
-            optimizer=optimizer,
-            loss=loss_fn,
             metrics=['accuracy']
         )
-        # Build model with dummy input
-        dummy_input = tf.zeros((1, self.max_length), dtype=tf.int32)
-        self.model(dummy_input)
         self.model.summary()
         return self.model
-    def train(
-        self,
-        epochs: int = 10,
-        save_path: str = "veda_model"
-    ):
         """Train the model"""
-        # Load and prepare data
         samples = self.load_data()
         dataset = self.prepare_dataset(samples)
-        # Build model
         self.build_model()
-        # Callbacks
-        callbacks = [
-            keras.callbacks.ModelCheckpoint(
-                filepath=os.path.join(save_path, "model_checkpoint.keras"),
-                save_best_only=True,
-                monitor='loss'
-            ),
-            keras.callbacks.EarlyStopping(
-                monitor='loss',
-                patience=5,
-                restore_best_weights=True
-            ),
-            keras.callbacks.ReduceLROnPlateau(
-                monitor='loss',
-                factor=0.5,
-                patience=2
-            )
-        ]
-        # Create save directory
         os.makedirs(save_path, exist_ok=True)
-        # Train
-        history = self.model.fit(
-            dataset,
-            epochs=epochs,
-            callbacks=callbacks
-        )
-        # Save final model and tokenizer
-        self.model.save_weights(os.path.join(save_path, "model_weights.h5"))
         self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))
-        # Save model config
         config = self.model.get_config()
-        config['tokenizer_vocab_size'] = self.tokenizer.vocabulary_size
-        import json
         with open(os.path.join(save_path, "config.json"), 'w') as f:
             json.dump(config, f)
         print(f"Model saved to {save_path}")
         return history
-    def generate(
-        self,
-        prompt: str,
-        max_new_tokens: int = 100,
-        temperature: float = 0.7
-    ) -> str:
-        """Generate code from prompt"""
         if self.model is None:
-            raise ValueError("Model not loaded. Train or load a model first.")
-        # Encode prompt
-        prompt_tokens = self.tokenizer.encode(prompt)
-        # Generate
-        generated_tokens = self.model.generate(
-            prompt_tokens,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature
-        )
-        # Decode
-        generated_text = self.tokenizer.decode(generated_tokens)
-        return generated_text
-def main():
-    """Main training function"""
-    trainer = VedaTrainer(
-        data_path="programming.txt",
-        vocab_size=10000,
-        max_length=256,
-        batch_size=16,
-        model_size="small"
-    )
-    # Train model
-    history = trainer.train(epochs=20, save_path="veda_model")
-    # Test generation
-    test_prompt = "def calculate"
-    generated = trainer.generate(test_prompt, max_new_tokens=50)
-    print(f"\nGenerated code:\n{generated}")
 if __name__ == "__main__":
-    main()

 from tensorflow import keras
 import numpy as np
 import os
+import json
+from model import VedaProgrammingLLM
 from tokenizer import VedaTokenizer
+SAMPLE_CODE = '''
 def hello_world():
     print("Hello, World!")
     return True
         self.result = 0
     def add(self, a, b):
+        return a + b
     def subtract(self, a, b):
+        return a - b
     def multiply(self, a, b):
+        return a * b
 def bubble_sort(arr):
     n = len(arr)
             right = mid - 1
     return -1
 def is_palindrome(s):
+    s = s.lower()
     return s == s[::-1]
+def sum_list(numbers):
+    total = 0
+    for num in numbers:
+        total += num
+    return total
+def find_max(arr):
+    if not arr:
+        return None
+    max_val = arr[0]
+    for val in arr:
+        if val > max_val:
+            max_val = val
+    return max_val
 '''
+class VedaTrainer:
+    """Trainer for Veda Programming LLM"""
+    def __init__(self, data_path: str = "programming.txt",
+                 vocab_size: int = 3000, max_length: int = 128, batch_size: int = 8):
+        self.data_path = data_path
+        self.vocab_size = vocab_size
+        self.max_length = max_length
+        self.batch_size = batch_size
+        self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
+        self.model = None
+    def load_data(self):
+        """Load training data"""
+        if not os.path.exists(self.data_path):
+            with open(self.data_path, 'w') as f:
+                f.write(SAMPLE_CODE)
+        with open(self.data_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        samples = [s.strip() for s in content.split('\n\n') if s.strip()]
+        print(f"Loaded {len(samples)} samples")
+        return samples
+    def prepare_dataset(self, samples):
+        """Prepare TensorFlow dataset"""
         self.tokenizer.fit(samples)
         all_tokens = []
         for sample in samples:
             tokens = self.tokenizer.encode(sample)
             all_tokens.extend(tokens)
         sequences = []
+        step = max(1, self.max_length // 4)
+        for i in range(0, len(all_tokens) - self.max_length, step):
             seq = all_tokens[i:i + self.max_length + 1]
             if len(seq) == self.max_length + 1:
                 sequences.append(seq)
+        if len(sequences) < 2:
             for sample in samples:
                 tokens = self.tokenizer.encode(sample, max_length=self.max_length + 1)
                 sequences.append(tokens)
+        sequences = np.array(sequences[:100])  # Limit for memory
         X = sequences[:, :-1]
         y = sequences[:, 1:]
         dataset = tf.data.Dataset.from_tensor_slices((X, y))
+        dataset = dataset.shuffle(100).batch(self.batch_size).prefetch(1)
+        print(f"Created {len(sequences)} sequences")
         return dataset
     def build_model(self):
+        """Build the model"""
+        self.model = VedaProgrammingLLM(
             vocab_size=self.tokenizer.vocabulary_size,
             max_length=self.max_length,
+            d_model=128,
+            num_heads=4,
+            num_layers=2,
+            ff_dim=256
         )
         self.model.compile(
+            optimizer=keras.optimizers.Adam(1e-4),
+            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
             metrics=['accuracy']
         )
+        # Build
+        dummy = tf.zeros((1, self.max_length), dtype=tf.int32)
+        self.model(dummy)
         self.model.summary()
         return self.model
+    def train(self, epochs: int = 5, save_path: str = "veda_model"):
         """Train the model"""
         samples = self.load_data()
         dataset = self.prepare_dataset(samples)
         self.build_model()
         os.makedirs(save_path, exist_ok=True)
+        history = self.model.fit(dataset, epochs=epochs, verbose=1)
+        # Save weights only (more reliable)
+        self.model.save_weights(os.path.join(save_path, "weights.h5"))
         self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))
         config = self.model.get_config()
         with open(os.path.join(save_path, "config.json"), 'w') as f:
             json.dump(config, f)
         print(f"Model saved to {save_path}")
         return history
+    def generate(self, prompt: str, max_tokens: int = 50, temperature: float = 0.8):
+        """Generate code"""
         if self.model is None:
+            raise ValueError("Model not loaded")
+        tokens = self.tokenizer.encode(prompt)
+        generated = self.model.generate(tokens, max_tokens, temperature)
+        return self.tokenizer.decode(generated)
 if __name__ == "__main__":
+    trainer = VedaTrainer()
+    trainer.train(epochs=10)
+    print("\nTest generation:")
+    print(trainer.generate("def calculate"))