Spaces:

vedaco
/

veda-programming

Sleeping

App Files Files Community

vedaco committed on Jan 9

Commit

dbb535a

verified ·

1 Parent(s): 8f5ff41

Create train.py

Browse files

Files changed (1) hide show

train.py +339 -0

train.py ADDED Viewed

	@@ -0,0 +1,339 @@

+import tensorflow as tf
+from tensorflow import keras
+import numpy as np
+import os
+from typing import List, Tuple, Optional
+from model import VedaProgrammingLLM, create_veda_model
+from tokenizer import VedaTokenizer
+class VedaTrainer:
+    """Trainer class for Veda Programming LLM"""
+    def __init__(
+        self,
+        data_path: str = "programming.txt",
+        vocab_size: int = 10000,
+        max_length: int = 256,
+        batch_size: int = 32,
+        model_size: str = "small"
+    ):
+        self.data_path = data_path
+        self.vocab_size = vocab_size
+        self.max_length = max_length
+        self.batch_size = batch_size
+        self.model_size = model_size
+        self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
+        self.model: Optional[VedaProgrammingLLM] = None
+    def load_data(self) -> List[str]:
+        """Load programming data from file"""
+        if not os.path.exists(self.data_path):
+            print(f"Creating sample {self.data_path}...")
+            self._create_sample_data()
+        with open(self.data_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        # Split into code samples (by double newlines or function definitions)
+        samples = []
+        current_sample = []
+        for line in content.split('\n'):
+            if line.strip() == '' and current_sample:
+                samples.append('\n'.join(current_sample))
+                current_sample = []
+            else:
+                current_sample.append(line)
+        if current_sample:
+            samples.append('\n'.join(current_sample))
+        # Filter empty samples
+        samples = [s.strip() for s in samples if s.strip()]
+        print(f"Loaded {len(samples)} code samples")
+        return samples
+    def _create_sample_data(self):
+        """Write a built-in corpus of small Python snippets to ``self.data_path``.
+
+        Called by ``load_data`` when the data file does not exist yet. The
+        file is opened in write mode, so any existing content is replaced.
+        """
+        # This text is only written to disk, never executed here, so the names
+        # it mentions (Node, merge, aiohttp) do not need to be defined.
+        # NOTE(review): as shown, the snippets are not separated by blank lines,
+        # so load_data() would return the whole corpus as a single sample;
+        # confirm this is intended.
+        sample_code = '''
+def hello_world():
+    print("Hello, World!")
+    return True
+def fibonacci(n):
+    if n <= 1:
+        return n
+    return fibonacci(n-1) + fibonacci(n-2)
+def factorial(n):
+    if n == 0:
+        return 1
+    return n * factorial(n-1)
+class Calculator:
+    def __init__(self):
+        self.result = 0
+    def add(self, a, b):
+        self.result = a + b
+        return self.result
+    def subtract(self, a, b):
+        self.result = a - b
+        return self.result
+    def multiply(self, a, b):
+        self.result = a * b
+        return self.result
+    def divide(self, a, b):
+        if b != 0:
+            self.result = a / b
+        return self.result
+def bubble_sort(arr):
+    n = len(arr)
+    for i in range(n):
+        for j in range(0, n-i-1):
+            if arr[j] > arr[j+1]:
+                arr[j], arr[j+1] = arr[j+1], arr[j]
+    return arr
+def binary_search(arr, target):
+    left, right = 0, len(arr) - 1
+    while left <= right:
+        mid = (left + right) // 2
+        if arr[mid] == target:
+            return mid
+        elif arr[mid] < target:
+            left = mid + 1
+        else:
+            right = mid - 1
+    return -1
+def quicksort(arr):
+    if len(arr) <= 1:
+        return arr
+    pivot = arr[len(arr) // 2]
+    left = [x for x in arr if x < pivot]
+    middle = [x for x in arr if x == pivot]
+    right = [x for x in arr if x > pivot]
+    return quicksort(left) + middle + quicksort(right)
+class LinkedList:
+    def __init__(self):
+        self.head = None
+    def append(self, data):
+        new_node = Node(data)
+        if not self.head:
+            self.head = new_node
+            return
+        current = self.head
+        while current.next:
+            current = current.next
+        current.next = new_node
+def merge_sort(arr):
+    if len(arr) <= 1:
+        return arr
+    mid = len(arr) // 2
+    left = merge_sort(arr[:mid])
+    right = merge_sort(arr[mid:])
+    return merge(left, right)
+def is_palindrome(s):
+    s = s.lower().replace(" ", "")
+    return s == s[::-1]
+def count_words(text):
+    words = text.split()
+    return len(words)
+async def fetch_data(url):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            return await response.json()
+def read_file(filename):
+    with open(filename, 'r') as f:
+        return f.read()
+def write_file(filename, content):
+    with open(filename, 'w') as f:
+        f.write(content)
+'''
+        # Mode 'w' creates the file or truncates an existing one
+        with open(self.data_path, 'w', encoding='utf-8') as f:
+            f.write(sample_code)
+        print(f"Created sample {self.data_path}")
+    def prepare_dataset(self, samples: List[str]) -> tf.data.Dataset:
+        """Prepare TensorFlow dataset for training"""
+        # Fit tokenizer
+        self.tokenizer.fit(samples)
+        # Encode all samples
+        all_tokens = []
+        for sample in samples:
+            tokens = self.tokenizer.encode(sample)
+            all_tokens.extend(tokens)
+        # Create sequences
+        sequences = []
+        for i in range(0, len(all_tokens) - self.max_length, self.max_length // 2):
+            seq = all_tokens[i:i + self.max_length + 1]
+            if len(seq) == self.max_length + 1:
+                sequences.append(seq)
+        if not sequences:
+            # Create padded sequences if not enough data
+            for sample in samples:
+                tokens = self.tokenizer.encode(sample, max_length=self.max_length + 1)
+                sequences.append(tokens)
+        print(f"Created {len(sequences)} training sequences")
+        # Convert to numpy arrays
+        sequences = np.array(sequences)
+        # Split into input and target
+        X = sequences[:, :-1]
+        y = sequences[:, 1:]
+        # Create dataset
+        dataset = tf.data.Dataset.from_tensor_slices((X, y))
+        dataset = dataset.shuffle(buffer_size=len(sequences))
+        dataset = dataset.batch(self.batch_size)
+        dataset = dataset.prefetch(tf.data.AUTOTUNE)
+        return dataset
+    def build_model(self):
+        """Build the Veda Programming model"""
+        self.model = create_veda_model(
+            vocab_size=self.tokenizer.vocabulary_size,
+            max_length=self.max_length,
+            model_size=self.model_size
+        )
+        # Compile model
+        optimizer = keras.optimizers.Adam(learning_rate=1e-4)
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+        self.model.compile(
+            optimizer=optimizer,
+            loss=loss_fn,
+            metrics=['accuracy']
+        )
+        # Build model with dummy input
+        dummy_input = tf.zeros((1, self.max_length), dtype=tf.int32)
+        self.model(dummy_input)
+        self.model.summary()
+        return self.model
+    def train(
+        self,
+        epochs: int = 10,
+        save_path: str = "veda_model"
+    ):
+        """Train the model"""
+        # Load and prepare data
+        samples = self.load_data()
+        dataset = self.prepare_dataset(samples)
+        # Build model
+        self.build_model()
+        # Callbacks
+        callbacks = [
+            keras.callbacks.ModelCheckpoint(
+                filepath=os.path.join(save_path, "model_checkpoint.keras"),
+                save_best_only=True,
+                monitor='loss'
+            ),
+            keras.callbacks.EarlyStopping(
+                monitor='loss',
+                patience=5,
+                restore_best_weights=True
+            ),
+            keras.callbacks.ReduceLROnPlateau(
+                monitor='loss',
+                factor=0.5,
+                patience=2
+            )
+        ]
+        # Create save directory
+        os.makedirs(save_path, exist_ok=True)
+        # Train
+        history = self.model.fit(
+            dataset,
+            epochs=epochs,
+            callbacks=callbacks
+        )
+        # Save final model and tokenizer
+        self.model.save_weights(os.path.join(save_path, "model_weights.h5"))
+        self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))
+        # Save model config
+        config = self.model.get_config()
+        config['tokenizer_vocab_size'] = self.tokenizer.vocabulary_size
+        import json
+        with open(os.path.join(save_path, "config.json"), 'w') as f:
+            json.dump(config, f)
+        print(f"Model saved to {save_path}")
+        return history
+    def generate(
+        self,
+        prompt: str,
+        max_new_tokens: int = 100,
+        temperature: float = 0.7
+    ) -> str:
+        """Generate code from prompt"""
+        if self.model is None:
+            raise ValueError("Model not loaded. Train or load a model first.")
+        # Encode prompt
+        prompt_tokens = self.tokenizer.encode(prompt)
+        # Generate
+        generated_tokens = self.model.generate(
+            prompt_tokens,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature
+        )
+        # Decode
+        generated_text = self.tokenizer.decode(generated_tokens)
+        return generated_text
+def main():
+    """Main training function"""
+    trainer = VedaTrainer(
+        data_path="programming.txt",
+        vocab_size=10000,
+        max_length=256,
+        batch_size=16,
+        model_size="small"
+    )
+    # Train model
+    history = trainer.train(epochs=20, save_path="veda_model")
+    # Test generation
+    test_prompt = "def calculate"
+    generated = trainer.generate(test_prompt, max_new_tokens=50)
+    print(f"\nGenerated code:\n{generated}")
+if __name__ == "__main__":
+    main()