import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import json

from model import VedaProgrammingLLM
from tokenizer import VedaTokenizer

# Fallback training corpus; written to data_path on first run. Blank lines
# separate the individual training samples (see load_data).
SAMPLE_CODE = '''
def hello_world():
    print("Hello, World!")
    return True

def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

def factorial(n):
    if n == 0:
        return 1
    return n * factorial(n-1)

class Calculator:
    def __init__(self):
        self.result = 0
    def add(self, a, b):
        return a + b
    def subtract(self, a, b):
        return a - b
    def multiply(self, a, b):
        return a * b

def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr

def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1

def is_palindrome(s):
    s = s.lower()
    return s == s[::-1]

def sum_list(numbers):
    total = 0
    for num in numbers:
        total += num
    return total

def find_max(arr):
    if not arr:
        return None
    max_val = arr[0]
    for val in arr:
        if val > max_val:
            max_val = val
    return max_val
'''


class VedaTrainer:
    """Trainer for Veda Programming LLM"""

    def __init__(self, data_path: str = "programming.txt",
                 vocab_size: int = 3000, max_length: int = 128, batch_size: int = 8):
        self.data_path = data_path
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
        self.model = None

    def load_data(self):
        """Load training data, writing the fallback corpus if none exists."""
        if not os.path.exists(self.data_path):
            with open(self.data_path, 'w') as f:
                f.write(SAMPLE_CODE)
        with open(self.data_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Samples are blank-line-separated blocks of code.
        samples = [s.strip() for s in content.split('\n\n') if s.strip()]
        print(f"Loaded {len(samples)} samples")
        return samples

    def prepare_dataset(self, samples):
        """Prepare a TensorFlow dataset of shifted input/target sequences."""
        self.tokenizer.fit(samples)

        # Concatenate every sample into a single token stream.
        all_tokens = []
        for sample in samples:
            tokens = self.tokenizer.encode(sample)
            all_tokens.extend(tokens)

        # Slice the stream into overlapping windows; a stride of a quarter of
        # the window length gives 75% overlap between consecutive sequences.
        sequences = []
        step = max(1, self.max_length // 4)
        for i in range(0, len(all_tokens) - self.max_length, step):
            seq = all_tokens[i:i + self.max_length + 1]
            if len(seq) == self.max_length + 1:
                sequences.append(seq)

        # Fallback for tiny corpora: encode each sample individually.
        if len(sequences) < 2:
            for sample in samples:
                tokens = self.tokenizer.encode(sample, max_length=self.max_length + 1)
                sequences.append(tokens)

        sequences = np.array(sequences[:100])  # Limit for memory
        # Inputs and targets are the same window shifted by one token
        # (next-token prediction).
        X = sequences[:, :-1]
        y = sequences[:, 1:]

        dataset = tf.data.Dataset.from_tensor_slices((X, y))
        dataset = dataset.shuffle(100).batch(self.batch_size).prefetch(1)
        print(f"Created {len(sequences)} sequences")
        return dataset

    def build_model(self):
        """Build and compile the model."""
        self.model = VedaProgrammingLLM(
            vocab_size=self.tokenizer.vocabulary_size,
            max_length=self.max_length,
            d_model=128,
            num_heads=4,
            num_layers=2,
            ff_dim=256
        )
        self.model.compile(
            optimizer=keras.optimizers.Adam(1e-4),
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy']
        )
        # Subclassed Keras models create their weights lazily; one dummy
        # forward pass builds them so summary() can report shapes.
        dummy = tf.zeros((1, self.max_length), dtype=tf.int32)
        self.model(dummy)
        self.model.summary()
        return self.model

    def train(self, epochs: int = 5, save_path: str = "veda_model"):
        """Train the model and save weights, tokenizer, and config."""
        samples = self.load_data()
        dataset = self.prepare_dataset(samples)
        self.build_model()

        os.makedirs(save_path, exist_ok=True)
        history = self.model.fit(dataset, epochs=epochs, verbose=1)

        # Save weights only (more reliable than serializing the whole
        # subclassed model).
        self.model.save_weights(os.path.join(save_path, "weights.h5"))
        self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))
        config = self.model.get_config()
        with open(os.path.join(save_path, "config.json"), 'w') as f:
            json.dump(config, f)
        print(f"Model saved to {save_path}")
        return history

    def generate(self, prompt: str, max_tokens: int = 50, temperature: float = 0.8):
        """Generate code"""
        if self.model is None:
            raise ValueError("Model not loaded")
        tokens = self.tokenizer.encode(prompt)
        generated = self.model.generate(tokens, max_tokens, temperature)
        return self.tokenizer.decode(generated)
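

# A minimal reload sketch, assuming two things the code above does not show:
# that VedaTokenizer exposes a load() counterpart to the save() call used in
# train(), and that the dict returned by get_config() round-trips as keyword
# arguments to VedaProgrammingLLM. Adjust both to match the actual definitions
# in model.py and tokenizer.py.
def load_trained(save_path: str = "veda_model") -> VedaTrainer:
    """Rebuild a trained VedaTrainer from the artifacts written by train()."""
    with open(os.path.join(save_path, "config.json"), 'r') as f:
        config = json.load(f)
    trainer = VedaTrainer(max_length=config.get("max_length", 128))
    # Hypothetical API: assumed mirror of the tokenizer.save() call in train().
    trainer.tokenizer = VedaTokenizer.load(os.path.join(save_path, "tokenizer.json"))
    trainer.model = VedaProgrammingLLM(**config)
    # Subclassed Keras models create weights lazily; run a dummy forward pass
    # so load_weights has variables to restore into.
    dummy = tf.zeros((1, trainer.max_length), dtype=tf.int32)
    trainer.model(dummy)
    trainer.model.load_weights(os.path.join(save_path, "weights.h5"))
    return trainer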


if __name__ == "__main__":
    trainer = VedaTrainer()
    trainer.train(epochs=10)
    print("\nTest generation:")
    print(trainer.generate("def calculate"))
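
    # Example (hypothetical): reload the saved artifacts later and generate
    # without retraining, via the load_trained sketch above.
    # loaded = load_trained("veda_model")
    # print(loaded.generate("def calculate"))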