# train.py: training script for the Veda Programming LLM
import json
import os

import numpy as np
import tensorflow as tf
from tensorflow import keras

from model import VedaProgrammingLLM
from tokenizer import VedaTokenizer
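
# Interfaces assumed from the sibling modules (sketched here for reference,
# inferred from how they are used below rather than from their definitions):
#   VedaTokenizer(vocab_size): fit(texts), encode(text, max_length=None),
#       decode(tokens), save(path), and a vocabulary_size attribute.
#   VedaProgrammingLLM(...): a keras.Model whose constructor takes vocab_size,
#       max_length, d_model, num_heads, num_layers, and ff_dim, and which
#       exposes generate(tokens, max_tokens, temperature).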

SAMPLE_CODE = '''
def hello_world():
    print("Hello, World!")
    return True

def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

def factorial(n):
    if n == 0:
        return 1
    return n * factorial(n-1)

class Calculator:
    def __init__(self):
        self.result = 0
    def add(self, a, b):
        return a + b
    def subtract(self, a, b):
        return a - b
    def multiply(self, a, b):
        return a * b

def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr

def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1

def is_palindrome(s):
    s = s.lower()
    return s == s[::-1]

def sum_list(numbers):
    total = 0
    for num in numbers:
        total += num
    return total

def find_max(arr):
    if not arr:
        return None
    max_val = arr[0]
    for val in arr:
        if val > max_val:
            max_val = val
    return max_val
'''

class VedaTrainer:
    """Trainer for the Veda Programming LLM."""

    def __init__(self, data_path: str = "programming.txt",
                 vocab_size: int = 3000, max_length: int = 128, batch_size: int = 8):
        self.data_path = data_path
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
        self.model = None

    def load_data(self):
        """Load training data, seeding the file with SAMPLE_CODE if missing."""
        if not os.path.exists(self.data_path):
            with open(self.data_path, 'w', encoding='utf-8') as f:
                f.write(SAMPLE_CODE)
        with open(self.data_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Samples are blank-line-separated snippets.
        samples = [s.strip() for s in content.split('\n\n') if s.strip()]
        print(f"Loaded {len(samples)} samples")
        return samples
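
    # Example of the split above: a file containing
    # "def a():\n    pass\n\ndef b():\n    pass"
    # yields the two samples "def a():\n    pass" and "def b():\n    pass".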

    def prepare_dataset(self, samples):
        """Prepare a TensorFlow dataset of next-token prediction pairs."""
        self.tokenizer.fit(samples)
        # Concatenate every sample into one continuous token stream.
        all_tokens = []
        for sample in samples:
            tokens = self.tokenizer.encode(sample)
            all_tokens.extend(tokens)
        # Slide a window of max_length + 1 tokens over the stream; the stride
        # of max_length // 4 produces overlapping training sequences.
        sequences = []
        step = max(1, self.max_length // 4)
        for i in range(0, len(all_tokens) - self.max_length, step):
            seq = all_tokens[i:i + self.max_length + 1]
            if len(seq) == self.max_length + 1:
                sequences.append(seq)
        # Fallback for tiny corpora: encode each sample as one fixed-length sequence.
        if len(sequences) < 2:
            for sample in samples:
                tokens = self.tokenizer.encode(sample, max_length=self.max_length + 1)
                sequences.append(tokens)
        sequences = np.array(sequences[:100])  # Limit for memory
        X = sequences[:, :-1]  # inputs
        y = sequences[:, 1:]   # targets: inputs shifted left by one token
        dataset = tf.data.Dataset.from_tensor_slices((X, y))
        dataset = dataset.shuffle(100).batch(self.batch_size).prefetch(1)
        print(f"Created {len(sequences)} sequences")
        return dataset
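
    # Worked example of the windowing above, with illustrative numbers: for
    # max_length = 4 the stride is max(1, 4 // 4) = 1, so a token stream
    # [5, 9, 2, 7, 3, 8] yields windows [5, 9, 2, 7, 3] and [9, 2, 7, 3, 8];
    # X drops each window's last token and y drops its first, giving the model
    # a next-token target at every position.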

    def build_model(self):
        """Build and compile the model."""
        self.model = VedaProgrammingLLM(
            vocab_size=self.tokenizer.vocabulary_size,
            max_length=self.max_length,
            d_model=128,
            num_heads=4,
            num_layers=2,
            ff_dim=256
        )
        self.model.compile(
            optimizer=keras.optimizers.Adam(1e-4),
            # from_logits=True: the model is expected to output raw logits,
            # not softmax probabilities.
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy']
        )
        # Run a dummy batch through the subclassed model so its weights are
        # created before calling summary().
        dummy = tf.zeros((1, self.max_length), dtype=tf.int32)
        self.model(dummy)
        self.model.summary()
        return self.model

    def train(self, epochs: int = 5, save_path: str = "veda_model"):
        """Train the model, then save weights, tokenizer, and config."""
        samples = self.load_data()
        dataset = self.prepare_dataset(samples)
        self.build_model()
        os.makedirs(save_path, exist_ok=True)
        history = self.model.fit(dataset, epochs=epochs, verbose=1)
        # Save weights only (more reliable than serializing the full model)
        self.model.save_weights(os.path.join(save_path, "weights.h5"))
        self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))
        config = self.model.get_config()
        with open(os.path.join(save_path, "config.json"), 'w') as f:
            json.dump(config, f)
        print(f"Model saved to {save_path}")
        return history

    def generate(self, prompt: str, max_tokens: int = 50, temperature: float = 0.8):
        """Generate code from a prompt using the trained model."""
        if self.model is None:
            raise ValueError("Model not loaded; call train() or build_model() first")
        tokens = self.tokenizer.encode(prompt)
        generated = self.model.generate(tokens, max_tokens, temperature)
        return self.tokenizer.decode(generated)
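
# Illustrative sketch of reloading a saved run for inference. It assumes a
# VedaTokenizer.load(path) classmethod mirroring tokenizer.save() (hypothetical;
# not shown in this file) and that the weights written by train() can be
# restored once build_model() has recreated the architecture.
def load_trained(save_path: str = "veda_model") -> VedaTrainer:
    trainer = VedaTrainer()
    # Hypothetical counterpart to the VedaTokenizer.save() call in train().
    trainer.tokenizer = VedaTokenizer.load(os.path.join(save_path, "tokenizer.json"))
    trainer.build_model()  # the dummy forward pass creates the weight variables
    trainer.model.load_weights(os.path.join(save_path, "weights.h5"))
    return trainer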

if __name__ == "__main__":
    trainer = VedaTrainer()
    trainer.train(epochs=10)
    print("\nTest generation:")
    print(trainer.generate("def calculate"))