Spaces:

vedaco
/

veda-programming

Sleeping

App Files Community

vedaco committed on Jan 14

Commit

5740a72

verified ·

1 Parent(s): f162639

Update train.py

Browse files

Files changed (1) hide show

train.py +607 -74

train.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Training for Veda Programming Assistant"""
 import tensorflow as tf
 from tensorflow import keras
@@ -8,10 +8,12 @@ import json
 from model import VedaProgrammingLLM
 from tokenizer import VedaTokenizer
 from config import MODEL_DIR, VOCAB_SIZE, MAX_LENGTH, BATCH_SIZE
-TRAINING_DATA = '''
 <USER> Hello!
 <ASSISTANT> Hello! Welcome to Veda Programming Assistant! I can help you with coding, explain concepts, or answer programming questions. What would you like to learn today?
@@ -46,6 +48,8 @@ result = add(5, 3)
 print(result)
 <ENDCODE>
 This function takes two parameters and returns their sum.
 <USER> Explain what a function is
@@ -75,6 +79,8 @@ def greet(name):
 print(greet("Alice"))
 <ENDCODE>
 <USER> What is a list?
 <ASSISTANT> A list is an ordered, mutable collection in Python that can hold items of any type.
@@ -107,19 +113,23 @@ print(len(fruits))
 <USER> Write a fibonacci function
 <ASSISTANT> Here is a Fibonacci function:
 <CODE>
 def fibonacci(n):
     if n <= 0:
         return 0
     if n == 1:
         return 1
-    return fibonacci(n-1) + fibonacci(n-2)
 for i in range(10):
     print(fibonacci(i), end=" ")
 <ENDCODE>
-Iterative version:
 <CODE>
 def fibonacci_iterative(n):
@@ -127,7 +137,6 @@ def fibonacci_iterative(n):
         return 0
     if n == 1:
         return 1
     a, b = 0, 1
     for _ in range(2, n + 1):
         a, b = b, a + b
@@ -141,7 +150,7 @@ Key components:
 1. Base case - stops the recursion
 2. Recursive case - function calls itself
-Example:
 <CODE>
 def countdown(n):
@@ -154,6 +163,8 @@ def countdown(n):
 countdown(5)
 <ENDCODE>
 Factorial example:
 <CODE>
@@ -165,32 +176,39 @@ def factorial(n):
 print(factorial(5))
 <ENDCODE>
 <USER> Write bubble sort
 <ASSISTANT> Here is the Bubble Sort algorithm:
 <CODE>
 def bubble_sort(arr):
     n = len(arr)
     for i in range(n):
         swapped = False
         for j in range(0, n - i - 1):
             if arr[j] > arr[j + 1]:
                 arr[j], arr[j + 1] = arr[j + 1], arr[j]
                 swapped = True
         if not swapped:
             break
     return arr
 numbers = [64, 34, 25, 12, 22, 11, 90]
 print("Sorted:", bubble_sort(numbers.copy()))
 <ENDCODE>
 <USER> Write binary search
-<ASSISTANT> Here is Binary Search:
 <CODE>
 def binary_search(arr, target):
@@ -211,10 +229,18 @@ def binary_search(arr, target):
 numbers = [1, 3, 5, 7, 9, 11, 13, 15]
 print(binary_search(numbers, 7))
 <ENDCODE>
 <USER> What is a class?
-<ASSISTANT> A class is a blueprint for creating objects:
 <CODE>
 class Dog:
@@ -224,26 +250,106 @@ class Dog:
     def bark(self):
         return f"{self.name} says Woof!"
 my_dog = Dog("Buddy", 3)
 print(my_dog.bark())
 <ENDCODE>
 <USER> How do I read a file?
-<ASSISTANT> Here is how to read files:
 <CODE>
 with open("filename.txt", "r") as file:
     content = file.read()
     print(content)
 with open("filename.txt", "r") as file:
     for line in file:
         print(line.strip())
 <ENDCODE>
 <USER> How do I handle errors?
-<ASSISTANT> Use try-except blocks:
 <CODE>
 try:
@@ -266,82 +372,480 @@ except Exception as e:
     print(f"Error: {e}")
 <ENDCODE>
-<USER> Thanks!
-<ASSISTANT> You are welcome! Feel free to ask me anything anytime! Happy coding!
-<USER> Hi
-<ASSISTANT> Hello! How can I help you today? I can write code, explain concepts, or answer your programming questions!
 <USER> What is the difference between list and tuple?
 <ASSISTANT> The main difference is mutability:
-- Lists are mutable (can change)
-- Tuples are immutable (cannot change)
 <CODE>
 my_list = [1, 2, 3]
 my_tuple = (1, 2, 3)
 my_list[0] = 100
 my_list.append(4)
 <ENDCODE>
-Use lists for changing data, tuples for fixed data.
-'''
-class VedaTrainer:
-    """Trainer for Veda Programming Assistant"""
     def __init__(self, vocab_size: int = 8000, max_length: int = 512, batch_size: int = 4):
         self.vocab_size = vocab_size
         self.max_length = max_length
         self.batch_size = batch_size
         self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
         self.model = None
-    def prepare_data(self, extra_data: str = ""):
-        """Prepare training data"""
         data = TRAINING_DATA
         if extra_data:
             data += "\n\n" + extra_data
         if os.path.exists("programming.txt"):
-            with open("programming.txt", 'r', encoding='utf-8') as f:
-                code_data = f.read()
-            data += "\n\n" + code_data
         self.tokenizer.fit([data])
         all_tokens = self.tokenizer.encode(data)
         print(f"Total tokens: {len(all_tokens)}")
         sequences = []
         stride = self.max_length // 2
         for i in range(0, len(all_tokens) - self.max_length - 1, stride):
-            seq = all_tokens[i:i + self.max_length + 1]
             if len(seq) == self.max_length + 1:
                 sequences.append(seq)
         if len(sequences) < 10:
             stride = self.max_length // 4
             sequences = []
             for i in range(0, len(all_tokens) - self.max_length - 1, stride):
-                seq = all_tokens[i:i + self.max_length + 1]
                 if len(seq) == self.max_length + 1:
                     sequences.append(seq)
         print(f"Created {len(sequences)} training sequences")
         sequences = np.array(sequences)
         X = sequences[:, :-1]
         y = sequences[:, 1:]
         dataset = tf.data.Dataset.from_tensor_slices((X, y))
         dataset = dataset.shuffle(1000).batch(self.batch_size).prefetch(1)
         return dataset
     def build_model(self):
         """Build the model"""
         self.model = VedaProgrammingLLM(
@@ -350,73 +854,102 @@ class VedaTrainer:
             d_model=256,
             num_heads=8,
             num_layers=4,
-            ff_dim=512
         )
         self.model.compile(
-            optimizer=keras.optimizers.Adam(1e-4),
             loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-            metrics=['accuracy']
         )
         dummy = tf.zeros((1, self.max_length), dtype=tf.int32)
         self.model(dummy)
         return self.model
-    def train(self, epochs: int = 15, save_path: str = None, extra_data: str = ""):
         """Train the model"""
         if save_path is None:
             save_path = MODEL_DIR
-        dataset = self.prepare_data(extra_data)
         self.build_model()
         self.model.summary()
         os.makedirs(save_path, exist_ok=True)
         history = self.model.fit(dataset, epochs=epochs, verbose=1)
         self.model.save_weights(os.path.join(save_path, "weights.h5"))
-        self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))
         config = self.model.get_config()
-        with open(os.path.join(save_path, "config.json"), 'w') as f:
-            json.dump(config, f)
         print(f"Model saved to {save_path}")
         return history
-    def generate_response(self, user_input: str, max_tokens: int = 200, temperature: float = 0.7) -> str:
         """Generate a response"""
         prompt = f"<USER> {user_input}\n<ASSISTANT>"
         tokens = self.tokenizer.encode(prompt)
         generated = self.model.generate(
             tokens,
             max_new_tokens=max_tokens,
             temperature=temperature,
-            repetition_penalty=1.2
         )
         response = self.tokenizer.decode(generated)
         if "<ASSISTANT>" in response:
             response = response.split("<ASSISTANT>")[-1].strip()
         if "<USER>" in response:
             response = response.split("<USER>")[0].strip()
         return response
 if __name__ == "__main__":
     trainer = VedaTrainer()
     trainer.train(epochs=20)
-    print("\nTesting:")
-    tests = ["Hello!", "What is a function?"]
-    for test in tests:
-        print(f"User: {test}")
-        print(f"Assistant: {trainer.generate_response(test)}")

+"""Training for Veda Programming Assistant with Distillation Support"""
 import tensorflow as tf
 from tensorflow import keras
 from model import VedaProgrammingLLM
 from tokenizer import VedaTokenizer
+from database import db
 from config import MODEL_DIR, VOCAB_SIZE, MAX_LENGTH, BATCH_SIZE
+# Training data with proper format (no markdown code blocks - using <CODE> tags)
+TRAINING_DATA = """
 <USER> Hello!
 <ASSISTANT> Hello! Welcome to Veda Programming Assistant! I can help you with coding, explain concepts, or answer programming questions. What would you like to learn today?
 print(result)
 <ENDCODE>
+Output: 8
 This function takes two parameters and returns their sum.
 <USER> Explain what a function is
 print(greet("Alice"))
 <ENDCODE>
+Output: Hello, Alice!
 <USER> What is a list?
 <ASSISTANT> A list is an ordered, mutable collection in Python that can hold items of any type.
 <USER> Write a fibonacci function
 <ASSISTANT> Here is a Fibonacci function:
+Recursive approach:
 <CODE>
 def fibonacci(n):
     if n <= 0:
         return 0
     if n == 1:
         return 1
+    return fibonacci(n - 1) + fibonacci(n - 2)
 for i in range(10):
     print(fibonacci(i), end=" ")
 <ENDCODE>
+Output: 0 1 1 2 3 5 8 13 21 34
+Iterative approach (more efficient):
 <CODE>
 def fibonacci_iterative(n):
         return 0
     if n == 1:
         return 1
     a, b = 0, 1
     for _ in range(2, n + 1):
         a, b = b, a + b
 1. Base case - stops the recursion
 2. Recursive case - function calls itself
+Example - countdown:
 <CODE>
 def countdown(n):
 countdown(5)
 <ENDCODE>
+Output: 5, 4, 3, 2, 1, Done!
 Factorial example:
 <CODE>
 print(factorial(5))
 <ENDCODE>
+Output: 120
 <USER> Write bubble sort
 <ASSISTANT> Here is the Bubble Sort algorithm:
 <CODE>
 def bubble_sort(arr):
     n = len(arr)
     for i in range(n):
         swapped = False
         for j in range(0, n - i - 1):
             if arr[j] > arr[j + 1]:
                 arr[j], arr[j + 1] = arr[j + 1], arr[j]
                 swapped = True
         if not swapped:
             break
     return arr
 numbers = [64, 34, 25, 12, 22, 11, 90]
+print("Original:", numbers)
 print("Sorted:", bubble_sort(numbers.copy()))
 <ENDCODE>
+Output:
+Original: [64, 34, 25, 12, 22, 11, 90]
+Sorted: [11, 12, 22, 25, 34, 64, 90]
+How it works: repeatedly compares adjacent elements and swaps them if in wrong order. Largest elements bubble up to the end.
+Time complexity: O(n^2)
 <USER> Write binary search
+<ASSISTANT> Here is the Binary Search algorithm:
 <CODE>
 def binary_search(arr, target):
 numbers = [1, 3, 5, 7, 9, 11, 13, 15]
 print(binary_search(numbers, 7))
+print(binary_search(numbers, 6))
 <ENDCODE>
+Output:
+3
+-1
+Important: The array must be sorted!
+Time complexity: O(log n)
 <USER> What is a class?
+<ASSISTANT> A class is a blueprint for creating objects. It bundles data (attributes) and functions (methods) together.
 <CODE>
 class Dog:
     def bark(self):
         return f"{self.name} says Woof!"
+    def get_info(self):
+        return f"{self.name} is {self.age} years old"
 my_dog = Dog("Buddy", 3)
 print(my_dog.bark())
+print(my_dog.get_info())
 <ENDCODE>
+Output:
+Buddy says Woof!
+Buddy is 3 years old
+Key concepts:
+- __init__: constructor, runs when object is created
+- self: refers to the current object
+- Attributes: variables inside the class
+- Methods: functions inside the class
 <USER> How do I read a file?
+<ASSISTANT> Here is how to read files in Python:
+Read entire file:
 <CODE>
 with open("filename.txt", "r") as file:
     content = file.read()
     print(content)
+<ENDCODE>
+Read line by line:
+<CODE>
 with open("filename.txt", "r") as file:
     for line in file:
         print(line.strip())
 <ENDCODE>
+Read all lines into list:
+<CODE>
+with open("filename.txt", "r") as file:
+    lines = file.readlines()
+<ENDCODE>
+With error handling:
+<CODE>
+def read_file_safely(filename):
+    try:
+        with open(filename, "r") as file:
+            return file.read()
+    except FileNotFoundError:
+        print(f"File not found: {filename}")
+        return None
+content = read_file_safely("myfile.txt")
+<ENDCODE>
+The with statement automatically closes the file when done.
+<USER> How do I write to a file?
+<ASSISTANT> Here is how to write to files in Python:
+Write (overwrites existing content):
+<CODE>
+with open("output.txt", "w") as file:
+    file.write("Hello, World!")
+    file.write("This is a new line")
+<ENDCODE>
+Append (adds to existing content):
+<CODE>
+with open("output.txt", "a") as file:
+    file.write("This is appended")
+<ENDCODE>
+Write multiple lines:
+<CODE>
+lines = ["Line 1", "Line 2", "Line 3"]
+with open("output.txt", "w") as file:
+    for line in lines:
+        file.write(line + "\\n")
+<ENDCODE>
+Or use writelines:
+<CODE>
+lines = ["Line 1\\n", "Line 2\\n", "Line 3\\n"]
+with open("output.txt", "w") as file:
+    file.writelines(lines)
+<ENDCODE>
 <USER> How do I handle errors?
+<ASSISTANT> Python uses try-except blocks for error handling:
+Basic try-except:
 <CODE>
 try:
     print(f"Error: {e}")
 <ENDCODE>
+Complete structure:
+<CODE>
+try:
+    result = risky_operation()
+except SomeError:
+    print("Handle error")
+else:
+    print("Success! No error occurred")
+finally:
+    print("This always runs")
+<ENDCODE>
+Common exceptions:
+- ValueError: wrong value
+- TypeError: wrong type
+- IndexError: index out of range
+- KeyError: dictionary key not found
+- FileNotFoundError: file does not exist
+- ZeroDivisionError: division by zero
+<USER> What is a dictionary?
+<ASSISTANT> A dictionary is a collection of key-value pairs in Python.
+Creating dictionaries:
+<CODE>
+empty_dict = {}
+person = {"name": "Alice", "age": 25, "city": "NYC"}
+another = dict(name="Bob", age=30)
+<ENDCODE>
+Accessing values:
+<CODE>
+person = {"name": "Alice", "age": 25}
+print(person["name"])
+print(person.get("name"))
+print(person.get("job", "Unknown"))
+<ENDCODE>
+Modifying:
+<CODE>
+person = {"name": "Alice", "age": 25}
+person["city"] = "Boston"
+person["age"] = 26
+del person["city"]
+age = person.pop("age")
+<ENDCODE>
+Iterating:
+<CODE>
+person = {"name": "Alice", "age": 25}
+for key in person:
+    print(key)
+for value in person.values():
+    print(value)
+for key, value in person.items():
+    print(f"{key}: {value}")
+<ENDCODE>
 <USER> What is the difference between list and tuple?
 <ASSISTANT> The main difference is mutability:
+- Lists are mutable (can be changed)
+- Tuples are immutable (cannot be changed)
+Syntax:
 <CODE>
 my_list = [1, 2, 3]
 my_tuple = (1, 2, 3)
+<ENDCODE>
+Mutability difference:
+<CODE>
+my_list = [1, 2, 3]
 my_list[0] = 100
 my_list.append(4)
+print(my_list)
+my_tuple = (1, 2, 3)
+# my_tuple[0] = 100  # ERROR! Cannot modify
+# my_tuple.append(4)  # ERROR! No append method
 <ENDCODE>
+When to use:
+- List: data that will change (shopping cart, scores)
+- Tuple: fixed data (coordinates, RGB colors)
+Examples:
+<CODE>
+coordinates = (10, 20)
+rgb_color = (255, 128, 0)
+shopping_cart = ["apple", "bread"]
+shopping_cart.append("milk")
+<ENDCODE>
+Other differences:
+- Tuples are slightly faster
+- Tuples use less memory
+- Tuples can be dictionary keys
+<USER> How do I use list comprehension?
+<ASSISTANT> List comprehension is a concise way to create lists.
+Basic syntax:
+<CODE>
+new_list = [expression for item in iterable]
+<ENDCODE>
+Examples:
+<CODE>
+squares = [x ** 2 for x in range(5)]
+print(squares)
+<ENDCODE>
+Output: [0, 1, 4, 9, 16]
+With condition:
+<CODE>
+evens = [x for x in range(10) if x % 2 == 0]
+print(evens)
+<ENDCODE>
+Output: [0, 2, 4, 6, 8]
+Transform strings:
+<CODE>
+words = ["hello", "world"]
+upper_words = [word.upper() for word in words]
+print(upper_words)
+<ENDCODE>
+Output: ["HELLO", "WORLD"]
+With if-else:
+<CODE>
+numbers = [1, 2, 3, 4, 5]
+labels = ["even" if x % 2 == 0 else "odd" for x in numbers]
+print(labels)
+<ENDCODE>
+Output: ["odd", "even", "odd", "even", "odd"]
+<USER> Write quicksort
+<ASSISTANT> Here is the QuickSort algorithm:
+<CODE>
+def quicksort(arr):
+    if len(arr) <= 1:
+        return arr
+    pivot = arr[len(arr) // 2]
+    left = [x for x in arr if x < pivot]
+    middle = [x for x in arr if x == pivot]
+    right = [x for x in arr if x > pivot]
+    return quicksort(left) + middle + quicksort(right)
+numbers = [64, 34, 25, 12, 22, 11, 90]
+print("Original:", numbers)
+print("Sorted:", quicksort(numbers))
+<ENDCODE>
+Output:
+Original: [64, 34, 25, 12, 22, 11, 90]
+Sorted: [11, 12, 22, 25, 34, 64, 90]
+In-place version:
+<CODE>
+def quicksort_inplace(arr, low, high):
+    if low < high:
+        pivot_idx = partition(arr, low, high)
+        quicksort_inplace(arr, low, pivot_idx - 1)
+        quicksort_inplace(arr, pivot_idx + 1, high)
+def partition(arr, low, high):
+    pivot = arr[high]
+    i = low - 1
+    for j in range(low, high):
+        if arr[j] <= pivot:
+            i += 1
+            arr[i], arr[j] = arr[j], arr[i]
+    arr[i + 1], arr[high] = arr[high], arr[i + 1]
+    return i + 1
+numbers = [64, 34, 25, 12, 22, 11, 90]
+quicksort_inplace(numbers, 0, len(numbers) - 1)
+print(numbers)
+<ENDCODE>
+Time complexity:
+- Average: O(n log n)
+- Worst: O(n^2)
+<USER> Write merge sort
+<ASSISTANT> Here is the Merge Sort algorithm:
+<CODE>
+def merge_sort(arr):
+    if len(arr) <= 1:
+        return arr
+    mid = len(arr) // 2
+    left = merge_sort(arr[:mid])
+    right = merge_sort(arr[mid:])
+    return merge(left, right)
+def merge(left, right):
+    result = []
+    i = j = 0
+    while i < len(left) and j < len(right):
+        if left[i] <= right[j]:
+            result.append(left[i])
+            i += 1
+        else:
+            result.append(right[j])
+            j += 1
+    result.extend(left[i:])
+    result.extend(right[j:])
+    return result
+numbers = [64, 34, 25, 12, 22, 11, 90]
+print("Original:", numbers)
+print("Sorted:", merge_sort(numbers))
+<ENDCODE>
+Output:
+Original: [64, 34, 25, 12, 22, 11, 90]
+Sorted: [11, 12, 22, 25, 34, 64, 90]
+How it works:
+1. Divide array into two halves
+2. Recursively sort each half
+3. Merge the sorted halves
+Time complexity: O(n log n) always
+Space complexity: O(n)
+<USER> What is a lambda function?
+<ASSISTANT> A lambda function is a small anonymous function defined in one line.
+Syntax:
+<CODE>
+lambda arguments: expression
+<ENDCODE>
+Examples:
+<CODE>
+add = lambda x, y: x + y
+print(add(5, 3))
+square = lambda x: x ** 2
+print(square(4))
+is_even = lambda x: x % 2 == 0
+print(is_even(4))
+<ENDCODE>
+Output:
+8
+16
+True
+Common uses:
+With sorted():
+<CODE>
+students = [("Alice", 85), ("Bob", 92), ("Charlie", 78)]
+sorted_by_score = sorted(students, key=lambda x: x[1])
+print(sorted_by_score)
+<ENDCODE>
+With map():
+<CODE>
+numbers = [1, 2, 3, 4, 5]
+squares = list(map(lambda x: x ** 2, numbers))
+print(squares)
+<ENDCODE>
+With filter():
+<CODE>
+numbers = [1, 2, 3, 4, 5, 6]
+evens = list(filter(lambda x: x % 2 == 0, numbers))
+print(evens)
+<ENDCODE>
+<USER> What are decorators?
+<ASSISTANT> Decorators are functions that modify the behavior of other functions.
+Basic decorator:
+<CODE>
+def my_decorator(func):
+    def wrapper():
+        print("Before function")
+        func()
+        print("After function")
+    return wrapper
+@my_decorator
+def say_hello():
+    print("Hello!")
+say_hello()
+<ENDCODE>
+Output:
+Before function
+Hello!
+After function
+With arguments:
+<CODE>
+def my_decorator(func):
+    def wrapper(*args, **kwargs):
+        print("Before function")
+        result = func(*args, **kwargs)
+        print("After function")
+        return result
+    return wrapper
+@my_decorator
+def add(a, b):
+    return a + b
+print(add(5, 3))
+<ENDCODE>
+Practical example - timer:
+<CODE>
+import time
+def timer(func):
+    def wrapper(*args, **kwargs):
+        start = time.time()
+        result = func(*args, **kwargs)
+        end = time.time()
+        print(f"{func.__name__} took {end - start:.4f} seconds")
+        return result
+    return wrapper
+@timer
+def slow_function():
+    time.sleep(1)
+    return "Done"
+slow_function()
+<ENDCODE>
+<USER> Thanks!
+<ASSISTANT> You are welcome! I am glad I could help. Feel free to ask me anything anytime. Happy coding!
+<USER> Hi
+<ASSISTANT> Hello! How can I help you today? I can write code, explain concepts, or answer your programming questions!
+<USER> What can you do?
+<ASSISTANT> I am Veda Programming Assistant! I can help you with:
+1. Writing Python code
+2. Explaining programming concepts
+3. Debugging code
+4. Answering questions about algorithms
+5. Teaching data structures
+6. Helping with file handling
+7. Explaining object-oriented programming
+8. And much more!
+Just ask me anything about programming and I will do my best to help you!
+<USER> 2+2=?
+<ASSISTANT> 4
+<USER> What is 10 times 5?
+<ASSISTANT> 10 times 5 equals 50.
+<USER> Calculate 100 divided by 4
+<ASSISTANT> 100 divided by 4 equals 25.
+"""
+class VedaTrainer:
+    """Trainer for Veda Programming Assistant with Distillation Support"""
     def __init__(self, vocab_size: int = 8000, max_length: int = 512, batch_size: int = 4):
         self.vocab_size = vocab_size
         self.max_length = max_length
         self.batch_size = batch_size
         self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
         self.model = None
+    def prepare_data(self, extra_data: str = "", distillation_data: str = ""):
+        """Prepare training data including distillation data"""
         data = TRAINING_DATA
         if extra_data:
             data += "\n\n" + extra_data
+        if distillation_data:
+            data += "\n\n" + distillation_data
         if os.path.exists("programming.txt"):
+            try:
+                with open("programming.txt", "r", encoding="utf-8") as f:
+                    code_data = f.read()
+                data += "\n\n" + code_data
+            except Exception as e:
+                print(f"Warning: Could not read programming.txt: {e}")
         self.tokenizer.fit([data])
         all_tokens = self.tokenizer.encode(data)
         print(f"Total tokens: {len(all_tokens)}")
         sequences = []
         stride = self.max_length // 2
         for i in range(0, len(all_tokens) - self.max_length - 1, stride):
+            seq = all_tokens[i : i + self.max_length + 1]
             if len(seq) == self.max_length + 1:
                 sequences.append(seq)
         if len(sequences) < 10:
             stride = self.max_length // 4
             sequences = []
             for i in range(0, len(all_tokens) - self.max_length - 1, stride):
+                seq = all_tokens[i : i + self.max_length + 1]
                 if len(seq) == self.max_length + 1:
                     sequences.append(seq)
         print(f"Created {len(sequences)} training sequences")
+        if len(sequences) == 0:
+            print("Warning: No sequences created. Using minimal sequence.")
+            min_seq = all_tokens[:self.max_length + 1]
+            while len(min_seq) < self.max_length + 1:
+                min_seq.append(0)
+            sequences = [min_seq]
         sequences = np.array(sequences)
         X = sequences[:, :-1]
         y = sequences[:, 1:]
         dataset = tf.data.Dataset.from_tensor_slices((X, y))
         dataset = dataset.shuffle(1000).batch(self.batch_size).prefetch(1)
         return dataset
     def build_model(self):
         """Build the model"""
         self.model = VedaProgrammingLLM(
             d_model=256,
             num_heads=8,
             num_layers=4,
+            ff_dim=512,
         )
         self.model.compile(
+            optimizer=keras.optimizers.Adam(learning_rate=1e-4),
             loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+            metrics=["accuracy"],
         )
         dummy = tf.zeros((1, self.max_length), dtype=tf.int32)
         self.model(dummy)
         return self.model
+    def train(
+        self,
+        epochs: int = 15,
+        save_path: str = None,
+        extra_data: str = "",
+        distillation_data: str = "",
+    ):
         """Train the model"""
         if save_path is None:
             save_path = MODEL_DIR
+        dataset = self.prepare_data(extra_data, distillation_data)
         self.build_model()
         self.model.summary()
         os.makedirs(save_path, exist_ok=True)
         history = self.model.fit(dataset, epochs=epochs, verbose=1)
+        # Save weights
         self.model.save_weights(os.path.join(save_path, "weights.h5"))
+        # Save tokenizer
+        self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))
+        # Save config
         config = self.model.get_config()
+        with open(os.path.join(save_path, "config.json"), "w") as f:
+            json.dump(config, f, indent=2)
         print(f"Model saved to {save_path}")
         return history
+    def generate_response(
+        self, user_input: str, max_tokens: int = 200, temperature: float = 0.7
+    ) -> str:
         """Generate a response"""
+        if self.model is None:
+            return "Model not loaded."
         prompt = f"<USER> {user_input}\n<ASSISTANT>"
         tokens = self.tokenizer.encode(prompt)
         generated = self.model.generate(
             tokens,
             max_new_tokens=max_tokens,
             temperature=temperature,
+            repetition_penalty=1.2,
         )
         response = self.tokenizer.decode(generated)
         if "<ASSISTANT>" in response:
             response = response.split("<ASSISTANT>")[-1].strip()
         if "<USER>" in response:
             response = response.split("<USER>")[0].strip()
         return response
 if __name__ == "__main__":
+    print("=" * 50)
+    print("Training Veda Programming Assistant")
+    print("=" * 50)
     trainer = VedaTrainer()
     trainer.train(epochs=20)
+    print("\n" + "=" * 50)
+    print("Testing the model:")
+    print("=" * 50)
+    test_prompts = [
+        "Hello!",
+        "What is a function?",
+        "Write a function to reverse a string",
+        "2+2=?",
+    ]
+    for prompt in test_prompts:
+        print(f"\nUser: {prompt}")
+        response = trainer.generate_response(prompt)
+        print(f"Assistant: {response}")