vedaco committed on
Commit
8e0eaae
·
verified ·
1 Parent(s): 108c40e

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +228 -1
train.py CHANGED
@@ -375,4 +375,231 @@ Feel free to ask me anything anytime! Happy coding! 🐍
375
 
376
  Your code:
377
  ```python
378
- def add(a b): return a + b # ❌ Missing comma
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
 
376
  Your code:
377
  ```python
378
+ def add(a b): return a + b # ❌ Missing comma
+ ```
+
+ Fixed code:
379
+
380
+ <CODE> def add(a, b): return a + b <ENDCODE>
381
+ The fix: Add a comma between a and b in the parameters.
382
+
383
+ Testing it:
384
+
385
+ <CODE> def add(a, b): return a + b
386
+ print(add(5, 3)) # Output: 8
387
+ print(add(10, 20)) # Output: 30
388
+ <ENDCODE>
389
+
390
+ Common syntax errors to watch:
391
+
392
+ Missing commas between parameters
393
+ Missing colons after def, if, for, while
394
+ Incorrect indentation
395
+ Mismatched parentheses
396
+ <USER> What is the difference between list and tuple?
397
+ <ASSISTANT> The main difference is mutability:
398
+
399
+ Lists are mutable (can be changed)
400
+ Tuples are immutable (cannot be changed)
401
+ Syntax:
402
+
403
+ <CODE> # List - square brackets
+ my_list = [1, 2, 3]
404
+ # Tuple - parentheses
405
+ my_tuple = (1, 2, 3)
406
+ <ENDCODE>
407
+
408
+ Mutability difference:
409
+
410
+ <CODE> # Lists can be modified
+ my_list = [1, 2, 3]
+ my_list[0] = 100 # OK
+ my_list.append(4) # OK
411
+ # Tuples cannot be modified
412
+ my_tuple = (1, 2, 3)
413
+
414
+ my_tuple[0] = 100 # ERROR!
415
+ my_tuple.append(4) # ERROR!
416
+ <ENDCODE>
417
+ When to use:
418
+
419
+ List: data that will change (shopping cart, scores)
420
+ Tuple: fixed data (coordinates, RGB colors)
421
+ Example:
422
+
423
+ <CODE> # Tuple for fixed data
+ coordinates = (10, 20)
+ rgb_color = (255, 128, 0)
424
+ # List for changing data
425
+ shopping_cart = ["apple", "bread"]
426
+ shopping_cart.append("milk")
427
+ <ENDCODE>
428
+
429
+ Other differences:
430
+
431
+ Tuples are slightly faster
432
+ Tuples use less memory
433
+ Tuples can be dictionary keys
434
+ '''
435
class VedaTrainer:
    """Trainer for Veda Programming Assistant.

    Owns a ``VedaTokenizer`` and, once ``build_model()`` has run, a
    ``VedaProgrammingLLM``. ``train()`` runs the whole pipeline: prepare the
    data, build the model, fit it and write the artifacts to disk.
    """

    def __init__(self, vocab_size: int = 8000, max_length: int = 512, batch_size: int = 4):
        """Store the hyper-parameters and create an unfitted tokenizer.

        Args:
            vocab_size: Vocabulary size handed to ``VedaTokenizer``.
            max_length: Number of input tokens per training sequence.
            batch_size: Batch size used when batching the dataset.
        """
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = VedaTokenizer(vocab_size=vocab_size)
        self.model = None  # set by build_model()

    @staticmethod
    def _window_tokens(tokens, window: int, stride: int) -> list:
        """Return every full ``window``-long slice of ``tokens``, stepping by ``stride``.

        The final start offset is ``len(tokens) - window`` inclusive, so a
        window that ends exactly on the last token is kept. An input shorter
        than ``window`` yields an empty list.
        """
        last_start = len(tokens) - window
        return [
            tokens[start:start + window]
            for start in range(0, last_start + 1, stride)
        ]

    def prepare_data(self, extra_data: str = ""):
        """Prepare training data.

        Concatenates ``TRAINING_DATA``, ``extra_data`` and, if the file
        exists in the working directory, the contents of ``programming.txt``;
        fits the tokenizer on that text; encodes it; and cuts the token
        stream into overlapping windows of ``max_length + 1`` tokens.

        Args:
            extra_data: Optional additional text appended to the corpus.

        Returns:
            A shuffled, batched ``tf.data.Dataset`` of ``(X, y)`` pairs where
            ``y`` is ``X`` shifted left by one token.

        Raises:
            ValueError: If the encoded corpus is shorter than
                ``max_length + 1`` tokens, so no sequence can be built.
        """
        # Combine training data
        data = TRAINING_DATA
        if extra_data:
            data += "\n\n" + extra_data

        # Load additional code from programming.txt if exists
        if os.path.exists("programming.txt"):
            with open("programming.txt", 'r', encoding='utf-8') as f:
                data += "\n\n" + f.read()

        # Fit tokenizer
        self.tokenizer.fit([data])

        # Encode
        all_tokens = self.tokenizer.encode(data)
        print(f"Total tokens: {len(all_tokens)}")

        # Create sequences: each window holds max_length inputs plus the one
        # extra token needed for the shifted target.
        window = self.max_length + 1
        # max(1, ...) keeps range() valid for very small max_length values.
        sequences = self._window_tokens(
            all_tokens, window, max(1, self.max_length // 2)
        )

        # Too few samples: retry with a smaller stride (more overlap).
        if len(sequences) < 10:
            sequences = self._window_tokens(
                all_tokens, window, max(1, self.max_length // 4)
            )

        print(f"Created {len(sequences)} training sequences")

        if not sequences:
            # Without this guard the 2-D slicing below fails on an empty
            # array with an unhelpful IndexError.
            raise ValueError(
                f"Not enough training data: got {len(all_tokens)} tokens, "
                f"need at least {window} to build one sequence"
            )

        sequences = np.array(sequences)
        X = sequences[:, :-1]
        y = sequences[:, 1:]

        dataset = tf.data.Dataset.from_tensor_slices((X, y))
        dataset = dataset.shuffle(1000).batch(self.batch_size).prefetch(1)

        return dataset

    def build_model(self):
        """Build the model.

        Instantiates ``VedaProgrammingLLM`` with the tokenizer's vocabulary
        size, compiles it with Adam (learning rate 1e-4) and sparse
        categorical cross-entropy on logits, and stores it on ``self.model``.

        Returns:
            The compiled model.
        """
        self.model = VedaProgrammingLLM(
            vocab_size=self.tokenizer.vocabulary_size,
            max_length=self.max_length,
            d_model=256,
            num_heads=8,
            num_layers=4,
            ff_dim=512
        )

        self.model.compile(
            optimizer=keras.optimizers.Adam(1e-4),
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy']
        )

        # One call on an all-zero batch; presumably this creates the weights
        # so that summary() works before fit() - TODO confirm.
        dummy = tf.zeros((1, self.max_length), dtype=tf.int32)
        self.model(dummy)

        return self.model

    def train(self, epochs: int = 15, save_path: str = None, extra_data: str = ""):
        """Train the model.

        Args:
            epochs: Number of epochs passed to ``fit``.
            save_path: Output directory; defaults to ``MODEL_DIR`` when None.
            extra_data: Optional extra text forwarded to ``prepare_data``.

        Returns:
            The history object returned by ``fit``.
        """
        if save_path is None:
            save_path = MODEL_DIR

        dataset = self.prepare_data(extra_data)
        self.build_model()

        self.model.summary()

        os.makedirs(save_path, exist_ok=True)

        history = self.model.fit(dataset, epochs=epochs, verbose=1)

        # Save
        self.model.save_weights(os.path.join(save_path, "weights.h5"))
        self.tokenizer.save(os.path.join(save_path, "tokenizer.json"))

        config = self.model.get_config()
        with open(os.path.join(save_path, "config.json"), 'w', encoding='utf-8') as f:
            json.dump(config, f)

        print(f"Model saved to {save_path}")
        return history

    def generate_response(self, user_input: str, max_tokens: int = 200,
                          temperature: float = 0.7) -> str:
        """Generate a response.

        Wraps ``user_input`` in the ``<USER> ... <ASSISTANT>`` prompt format,
        asks the model to continue it, and returns only the assistant's turn.

        Args:
            user_input: The user's message.
            max_tokens: Passed to the model as ``max_new_tokens``.
            temperature: Sampling temperature passed to the model.

        Returns:
            The decoded text after the last ``<ASSISTANT>`` marker, cut at
            the next ``<USER>`` marker if one was generated, and stripped.
        """
        prompt = f"<USER> {user_input}\n<ASSISTANT>"

        tokens = self.tokenizer.encode(prompt)

        generated = self.model.generate(
            tokens,
            max_new_tokens=max_tokens,
            temperature=temperature,
            repetition_penalty=1.2
        )

        response = self.tokenizer.decode(generated)

        # Extract assistant response
        if "<ASSISTANT>" in response:
            response = response.split("<ASSISTANT>")[-1].strip()
        # Drop anything the model generated for a following user turn.
        if "<USER>" in response:
            response = response.split("<USER>")[0].strip()

        return response


if __name__ == "__main__":
    trainer = VedaTrainer()
    trainer.train(epochs=20)

    # Test
    print("\n" + "="*50)
    print("Testing:")
    print("="*50)

    tests = [
        "Hello!",
        "What is a function?",
        "Write a function to reverse a string",
    ]

    for test in tests:
        print(f"\nUser: {test}")
        print(f"Assistant: {trainer.generate_response(test)}")
576
+ ---
577
+
578
+ ### 3. config.py (MODIFY - Increase max_length)
579
+
580
+ ```python
581
+ """Configuration - MODIFIED for conversation"""
582
+
583
+ import os
584
+
585
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
586
+ DATA_DIR = os.path.join(BASE_DIR, "data")
587
+ MODEL_DIR = os.path.join(BASE_DIR, "veda_model")
588
+
589
+ os.makedirs(DATA_DIR, exist_ok=True)
590
+ os.makedirs(MODEL_DIR, exist_ok=True)
591
+
592
+ DATABASE_PATH = os.path.join(DATA_DIR, "conversations.db")
593
+
594
+ # Model settings - MODIFIED
595
+ VOCAB_SIZE = 8000 # Increased for more words
596
+ MAX_LENGTH = 512 # Increased for longer conversations
597
+ D_MODEL = 256
598
+ NUM_HEADS = 8
599
+ NUM_LAYERS = 4
600
+ FF_DIM = 512
601
+ BATCH_SIZE = 4 # Smaller for longer sequences
602
+
603
+ # Generation defaults
604
+ DEFAULT_TEMPERATURE = 0.7
605
+ DEFAULT_MAX_TOKENS = 200