import datasets
from tensorflow.keras.optimizers import Adam
from transformers import TFAutoModelForCausalLM, AutoTokenizer

class VedaTrainer:
    """
    Advanced training pipeline for VEDA LLM
    """
    
    def __init__(self, base_model="gpt2"):
        self.base_model = base_model
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        self.model = None

        # Register the VEDA special tokens. add_special_tokens() sets the
        # pad/bos/eos/unk attributes on the tokenizer directly, so no
        # separate pad_token fallback is needed; the embedding matrix is
        # resized to match in create_veda_model().
        special_tokens = {
            "pad_token": "[VEDA_PAD]",
            "bos_token": "[VEDA_START]",
            "eos_token": "[VEDA_END]",
            "unk_token": "[VEDA_UNK]"
        }
        self.tokenizer.add_special_tokens(special_tokens)
    
    def prepare_veda_dataset(self, dataset_name="wikitext", dataset_config="wikitext-2-raw-v1"):
        """Prepare and tokenize a dataset for VEDA training"""

        print("📚 Loading dataset for VEDA training...")
        dataset = datasets.load_dataset(dataset_name, dataset_config)

        def tokenize_function(examples):
            # Wrap each example in the VEDA start/end markers
            texts = [f"[VEDA_START] {text} [VEDA_END]" for text in examples["text"]]

            tokenized = self.tokenizer(
                texts,
                truncation=True,
                padding="max_length",
                max_length=256,
            )
            # Causal-LM labels are the input ids themselves; mask padding
            # positions with -100 so the model's internal loss ignores them.
            # Tensor conversion happens later in to_tf_dataset(), so no
            # return_tensors here.
            tokenized["labels"] = [
                [tok if mask == 1 else -100 for tok, mask in zip(ids, masks)]
                for ids, masks in zip(tokenized["input_ids"], tokenized["attention_mask"])
            ]
            return tokenized

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )
        return tokenized_dataset
    
    def create_veda_model(self):
        """Create the VEDA model from the pretrained base"""

        print("🏗️ Building VEDA model...")

        # Load the base model and grow its embedding matrix to cover the
        # VEDA special tokens registered in __init__
        self.model = TFAutoModelForCausalLM.from_pretrained(self.base_model)
        self.model.resize_token_embeddings(len(self.tokenizer))

        # VEDA optimizer settings
        optimizer = Adam(
            learning_rate=3e-5,
            beta_1=0.9,
            beta_2=0.95,
            epsilon=1e-9
        )

        # No explicit loss or metrics: Hugging Face TF causal-LM models
        # compute the shifted cross-entropy loss internally whenever the
        # batch contains a "labels" key (ignoring positions set to -100),
        # which is the supported way to train them with Keras fit().
        self.model.compile(optimizer=optimizer)

        return self.model
    
    def train_veda(self, dataset, epochs=3, batch_size=4):
        """Train VEDA model"""

        model = self.create_veda_model()

        print("🎯 Starting VEDA training...")

        # Prepare training data; "labels" rides along in the feature dict
        # so the model can compute its internal loss
        train_data = dataset["train"].to_tf_dataset(
            columns=["input_ids", "attention_mask", "labels"],
            shuffle=True,
            batch_size=batch_size
        )

        # Keras' validation_split does not work with tf.data pipelines,
        # so use the dataset's own validation split when it has one
        val_data = None
        if "validation" in dataset:
            val_data = dataset["validation"].to_tf_dataset(
                columns=["input_ids", "attention_mask", "labels"],
                shuffle=False,
                batch_size=batch_size
            )

        # Training
        history = model.fit(
            train_data,
            validation_data=val_data,
            epochs=epochs
        )

        print("✅ VEDA training completed!")

        return model, history
    
    def save_veda_model(self, model, path="./veda_model"):
        """Save the trained VEDA model and its tokenizer"""

        print(f"💾 Saving VEDA model to {path}...")
        model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
        print("✅ VEDA model saved!")

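
# Minimal inference sketch: reloads the artifacts written by
# save_veda_model() and generates a short completion as a sanity check of a
# finished run. This helper is an illustrative addition, not part of the
# training pipeline above; the prompt and sampling settings are placeholders.
def generate_with_veda(prompt, path="./veda_model", max_new_tokens=50):
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = TFAutoModelForCausalLM.from_pretrained(path)

    # Prefix the prompt with the VEDA start marker to match training format
    inputs = tokenizer(f"[VEDA_START] {prompt}", return_tensors="tf")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: print(generate_with_veda("The VEDA project"))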
# Usage
if __name__ == "__main__":
    trainer = VedaTrainer()
    dataset = trainer.prepare_veda_dataset()
    model, history = trainer.train_veda(dataset)
    trainer.save_veda_model(model)