vedaco committed
Commit e866020 · verified · 1 Parent(s): b5dd607

Create veda_train.py

Files changed (1)
  1. veda_train.py +118 -0
veda_train.py ADDED
@@ -0,0 +1,118 @@
+import tensorflow as tf
+from transformers import TFAutoModelForCausalLM, AutoTokenizer
+import datasets
+from tensorflow.keras.optimizers import Adam
+
+class VedaTrainer:
+    """
+    Advanced training pipeline for VEDA LLM
+    """
+
+    def __init__(self, base_model="gpt2"):
+        self.base_model = base_model
+        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
+        self.model = None
+
+        # Configure tokenizer for VEDA
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Add VEDA special tokens
+        special_tokens = {
+            "pad_token": "[VEDA_PAD]",
+            "bos_token": "[VEDA_START]",
+            "eos_token": "[VEDA_END]",
+            "unk_token": "[VEDA_UNK]"
+        }
+
+        self.tokenizer.add_special_tokens(special_tokens)
+
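+    # Note: add_special_tokens() grows the vocabulary (and replaces the
+    # eos-as-pad fallback set above), so the model's embedding matrix must
+    # be resized to match; create_veda_model() takes care of that below.
+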
+    def prepare_veda_dataset(self, dataset_name="wikitext", dataset_config="wikitext-2-raw-v1"):
+        """Prepare dataset for VEDA training"""
+
+        print("📚 Loading dataset for VEDA training...")
+        dataset = datasets.load_dataset(dataset_name, dataset_config)
+
+        def tokenize_function(examples):
+            # Wrap each example in VEDA start/end tokens
+            texts = [f"[VEDA_START] {text} [VEDA_END]" for text in examples["text"]]
+
+            # No return_tensors here: datasets.map() stores plain lists and
+            # to_tf_dataset() builds the tensors later. Fixed-length padding
+            # keeps every batch the same shape.
+            tokens = self.tokenizer(
+                texts,
+                truncation=True,
+                padding="max_length",
+                max_length=256
+            )
+            # Causal-LM targets: labels mirror the input ids; the model
+            # shifts them internally for next-token prediction
+            tokens["labels"] = tokens["input_ids"].copy()
+            return tokens
+
+        tokenized_dataset = dataset.map(tokenize_function, batched=True)
+        return tokenized_dataset
+
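+    # Illustrative sanity check (hypothetical snippet, default wikitext config):
+    #   record = VedaTrainer().prepare_veda_dataset()["train"][0]
+    #   assert len(record["input_ids"]) == len(record["labels"]) == 256
+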
+    def create_veda_model(self):
+        """Create VEDA model with custom architecture"""
+
+        print("🏗️ Building VEDA model...")
+
+        # Load base model
+        self.model = TFAutoModelForCausalLM.from_pretrained(self.base_model)
+        # Grow the embedding matrix to cover the added VEDA tokens
+        self.model.resize_token_embeddings(len(self.tokenizer))
+
+        # Compile with VEDA optimizer settings
+        optimizer = Adam(
+            learning_rate=3e-5,
+            beta_1=0.9,
+            beta_2=0.95,
+            epsilon=1e-9
+        )
+
+        # No explicit loss or metrics: transformers TF models compute the
+        # shifted causal-LM cross-entropy internally whenever a batch
+        # carries a "labels" key, which the tokenized dataset provides.
+        self.model.compile(optimizer=optimizer)
+
+        return self.model
+
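+    # The Adam settings above follow common causal-LM practice; GPT-style
+    # models are typically trained with beta_2 around 0.95 rather than the
+    # Keras default of 0.999.
+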
+    def train_veda(self, dataset, epochs=3, batch_size=4):
+        """Train VEDA model"""
+
+        model = self.create_veda_model()
+
+        print("🎯 Starting VEDA training...")
+
+        # Prepare training data
+        train_data = dataset["train"].to_tf_dataset(
+            columns=["input_ids", "attention_mask", "labels"],
+            shuffle=True,
+            batch_size=batch_size
+        )
+
+        # validation_split only works on in-memory arrays, not on tf.data
+        # pipelines, so use the dataset's own validation split instead
+        val_data = dataset["validation"].to_tf_dataset(
+            columns=["input_ids", "attention_mask", "labels"],
+            shuffle=False,
+            batch_size=batch_size
+        )
+
+        # Training
+        history = model.fit(
+            train_data,
+            validation_data=val_data,
+            epochs=epochs
+        )
+
+        print("✅ VEDA training completed!")
+
+        return model, history
+
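+    # Caveat: labels currently include the padded positions, so the internal
+    # loss also scores padding. A common refinement (not applied here) is to
+    # set label ids at pad positions to -100, which transformers' loss ignores.
+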
+    def save_veda_model(self, model, path="./veda_model"):
+        """Save trained VEDA model"""
+
+        print(f"💾 Saving VEDA model to {path}...")
+        model.save_pretrained(path)
+        self.tokenizer.save_pretrained(path)
+        print("✅ VEDA model saved!")
+
+# Usage
+if __name__ == "__main__":
+    trainer = VedaTrainer()
+    dataset = trainer.prepare_veda_dataset()
+    model, history = trainer.train_veda(dataset)
+    trainer.save_veda_model(model)
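+
+# Example follow-up (sketch, assumes the default save path above): the saved
+# model can be reloaded for generation, e.g.
+#   tokenizer = AutoTokenizer.from_pretrained("./veda_model")
+#   model = TFAutoModelForCausalLM.from_pretrained("./veda_model")
+#   ids = tokenizer("[VEDA_START] Hello", return_tensors="tf").input_ids
+#   print(tokenizer.decode(model.generate(ids, max_new_tokens=20)[0]))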