|
|
import tensorflow as tf |
|
|
from transformers import TFAutoModelForCausalLM, AutoTokenizer |
|
|
import datasets |
|
|
from tensorflow.keras.optimizers import Adam |
|
|
from tensorflow.keras.losses import SparseCategoricalCrossentropy |
|
|
import numpy as np |
|
|
|
|
|
class VedaTrainer:
    """
    Advanced training pipeline for VEDA LLM.

    Wraps a Hugging Face causal-LM checkpoint (default: GPT-2): registers
    VEDA-specific special tokens, prepares a tokenized dataset, fine-tunes
    the model with Keras, and persists model + tokenizer.
    """

    def __init__(self, base_model="gpt2"):
        """Load the tokenizer for *base_model* and register VEDA special tokens.

        Args:
            base_model: Hugging Face model id to fine-tune from.
        """
        self.base_model = base_model
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        self.model = None  # built lazily in create_veda_model()

        # GPT-2 ships without a pad token; fall back to EOS so padding is
        # valid even before the custom tokens below are registered.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # VEDA control tokens. The model embedding matrix is resized to
        # match the enlarged vocabulary in create_veda_model().
        special_tokens = {
            "pad_token": "[VEDA_PAD]",
            "bos_token": "[VEDA_START]",
            "eos_token": "[VEDA_END]",
            "unk_token": "[VEDA_UNK]",
        }
        self.tokenizer.add_special_tokens(special_tokens)

    def prepare_veda_dataset(self, dataset_name="wikitext", dataset_config="wikitext-2-raw-v1"):
        """Load and tokenize a text dataset for causal-LM training.

        Args:
            dataset_name: Hugging Face datasets hub name.
            dataset_config: configuration/subset of that dataset.

        Returns:
            A DatasetDict whose splits contain ``input_ids``,
            ``attention_mask`` and ``labels`` columns (labels mirror
            input_ids; the model shifts them internally for next-token
            prediction).
        """
        print("Loading dataset for VEDA training...")
        dataset = datasets.load_dataset(dataset_name, dataset_config)

        def tokenize_function(examples):
            # Wrap each document in the VEDA start/end control tokens.
            texts = [f"[VEDA_START] {text} [VEDA_END]" for text in examples["text"]]
            # BUG FIX: no return_tensors here -- datasets.map stores plain
            # Python/Arrow values; tensors are created later by
            # to_tf_dataset(). padding="max_length" (not True) keeps every
            # row the same length so batches stack across map batches.
            tokenized = self.tokenizer(
                texts,
                truncation=True,
                padding="max_length",
                max_length=256,
            )
            # BUG FIX: causal-LM training needs a "labels" column (a copy
            # of input_ids); train_veda() selects it in to_tf_dataset and
            # would otherwise fail with a missing-column error.
            tokenized["labels"] = tokenized["input_ids"].copy()
            return tokenized

        tokenized_dataset = dataset.map(tokenize_function, batched=True)
        return tokenized_dataset

    def create_veda_model(self):
        """Build and compile the VEDA model; also stored on ``self.model``.

        Returns:
            The compiled TF causal-LM model.
        """
        print("Building VEDA model...")

        self.model = TFAutoModelForCausalLM.from_pretrained(self.base_model)
        # Grow the embedding matrix to cover the special tokens added in
        # __init__ (otherwise their ids index out of range).
        self.model.resize_token_embeddings(len(self.tokenizer))

        optimizer = Adam(
            learning_rate=3e-5,
            beta_1=0.9,
            beta_2=0.95,  # lower beta_2 than Keras default, common for LM fine-tuning
            epsilon=1e-9,
        )
        loss = SparseCategoricalCrossentropy(from_logits=True)

        self.model.compile(
            optimizer=optimizer,
            loss=loss,
            metrics=["accuracy"],
        )
        return self.model

    def train_veda(self, dataset, epochs=3, batch_size=4):
        """Fine-tune the VEDA model on a tokenized dataset.

        Args:
            dataset: DatasetDict from prepare_veda_dataset() (must have a
                "train" split; a "validation" split is used when present).
            epochs: number of training epochs.
            batch_size: per-step batch size.

        Returns:
            (model, history) -- the trained model and the Keras History.
        """
        model = self.create_veda_model()

        print("Starting VEDA training...")

        columns = ["input_ids", "attention_mask", "labels"]
        train_data = dataset["train"].to_tf_dataset(
            columns=columns,
            shuffle=True,
            batch_size=batch_size,
        )

        # BUG FIX: Keras' validation_split is not supported for
        # tf.data.Dataset inputs; use the dataset's own validation split
        # when it exists instead.
        validation_data = None
        if "validation" in dataset:
            validation_data = dataset["validation"].to_tf_dataset(
                columns=columns,
                shuffle=False,
                batch_size=batch_size,
            )

        history = model.fit(
            train_data,
            epochs=epochs,
            validation_data=validation_data,
        )

        print("VEDA training completed!")

        return model, history

    def save_veda_model(self, model, path="./veda_model"):
        """Save the trained model and its tokenizer to *path*.

        Args:
            model: trained TF model to persist.
            path: output directory (created by save_pretrained if needed).
        """
        print(f"Saving VEDA model to {path}...")
        model.save_pretrained(path)
        # Persist the tokenizer too -- it carries the VEDA special tokens
        # the saved model's embeddings were resized for.
        self.tokenizer.save_pretrained(path)
        print("VEDA model saved!")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # End-to-end VEDA pipeline: prepare data, fine-tune, persist.
    veda_trainer = VedaTrainer()
    prepared_dataset = veda_trainer.prepare_veda_dataset()
    trained_model, training_history = veda_trainer.train_veda(prepared_dataset)
    veda_trainer.save_veda_model(trained_model)