from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
MAX_LENGTH = 512

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # use EOS as the padding token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach LoRA adapters to the attention and MLP projection layers
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
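# Only the adapter weights are trainable; the 0.5B base model stays frozen, so the
# trainable-parameter count printed above should be a small fraction of the total.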

print("Loading dataset...")
dataset = load_dataset("json", data_files="train.jsonl")
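# Each line of train.jsonl is assumed to hold one JSON object with a "messages"
# list in standard chat format (role/content pairs), which is what
# tokenize_function below passes to apply_chat_template, e.g.:
#   {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}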

# 80/20 train/validation split
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

def tokenize_function(examples):
    """Apply the chat template, tokenize, and build labels for causal LM training."""
    texts = []
    for messages in examples["messages"]:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(text)

    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None,
    )

    # Labels mirror the input ids, but padded positions are set to -100 so the
    # loss ignores them (pad and EOS share the same token id here).
    tokenized["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

print("Tokenizing datasets...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)

tokenized_eval = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_dataset.column_names,
)

training_args = TrainingArguments(
    output_dir="./brad-ai-lora",

    # Batch size and schedule
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,

    # Learning rate
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,

    # Optimizer and regularization
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Logging, evaluation, and checkpointing
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,

    # Precision and data loading
    fp16=True,
    dataloader_num_workers=2,

    # Keep the checkpoint with the lowest validation loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    report_to="none",
    seed=42,
)
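
# Effective train batch size: per_device_train_batch_size x gradient_accumulation_steps = 2 x 4 = 8.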

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

print("Starting training...")
trainer.train()

print("Saving model...")
trainer.save_model("./brad-ai-lora-final")  # for a PEFT model this saves only the adapter weights
tokenizer.save_pretrained("./brad-ai-lora-final")

print("Final evaluation:")
eval_results = trainer.evaluate()
print(eval_results)

print("Training complete!")