File size: 4,128 Bytes
37f6677
 
 
 
 
 
 
 
4f63d46
37f6677
 
4f63d46
 
37f6677
4f63d46
37f6677
 
4f63d46
37f6677
 
 
 
 
 
 
4f63d46
37f6677
4f63d46
37f6677
 
 
 
4f63d46
 
 
 
 
37f6677
4f63d46
37f6677
 
4f63d46
 
37f6677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f63d46
37f6677
 
 
 
 
4f63d46
37f6677
 
 
 
 
 
 
4f63d46
37f6677
 
 
 
 
 
 
 
4f63d46
37f6677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f63d46
 
37f6677
4f63d46
 
37f6677
 
 
 
4f63d46
 
37f6677
 
4f63d46
37f6677
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import json

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"  # Hugging Face hub id of the base model
MAX_LENGTH = 512  # per-example token budget: truncation and padding target

# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the pad token (no dedicated pad token configured).
# NOTE(review): with pad == eos, padded label positions must be masked to
# -100 during tokenization, otherwise the model is trained to emit EOS at
# every pad slot — verify tokenize_function handles this.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # half-precision weights; matches fp16 training below
    device_map="auto"  # let accelerate place the model on available device(s)
)

# Improved LoRA configuration
lora_config = LoraConfig(
    r=16,  # Increased from 8 for better capacity
    lora_alpha=32,  # Increased from 16 (scaling = alpha / r = 2)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # More modules: all attention + MLP projections
    lora_dropout=0.1,  # Increased for better regularization
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with LoRA adapters; only adapter weights are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # sanity check: trainable % should be small

# Load and split dataset
print("Loading dataset...")
# Expects train.jsonl: one JSON object per line with a "messages" field
# (consumed below by tokenize_function).
dataset = load_dataset("json", data_files="train.jsonl")

# Split into train/validation (80/20); fixed seed for a reproducible split
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

def tokenize_function(examples):
    """Render chat messages and tokenize them for causal-LM training.

    Args:
        examples: batched dataset slice with a "messages" column, where each
            entry is a list of chat messages accepted by
            ``tokenizer.apply_chat_template``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; labels
        equal ``input_ids`` on real tokens and are ``-100`` on padding so the
        loss ignores pad positions.
    """
    texts = []
    for messages in examples["messages"]:
        # Render the conversation with the model's chat template; no
        # generation prompt since the assistant turns are already present.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text)

    # Tokenize with padding and truncation to a fixed length
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None
    )

    # Labels mirror input_ids, but padding positions are masked to -100 (the
    # ignore index of the causal-LM cross-entropy loss). Without this mask,
    # loss is computed on pad tokens — and since pad_token == eos_token here,
    # the model would be trained to predict EOS at every padded position.
    tokenized["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized

# Tokenize datasets
print("Tokenizing datasets...")
# remove_columns drops the raw "messages" column so only model inputs
# (input_ids / attention_mask / labels) remain in the mapped dataset.
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_eval = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_dataset.column_names
)

# Improved training arguments
training_args = TrainingArguments(
    output_dir="./brad-ai-lora",

    # Training hyperparameters
    num_train_epochs=5,  # Increased from 3
    per_device_train_batch_size=2,  # Increased from 1
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 8

    # Learning rate and scheduling
    learning_rate=3e-4,  # Slightly increased
    lr_scheduler_type="cosine",  # Better than default
    warmup_ratio=0.1,  # Warmup for 10% of training

    # Optimization
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,  # gradient clipping

    # Logging and evaluation
    logging_steps=10,
    # NOTE: eval and save must share strategy/steps for
    # load_best_model_at_end below to work.
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,  # Keep only best 3 checkpoints

    # Performance
    fp16=True,  # Mixed precision training
    dataloader_num_workers=2,

    # Monitoring
    load_best_model_at_end=True,  # reload checkpoint with lowest eval_loss
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower loss is better

    # Misc
    report_to="none",  # Change to "tensorboard" if you want logging
    seed=42
)

# Create trainer
# No data_collator is passed: examples are already padded to MAX_LENGTH with
# labels attached, so the Trainer's default collator just stacks them.
# NOTE(review): the `tokenizer=` argument is deprecated in recent transformers
# releases in favor of `processing_class=` — confirm against the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer
)

# Train the model
print("Starting training...")
trainer.train()

# Save the final model
# save_model writes the (LoRA adapter) weights; the tokenizer is saved
# alongside so the directory is self-contained for inference.
print("Saving model...")
trainer.save_model("./brad-ai-lora-final")
tokenizer.save_pretrained("./brad-ai-lora-final")

# Evaluate final model
# With load_best_model_at_end=True this evaluates the best checkpoint,
# not necessarily the last one.
print("Final evaluation:")
eval_results = trainer.evaluate()
print(eval_results)

print("Training complete!")