#!/usr/bin/env python3
"""Phase 2: SFT training on Qwen3-4B"""
import os
import time
import torch
from pathlib import Path
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
# Config
# Hyperparameters for one epoch of LoRA SFT on Qwen3-4B.
# Effective batch size = BATCH_SIZE * GRAD_ACCUM = 16 samples per optimizer step.
BASE_MODEL = "Qwen/Qwen3-4B"
DATA_DIR = Path("./qwen3_pipeline/data")        # phase-1 output; expects a "sft" dataset inside
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")  # adapter + merged weights are written here
CKPT_DIR.mkdir(parents=True, exist_ok=True)
EPOCHS = 1
BATCH_SIZE = 2       # per-device micro-batch size
GRAD_ACCUM = 8       # gradient-accumulation steps per optimizer step
LR = 2e-4
MAX_SEQ_LEN = 4096   # fixed tokenized length (right-padded / truncated)
LORA_RANK = 32
LORA_ALPHA = 64      # with use_rslora (set below) effective scale is alpha / sqrt(rank)
# All attention and MLP projection layers of the Qwen architecture.
LORA_TARGETS = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
print("="*70)
print("PHASE 2: SFT TRAINING")
print("="*70)
# [1/4] Load model
# Load tokenizer and base model (bf16, auto device placement).
print(f"\n[1/4] Loading {BASE_MODEL}...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Some tokenizers ship without a dedicated pad token; reuse EOS so the
# fixed-length padding during tokenization works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # right-pad for causal-LM training
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,   # bf16 weights to fit the 4B model in memory
    device_map="auto",            # shard/place across available devices
    trust_remote_code=True,
    # NOTE(review): eager attention — presumably chosen for compatibility;
    # flash/sdpa would be faster if supported. Confirm.
    attn_implementation="eager"
)
print(f" Model loaded")
print(f" GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB")
# [2/4] Apply LoRA
# Wrap the base model with LoRA adapters; only adapter weights train.
print(f"\n[2/4] Applying LoRA...")
lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGETS,       # all attention + MLP projections
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian",
    use_rslora=True,                   # rank-stabilized scaling: alpha / sqrt(r)
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Enable input gradients for LoRA
# Required so gradient checkpointing (enabled in TrainingArguments below)
# can backpropagate through the frozen embedding layer.
model.enable_input_require_grads()
# [3/4] Load and tokenize data
print(f"\n[3/4] Loading and tokenizing data...")
# Phase-1 output: an HF dataset with a "messages" column of chat turns.
dataset = load_from_disk(str(DATA_DIR / "sft"))
print(f" Dataset: {len(dataset)} samples")
def tokenize_function(examples):
    """Tokenize a batch of chat examples for causal-LM SFT.

    Each example's ``messages`` list is rendered with the model's chat
    template (no generation prompt — the assistant reply is part of the
    training text), terminated with EOS, then tokenized to a fixed
    MAX_SEQ_LEN (right-padded, truncated).

    Args:
        examples: batched dataset slice with a "messages" column; each
            entry is a list of {"role", "content"} dicts.

    Returns:
        dict with "input_ids", "attention_mask", and "labels" — each a
        list of per-example token-id lists of length MAX_SEQ_LEN.

    Note:
        Padding positions in "labels" are set to -100 (the ignore_index
        of torch cross-entropy) so no loss is computed on them. Copying
        input_ids verbatim would be a bug here: pad_token == eos_token,
        so the model would be trained to emit padding/EOS for every
        padded position.
    """
    # Render each conversation with the chat template.
    texts = []
    for msg in examples["messages"]:
        text = tokenizer.apply_chat_template(
            msg,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text + tokenizer.eos_token)
    # Fixed-length tokenization so the Trainer's default collator can
    # stack samples into tensors without a padding collator.
    result = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None
    )
    # Labels mirror input_ids but mask padding (attention_mask == 0) to
    # -100. The appended EOS has mask 1 and is still learned.
    result["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(result["input_ids"], result["attention_mask"])
    ]
    return result
print(" Tokenizing...")
# Batched map; original columns are dropped so only the tokenizer
# outputs (input_ids / attention_mask / labels) reach the Trainer.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
    num_proc=4  # 4 parallel worker processes
)
print(f" Tokenized: {len(tokenized_dataset)} samples")
# [4/4] Train
print(f"\n[4/4] Training...")
# Optimizer steps per epoch: one step per BATCH_SIZE * GRAD_ACCUM samples.
# Integer division — a trailing partial accumulation window is not counted,
# so this is a lower-bound estimate for display only.
steps_per_epoch = len(tokenized_dataset) // (BATCH_SIZE * GRAD_ACCUM)
total_steps = steps_per_epoch * EPOCHS
print(f" Batch size: {BATCH_SIZE}")
print(f" Grad accum: {GRAD_ACCUM}")
print(f" Effective batch: {BATCH_SIZE * GRAD_ACCUM}")
print(f" Steps per epoch: {steps_per_epoch}")
print(f" Total steps: {total_steps}")
print(f" Learning rate: {LR}")
print(f" Estimated time: ~30-40 min")
training_args = TrainingArguments(
    output_dir=str(CKPT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,            # ~3% of total steps spent on LR warmup
    weight_decay=0.01,
    bf16=True,
    logging_steps=10,
    save_strategy="no",           # no intermediate checkpoints; final save is done manually below
    optim="adamw_torch",
    # Trades compute for memory; relies on enable_input_require_grads()
    # called above. NOTE(review): consider
    # gradient_checkpointing_kwargs={"use_reentrant": False} for PEFT —
    # confirm against the installed transformers version.
    gradient_checkpointing=True,
    seed=42,
    report_to="none",
    dataloader_num_workers=4,
)
# No data_collator/tokenizer passed: samples are already fixed-length
# lists, so the Trainer's default collator just stacks them into tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
print(f"\n{'='*70}")
print("TRAINING STARTED")
print(f"{'='*70}\n")
start = time.time()
trainer.train()
# Wall-clock minutes; reused by the cost estimate in the final summary.
elapsed = (time.time() - start) / 60
print(f"\n{'='*70}")
print(f"✓ TRAINING COMPLETE: {elapsed:.1f} minutes")
print(f"{'='*70}")
# Save
# Persist the standalone LoRA adapter (small, reloadable onto the base model).
print(f"\nSaving model...")
adapter_path = CKPT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f" ✓ Adapter: {adapter_path}")
# Merge
# Fold the LoRA deltas into the base weights so downstream phases can load
# a plain (non-PEFT) model.
print(f"\nMerging LoRA weights...")
model = model.merge_and_unload()
merged_path = CKPT_DIR / "merged"
model.save_pretrained(str(merged_path))
tokenizer.save_pretrained(str(merged_path))
print(f" ✓ Merged: {merged_path}")
del model, trainer
torch.cuda.empty_cache()  # release cached GPU memory before the next phase
print(f"\n{'='*70}")
print(f"✓ PHASE 2 COMPLETE")
print(f"{'='*70}")
print(f"\nTime: {elapsed:.1f} minutes")
print(f"Cost: ~${elapsed/60 * 1.15:.2f}")  # assumes ~$1.15/hour GPU rate — TODO confirm
print(f"\n➡️ Next: python phase3_eval.py")
|