"""Phase 2: SFT training on Qwen3-4B"""

import time
from pathlib import Path

import torch
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

BASE_MODEL = "Qwen/Qwen3-4B"
DATA_DIR = Path("./qwen3_pipeline/data")
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
CKPT_DIR.mkdir(parents=True, exist_ok=True)

EPOCHS = 1
BATCH_SIZE = 2
GRAD_ACCUM = 8
LR = 2e-4
MAX_SEQ_LEN = 4096
LORA_RANK = 32
LORA_ALPHA = 64
LORA_TARGETS = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

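# One optimizer step consumes BATCH_SIZE * GRAD_ACCUM = 16 sequences.
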
| | print("="*70) |
| | print("PHASE 2: SFT TRAINING") |
| | print("="*70) |
| |
|
| | |
| | print(f"\n[1/4] Loading {BASE_MODEL}...") |
| |
|
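# Fall back to EOS for padding if the checkpoint defines no pad token;
# right-side padding is the usual choice for causal-LM training.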
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

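# bf16 weights halve memory versus fp32; "eager" attention avoids a hard
# dependency on flash-attn being installed.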
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager",
)

print("  Model loaded")
print(f"  GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB")

print("\n[2/4] Applying LoRA...")

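# Rank-stabilized LoRA (use_rslora=True) scales updates by alpha/sqrt(r)
# instead of alpha/r, which keeps the effective step size stable at higher
# ranks; all attention and MLP projections are adapted.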
lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGETS,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian",
    use_rslora=True,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

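# With gradient checkpointing all base weights are frozen, so the input
# embeddings must be told to require grads or backprop through the
# checkpointed blocks fails.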
model.enable_input_require_grads()

print("\n[3/4] Loading and tokenizing data...")

dataset = load_from_disk(str(DATA_DIR / "sft"))
print(f"  Dataset: {len(dataset)} samples")

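# Each conversation is rendered to text with the model's chat template and
# tokenized to a fixed MAX_SEQ_LEN so batches can be stacked directly.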
def tokenize_function(examples):
    # Render each conversation with the model's chat template. The Qwen
    # template already closes every turn with <|im_end|> (the EOS token),
    # so no extra EOS is appended manually.
    texts = [
        tokenizer.apply_chat_template(
            msg, tokenize=False, add_generation_prompt=False
        )
        for msg in examples["messages"]
    ]

    result = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None,
    )

    # Labels mirror input_ids, but pad positions are masked with -100 so the
    # loss ignores them (pad == EOS here, so masking by token id would also
    # erase real EOS targets; the attention mask distinguishes the two).
    result["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(result["input_ids"], result["attention_mask"])
    ]
    return result

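# remove_columns drops the raw "messages" field so only tensor-ready
# columns (input_ids, attention_mask, labels) reach the Trainer.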
| | print(" Tokenizing...") |
| | tokenized_dataset = dataset.map( |
| | tokenize_function, |
| | batched=True, |
| | remove_columns=dataset.column_names, |
| | desc="Tokenizing", |
| | num_proc=4 |
| | ) |
| |
|
| | print(f" Tokenized: {len(tokenized_dataset)} samples") |
| |
|
print("\n[4/4] Training...")

# Floor division gives an estimate; the Trainer itself rounds partial
# batches up, so the real step count can differ slightly.
steps_per_epoch = len(tokenized_dataset) // (BATCH_SIZE * GRAD_ACCUM)
total_steps = steps_per_epoch * EPOCHS

| | print(f" Batch size: {BATCH_SIZE}") |
| | print(f" Grad accum: {GRAD_ACCUM}") |
| | print(f" Effective batch: {BATCH_SIZE * GRAD_ACCUM}") |
| | print(f" Steps per epoch: {steps_per_epoch}") |
| | print(f" Total steps: {total_steps}") |
| | print(f" Learning rate: {LR}") |
| | print(f" Estimated time: ~30-40 min") |
| |
|
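# Cosine decay with a 3% warmup; gradient checkpointing trades recompute
# time for a large activation-memory saving. Intermediate checkpointing is
# disabled because the adapter is written out explicitly after training.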
training_args = TrainingArguments(
    output_dir=str(CKPT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    bf16=True,
    logging_steps=10,
    save_strategy="no",
    optim="adamw_torch",
    gradient_checkpointing=True,
    seed=42,
    report_to="none",
    dataloader_num_workers=4,
)

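# No data collator is passed: every sample is already padded to MAX_SEQ_LEN,
# so the default collator can stack the fixed-length features as-is.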
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print(f"\n{'='*70}")
print("TRAINING STARTED")
print(f"{'='*70}\n")

start = time.time()
trainer.train()
elapsed = (time.time() - start) / 60

print(f"\n{'='*70}")
print(f"✓ TRAINING COMPLETE: {elapsed:.1f} minutes")
print(f"{'='*70}")

print("\nSaving model...")

adapter_path = CKPT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f"  ✓ Adapter: {adapter_path}")

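# merge_and_unload() folds the LoRA deltas into the base weights, so the
# merged checkpoint loads with plain transformers and no peft dependency.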
print("\nMerging LoRA weights...")
model = model.merge_and_unload()

merged_path = CKPT_DIR / "merged"
model.save_pretrained(str(merged_path))
tokenizer.save_pretrained(str(merged_path))
print(f"  ✓ Merged: {merged_path}")

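# Optional smoke test (illustrative sketch, not part of the pipeline; the
# prompt below is a made-up example). Uncomment to spot-check the merged
# weights before Phase 3:
#
# from transformers import pipeline
# pipe = pipeline(
#     "text-generation",
#     model=str(merged_path),
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )
# out = pipe([{"role": "user", "content": "Say hello in one sentence."}],
#            max_new_tokens=32)
# print(out[0]["generated_text"])
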
del model, trainer
torch.cuda.empty_cache()

print(f"\n{'='*70}")
print("✓ PHASE 2 COMPLETE")
print(f"{'='*70}")
print(f"\nTime: {elapsed:.1f} minutes")
print(f"Cost: ~${elapsed/60 * 1.15:.2f}")  # assumes a ~$1.15/hr GPU rate
print("\n➡️ Next: python phase3_eval.py")