File size: 4,691 Bytes
2e31dbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python3
"""Phase 2: SFT training on Qwen3-4B.

Loads the base model, attaches a LoRA adapter, tokenizes the chat-format
SFT dataset from disk, trains with the HF Trainer, then saves both the
adapter and a merged full-weight checkpoint.
"""

import os
import time
import torch
from pathlib import Path
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# Config
BASE_MODEL = "Qwen/Qwen3-4B"                    # HF Hub id of the base model
DATA_DIR = Path("./qwen3_pipeline/data")        # input: dataset from the previous phase
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")  # output: adapter + merged checkpoints
CKPT_DIR.mkdir(parents=True, exist_ok=True)

EPOCHS = 1
BATCH_SIZE = 2        # per-device micro-batch size
GRAD_ACCUM = 8        # gradient accumulation -> effective batch of BATCH_SIZE * GRAD_ACCUM
LR = 2e-4
MAX_SEQ_LEN = 4096    # every example is padded/truncated to this token length
LORA_RANK = 32
LORA_ALPHA = 64       # with rsLoRA enabled below, effective scale is alpha / sqrt(rank)
LORA_TARGETS = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]  # all attention + MLP projections

print("="*70)
print("PHASE 2: SFT TRAINING")
print("="*70)

# [1/4] Load model
print(f"\n[1/4] Loading {BASE_MODEL}...")

# Tokenizer first; if the model ships no pad token, reuse EOS for padding.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # right-padding for causal-LM training

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,   # bf16 weights to halve memory use
    device_map="auto",            # let accelerate place layers on available devices
    trust_remote_code=True,
    attn_implementation="eager"   # plain attention; avoids a flash-attn dependency
)

print(f"  Model loaded")
print(f"  GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB")

# [2/4] Apply LoRA
print(f"\n[2/4] Applying LoRA...")

lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGETS,
    lora_dropout=0.0,                 # no dropout on the adapter
    bias="none",                      # keep all bias terms frozen
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian",     # gaussian init for the A matrices
    use_rslora=True,                  # rank-stabilized scaling: alpha / sqrt(r)
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Enable input gradients for LoRA
# Needed so gradients flow through the frozen embedding layer to the adapter
# when gradient checkpointing is enabled in TrainingArguments below.
model.enable_input_require_grads()

# [3/4] Load and tokenize data
print(f"\n[3/4] Loading and tokenizing data...")

# SFT dataset written by the previous phase; expected to carry a "messages"
# column of chat-format role/content dicts (consumed by tokenize_function).
dataset = load_from_disk(str(DATA_DIR / "sft"))
print(f"  Dataset: {len(dataset)} samples")

def tokenize_function(examples):
    """Tokenize a batch of chat-format examples for causal-LM SFT.

    Each entry of ``examples["messages"]`` is rendered with the model's chat
    template, terminated with EOS, then tokenized to a fixed MAX_SEQ_LEN.

    Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    Padding positions are set to -100 in ``labels`` so the cross-entropy
    loss ignores them — the pad token is EOS here, so copying input_ids
    verbatim would otherwise train the model on padding.
    """
    # Format messages using chat template
    texts = []
    for msg in examples["messages"]:
        text = tokenizer.apply_chat_template(
            msg,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text + tokenizer.eos_token)

    # Tokenize with padding and truncation
    result = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None
    )

    # Labels = input_ids with pad positions masked to -100 (HF ignore_index).
    # attention_mask is 0 exactly on padding, 1 on real tokens (incl. EOS).
    result["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(result["input_ids"], result["attention_mask"])
    ]

    return result

print("  Tokenizing...")
# Batched map over the dataset; drop the raw columns so only model inputs
# (input_ids / attention_mask / labels) remain for the Trainer.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
    num_proc=4  # parallel tokenization workers
)

print(f"  Tokenized: {len(tokenized_dataset)} samples")

# [4/4] Train
print(f"\n[4/4] Training...")

# One optimizer step consumes a full accumulation window of micro-batches.
effective_batch = BATCH_SIZE * GRAD_ACCUM
steps_per_epoch = len(tokenized_dataset) // effective_batch
total_steps = EPOCHS * steps_per_epoch

# Echo the training plan before launch.
for line in (
    f"  Batch size: {BATCH_SIZE}",
    f"  Grad accum: {GRAD_ACCUM}",
    f"  Effective batch: {BATCH_SIZE * GRAD_ACCUM}",
    f"  Steps per epoch: {steps_per_epoch}",
    f"  Total steps: {total_steps}",
    f"  Learning rate: {LR}",
    f"  Estimated time: ~30-40 min",
):
    print(line)

training_args = TrainingArguments(
    output_dir=str(CKPT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,             # ~3% of total steps as warmup
    weight_decay=0.01,
    bf16=True,                     # match the bfloat16 model weights
    logging_steps=10,
    save_strategy="no",            # no mid-run checkpoints; saved manually below
    optim="adamw_torch",
    gradient_checkpointing=True,   # trade recompute for activation memory
    seed=42,
    report_to="none",              # disable wandb/tensorboard reporting
    dataloader_num_workers=4,
)

# No data_collator needed: every example is already padded to MAX_SEQ_LEN
# with labels attached, so the default collator only stacks tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print(f"\n{'='*70}")
print("TRAINING STARTED")
print(f"{'='*70}\n")

start = time.time()
trainer.train()
elapsed = (time.time() - start) / 60  # wall-clock minutes, reported below

print(f"\n{'='*70}")
print(f"✓ TRAINING COMPLETE: {elapsed:.1f} minutes")
print(f"{'='*70}")

# Save
print(f"\nSaving model...")

# Save the LoRA adapter on its own (small; reloadable onto the base model).
adapter_path = CKPT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f"  ✓ Adapter: {adapter_path}")

# Merge
# Fold the adapter weights into the base model and save a standalone
# full-weight checkpoint usable without peft at inference time.
print(f"\nMerging LoRA weights...")
model = model.merge_and_unload()

merged_path = CKPT_DIR / "merged"
model.save_pretrained(str(merged_path))
tokenizer.save_pretrained(str(merged_path))
print(f"  ✓ Merged: {merged_path}")

# Free GPU memory in case the next phase runs in the same session.
del model, trainer
torch.cuda.empty_cache()

print(f"\n{'='*70}")
print(f"✓ PHASE 2 COMPLETE")
print(f"{'='*70}")
print(f"\nTime: {elapsed:.1f} minutes")
# NOTE(review): assumes a ~$1.15/hour GPU rate — confirm against actual
# instance pricing.
print(f"Cost: ~${elapsed/60 * 1.15:.2f}")
print(f"\n➡️  Next: python phase3_eval.py")