prometheus04 committed on
Commit
2e31dbb
·
verified ·
1 Parent(s): d199eb1

Upload training_scripts/phase2_train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training_scripts/phase2_train.py +189 -0
training_scripts/phase2_train.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Phase 2: SFT training on Qwen3-4B.

Loads the base model, applies a LoRA adapter, tokenizes a chat-format
SFT dataset from disk, trains with HF Trainer, then saves both the
adapter and a merged full model under CKPT_DIR.
"""

import os
import time
import torch
from pathlib import Path
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# Config
BASE_MODEL = "Qwen/Qwen3-4B"                    # Hub id of the base model
DATA_DIR = Path("./qwen3_pipeline/data")        # expects an HF dataset saved at data/sft
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")  # adapter + merged model written here
CKPT_DIR.mkdir(parents=True, exist_ok=True)

EPOCHS = 1
BATCH_SIZE = 2      # per-device micro-batch size
GRAD_ACCUM = 8      # effective batch = BATCH_SIZE * GRAD_ACCUM = 16
LR = 2e-4
MAX_SEQ_LEN = 4096  # every sample padded/truncated to this length
LORA_RANK = 32
LORA_ALPHA = 64
# All attention and MLP projection layers of the Qwen architecture.
LORA_TARGETS = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
print("="*70)
print("PHASE 2: SFT TRAINING")
print("="*70)

# [1/4] Load model
print(f"\n[1/4] Loading {BASE_MODEL}...")

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token; note pad and eos then share one token id.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # right-padding for causal-LM training

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",           # let accelerate place layers on available devices
    trust_remote_code=True,
    attn_implementation="eager"  # NOTE(review): eager attention chosen — presumably flash-attn unavailable; confirm
)

print(f" Model loaded")
print(f" GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB")

# [2/4] Apply LoRA
print(f"\n[2/4] Applying LoRA...")

lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGETS,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian",
    use_rslora=True,  # rank-stabilized LoRA: scales by alpha/sqrt(r) instead of alpha/r
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Enable input gradients for LoRA — needed so gradients flow to the
# adapters when the frozen base model is used with gradient checkpointing.
model.enable_input_require_grads()

# [3/4] Load and tokenize data
print(f"\n[3/4] Loading and tokenizing data...")

dataset = load_from_disk(str(DATA_DIR / "sft"))
print(f" Dataset: {len(dataset)} samples")
def tokenize_function(examples):
    """Tokenize a batch of chat conversations for causal-LM SFT.

    Renders each ``messages`` conversation through the model's chat
    template, appends EOS, and tokenizes with truncation/padding to
    MAX_SEQ_LEN.

    Labels mirror ``input_ids`` position-for-position, with padding
    positions set to -100 so the cross-entropy loss ignores them.
    Masking is keyed on ``attention_mask`` rather than the pad-token id
    because ``pad_token == eos_token`` here — id-based masking would
    also hide the genuine end-of-sequence EOS from the loss.

    Args:
        examples: batched dataset slice with a ``messages`` column
            (list of chat-message lists).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``
        (plain nested lists, suitable for ``datasets.map``).
    """
    # Format messages using the tokenizer's chat template.
    texts = []
    for msg in examples["messages"]:
        text = tokenizer.apply_chat_template(
            msg,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text + tokenizer.eos_token)

    # Tokenize with padding and truncation to a fixed length.
    result = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None
    )

    # BUGFIX: the original used labels = input_ids.copy(), which computes
    # loss over the padded tail (pad == eos), training the model to emit
    # EOS for thousands of positions per sample. Mask padding to -100,
    # which transformers' loss ignores.
    result["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(result["input_ids"], result["attention_mask"])
    ]

    return result

print(" Tokenizing...")
# Batched map drops the raw columns and keeps only model inputs;
# num_proc=4 forks workers, so tokenize_function must be picklable.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
    num_proc=4
)

print(f" Tokenized: {len(tokenized_dataset)} samples")

# [4/4] Train
print(f"\n[4/4] Training...")

# Optimizer steps per epoch at the effective batch size (floor division
# drops the final partial step).
steps_per_epoch = len(tokenized_dataset) // (BATCH_SIZE * GRAD_ACCUM)
total_steps = steps_per_epoch * EPOCHS

print(f" Batch size: {BATCH_SIZE}")
print(f" Grad accum: {GRAD_ACCUM}")
print(f" Effective batch: {BATCH_SIZE * GRAD_ACCUM}")
print(f" Steps per epoch: {steps_per_epoch}")
print(f" Total steps: {total_steps}")
print(f" Learning rate: {LR}")
print(f" Estimated time: ~30-40 min")

training_args = TrainingArguments(
    output_dir=str(CKPT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,               # ~3% of total steps as linear warmup
    weight_decay=0.01,
    bf16=True,                       # matches the bf16 model weights
    logging_steps=10,
    save_strategy="no",              # no mid-run checkpoints; saved manually below
    optim="adamw_torch",
    gradient_checkpointing=True,     # trades compute for memory at 4k seq len
    seed=42,
    report_to="none",
    dataloader_num_workers=4,
)

# No data collator needed: samples are already padded to max_length with
# labels included, so the default collator just stacks them.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print(f"\n{'='*70}")
print("TRAINING STARTED")
print(f"{'='*70}\n")

start = time.time()
trainer.train()
elapsed = (time.time() - start) / 60  # wall-clock minutes

print(f"\n{'='*70}")
print(f"✓ TRAINING COMPLETE: {elapsed:.1f} minutes")
print(f"{'='*70}")

# Save
print(f"\nSaving model...")

# Save the LoRA adapter alone (small; re-attachable to the base model).
adapter_path = CKPT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f" ✓ Adapter: {adapter_path}")

# Merge
print(f"\nMerging LoRA weights...")
# Fold adapter weights into the base model and drop the PEFT wrappers,
# producing a standalone full model.
model = model.merge_and_unload()

merged_path = CKPT_DIR / "merged"
model.save_pretrained(str(merged_path))
tokenizer.save_pretrained(str(merged_path))
print(f" ✓ Merged: {merged_path}")

# Free GPU memory before any follow-on phase runs in the same process.
del model, trainer
torch.cuda.empty_cache()

print(f"\n{'='*70}")
print(f"✓ PHASE 2 COMPLETE")
print(f"{'='*70}")
print(f"\nTime: {elapsed:.1f} minutes")
# NOTE(review): assumes a $1.15/hour instance rate — confirm against the actual provider.
print(f"Cost: ~${elapsed/60 * 1.15:.2f}")
print(f"\n➡️ Next: python phase3_eval.py")