wlabchoi committed on
Commit
145a6a7
·
verified ·
1 Parent(s): 80fd90a

Upload train_qwen3_distillation.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_qwen3_distillation.py +9 -5
train_qwen3_distillation.py CHANGED
@@ -155,7 +155,11 @@ lora_config = LoraConfig(
155
  )
156
  student_model = get_peft_model(student_model, lora_config)
157
  student_model.print_trainable_parameters()
158
- print("✓ Student model loaded with LoRA")
 
 
 
 
159
 
160
  # MiniLLM Distillation Trainer
161
  class MiniLLMTrainer(Trainer):
@@ -261,9 +265,9 @@ training_args = TrainingArguments(
261
 
262
  # Training
263
  num_train_epochs=3,
264
- per_device_train_batch_size=1,
265
- per_device_eval_batch_size=1,
266
- gradient_accumulation_steps=16,
267
 
268
  # Optimization
269
  learning_rate=2e-4,
@@ -284,7 +288,7 @@ training_args = TrainingArguments(
284
  run_name="qwen3-0.6b-telecom-minillm",
285
 
286
  # Memory
287
- gradient_checkpointing=True,
288
  bf16=True,
289
 
290
  # Hub
 
155
  )
156
  student_model = get_peft_model(student_model, lora_config)
157
  student_model.print_trainable_parameters()
158
+
159
+ # Verify trainable parameters
160
+ trainable_params = sum(p.numel() for p in student_model.parameters() if p.requires_grad)
161
+ assert trainable_params > 0, "No trainable parameters found!"
162
+ print(f"✓ Student model loaded with LoRA ({trainable_params:,} trainable params)")
163
 
164
  # MiniLLM Distillation Trainer
165
  class MiniLLMTrainer(Trainer):
 
265
 
266
  # Training
267
  num_train_epochs=3,
268
+ per_device_train_batch_size=2, # Increased from 1 (no gradient checkpointing)
269
+ per_device_eval_batch_size=2,
270
+ gradient_accumulation_steps=8, # Effective batch size = 16
271
 
272
  # Optimization
273
  learning_rate=2e-4,
 
288
  run_name="qwen3-0.6b-telecom-minillm",
289
 
290
  # Memory
291
+ gradient_checkpointing=False, # Disabled - conflicts with LoRA + dual model distillation
292
  bf16=True,
293
 
294
  # Hub