wlabchoi committed on
Commit
145a6a7
·
verified ·
1 Parent(s): 80fd90a

Upload train_qwen3_distillation.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_qwen3_distillation.py +9 -5
train_qwen3_distillation.py CHANGED
@@ -155,7 +155,11 @@ lora_config = LoraConfig(
155
  )
156
  student_model = get_peft_model(student_model, lora_config)
157
  student_model.print_trainable_parameters()
158
- print("✓ Student model loaded with LoRA")
 
 
 
 
159
 
160
  # MiniLLM Distillation Trainer
161
  class MiniLLMTrainer(Trainer):
@@ -261,9 +265,9 @@ training_args = TrainingArguments(
261
 
262
  # Training
263
  num_train_epochs=3,
264
- per_device_train_batch_size=1,
265
- per_device_eval_batch_size=1,
266
- gradient_accumulation_steps=16,
267
 
268
  # Optimization
269
  learning_rate=2e-4,
@@ -284,7 +288,7 @@ training_args = TrainingArguments(
284
  run_name="qwen3-0.6b-telecom-minillm",
285
 
286
  # Memory
287
- gradient_checkpointing=True,
288
  bf16=True,
289
 
290
  # Hub
 
155
  )
156
  student_model = get_peft_model(student_model, lora_config)
157
  student_model.print_trainable_parameters()
158
+
159
+ # Verify trainable parameters
160
+ trainable_params = sum(p.numel() for p in student_model.parameters() if p.requires_grad)
161
+ assert trainable_params > 0, "No trainable parameters found!"
162
+ print(f"✓ Student model loaded with LoRA ({trainable_params:,} trainable params)")
163
 
164
  # MiniLLM Distillation Trainer
165
  class MiniLLMTrainer(Trainer):
 
265
 
266
  # Training
267
  num_train_epochs=3,
268
+ per_device_train_batch_size=2, # Increased from 1 (no gradient checkpointing)
269
+ per_device_eval_batch_size=2,
270
+ gradient_accumulation_steps=8, # Effective batch size = 16
271
 
272
  # Optimization
273
  learning_rate=2e-4,
 
288
  run_name="qwen3-0.6b-telecom-minillm",
289
 
290
  # Memory
291
+ gradient_checkpointing=False, # Disabled - conflicts with LoRA + dual model distillation
292
  bf16=True,
293
 
294
  # Hub