# config.py - Training configuration
from qwen_distill import QwenDistillationConfig

class MyConfig(QwenDistillationConfig):
    """Project-specific distillation settings built on ``QwenDistillationConfig``.

    The base initializer runs first; afterwards every attribute listed in
    ``__init__`` is overwritten with the value chosen for this run. Any
    attribute not listed keeps whatever the base class set.
    """

    def __init__(self):
        """Initialize base defaults, then apply this run's overrides in order."""
        super().__init__()

        # Attribute name -> value. Dict insertion order is preserved, so the
        # attributes are assigned in exactly the order they are written here.
        run_settings = {
            # --- Paths ---
            "data_file": "data/train.txt",
            "teacher_model_name": "Qwen/Qwen2.5-0.5B",

            # --- Student size (adjust based on your needs) ---
            # Rough presets noted by the original author:
            #   Small:  3 layers, 128 hidden = ~30M params
            #   Medium: 5 layers, 256 hidden = ~100M params
            #   Large:  8 layers, 384 hidden = ~250M params
            # The values below correspond to the "Medium" preset.
            "student_num_layers": 5,
            "student_hidden_dim": 256,
            "student_num_heads": 4,

            # --- Training ---
            # NOTE(review): presumably the effective batch is
            # batch_size * gradient_accumulation_steps = 8; confirm in trainer.
            "batch_size": 2,
            "gradient_accumulation_steps": 4,
            "max_steps": 2000,
            "learning_rate": 8e-4,

            # --- Distillation ---
            "temperature": 3.0,
            "alpha": 0.8,  # 80% KD loss
            "beta": 0.2,   # 20% feature loss

            # --- Memory ---
            "use_gradient_checkpointing": True,
            "mixed_precision": "fp16",
        }

        for setting_name, setting_value in run_settings.items():
            setattr(self, setting_name, setting_value)