dataset:
  name: "mohsin416/Python-Alpaca-5k"
  shuffle_seed: 42
  SYSTEM_PROMPT: "<|system|>\nYou are a senior Python developer. Provide clear, correct, well-commented code.<|end|>\n\n"
  USER_TOKEN: "<|user|>\n"
  ASSISTANT_TOKEN: "<|assistant|>\n"
  END_TOKEN: "<|end|>"

model:
  base_model_id: "microsoft/Phi-3-mini-128k-instruct"
  attn_implementation: "flash_attention_2"

quantization:
  load_in_4bit: True
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: True

lora:
  r: 32
  lora_alpha: 32
  lora_dropout: 0.1
  bias: "none"              # PEFT LoraConfig expects "none", "all", or "lora_only"
  task_type: "CAUSAL_LM"
  use_rslora: True

paths:
  output_dir: "artifacts/outputs"
  adapter_save_dir: "artifacts/phi3-python-instruct-adapter"
  final_model_repo: "mohsin416/phi3-python-instruct"

training:
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 8   # effective batch size: 4 x 8 = 32 per device
  num_train_epochs: 2
  learning_rate: 2.0e-5
  warmup_ratio: 0.1
  warmup_steps: 0                  # 0 defers to warmup_ratio
  bf16: True
  tf32: False
  fp16: False
  lr_scheduler_type: "cosine"
  optim: "paged_adamw_8bit"
  gradient_checkpointing: True
  gradient_checkpointing_kwargs: {"use_reentrant": False}
  max_grad_norm: 1.0
  weight_decay: 0.01
  logging_steps: 50
  eval_steps: 50
  save_steps: 50
  eval_strategy: "steps"
  save_strategy: "steps"
  save_total_limit: 3
  load_best_model_at_end: True
  metric_for_best_model: "eval_loss"
  greater_is_better: False
  prediction_loss_only: True
  report_to: "wandb"
  dataloader_num_workers: 4
  dataloader_pin_memory: True
  max_seq_length: 4096
  dataset_text_field: "text"
  label_names: ["labels"]
  neftune_noise_alpha: 5

wandb:
  project_name: "Phi-3-mini-128k-instruct-metrics"
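
For reference, below is a minimal sketch of how these sections would typically be wired into a QLoRA fine-tuning run with transformers, peft, and trl. The file name `config.yaml`, the `to_text` helper, and the `instruction`/`output` field names are assumptions not stated in the config (the actual dataset schema may differ, and `SFTConfig` key names such as `max_seq_length` have changed across trl releases); the values themselves come verbatim from the config above.

```python
# Minimal wiring sketch, assuming recent transformers/peft/trl versions
# that accept these field names verbatim, and flash-attn installed for
# attn_implementation="flash_attention_2".
import torch
import yaml
from peft import LoraConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTConfig

with open("config.yaml") as f:  # hypothetical path for this config file
    cfg = yaml.safe_load(f)

# quantization: section -> bitsandbytes 4-bit (QLoRA) settings
bnb = BitsAndBytesConfig(
    load_in_4bit=cfg["quantization"]["load_in_4bit"],
    bnb_4bit_quant_type=cfg["quantization"]["bnb_4bit_quant_type"],
    bnb_4bit_compute_dtype=getattr(torch, cfg["quantization"]["bnb_4bit_compute_dtype"]),
    bnb_4bit_use_double_quant=cfg["quantization"]["bnb_4bit_use_double_quant"],
)

# model: section -> base checkpoint loaded in 4-bit with flash attention
model = AutoModelForCausalLM.from_pretrained(
    cfg["model"]["base_model_id"],
    quantization_config=bnb,
    attn_implementation=cfg["model"]["attn_implementation"],
)

# lora: section -> PEFT adapter config; the keys (r, lora_alpha,
# lora_dropout, bias, task_type, use_rslora) match LoraConfig parameters.
peft_config = LoraConfig(**cfg["lora"])

# dataset: prompt-template fields -> one packed string per example, stored
# under the configured dataset_text_field ("text"). The double-quoted YAML
# strings already contain real newlines from the "\n" escapes.
def to_text(example):  # hypothetical helper; field names assume an Alpaca-style schema
    d = cfg["dataset"]
    return {
        "text": d["SYSTEM_PROMPT"]
        + d["USER_TOKEN"] + example["instruction"] + d["END_TOKEN"] + "\n"
        + d["ASSISTANT_TOKEN"] + example["output"] + d["END_TOKEN"]
    }

# training: section -> SFTConfig, which subclasses TrainingArguments and
# additionally accepts max_seq_length / dataset_text_field.
args = SFTConfig(output_dir=cfg["paths"]["output_dir"], **cfg["training"])
```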