The fine-tuning run is driven by a single YAML configuration, shown here section by section. The `dataset` block names the source corpus, the shuffle seed, and the Phi-3 chat-format special tokens used to assemble each training example:

```yaml
dataset:
  name: "mohsin416/Python-Alpaca-5k"
  shuffle_seed: 42
  SYSTEM_PROMPT: "<|system|>\nYou are a senior Python developer. Provide clear, correct, well-commented code.<|end|>\n\n"
  USER_TOKEN: "<|user|>\n"
  ASSISTANT_TOKEN: "<|assistant|>\n"
  END_TOKEN: "<|end|>"
```
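Applied literally, this section reduces to a small formatting helper over the dataset. A minimal sketch, assuming the usual Alpaca column names (`instruction`, `output`) and an arbitrary 5% held-out fraction, since neither is spelled out in the config; `format_example` is an illustrative name:

```python
from datasets import load_dataset

SYSTEM_PROMPT = ("<|system|>\nYou are a senior Python developer. "
                 "Provide clear, correct, well-commented code.<|end|>\n\n")
USER_TOKEN = "<|user|>\n"
ASSISTANT_TOKEN = "<|assistant|>\n"
END_TOKEN = "<|end|>"

def format_example(example):
    # Assemble one Phi-3 chat-formatted training string per row.
    return {"text": (
        SYSTEM_PROMPT
        + USER_TOKEN + example["instruction"] + END_TOKEN + "\n"
        + ASSISTANT_TOKEN + example["output"] + END_TOKEN
    )}

dataset = load_dataset("mohsin416/Python-Alpaca-5k", split="train")
dataset = dataset.shuffle(seed=42).map(format_example)
# The eval settings later in the config imply a held-out split; the
# fraction is not given, so 5% is a placeholder here.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
```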
```yaml
model:
  base_model_id: "microsoft/Phi-3-mini-128k-instruct"
  attn_implementation: "flash_attention_2"
  quantization:
    load_in_4bit: true
    bnb_4bit_quant_type: "nf4"
    bnb_4bit_compute_dtype: "bfloat16"
    bnb_4bit_use_double_quant: true
```
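The `quantization` block maps one-to-one onto a `BitsAndBytesConfig`. A sketch of how the base model would be loaded; `device_map` and `trust_remote_code` are typical choices for this model but are not in the config:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NormalFloat4 weight quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # matmuls run in bf16
    bnb_4bit_use_double_quant=True,         # quantize the quantization constants too
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",        # assumption: single-GPU placement, not in the config
    trust_remote_code=True,   # assumption: the 128k variant ships custom model code
)
```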
```yaml
lora:
  r: 32
  lora_alpha: 32
  lora_dropout: 0.1
  bias: "none"
  task_type: "CAUSAL_LM"
  use_rslora: true
```
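This section mirrors PEFT's `LoraConfig`. A sketch of the adapter setup; the config does not list target modules, so the fused projection layers Phi-3 exposes are assumed here:

```python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    use_rslora=True,  # rank-stabilized LoRA: scales updates by lora_alpha / sqrt(r)
    # Assumption: not in the config; these are Phi-3's fused projections.
    target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
)

model = prepare_model_for_kbit_training(model)  # cast norms, enable input grads
model = get_peft_model(model, peft_config)
```

With `use_rslora`, the effective scaling is `lora_alpha / sqrt(r)` rather than `lora_alpha / r`, which tends to keep higher ranks like r=32 stable at the same alpha.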
```yaml
paths:
  output_dir: "artifacts/outputs"
  adapter_save_dir: "artifacts/phi3-python-instruct-adapter"
  final_model_repo: "mohsin416/phi3-python-instruct"
```
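The three paths suggest the usual QLoRA publishing flow once training finishes: save the small adapter, then merge it into a full-precision copy of the base model before pushing. A sketch under that assumption; merging directly into 4-bit weights is not supported, hence the reload:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Save only the adapter weights (megabytes, not the full model).
model.save_pretrained("artifacts/phi3-python-instruct-adapter")

# Reload the base model at full precision, fold the adapter in, and push.
base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct", torch_dtype=torch.bfloat16
)
merged = PeftModel.from_pretrained(
    base, "artifacts/phi3-python-instruct-adapter"
).merge_and_unload()
merged.push_to_hub("mohsin416/phi3-python-instruct")
```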
```yaml
training:
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 8
  num_train_epochs: 2
  learning_rate: 2.0e-5
  warmup_ratio: 0.1
  warmup_steps: 0
  bf16: true
  tf32: false
  fp16: false
  lr_scheduler_type: "cosine"
  optim: "paged_adamw_8bit"
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: {"use_reentrant": false}
  max_grad_norm: 1.0
  weight_decay: 0.01
  logging_steps: 50
  eval_steps: 50
  save_steps: 50
  eval_strategy: "steps"
  save_strategy: "steps"
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
  prediction_loss_only: true
  report_to: "wandb"
  dataloader_num_workers: 4
  dataloader_pin_memory: true
  max_seq_length: 4096
  dataset_text_field: "text"
  label_names: ["labels"]
  neftune_noise_alpha: 5
```
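The training keys mirror trl's `SFTConfig` (a `TrainingArguments` subclass). A sketch against a trl release where `max_seq_length` and `dataset_text_field` carry these names; they have been renamed across trl versions, so treat the field names as version-dependent. With `warmup_steps: 0`, the warmup ratio takes effect, and the effective batch size is 4 × 8 = 32 per device:

```python
from trl import SFTConfig, SFTTrainer

args = SFTConfig(
    output_dir="artifacts/outputs",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    bf16=True,
    tf32=False,
    fp16=False,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",        # bitsandbytes paged 8-bit AdamW
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    max_grad_norm=1.0,
    weight_decay=0.01,
    logging_steps=50,
    eval_steps=50,
    save_steps=50,
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    prediction_loss_only=True,
    report_to="wandb",
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    max_seq_length=4096,
    dataset_text_field="text",
    label_names=["labels"],
    neftune_noise_alpha=5,           # NEFTune: uniform embedding noise during SFT
)

trainer = SFTTrainer(
    model=model,                     # already LoRA-wrapped above
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()
```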
```yaml
wandb:
  project_name: "Phi-3-mini-128k-instruct-metrics"
```
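With `report_to: "wandb"`, the Trainer's built-in `WandbCallback` starts the run itself and reads the project name from the environment, so wiring this section up is one line:

```python
import os

# Picked up by the Trainer's WandbCallback when report_to="wandb".
os.environ["WANDB_PROJECT"] = "Phi-3-mini-128k-instruct-metrics"
```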