---
library_name: peft
base_model: athirdpath/BigMistral-13b
---
base_model: athirdpath/BigMistral-13b
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: true

load_in_4bit: true

datasets:
  - path: glueLORA2.jsonl
    type: alpaca
val_set_size: 0.07

adapter: qlora

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

lora_r: 512
lora_alpha: 32
lora_dropout: 0.04
lora_target_linear: true

gradient_accumulation_steps: 6
micro_batch_size: 3
eval_batch_size: 3
num_epochs: 4

optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.00005

bf16: true
gradient_checkpointing: true
flash_attention: true

warmup_steps: 10
weight_decay: 0.00001