from peft import LoraConfig, TaskType
from transformers import TrainingArguments
from trl import SFTTrainer

# 'max_seq_length' caps the number of tokens per training example.
max_seq_length = 500


# Debug helper: when passed as 'formatting_func', it prints the size of each
# batch it receives and returns the examples unchanged.
def fmt(examples):
    print(len(examples))
    return examples

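# 'fmt' above is only a debug stub. If the dataset did not already provide a
# "text" column, a real formatting function would build one string per example.
# A minimal sketch, assuming hypothetical "question" and "answer" columns:
def format_qa(examples):
    return [
        f"Question: {q}\nAnswer: {a}"
        for q, a in zip(examples["question"], examples["answer"])
    ]
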
# 'lora_r' is the rank (dimension) of the LoRA update matrices.
lora_r = 32


# 'lora_alpha' is the alpha parameter for LoRA scaling; with the standard
# (non-rank-stabilized) scaling, the LoRA update is multiplied by
# lora_alpha / lora_r = 16 / 32 = 0.5.
lora_alpha = 16


# 'lora_dropout' is the dropout probability applied to the LoRA layers.
lora_dropout = 0.05


# 'target_modules' lists the modules that LoRA adapters are attached to.
target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj']

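# The names above correspond to the attention and MLP projection layers of
# Llama-style models. As a quick sanity check (a sketch, assuming 'model' is
# already loaded), list which of these names actually occur in the model:
found = sorted({name.split('.')[-1] for name, _ in model.named_modules()
                if name.split('.')[-1] in target_modules})
print(found)  # all seven names should be present for a Llama-family model
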
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)
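
# Note that 'peft_config' is defined here but commented out in the SFTTrainer
# call below. If the adapters were to be applied to the model directly, a
# minimal sketch using peft would be:
#
#     from peft import get_peft_model
#     model = get_peft_model(model, peft_config)
#     model.print_trainable_parameters()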
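
# 'collator' is referenced below but not defined in this snippet (it is
# presumably created earlier). A common choice with SFTTrainer is trl's
# DataCollatorForCompletionOnlyLM, which masks the prompt tokens so the loss
# is computed only on the response. This is only a sketch: the response
# template is an assumption and must match the prompt format in the "text" column.
#
#     from trl import DataCollatorForCompletionOnlyLM
#     collator = DataCollatorForCompletionOnlyLM("Answer:", tokenizer=tokenizer)
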
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = qa_dataset['train'],
    eval_dataset = qa_dataset['test'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    data_collator = collator,
    # formatting_func = fmt,
    # peft_config = peft_config,
    args = TrainingArguments(
        # Effective train batch size per device: 12 * 4 accumulation steps = 48.
        per_device_train_batch_size = 12,
        gradient_checkpointing = True,
        gradient_accumulation_steps = 4,
        per_device_eval_batch_size = 40,
        do_eval = True,
        eval_strategy = 'steps',
        eval_steps = 50,
        # save_strategy defaults to 'steps', so save_steps takes effect.
        # save_strategy = 'steps',
        save_steps = 1000,


        # Use num_train_epochs and warmup_ratio for longer runs!
        # max_steps = 70,
        # warmup_steps = 10,
        # warmup_ratio = 0.1,
        num_train_epochs = 2,


        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 3e-5,
        # embedding_learning_rate = 1e-6,


        # fp16 = not is_bfloat16_supported(),
        bf16 = True,
        logging_steps = 1,
        optim = "adamw_torch",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        # seed = 3407,


        output_dir = "llama_3b_step2_batch_v6",
    ),
)
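

# Typical next step (if not already done elsewhere): run training and persist
# the result. The output sub-directory below is an assumption.
trainer_stats = trainer.train()
trainer.save_model("llama_3b_step2_batch_v6/final")
tokenizer.save_pretrained("llama_3b_step2_batch_v6/final")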