---
model_name: meta-llama/Meta-Llama-3-8B-Instruct
exp_name: Llama-3-8b-optimizers
run_name: Riemannion-WD-OI-new
base_dir: /home/vabogachev/RiemanianFinetune/runs/${exp_name}/${run_name}/final
cfg_no: 1

tasks:
  - FINETUNE
  - VALIDATE
  - INFERENCE

max_length: 768
dataset_path: /home/vabogachev/RiemanianFinetune/datasets/common_reasoning
n_shots: 0
fp16: false
bf16: true
num_ths: 3

loader_config:
  num_proc: ${num_ths}

tokenizer_config:
  padding_side: left

report_to: comet_ml
detailed_lora_logs: false
detailed_riemannian_logs: true

adapter_config:
  peft_pretrained: true
  peft_is_trainable: true
  merge_tuned: true
  peft_pretrained_path: ${base_dir}/finetuned_model_cfg-${cfg_no}
  ft_strategy: LoRA
  peft_init_path: ${base_dir}/initialization_${cfg_no}
  target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - up_proj
    - down_proj
    - gate_proj
  LoRA_config:
    r: 32
    lora_alpha: ${adapter_config.LoRA_config.r}
    lora_dropout: 0.05
    target_modules: ${adapter_config.target_modules}
    split2zero: true
    init_strategy: riemannian
    B_gain_npwr: 0.5
    B_gain_rpwr: -0.5
    B_gain_mult: -1.0

evaluation_config:
  num_splits: 10
  max_new_tokens: 4
  batch_size: 8
  empty_cache: true
  dump_path: ${base_dir}/cfg-${cfg_no}_preds_CR_{0}.bin

optimizer_config:
  optim: Riemannion
  lr: 0.0001
  momentum: 0.9
  nesterov: false
  weight_decay: 0.00316

trainer_config:
  run_name: ${exp_name}/${run_name}-final-cfg${cfg_no}
  output_dir: bogachevv/${exp_name}-${run_name}-final-cfg${cfg_no}
  max_seq_length: ${max_length}
  dataset_text_field: text
  fp16: ${fp16}
  bf16: ${bf16}
  full_determinism: false
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 16
  lr_scheduler_type: linear
  warmup_ratio: 0.1
  num_train_epochs: 2
  dataloader_num_workers: ${num_ths}
  dataset_num_proc: ${num_ths}
  eval_strategy: steps
  eval_steps: 128
  logging_steps: 16
  load_best_model_at_end: true
  seed: 23654
  data_seed: 23654
  report_to: ${report_to}
  disable_tqdm: true
  save_strategy: steps
  save_steps: 128
  save_total_limit: 4