bnb_cfgs:
  bnb_4bit_compute_dtype: float16
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  load_in_4bit: true
  load_in_8bit: false
  use_bnb: false
data_cfgs:
  eval_data_files: null
  eval_datasets: null
  eval_optional_args: []
  eval_size: null
  eval_split: null
  eval_subset: null
  eval_template: null
  train_data_files: Decomposition_all_merge_result.json
  train_datasets: /home/hansirui/pcwen/align-anything/data/
  train_name: null
  train_optional_args: []
  train_size: null
  train_split: train
  train_template: DO
logger_cfgs:
  cache_dir: null
  log_project: DOdataset_test
  log_run_name: DOdataset_Qwen2.5-3B
  log_type: wandb
  output_dir: /aifs4su/hansirui/pcwen_output/DO_models_Qwen2.5-3B
  save_interval: 100000
lora_cfgs:
  inference_mode: false
  lora_alpha: 16
  lora_dropout: 0.1
  r: 16
  save_full_model: true
  target_modules:
    - q_proj
    - v_proj
  task_type: TaskType.CAUSAL_LM
  use_lora: false
model_cfgs:
  model_max_length: 2048
  model_name_or_path: Qwen/Qwen2.5-3B
  trust_remote_code: true
special_tokens: null
train_cfgs:
  adam_betas:
    - 0.9
    - 0.95
  adam_epsilon: 1.0e-08
  bf16: true
  ds_cfgs: ds_z3_config.json
  epochs: 3
  eval_interval: 10
  eval_strategy: epoch
  fp16: false
  gradient_accumulation_steps: 16
  gradient_checkpointing: true
  learning_rate: 2.0e-05
  lr_scheduler_type: cosine
  lr_warmup_ratio: 0.03
  max_grad_norm: 1.0
  per_device_eval_batch_size: 4
  per_device_train_batch_size: 4
  seed: 42
  weight_decay: 0.0