# DO_models_Qwen2.5-3B / arguments.yaml
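# BitsAndBytes 4-bit quantization settings; presumably inert for this run since use_bnb is false.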
bnb_cfgs:
  bnb_4bit_compute_dtype: float16
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  load_in_4bit: true
  load_in_8bit: false
  use_bnb: false
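# Training data: a local Decomposition JSON file rendered with the "DO" chat template; no evaluation split is configured.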
data_cfgs:
  eval_data_files: null
  eval_datasets: null
  eval_optional_args: []
  eval_size: null
  eval_split: null
  eval_subset: null
  eval_template: null
  train_data_files: Decomposition_all_merge_result.json
  train_datasets: /home/hansirui/pcwen/align-anything/data/
  train_name: null
  train_optional_args: []
  train_size: null
  train_split: train
  train_template: DO
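# Logging goes to Weights & Biases; a save_interval of 100000 steps suggests intermediate checkpoints are rarely, if ever, written.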
logger_cfgs:
  cache_dir: null
  log_project: DOdataset_test
  log_run_name: DOdataset_Qwen2.5-3B
  log_type: wandb
  output_dir: /aifs4su/hansirui/pcwen_output/DO_models_Qwen2.5-3B
  save_interval: 100000
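# LoRA adapter settings; not applied here since use_lora is false, so all model parameters are presumably trained.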
lora_cfgs:
  inference_mode: false
  lora_alpha: 16
  lora_dropout: 0.1
  r: 16
  save_full_model: true
  target_modules:
    - q_proj
    - v_proj
  task_type: TaskType.CAUSAL_LM
  use_lora: false
model_cfgs:
  model_max_length: 2048
  model_name_or_path: Qwen/Qwen2.5-3B
  trust_remote_code: true
  special_tokens: null
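# Full fine-tuning of Qwen2.5-3B for 3 epochs under DeepSpeed ZeRO-3 (ds_z3_config.json).
# Per-device effective batch: per_device_train_batch_size 4 x gradient_accumulation_steps 16 = 64 samples per optimizer step, times the number of GPUs overall.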
train_cfgs:
  adam_betas:
    - 0.9
    - 0.95
  adam_epsilon: 1.0e-08
  bf16: true
  ds_cfgs: ds_z3_config.json
  epochs: 3
  eval_interval: 10
  eval_strategy: epoch
  fp16: false
  gradient_accumulation_steps: 16
  gradient_checkpointing: true
  learning_rate: 2.0e-05
  lr_scheduler_type: cosine
  lr_warmup_ratio: 0.03
  max_grad_norm: 1.0
  per_device_eval_batch_size: 4
  per_device_train_batch_size: 4
  seed: 42
  weight_decay: 0.0