# all_to_all / train_lqa_config.yaml
# Uploaded by choidami via huggingface_hub (commit 6822c61, verified).
---
# ---- Batching, decoder, and evaluation settings ----
batch_size: 64
# Decoder-side options; 'reflect' / 'default' semantics are defined by the
# training code — NOTE(review): confirm their meaning there.
decoder_assistance_role: reflect
decoder_base_conv_format: default
decoder_base_conv_format_kwargs: null
dropout: 0.0
eval_batch_size: 64
# Evaluation datasets: each split pairs an eval-task identifier with the
# pickle file holding its datapoints.
eval_data:
  heldout:
    - single_token_mc
    - ../../llama31_8b_data/eval_synthsys/heldout.pkl
  non_heldout:
    - single_token_mc
    - ../../llama31_8b_data/eval_synthsys/non_heldout.pkl
# Run evaluation every `eval_interval` training steps.
eval_interval: 100
# Rename eval splits for reporting.
eval_name_mapping:
  test_templ_mc: test_mc
  train_templ_mc: train_mc
eval_num_datapoints: 6400
eval_num_steps: null
# Conversation regions whose activations are patched during evaluation.
eval_patch_regions:
  - user
# FSDP settings (unused in this run: `use_fsdp` is false below).
fsdp_config:
  fsdp_activation_checkpointing: true
  fsdp_cpu_offload: false
  replica_group_size: 0
  sharding_group_size: 0
  sharding_strategy: 1
gradient_accumulation_steps: null
gradient_clipping: false
gradient_clipping_threshold: 1.0
# Base model to fine-tune.
hf_model_id: meta-llama/Llama-3.1-8B-Instruct
# Identity mapping over all 32 Llama-3.1-8B decoder layers (source layer as a
# string key -> target layer as an int). Keys appear in the lexicographic
# order produced by the YAML dumper; the mapping itself is unordered.
# NOTE(review): exact read->write patching semantics live in the training
# code — confirm there.
layer_mapping:
  '0': 0
  '1': 1
  '10': 10
  '11': 11
  '12': 12
  '13': 13
  '14': 14
  '15': 15
  '16': 16
  '17': 17
  '18': 18
  '19': 19
  '2': 2
  '20': 20
  '21': 21
  '22': 22
  '23': 23
  '24': 24
  '25': 25
  '26': 26
  '27': 27
  '28': 28
  '29': 29
  '3': 3
  '30': 30
  '31': 31
  '4': 4
  '5': 5
  '6': 6
  '7': 7
  '8': 8
  '9': 9
# Single-layer read/write selection (distinct from the all-to-all mapping
# above; which one applies is decided by the training code).
layer_to_read: 21
layer_to_write: 1
log_interval: 100
# PEFT/LoRA adapter configuration (active in this run: `use_peft` is true).
lora_config:
  lora_alpha: 32
  # 'none' here is the PEFT option string, not YAML null — leave unquoted
  # only if the loader expects the plain string "none".
  lora_bias: none
  lora_dropout: 0.05
  lora_r: 16
  # Adapt all attention and MLP projections plus the LM head.
  lora_target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj
    - lm_head
  lora_task_type: CAUSAL_LM
# Optimizer / schedule hyperparameters.
lr: 0.0001
micro_batch_size: 8
min_lr_ratio: 0.1
new_context_tokens: null
num_steps: 5000
pretrain_data_config: null
# Read the residual-stream activations of every decoder layer (0-31).
read_layer_module_keys:
  - layer: 0
    module: residual
  - layer: 1
    module: residual
  - layer: 2
    module: residual
  - layer: 3
    module: residual
  - layer: 4
    module: residual
  - layer: 5
    module: residual
  - layer: 6
    module: residual
  - layer: 7
    module: residual
  - layer: 8
    module: residual
  - layer: 9
    module: residual
  - layer: 10
    module: residual
  - layer: 11
    module: residual
  - layer: 12
    module: residual
  - layer: 13
    module: residual
  - layer: 14
    module: residual
  - layer: 15
    module: residual
  - layer: 16
    module: residual
  - layer: 17
    module: residual
  - layer: 18
    module: residual
  - layer: 19
    module: residual
  - layer: 20
    module: residual
  - layer: 21
    module: residual
  - layer: 22
    module: residual
  - layer: 23
    module: residual
  - layer: 24
    module: residual
  - layer: 25
    module: residual
  - layer: 26
    module: residual
  - layer: 27
    module: residual
  - layer: 28
    module: residual
  - layer: 29
    module: residual
  - layer: 30
    module: residual
  - layer: 31
    module: residual
# ---- Checkpointing, data paths, and experiment tracking ----
save_final_checkpoint: true
save_interval: 500
save_path: latentqa/llama31_8b_experiments/all_to_all
seed: 7236
train_data_path: ../../llama31_8b_data/train.pkl
# Conversation regions whose activations are patched during training.
train_patch_regions:
  - user
use_fsdp: false
use_peft: true
use_wandb: true
valid_data_path: null
wandb_group_name: llama31_8b_experiments
wandb_project: latentqa
wandb_run_name: all_to_all
warmup_steps: 0
weight_decay: 0.01
# Write (patch) the residual-stream activations of every decoder layer
# (0-31); mirrors `read_layer_module_keys` for the all-to-all setup.
write_layer_module_keys:
  - layer: 0
    module: residual
  - layer: 1
    module: residual
  - layer: 2
    module: residual
  - layer: 3
    module: residual
  - layer: 4
    module: residual
  - layer: 5
    module: residual
  - layer: 6
    module: residual
  - layer: 7
    module: residual
  - layer: 8
    module: residual
  - layer: 9
    module: residual
  - layer: 10
    module: residual
  - layer: 11
    module: residual
  - layer: 12
    module: residual
  - layer: 13
    module: residual
  - layer: 14
    module: residual
  - layer: 15
    module: residual
  - layer: 16
    module: residual
  - layer: 17
    module: residual
  - layer: 18
    module: residual
  - layer: 19
    module: residual
  - layer: 20
    module: residual
  - layer: 21
    module: residual
  - layer: 22
    module: residual
  - layer: 23
    module: residual
  - layer: 24
    module: residual
  - layer: 25
    module: residual
  - layer: 26
    module: residual
  - layer: 27
    module: residual
  - layer: 28
    module: residual
  - layer: 29
    module: residual
  - layer: 30
    module: residual
  - layer: 31
    module: residual