| batch_size: 64 | |
| decoder_assistance_role: reflect | |
| decoder_base_conv_format: default | |
| decoder_base_conv_format_kwargs: null | |
| dropout: 0.0 | |
| eval_batch_size: 64 | |
| eval_data: | |
| heldout: | |
| - single_token_mc | |
| - ../../llama31_8b_data/eval_synthsys/heldout.pkl | |
| non_heldout: | |
| - single_token_mc | |
| - ../../llama31_8b_data/eval_synthsys/non_heldout.pkl | |
| eval_interval: 100 | |
| eval_name_mapping: | |
| test_templ_mc: test_mc | |
| train_templ_mc: train_mc | |
| eval_num_datapoints: 6400 | |
| eval_num_steps: null | |
| eval_patch_regions: | |
| - user | |
| fsdp_config: | |
| fsdp_activation_checkpointing: true | |
| fsdp_cpu_offload: false | |
| replica_group_size: 0 | |
| sharding_group_size: 0 | |
| sharding_strategy: 1 | |
| gradient_accumulation_steps: null | |
| gradient_clipping: false | |
| gradient_clipping_threshold: 1.0 | |
| hf_model_id: meta-llama/Llama-3.1-8B-Instruct | |
| layer_mapping: | |
| '0': 0 | |
| '1': 1 | |
| '10': 10 | |
| '11': 11 | |
| '12': 12 | |
| '13': 13 | |
| '14': 14 | |
| '15': 15 | |
| '16': 16 | |
| '17': 17 | |
| '18': 18 | |
| '19': 19 | |
| '2': 2 | |
| '20': 20 | |
| '21': 21 | |
| '22': 22 | |
| '23': 23 | |
| '24': 24 | |
| '25': 25 | |
| '26': 26 | |
| '27': 27 | |
| '28': 28 | |
| '29': 29 | |
| '3': 3 | |
| '30': 30 | |
| '31': 31 | |
| '4': 4 | |
| '5': 5 | |
| '6': 6 | |
| '7': 7 | |
| '8': 8 | |
| '9': 9 | |
| layer_to_read: 21 | |
| layer_to_write: 1 | |
| log_interval: 100 | |
| lora_config: | |
| lora_alpha: 32 | |
| lora_bias: none | |
| lora_dropout: 0.05 | |
| lora_r: 16 | |
| lora_target_modules: | |
| - q_proj | |
| - k_proj | |
| - v_proj | |
| - o_proj | |
| - gate_proj | |
| - up_proj | |
| - down_proj | |
| - lm_head | |
| lora_task_type: CAUSAL_LM | |
| lr: 0.0001 | |
| micro_batch_size: 8 | |
| min_lr_ratio: 0.1 | |
| new_context_tokens: null | |
| num_steps: 5000 | |
| pretrain_data_config: null | |
| read_layer_module_keys: | |
| - layer: 0 | |
| module: residual | |
| - layer: 1 | |
| module: residual | |
| - layer: 2 | |
| module: residual | |
| - layer: 3 | |
| module: residual | |
| - layer: 4 | |
| module: residual | |
| - layer: 5 | |
| module: residual | |
| - layer: 6 | |
| module: residual | |
| - layer: 7 | |
| module: residual | |
| - layer: 8 | |
| module: residual | |
| - layer: 9 | |
| module: residual | |
| - layer: 10 | |
| module: residual | |
| - layer: 11 | |
| module: residual | |
| - layer: 12 | |
| module: residual | |
| - layer: 13 | |
| module: residual | |
| - layer: 14 | |
| module: residual | |
| - layer: 15 | |
| module: residual | |
| - layer: 16 | |
| module: residual | |
| - layer: 17 | |
| module: residual | |
| - layer: 18 | |
| module: residual | |
| - layer: 19 | |
| module: residual | |
| - layer: 20 | |
| module: residual | |
| - layer: 21 | |
| module: residual | |
| - layer: 22 | |
| module: residual | |
| - layer: 23 | |
| module: residual | |
| - layer: 24 | |
| module: residual | |
| - layer: 25 | |
| module: residual | |
| - layer: 26 | |
| module: residual | |
| - layer: 27 | |
| module: residual | |
| - layer: 28 | |
| module: residual | |
| - layer: 29 | |
| module: residual | |
| - layer: 30 | |
| module: residual | |
| - layer: 31 | |
| module: residual | |
| save_final_checkpoint: true | |
| save_interval: 500 | |
| save_path: latentqa/llama31_8b_experiments/all_to_all | |
| seed: 7236 | |
| train_data_path: ../../llama31_8b_data/train.pkl | |
| train_patch_regions: | |
| - user | |
| use_fsdp: false | |
| use_peft: true | |
| use_wandb: true | |
| valid_data_path: null | |
| wandb_group_name: llama31_8b_experiments | |
| wandb_project: latentqa | |
| wandb_run_name: all_to_all | |
| warmup_steps: 0 | |
| weight_decay: 0.01 | |
| write_layer_module_keys: | |
| - layer: 0 | |
| module: residual | |
| - layer: 1 | |
| module: residual | |
| - layer: 2 | |
| module: residual | |
| - layer: 3 | |
| module: residual | |
| - layer: 4 | |
| module: residual | |
| - layer: 5 | |
| module: residual | |
| - layer: 6 | |
| module: residual | |
| - layer: 7 | |
| module: residual | |
| - layer: 8 | |
| module: residual | |
| - layer: 9 | |
| module: residual | |
| - layer: 10 | |
| module: residual | |
| - layer: 11 | |
| module: residual | |
| - layer: 12 | |
| module: residual | |
| - layer: 13 | |
| module: residual | |
| - layer: 14 | |
| module: residual | |
| - layer: 15 | |
| module: residual | |
| - layer: 16 | |
| module: residual | |
| - layer: 17 | |
| module: residual | |
| - layer: 18 | |
| module: residual | |
| - layer: 19 | |
| module: residual | |
| - layer: 20 | |
| module: residual | |
| - layer: 21 | |
| module: residual | |
| - layer: 22 | |
| module: residual | |
| - layer: 23 | |
| module: residual | |
| - layer: 24 | |
| module: residual | |
| - layer: 25 | |
| module: residual | |
| - layer: 26 | |
| module: residual | |
| - layer: 27 | |
| module: residual | |
| - layer: 28 | |
| module: residual | |
| - layer: 29 | |
| module: residual | |
| - layer: 30 | |
| module: residual | |
| - layer: 31 | |
| module: residual | |