_n_gpu: 1
adafactor: false
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-08
aggregator_type: perceiver
auto_find_batch_size: false
average_tokens_across_devices: false
batch_eval_metrics: true
bf16: true
bf16_full_eval: false
ctx_encoder_model_name_or_path: null
ctx_encoder_type: per_layer_activations
data_seed: null
dataloader_drop_last: false
dataloader_num_workers: 8
dataloader_persistent_workers: false
dataloader_pin_memory: true
dataloader_prefetch_factor: 16
ddp_backend: null
ddp_broadcast_buffers: null
ddp_bucket_cap_mb: null
ddp_find_unused_parameters: false
ddp_timeout: 1048576
debug: []
deepspeed: null
deepspeed_plugin: null
disable_tqdm: false
do_eval: true
do_predict: false
do_train: false
dropout_rate: 0.0
eval_accumulation_steps: null
eval_delay: 0
eval_do_concat_batches: true
eval_on_start: false
eval_steps: 1000
eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- 'no'
eval_use_gather_object: false
exp_setup: !!python/object/apply:ctx_to_lora.configs.ExperimentSetup
- hyper_lora
extra_modules: null
fp16: false
fp16_backend: auto
fp16_full_eval: false
fp16_opt_level: O1
from_pretrained_checkpoint: train_outputs/runs/Sep29_14-42-46_slurm0-a3nodeset-9_88483_1e7bb34e/checkpoint-80000/pytorch_model.bin
fsdp: []
fsdp_config:
  min_num_params: 0
  xla: false
  xla_fsdp_grad_ckpt: false
  xla_fsdp_v2: false
fsdp_min_num_params: 0
fsdp_transformer_layer_cls_to_wrap: null
full_determinism: false
gen_lora_l1_reg_coef: 0.1
gen_per_device_eval_batch_size: 1
gradient_accumulation_steps: 16
gradient_checkpointing: false
gradient_checkpointing_kwargs: null
greater_is_better: null
group_by_length: false
half_precision_backend: auto
hub_always_push: false
hub_model_id: null
hub_private_repo: null
hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
- every_save
hub_token: null
ignore_data_skip: false
include_inputs_for_metrics: false
include_num_input_tokens_seen: false
include_tokens_per_second: false
jit_mode_eval: false
label_smoothing_factor: 0.0
latent_size: 512
layer_idx: null
learning_rate: 2.0e-05
length_column_name: length
light_weight_latent_size: 128
load_best_model_at_end: false
local_rank: 4
log_level: passive
log_level_replica: warning
log_on_each_node: true
logging_dir: train_outputs/runs/Oct10_12-53-47_slurm0-a3nodeset-2_93442_fd3c1230
logging_first_step: true
logging_nan_inf_filter: true
logging_steps: 100
logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- steps
lora_dropout: 0.0
lora_r: 8
lr_scheduler_kwargs:
  min_lr: 1.0e-07
lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
- cosine_with_min_lr
max_base_len: 8192
max_ctx_chunk_len: 512
max_ctx_chunk_num: null
max_ctx_len: -1
max_grad_norm: 1.0
max_new_tokens: 256
max_packed_ctx_len: 2048
max_packed_inp_len: 1024
max_qas_len: 512
max_qas_per_sample: 1
max_steps: 20000
max_train_samples_per_ds: null
max_val_samples_per_ds: 1000
metric_for_best_model: null
min_ctx_chunk_len: 25
model_name_or_path: google/gemma-2-2b-it
mp_parameters: ''
n_latent_queries: 8
neftune_noise_alpha: 5.0
no_cuda: false
notes: null
num_blocks: 9
num_chunk_probs:
  '1': '0.5'
  '2': '0.125'
  '3': '0.0625'
  '4': '0.0625'
  '5': '0.0625'
  '6': '0.0625'
  '7': '0.0625'
  '8': '0.0625'
num_latent_factor: 8
num_pre_head_layers: 1
num_self_attn_per_block: 0
num_train_epochs: 3.0
optim: !!python/object/apply:transformers.training_args.OptimizerNames
- adamw_torch_fused
optim_args: null
optim_target_modules: null
output_dir: train_outputs/runs/Oct10_12-53-47_slurm0-a3nodeset-2_93442_fd3c1230
overwrite_output_dir: false
past_index: -1
per_device_eval_batch_size: 64
per_device_train_batch_size: 1
per_gpu_eval_batch_size: null
per_gpu_train_batch_size: null
per_layer_processing: true
per_rank_gen: true
pooling_type: mean
prediction_loss_only: false
push_to_hub: false
push_to_hub_model_id: null
push_to_hub_organization: null
push_to_hub_token: null
quantize_ctx_encoder: true
ray_scope: last
remove_unused_columns: false
report_to:
- tensorboard
- wandb
restore_callback_states_from_checkpoint: false
resume_from_checkpoint: null
run_name: Oct10_12-53-47_slurm0-a3nodeset-2_93442_fd3c1230
save_on_each_node: false
save_only_model: false
save_safetensors: false
save_steps: 5000
save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
- steps
save_total_limit: 2
seed: 42
shared_weights: false
skip_memory_metrics: true
streaming: false
target_modules:
- down_proj
test_ds_names: null
tf32: true
torch_compile: false
torch_compile_backend: null
torch_compile_mode: null
torch_empty_cache_steps: 10
torchdynamo: null
tp_size: 0
tpu_metrics_debug: false
tpu_num_cores: null
train_ds_names:
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/fw_qa_v2/min_0_to_2000/train/*level_1*.parquet
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_0.0/pwc_compact
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/squad_compact
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/ropes_compact
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/drop_compact
use_bias: true
use_cpu: false
use_flash_attn: true
use_ipex: false
use_kl_loss: true
use_legacy_prediction_loop: false
use_liger_kernel: false
use_light_weight_lora: false
use_mps_device: false
use_per_ctx_average_loss: true
use_per_rank_bias: false
use_sequence_packing: true
use_token_mixing: false
val_ds_names:
- squad
- pwc
- drop
- ropes
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_0.0/fw_qa_v2/min_0_to_2000/train/*level_0_val*.parquet
warmup_ratio: 0.0
warmup_steps: 2000
weight_decay: 0.01