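# Training configuration dump: Hugging Face TrainingArguments extended with
# ctx_to_lora-specific fields for a hyper_lora run on google/gemma-2-2b-it.
# Keys are sorted alphabetically, so related settings are interleaved.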
_n_gpu: 1
adafactor: false
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-08
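# Project-specific: 'perceiver' presumably selects a Perceiver-style
# cross-attention aggregator over the encoded context.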
aggregator_type: perceiver
auto_find_batch_size: false
average_tokens_across_devices: false
batch_eval_metrics: true
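# Mixed precision: bf16 autocast for training; eval is not forced to full
# bf16. TF32 matmuls are also enabled (tf32: true below).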
bf16: true
bf16_full_eval: false
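# Context encoder: no separate checkpoint; per_layer_activations suggests the
# base model's own per-layer activations serve as the context representation.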
ctx_encoder_model_name_or_path: null
ctx_encoder_type: per_layer_activations
data_seed: null
dataloader_drop_last: false
dataloader_num_workers: 8
dataloader_persistent_workers: false
dataloader_pin_memory: true
dataloader_prefetch_factor: 16
ddp_backend: null
ddp_broadcast_buffers: null
ddp_bucket_cap_mb: null
ddp_find_unused_parameters: false
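# ddp_timeout is 2^20 seconds (~12 days), presumably to avoid NCCL timeouts
# during long generation-heavy phases.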
ddp_timeout: 1048576
debug: []
deepspeed: null
deepspeed_plugin: null
disable_tqdm: false
do_eval: true
do_predict: false
do_train: false
dropout_rate: 0.0
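# Note: eval_steps is set, but eval_strategy (below) is 'no', so the Trainer
# does not evaluate on a step schedule; with do_eval: true, evaluation is
# presumably triggered by explicit evaluate() calls in the training script.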
eval_accumulation_steps: null
eval_delay: 0
eval_do_concat_batches: true
eval_on_start: false
eval_steps: 1000
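# Enum fields below are serialized with PyYAML !!python/object/apply tags;
# yaml.safe_load rejects these, so reload with yaml.unsafe_load or with the
# matching constructors registered.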
eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- 'no'
eval_use_gather_object: false
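# Experiment mode: hyper_lora — by the name, a hypernetwork that generates
# LoRA weights conditioned on context.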
exp_setup: !!python/object/apply:ctx_to_lora.configs.ExperimentSetup
- hyper_lora
extra_modules: null
fp16: false
fp16_backend: auto
fp16_full_eval: false
fp16_opt_level: O1
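# Warm start: weights loaded from step 80000 of an earlier run. This is
# distinct from resume_from_checkpoint (null below), so optimizer and
# scheduler state presumably start fresh.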
from_pretrained_checkpoint: train_outputs/runs/Sep29_14-42-46_slurm0-a3nodeset-9_88483_1e7bb34e/checkpoint-80000/pytorch_model.bin
fsdp: []
fsdp_config:
min_num_params: 0
xla: false
xla_fsdp_grad_ckpt: false
xla_fsdp_v2: false
fsdp_min_num_params: 0
fsdp_transformer_layer_cls_to_wrap: null
full_determinism: false
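# Inferred from the key names: an L1 penalty (coef 0.1) on the generated LoRA
# parameters, and per-device batch size 1 for generation-based evaluation.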
gen_lora_l1_reg_coef: 0.1
gen_per_device_eval_batch_size: 1
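# With per_device_train_batch_size: 1 (below), accumulating 16 steps gives an
# effective batch of 16 sequences per device per optimizer update.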
gradient_accumulation_steps: 16
gradient_checkpointing: false
gradient_checkpointing_kwargs: null
greater_is_better: null
group_by_length: false
half_precision_backend: auto
hub_always_push: false
hub_model_id: null
hub_private_repo: null
hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
- every_save
hub_token: null
ignore_data_skip: false
include_inputs_for_metrics: false
include_num_input_tokens_seen: false
include_tokens_per_second: false
jit_mode_eval: false
label_smoothing_factor: 0.0
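# Hypernetwork capacity knobs: latent_size 512 (plus an unused lightweight
# 128 variant; use_light_weight_lora is false below), 8 latent queries,
# 9 blocks — presumably the Perceiver aggregator's dimensions.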
latent_size: 512
layer_idx: null
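# Schedule: cosine from 2e-5 down to min_lr 1e-7 (lr_scheduler_kwargs below)
# after 2000 warmup steps; a positive warmup_steps overrides warmup_ratio.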
learning_rate: 2.0e-05
length_column_name: length
light_weight_latent_size: 128
load_best_model_at_end: false
local_rank: 4
log_level: passive
log_level_replica: warning
log_on_each_node: true
logging_dir: train_outputs/runs/Oct10_12-53-47_slurm0-a3nodeset-2_93442_fd3c1230
logging_first_step: true
logging_nan_inf_filter: true
logging_steps: 100
logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- steps
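# Generated LoRA adapters: rank 8, no dropout, applied to down_proj only
# (see target_modules below).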
lora_dropout: 0.0
lora_r: 8
lr_scheduler_kwargs:
min_lr: 1.0e-07
lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
- cosine_with_min_lr
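# Length budget for context chunking/packing: 8192-token base, 512-token
# context chunks, packed context up to 2048 and packed inputs up to 1024
# tokens; max_ctx_len: -1 presumably means uncapped.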
max_base_len: 8192
max_ctx_chunk_len: 512
max_ctx_chunk_num: null
max_ctx_len: -1
max_grad_norm: 1.0
max_new_tokens: 256
max_packed_ctx_len: 2048
max_packed_inp_len: 1024
max_qas_len: 512
max_qas_per_sample: 1
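# Training runs for 20000 optimizer steps; a positive max_steps overrides
# num_train_epochs (3.0 below).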
max_steps: 20000
max_train_samples_per_ds: null
max_val_samples_per_ds: 1000
metric_for_best_model: null
min_ctx_chunk_len: 25
model_name_or_path: google/gemma-2-2b-it
mp_parameters: ''
n_latent_queries: 8
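# NEFTune noise on input embeddings is enabled (alpha 5.0).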
neftune_noise_alpha: 5.0
no_cuda: false
notes: null
num_blocks: 9
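# Distribution over the number of context chunks sampled per example; values
# are string-typed in the dump and sum to 1.0 (0.5 + 0.125 + 6 * 0.0625).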
num_chunk_probs:
'1': '0.5'
'2': '0.125'
'3': '0.0625'
'4': '0.0625'
'5': '0.0625'
'6': '0.0625'
'7': '0.0625'
'8': '0.0625'
num_latent_factor: 8
num_pre_head_layers: 1
num_self_attn_per_block: 0
num_train_epochs: 3.0
optim: !!python/object/apply:transformers.training_args.OptimizerNames
- adamw_torch_fused
optim_args: null
optim_target_modules: null
output_dir: train_outputs/runs/Oct10_12-53-47_slurm0-a3nodeset-2_93442_fd3c1230
overwrite_output_dir: false
past_index: -1
per_device_eval_batch_size: 64
per_device_train_batch_size: 1
per_gpu_eval_batch_size: null
per_gpu_train_batch_size: null
per_layer_processing: true
per_rank_gen: true
pooling_type: mean
prediction_loss_only: false
push_to_hub: false
push_to_hub_model_id: null
push_to_hub_organization: null
push_to_hub_token: null
quantize_ctx_encoder: true
ray_scope: last
remove_unused_columns: false
report_to:
- tensorboard
- wandb
restore_callback_states_from_checkpoint: false
resume_from_checkpoint: null
run_name: Oct10_12-53-47_slurm0-a3nodeset-2_93442_fd3c1230
save_on_each_node: false
save_only_model: false
save_safetensors: false
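# Checkpoint every 5000 steps, keeping only the 2 most recent; with
# save_safetensors: false, weights are written as pytorch_model.bin.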
save_steps: 5000
save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
- steps
save_total_limit: 2
seed: 42
shared_weights: false
skip_memory_metrics: true
streaming: false
target_modules:
- down_proj
test_ds_names: null
tf32: true
torch_compile: false
torch_compile_backend: null
torch_compile_mode: null
torch_empty_cache_steps: 10
torchdynamo: null
tp_size: 0
tpu_metrics_debug: false
tpu_num_cores: null
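# Training mixture: self-generated QA (gemma-2-2b-it, temperature 0.0) over
# fw_qa_v2 level_1 shards plus compact PWC, SQuAD, ROPES, and DROP sets;
# closed_qa_prob in the paths presumably records the closed-QA fraction used
# at generation time.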
train_ds_names:
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/fw_qa_v2/min_0_to_2000/train/*level_1*.parquet
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_0.0/pwc_compact
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/squad_compact
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/ropes_compact
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/drop_compact
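# Project switches: FlashAttention, a KL loss term, per-context loss
# averaging, and sequence packing are enabled; token mixing, per-rank bias,
# and the lightweight LoRA variant are off.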
use_bias: true
use_cpu: false
use_flash_attn: true
use_ipex: false
use_kl_loss: true
use_legacy_prediction_loop: false
use_liger_kernel: false
use_light_weight_lora: false
use_mps_device: false
use_per_ctx_average_loss: true
use_per_rank_bias: false
use_sequence_packing: true
use_token_mixing: false
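# Validation: SQuAD, PWC, DROP, and ROPES plus a held-out self-generated
# fw_qa_v2 split (level_0_val), capped at 1000 samples per dataset
# (max_val_samples_per_ds above).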
val_ds_names:
- squad
- pwc
- drop
- ropes
- self_gen/google/gemma-2-2b-it_temp_0.0_closed_qa_prob_0.0/fw_qa_v2/min_0_to_2000/train/*level_0_val*.parquet
warmup_ratio: 0.0
warmup_steps: 2000
weight_decay: 0.01