---
# Serialized training configuration (HuggingFace TrainingArguments dump plus
# custom checkpoint/S3 keys at the bottom). Restored to one key per line: the
# previous form had all pairs on two physical lines, which is not parseable
# YAML (plain scalars cannot contain ': ').
#
# NOTE(review): values were stringified by the producer — 'False'/'True' are
# the STRINGS "False"/"True" (not booleans), '8' is the string "8", and bare
# None parses as the string "None" (not null). Kept byte-for-byte so any
# consumer that re-evaluates these Python reprs still works; confirm with the
# loader before retyping them as native YAML booleans/ints/nulls.

# --- core run / output ---
output_dir: checkpoints/EleutherAI/pythia-14m
overwrite_output_dir: 'False'
do_train: 'False'
do_eval: 'False'
do_predict: 'False'
eval_strategy: IntervalStrategy.NO  # Python enum repr, kept verbatim
prediction_loss_only: 'False'

# --- batching / accumulation ---
per_device_train_batch_size: '8'
per_device_eval_batch_size: '8'
per_gpu_train_batch_size: None
per_gpu_eval_batch_size: None
gradient_accumulation_steps: '8'
eval_accumulation_steps: None
eval_delay: '0'
torch_empty_cache_steps: None

# --- optimization ---
learning_rate: '0.001'
weight_decay: '0.0'
adam_beta1: '0.9'
adam_beta2: '0.999'
adam_epsilon: 1e-08
max_grad_norm: '1.0'
num_train_epochs: '3.0'
max_steps: '-1'
lr_scheduler_type: SchedulerType.LINEAR
lr_scheduler_kwargs: '{}'
warmup_ratio: '0.0'
warmup_steps: '0'

# --- logging ---
log_level: warning
log_level_replica: warning
log_on_each_node: 'True'
logging_dir: checkpoints/EleutherAI/pythia-14m/runs/Jul16_19-19-58_0082549b2b6f
logging_strategy: IntervalStrategy.STEPS
logging_first_step: 'True'
logging_steps: '250'
logging_nan_inf_filter: 'True'

# --- checkpointing ---
save_strategy: IntervalStrategy.STEPS
save_steps: '300'
save_total_limit: None
save_safetensors: 'True'
save_on_each_node: 'False'
save_only_model: 'False'
restore_callback_states_from_checkpoint: 'False'

# --- hardware / precision ---
no_cuda: 'False'
use_cpu: 'False'
use_mps_device: 'False'
seed: '42'
data_seed: None
jit_mode_eval: 'False'
use_ipex: 'False'
bf16: 'False'
fp16: 'False'
fp16_opt_level: O1
half_precision_backend: auto
bf16_full_eval: 'False'
fp16_full_eval: 'False'
tf32: None
local_rank: '0'
ddp_backend: None
tpu_num_cores: None
tpu_metrics_debug: 'False'
debug: '[]'

# --- dataloader / evaluation ---
dataloader_drop_last: 'False'
eval_steps: None
dataloader_num_workers: '0'
dataloader_prefetch_factor: None
past_index: '-1'
run_name: EleutherAI/pythia-14m_distilled_from_pythia-14m
disable_tqdm: 'False'
remove_unused_columns: 'False'
label_names: '[''input_ids'']'
load_best_model_at_end: 'False'
metric_for_best_model: None
greater_is_better: None
ignore_data_skip: 'False'

# --- distributed (FSDP / accelerate / deepspeed) ---
fsdp: '[]'
fsdp_min_num_params: '0'
# Single-quoted Python-dict repr; '' is the YAML escape for a literal quote.
fsdp_config: '{''min_num_params'': 0, ''xla'': False, ''xla_fsdp_v2'': False, ''xla_fsdp_grad_ckpt'': False}'
fsdp_transformer_layer_cls_to_wrap: None
accelerator_config: '{''split_batches'': False, ''dispatch_batches'': None, ''even_batches'': True, ''use_seedable_sampler'': True, ''non_blocking'': False, ''gradient_accumulation_kwargs'': None, ''use_configured_state'': False}'
deepspeed: None
label_smoothing_factor: '0.0'
optim: OptimizerNames.ADAMW_TORCH
optim_args: None
adafactor: 'False'
group_by_length: 'False'
length_column_name: length
report_to: '[''wandb'']'
ddp_find_unused_parameters: None
ddp_bucket_cap_mb: None
ddp_broadcast_buffers: None
dataloader_pin_memory: 'True'
dataloader_persistent_workers: 'False'
skip_memory_metrics: 'True'
use_legacy_prediction_loop: 'False'

# --- hub ---
push_to_hub: 'True'
resume_from_checkpoint: None
hub_model_id: test-distillation
hub_strategy: HubStrategy.EVERY_SAVE
hub_token: None
hub_private_repo: 'False'
hub_always_push: 'False'

# --- misc training flags ---
gradient_checkpointing: 'False'
gradient_checkpointing_kwargs: None
include_inputs_for_metrics: 'False'
eval_do_concat_batches: 'True'
fp16_backend: auto
evaluation_strategy: None  # deprecated alias of eval_strategy in HF; kept verbatim
push_to_hub_model_id: None
push_to_hub_organization: None
push_to_hub_token: None
_n_gpu: '1'
mp_parameters: ''
auto_find_batch_size: 'False'
full_determinism: 'False'
torchdynamo: None
ray_scope: last
ddp_timeout: '1800'
torch_compile: 'False'
torch_compile_backend: None
torch_compile_mode: None
dispatch_batches: None
split_batches: None
include_tokens_per_second: 'False'
include_num_input_tokens_seen: 'False'
neftune_noise_alpha: None
optim_target_modules: None
batch_eval_metrics: 'False'
eval_on_start: 'False'
use_liger_kernel: 'False'
eval_use_gather_object: 'False'

# --- custom checkpoint / S3 settings (not TrainingArguments fields) ---
checkpoints_dir: .//checkpoints/
init_step: '0'
save_log_steps: '250'
bucket_name: devinterp-language
s3_folder: checkpoints/tetrahedron-3m
delete_after_upload: 'False'
push_to_aws: 'False'