| output_dir: checkpoints/EleutherAI/pythia-14m | |
| overwrite_output_dir: 'False' | |
| do_train: 'False' | |
| do_eval: 'False' | |
| do_predict: 'False' | |
| eval_strategy: IntervalStrategy.NO | |
| prediction_loss_only: 'False' | |
| per_device_train_batch_size: '8' | |
| per_device_eval_batch_size: '8' | |
| per_gpu_train_batch_size: None | |
| per_gpu_eval_batch_size: None | |
| gradient_accumulation_steps: '8' | |
| eval_accumulation_steps: None | |
| eval_delay: '0' | |
| torch_empty_cache_steps: None | |
| learning_rate: '0.001' | |
| weight_decay: '0.0' | |
| adam_beta1: '0.9' | |
| adam_beta2: '0.999' | |
| adam_epsilon: 1e-08 | |
| max_grad_norm: '1.0' | |
| num_train_epochs: '3.0' | |
| max_steps: '-1' | |
| lr_scheduler_type: SchedulerType.LINEAR | |
| lr_scheduler_kwargs: '{}' | |
| warmup_ratio: '0.0' | |
| warmup_steps: '0' | |
| log_level: warning | |
| log_level_replica: warning | |
| log_on_each_node: 'True' | |
| logging_dir: checkpoints/EleutherAI/pythia-14m/runs/Jul16_19-19-58_0082549b2b6f | |
| logging_strategy: IntervalStrategy.STEPS | |
| logging_first_step: 'True' | |
| logging_steps: '250' | |
| logging_nan_inf_filter: 'True' | |
| save_strategy: IntervalStrategy.STEPS | |
| save_steps: '300' | |
| save_total_limit: None | |
| save_safetensors: 'True' | |
| save_on_each_node: 'False' | |
| save_only_model: 'False' | |
| restore_callback_states_from_checkpoint: 'False' | |
| no_cuda: 'False' | |
| use_cpu: 'False' | |
| use_mps_device: 'False' | |
| seed: '42' | |
| data_seed: None | |
| jit_mode_eval: 'False' | |
| use_ipex: 'False' | |
| bf16: 'False' | |
| fp16: 'False' | |
| fp16_opt_level: O1 | |
| half_precision_backend: auto | |
| bf16_full_eval: 'False' | |
| fp16_full_eval: 'False' | |
| tf32: None | |
| local_rank: '0' | |
| ddp_backend: None | |
| tpu_num_cores: None | |
| tpu_metrics_debug: 'False' | |
| debug: '[]' | |
| dataloader_drop_last: 'False' | |
| eval_steps: None | |
| dataloader_num_workers: '0' | |
| dataloader_prefetch_factor: None | |
| past_index: '-1' | |
| run_name: EleutherAI/pythia-14m_distilled_from_pythia-14m | |
| disable_tqdm: 'False' | |
| remove_unused_columns: 'False' | |
| label_names: '[''input_ids'']' | |
| load_best_model_at_end: 'False' | |
| metric_for_best_model: None | |
| greater_is_better: None | |
| ignore_data_skip: 'False' | |
| fsdp: '[]' | |
| fsdp_min_num_params: '0' | |
| fsdp_config: '{''min_num_params'': 0, ''xla'': False, ''xla_fsdp_v2'': False, ''xla_fsdp_grad_ckpt'': False}' | |
| fsdp_transformer_layer_cls_to_wrap: None | |
| accelerator_config: '{''split_batches'': False, ''dispatch_batches'': None, ''even_batches'': True, ''use_seedable_sampler'': True, ''non_blocking'': False, ''gradient_accumulation_kwargs'': None, ''use_configured_state'': False}' | |
| deepspeed: None | |
| label_smoothing_factor: '0.0' | |
| optim: OptimizerNames.ADAMW_TORCH | |
| optim_args: None | |
| adafactor: 'False' | |
| group_by_length: 'False' | |
| length_column_name: length | |
| report_to: '[''wandb'']' | |
| ddp_find_unused_parameters: None | |
| ddp_bucket_cap_mb: None | |
| ddp_broadcast_buffers: None | |
| dataloader_pin_memory: 'True' | |
| dataloader_persistent_workers: 'False' | |
| skip_memory_metrics: 'True' | |
| use_legacy_prediction_loop: 'False' | |
| push_to_hub: 'True' | |
| resume_from_checkpoint: None | |
| hub_model_id: test-distillation | |
| hub_strategy: HubStrategy.EVERY_SAVE | |
| hub_token: None | |
| hub_private_repo: 'False' | |
| hub_always_push: 'False' | |
| gradient_checkpointing: 'False' | |
| gradient_checkpointing_kwargs: None | |
| include_inputs_for_metrics: 'False' | |
| eval_do_concat_batches: 'True' | |
| fp16_backend: auto | |
| evaluation_strategy: None | |
| push_to_hub_model_id: None | |
| push_to_hub_organization: None | |
| push_to_hub_token: None | |
| _n_gpu: '1' | |
| mp_parameters: '' | |
| auto_find_batch_size: 'False' | |
| full_determinism: 'False' | |
| torchdynamo: None | |
| ray_scope: last | |
| ddp_timeout: '1800' | |
| torch_compile: 'False' | |
| torch_compile_backend: None | |
| torch_compile_mode: None | |
| dispatch_batches: None | |
| split_batches: None | |
| include_tokens_per_second: 'False' | |
| include_num_input_tokens_seen: 'False' | |
| neftune_noise_alpha: None | |
| optim_target_modules: None | |
| batch_eval_metrics: 'False' | |
| eval_on_start: 'False' | |
| use_liger_kernel: 'False' | |
| eval_use_gather_object: 'False' | |
| checkpoints_dir: .//checkpoints/ | |
| init_step: '0' | |
| save_log_steps: '250' | |
| bucket_name: devinterp-language | |
| s3_folder: checkpoints/tetrahedron-3m | |
| delete_after_upload: 'False' | |
| push_to_aws: 'False' | |