| | |
| | seed_everything: 123 |
| | trainer: |
| | accelerator: gpu |
| | strategy: |
| | class_path: lightning.pytorch.strategies.DeepSpeedStrategy |
| | init_args: |
| | accelerator: null |
| | zero_optimization: true |
| | stage: 2 |
| | remote_device: null |
| | offload_optimizer: false |
| | offload_parameters: true |
| | offload_params_device: cpu |
| | nvme_path: /local_nvme |
| | params_buffer_count: 5 |
| | params_buffer_size: 100000000 |
| | max_in_cpu: 1000000000 |
| | offload_optimizer_device: cpu |
| | optimizer_buffer_count: 4 |
| | block_size: 1048576 |
| | queue_depth: 8 |
| | single_submit: false |
| | overlap_events: true |
| | thread_count: 1 |
| | pin_memory: true |
| | sub_group_size: 1000000000000 |
| | contiguous_gradients: true |
| | overlap_comm: true |
| | allgather_partitions: true |
| | reduce_scatter: true |
| | allgather_bucket_size: 200000000 |
| | reduce_bucket_size: 200000000 |
| | zero_allow_untested_optimizer: true |
| | logging_batch_size_per_gpu: auto |
| | config: null |
| | logging_level: 30 |
| | parallel_devices: null |
| | cluster_environment: null |
| | loss_scale: 0.0 |
| | initial_scale_power: 16 |
| | loss_scale_window: 1000 |
| | hysteresis: 2 |
| | min_loss_scale: 1 |
| | partition_activations: false |
| | cpu_checkpointing: false |
| | contiguous_memory_optimization: false |
| | synchronize_checkpoint_boundary: false |
| | load_full_weights: false |
| | precision_plugin: null |
| | process_group_backend: null |
| | devices: 8 |
| | num_nodes: 1 |
| | precision: bf16-true |
| | logger: |
| | class_path: lightning.pytorch.loggers.TensorBoardLogger |
| | init_args: |
| | save_dir: /media/logs |
| | name: main |
| | version: null |
| | log_graph: false |
| | default_hp_metric: true |
| | prefix: '' |
| | sub_dir: null |
| | comment: '' |
| | purge_step: null |
| | max_queue: 10 |
| | flush_secs: 120 |
| | filename_suffix: '' |
| | callbacks: null |
| | fast_dev_run: false |
| | max_epochs: 2 |
| | min_epochs: null |
| | max_steps: -1 |
| | min_steps: null |
| | max_time: null |
| | limit_train_batches: null |
| | limit_val_batches: null |
| | limit_test_batches: null |
| | limit_predict_batches: null |
| | overfit_batches: 0.0 |
| | val_check_interval: null |
| | check_val_every_n_epoch: 1 |
| | num_sanity_val_steps: 0 |
| | log_every_n_steps: 1 |
| | enable_checkpointing: null |
| | enable_progress_bar: null |
| | enable_model_summary: null |
| | accumulate_grad_batches: 8 |
| | gradient_clip_val: null |
| | gradient_clip_algorithm: null |
| | deterministic: null |
| | benchmark: null |
| | inference_mode: true |
| | use_distributed_sampler: true |
| | profiler: null |
| | detect_anomaly: false |
| | barebones: false |
| | plugins: null |
| | sync_batchnorm: false |
| | reload_dataloaders_every_n_epochs: 0 |
| | default_root_dir: null |
| | model: |
| | config: |
| | model_name: Mistral-7B-v0.2 |
| | dtype: bfloat16 |
| | num_thoughts: 2 |
| | thought_length: 8 |
| | lookahead_tokens: 4 |
| | embedding_grad_weights: 100.0 |
| | temperature: 1.0 |
| | do_sample: true |
| | train_max_length: 120 |
| | offload_cache: false |
| | top_k: null |
| | top_p: null |
| | checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2 |
| | weight_decay: 0.001 |
| | warmup_steps: 20 |
| | policy_weight: 1.0 |
| | init_lr: 1.0e-06 |
| | optimizer: |
| | class_path: torch.optim.AdamW |
| | init_args: |
| | lr: 1.0e-06 |
| | betas: |
| | - 0.9 |
| | - 0.999 |
| | eps: 1.0e-08 |
| | weight_decay: 0.001 |
| | amsgrad: false |
| | maximize: false |
| | foreach: null |
| | capturable: false |
| | differentiable: false |
| | fused: null |
| | scheduler: null |
| | ckpt_path: null |
| | data: |
| | class_path: src.dataset.OpenWebMathDataModule |
| | init_args: |
| | data_path: /media/datasets/openwebmath |
| | tokenizer: |
| | class_path: src.dataset.SpecialTokenizer |
| | init_args: |
| | checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2 |
| | batch_size: 1 |
| | max_seq_length: 120 |
| | num_samples: 2048 |
| | ignore_index: -100 |
| | val_split_fraction: 0.125 |
| | seed: 42 |
| | num_workers: 1 |
| |
|