---
# Training / evaluation run configuration.
#
# NOTE(review): this file was recovered from a flattened copy in which every
# line was wrapped in table pipes and all indentation was lost. The nesting
# below was reconstructed from the alphabetical key order at each level;
# verify it against the schema of the tool that consumes this file.

# Checkpoint writing and resume locations.
checkpoints:
  checkpoint_interval: 500
  checkpoints_path: /scratch/craffel/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredcccc-seed-6-
  checkpoints_path_is_shared_file_system: false
  # Same S3 prefix as s3_upload.upload_s3_path.
  resume_checkpoint_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredcccc-seed-6-
  save_initial_state: true

# Training data sources and loader settings.
data:
  dataset:
    dataloader_type: single
    dataset_max_tokens: null
    dataset_weights: null
    datasets:
      - bits_per_token: 16
        # Regular expression; single-quoted so the backslash stays literal.
        filename_pattern: '.*\.ds$'
        folder: /scratch/dataset/commav0p1-ablations-1p82G-commonpile0p1filteredcccc-seed-6-/
        original_folder: null
        seed: 6
        shuffle: true
        skip_tokens: 0
    pad_samples_to_global_batch_size: false
    skip_in_stream: true
  num_loading_workers: 0
  seed: 6

# Experiment tracking back ends.
experiment_logger:
  tensorboard_logger:
    push_to_hub_interval: 300
    repo_id: craffel/commav0p1-ablations
    repo_public: false
    tensorboard_dir: /scratch/craffel/tensorboard-craffel-commav0p1-ablations
  # NOTE(review): placed as a sibling of tensorboard_logger; key order alone
  # does not disambiguate this - confirm.
  wandb_logger: null

# Run identity and progress counters.
general:
  benchmark_csv_path: null
  # NOTE(review): 14336000 = 14000 * 1024, and 1024 = parallelism.dp (64)
  # * tokens.micro_batch_size (4) * tokens.batch_accumulation_per_replica (4).
  # Presumably this is a snapshot taken at step 14000 - confirm.
  consumed_train_samples: 14336000
  ignore_sanity_checks: true
  project: commav0p1-ablations
  run: commav0p1-ablations-1p82G-commonpile0p1filteredcccc-seed-6-
  # NOTE(review): 42 here, while data.seed and the dataset seed are 6 and the
  # run name ends in "seed-6-" - confirm this difference is intended.
  seed: 42
  step: 14000

kill_switch_path: null

# Evaluation job settings.
lighteval:
  batch_size: 16
  checkpoints_path: null
  generation: null
  logging:
    hub_repo_details: null
    hub_repo_results: null
    hub_repo_tensorboard: craffel/commav0p1-ablations
    local_output_path: /scratch/craffel/lighteval/commav0p1-ablations-1p82G-commonpile0p1filteredcccc-seed-6-
    push_details_to_hub: false
    push_results_to_hub: false
    push_results_to_tensorboard: true
    tensorboard_metric_prefix: e
  # Evaluation uses dp: 8, whereas the top-level training parallelism uses
  # dp: 64.
  parallelism:
    dp: 8
    expert_parallel_size: 1
    pp: 1
    pp_engine: 1f1b
    tp: 1
    tp_linear_async_communication: false
    tp_mode: ALL_REDUCE
  slurm_script_dir: /fsx/craffel/train/eval-scripts
  slurm_template: /fsx/craffel/run_eval.slurm.jinja
  tasks:
    custom_tasks: brrr.lighteval.evaluation_tasks
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: early-signal
  # NOTE(review): placed directly under lighteval (not under tasks); key
  # order alone does not disambiguate this - confirm.
  wandb: null

# Console / log verbosity.
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info

# Model architecture and initialisation.
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.02
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 32
    num_hidden_layers: 24
    # Equal to num_attention_heads.
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    # NOTE(review): tokenizer.tokenizer_name_or_path is gpt2; confirm that
    # 50272 is the intended (presumably padded) vocabulary size for it.
    vocab_size: 50272

# Optimizer and learning-rate schedule.
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_starting_step: null
    lr_decay_steps: null
    lr_decay_style: cosine
    lr_warmup_steps: 500
    lr_warmup_style: linear
    min_decay_lr: 3.0e-05
  # NOTE(review): the three keys below are placed directly under optimizer
  # (not under learning_rate_scheduler); key order alone does not
  # disambiguate this - confirm.
  torch_adam_is_fused: true
  weight_decay: 0.1
  zero_stage: 0

# Training parallelism layout.
parallelism:
  dp: 64
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  # NOTE(review): async communication and REDUCE_SCATTER are set while
  # tp is 1 - presumably they have no effect; confirm.
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER

profiler: null

# Checkpoint upload to S3 via s5cmd.
s3_upload:
  remove_after_upload: true
  s5cmd_concurrency: 5
  s5cmd_numworkers: 16
  s5cmd_path: /fsx/craffel/miniconda3/envs/exp/bin/s5cmd
  upload_s3_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredcccc-seed-6-

tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null

# Batch geometry and run length.
tokens:
  batch_accumulation_per_replica: 4
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 4
  sequence_length: 2048
  train_steps: 14305
  val_check_interval: 100