---
# OLMo-300M pretraining configuration.
# NOTE(review): reconstructed from a whitespace-mangled single-line dump; key
# nesting follows the OLMo TrainConfig schema (olmo/config.py) — in particular
# `log_interval: 1` is placed inside `wandb` per WandbConfig; confirm against
# the original file if available.

run_name: OLMo-300M
seed: 6198
epoch: null
dry_run: false

# Model architecture (~300M params: 8 layers, d_model 1536, GQA 8 heads / 4 KV heads)
model:
  d_model: 1536
  n_heads: 8
  n_kv_heads: 4
  clip_qkv: null
  n_layers: 8
  mlp_ratio: 8
  mlp_hidden_size: null
  activation_type: swiglu
  block_type: sequential
  block_group_size: 1
  alibi: false
  alibi_bias_max: 8.0
  rope: true
  rope_full_precision: true
  flash_attention: true
  attention_dropout: 0.0
  multi_query_attention: null
  attention_layer_norm: false
  residual_dropout: 0.0
  embedding_dropout: 0.0
  layer_norm_type: default
  layer_norm_with_affine: false
  attention_layer_norm_with_affine: false
  max_sequence_length: 2048
  include_bias: false
  bias_for_layer_norm: false
  scale_logits: false
  vocab_size: 50280
  embedding_size: 50304  # padded past vocab_size for throughput
  weight_tying: true
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: mitchell
  init_std: 0.02
  init_cutoff_factor: null
  precision: amp_bf16
  use_moe: false
  moe_num_experts: 6
  moe_top_k: 2
  use_mod: false
  mod_capacity_factor: 0.125
  mod_every: 2

optimizer:
  name: adamw
  learning_rate: 0.0005
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.95
  no_decay_norm_and_bias: null
  decay_norm_and_bias: false
  decay_embeddings: false
  metrics_log_interval: 10

scheduler:
  name: cosine_with_warmup
  units: steps
  t_warmup: 2000
  t_max: null
  alpha_f: 0.1
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null

# Training data loader
data:
  paths:
    - ../tokenized_data/text/part-0-00000.npy
    - ../tokenized_data/text/part-1-00000.npy
    - ../tokenized_data/text/part-2-00000.npy
    - ../tokenized_data/text/part-3-00000.npy
  datasets: null
  label_mask_paths: null
  pad_direction: right
  generate_attention_mask: false
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 32
  persistent_workers: true
  timeout: 0
  seed: null

restore_dataloader: true
fast_forward_batches: null

# In-loop evaluation: two perplexity suites (v3/v2 validation sets) plus
# downstream zero-shot tasks.
evaluators:
  - label: v3-small-ppl-validation
    type: lm
    data:
      paths: null
      datasets:
        v3-small-c4_en-validation:
          - ./validation/v3/c4_en/part-0-00000.npy
        v3-small-dolma_books-validation:
          - ./validation/v3/dolma_books/part-0-00000.npy
        v3-small-dolma_common-crawl-validation:
          - ./validation/v3/dolma_common-crawl/part-0-00000.npy
        v3-small-dolma_pes2o-validation:
          - ./validation/v3/dolma_pes2o/part-0-00000.npy
        v3-small-dolma_reddit-validation:
          - ./validation/v3/dolma_reddit/part-0-00000.npy
        v3-small-dolma_stack-validation:
          - ./validation/v3/dolma_stack/part-0-00000.npy
        v3-small-dolma_wiki-validation:
          - ./validation/v3/dolma_wiki/part-0-00000.npy
        v3-small-ice-validation:
          - ./validation/v3/ice/part-0-00000.npy
        v3-small-m2d2_s2orc-validation:
          - ./validation/v3/m2d2_s2orc/part-0-00000.npy
        v3-small-pile-validation:
          - ./validation/v3/pile/part-0-00000.npy
        v3-small-wikitext_103-validation:
          - ./validation/v3/wikitext_103/part-0-00000.npy
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: true
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: v2-small-ppl-validation
    type: lm
    data:
      paths: null
      datasets:
        v2-small-4chan-validation:
          - ./validation/v2/4chan/val.npy
        v2-small-c4_100_domains-validation:
          - ./validation/v2/c4_100_domains/val.npy
        v2-small-c4_en-validation:
          - ./validation/v2/c4_en/val.npy
        v2-small-gab-validation:
          - ./validation/v2/gab/val.npy
        v2-small-ice-validation:
          - ./validation/v2/ice/val.npy
        v2-small-m2d2_s2orc-validation:
          - ./validation/v2/m2d2_s2orc/val.npy
        v2-small-m2d2_wiki-validation:
          - ./validation/v2/m2d2_wiki/val.npy
        v2-small-manosphere-validation:
          - ./validation/v2/manosphere/val.npy
        v2-small-mc4_en-validation:
          - ./validation/v2/mc4_en/val.npy
        v2-small-pile-validation:
          - ./validation/v2/pile/val.npy
        v2-small-ptb-validation:
          - ./validation/v2/ptb/val.npy
        v2-small-twitterAEE-validation:
          - ./validation/v2/twitterAEE/val.npy
        v2-small-wikitext_103-validation:
          - ./validation/v2/wikitext_103/val.npy
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: true
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: piqa
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: hellaswag
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: winogrande
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: openbook_qa
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: sciq
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: arc_easy
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: copa
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: rte
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: commitment_bank
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: mrpc
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null
  - label: sst2
    type: downstream
    data:
      paths: null
      datasets: null
      label_mask_paths: null
      pad_direction: right
      generate_attention_mask: false
      num_workers: 0
      drop_last: false
      pin_memory: false
      prefetch_factor: null
      persistent_workers: false
      timeout: 0
      seed: null
    device_eval_batch_size: null
    subset_num_batches: null

eval_interval: 1000

tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right

# Checkpointing
save_folder: ./checkpoints/olmo-tiny/OLMo-300M
remote_save_folder: null
canceled_check_interval: 50
save_interval: 2000
save_interval_unsharded: 10000
save_interval_ephemeral: null
save_num_checkpoints_to_keep: 9
save_num_unsharded_checkpoints_to_keep: -1  # -1 = keep all
save_overwrite: false
force_save_unsharded: false
no_pre_train_checkpoint: false
load_path: null
load_path_sharded_checkpointer: null
reset_optimizer_state: false
reset_trainer_state: false
sharded_checkpointer: torch_legacy
new_style_checkpoints: null

# Batch sizes / duration
max_duration: 7393280
global_train_batch_size: 20
device_train_batch_size: 20
device_train_microbatch_size: 10
device_eval_batch_size: 10
eval_subset_num_batches: -1
eval_on_load: false
device_train_grad_accum: 2
max_grad_norm: 1.0
max_grad_norm_ratio: null
precision: amp_bf16
use_msamp: false

# Logging
wandb:
  project: olmo-300m
  entity: doosen
  group: null
  name: OLMo-300M
  tags:
    - watching
  log_artifacts: false
  rank_zero_only: true
  log_interval: 1
speed_monitor:
  window_size: 20
  gpu_flops_available: null
console_log_interval: 1

compile:
  mode: null
  fullgraph: false
  backend: inductor

fsdp:
  use_orig_params: true
  sharding_strategy: FULL_SHARD
  wrapping_strategy: null
  precision: mixed

softmax_auxiliary_loss: false
time_limit: 171000.0
extra_steps_after_cancel: 10
early_stopping_factor: null
save_data_indices: true
python_profiling: false
torch_profiling: false
stop_at: null
stop_after: null
activation_checkpointing: null
fused_loss: null