{
  "model": {
    "d_model": 2048,
    "vocab_size": 640,
    "n_layers": 16,
    "block": {
      "attention": {
        "name": "default",
        "n_heads": 16,
        "bias": false,
        "rope": {
          "name": "default",
          "theta": 500000,
          "full_precision": true,
          "_CLASS_": "olmo_core.nn.rope.RoPEConfig"
        },
        "qk_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "use_flash": true,
        "dtype": "float32",
        "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
      },
      "layer_norm": {
        "name": "rms",
        "eps": 1e-06,
        "bias": false,
        "dtype": "float32",
        "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
      },
      "feed_forward": {
        "hidden_size": 8192,
        "name": "default",
        "bias": false,
        "dtype": "float32",
        "act_name": "silu",
        "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
      },
      "name": "reordered_norm",
      "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
    },
    "lm_head": {
      "name": "default",
      "layer_norm": {
        "name": "rms",
        "eps": 1e-06,
        "bias": false,
        "dtype": "float32",
        "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
      },
      "bias": false,
      "dtype": "float32",
      "loss_implementation": "default",
      "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"
    },
    "name": "bolmo_distill",
    "dtype": "float32",
    "init_method": "normal",
    "init_seed": 0,
    "init_std": 0.02,
    "freeze_params": [
      "boundary_predictor.*",
      "teacher_embeddings.*"
    ],
    "local_encoder": {
      "sliding_window_size": 0,
      "d_model": 2048,
      "n_layers": 1,
      "block_config": {
        "attention": {
          "name": "default",
          "n_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
        },
        "layer_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "feed_forward": {
          "hidden_size": 2816,
          "name": "default",
          "bias": false,
          "dtype": "float32",
          "act_name": "silu",
          "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
        },
        "xlstm": {
          "num_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
        },
        "name": "xlstm",
        "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
      },
      "cross_attn_n_heads": 0,
      "cross_attn_do_project": true,
      "cross_attn_init_pooling": "amax",
      "pooling": "hnet",
      "add_hash_embeddings": false,
      "add_expanded_embeddings": true,
      "hash_byte_group_size": [
        3,
        4,
        5,
        6,
        7,
        8
      ],
      "hash_byte_group_vocab": [
        1536,
        3072,
        6144,
        12288,
        24576,
        49152
      ],
      "hash_byte_group_nb_functions": 1,
      "add_norm_after_last_block": true,
      "add_norm_after_pool": false,
      "add_out_projection": true,
      "boundary_predictor": "hnet",
      "boundary_predictor_lookahead": 1,
      "represent_bytes_with_embeddings": false,
      "represent_bytes_with_last_mixed_out": false,
      "blt_compat": false,
      "dtype": "float32",
      "_CLASS_": "olmo_core.nn.bolmo.config.LocalEncoderConfig"
    },
    "local_decoder": {
      "sliding_window_size": 0,
      "d_model": 2048,
      "n_layers": 4,
      "cross_attn_n_heads": 0,
      "block_config": {
        "attention": {
          "name": "default",
          "n_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
        },
        "layer_norm": {
          "name": "rms",
          "eps": 1e-06,
          "bias": false,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
        },
        "feed_forward": {
          "hidden_size": 2816,
          "name": "default",
          "bias": false,
          "dtype": "float32",
          "act_name": "silu",
          "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
        },
        "xlstm": {
          "num_heads": 16,
          "dtype": "float32",
          "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
        },
        "name": "xlstm",
        "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
      },
      "depooling": "hnet",
      "add_norm_before_first_block": true,
      "add_norm_onto_residual": false,
      "add_in_projection": true,
      "add_projected_patch_residuals": false,
      "hnet_smooth": false,
      "hnet_smooth_ste": false,
      "hnet_modulate": false,
      "blt_compat": false,
      "fuse_boundaries": true,
      "no_boundaries": false,
      "dtype": "float32",
      "_CLASS_": "olmo_core.nn.bolmo.config.LocalDecoderConfig"
    },
    "share_blocks_between_teacher_and_student": false,
    "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"
  },
  "dataset": {
    "tokenizer": {
      "vocab_size": 520,
      "eos_token_id": 1,
      "pad_token_id": 0,
      "bos_token_id": 1,
      "special_tokens": [
        "<pad>",
        "<bos>",
        "<eos>",
        "<bpe_token_end>"
      ],
      "special_tokens_first": true,
      "original_identifier": "allenai/dolma2-tokenizer",
      "bpe_token_end_id": 3,
      "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
    },
    "paths": [],
    "expand_glob": false,
    "include_instance_metadata": true,
    "work_dir": "",
    "ignore_fingerprint_mismatch": false,
    "sequence_length": 4096,
    "generate_doc_lengths": false,
    "byte_sequence_length": 24576,
    "_CLASS_": "olmo_core.data.numpy_dataset.NumpyByteFSLDatasetConfig"
  },
  "data_loader": {
    "global_batch_size": 1572864,
    "seed": 1234,
    "num_workers": 24,
    "ignore_fingerprint_mismatch": false,
    "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"
  },
  "train_module": {
    "rank_microbatch_size": 98304,
    "max_sequence_length": 24576,
    "optim": {
      "group_overrides": [
        {
          "params": [
            "local_encoder.embedding.weight",
            "local_encoder.expanded_embeddings.weight"
          ],
          "opts": {
            "weight_decay": 0.0
          },
          "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
        },
        {
          "params": [
            "blocks.*"
          ],
          "opts": {
            "lr": 2.6e-05
          },
          "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
        }
      ],
      "compile": false,
      "fixed_fields": [
        "initial_lr"
      ],
      "lr": 5.2e-05,
      "betas": [
        0.9,
        0.95
      ],
      "eps": 1e-08,
      "weight_decay": 0.1,
      "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"
    },
    "max_grad_norm": 0.5,
    "scheduler": {
      "lr_field": "lr",
      "initial_lr_field": "initial_lr",
      "units": "steps",
      "alpha_f": 0.0,
      "warmup_fraction": 0.1,
      "warmup_min_lr": 0.0,
      "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"
    },
    "compile_model": true,
    "float8_config": {
      "enabled": false,
      "_CLASS_": "olmo_core.float8.Float8Config"
    },
    "dp_config": {
      "name": "fsdp",
      "param_dtype": "bfloat16",
      "reduce_dtype": "float32",
      "wrapping_strategy": "full",
      "prefetch_factor": 0,
      "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"
    },
    "bolmo_config": {
      "tokenizer": {
        "vocab_size": 520,
        "eos_token_id": 1,
        "pad_token_id": 0,
        "bos_token_id": 1,
        "special_tokens": [
          "<pad>",
          "<bos>",
          "<eos>",
          "<bpe_token_end>"
        ],
        "special_tokens_first": true,
        "original_identifier": "allenai/dolma2-tokenizer",
        "bpe_token_end_id": 3,
        "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
      },
      "losses": [
        "ce",
        "boundary"
      ],
      "loss_weights": [
        1.0,
        4.0
      ],
      "binarization_temp": 1.0,
      "temperature": 1.0,
      "div_fn": "tvd_temp_limit",
      "boundary_mode": "end",
      "merge_boundary_loss": false,
      "use_output_boundary_jsd": false,
      "eval_add_boundary_logp": false,
      "do_alm_debiasing": false,
      "rep_compare_fn": "l2",
      "start_ratio": 4.3,
      "target_ratio": 4.3,
      "gradual_boundary_compression_steps": 150000,
      "encoder_loss_lookahead": 0,
      "encoder_loss_no_lookahead_weight": 1.0,
      "encoder_loss_lookahead_weights": [],
      "patching": "dolma2",
      "epsilon": 1e-06,
      "skip_blocks": false,
      "skip_teacher_blocks": false,
      "skip_teacher": true,
      "compute_teacher_ce": false,
      "use_student_patch_reps_for_teacher": false,
      "use_oracle_patch_reps": false,
      "teacher_blocks_no_grad": true,
      "student_blocks_no_grad": false,
      "decoder_backprop_through_encoder": true,
      "decoder_backprop_through_boundary_predictor": true,
      "boundary_predictor_backprop_through_encoder": true,
      "teacher_force_boundaries": false,
      "boundary_threshold": "sample:0",
      "xlstm_igate_bias_init": -10.0,
      "skip_boundary_before_eos": true,
      "_CLASS_": "olmo_core.nn.bolmo.config.BolmoConfig"
    },
    "label_ignore_index": -100,
    "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"
  },
  "trainer": {},
  "init_seed": 12536,
  "_CLASS_": "__main__.ExperimentConfig"
}