| {"model": {"d_model": 4096, "vocab_size": 100352, "n_layers": 32, "block": {"sequence_mixer": {"name": "default", "n_heads": 32, "bias": false, "rope": {"name": "default", "theta": 500000, "full_precision": true, "no_global_rope": false, "_CLASS_": "olmo_core.nn.rope.RoPEConfig"}, "qk_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "backend": "flash_4", "dtype": "float32", "use_head_qk_norm": false, "_CLASS_": "olmo_core.nn.attention.AttentionConfig", "type": "attention"}, "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "feed_forward": {"hidden_size": 11008, "name": "default", "bias": false, "dtype": "float32", "activation": "silu", "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"}, "name": "reordered_norm", "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"}, "lm_head": {"name": "default", "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "bias": false, "dtype": "float32", "loss_implementation": "default", "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"}, "name": "default", "dtype": "float32", "init_method": "normal", "init_seed": 0, "init_std": 0.02, "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"}, "dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "paths": ["/localhome/apanfero/data/olmo/climbmix_raw/train-00000-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00001-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00002-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00003-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00004-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00005-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00006-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00007-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00008-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00009-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00010-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00011-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00012-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00013-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00014-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00015-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00016-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00017-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00018-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00019-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00020-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00021-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00022-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00023-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00024-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00025-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00026-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00027-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00028-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00029-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00030-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00031-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00032-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00033-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00034-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00035-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00036-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00037-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00038-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00039-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00040-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00041-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00042-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00043-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00044-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00045-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00046-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00047-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00048-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00049-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00050-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00051-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00052-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00053-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00054-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00055-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00056-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00057-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00058-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00059-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00060-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00061-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00062-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00063-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00064-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00065-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00066-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00067-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00068-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00069-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00070-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00071-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00072-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00073-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00074-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00075-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00076-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00077-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00078-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00079-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00080-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00081-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00082-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00083-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00084-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00085-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00086-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00087-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00088-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00089-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00090-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00091-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00092-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00093-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00094-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00095-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00096-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00097-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00098-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00099-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00100-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00101-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00102-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00103-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00104-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00105-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00106-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00107-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00108-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00109-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00110-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00111-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00112-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00113-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00114-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00115-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00116-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00117-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00118-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00119-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00120-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00121-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00122-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00123-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00124-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00125-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00126-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00127-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00128-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00129-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00130-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00131-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00132-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00133-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00134-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00135-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00136-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00137-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00138-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00139-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00140-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00141-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00142-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00143-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00144-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00145-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00146-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00147-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00148-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00149-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00150-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00151-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00152-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00153-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00154-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00155-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00156-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00157-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00158-of-00160.npy", "/localhome/apanfero/data/olmo/climbmix_raw/train-00159-of-00160.npy"], "expand_glob": false, "include_instance_metadata": true, "work_dir": "/localhome/apanfero/models/quartet2/7B/0.5-chin/bf16", "ignore_fingerprint_mismatch": false, "sequence_length": 2048, "max_target_sequence_length": 8192, "generate_doc_lengths": false, "_CLASS_": "olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig"}, "data_loader": {"global_batch_size": 1048576, "seed": 42, "num_workers": 8, "prefetch_factor": 8, "ignore_fingerprint_mismatch": false, "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig", "type": "numpy"}, "train_module": {"rank_microbatch_size": 8192, "max_sequence_length": 2048, "optim": {"group_overrides": [{"params": ["embeddings.weight"], "opts": {"weight_decay": 0.0}, "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"}], "compile": false, "fixed_fields": ["initial_lr"], "lr": 0.0003, "betas": [0.9, 0.95], "eps": 1e-08, "weight_decay": 0.1, "foreach": true, "step_increment_bugfix": true, "rolling_interval_length": 128, "sigma_factor": 6, "_CLASS_": "olmo_core.optim.adamw.SkipStepAdamWConfig", "type": "skip_step_adamw"}, "max_grad_norm": 1.0, "scheduler": {"lr_field": "lr", "initial_lr_field": "initial_lr", "units": "steps", "warmup": 2000, "alpha_f": 0.1, "warmup_min_lr": 0.0, "_CLASS_": "olmo_core.optim.scheduler.CosWithWarmup", "type": "cos_with_warmup"}, "compile_model": true, "float8_config": {"enabled": false, "_CLASS_": "olmo_core.float8.Float8Config"}, "dp_config": {"name": "ddp", "param_dtype": "bfloat16", "reduce_dtype": "float32", "wrapping_strategy": "full", "prefetch_factor": 0, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"}, "z_loss_multiplier": 1e-05, "label_ignore_index": -100, "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"}, "trainer": {"save_folder": "/localhome/apanfero/models/quartet2/7B/0.5-chin/bf16", "work_dir": "/localhome/apanfero/models/quartet2/7B/0.5-chin/bf16", "load_strategy": "if_available", "checkpointer": {"pre_download": false, "throttle_uploads": false, "_CLASS_": "olmo_core.train.checkpoint.CheckpointerConfig"}, "save_overwrite": false, "max_duration": {"value": 68875755520, "unit": "tokens", "_CLASS_": "olmo_core.train.common.Duration"}, "cancel_check_interval": 10, "metrics_collect_interval": 50, "callbacks": {"config_saver": {"fname": "config.json", "_CLASS_": "olmo_core.train.callbacks.config_saver.ConfigSaverCallback"}, "checkpointer": {"ephemeral_save_interval": 1000, "save_async": false, "remove": "ephemeral_only", "fixed_steps": [0, 1990, 3980, 5971, 7961, 9952, 11942, 13933, 15923, 17914, 19904, 21895, 23885, 25875, 27866, 29856, 31847, 33837, 35828, 37818, 39809, 41799, 43790, 45780, 47770, 49761, 51751, 53742, 55732, 57723, 59713, 61704, 63694], "enabled": true, "_CLASS_": "olmo_core.train.callbacks.checkpointer.CheckpointerCallback"}, "wandb": {"enabled": true, "project": "pareto_front", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "_CLASS_": "olmo_core.train.callbacks.wandb.WandBCallback"}, "lm_evaluator": {"eval_dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "mix": "v3-small-ppl-validation", "mix_base_dir": "/localhome/apanfero/data/olmo/v3-small-ppl-validation", "expand_glob": false, "include_instance_metadata": true, "work_dir": "/localhome/apanfero/models/quartet2/7B/0.5-chin/bf16", "ignore_fingerprint_mismatch": false, "sequence_length": 2048, "_CLASS_": "olmo_core.data.numpy_dataset.NumpyPaddedFSLDatasetConfig"}, "eval_interval": 250, "eval_on_startup": false, "eval_on_finish": true, "cancel_after_first_eval": false, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": true, "_CLASS_": "olmo_core.train.callbacks.evaluator_callback.LMEvaluatorCallbackConfig"}, "downstream_evaluator": {"tasks": ["arc_challenge_test_bpb_5shot", "arc_challenge_test_mc_5shot_fast", "arc_easy_test_bpb_5shot", "arc_easy_test_mc_5shot_fast", "hellaswag_bpb_5shot"], "tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "eval_on_startup": true, "eval_on_finish": true, "cancel_after_first_eval": false, "log_interval": 5, "lazy": false, "enabled": true, "_CLASS_": "olmo_core.train.callbacks.evaluator_callback.DownstreamEvaluatorCallbackConfig"}, "qat_enabler": {"qat_method": "quartet_2", "qat_start": 0.0, "qat_end": 0.0, "next_layer_strategy": "last_to_first", "eval_duration": {"value": 1000000, "unit": "tokens", "_CLASS_": "olmo_core.train.common.Duration"}, "_CLASS_": "olmo_core.train.callbacks.qat_enabler.QatEnablerCallback"}}, "bookkeeping_soft_timeout": 30, "no_checkpoints": false, "no_evals": false, "_CLASS_": "olmo_core.train.config.TrainerConfig"}, "init_seed": 12536, "_CLASS_": "olmo_core.script_utils.ExperimentConfig"} |