{ "name": "olmo3-lingua", "dump_dir": "logs/debug", "seed": 777, "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": null, "steps": 9765625000, "data": { "root_dir": "data", "sources": { "dclm-baseline_shuffled": 1.0 }, "batch_size": 16, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": { "name": "huggingface", "path": "/raid/user_data/rsadhukh/checkpoints/olmo2-1b-stage1-token1T/" } }, "optim": { "lr": 0.0004, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 16384000, "lr_min_ratio": 0.1, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 0, "decay_fraction": 0.1, "exp_factor": 0.5 }, "distributed": { "dp_shard": 1, "dp_replicate": 1, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver" }, "checkpoint": { "dump": { "every": 10000, "keep": 1 }, "eval": { "every": 10000, "keep": 1 }, "path": "logs/debug/checkpoints", "init_ckpt_path": "", "continue_training_from_init": false }, "logging": { "freq": 10, "acc_freq": null, "wandb": null }, "model": { "dim": 2048, "n_layers": 16, "head_dim": 128, "n_heads": 16, "n_kv_heads": 16, "ffn_dim_multiplier": 1.5, "multiple_of": 256, "norm_eps": 1e-06, "rope_theta": 500000, "rope_scaling": null, "init_base_std": 0.02, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 100352, "weight_tying": false, "sliding_window": null } }