{ "accelerator": "auto", "accumulate_grad_batches": 2, "alignment_checkpoint": "output/alignment/exp12-lr5e5-b02-b8a4-val1.3030.ckpt", "architectures": [ "DotLMForCausalLM" ], "auto_map": { "AutoConfig": "modeling_dotlm.DotLMConfig", "AutoModelForCausalLM": "modeling_dotlm.DotLMForCausalLM" }, "batch_size": 16, "bos_token_id": null, "context_len": 4096, "d_model": 768, "dataset_path": "dataset/reasoning.json", "devices": "auto", "dtype": "float32", "eos_token_id": 3, "gradient_clip_val": 0.5, "hidden_dim": 2048, "initializer_range": 0.02, "learning_rate": 5e-05, "max_epochs": 10, "max_seq_len": 768, "max_steps": -1, "model_type": "dotlm", "n_heads": 6, "n_kv_heads": 2, "norm_eps": 1e-06, "num_hidden_layers": 24, "num_workers": 4, "output_dir": "output/reasoning", "pad_token_id": 0, "precision": "bf16-mixed", "run_prefix": "dotlm_165m", "save_top_k": 5, "scheduler_type": "cosine_with_warmup", "strategy": "auto", "theta_base": 10000.0, "tie_word_embeddings": true, "transformers_version": "5.5.0", "use_cache": true, "use_wandb": true, "val_split": 0.01, "val_steps": 200, "version": "auto", "vocab_size": 16384, "wandb_project": "DotLM", "warmup_steps": 50, "weight_decay": 0.01 }