{
  "accelerator": "auto",
  "accumulate_grad_batches": 2,
  "alignment_checkpoint": "output/alignment/exp12-lr5e5-b02-b8a4-val1.3030.ckpt",
  "architectures": [
    "DotLMForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "modeling_dotlm.DotLMConfig",
    "AutoModelForCausalLM": "modeling_dotlm.DotLMForCausalLM"
  },
  "batch_size": 16,
  "bos_token_id": null,
  "context_len": 4096,
  "d_model": 768,
  "dataset_path": "dataset/reasoning.json",
  "devices": "auto",
  "dtype": "float32",
  "eos_token_id": 3,
  "gradient_clip_val": 0.5,
  "hidden_dim": 2048,
  "initializer_range": 0.02,
  "learning_rate": 5e-05,
  "max_epochs": 10,
  "max_seq_len": 768,
  "max_steps": -1,
  "model_type": "dotlm",
  "n_heads": 6,
  "n_kv_heads": 2,
  "norm_eps": 1e-06,
  "num_hidden_layers": 24,
  "num_workers": 4,
  "output_dir": "output/reasoning",
  "pad_token_id": 0,
  "precision": "bf16-mixed",
  "run_prefix": "dotlm_165m",
  "save_top_k": 5,
  "scheduler_type": "cosine_with_warmup",
  "strategy": "auto",
  "theta_base": 10000.0,
  "tie_word_embeddings": true,
  "transformers_version": "5.5.0",
  "use_cache": true,
  "use_wandb": true,
  "val_split": 0.01,
  "val_steps": 200,
  "version": "auto",
  "vocab_size": 16384,
  "wandb_project": "DotLM",
  "warmup_steps": 50,
  "weight_decay": 0.01
}
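
Because "auto_map" routes AutoConfig and AutoModelForCausalLM to modeling_dotlm.py, the model can be instantiated through the standard Hugging Face auto classes with trust_remote_code=True. A minimal loading sketch, assuming this config.json sits next to modeling_dotlm.py in a local directory named dotlm_165m (the directory name here is hypothetical, not taken from the config):

# load_dotlm.py -- minimal loading sketch; paths are assumptions
from transformers import AutoConfig, AutoModelForCausalLM

# trust_remote_code lets the "auto_map" entries above resolve
# DotLMConfig / DotLMForCausalLM out of modeling_dotlm.py.
config = AutoConfig.from_pretrained("dotlm_165m", trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# Rough sanity check of the architecture this config describes:
# 24 layers, d_model 768, 6 query heads with 2 KV heads (GQA), tied embeddings.
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.0f}M parameters")  # run_prefix suggests roughly 165M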
|
|
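The non-model keys (accelerator, devices, strategy, precision, max_epochs, max_steps, accumulate_grad_batches, gradient_clip_val, save_top_k, wandb_project, and so on) read as PyTorch Lightning Trainer arguments rather than model hyperparameters. A hedged sketch of how a training script might consume them follows; the actual training script is not part of this file, and the mapping of "val_steps" to val_check_interval and of the checkpoint monitor metric are assumptions:

# train_sketch.py -- hypothetical wiring of the training keys into Lightning
import json
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import WandbLogger

with open("config.json") as f:
    cfg = json.load(f)

checkpoint_cb = ModelCheckpoint(
    dirpath=cfg["output_dir"],       # "output/reasoning"
    save_top_k=cfg["save_top_k"],    # keep the 5 best checkpoints
    monitor="val_loss",              # assumed metric name; not in this config
)

# Fall back to logger=False (logging disabled) when use_wandb is false.
logger = WandbLogger(project=cfg["wandb_project"]) if cfg["use_wandb"] else False

trainer = Trainer(
    accelerator=cfg["accelerator"],
    devices=cfg["devices"],
    strategy=cfg["strategy"],
    precision=cfg["precision"],                          # "bf16-mixed"
    max_epochs=cfg["max_epochs"],
    max_steps=cfg["max_steps"],                          # -1 = no step cap
    accumulate_grad_batches=cfg["accumulate_grad_batches"],
    gradient_clip_val=cfg["gradient_clip_val"],
    val_check_interval=cfg["val_steps"],                 # assumed mapping
    logger=logger,
    callbacks=[checkpoint_cb],
)
# trainer.fit(model, datamodule=...)  # model/data wiring is not part of this file

With batch_size 16 and accumulate_grad_batches 2, each optimizer step sees an effective batch of 32 sequences of up to max_seq_len 768 tokens.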