DotLM-165M / config.json
{
"accelerator": "auto",
"accumulate_grad_batches": 2,
"alignment_checkpoint": "output/alignment/exp12-lr5e5-b02-b8a4-val1.3030.ckpt",
"architectures": [
"DotLMForCausalLM"
],
"auto_map": {
"AutoConfig": "modeling_dotlm.DotLMConfig",
"AutoModelForCausalLM": "modeling_dotlm.DotLMForCausalLM"
},
"batch_size": 16,
"bos_token_id": null,
"context_len": 4096,
"d_model": 768,
"dataset_path": "dataset/reasoning.json",
"devices": "auto",
"dtype": "float32",
"eos_token_id": 3,
"gradient_clip_val": 0.5,
"hidden_dim": 2048,
"initializer_range": 0.02,
"learning_rate": 5e-05,
"max_epochs": 10,
"max_seq_len": 768,
"max_steps": -1,
"model_type": "dotlm",
"n_heads": 6,
"n_kv_heads": 2,
"norm_eps": 1e-06,
"num_hidden_layers": 24,
"num_workers": 4,
"output_dir": "output/reasoning",
"pad_token_id": 0,
"precision": "bf16-mixed",
"run_prefix": "dotlm_165m",
"save_top_k": 5,
"scheduler_type": "cosine_with_warmup",
"strategy": "auto",
"theta_base": 10000.0,
"tie_word_embeddings": true,
"transformers_version": "5.5.0",
"use_cache": true,
"use_wandb": true,
"val_split": 0.01,
"val_steps": 200,
"version": "auto",
"vocab_size": 16384,
"wandb_project": "DotLM",
"warmup_steps": 50,
"weight_decay": 0.01
}
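
Because auto_map routes AutoConfig and AutoModelForCausalLM to the custom modeling_dotlm module shipped with the repo, loading this checkpoint through the transformers Auto classes requires trust_remote_code=True. A minimal loading sketch follows; the repo id tensorfiend/DotLM-165M is inferred from the page rather than confirmed by the config, and the presence of tokenizer files in the repo is assumed.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Repo id assumed from the page context; substitute the actual path if different.
repo_id = "tensorfiend/DotLM-165M"

# trust_remote_code=True is required because auto_map points AutoConfig /
# AutoModelForCausalLM at the custom modeling_dotlm module in the repo.
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

# Greedy generation; eos_token_id=3 and pad_token_id=0 come from config.json.
inputs = tokenizer("The dot product of two vectors", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=64, eos_token_id=3, pad_token_id=0)
print(tokenizer.decode(out[0], skip_special_tokens=True))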
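
The model-shape keys are consistent with the 165M name if DotLM is a Llama-style decoder: rotary position embeddings parameterized by theta_base, RMSNorm with norm_eps, grouped-query attention with n_kv_heads = 2, and a gated (SwiGLU-style) MLP. That block layout is an assumption, since modeling_dotlm.py is not shown, but under it the back-of-the-envelope count below lands at roughly 163.6M parameters with tied embeddings, matching the advertised size.

# Rough parameter count from config.json, assuming a Llama-style block:
# GQA attention + gated (SwiGLU) MLP + two RMSNorms per layer, tied embeddings.
d_model, n_layers, hidden = 768, 24, 2048
n_heads, n_kv_heads, vocab = 6, 2, 16384
head_dim = d_model // n_heads                      # 128

embed = vocab * d_model                            # tied with the LM head
attn = d_model * (n_heads * head_dim)              # q_proj
attn += 2 * d_model * (n_kv_heads * head_dim)      # k_proj, v_proj (GQA)
attn += (n_heads * head_dim) * d_model             # o_proj
mlp = 3 * d_model * hidden                         # gate, up, down (SwiGLU assumed)
norms = 2 * d_model                                # two RMSNorms per layer

total = embed + n_layers * (attn + mlp + norms) + d_model  # + final norm
print(f"{total / 1e6:.1f}M parameters")            # ~163.6M, i.e. "165M"-class

A plain two-matrix MLP (2 * d_model * hidden per layer) would instead give about 126M, which is one reason the gated variant is the more plausible reading of hidden_dim here.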
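
Several keys (batch_size, accumulate_grad_batches, precision, max_epochs, and so on) are trainer arguments rather than model hyperparameters; the effective batch is batch_size * accumulate_grad_batches = 16 * 2 = 32 sequences per optimizer step. scheduler_type suggests a warmup-then-cosine learning-rate shape. The sketch below is a guess at that curve under the configured values; the exact implementation in the training code is not shown, and this simply mirrors the common cosine-with-warmup schedule.

import math

# Hedged sketch of "cosine_with_warmup" as configured: peak lr 5e-5, 50 warmup
# steps. Linear ramp to the peak, then a cosine decay to zero over the rest.
def lr_at(step, total_steps, base_lr=5e-5, warmup=50):
    if step < warmup:
        return base_lr * step / max(1, warmup)
    progress = (step - warmup) / max(1, total_steps - warmup)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(25, 10_000), lr_at(50, 10_000))  # mid-warmup lr vs. peak lr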