{
  "model_type": "krdmodel",
  "vocab_size": 111,
  "dim": 1024,
  "n_layers": 16,
  "n_heads": 16,
  "n_kv_heads": 4,
  "ffn_dim": 2816,
  "max_seq_len": 2048,
  "batch_size": 4,
  "gradient_accumulation": 8,
  "train_steps": 5000,
  "lr": 2e-4,
  "mixed_precision": "fp16",
  "lora_rank": 32,
  "use_flash": true,
  "grad_checkpoint": true
}