{
"architectures": [
"HybriKoModel"
],
"auto_map": {
"AutoConfig": "configuration_hybridko.HybriKoConfig",
"AutoModel": "modeling_hybridko.HybriKoModel",
"AutoModelForCausalLM": "modeling_hybridko.HybriKoModel"
},
"bos_token_id": 2,
"d_model": 768,
"data": {
"num_samples": null,
"path": "data/processed_exp4_plus"
},
"distributed": {
"backend": "nccl",
"enabled": true,
"world_size": 8
},
"dtype": "float32",
"eos_token_id": 3,
"ff_mult": 3,
"max_seq_len": 512,
"model": {
"d_model": 768,
"ff_mult": 3,
"max_seq_len": 1024,
"n_heads": 12,
"n_kv_heads": 3,
"n_layers": 12,
"vocab_size": 32000
},
"model_type": "hybridko",
"n_heads": 12,
"n_kv_heads": 3,
"n_layers": 12,
"pad_token_id": 0,
"tokenizer": {
"character_coverage": 0.9995,
"model_type": "unigram",
"vocab_size": 32000
},
"training": {
"batch_size": 8,
"dropout": 0.15,
"grad_accum_steps": 1,
"grad_clip": 1.0,
"gradient_checkpointing": true,
"label_smoothing": 0.05,
"log_steps": 50,
"max_length": 1024,
"max_steps": 1962,
"min_lr": 5e-05,
"peak_lr": 0.0005,
"save_steps": 500,
"warmup_steps": 100,
"weight_decay": 0.1
},
"transformers_version": "4.57.3",
"vocab_size": 32000
}