{
  "architectures": [
    "HybriKoModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_hybridko.HybriKoConfig",
    "AutoModel": "modeling_hybridko.HybriKoModel",
    "AutoModelForCausalLM": "modeling_hybridko.HybriKoModel"
  },
  "bos_token_id": 2,
  "d_model": 768,
  "data": {
    "num_samples": null,
    "path": "data/processed_exp4_plus"
  },
  "distributed": {
    "backend": "nccl",
    "enabled": true,
    "world_size": 8
  },
  "dtype": "float32",
  "eos_token_id": 3,
  "ff_mult": 3,
  "max_seq_len": 512,
  "model": {
    "d_model": 768,
    "ff_mult": 3,
    "max_seq_len": 1024,
    "n_heads": 12,
    "n_kv_heads": 3,
    "n_layers": 12,
    "vocab_size": 32000
  },
  "model_type": "hybridko",
  "n_heads": 12,
  "n_kv_heads": 3,
  "n_layers": 12,
  "pad_token_id": 0,
  "tokenizer": {
    "character_coverage": 0.9995,
    "model_type": "unigram",
    "vocab_size": 32000
  },
  "training": {
    "batch_size": 8,
    "dropout": 0.15,
    "grad_accum_steps": 1,
    "grad_clip": 1.0,
    "gradient_checkpointing": true,
    "label_smoothing": 0.05,
    "log_steps": 50,
    "max_length": 1024,
    "max_steps": 1962,
    "min_lr": 5e-05,
    "peak_lr": 0.0005,
    "save_steps": 500,
    "warmup_steps": 100,
    "weight_decay": 0.1
  },
  "transformers_version": "4.57.3",
  "vocab_size": 32000
}