Kairos_10m / config.json — commit 0187fe5 ("delete config") by GritLs
{
"architectures": [
"KairosModel"
],
"classifier_dropout": 0,
"context_length": 2048,
"cross_attention_pe_flip": false,
"d_ff": 1024,
"d_kv": 64,
"d_model": 256,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"dtype": "float32",
"eos_token_id": 1,
"feed_forward_proj": "relu",
"initializer_factor": 0.05,
"input_patch_size": 128,
"input_patch_stride": 128,
"instance_rope_input_feature_dim": 128,
"is_cross_attention_pe": true,
"is_encoder_decoder": true,
"is_gated_act": false,
"layer_norm_epsilon": 1e-06,
"levels": 3,
"loss_weight_scheme": "log_decay",
"max_period": "original_rope_init",
"min_period": "original_rope_init",
"model_type": "kairos",
"moe_inter_dim": 1408,
"n_activated_experts": 3,
"n_expert_groups": 1,
"n_limited_groups": 1,
"n_null_experts": 2,
"n_positions": 512,
"num_decoder_layers": 4,
"num_decoder_segments": 2,
"num_heads": 4,
"num_layers": 4,
"pad_token_id": 0,
"position_embedding_type": "instance_wise_rope",
"prediction_length": 64,
"pretrained_model_path": "",
"quantiles": [
0.1,
0.2,
0.3,
0.4,
0.5,
0.6,
0.7,
0.8,
0.9
],
"reg_token_id": 1,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"rope_init": "exp",
"scale_method": "log",
"score_func": "softmax",
"seq_balance_factor": 0.0001,
"target_dist": [
0.05,
0.1,
0.55,
0.15,
0.15
],
"transformers_version": "4.56.1",
"update_bias_rate": 0.01,
"use_cache": true,
"use_reg_token": true,
"vocab_size": 2
}