{
  "architectures": [
    "KairosModel"
  ],
  "classifier_dropout": 0,
  "context_length": 2048,
  "cross_attention_pe_flip": false,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 256,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "dtype": "float32",
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 0.05,
  "input_patch_size": 128,
  "input_patch_stride": 128,
  "instance_rope_input_feature_dim": 128,
  "is_cross_attention_pe": true,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "levels": 3,
  "loss_weight_scheme": "log_decay",
  "max_period": "original_rope_init",
  "min_period": "original_rope_init",
  "model_type": "kairos",
  "moe_inter_dim": 1408,
  "n_activated_experts": 3,
  "n_expert_groups": 1,
  "n_limited_groups": 1,
  "n_null_experts": 2,
  "n_positions": 512,
  "num_decoder_layers": 4,
  "num_decoder_segments": 2,
  "num_heads": 4,
  "num_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "instance_wise_rope",
  "prediction_length": 64,
  "pretrained_model_path": "",
  "quantiles": [
    0.1,
    0.2,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
    0.8,
    0.9
  ],
  "reg_token_id": 1,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "rope_init": "exp",
  "scale_method": "log",
  "score_func": "softmax",
  "seq_balance_factor": 0.0001,
  "target_dist": [
    0.05,
    0.1,
    0.55,
    0.15,
    0.15
  ],
  "transformers_version": "4.56.1",
  "update_bias_rate": 0.01,
  "use_cache": true,
  "use_reg_token": true,
  "vocab_size": 2
}