{ "architectures": [ "TaoNetMiniT2ForCausalLM" ], "auto_map": { "AutoConfig": "configuration_taonet_mini_t2.TaoNetMiniT2Config", "AutoModelForCausalLM": "modeling_taonet_mini_t2.TaoNetMiniT2ForCausalLM", "AutoTokenizer": [ "tokenization_taonet_mini_t2.TaoNetMiniT2Tokenizer", null ] }, "bos_token_id": 1, "checkpoint_name": "pretrain_final_model.pt", "default_ssm_finite_tail_correction": true, "default_ssm_kernel_mode": "recurrent", "eos_token_id": 2, "model_type": "taonet_mini_t2", "pad_token_id": 3, "taotrain_model_config": { "architecture_type": "taonet_ssm", "vocab_size": 8192, "hidden_dim": 1024, "num_layers": 18, "num_heads": 8, "max_seq_length": 512, "d_latent_kv": 768, "d_rope": 128, "hidden_dim_ff": 3072, "dropout": 0.0, "gqa_groups": 1, "use_factorized_embedding": false, "d_embed_rank": 96, "init_std": 0.02, "ssm_core": "dplr", "ssm_hidden_dim": 32, "ssm_mixer_dim": 256, "ssm_num_lanes": 2, "ssm_lane_combine": "channel", "ssm_lane_mode": "split", "ssm_split_mix": "none", "ssm_rank": 1, "ssm_max_low_rank_scale": 0.1, "ssm_finite_tail_correction": true, "ssm_discretization": "bilinear", "ssm_kernel_mode": "recurrent", "ssm_kernel_threshold": 64, "ssm_dt_min": 0.001, "ssm_dt_max": 0.1, "ssm_dt_init": 0.01, "ssm_use_d": true, "ssm_activation": "gelu", "ssm_gate": true, "ssm_input_gate": true, "ssm_gate_type": "channel", "ssm_use_padding_mask": false, "ssm_layer_scale_init": 0.1, "ssm_branch_rms_norm": true, "ssm_branch_rms_eps": 0.000001, "ssm_branch_clip_value": null, "block_residual_rms_norm": false, "block_residual_rms_target": 1.0, "block_residual_rms_cap": null, "block_residual_rms_eps": 0.000001, "ssm_local_shift": true, "ssm_local_shift_init": 0.1, "ssm_local_shift_per_channel": true }, "tokenizer_class": "TaoNetMiniT2Tokenizer", "tokenizer_file": "tokenizer.model", "torch_dtype": "bfloat16", "transformers_version": "4.30.0", "unk_token_id": 0, "use_cache": false, "vocab_size": 8192 }