{
  "architectures": [
    "TokenFormerForCausalLM"
  ],
  "num_layers": 12,
  "hidden_size": 768,
  "num_attention_heads": 12,
  "qkv_slot_num": 768,
  "proj_slot_num": 768,
  "ffn_slot_num": 3072,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "pos_emb": "rotary",
  "rotary_pct": 0.25,
  "no_weight_tying": false,
  "norm": "layernorm_nonparam",
  "final_norm": "layernorm",
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
  "use_bias_in_attn_linear": false,
  "attention_config": [[["tokenformer"], 12]],
  "norm_activation_type": "l2_norm_gelu",
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,
  "init_method": "normal",
  "output_layer_init_method": "wang_init",
  "use_cache": true,
  "torch_dtype": "float16",
  "transformers_version": "4.36.0"
}
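
Assuming this file is saved as `config.json` in a checkpoint directory that also ships the custom TokenFormer modeling code, it could be loaded with Hugging Face `transformers` roughly as sketched below. The path `./tokenformer-150m` is a hypothetical placeholder, and `trust_remote_code=True` is assumed to be needed because `TokenFormerForCausalLM` is not a built-in `transformers` architecture.

```python
# Minimal sketch: load the configuration and (optionally) the model weights.
# "./tokenformer-150m" is a hypothetical local path containing config.json
# and the custom modeling code referenced by "architectures".
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained(
    "./tokenformer-150m",
    trust_remote_code=True,  # assumed necessary for the custom TokenFormer classes
)
print(config.hidden_size, config.num_attention_heads)  # 768, 12 per the config above

model = AutoModelForCausalLM.from_pretrained(
    "./tokenformer-150m",
    trust_remote_code=True,
    torch_dtype=torch.float16,  # matches "torch_dtype": "float16" in the config
)
```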