{ "model_name": "Qwen3-5M-GPT2", "model_type": "Qwen3ForCausalLM", "tokenizer": "gpt2", "dtype": "bfloat16", "vocab_size": 50257, "hidden_size": 384, "num_layers": 12, "num_attention_heads": 6, "num_key_value_heads": 2, "head_dim": 64, "intermediate_size": 1024, "max_position_embeddings": 8192, "rope_theta": 500000, "sliding_window": 512, "use_sliding_window": true, "layer_types": [ "sliding_attention", "sliding_attention", "sliding_attention", "full_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention" ], "max_window_layers": 9, "query_pre_attn_scalar": 256, "parameters": 38184192, "tie_word_embeddings": true, "positional_encoding": "rope", "normalization": "rmsnorm", "activation": "swiglu", "xsa_enabled": false, "xsa_paper": "arxiv 2603.09078" }