C10X
/

wwwww

Model card Files Files and versions

wwwww / metadata.json

C10X's picture

Upload 9 files

7b1332d verified about 1 month ago

history blame contribute delete

605 Bytes

	{
	"model_name": "Qwen3-8M-GPT2",
	"model_type": "Qwen3ForCausalLM",
	"tokenizer": "gpt2",
	"dtype": "bfloat16",
	"vocab_size": 50257,
	"hidden_size": 128,
	"num_layers": 12,
	"num_attention_heads": 4,
	"num_key_value_heads": 2,
	"head_dim": 32,
	"intermediate_size": 384,
	"max_position_embeddings": 2048,
	"rope_theta": 10000,
	"parameters": 8796160,
	"tie_word_embeddings": true,
	"attention_type": "full_attention",
	"positional_encoding": "rope",
	"normalization": "rmsnorm",
	"activation": "swiglu",
	"xsa_enabled": true,
	"xsa_paper": "arxiv 2603.09078"
	}