{ "model_type": "embernet_vlm", "architecture": "BitNet b1.58 MoE VLM", "vision_encoder": { "model_name": "google/siglip-base-patch16-224", "num_image_tokens": 64, "freeze_vision": true }, "language_decoder": { "vocab_size": 32002, "hidden_size": 768, "intermediate_size": 2048, "num_layers": 16, "num_attention_heads": 12, "num_kv_heads": 6, "max_position_embeddings": 4096, "num_experts": 8, "num_experts_per_tok": 2, "use_shared_expert": true, "expert_domains": [ "vision_ocr", "vision_diagram", "code_math_chart", "code_math_formula", "spatial_scene", "spatial_reasoning", "agentic_knowledge", "agentic_reasoning" ], "quantisation": "BitNet b1.58 (ternary)", "activation_bits": 4 }, "torch_dtype": "bfloat16", "transformers_version": ">=4.36.0", "parameter_counts": { "vision_encoder": 107748864, "vision_encoder_breakdown": { "encoder": 92884224, "compressor": 2363904, "pooler": 2412288, "projector": 10088448 }, "decoder_total": 733055360, "decoder_embeddings": 24577536, "decoder_attention": 0, "decoder_router": 98432, "decoder_shared_expert": 75554816, "decoder_domain_experts": 604438528, "num_domain_experts": 8 } }