{ "llama2-7b": { "display_name": "LLaMA 2 7B", "description": "Meta LLaMA 2 7B model", "config": { "model": { "name": "llama2-7b", "num_parameters": "7B", "num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32, "vocab_size": 32000, "max_seq_len": 4096 }, "training": { "batch_size": 4, "gradient_accumulation_steps": 4, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 1 }, "parallelism": { "tensor_parallel_size": 1, "pipeline_parallel_size": 1, "data_parallel_size": 8, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 3, "offload_optimizer": "cpu", "offload_param": "none" }, "hardware": { "num_gpus": 8, "gpu_memory_gb": 80 } } }, "llama2-13b": { "display_name": "LLaMA 2 13B", "description": "Meta LLaMA 2 13B model", "config": { "model": { "name": "llama2-13b", "num_parameters": "13B", "num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40, "vocab_size": 32000, "max_seq_len": 4096 }, "training": { "batch_size": 2, "gradient_accumulation_steps": 8, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 1 }, "parallelism": { "tensor_parallel_size": 1, "pipeline_parallel_size": 1, "data_parallel_size": 8, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 3, "offload_optimizer": "cpu", "offload_param": "none" }, "hardware": { "num_gpus": 8, "gpu_memory_gb": 80 } } }, "llama2-70b": { "display_name": "LLaMA 2 70B", "description": "Meta LLaMA 2 70B model", "config": { "model": { "name": "llama2-70b", "num_parameters": "70B", "num_layers": 80, "hidden_size": 8192, "num_attention_heads": 64, "vocab_size": 32000, "max_seq_len": 4096 }, "training": { "batch_size": 1, "gradient_accumulation_steps": 16, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 2 }, "parallelism": { "tensor_parallel_size": 4, "pipeline_parallel_size": 2, "data_parallel_size": 8, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 3, "offload_optimizer": "cpu", 
"offload_param": "none" }, "hardware": { "num_gpus": 64, "gpu_memory_gb": 80 } } }, "gpt3-175b": { "display_name": "GPT-3 175B", "description": "OpenAI GPT-3 175B model", "config": { "model": { "name": "gpt3-175b", "num_parameters": "175B", "num_layers": 96, "hidden_size": 12288, "num_attention_heads": 96, "vocab_size": 50257, "max_seq_len": 2048 }, "training": { "batch_size": 1, "gradient_accumulation_steps": 1, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 2 }, "parallelism": { "tensor_parallel_size": 8, "pipeline_parallel_size": 16, "data_parallel_size": 8, "sequence_parallel": true }, "engine": { "type": "megatron_lm" }, "hardware": { "num_gpus": 1024, "gpu_memory_gb": 80 } } }, "mixtral-8x7b": { "display_name": "Mixtral 8x7B (MoE)", "description": "Mistral AI Mixtral 8x7B - 46.7B total params, ~12.9B active per token", "config": { "model": { "name": "mixtral-8x7b", "num_parameters": "46.7B", "num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32, "vocab_size": 32000, "max_seq_len": 32768, "moe_enabled": true, "num_experts": 8, "top_k": 2, "expert_intermediate_size": 14336 }, "training": { "batch_size": 2, "gradient_accumulation_steps": 4, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 2 }, "parallelism": { "tensor_parallel_size": 2, "pipeline_parallel_size": 1, "data_parallel_size": 4, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 3, "offload_optimizer": "cpu", "offload_param": "none" }, "hardware": { "num_gpus": 8, "gpu_memory_gb": 80 } } }, "glm-4-9b": { "display_name": "GLM-4 9B (MoE)", "description": "Tsinghua University GLM-4 9B with MoE architecture", "config": { "model": { "name": "glm-4-9b", "num_parameters": "9B", "num_layers": 40, "hidden_size": 4096, "num_attention_heads": 32, "vocab_size": 151552, "max_seq_len": 8192, "moe_enabled": true, "num_experts": 4, "top_k": 2, "expert_intermediate_size": 10240, "shared_expert_intermediate_size": 10240 }, "training": { 
"batch_size": 4, "gradient_accumulation_steps": 4, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 2 }, "parallelism": { "tensor_parallel_size": 1, "pipeline_parallel_size": 1, "data_parallel_size": 4, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 2, "offload_optimizer": "none", "offload_param": "none" }, "hardware": { "num_gpus": 4, "gpu_memory_gb": 80 } } }, "glm-4.7-355b": { "display_name": "GLM-4.7 355B (MoE)", "description": "Tsinghua University GLM-4.7 - Latest flagship with 355B total / 32B active params", "config": { "model": { "name": "glm-4.7-355b", "num_parameters": "355B", "num_layers": 46, "hidden_size": 4096, "num_attention_heads": 96, "vocab_size": 151552, "max_seq_len": 131072, "moe_enabled": true, "num_experts": 128, "top_k": 8, "expert_intermediate_size": 1408, "shared_expert_intermediate_size": 10944 }, "training": { "batch_size": 1, "gradient_accumulation_steps": 16, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 4 }, "parallelism": { "tensor_parallel_size": 8, "pipeline_parallel_size": 4, "data_parallel_size": 16, "sequence_parallel": true }, "engine": { "type": "deepspeed", "zero_stage": 3, "offload_optimizer": "cpu", "offload_param": "cpu" }, "hardware": { "num_gpus": 512, "gpu_memory_gb": 80 } } }, "glm-4.5-air-106b": { "display_name": "GLM-4.5 Air 106B (MoE) ⭐ Air", "description": "Tsinghua University GLM-4.5 Air - 106B total / 12B active params, optimized for deployment", "config": { "model": { "name": "glm-4.5-air-106b", "num_parameters": "106B", "num_layers": 46, "hidden_size": 4096, "num_attention_heads": 96, "vocab_size": 151552, "max_seq_len": 131072, "moe_enabled": true, "num_experts": 128, "top_k": 8, "expert_intermediate_size": 1408, "shared_expert_intermediate_size": 10944 }, "training": { "batch_size": 2, "gradient_accumulation_steps": 8, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 2 }, "parallelism": { "tensor_parallel_size": 4, 
"pipeline_parallel_size": 2, "data_parallel_size": 8, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 3, "offload_optimizer": "cpu", "offload_param": "none" }, "hardware": { "num_gpus": 64, "gpu_memory_gb": 80 } } }, "qwen1.5-moe-a2.7b": { "display_name": "Qwen1.5-MoE-A2.7B", "description": "Alibaba Qwen1.5 MoE - 14B total params, 2.7B active per token", "config": { "model": { "name": "qwen1.5-moe-a2.7b", "num_parameters": "14B", "num_layers": 28, "hidden_size": 5120, "num_attention_heads": 40, "vocab_size": 151936, "max_seq_len": 32768, "moe_enabled": true, "num_experts": 8, "top_k": 4, "expert_intermediate_size": 15360 }, "training": { "batch_size": 2, "gradient_accumulation_steps": 4, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 2 }, "parallelism": { "tensor_parallel_size": 2, "pipeline_parallel_size": 1, "data_parallel_size": 4, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 3, "offload_optimizer": "cpu", "offload_param": "none" }, "hardware": { "num_gpus": 8, "gpu_memory_gb": 80 } } }, "deepseek-moe-16b": { "display_name": "DeepSeek-MoE 16B", "description": "DeepSeek MoE model with 16.4B total params, ~2.7B active per token", "config": { "model": { "name": "deepseek-moe-16b", "num_parameters": "16.4B", "num_layers": 28, "hidden_size": 2048, "num_attention_heads": 16, "vocab_size": 102400, "max_seq_len": 4096, "moe_enabled": true, "num_experts": 64, "top_k": 6, "expert_intermediate_size": 1408, "shared_expert_intermediate_size": 10944 }, "training": { "batch_size": 4, "gradient_accumulation_steps": 4, "optimizer": "adamw", "dtype": "bf16", "activation_checkpointing": 2 }, "parallelism": { "tensor_parallel_size": 2, "pipeline_parallel_size": 1, "data_parallel_size": 4, "sequence_parallel": false }, "engine": { "type": "deepspeed", "zero_stage": 2, "offload_optimizer": "none", "offload_param": "none" }, "hardware": { "num_gpus": 8, "gpu_memory_gb": 80 } } } }