{
"llama2-7b": {
"display_name": "LLaMA 2 7B",
"description": "Meta LLaMA 2 7B model",
"config": {
"model": {
"name": "llama2-7b",
"num_parameters": "7B",
"num_layers": 32,
"hidden_size": 4096,
"num_attention_heads": 32,
"vocab_size": 32000,
"max_seq_len": 4096
},
"training": {
"batch_size": 4,
"gradient_accumulation_steps": 4,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 1
},
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"data_parallel_size": 8,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 3,
"offload_optimizer": "cpu",
"offload_param": "none"
},
"hardware": {
"num_gpus": 8,
"gpu_memory_gb": 80
}
}
},
"llama2-13b": {
"display_name": "LLaMA 2 13B",
"description": "Meta LLaMA 2 13B model",
"config": {
"model": {
"name": "llama2-13b",
"num_parameters": "13B",
"num_layers": 40,
"hidden_size": 5120,
"num_attention_heads": 40,
"vocab_size": 32000,
"max_seq_len": 4096
},
"training": {
"batch_size": 2,
"gradient_accumulation_steps": 8,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 1
},
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"data_parallel_size": 8,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 3,
"offload_optimizer": "cpu",
"offload_param": "none"
},
"hardware": {
"num_gpus": 8,
"gpu_memory_gb": 80
}
}
},
"llama2-70b": {
"display_name": "LLaMA 2 70B",
"description": "Meta LLaMA 2 70B model",
"config": {
"model": {
"name": "llama2-70b",
"num_parameters": "70B",
"num_layers": 80,
"hidden_size": 8192,
"num_attention_heads": 64,
"vocab_size": 32000,
"max_seq_len": 4096
},
"training": {
"batch_size": 1,
"gradient_accumulation_steps": 16,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 2
},
"parallelism": {
"tensor_parallel_size": 4,
"pipeline_parallel_size": 2,
"data_parallel_size": 8,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 3,
"offload_optimizer": "cpu",
"offload_param": "none"
},
"hardware": {
"num_gpus": 64,
"gpu_memory_gb": 80
}
}
},
"gpt3-175b": {
"display_name": "GPT-3 175B",
"description": "OpenAI GPT-3 175B model",
"config": {
"model": {
"name": "gpt3-175b",
"num_parameters": "175B",
"num_layers": 96,
"hidden_size": 12288,
"num_attention_heads": 96,
"vocab_size": 50257,
"max_seq_len": 2048
},
"training": {
"batch_size": 1,
"gradient_accumulation_steps": 1,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 2
},
"parallelism": {
"tensor_parallel_size": 8,
"pipeline_parallel_size": 16,
                "data_parallel_size": 8,
"sequence_parallel": true
},
"engine": {
"type": "megatron_lm"
},
"hardware": {
"num_gpus": 1024,
"gpu_memory_gb": 80
}
}
},
"mixtral-8x7b": {
"display_name": "Mixtral 8x7B (MoE)",
"description": "Mistral AI Mixtral 8x7B - 46.7B total params, ~12.9B active per token",
"config": {
"model": {
"name": "mixtral-8x7b",
"num_parameters": "46.7B",
"num_layers": 32,
"hidden_size": 4096,
"num_attention_heads": 32,
"vocab_size": 32000,
"max_seq_len": 32768,
"moe_enabled": true,
"num_experts": 8,
"top_k": 2,
"expert_intermediate_size": 14336
},
"training": {
"batch_size": 2,
"gradient_accumulation_steps": 4,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 2
},
"parallelism": {
"tensor_parallel_size": 2,
"pipeline_parallel_size": 1,
"data_parallel_size": 4,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 3,
"offload_optimizer": "cpu",
"offload_param": "none"
},
"hardware": {
"num_gpus": 8,
"gpu_memory_gb": 80
}
}
},
"glm-4-9b": {
"display_name": "GLM-4 9B (MoE)",
"description": "Tsinghua University GLM-4 9B with MoE architecture",
"config": {
"model": {
"name": "glm-4-9b",
"num_parameters": "9B",
"num_layers": 40,
"hidden_size": 4096,
"num_attention_heads": 32,
"vocab_size": 151552,
"max_seq_len": 8192,
"moe_enabled": true,
"num_experts": 4,
"top_k": 2,
"expert_intermediate_size": 10240,
"shared_expert_intermediate_size": 10240
},
"training": {
"batch_size": 4,
"gradient_accumulation_steps": 4,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 2
},
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"data_parallel_size": 4,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 2,
"offload_optimizer": "none",
"offload_param": "none"
},
"hardware": {
"num_gpus": 4,
"gpu_memory_gb": 80
}
}
},
"glm-4.7-355b": {
"display_name": "GLM-4.7 355B (MoE)",
"description": "Tsinghua University GLM-4.7 - Latest flagship with 355B total / 32B active params",
"config": {
"model": {
"name": "glm-4.7-355b",
"num_parameters": "355B",
"num_layers": 46,
"hidden_size": 4096,
"num_attention_heads": 96,
"vocab_size": 151552,
"max_seq_len": 131072,
"moe_enabled": true,
"num_experts": 128,
"top_k": 8,
"expert_intermediate_size": 1408,
"shared_expert_intermediate_size": 10944
},
"training": {
"batch_size": 1,
"gradient_accumulation_steps": 16,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 4
},
"parallelism": {
"tensor_parallel_size": 8,
"pipeline_parallel_size": 4,
"data_parallel_size": 16,
"sequence_parallel": true
},
"engine": {
"type": "deepspeed",
"zero_stage": 3,
"offload_optimizer": "cpu",
"offload_param": "cpu"
},
"hardware": {
"num_gpus": 512,
"gpu_memory_gb": 80
}
}
},
"glm-4.5-air-106b": {
        "display_name": "GLM-4.5 Air 106B (MoE) ⭐",
"description": "Tsinghua University GLM-4.5 Air - 106B total / 12B active params, optimized for deployment",
"config": {
"model": {
"name": "glm-4.5-air-106b",
"num_parameters": "106B",
"num_layers": 46,
"hidden_size": 4096,
"num_attention_heads": 96,
"vocab_size": 151552,
"max_seq_len": 131072,
"moe_enabled": true,
"num_experts": 128,
"top_k": 8,
"expert_intermediate_size": 1408,
"shared_expert_intermediate_size": 10944
},
"training": {
"batch_size": 2,
"gradient_accumulation_steps": 8,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 2
},
"parallelism": {
"tensor_parallel_size": 4,
"pipeline_parallel_size": 2,
"data_parallel_size": 8,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 3,
"offload_optimizer": "cpu",
"offload_param": "none"
},
"hardware": {
"num_gpus": 64,
"gpu_memory_gb": 80
}
}
},
"qwen1.5-moe-a2.7b": {
"display_name": "Qwen1.5-MoE-A2.7B",
"description": "Alibaba Qwen1.5 MoE - 14B total params, 2.7B active per token",
"config": {
"model": {
"name": "qwen1.5-moe-a2.7b",
"num_parameters": "14B",
"num_layers": 28,
"hidden_size": 5120,
"num_attention_heads": 40,
"vocab_size": 151936,
"max_seq_len": 32768,
"moe_enabled": true,
"num_experts": 8,
"top_k": 4,
"expert_intermediate_size": 15360
},
"training": {
"batch_size": 2,
"gradient_accumulation_steps": 4,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 2
},
"parallelism": {
"tensor_parallel_size": 2,
"pipeline_parallel_size": 1,
"data_parallel_size": 4,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 3,
"offload_optimizer": "cpu",
"offload_param": "none"
},
"hardware": {
"num_gpus": 8,
"gpu_memory_gb": 80
}
}
},
"deepseek-moe-16b": {
"display_name": "DeepSeek-MoE 16B",
"description": "DeepSeek MoE model with 16.4B total params, ~2.7B active per token",
"config": {
"model": {
"name": "deepseek-moe-16b",
"num_parameters": "16.4B",
"num_layers": 28,
"hidden_size": 2048,
"num_attention_heads": 16,
"vocab_size": 102400,
"max_seq_len": 4096,
"moe_enabled": true,
"num_experts": 64,
"top_k": 6,
"expert_intermediate_size": 1408,
"shared_expert_intermediate_size": 10944
},
"training": {
"batch_size": 4,
"gradient_accumulation_steps": 4,
"optimizer": "adamw",
"dtype": "bf16",
"activation_checkpointing": 2
},
"parallelism": {
"tensor_parallel_size": 2,
"pipeline_parallel_size": 1,
"data_parallel_size": 4,
"sequence_parallel": false
},
"engine": {
"type": "deepspeed",
"zero_stage": 2,
"offload_optimizer": "none",
"offload_param": "none"
},
"hardware": {
"num_gpus": 8,
"gpu_memory_gb": 80
}
}
}
}