{
  "llama2-7b": {
    "display_name": "LLaMA 2 7B",
    "description": "Meta LLaMA 2 7B model",
    "config": {
      "model": {
        "name": "llama2-7b",
        "num_parameters": "7B",
        "num_layers": 32,
        "hidden_size": 4096,
        "num_attention_heads": 32,
        "vocab_size": 32000,
        "max_seq_len": 4096
      },
      "training": {
        "batch_size": 4,
        "gradient_accumulation_steps": 4,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 1
      },
      "parallelism": {
        "tensor_parallel_size": 1,
        "pipeline_parallel_size": 1,
        "data_parallel_size": 8,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 3,
        "offload_optimizer": "cpu",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 8,
        "gpu_memory_gb": 80
      }
    }
  },
  "llama2-13b": {
    "display_name": "LLaMA 2 13B",
    "description": "Meta LLaMA 2 13B model",
    "config": {
      "model": {
        "name": "llama2-13b",
        "num_parameters": "13B",
        "num_layers": 40,
        "hidden_size": 5120,
        "num_attention_heads": 40,
        "vocab_size": 32000,
        "max_seq_len": 4096
      },
      "training": {
        "batch_size": 2,
        "gradient_accumulation_steps": 8,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 1
      },
      "parallelism": {
        "tensor_parallel_size": 1,
        "pipeline_parallel_size": 1,
        "data_parallel_size": 8,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 3,
        "offload_optimizer": "cpu",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 8,
        "gpu_memory_gb": 80
      }
    }
  },
  "llama2-70b": {
    "display_name": "LLaMA 2 70B",
    "description": "Meta LLaMA 2 70B model",
    "config": {
      "model": {
        "name": "llama2-70b",
        "num_parameters": "70B",
        "num_layers": 80,
        "hidden_size": 8192,
        "num_attention_heads": 64,
        "vocab_size": 32000,
        "max_seq_len": 4096
      },
      "training": {
        "batch_size": 1,
        "gradient_accumulation_steps": 16,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 2
      },
      "parallelism": {
        "tensor_parallel_size": 4,
        "pipeline_parallel_size": 2,
        "data_parallel_size": 8,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 3,
        "offload_optimizer": "cpu",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 64,
        "gpu_memory_gb": 80
      }
    }
  },
  "gpt3-175b": {
    "display_name": "GPT-3 175B",
    "description": "OpenAI GPT-3 175B model",
    "config": {
      "model": {
        "name": "gpt3-175b",
        "num_parameters": "175B",
        "num_layers": 96,
        "hidden_size": 12288,
        "num_attention_heads": 96,
        "vocab_size": 50257,
        "max_seq_len": 2048
      },
      "training": {
        "batch_size": 1,
        "gradient_accumulation_steps": 1,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 2
      },
      "parallelism": {
        "tensor_parallel_size": 8,
        "pipeline_parallel_size": 16,
        "data_parallel_size": 1,
        "sequence_parallel": true
      },
      "engine": {
        "type": "megatron_lm"
      },
      "hardware": {
        "num_gpus": 1024,
        "gpu_memory_gb": 80
      }
    }
  },
  "mixtral-8x7b": {
    "display_name": "Mixtral 8x7B (MoE)",
    "description": "Mistral AI Mixtral 8x7B - 46.7B total params, ~12.9B active per token",
    "config": {
      "model": {
        "name": "mixtral-8x7b",
        "num_parameters": "46.7B",
        "num_layers": 32,
        "hidden_size": 4096,
        "num_attention_heads": 32,
        "vocab_size": 32000,
        "max_seq_len": 32768,
        "moe_enabled": true,
        "num_experts": 8,
        "top_k": 2,
        "expert_intermediate_size": 14336
      },
      "training": {
        "batch_size": 2,
        "gradient_accumulation_steps": 4,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 2
      },
      "parallelism": {
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 1,
        "data_parallel_size": 4,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 3,
        "offload_optimizer": "cpu",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 8,
        "gpu_memory_gb": 80
      }
    }
  },
  "glm-4-9b": {
    "display_name": "GLM-4 9B (MoE)",
    "description": "Tsinghua University GLM-4 9B with MoE architecture",
    "config": {
      "model": {
        "name": "glm-4-9b",
        "num_parameters": "9B",
        "num_layers": 40,
        "hidden_size": 4096,
        "num_attention_heads": 32,
        "vocab_size": 151552,
        "max_seq_len": 8192,
        "moe_enabled": true,
        "num_experts": 4,
        "top_k": 2,
        "expert_intermediate_size": 10240,
        "shared_expert_intermediate_size": 10240
      },
      "training": {
        "batch_size": 4,
        "gradient_accumulation_steps": 4,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 2
      },
      "parallelism": {
        "tensor_parallel_size": 1,
        "pipeline_parallel_size": 1,
        "data_parallel_size": 4,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 2,
        "offload_optimizer": "none",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 4,
        "gpu_memory_gb": 80
      }
    }
  },
  "glm-4.7-355b": {
    "display_name": "GLM-4.7 355B (MoE)",
    "description": "Tsinghua University GLM-4.7 - Latest flagship with 355B total / 32B active params",
    "config": {
      "model": {
        "name": "glm-4.7-355b",
        "num_parameters": "355B",
        "num_layers": 46,
        "hidden_size": 4096,
        "num_attention_heads": 96,
        "vocab_size": 151552,
        "max_seq_len": 131072,
        "moe_enabled": true,
        "num_experts": 128,
        "top_k": 8,
        "expert_intermediate_size": 1408,
        "shared_expert_intermediate_size": 10944
      },
      "training": {
        "batch_size": 1,
        "gradient_accumulation_steps": 16,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 4
      },
      "parallelism": {
        "tensor_parallel_size": 8,
        "pipeline_parallel_size": 4,
        "data_parallel_size": 16,
        "sequence_parallel": true
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 3,
        "offload_optimizer": "cpu",
        "offload_param": "cpu"
      },
      "hardware": {
        "num_gpus": 512,
        "gpu_memory_gb": 80
      }
    }
  },
  "glm-4.5-air-106b": {
    "display_name": "GLM-4.5 Air 106B (MoE) ⭐ Air",
    "description": "Tsinghua University GLM-4.5 Air - 106B total / 12B active params, optimized for deployment",
    "config": {
      "model": {
        "name": "glm-4.5-air-106b",
        "num_parameters": "106B",
        "num_layers": 46,
        "hidden_size": 4096,
        "num_attention_heads": 96,
        "vocab_size": 151552,
        "max_seq_len": 131072,
        "moe_enabled": true,
        "num_experts": 128,
        "top_k": 8,
        "expert_intermediate_size": 1408,
        "shared_expert_intermediate_size": 10944
      },
      "training": {
        "batch_size": 2,
        "gradient_accumulation_steps": 8,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 2
      },
      "parallelism": {
        "tensor_parallel_size": 4,
        "pipeline_parallel_size": 2,
        "data_parallel_size": 8,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 3,
        "offload_optimizer": "cpu",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 64,
        "gpu_memory_gb": 80
      }
    }
  },
  "qwen1.5-moe-a2.7b": {
    "display_name": "Qwen1.5-MoE-A2.7B",
    "description": "Alibaba Qwen1.5 MoE - 14B total params, 2.7B active per token",
    "config": {
      "model": {
        "name": "qwen1.5-moe-a2.7b",
        "num_parameters": "14B",
        "num_layers": 28,
        "hidden_size": 5120,
        "num_attention_heads": 40,
        "vocab_size": 151936,
        "max_seq_len": 32768,
        "moe_enabled": true,
        "num_experts": 8,
        "top_k": 4,
        "expert_intermediate_size": 15360
      },
      "training": {
        "batch_size": 2,
        "gradient_accumulation_steps": 4,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 2
      },
      "parallelism": {
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 1,
        "data_parallel_size": 4,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 3,
        "offload_optimizer": "cpu",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 8,
        "gpu_memory_gb": 80
      }
    }
  },
  "deepseek-moe-16b": {
    "display_name": "DeepSeek-MoE 16B",
    "description": "DeepSeek MoE model with 16.4B total params, ~2.7B active per token",
    "config": {
      "model": {
        "name": "deepseek-moe-16b",
        "num_parameters": "16.4B",
        "num_layers": 28,
        "hidden_size": 2048,
        "num_attention_heads": 16,
        "vocab_size": 102400,
        "max_seq_len": 4096,
        "moe_enabled": true,
        "num_experts": 64,
        "top_k": 6,
        "expert_intermediate_size": 1408,
        "shared_expert_intermediate_size": 10944
      },
      "training": {
        "batch_size": 4,
        "gradient_accumulation_steps": 4,
        "optimizer": "adamw",
        "dtype": "bf16",
        "activation_checkpointing": 2
      },
      "parallelism": {
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 1,
        "data_parallel_size": 4,
        "sequence_parallel": false
      },
      "engine": {
        "type": "deepspeed",
        "zero_stage": 2,
        "offload_optimizer": "none",
        "offload_param": "none"
      },
      "hardware": {
        "num_gpus": 8,
        "gpu_memory_gb": 80
      }
    }
  }
}