{
"model_name": "Helion-V1.5-XL",
"total_parameters": 16247832576,
"trainable_parameters": 16247832576,
"non_trainable_parameters": 0,
"memory_footprint": {
"model_weights": {
"fp32": {
"size_gb": 64.991,
"size_bytes": 64991330304,
"bits_per_param": 32
},
"fp16": {
"size_gb": 32.496,
"size_bytes": 32495665152,
"bits_per_param": 16
},
"bf16": {
"size_gb": 32.496,
"size_bytes": 32495665152,
"bits_per_param": 16
},
"int8": {
"size_gb": 16.248,
"size_bytes": 16247832576,
"bits_per_param": 8
},
"int4": {
"size_gb": 9.124,
"size_bytes": 9124416288,
"bits_per_param": 4.5,
"note": "Includes quantization overhead"
}
},
"inference_memory": {
"fp32": {
"static_memory_gb": 64.991,
"kv_cache_per_token_mb": 0.393,
"activation_memory_gb": 2.048,
"total_memory_gb": 67.039,
"recommended_vram_gb": 80
},
"bf16": {
"static_memory_gb": 32.496,
"kv_cache_per_token_mb": 0.196,
"activation_memory_gb": 1.024,
"total_memory_gb": 33.520,
"recommended_vram_gb": 40
},
"int8": {
"static_memory_gb": 16.248,
"kv_cache_per_token_mb": 0.196,
"activation_memory_gb": 0.768,
"total_memory_gb": 17.016,
"recommended_vram_gb": 24
},
"int4": {
"static_memory_gb": 9.124,
"kv_cache_per_token_mb": 0.196,
"activation_memory_gb": 0.512,
"total_memory_gb": 9.636,
"recommended_vram_gb": 12
}
},
"training_memory": {
"model_states": {
"model_parameters_gb": 32.496,
"gradients_gb": 32.496,
"optimizer_states_gb": 129.983,
"total_gb": 194.975
},
"activation_memory": {
"per_layer_mb": 147.456,
"total_layers": 48,
"gradient_checkpointing_enabled": true,
"with_checkpointing_gb": 3.538,
"without_checkpointing_gb": 7.077
},
"total_per_gpu": {
"with_gradient_checkpointing_gb": 198.513,
"without_gradient_checkpointing_gb": 202.052,
"recommended_vram_gb": 80,
"batch_size_per_gpu": 1
}
}
},
"layer_breakdown": {
"embedding_layer": {
"parameters": 614400000,
"memory_bf16_mb": 1228.8
},
"decoder_layers": {
"total_layers": 48,
"parameters_per_layer": 325640192,
"memory_per_layer_bf16_mb": 651.28,
"total_parameters": 15630729216,
"total_memory_bf16_mb": 31261.44
},
"output_layer": {
"lm_head_parameters": 614400000,
"memory_bf16_mb": 1228.8,
"note": "Weights not tied with embeddings"
},
"normalization_layers": {
"parameters": 2703360,
"memory_bf16_mb": 5.41
}
},
"component_breakdown": {
"attention_layers": {
"q_proj": {
"shape": [6144, 6144],
"parameters_per_layer": 37748736,
"total_parameters": 1811939328
},
"k_proj": {
"shape": [6144, 1536],
"parameters_per_layer": 9437184,
"total_parameters": 452984832
},
"v_proj": {
"shape": [6144, 1536],
"parameters_per_layer": 9437184,
"total_parameters": 452984832
},
"o_proj": {
"shape": [6144, 6144],
"parameters_per_layer": 37748736,
"total_parameters": 1811939328
},
"total_attention_parameters": 4529848320
},
"mlp_layers": {
"gate_proj": {
"shape": [6144, 24576],
"parameters_per_layer": 150994944,
"total_parameters": 7247757312
},
"up_proj": {
"shape": [6144, 24576],
"parameters_per_layer": 150994944,
"total_parameters": 7247757312
},
"down_proj": {
"shape": [24576, 6144],
"parameters_per_layer": 150994944,
"total_parameters": 7247757312
},
"total_mlp_parameters": 21743271936
}
},
"kv_cache_specifications": {
"num_layers": 48,
"num_kv_heads": 8,
"head_dim": 192,
"hidden_size_kv": 1536,
"cache_size_per_token": {
"bf16_bytes": 196608,
"bf16_mb": 0.1875,
"fp32_bytes": 393216,
"fp32_mb": 0.375
},
"max_context_length": 16384,
"max_cache_size": {
"bf16_gb": 3.072,
"fp32_gb": 6.144
}
},
"inference_benchmarks": {
"hardware_profiles": [
{
"gpu": "NVIDIA A100 80GB",
"precision": "bf16",
"batch_size": 1,
"context_length": 2048,
"tokens_per_second": 47.3,
"latency_ms": 21.1,
"memory_used_gb": 34.2
},
{
"gpu": "NVIDIA A100 80GB",
"precision": "int8",
"batch_size": 1,
"context_length": 2048,
"tokens_per_second": 89.6,
"latency_ms": 11.2,
"memory_used_gb": 17.8
},
{
"gpu": "NVIDIA A100 80GB",
"precision": "int4",
"batch_size": 1,
"context_length": 2048,
"tokens_per_second": 134.2,
"latency_ms": 7.5,
"memory_used_gb": 10.4
},
{
"gpu": "NVIDIA H100 80GB",
"precision": "bf16",
"batch_size": 1,
"context_length": 2048,
"tokens_per_second": 78.1,
"latency_ms": 12.8,
"memory_used_gb": 34.2
},
{
"gpu": "NVIDIA H100 80GB",
"precision": "int4",
"batch_size": 1,
"context_length": 2048,
"tokens_per_second": 218.7,
"latency_ms": 4.6,
"memory_used_gb": 10.4
},
{
"gpu": "NVIDIA RTX 4090",
"precision": "int4",
"batch_size": 1,
"context_length": 2048,
"tokens_per_second": 87.3,
"latency_ms": 11.5,
"memory_used_gb": 10.4
}
]
},
"optimization_recommendations": {
"for_inference": {
"under_12gb": "Use int4 quantization with context length <= 2048",
"12gb_to_24gb": "Use int8 quantization or int4 with longer context",
"24gb_to_40gb": "Use bf16 precision for best quality",
"over_40gb": "Use bf16 with large batch sizes or long contexts"
},
"for_fine_tuning": {
"lora": {
"minimum_vram_gb": 24,
"recommended_vram_gb": 40,
"trainable_parameters_percent": 0.1
},
"qlora": {
"minimum_vram_gb": 16,
"recommended_vram_gb": 24,
"base_precision": "int4",
"adapter_precision": "bf16"
},
"full_fine_tuning": {
"minimum_vram_gb": 80,
"recommended_setup": "Multi-GPU with FSDP",
"gpus_required": 8
}
}
},
"memory_efficient_techniques": {
"quantization": {
"int8": {
"memory_reduction": "50%",
"quality_impact": "minimal",
"speedup": "1.9x"
},
"int4": {
"memory_reduction": "72%",
"quality_impact": "slight degradation",
"speedup": "2.8x"
}
},
"flash_attention": {
"memory_reduction": "proportional to sequence length",
"speedup": "2-3x for long sequences",
"supported": true
},
"gradient_checkpointing": {
"memory_reduction": "50% activation memory",
"speed_penalty": "20-30% slower",
"recommended_for_training": true
},
"cpu_offloading": {
"memory_reduction": "up to 80%",
"speed_penalty": "10-50x slower",
"use_case": "inference on limited hardware"
}
}
}