{
  "model_name": "Helion-V1.5-XL",
  "total_parameters": 16247832576,
  "trainable_parameters": 16247832576,
  "non_trainable_parameters": 0,
  "memory_footprint": {
    "model_weights": {
      "fp32": {
        "size_gb": 64.991,
        "size_bytes": 64991330304,
        "bits_per_param": 32
      },
      "fp16": {
        "size_gb": 32.496,
        "size_bytes": 32495665152,
        "bits_per_param": 16
      },
      "bf16": {
        "size_gb": 32.496,
        "size_bytes": 32495665152,
        "bits_per_param": 16
      },
      "int8": {
        "size_gb": 16.248,
        "size_bytes": 16247832576,
        "bits_per_param": 8
      },
      "int4": {
        "size_gb": 9.124,
        "size_bytes": 9124416288,
        "bits_per_param": 4.5,
        "note": "Includes quantization overhead"
      }
    },
    "inference_memory": {
      "fp32": {
        "static_memory_gb": 64.991,
        "kv_cache_per_token_mb": 0.393,
        "activation_memory_gb": 2.048,
        "total_memory_gb": 67.039,
        "recommended_vram_gb": 80
      },
      "bf16": {
        "static_memory_gb": 32.496,
        "kv_cache_per_token_mb": 0.196,
        "activation_memory_gb": 1.024,
        "total_memory_gb": 33.520,
        "recommended_vram_gb": 40
      },
      "int8": {
        "static_memory_gb": 16.248,
        "kv_cache_per_token_mb": 0.196,
        "activation_memory_gb": 0.768,
        "total_memory_gb": 17.016,
        "recommended_vram_gb": 24
      },
      "int4": {
        "static_memory_gb": 9.124,
        "kv_cache_per_token_mb": 0.196,
        "activation_memory_gb": 0.512,
        "total_memory_gb": 9.636,
        "recommended_vram_gb": 12
      }
    },
    "training_memory": {
      "model_states": {
        "model_parameters_gb": 32.496,
        "gradients_gb": 32.496,
        "optimizer_states_gb": 129.983,
        "total_gb": 194.975
      },
      "activation_memory": {
        "per_layer_mb": 147.456,
        "total_layers": 48,
        "gradient_checkpointing_enabled": true,
        "with_checkpointing_gb": 3.538,
        "without_checkpointing_gb": 7.077
      },
      "total_per_gpu": {
        "with_gradient_checkpointing_gb": 198.513,
        "without_gradient_checkpointing_gb": 202.052,
        "recommended_vram_gb": 80,
        "batch_size_per_gpu": 1
      }
    }
  },
  "layer_breakdown": {
    "embedding_layer": {
      "parameters": 614400000,
      "memory_bf16_mb": 1228.8
    },
    "decoder_layers": {
      "total_layers": 48,
      "parameters_per_layer": 325640192,
      "memory_per_layer_bf16_mb": 651.28,
      "total_parameters": 15630729216,
      "total_memory_bf16_mb": 31261.44
    },
    "output_layer": {
      "lm_head_parameters": 614400000,
      "memory_bf16_mb": 1228.8,
      "note": "Weights not tied with embeddings"
    },
    "normalization_layers": {
      "parameters": 2703360,
      "memory_bf16_mb": 5.41
    }
  },
  "component_breakdown": {
    "attention_layers": {
      "q_proj": {
        "shape": [6144, 6144],
        "parameters_per_layer": 37748736,
        "total_parameters": 1811939328
      },
      "k_proj": {
        "shape": [6144, 1536],
        "parameters_per_layer": 9437184,
        "total_parameters": 452984832
      },
      "v_proj": {
        "shape": [6144, 1536],
        "parameters_per_layer": 9437184,
        "total_parameters": 452984832
      },
      "o_proj": {
        "shape": [6144, 6144],
        "parameters_per_layer": 37748736,
        "total_parameters": 1811939328
      },
      "total_attention_parameters": 4529848320
    },
    "mlp_layers": {
      "gate_proj": {
        "shape": [6144, 24576],
        "parameters_per_layer": 150994944,
        "total_parameters": 7247757312
      },
      "up_proj": {
        "shape": [6144, 24576],
        "parameters_per_layer": 150994944,
        "total_parameters": 7247757312
      },
      "down_proj": {
        "shape": [24576, 6144],
        "parameters_per_layer": 150994944,
        "total_parameters": 7247757312
      },
      "total_mlp_parameters": 21743271936
    }
  },
  "kv_cache_specifications": {
    "num_layers": 48,
    "num_kv_heads": 8,
    "head_dim": 192,
    "hidden_size_kv": 1536,
    "cache_size_per_token": {
      "bf16_bytes": 196608,
      "bf16_mb": 0.1875,
      "fp32_bytes": 393216,
      "fp32_mb": 0.375
    },
    "max_context_length": 16384,
    "max_cache_size": {
      "bf16_gb": 3.072,
      "fp32_gb": 6.144
    }
  },
  "inference_benchmarks": {
    "hardware_profiles": [
      {
        "gpu": "NVIDIA A100 80GB",
        "precision": "bf16",
        "batch_size": 1,
        "context_length": 2048,
        "tokens_per_second": 47.3,
        "latency_ms": 21.1,
        "memory_used_gb": 34.2
      },
      {
        "gpu": "NVIDIA A100 80GB",
        "precision": "int8",
        "batch_size": 1,
        "context_length": 2048,
        "tokens_per_second": 89.6,
        "latency_ms": 11.2,
        "memory_used_gb": 17.8
      },
      {
        "gpu": "NVIDIA A100 80GB",
        "precision": "int4",
        "batch_size": 1,
        "context_length": 2048,
        "tokens_per_second": 134.2,
        "latency_ms": 7.5,
        "memory_used_gb": 10.4
      },
      {
        "gpu": "NVIDIA H100 80GB",
        "precision": "bf16",
        "batch_size": 1,
        "context_length": 2048,
        "tokens_per_second": 78.1,
        "latency_ms": 12.8,
        "memory_used_gb": 34.2
      },
      {
        "gpu": "NVIDIA H100 80GB",
        "precision": "int4",
        "batch_size": 1,
        "context_length": 2048,
        "tokens_per_second": 218.7,
        "latency_ms": 4.6,
        "memory_used_gb": 10.4
      },
      {
        "gpu": "NVIDIA RTX 4090",
        "precision": "int4",
        "batch_size": 1,
        "context_length": 2048,
        "tokens_per_second": 87.3,
        "latency_ms": 11.5,
        "memory_used_gb": 10.4
      }
    ]
  },
  "optimization_recommendations": {
    "for_inference": {
      "under_12gb": "Use int4 quantization with context length <= 2048",
      "12gb_to_24gb": "Use int8 quantization or int4 with longer context",
      "24gb_to_40gb": "Use bf16 precision for best quality",
      "over_40gb": "Use bf16 with large batch sizes or long contexts"
    },
    "for_fine_tuning": {
      "lora": {
        "minimum_vram_gb": 24,
        "recommended_vram_gb": 40,
        "trainable_parameters_percent": 0.1
      },
      "qlora": {
        "minimum_vram_gb": 16,
        "recommended_vram_gb": 24,
        "base_precision": "int4",
        "adapter_precision": "bf16"
      },
      "full_fine_tuning": {
        "minimum_vram_gb": 80,
        "recommended_setup": "Multi-GPU with FSDP",
        "gpus_required": 8
      }
    }
  },
  "memory_efficient_techniques": {
    "quantization": {
      "int8": {
        "memory_reduction": "50%",
        "quality_impact": "minimal",
        "speedup": "1.9x"
      },
      "int4": {
        "memory_reduction": "72%",
        "quality_impact": "slight degradation",
        "speedup": "2.8x"
      }
    },
    "flash_attention": {
      "memory_reduction": "proportional to sequence length",
      "speedup": "2-3x for long sequences",
      "supported": true
    },
    "gradient_checkpointing": {
      "memory_reduction": "50% activation memory",
      "speed_penalty": "20-30% slower",
      "recommended_for_training": true
    },
    "cpu_offloading": {
      "memory_reduction": "up to 80%",
      "speed_penalty": "10-50x slower",
      "use_case": "inference on limited hardware"
    }
  }
}