# flash-attention-explorer / src/constants.py
# Commit a0y0346 / e2de6cd: Add H200 GPU support and improve roofline chart visibility
"""
Constants for FlashAttention Explorer Space.
Model configurations and GPU specifications.
"""
# Real HuggingFace models with their configurations.
# Each entry carries the attention geometry needed for FLOP / memory
# estimates (layers, Q/KV head counts, head dim, hidden width, vocab size)
# plus display metadata (description, license).
MODEL_CONFIGS = {
    "SmolLM2-360M": {
        "model_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
        "layers": 32,
        "q_heads": 15,
        "kv_heads": 5,
        "head_dim": 64,
        "hidden_dim": 960,
        "vocab_size": 49152,
        "description": "Fast demos, scaling curves",
        "license": "Apache 2.0",
    },
    "Qwen2.5-0.5B": {
        "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
        "layers": 24,
        "q_heads": 14,
        "kv_heads": 2,
        "head_dim": 64,
        "hidden_dim": 896,
        "vocab_size": 151936,
        "description": "Prefill/Decode comparison",
        "license": "Apache 2.0",
    },
    "Llama-3.2-1B": {
        "model_id": "meta-llama/Llama-3.2-1B-Instruct",
        "layers": 16,
        "q_heads": 32,
        "kv_heads": 8,
        "head_dim": 64,
        "hidden_dim": 2048,
        "vocab_size": 128256,
        "description": "GQA demonstration",
        "license": "Llama 3.2",
    },
}

# Derive the GQA ratio (Q heads served per KV head) from the head counts
# instead of hard-coding it per model, so the value can never drift out of
# sync if q_heads/kv_heads above are edited. All listed models divide evenly
# (15/5=3, 14/2=7, 32/8=4).
for _cfg in MODEL_CONFIGS.values():
    _cfg["gqa_ratio"] = _cfg["q_heads"] // _cfg["kv_heads"]
del _cfg
# GPU specifications for roofline analysis.
# Keyed by short GPU id; key insertion order is meaningful (it drives any
# UI listing built from this dict), so do not reorder entries.
#   tflops_fp16    - peak dense FP16 Tensor Core throughput, in TFLOPS
#   bandwidth_gbps - peak HBM/GDDR memory bandwidth, in GB/s
#   memory_gb      - total device memory, in GB
#   sram_kb        - shared memory (SRAM) per SM, in KB; bounds the tile
#                    sizes FlashAttention can keep on-chip
GPU_SPECS = {
    "A10G": {
        "name": "NVIDIA A10G",
        "tflops_fp16": 125,
        "bandwidth_gbps": 600,  # GB/s
        "memory_gb": 24,
        "sram_kb": 192,  # Shared memory per SM
    },
    "A100_80GB": {
        "name": "NVIDIA A100 (80GB)",
        "tflops_fp16": 312,
        "bandwidth_gbps": 2000,
        "memory_gb": 80,
        "sram_kb": 192,
    },
    "H100": {
        "name": "NVIDIA H100 (80GB)",
        "tflops_fp16": 989,
        "bandwidth_gbps": 3350,
        "memory_gb": 80,
        "sram_kb": 256,
    },
    "H200": {
        "name": "NVIDIA H200 (141GB)",
        "tflops_fp16": 989,  # Same compute as H100
        "bandwidth_gbps": 4800,  # HBM3e: 4.8 TB/s
        "memory_gb": 141,
        "sram_kb": 256,
    },
    "L40S": {
        "name": "NVIDIA L40S",
        "tflops_fp16": 362,
        "bandwidth_gbps": 864,
        "memory_gb": 48,
        "sram_kb": 192,
    },
    "L4": {
        "name": "NVIDIA L4",
        "tflops_fp16": 121,
        "bandwidth_gbps": 300,
        "memory_gb": 24,
        "sram_kb": 96,
    },
}
# Default GPU for Zero GPU Spaces; must be a key of GPU_SPECS.
DEFAULT_GPU = "A10G"
# Attention backend names: maps the internal backend id (as used by
# PyTorch SDPA backend selection) to a human-readable display label.
ATTENTION_BACKENDS = {
    "math": "Math (Naive)",
    "flash": "FlashAttention",
    "mem_efficient": "Memory Efficient",
}
# Sequence length options for benchmarks (tokens; powers of two).
SEQ_LENGTH_OPTIONS = [512, 1024, 2048, 4096]
# Batch size options
BATCH_SIZE_OPTIONS = [1, 2, 4, 8]
# Default values — each must appear in its corresponding options list /
# config dict above (DEFAULT_MODEL is a key of MODEL_CONFIGS).
DEFAULT_SEQ_LEN = 1024
DEFAULT_BATCH_SIZE = 1
DEFAULT_MODEL = "SmolLM2-360M"