# Hugging Face Space: FlashAttention Explorer (runs on ZeroGPU hardware)
"""
Constants for FlashAttention Explorer Space.
Model configurations and GPU specifications.
"""

# Real HuggingFace models with their configurations.
# NOTE: "gqa_ratio" (query heads per KV head) is derived below from
# q_heads // kv_heads instead of being hand-maintained, so it can never
# drift out of sync with the head counts.
MODEL_CONFIGS = {
    "SmolLM2-360M": {
        "model_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
        "layers": 32,
        "q_heads": 15,
        "kv_heads": 5,
        "head_dim": 64,
        "hidden_dim": 960,
        "vocab_size": 49152,
        "description": "Fast demos, scaling curves",
        "license": "Apache 2.0",
    },
    "Qwen2.5-0.5B": {
        "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
        "layers": 24,
        "q_heads": 14,
        "kv_heads": 2,
        "head_dim": 64,
        "hidden_dim": 896,
        "vocab_size": 151936,
        "description": "Prefill/Decode comparison",
        "license": "Apache 2.0",
    },
    "Llama-3.2-1B": {
        "model_id": "meta-llama/Llama-3.2-1B-Instruct",
        "layers": 16,
        "q_heads": 32,
        "kv_heads": 8,
        "head_dim": 64,
        "hidden_dim": 2048,
        "vocab_size": 128256,
        "description": "GQA demonstration",
        "license": "Llama 3.2",
    },
}

# Derive the GQA ratio for every model (15/5=3, 14/2=7, 32/8=4).
for _cfg in MODEL_CONFIGS.values():
    _cfg["gqa_ratio"] = _cfg["q_heads"] // _cfg["kv_heads"]
del _cfg
# GPU specifications for roofline analysis.
# Columns: key, display name, peak FP16 TFLOPS, memory bandwidth (GB/s),
# memory capacity (GB), shared memory per SM (KB).
_GPU_TABLE = (
    ("A10G", "NVIDIA A10G", 125, 600, 24, 192),
    ("A100_80GB", "NVIDIA A100 (80GB)", 312, 2000, 80, 192),
    ("H100", "NVIDIA H100 (80GB)", 989, 3350, 80, 256),
    # H200: same compute as H100, but HBM3e gives 4.8 TB/s bandwidth.
    ("H200", "NVIDIA H200 (141GB)", 989, 4800, 141, 256),
    ("L40S", "NVIDIA L40S", 362, 864, 48, 192),
    ("L4", "NVIDIA L4", 121, 300, 24, 96),
)

GPU_SPECS = {
    key: {
        "name": name,
        "tflops_fp16": tflops,
        "bandwidth_gbps": bandwidth,
        "memory_gb": memory,
        "sram_kb": sram,
    }
    for key, name, tflops, bandwidth, memory, sram in _GPU_TABLE
}
# Hardware default for Zero GPU Spaces.
DEFAULT_GPU = "A10G"

# Display labels for the selectable attention backends.
ATTENTION_BACKENDS = dict(
    math="Math (Naive)",
    flash="FlashAttention",
    mem_efficient="Memory Efficient",
)

# Benchmark sweep choices offered to the user.
SEQ_LENGTH_OPTIONS = [512, 1024, 2048, 4096]
BATCH_SIZE_OPTIONS = [1, 2, 4, 8]

# Initial selections before the user changes anything.
DEFAULT_SEQ_LEN = 1024
DEFAULT_BATCH_SIZE = 1
DEFAULT_MODEL = "SmolLM2-360M"