# flash-attention-explorer / src/constants.py
# Commit a0y0346 / e2de6cd: Add H200 GPU support and improve roofline chart visibility
"""
Constants for FlashAttention Explorer Space.
Model configurations and GPU specifications.
"""
# Real HuggingFace models with their configurations.
# Each entry carries the attention geometry needed for FLOP / memory
# estimates (layers, Q/KV head counts, head dim, hidden width, vocab size)
# plus display metadata (description, license).
MODEL_CONFIGS = {
    "SmolLM2-360M": {
        "model_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
        "layers": 32,
        "q_heads": 15,
        "kv_heads": 5,
        "head_dim": 64,
        "hidden_dim": 960,
        "vocab_size": 49152,
        "description": "Fast demos, scaling curves",
        "license": "Apache 2.0",
    },
    "Qwen2.5-0.5B": {
        "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
        "layers": 24,
        "q_heads": 14,
        "kv_heads": 2,
        "head_dim": 64,
        "hidden_dim": 896,
        "vocab_size": 151936,
        "description": "Prefill/Decode comparison",
        "license": "Apache 2.0",
    },
    "Llama-3.2-1B": {
        "model_id": "meta-llama/Llama-3.2-1B-Instruct",
        "layers": 16,
        "q_heads": 32,
        "kv_heads": 8,
        "head_dim": 64,
        "hidden_dim": 2048,
        "vocab_size": 128256,
        "description": "GQA demonstration",
        "license": "Llama 3.2",
    },
}

# Derive the GQA ratio (Q heads served per KV head) from the head counts
# instead of hard-coding it per model, so the value can never drift out of
# sync if q_heads/kv_heads above are edited. All listed models divide evenly
# (15/5=3, 14/2=7, 32/8=4).
for _cfg in MODEL_CONFIGS.values():
    _cfg["gqa_ratio"] = _cfg["q_heads"] // _cfg["kv_heads"]
del _cfg
# GPU specifications for roofline analysis.
# Keyed by short GPU id; key insertion order is meaningful (it drives any
# UI listing built from this dict), so do not reorder entries.
#   tflops_fp16    - peak dense FP16 Tensor Core throughput, in TFLOPS
#   bandwidth_gbps - peak HBM/GDDR memory bandwidth, in GB/s
#   memory_gb      - total device memory, in GB
#   sram_kb        - shared memory (SRAM) per SM, in KB; bounds the tile
#                    sizes FlashAttention can keep on-chip
GPU_SPECS = {
    "A10G": {
        "name": "NVIDIA A10G",
        "tflops_fp16": 125,
        "bandwidth_gbps": 600,  # GB/s
        "memory_gb": 24,
        "sram_kb": 192,  # Shared memory per SM
    },
    "A100_80GB": {
        "name": "NVIDIA A100 (80GB)",
        "tflops_fp16": 312,
        "bandwidth_gbps": 2000,
        "memory_gb": 80,
        "sram_kb": 192,
    },
    "H100": {
        "name": "NVIDIA H100 (80GB)",
        "tflops_fp16": 989,
        "bandwidth_gbps": 3350,
        "memory_gb": 80,
        "sram_kb": 256,
    },
    "H200": {
        "name": "NVIDIA H200 (141GB)",
        "tflops_fp16": 989,  # Same compute as H100
        "bandwidth_gbps": 4800,  # HBM3e: 4.8 TB/s
        "memory_gb": 141,
        "sram_kb": 256,
    },
    "L40S": {
        "name": "NVIDIA L40S",
        "tflops_fp16": 362,
        "bandwidth_gbps": 864,
        "memory_gb": 48,
        "sram_kb": 192,
    },
    "L4": {
        "name": "NVIDIA L4",
        "tflops_fp16": 121,
        "bandwidth_gbps": 300,
        "memory_gb": 24,
        "sram_kb": 96,
    },
}
# Default GPU for Zero GPU Spaces; must be a key of GPU_SPECS.
DEFAULT_GPU = "A10G"
# Attention backend names: maps the internal backend id (as used by
# PyTorch SDPA backend selection) to a human-readable display label.
ATTENTION_BACKENDS = {
    "math": "Math (Naive)",
    "flash": "FlashAttention",
    "mem_efficient": "Memory Efficient",
}
# Sequence length options for benchmarks (tokens; powers of two).
SEQ_LENGTH_OPTIONS = [512, 1024, 2048, 4096]
# Batch size options
BATCH_SIZE_OPTIONS = [1, 2, 4, 8]
# Default values — each must appear in its corresponding options list /
# config dict above (DEFAULT_MODEL is a key of MODEL_CONFIGS).
DEFAULT_SEQ_LEN = 1024
DEFAULT_BATCH_SIZE = 1
DEFAULT_MODEL = "SmolLM2-360M"