a0y0346 committed on
Commit
e2de6cd
·
1 Parent(s): d5ef20e

Add H200 GPU support and improve roofline chart visibility

Browse files

- Add H200, L40S, L4, RTX 4090 to GPU detection
- Use dynamic memory detection for unknown GPUs
- Add visible labels with backgrounds on roofline chart
- Show GPU specs in chart title
- Use annotations with arrows instead of inline text

Files changed (2) hide show
  1. src/benchmark.py +214 -60
  2. src/constants.py +22 -1
src/benchmark.py CHANGED
@@ -15,7 +15,6 @@ from .constants import GPU_SPECS, ATTENTION_BACKENDS, MODEL_CONFIGS, DEFAULT_GPU
15
  def detect_gpu() -> dict:
16
  """
17
  Detect the actual GPU and return its specs.
18
- Falls back to A10G specs if GPU not recognized.
19
 
20
  Returns:
21
  Dict with GPU name and specs
@@ -23,48 +22,110 @@ def detect_gpu() -> dict:
23
  if not torch.cuda.is_available():
24
  return {"name": "CPU (No GPU)", "detected": False, **GPU_SPECS[DEFAULT_GPU]}
25
 
26
- gpu_name = torch.cuda.get_device_name(0).lower()
 
27
 
28
- # Match against known GPUs
29
- if "a10" in gpu_name:
30
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["A10G"]}
31
- elif "a100" in gpu_name:
32
- if "80" in gpu_name:
33
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["A100_80GB"]}
34
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["A100_80GB"]}
 
 
 
 
 
 
 
 
 
 
 
35
  elif "h100" in gpu_name:
36
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["H100"]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  elif "t4" in gpu_name:
38
- # T4 specs (common on free tier)
39
  return {
40
  "detected": True,
41
- "detected_name": torch.cuda.get_device_name(0),
42
  "name": "NVIDIA T4",
43
  "tflops_fp16": 65,
44
  "bandwidth_gbps": 320,
45
- "memory_gb": 16,
46
  "sram_kb": 64,
47
  }
48
  elif "v100" in gpu_name:
49
  return {
50
  "detected": True,
51
- "detected_name": torch.cuda.get_device_name(0),
52
  "name": "NVIDIA V100",
53
  "tflops_fp16": 125,
54
  "bandwidth_gbps": 900,
55
- "memory_gb": 32,
 
 
 
 
 
 
 
 
 
 
56
  "sram_kb": 128,
57
  }
58
  else:
59
- # Unknown GPU - use A10G as fallback with actual name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  return {
61
  "detected": True,
62
- "detected_name": torch.cuda.get_device_name(0),
63
- "name": f"{torch.cuda.get_device_name(0)} (using A10G specs)",
64
- "tflops_fp16": GPU_SPECS["A10G"]["tflops_fp16"],
65
- "bandwidth_gbps": GPU_SPECS["A10G"]["bandwidth_gbps"],
66
- "memory_gb": GPU_SPECS["A10G"]["memory_gb"],
67
- "sram_kb": GPU_SPECS["A10G"]["sram_kb"],
 
68
  }
69
 
70
 
@@ -529,12 +590,29 @@ def create_roofline_chart(
529
  fig.add_trace(go.Scatter(
530
  x=[m["arith_intensity"]],
531
  y=[m["achieved_tflops"]],
532
- mode="markers+text",
533
- name=f"Math (Measured: {m['achieved_tflops']:.1f} TFLOPS)",
534
- marker=dict(size=15, color="rgba(239, 68, 68, 0.9)", symbol="circle"),
535
- text=[f"Math<br>{m['time_ms']:.1f}ms"],
536
- textposition="top center",
537
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
 
539
  # Flash backend
540
  if "flash" in benchmark_metrics:
@@ -542,12 +620,28 @@ def create_roofline_chart(
542
  fig.add_trace(go.Scatter(
543
  x=[m["arith_intensity"]],
544
  y=[m["achieved_tflops"]],
545
- mode="markers+text",
546
- name=f"Flash (Measured: {m['achieved_tflops']:.1f} TFLOPS)",
547
- marker=dict(size=15, color="rgba(34, 197, 94, 0.9)", symbol="circle"),
548
- text=[f"Flash<br>{m['time_ms']:.1f}ms"],
549
- textposition="top center",
550
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
 
552
  # Memory-efficient backend
553
  if "mem_efficient" in benchmark_metrics:
@@ -555,12 +649,28 @@ def create_roofline_chart(
555
  fig.add_trace(go.Scatter(
556
  x=[m["arith_intensity"]],
557
  y=[m["achieved_tflops"]],
558
- mode="markers+text",
559
- name=f"MemEfficient (Measured: {m['achieved_tflops']:.1f} TFLOPS)",
560
- marker=dict(size=15, color="rgba(59, 130, 246, 0.9)", symbol="circle"),
561
- text=[f"MemEff<br>{m['time_ms']:.1f}ms"],
562
- textposition="top center",
563
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  else:
565
  # Plot THEORETICAL approximations
566
  title_suffix = " (Theoretical)"
@@ -572,12 +682,25 @@ def create_roofline_chart(
572
  fig.add_trace(go.Scatter(
573
  x=[std_intensity],
574
  y=[std_achieved],
575
- mode="markers+text",
576
  name="Standard (Theoretical)",
577
- marker=dict(size=15, color="rgba(239, 68, 68, 0.5)", symbol="circle-open"),
578
- text=["Standard"],
579
- textposition="top center",
580
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
 
582
  # FlashAttention - compute bound
583
  flash_intensity = 200
@@ -586,12 +709,25 @@ def create_roofline_chart(
586
  fig.add_trace(go.Scatter(
587
  x=[flash_intensity],
588
  y=[flash_achieved],
589
- mode="markers+text",
590
  name="Flash (Theoretical)",
591
- marker=dict(size=15, color="rgba(34, 197, 94, 0.5)", symbol="circle-open"),
592
- text=["FlashAttention"],
593
- textposition="top center",
594
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
595
 
596
  # Add ridge point marker
597
  fig.add_trace(go.Scatter(
@@ -602,27 +738,44 @@ def create_roofline_chart(
602
  marker=dict(size=10, color="rgba(0, 0, 0, 0.6)", symbol="diamond"),
603
  ))
604
 
605
- # Add annotations
606
  fig.add_annotation(
607
- x=np.log10(3),
608
- y=peak_tflops * 0.2,
609
- text="Memory Bound<br>(limited by bandwidth)",
610
  showarrow=False,
611
- font=dict(size=10, color="rgba(239, 68, 68, 0.8)"),
 
 
 
 
612
  )
613
 
614
  fig.add_annotation(
615
- x=np.log10(500),
616
- y=peak_tflops * 0.85,
617
- text="Compute Bound<br>(limited by TFLOPS)",
618
  showarrow=False,
619
- font=dict(size=10, color="rgba(34, 197, 94, 0.8)"),
 
 
 
 
620
  )
621
 
 
 
 
 
 
 
622
  fig.update_layout(
623
  title=dict(
624
- text=f"Roofline Model: {gpu['name']}{title_suffix}",
 
 
625
  x=0.5,
 
626
  ),
627
  xaxis=dict(
628
  title="Arithmetic Intensity (FLOPs/byte)",
@@ -631,16 +784,17 @@ def create_roofline_chart(
631
  ),
632
  yaxis=dict(
633
  title="Performance (TFLOPS)",
634
- range=[0, peak_tflops * 1.1],
635
  ),
636
- height=400,
637
- margin=dict(l=60, r=40, t=60, b=60),
638
  legend=dict(
639
  orientation="h",
640
  yanchor="bottom",
641
- y=-0.35,
642
  xanchor="center",
643
- x=0.5
 
644
  ),
645
  showlegend=True,
646
  )
 
15
def _gpu_memory_gb(default: float = 24.0) -> float:
    """Total memory of CUDA device 0 in GiB, or *default* if the query fails."""
    try:
        return torch.cuda.get_device_properties(0).total_memory / (1024**3)
    except Exception:
        return default


def _inline_spec(raw_name: str, name: str, tflops_fp16: float,
                 bandwidth_gbps: float, mem_gb: float, sram_kb: int) -> dict:
    """Build a spec dict for a detected GPU that has no entry in GPU_SPECS.

    Memory comes from the live device query (rounded to whole GB) rather than
    a hard-coded value, so variants with different memory sizes report correctly.
    """
    return {
        "detected": True,
        "detected_name": raw_name,
        "name": name,
        "tflops_fp16": tflops_fp16,
        "bandwidth_gbps": bandwidth_gbps,
        "memory_gb": round(mem_gb),
        "sram_kb": sram_kb,
    }


def _estimate_unknown_gpu(raw_name: str, mem_gb: float) -> dict:
    """Heuristic specs for an unrecognized GPU: more memory usually means newer/faster."""
    if mem_gb >= 70:
        est_tflops, est_bw = 500, 2000
    elif mem_gb >= 40:
        est_tflops, est_bw = 300, 1500
    elif mem_gb >= 20:
        est_tflops, est_bw = 125, 600
    else:
        est_tflops, est_bw = 65, 300
    spec = _inline_spec(raw_name, raw_name, est_tflops, est_bw, mem_gb, 128)
    spec["estimated"] = True  # flag that these numbers are a guess
    return spec


def detect_gpu() -> dict:
    """
    Detect the actual GPU and return its specs.

    Known GPUs are matched by substring against the lowercased CUDA device
    name; the checks are ordered so longer names win over their prefixes
    ("h200" before "h100", "a100" before "a10", "l40" before "l4").
    Unknown GPUs get specs estimated from memory size and carry
    ``"estimated": True``.

    Returns:
        Dict with GPU name and specs (tflops_fp16, bandwidth_gbps,
        memory_gb, sram_kb) plus detection metadata.
    """
    if not torch.cuda.is_available():
        return {"name": "CPU (No GPU)", "detected": False, **GPU_SPECS[DEFAULT_GPU]}

    gpu_name_raw = torch.cuda.get_device_name(0)
    gpu_name = gpu_name_raw.lower()

    # Drives memory_gb for every GPU that lacks a canonical GPU_SPECS entry.
    mem_gb = _gpu_memory_gb()

    # Match against known GPUs (ordered from newest to oldest).
    if "h200" in gpu_name:
        # H200: H100-class compute paired with HBM3e at 4.8 TB/s.
        return _inline_spec(gpu_name_raw, "NVIDIA H200", 989, 4800, mem_gb, 256)
    elif "h100" in gpu_name:
        # These three keep their fixed GPU_SPECS memory_gb on purpose.
        return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["H100"]}
    elif "a100" in gpu_name:
        return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["A100_80GB"]}
    elif "a10" in gpu_name:
        return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["A10G"]}
    elif "l40" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA L40S", 362, 864, mem_gb, 192)
    elif "l4" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA L4", 121, 300, mem_gb, 96)
    elif "t4" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA T4", 65, 320, mem_gb, 64)
    elif "v100" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA V100", 125, 900, mem_gb, 128)
    elif "4090" in gpu_name:
        # The previous extra test for "rtx 4090" was redundant: it implies "4090".
        return _inline_spec(gpu_name_raw, "NVIDIA RTX 4090", 330, 1008, mem_gb, 128)
    else:
        # Unknown GPU - estimate specs based on memory size; keep the actual
        # device name (no "using X specs" suffix).
        return _estimate_unknown_gpu(gpu_name_raw, mem_gb)
130
 
131
 
 
590
  fig.add_trace(go.Scatter(
591
  x=[m["arith_intensity"]],
592
  y=[m["achieved_tflops"]],
593
+ mode="markers",
594
+ name=f"Math ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
595
+ marker=dict(size=16, color="#dc2626", symbol="circle",
596
+ line=dict(color="white", width=2)),
 
597
  ))
598
+ # Add label as annotation for better visibility
599
+ fig.add_annotation(
600
+ x=np.log10(m["arith_intensity"]),
601
+ y=m["achieved_tflops"],
602
+ text=f"<b>Math</b><br>{m['time_ms']:.1f}ms",
603
+ showarrow=True,
604
+ arrowhead=2,
605
+ arrowsize=1,
606
+ arrowwidth=1,
607
+ arrowcolor="#dc2626",
608
+ ax=0,
609
+ ay=-40,
610
+ font=dict(size=10, color="#dc2626"),
611
+ bgcolor="rgba(255, 255, 255, 0.95)",
612
+ bordercolor="#dc2626",
613
+ borderwidth=1,
614
+ borderpad=3,
615
+ )
616
 
617
  # Flash backend
618
  if "flash" in benchmark_metrics:
 
620
  fig.add_trace(go.Scatter(
621
  x=[m["arith_intensity"]],
622
  y=[m["achieved_tflops"]],
623
+ mode="markers",
624
+ name=f"Flash ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
625
+ marker=dict(size=16, color="#16a34a", symbol="circle",
626
+ line=dict(color="white", width=2)),
 
627
  ))
628
+ fig.add_annotation(
629
+ x=np.log10(m["arith_intensity"]),
630
+ y=m["achieved_tflops"],
631
+ text=f"<b>Flash</b><br>{m['time_ms']:.1f}ms",
632
+ showarrow=True,
633
+ arrowhead=2,
634
+ arrowsize=1,
635
+ arrowwidth=1,
636
+ arrowcolor="#16a34a",
637
+ ax=0,
638
+ ay=-40,
639
+ font=dict(size=10, color="#16a34a"),
640
+ bgcolor="rgba(255, 255, 255, 0.95)",
641
+ bordercolor="#16a34a",
642
+ borderwidth=1,
643
+ borderpad=3,
644
+ )
645
 
646
  # Memory-efficient backend
647
  if "mem_efficient" in benchmark_metrics:
 
649
  fig.add_trace(go.Scatter(
650
  x=[m["arith_intensity"]],
651
  y=[m["achieved_tflops"]],
652
+ mode="markers",
653
+ name=f"MemEff ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
654
+ marker=dict(size=16, color="#2563eb", symbol="circle",
655
+ line=dict(color="white", width=2)),
 
656
  ))
657
+ fig.add_annotation(
658
+ x=np.log10(m["arith_intensity"]),
659
+ y=m["achieved_tflops"],
660
+ text=f"<b>MemEff</b><br>{m['time_ms']:.1f}ms",
661
+ showarrow=True,
662
+ arrowhead=2,
663
+ arrowsize=1,
664
+ arrowwidth=1,
665
+ arrowcolor="#2563eb",
666
+ ax=30, # Offset to avoid overlap
667
+ ay=-30,
668
+ font=dict(size=10, color="#2563eb"),
669
+ bgcolor="rgba(255, 255, 255, 0.95)",
670
+ bordercolor="#2563eb",
671
+ borderwidth=1,
672
+ borderpad=3,
673
+ )
674
  else:
675
  # Plot THEORETICAL approximations
676
  title_suffix = " (Theoretical)"
 
682
  fig.add_trace(go.Scatter(
683
  x=[std_intensity],
684
  y=[std_achieved],
685
+ mode="markers",
686
  name="Standard (Theoretical)",
687
+ marker=dict(size=15, color="rgba(220, 38, 38, 0.6)", symbol="circle-open",
688
+ line=dict(width=2)),
 
689
  ))
690
+ fig.add_annotation(
691
+ x=np.log10(std_intensity),
692
+ y=std_achieved,
693
+ text="<b>Standard</b><br>(theoretical)",
694
+ showarrow=True,
695
+ arrowhead=2,
696
+ ax=0,
697
+ ay=-35,
698
+ font=dict(size=10, color="#dc2626"),
699
+ bgcolor="rgba(255, 255, 255, 0.9)",
700
+ bordercolor="rgba(220, 38, 38, 0.5)",
701
+ borderwidth=1,
702
+ borderpad=3,
703
+ )
704
 
705
  # FlashAttention - compute bound
706
  flash_intensity = 200
 
709
  fig.add_trace(go.Scatter(
710
  x=[flash_intensity],
711
  y=[flash_achieved],
712
+ mode="markers",
713
  name="Flash (Theoretical)",
714
+ marker=dict(size=15, color="rgba(22, 163, 74, 0.6)", symbol="circle-open",
715
+ line=dict(width=2)),
 
716
  ))
717
+ fig.add_annotation(
718
+ x=np.log10(flash_intensity),
719
+ y=flash_achieved,
720
+ text="<b>FlashAttention</b><br>(theoretical)",
721
+ showarrow=True,
722
+ arrowhead=2,
723
+ ax=0,
724
+ ay=-35,
725
+ font=dict(size=10, color="#16a34a"),
726
+ bgcolor="rgba(255, 255, 255, 0.9)",
727
+ bordercolor="rgba(22, 163, 74, 0.5)",
728
+ borderwidth=1,
729
+ borderpad=3,
730
+ )
731
 
732
  # Add ridge point marker
733
  fig.add_trace(go.Scatter(
 
738
  marker=dict(size=10, color="rgba(0, 0, 0, 0.6)", symbol="diamond"),
739
  ))
740
 
741
+ # Add annotations with better visibility (white background)
742
  fig.add_annotation(
743
+ x=np.log10(5),
744
+ y=peak_tflops * 0.1,
745
+ text="<b>Memory Bound</b><br>(limited by bandwidth)",
746
  showarrow=False,
747
+ font=dict(size=11, color="#dc2626"), # Solid red
748
+ bgcolor="rgba(255, 255, 255, 0.9)",
749
+ bordercolor="#dc2626",
750
+ borderwidth=1,
751
+ borderpad=4,
752
  )
753
 
754
  fig.add_annotation(
755
+ x=np.log10(300),
756
+ y=peak_tflops * 0.65,
757
+ text="<b>Compute Bound</b><br>(limited by TFLOPS)",
758
  showarrow=False,
759
+ font=dict(size=11, color="#16a34a"), # Solid green
760
+ bgcolor="rgba(255, 255, 255, 0.9)",
761
+ bordercolor="#16a34a",
762
+ borderwidth=1,
763
+ borderpad=4,
764
  )
765
 
766
+ # Use detected_name if available, otherwise use name
767
+ display_name = gpu.get("detected_name", gpu.get("name", "GPU"))
768
+
769
+ # Add estimated indicator if specs were estimated
770
+ estimated_note = " (estimated specs)" if gpu.get("estimated") else ""
771
+
772
  fig.update_layout(
773
  title=dict(
774
+ text=f"Roofline Model: {display_name}{title_suffix}{estimated_note}<br>"
775
+ f"<span style='font-size:12px;color:#666'>"
776
+ f"Peak: {peak_tflops} TFLOPS | Bandwidth: {bandwidth_gbps} GB/s</span>",
777
  x=0.5,
778
+ font=dict(size=14),
779
  ),
780
  xaxis=dict(
781
  title="Arithmetic Intensity (FLOPs/byte)",
 
784
  ),
785
  yaxis=dict(
786
  title="Performance (TFLOPS)",
787
+ range=[0, peak_tflops * 1.2], # More headroom for text
788
  ),
789
+ height=420,
790
+ margin=dict(l=60, r=40, t=80, b=80), # More room for title and legend
791
  legend=dict(
792
  orientation="h",
793
  yanchor="bottom",
794
+ y=-0.30,
795
  xanchor="center",
796
+ x=0.5,
797
+ font=dict(size=10),
798
  ),
799
  showlegend=True,
800
  )
src/constants.py CHANGED
@@ -46,7 +46,7 @@ MODEL_CONFIGS = {
46
  # GPU specifications for roofline analysis
47
  GPU_SPECS = {
48
  "A10G": {
49
- "name": "NVIDIA A10G (Zero GPU)",
50
  "tflops_fp16": 125,
51
  "bandwidth_gbps": 600, # GB/s
52
  "memory_gb": 24,
@@ -66,6 +66,27 @@ GPU_SPECS = {
66
  "memory_gb": 80,
67
  "sram_kb": 256,
68
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  }
70
 
71
  # Default GPU for Zero GPU Spaces
 
46
  # GPU specifications for roofline analysis
47
  GPU_SPECS = {
48
  "A10G": {
49
+ "name": "NVIDIA A10G",
50
  "tflops_fp16": 125,
51
  "bandwidth_gbps": 600, # GB/s
52
  "memory_gb": 24,
 
66
  "memory_gb": 80,
67
  "sram_kb": 256,
68
  },
69
+ "H200": {
70
+ "name": "NVIDIA H200 (141GB)",
71
+ "tflops_fp16": 989, # Same compute as H100
72
+ "bandwidth_gbps": 4800, # HBM3e: 4.8 TB/s
73
+ "memory_gb": 141,
74
+ "sram_kb": 256,
75
+ },
76
+ "L40S": {
77
+ "name": "NVIDIA L40S",
78
+ "tflops_fp16": 362,
79
+ "bandwidth_gbps": 864,
80
+ "memory_gb": 48,
81
+ "sram_kb": 192,
82
+ },
83
+ "L4": {
84
+ "name": "NVIDIA L4",
85
+ "tflops_fp16": 121,
86
+ "bandwidth_gbps": 300,
87
+ "memory_gb": 24,
88
+ "sram_kb": 96,
89
+ },
90
  }
91
 
92
  # Default GPU for Zero GPU Spaces