a0y0346 committed on
Commit
e2de6cd
·
1 Parent(s): d5ef20e

Add H200 GPU support and improve roofline chart visibility

Browse files

- Add H200, L40S, L4, RTX 4090 to GPU detection
- Use dynamic memory detection for unknown GPUs
- Add visible labels with backgrounds on roofline chart
- Show GPU specs in chart title
- Use annotations with arrows instead of inline text

Files changed (2) hide show
  1. src/benchmark.py +214 -60
  2. src/constants.py +22 -1
src/benchmark.py CHANGED
@@ -15,7 +15,6 @@ from .constants import GPU_SPECS, ATTENTION_BACKENDS, MODEL_CONFIGS, DEFAULT_GPU
15
  def detect_gpu() -> dict:
16
  """
17
  Detect the actual GPU and return its specs.
18
- Falls back to A10G specs if GPU not recognized.
19
 
20
  Returns:
21
  Dict with GPU name and specs
@@ -23,48 +22,110 @@ def detect_gpu() -> dict:
23
  if not torch.cuda.is_available():
24
  return {"name": "CPU (No GPU)", "detected": False, **GPU_SPECS[DEFAULT_GPU]}
25
 
26
- gpu_name = torch.cuda.get_device_name(0).lower()
 
27
 
28
- # Match against known GPUs
29
- if "a10" in gpu_name:
30
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["A10G"]}
31
- elif "a100" in gpu_name:
32
- if "80" in gpu_name:
33
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["A100_80GB"]}
34
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["A100_80GB"]}
 
 
 
 
 
 
 
 
 
 
 
35
  elif "h100" in gpu_name:
36
- return {"detected": True, "detected_name": torch.cuda.get_device_name(0), **GPU_SPECS["H100"]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  elif "t4" in gpu_name:
38
- # T4 specs (common on free tier)
39
  return {
40
  "detected": True,
41
- "detected_name": torch.cuda.get_device_name(0),
42
  "name": "NVIDIA T4",
43
  "tflops_fp16": 65,
44
  "bandwidth_gbps": 320,
45
- "memory_gb": 16,
46
  "sram_kb": 64,
47
  }
48
  elif "v100" in gpu_name:
49
  return {
50
  "detected": True,
51
- "detected_name": torch.cuda.get_device_name(0),
52
  "name": "NVIDIA V100",
53
  "tflops_fp16": 125,
54
  "bandwidth_gbps": 900,
55
- "memory_gb": 32,
 
 
 
 
 
 
 
 
 
 
56
  "sram_kb": 128,
57
  }
58
  else:
59
- # Unknown GPU - use A10G as fallback with actual name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  return {
61
  "detected": True,
62
- "detected_name": torch.cuda.get_device_name(0),
63
- "name": f"{torch.cuda.get_device_name(0)} (using A10G specs)",
64
- "tflops_fp16": GPU_SPECS["A10G"]["tflops_fp16"],
65
- "bandwidth_gbps": GPU_SPECS["A10G"]["bandwidth_gbps"],
66
- "memory_gb": GPU_SPECS["A10G"]["memory_gb"],
67
- "sram_kb": GPU_SPECS["A10G"]["sram_kb"],
 
68
  }
69
 
70
 
@@ -529,12 +590,29 @@ def create_roofline_chart(
529
  fig.add_trace(go.Scatter(
530
  x=[m["arith_intensity"]],
531
  y=[m["achieved_tflops"]],
532
- mode="markers+text",
533
- name=f"Math (Measured: {m['achieved_tflops']:.1f} TFLOPS)",
534
- marker=dict(size=15, color="rgba(239, 68, 68, 0.9)", symbol="circle"),
535
- text=[f"Math<br>{m['time_ms']:.1f}ms"],
536
- textposition="top center",
537
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
 
539
  # Flash backend
540
  if "flash" in benchmark_metrics:
@@ -542,12 +620,28 @@ def create_roofline_chart(
542
  fig.add_trace(go.Scatter(
543
  x=[m["arith_intensity"]],
544
  y=[m["achieved_tflops"]],
545
- mode="markers+text",
546
- name=f"Flash (Measured: {m['achieved_tflops']:.1f} TFLOPS)",
547
- marker=dict(size=15, color="rgba(34, 197, 94, 0.9)", symbol="circle"),
548
- text=[f"Flash<br>{m['time_ms']:.1f}ms"],
549
- textposition="top center",
550
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
 
552
  # Memory-efficient backend
553
  if "mem_efficient" in benchmark_metrics:
@@ -555,12 +649,28 @@ def create_roofline_chart(
555
  fig.add_trace(go.Scatter(
556
  x=[m["arith_intensity"]],
557
  y=[m["achieved_tflops"]],
558
- mode="markers+text",
559
- name=f"MemEfficient (Measured: {m['achieved_tflops']:.1f} TFLOPS)",
560
- marker=dict(size=15, color="rgba(59, 130, 246, 0.9)", symbol="circle"),
561
- text=[f"MemEff<br>{m['time_ms']:.1f}ms"],
562
- textposition="top center",
563
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  else:
565
  # Plot THEORETICAL approximations
566
  title_suffix = " (Theoretical)"
@@ -572,12 +682,25 @@ def create_roofline_chart(
572
  fig.add_trace(go.Scatter(
573
  x=[std_intensity],
574
  y=[std_achieved],
575
- mode="markers+text",
576
  name="Standard (Theoretical)",
577
- marker=dict(size=15, color="rgba(239, 68, 68, 0.5)", symbol="circle-open"),
578
- text=["Standard"],
579
- textposition="top center",
580
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
 
582
  # FlashAttention - compute bound
583
  flash_intensity = 200
@@ -586,12 +709,25 @@ def create_roofline_chart(
586
  fig.add_trace(go.Scatter(
587
  x=[flash_intensity],
588
  y=[flash_achieved],
589
- mode="markers+text",
590
  name="Flash (Theoretical)",
591
- marker=dict(size=15, color="rgba(34, 197, 94, 0.5)", symbol="circle-open"),
592
- text=["FlashAttention"],
593
- textposition="top center",
594
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
595
 
596
  # Add ridge point marker
597
  fig.add_trace(go.Scatter(
@@ -602,27 +738,44 @@ def create_roofline_chart(
602
  marker=dict(size=10, color="rgba(0, 0, 0, 0.6)", symbol="diamond"),
603
  ))
604
 
605
- # Add annotations
606
  fig.add_annotation(
607
- x=np.log10(3),
608
- y=peak_tflops * 0.2,
609
- text="Memory Bound<br>(limited by bandwidth)",
610
  showarrow=False,
611
- font=dict(size=10, color="rgba(239, 68, 68, 0.8)"),
 
 
 
 
612
  )
613
 
614
  fig.add_annotation(
615
- x=np.log10(500),
616
- y=peak_tflops * 0.85,
617
- text="Compute Bound<br>(limited by TFLOPS)",
618
  showarrow=False,
619
- font=dict(size=10, color="rgba(34, 197, 94, 0.8)"),
 
 
 
 
620
  )
621
 
 
 
 
 
 
 
622
  fig.update_layout(
623
  title=dict(
624
- text=f"Roofline Model: {gpu['name']}{title_suffix}",
 
 
625
  x=0.5,
 
626
  ),
627
  xaxis=dict(
628
  title="Arithmetic Intensity (FLOPs/byte)",
@@ -631,16 +784,17 @@ def create_roofline_chart(
631
  ),
632
  yaxis=dict(
633
  title="Performance (TFLOPS)",
634
- range=[0, peak_tflops * 1.1],
635
  ),
636
- height=400,
637
- margin=dict(l=60, r=40, t=60, b=60),
638
  legend=dict(
639
  orientation="h",
640
  yanchor="bottom",
641
- y=-0.35,
642
  xanchor="center",
643
- x=0.5
 
644
  ),
645
  showlegend=True,
646
  )
 
15
def _gpu_memory_gb(default: float = 24.0) -> float:
    """Total memory of CUDA device 0 in GiB, or *default* if the query fails."""
    try:
        return torch.cuda.get_device_properties(0).total_memory / (1024**3)
    except Exception:
        return default


def _inline_spec(raw_name: str, name: str, tflops_fp16: float,
                 bandwidth_gbps: float, mem_gb: float, sram_kb: int) -> dict:
    """Build a spec dict for a detected GPU that has no entry in GPU_SPECS.

    Memory comes from the live device query (rounded to whole GB) rather than
    a hard-coded value, so variants with different memory sizes report correctly.
    """
    return {
        "detected": True,
        "detected_name": raw_name,
        "name": name,
        "tflops_fp16": tflops_fp16,
        "bandwidth_gbps": bandwidth_gbps,
        "memory_gb": round(mem_gb),
        "sram_kb": sram_kb,
    }


def _estimate_unknown_gpu(raw_name: str, mem_gb: float) -> dict:
    """Heuristic specs for an unrecognized GPU: more memory usually means newer/faster."""
    if mem_gb >= 70:
        est_tflops, est_bw = 500, 2000
    elif mem_gb >= 40:
        est_tflops, est_bw = 300, 1500
    elif mem_gb >= 20:
        est_tflops, est_bw = 125, 600
    else:
        est_tflops, est_bw = 65, 300
    spec = _inline_spec(raw_name, raw_name, est_tflops, est_bw, mem_gb, 128)
    spec["estimated"] = True  # flag that these numbers are a guess
    return spec


def detect_gpu() -> dict:
    """
    Detect the actual GPU and return its specs.

    Known GPUs are matched by substring against the lowercased CUDA device
    name; the checks are ordered so longer names win over their prefixes
    ("h200" before "h100", "a100" before "a10", "l40" before "l4").
    Unknown GPUs get specs estimated from memory size and carry
    ``"estimated": True``.

    Returns:
        Dict with GPU name and specs (tflops_fp16, bandwidth_gbps,
        memory_gb, sram_kb) plus detection metadata.
    """
    if not torch.cuda.is_available():
        return {"name": "CPU (No GPU)", "detected": False, **GPU_SPECS[DEFAULT_GPU]}

    gpu_name_raw = torch.cuda.get_device_name(0)
    gpu_name = gpu_name_raw.lower()

    # Drives memory_gb for every GPU that lacks a canonical GPU_SPECS entry.
    mem_gb = _gpu_memory_gb()

    # Match against known GPUs (ordered from newest to oldest).
    if "h200" in gpu_name:
        # H200: H100-class compute paired with HBM3e at 4.8 TB/s.
        return _inline_spec(gpu_name_raw, "NVIDIA H200", 989, 4800, mem_gb, 256)
    elif "h100" in gpu_name:
        # These three keep their fixed GPU_SPECS memory_gb on purpose.
        return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["H100"]}
    elif "a100" in gpu_name:
        return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["A100_80GB"]}
    elif "a10" in gpu_name:
        return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["A10G"]}
    elif "l40" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA L40S", 362, 864, mem_gb, 192)
    elif "l4" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA L4", 121, 300, mem_gb, 96)
    elif "t4" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA T4", 65, 320, mem_gb, 64)
    elif "v100" in gpu_name:
        return _inline_spec(gpu_name_raw, "NVIDIA V100", 125, 900, mem_gb, 128)
    elif "4090" in gpu_name:
        # The previous extra test for "rtx 4090" was redundant: it implies "4090".
        return _inline_spec(gpu_name_raw, "NVIDIA RTX 4090", 330, 1008, mem_gb, 128)
    else:
        # Unknown GPU - estimate specs based on memory size; keep the actual
        # device name (no "using X specs" suffix).
        return _estimate_unknown_gpu(gpu_name_raw, mem_gb)
130
 
131
 
 
590
  fig.add_trace(go.Scatter(
591
  x=[m["arith_intensity"]],
592
  y=[m["achieved_tflops"]],
593
+ mode="markers",
594
+ name=f"Math ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
595
+ marker=dict(size=16, color="#dc2626", symbol="circle",
596
+ line=dict(color="white", width=2)),
 
597
  ))
598
+ # Add label as annotation for better visibility
599
+ fig.add_annotation(
600
+ x=np.log10(m["arith_intensity"]),
601
+ y=m["achieved_tflops"],
602
+ text=f"<b>Math</b><br>{m['time_ms']:.1f}ms",
603
+ showarrow=True,
604
+ arrowhead=2,
605
+ arrowsize=1,
606
+ arrowwidth=1,
607
+ arrowcolor="#dc2626",
608
+ ax=0,
609
+ ay=-40,
610
+ font=dict(size=10, color="#dc2626"),
611
+ bgcolor="rgba(255, 255, 255, 0.95)",
612
+ bordercolor="#dc2626",
613
+ borderwidth=1,
614
+ borderpad=3,
615
+ )
616
 
617
  # Flash backend
618
  if "flash" in benchmark_metrics:
 
620
  fig.add_trace(go.Scatter(
621
  x=[m["arith_intensity"]],
622
  y=[m["achieved_tflops"]],
623
+ mode="markers",
624
+ name=f"Flash ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
625
+ marker=dict(size=16, color="#16a34a", symbol="circle",
626
+ line=dict(color="white", width=2)),
 
627
  ))
628
+ fig.add_annotation(
629
+ x=np.log10(m["arith_intensity"]),
630
+ y=m["achieved_tflops"],
631
+ text=f"<b>Flash</b><br>{m['time_ms']:.1f}ms",
632
+ showarrow=True,
633
+ arrowhead=2,
634
+ arrowsize=1,
635
+ arrowwidth=1,
636
+ arrowcolor="#16a34a",
637
+ ax=0,
638
+ ay=-40,
639
+ font=dict(size=10, color="#16a34a"),
640
+ bgcolor="rgba(255, 255, 255, 0.95)",
641
+ bordercolor="#16a34a",
642
+ borderwidth=1,
643
+ borderpad=3,
644
+ )
645
 
646
  # Memory-efficient backend
647
  if "mem_efficient" in benchmark_metrics:
 
649
  fig.add_trace(go.Scatter(
650
  x=[m["arith_intensity"]],
651
  y=[m["achieved_tflops"]],
652
+ mode="markers",
653
+ name=f"MemEff ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
654
+ marker=dict(size=16, color="#2563eb", symbol="circle",
655
+ line=dict(color="white", width=2)),
 
656
  ))
657
+ fig.add_annotation(
658
+ x=np.log10(m["arith_intensity"]),
659
+ y=m["achieved_tflops"],
660
+ text=f"<b>MemEff</b><br>{m['time_ms']:.1f}ms",
661
+ showarrow=True,
662
+ arrowhead=2,
663
+ arrowsize=1,
664
+ arrowwidth=1,
665
+ arrowcolor="#2563eb",
666
+ ax=30, # Offset to avoid overlap
667
+ ay=-30,
668
+ font=dict(size=10, color="#2563eb"),
669
+ bgcolor="rgba(255, 255, 255, 0.95)",
670
+ bordercolor="#2563eb",
671
+ borderwidth=1,
672
+ borderpad=3,
673
+ )
674
  else:
675
  # Plot THEORETICAL approximations
676
  title_suffix = " (Theoretical)"
 
682
  fig.add_trace(go.Scatter(
683
  x=[std_intensity],
684
  y=[std_achieved],
685
+ mode="markers",
686
  name="Standard (Theoretical)",
687
+ marker=dict(size=15, color="rgba(220, 38, 38, 0.6)", symbol="circle-open",
688
+ line=dict(width=2)),
 
689
  ))
690
+ fig.add_annotation(
691
+ x=np.log10(std_intensity),
692
+ y=std_achieved,
693
+ text="<b>Standard</b><br>(theoretical)",
694
+ showarrow=True,
695
+ arrowhead=2,
696
+ ax=0,
697
+ ay=-35,
698
+ font=dict(size=10, color="#dc2626"),
699
+ bgcolor="rgba(255, 255, 255, 0.9)",
700
+ bordercolor="rgba(220, 38, 38, 0.5)",
701
+ borderwidth=1,
702
+ borderpad=3,
703
+ )
704
 
705
  # FlashAttention - compute bound
706
  flash_intensity = 200
 
709
  fig.add_trace(go.Scatter(
710
  x=[flash_intensity],
711
  y=[flash_achieved],
712
+ mode="markers",
713
  name="Flash (Theoretical)",
714
+ marker=dict(size=15, color="rgba(22, 163, 74, 0.6)", symbol="circle-open",
715
+ line=dict(width=2)),
 
716
  ))
717
+ fig.add_annotation(
718
+ x=np.log10(flash_intensity),
719
+ y=flash_achieved,
720
+ text="<b>FlashAttention</b><br>(theoretical)",
721
+ showarrow=True,
722
+ arrowhead=2,
723
+ ax=0,
724
+ ay=-35,
725
+ font=dict(size=10, color="#16a34a"),
726
+ bgcolor="rgba(255, 255, 255, 0.9)",
727
+ bordercolor="rgba(22, 163, 74, 0.5)",
728
+ borderwidth=1,
729
+ borderpad=3,
730
+ )
731
 
732
  # Add ridge point marker
733
  fig.add_trace(go.Scatter(
 
738
  marker=dict(size=10, color="rgba(0, 0, 0, 0.6)", symbol="diamond"),
739
  ))
740
 
741
+ # Add annotations with better visibility (white background)
742
  fig.add_annotation(
743
+ x=np.log10(5),
744
+ y=peak_tflops * 0.1,
745
+ text="<b>Memory Bound</b><br>(limited by bandwidth)",
746
  showarrow=False,
747
+ font=dict(size=11, color="#dc2626"), # Solid red
748
+ bgcolor="rgba(255, 255, 255, 0.9)",
749
+ bordercolor="#dc2626",
750
+ borderwidth=1,
751
+ borderpad=4,
752
  )
753
 
754
  fig.add_annotation(
755
+ x=np.log10(300),
756
+ y=peak_tflops * 0.65,
757
+ text="<b>Compute Bound</b><br>(limited by TFLOPS)",
758
  showarrow=False,
759
+ font=dict(size=11, color="#16a34a"), # Solid green
760
+ bgcolor="rgba(255, 255, 255, 0.9)",
761
+ bordercolor="#16a34a",
762
+ borderwidth=1,
763
+ borderpad=4,
764
  )
765
 
766
+ # Use detected_name if available, otherwise use name
767
+ display_name = gpu.get("detected_name", gpu.get("name", "GPU"))
768
+
769
+ # Add estimated indicator if specs were estimated
770
+ estimated_note = " (estimated specs)" if gpu.get("estimated") else ""
771
+
772
  fig.update_layout(
773
  title=dict(
774
+ text=f"Roofline Model: {display_name}{title_suffix}{estimated_note}<br>"
775
+ f"<span style='font-size:12px;color:#666'>"
776
+ f"Peak: {peak_tflops} TFLOPS | Bandwidth: {bandwidth_gbps} GB/s</span>",
777
  x=0.5,
778
+ font=dict(size=14),
779
  ),
780
  xaxis=dict(
781
  title="Arithmetic Intensity (FLOPs/byte)",
 
784
  ),
785
  yaxis=dict(
786
  title="Performance (TFLOPS)",
787
+ range=[0, peak_tflops * 1.2], # More headroom for text
788
  ),
789
+ height=420,
790
+ margin=dict(l=60, r=40, t=80, b=80), # More room for title and legend
791
  legend=dict(
792
  orientation="h",
793
  yanchor="bottom",
794
+ y=-0.30,
795
  xanchor="center",
796
+ x=0.5,
797
+ font=dict(size=10),
798
  ),
799
  showlegend=True,
800
  )
src/constants.py CHANGED
@@ -46,7 +46,7 @@ MODEL_CONFIGS = {
46
  # GPU specifications for roofline analysis
47
  GPU_SPECS = {
48
  "A10G": {
49
- "name": "NVIDIA A10G (Zero GPU)",
50
  "tflops_fp16": 125,
51
  "bandwidth_gbps": 600, # GB/s
52
  "memory_gb": 24,
@@ -66,6 +66,27 @@ GPU_SPECS = {
66
  "memory_gb": 80,
67
  "sram_kb": 256,
68
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  }
70
 
71
  # Default GPU for Zero GPU Spaces
 
46
  # GPU specifications for roofline analysis
47
  GPU_SPECS = {
48
  "A10G": {
49
+ "name": "NVIDIA A10G",
50
  "tflops_fp16": 125,
51
  "bandwidth_gbps": 600, # GB/s
52
  "memory_gb": 24,
 
66
  "memory_gb": 80,
67
  "sram_kb": 256,
68
  },
69
+ "H200": {
70
+ "name": "NVIDIA H200 (141GB)",
71
+ "tflops_fp16": 989, # Same compute as H100
72
+ "bandwidth_gbps": 4800, # HBM3e: 4.8 TB/s
73
+ "memory_gb": 141,
74
+ "sram_kb": 256,
75
+ },
76
+ "L40S": {
77
+ "name": "NVIDIA L40S",
78
+ "tflops_fp16": 362,
79
+ "bandwidth_gbps": 864,
80
+ "memory_gb": 48,
81
+ "sram_kb": 192,
82
+ },
83
+ "L4": {
84
+ "name": "NVIDIA L4",
85
+ "tflops_fp16": 121,
86
+ "bandwidth_gbps": 300,
87
+ "memory_gb": 24,
88
+ "sram_kb": 96,
89
+ },
90
  }
91
 
92
  # Default GPU for Zero GPU Spaces