Spaces:
Sleeping
Sleeping
a0y0346 committed on
Commit ·
e2de6cd
1
Parent(s): d5ef20e
Add H200 GPU support and improve roofline chart visibility
Browse files
- Add H200, L40S, L4, RTX 4090 to GPU detection
- Use dynamic memory detection for unknown GPUs
- Add visible labels with backgrounds on roofline chart
- Show GPU specs in chart title
- Use annotations with arrows instead of inline text
- src/benchmark.py +214 -60
- src/constants.py +22 -1
src/benchmark.py
CHANGED
|
@@ -15,7 +15,6 @@ from .constants import GPU_SPECS, ATTENTION_BACKENDS, MODEL_CONFIGS, DEFAULT_GPU
|
|
| 15 |
def detect_gpu() -> dict:
|
| 16 |
"""
|
| 17 |
Detect the actual GPU and return its specs.
|
| 18 |
-
Falls back to A10G specs if GPU not recognized.
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
Dict with GPU name and specs
|
|
@@ -23,48 +22,110 @@ def detect_gpu() -> dict:
|
|
| 23 |
if not torch.cuda.is_available():
|
| 24 |
return {"name": "CPU (No GPU)", "detected": False, **GPU_SPECS[DEFAULT_GPU]}
|
| 25 |
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
elif "h100" in gpu_name:
|
| 36 |
-
return {"detected": True, "detected_name":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
elif "t4" in gpu_name:
|
| 38 |
-
# T4 specs (common on free tier)
|
| 39 |
return {
|
| 40 |
"detected": True,
|
| 41 |
-
"detected_name":
|
| 42 |
"name": "NVIDIA T4",
|
| 43 |
"tflops_fp16": 65,
|
| 44 |
"bandwidth_gbps": 320,
|
| 45 |
-
"memory_gb":
|
| 46 |
"sram_kb": 64,
|
| 47 |
}
|
| 48 |
elif "v100" in gpu_name:
|
| 49 |
return {
|
| 50 |
"detected": True,
|
| 51 |
-
"detected_name":
|
| 52 |
"name": "NVIDIA V100",
|
| 53 |
"tflops_fp16": 125,
|
| 54 |
"bandwidth_gbps": 900,
|
| 55 |
-
"memory_gb":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
"sram_kb": 128,
|
| 57 |
}
|
| 58 |
else:
|
| 59 |
-
# Unknown GPU -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
return {
|
| 61 |
"detected": True,
|
| 62 |
-
"detected_name":
|
| 63 |
-
"name":
|
| 64 |
-
"tflops_fp16":
|
| 65 |
-
"bandwidth_gbps":
|
| 66 |
-
"memory_gb":
|
| 67 |
-
"sram_kb":
|
|
|
|
| 68 |
}
|
| 69 |
|
| 70 |
|
|
@@ -529,12 +590,29 @@ def create_roofline_chart(
|
|
| 529 |
fig.add_trace(go.Scatter(
|
| 530 |
x=[m["arith_intensity"]],
|
| 531 |
y=[m["achieved_tflops"]],
|
| 532 |
-
mode="markers
|
| 533 |
-
name=f"Math (
|
| 534 |
-
marker=dict(size=
|
| 535 |
-
|
| 536 |
-
textposition="top center",
|
| 537 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
|
| 539 |
# Flash backend
|
| 540 |
if "flash" in benchmark_metrics:
|
|
@@ -542,12 +620,28 @@ def create_roofline_chart(
|
|
| 542 |
fig.add_trace(go.Scatter(
|
| 543 |
x=[m["arith_intensity"]],
|
| 544 |
y=[m["achieved_tflops"]],
|
| 545 |
-
mode="markers
|
| 546 |
-
name=f"Flash (
|
| 547 |
-
marker=dict(size=
|
| 548 |
-
|
| 549 |
-
textposition="top center",
|
| 550 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
# Memory-efficient backend
|
| 553 |
if "mem_efficient" in benchmark_metrics:
|
|
@@ -555,12 +649,28 @@ def create_roofline_chart(
|
|
| 555 |
fig.add_trace(go.Scatter(
|
| 556 |
x=[m["arith_intensity"]],
|
| 557 |
y=[m["achieved_tflops"]],
|
| 558 |
-
mode="markers
|
| 559 |
-
name=f"
|
| 560 |
-
marker=dict(size=
|
| 561 |
-
|
| 562 |
-
textposition="top center",
|
| 563 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
else:
|
| 565 |
# Plot THEORETICAL approximations
|
| 566 |
title_suffix = " (Theoretical)"
|
|
@@ -572,12 +682,25 @@ def create_roofline_chart(
|
|
| 572 |
fig.add_trace(go.Scatter(
|
| 573 |
x=[std_intensity],
|
| 574 |
y=[std_achieved],
|
| 575 |
-
mode="markers
|
| 576 |
name="Standard (Theoretical)",
|
| 577 |
-
marker=dict(size=15, color="rgba(
|
| 578 |
-
|
| 579 |
-
textposition="top center",
|
| 580 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
|
| 582 |
# FlashAttention - compute bound
|
| 583 |
flash_intensity = 200
|
|
@@ -586,12 +709,25 @@ def create_roofline_chart(
|
|
| 586 |
fig.add_trace(go.Scatter(
|
| 587 |
x=[flash_intensity],
|
| 588 |
y=[flash_achieved],
|
| 589 |
-
mode="markers
|
| 590 |
name="Flash (Theoretical)",
|
| 591 |
-
marker=dict(size=15, color="rgba(
|
| 592 |
-
|
| 593 |
-
textposition="top center",
|
| 594 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
|
| 596 |
# Add ridge point marker
|
| 597 |
fig.add_trace(go.Scatter(
|
|
@@ -602,27 +738,44 @@ def create_roofline_chart(
|
|
| 602 |
marker=dict(size=10, color="rgba(0, 0, 0, 0.6)", symbol="diamond"),
|
| 603 |
))
|
| 604 |
|
| 605 |
-
# Add annotations
|
| 606 |
fig.add_annotation(
|
| 607 |
-
x=np.log10(
|
| 608 |
-
y=peak_tflops * 0.
|
| 609 |
-
text="Memory Bound<br>(limited by bandwidth)",
|
| 610 |
showarrow=False,
|
| 611 |
-
font=dict(size=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
)
|
| 613 |
|
| 614 |
fig.add_annotation(
|
| 615 |
-
x=np.log10(
|
| 616 |
-
y=peak_tflops * 0.
|
| 617 |
-
text="Compute Bound<br>(limited by TFLOPS)",
|
| 618 |
showarrow=False,
|
| 619 |
-
font=dict(size=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
)
|
| 621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
fig.update_layout(
|
| 623 |
title=dict(
|
| 624 |
-
text=f"Roofline Model: {
|
|
|
|
|
|
|
| 625 |
x=0.5,
|
|
|
|
| 626 |
),
|
| 627 |
xaxis=dict(
|
| 628 |
title="Arithmetic Intensity (FLOPs/byte)",
|
|
@@ -631,16 +784,17 @@ def create_roofline_chart(
|
|
| 631 |
),
|
| 632 |
yaxis=dict(
|
| 633 |
title="Performance (TFLOPS)",
|
| 634 |
-
range=[0, peak_tflops * 1.
|
| 635 |
),
|
| 636 |
-
height=
|
| 637 |
-
margin=dict(l=60, r=40, t=
|
| 638 |
legend=dict(
|
| 639 |
orientation="h",
|
| 640 |
yanchor="bottom",
|
| 641 |
-
y=-0.
|
| 642 |
xanchor="center",
|
| 643 |
-
x=0.5
|
|
|
|
| 644 |
),
|
| 645 |
showlegend=True,
|
| 646 |
)
|
|
|
|
| 15 |
def detect_gpu() -> dict:
|
| 16 |
"""
|
| 17 |
Detect the actual GPU and return its specs.
|
|
|
|
| 18 |
|
| 19 |
Returns:
|
| 20 |
Dict with GPU name and specs
|
|
|
|
| 22 |
if not torch.cuda.is_available():
|
| 23 |
return {"name": "CPU (No GPU)", "detected": False, **GPU_SPECS[DEFAULT_GPU]}
|
| 24 |
|
| 25 |
+
gpu_name_raw = torch.cuda.get_device_name(0)
|
| 26 |
+
gpu_name = gpu_name_raw.lower()
|
| 27 |
|
| 28 |
+
# Get memory in GB for dynamic spec estimation
|
| 29 |
+
try:
|
| 30 |
+
mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
|
| 31 |
+
except Exception:
|
| 32 |
+
mem_gb = 24 # fallback
|
| 33 |
+
|
| 34 |
+
# Match against known GPUs (ordered from newest to oldest)
|
| 35 |
+
if "h200" in gpu_name:
|
| 36 |
+
# H200 specs - HBM3e memory, very high bandwidth
|
| 37 |
+
return {
|
| 38 |
+
"detected": True,
|
| 39 |
+
"detected_name": gpu_name_raw,
|
| 40 |
+
"name": "NVIDIA H200",
|
| 41 |
+
"tflops_fp16": 989, # Same compute as H100
|
| 42 |
+
"bandwidth_gbps": 4800, # HBM3e: 4.8 TB/s
|
| 43 |
+
"memory_gb": round(mem_gb),
|
| 44 |
+
"sram_kb": 256,
|
| 45 |
+
}
|
| 46 |
elif "h100" in gpu_name:
|
| 47 |
+
return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["H100"]}
|
| 48 |
+
elif "a100" in gpu_name:
|
| 49 |
+
return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["A100_80GB"]}
|
| 50 |
+
elif "a10" in gpu_name:
|
| 51 |
+
return {"detected": True, "detected_name": gpu_name_raw, **GPU_SPECS["A10G"]}
|
| 52 |
+
elif "l40" in gpu_name:
|
| 53 |
+
# L40S specs
|
| 54 |
+
return {
|
| 55 |
+
"detected": True,
|
| 56 |
+
"detected_name": gpu_name_raw,
|
| 57 |
+
"name": "NVIDIA L40S",
|
| 58 |
+
"tflops_fp16": 362,
|
| 59 |
+
"bandwidth_gbps": 864,
|
| 60 |
+
"memory_gb": round(mem_gb),
|
| 61 |
+
"sram_kb": 192,
|
| 62 |
+
}
|
| 63 |
+
elif "l4" in gpu_name:
|
| 64 |
+
# L4 specs
|
| 65 |
+
return {
|
| 66 |
+
"detected": True,
|
| 67 |
+
"detected_name": gpu_name_raw,
|
| 68 |
+
"name": "NVIDIA L4",
|
| 69 |
+
"tflops_fp16": 121,
|
| 70 |
+
"bandwidth_gbps": 300,
|
| 71 |
+
"memory_gb": round(mem_gb),
|
| 72 |
+
"sram_kb": 96,
|
| 73 |
+
}
|
| 74 |
elif "t4" in gpu_name:
|
|
|
|
| 75 |
return {
|
| 76 |
"detected": True,
|
| 77 |
+
"detected_name": gpu_name_raw,
|
| 78 |
"name": "NVIDIA T4",
|
| 79 |
"tflops_fp16": 65,
|
| 80 |
"bandwidth_gbps": 320,
|
| 81 |
+
"memory_gb": round(mem_gb),
|
| 82 |
"sram_kb": 64,
|
| 83 |
}
|
| 84 |
elif "v100" in gpu_name:
|
| 85 |
return {
|
| 86 |
"detected": True,
|
| 87 |
+
"detected_name": gpu_name_raw,
|
| 88 |
"name": "NVIDIA V100",
|
| 89 |
"tflops_fp16": 125,
|
| 90 |
"bandwidth_gbps": 900,
|
| 91 |
+
"memory_gb": round(mem_gb),
|
| 92 |
+
"sram_kb": 128,
|
| 93 |
+
}
|
| 94 |
+
elif "rtx 4090" in gpu_name or "4090" in gpu_name:
|
| 95 |
+
return {
|
| 96 |
+
"detected": True,
|
| 97 |
+
"detected_name": gpu_name_raw,
|
| 98 |
+
"name": "NVIDIA RTX 4090",
|
| 99 |
+
"tflops_fp16": 330,
|
| 100 |
+
"bandwidth_gbps": 1008,
|
| 101 |
+
"memory_gb": round(mem_gb),
|
| 102 |
"sram_kb": 128,
|
| 103 |
}
|
| 104 |
else:
|
| 105 |
+
# Unknown GPU - estimate specs based on memory size
|
| 106 |
+
# Higher memory usually means newer/faster GPU
|
| 107 |
+
if mem_gb >= 70:
|
| 108 |
+
est_tflops = 500
|
| 109 |
+
est_bw = 2000
|
| 110 |
+
elif mem_gb >= 40:
|
| 111 |
+
est_tflops = 300
|
| 112 |
+
est_bw = 1500
|
| 113 |
+
elif mem_gb >= 20:
|
| 114 |
+
est_tflops = 125
|
| 115 |
+
est_bw = 600
|
| 116 |
+
else:
|
| 117 |
+
est_tflops = 65
|
| 118 |
+
est_bw = 300
|
| 119 |
+
|
| 120 |
return {
|
| 121 |
"detected": True,
|
| 122 |
+
"detected_name": gpu_name_raw,
|
| 123 |
+
"name": gpu_name_raw, # Use actual name, no "using X specs" suffix
|
| 124 |
+
"tflops_fp16": est_tflops,
|
| 125 |
+
"bandwidth_gbps": est_bw,
|
| 126 |
+
"memory_gb": round(mem_gb),
|
| 127 |
+
"sram_kb": 128,
|
| 128 |
+
"estimated": True, # Flag that these are estimated specs
|
| 129 |
}
|
| 130 |
|
| 131 |
|
|
|
|
| 590 |
fig.add_trace(go.Scatter(
|
| 591 |
x=[m["arith_intensity"]],
|
| 592 |
y=[m["achieved_tflops"]],
|
| 593 |
+
mode="markers",
|
| 594 |
+
name=f"Math ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
|
| 595 |
+
marker=dict(size=16, color="#dc2626", symbol="circle",
|
| 596 |
+
line=dict(color="white", width=2)),
|
|
|
|
| 597 |
))
|
| 598 |
+
# Add label as annotation for better visibility
|
| 599 |
+
fig.add_annotation(
|
| 600 |
+
x=np.log10(m["arith_intensity"]),
|
| 601 |
+
y=m["achieved_tflops"],
|
| 602 |
+
text=f"<b>Math</b><br>{m['time_ms']:.1f}ms",
|
| 603 |
+
showarrow=True,
|
| 604 |
+
arrowhead=2,
|
| 605 |
+
arrowsize=1,
|
| 606 |
+
arrowwidth=1,
|
| 607 |
+
arrowcolor="#dc2626",
|
| 608 |
+
ax=0,
|
| 609 |
+
ay=-40,
|
| 610 |
+
font=dict(size=10, color="#dc2626"),
|
| 611 |
+
bgcolor="rgba(255, 255, 255, 0.95)",
|
| 612 |
+
bordercolor="#dc2626",
|
| 613 |
+
borderwidth=1,
|
| 614 |
+
borderpad=3,
|
| 615 |
+
)
|
| 616 |
|
| 617 |
# Flash backend
|
| 618 |
if "flash" in benchmark_metrics:
|
|
|
|
| 620 |
fig.add_trace(go.Scatter(
|
| 621 |
x=[m["arith_intensity"]],
|
| 622 |
y=[m["achieved_tflops"]],
|
| 623 |
+
mode="markers",
|
| 624 |
+
name=f"Flash ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
|
| 625 |
+
marker=dict(size=16, color="#16a34a", symbol="circle",
|
| 626 |
+
line=dict(color="white", width=2)),
|
|
|
|
| 627 |
))
|
| 628 |
+
fig.add_annotation(
|
| 629 |
+
x=np.log10(m["arith_intensity"]),
|
| 630 |
+
y=m["achieved_tflops"],
|
| 631 |
+
text=f"<b>Flash</b><br>{m['time_ms']:.1f}ms",
|
| 632 |
+
showarrow=True,
|
| 633 |
+
arrowhead=2,
|
| 634 |
+
arrowsize=1,
|
| 635 |
+
arrowwidth=1,
|
| 636 |
+
arrowcolor="#16a34a",
|
| 637 |
+
ax=0,
|
| 638 |
+
ay=-40,
|
| 639 |
+
font=dict(size=10, color="#16a34a"),
|
| 640 |
+
bgcolor="rgba(255, 255, 255, 0.95)",
|
| 641 |
+
bordercolor="#16a34a",
|
| 642 |
+
borderwidth=1,
|
| 643 |
+
borderpad=3,
|
| 644 |
+
)
|
| 645 |
|
| 646 |
# Memory-efficient backend
|
| 647 |
if "mem_efficient" in benchmark_metrics:
|
|
|
|
| 649 |
fig.add_trace(go.Scatter(
|
| 650 |
x=[m["arith_intensity"]],
|
| 651 |
y=[m["achieved_tflops"]],
|
| 652 |
+
mode="markers",
|
| 653 |
+
name=f"MemEff ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
|
| 654 |
+
marker=dict(size=16, color="#2563eb", symbol="circle",
|
| 655 |
+
line=dict(color="white", width=2)),
|
|
|
|
| 656 |
))
|
| 657 |
+
fig.add_annotation(
|
| 658 |
+
x=np.log10(m["arith_intensity"]),
|
| 659 |
+
y=m["achieved_tflops"],
|
| 660 |
+
text=f"<b>MemEff</b><br>{m['time_ms']:.1f}ms",
|
| 661 |
+
showarrow=True,
|
| 662 |
+
arrowhead=2,
|
| 663 |
+
arrowsize=1,
|
| 664 |
+
arrowwidth=1,
|
| 665 |
+
arrowcolor="#2563eb",
|
| 666 |
+
ax=30, # Offset to avoid overlap
|
| 667 |
+
ay=-30,
|
| 668 |
+
font=dict(size=10, color="#2563eb"),
|
| 669 |
+
bgcolor="rgba(255, 255, 255, 0.95)",
|
| 670 |
+
bordercolor="#2563eb",
|
| 671 |
+
borderwidth=1,
|
| 672 |
+
borderpad=3,
|
| 673 |
+
)
|
| 674 |
else:
|
| 675 |
# Plot THEORETICAL approximations
|
| 676 |
title_suffix = " (Theoretical)"
|
|
|
|
| 682 |
fig.add_trace(go.Scatter(
|
| 683 |
x=[std_intensity],
|
| 684 |
y=[std_achieved],
|
| 685 |
+
mode="markers",
|
| 686 |
name="Standard (Theoretical)",
|
| 687 |
+
marker=dict(size=15, color="rgba(220, 38, 38, 0.6)", symbol="circle-open",
|
| 688 |
+
line=dict(width=2)),
|
|
|
|
| 689 |
))
|
| 690 |
+
fig.add_annotation(
|
| 691 |
+
x=np.log10(std_intensity),
|
| 692 |
+
y=std_achieved,
|
| 693 |
+
text="<b>Standard</b><br>(theoretical)",
|
| 694 |
+
showarrow=True,
|
| 695 |
+
arrowhead=2,
|
| 696 |
+
ax=0,
|
| 697 |
+
ay=-35,
|
| 698 |
+
font=dict(size=10, color="#dc2626"),
|
| 699 |
+
bgcolor="rgba(255, 255, 255, 0.9)",
|
| 700 |
+
bordercolor="rgba(220, 38, 38, 0.5)",
|
| 701 |
+
borderwidth=1,
|
| 702 |
+
borderpad=3,
|
| 703 |
+
)
|
| 704 |
|
| 705 |
# FlashAttention - compute bound
|
| 706 |
flash_intensity = 200
|
|
|
|
| 709 |
fig.add_trace(go.Scatter(
|
| 710 |
x=[flash_intensity],
|
| 711 |
y=[flash_achieved],
|
| 712 |
+
mode="markers",
|
| 713 |
name="Flash (Theoretical)",
|
| 714 |
+
marker=dict(size=15, color="rgba(22, 163, 74, 0.6)", symbol="circle-open",
|
| 715 |
+
line=dict(width=2)),
|
|
|
|
| 716 |
))
|
| 717 |
+
fig.add_annotation(
|
| 718 |
+
x=np.log10(flash_intensity),
|
| 719 |
+
y=flash_achieved,
|
| 720 |
+
text="<b>FlashAttention</b><br>(theoretical)",
|
| 721 |
+
showarrow=True,
|
| 722 |
+
arrowhead=2,
|
| 723 |
+
ax=0,
|
| 724 |
+
ay=-35,
|
| 725 |
+
font=dict(size=10, color="#16a34a"),
|
| 726 |
+
bgcolor="rgba(255, 255, 255, 0.9)",
|
| 727 |
+
bordercolor="rgba(22, 163, 74, 0.5)",
|
| 728 |
+
borderwidth=1,
|
| 729 |
+
borderpad=3,
|
| 730 |
+
)
|
| 731 |
|
| 732 |
# Add ridge point marker
|
| 733 |
fig.add_trace(go.Scatter(
|
|
|
|
| 738 |
marker=dict(size=10, color="rgba(0, 0, 0, 0.6)", symbol="diamond"),
|
| 739 |
))
|
| 740 |
|
| 741 |
+
# Add annotations with better visibility (white background)
|
| 742 |
fig.add_annotation(
|
| 743 |
+
x=np.log10(5),
|
| 744 |
+
y=peak_tflops * 0.1,
|
| 745 |
+
text="<b>Memory Bound</b><br>(limited by bandwidth)",
|
| 746 |
showarrow=False,
|
| 747 |
+
font=dict(size=11, color="#dc2626"), # Solid red
|
| 748 |
+
bgcolor="rgba(255, 255, 255, 0.9)",
|
| 749 |
+
bordercolor="#dc2626",
|
| 750 |
+
borderwidth=1,
|
| 751 |
+
borderpad=4,
|
| 752 |
)
|
| 753 |
|
| 754 |
fig.add_annotation(
|
| 755 |
+
x=np.log10(300),
|
| 756 |
+
y=peak_tflops * 0.65,
|
| 757 |
+
text="<b>Compute Bound</b><br>(limited by TFLOPS)",
|
| 758 |
showarrow=False,
|
| 759 |
+
font=dict(size=11, color="#16a34a"), # Solid green
|
| 760 |
+
bgcolor="rgba(255, 255, 255, 0.9)",
|
| 761 |
+
bordercolor="#16a34a",
|
| 762 |
+
borderwidth=1,
|
| 763 |
+
borderpad=4,
|
| 764 |
)
|
| 765 |
|
| 766 |
+
# Use detected_name if available, otherwise use name
|
| 767 |
+
display_name = gpu.get("detected_name", gpu.get("name", "GPU"))
|
| 768 |
+
|
| 769 |
+
# Add estimated indicator if specs were estimated
|
| 770 |
+
estimated_note = " (estimated specs)" if gpu.get("estimated") else ""
|
| 771 |
+
|
| 772 |
fig.update_layout(
|
| 773 |
title=dict(
|
| 774 |
+
text=f"Roofline Model: {display_name}{title_suffix}{estimated_note}<br>"
|
| 775 |
+
f"<span style='font-size:12px;color:#666'>"
|
| 776 |
+
f"Peak: {peak_tflops} TFLOPS | Bandwidth: {bandwidth_gbps} GB/s</span>",
|
| 777 |
x=0.5,
|
| 778 |
+
font=dict(size=14),
|
| 779 |
),
|
| 780 |
xaxis=dict(
|
| 781 |
title="Arithmetic Intensity (FLOPs/byte)",
|
|
|
|
| 784 |
),
|
| 785 |
yaxis=dict(
|
| 786 |
title="Performance (TFLOPS)",
|
| 787 |
+
range=[0, peak_tflops * 1.2], # More headroom for text
|
| 788 |
),
|
| 789 |
+
height=420,
|
| 790 |
+
margin=dict(l=60, r=40, t=80, b=80), # More room for title and legend
|
| 791 |
legend=dict(
|
| 792 |
orientation="h",
|
| 793 |
yanchor="bottom",
|
| 794 |
+
y=-0.30,
|
| 795 |
xanchor="center",
|
| 796 |
+
x=0.5,
|
| 797 |
+
font=dict(size=10),
|
| 798 |
),
|
| 799 |
showlegend=True,
|
| 800 |
)
|
src/constants.py
CHANGED
|
@@ -46,7 +46,7 @@ MODEL_CONFIGS = {
|
|
| 46 |
# GPU specifications for roofline analysis
|
| 47 |
GPU_SPECS = {
|
| 48 |
"A10G": {
|
| 49 |
-
"name": "NVIDIA A10G
|
| 50 |
"tflops_fp16": 125,
|
| 51 |
"bandwidth_gbps": 600, # GB/s
|
| 52 |
"memory_gb": 24,
|
|
@@ -66,6 +66,27 @@ GPU_SPECS = {
|
|
| 66 |
"memory_gb": 80,
|
| 67 |
"sram_kb": 256,
|
| 68 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
}
|
| 70 |
|
| 71 |
# Default GPU for Zero GPU Spaces
|
|
|
|
| 46 |
# GPU specifications for roofline analysis
|
| 47 |
GPU_SPECS = {
|
| 48 |
"A10G": {
|
| 49 |
+
"name": "NVIDIA A10G",
|
| 50 |
"tflops_fp16": 125,
|
| 51 |
"bandwidth_gbps": 600, # GB/s
|
| 52 |
"memory_gb": 24,
|
|
|
|
| 66 |
"memory_gb": 80,
|
| 67 |
"sram_kb": 256,
|
| 68 |
},
|
| 69 |
+
"H200": {
|
| 70 |
+
"name": "NVIDIA H200 (141GB)",
|
| 71 |
+
"tflops_fp16": 989, # Same compute as H100
|
| 72 |
+
"bandwidth_gbps": 4800, # HBM3e: 4.8 TB/s
|
| 73 |
+
"memory_gb": 141,
|
| 74 |
+
"sram_kb": 256,
|
| 75 |
+
},
|
| 76 |
+
"L40S": {
|
| 77 |
+
"name": "NVIDIA L40S",
|
| 78 |
+
"tflops_fp16": 362,
|
| 79 |
+
"bandwidth_gbps": 864,
|
| 80 |
+
"memory_gb": 48,
|
| 81 |
+
"sram_kb": 192,
|
| 82 |
+
},
|
| 83 |
+
"L4": {
|
| 84 |
+
"name": "NVIDIA L4",
|
| 85 |
+
"tflops_fp16": 121,
|
| 86 |
+
"bandwidth_gbps": 300,
|
| 87 |
+
"memory_gb": 24,
|
| 88 |
+
"sram_kb": 96,
|
| 89 |
+
},
|
| 90 |
}
|
| 91 |
|
| 92 |
# Default GPU for Zero GPU Spaces
|