Spaces:

lablab-ai-amd-developer-hackathon
/

gpu-goblin

Sleeping

App Files Files Community

gpu-goblin / runner /profile_parser.py

bharathtelu

Deploy auto-tune UI + scripts (work-from-91d0cf0)

a9aa4ae verified about 2 months ago

Raw

History Blame Contribute Delete

27.1 kB

	"""profile_parser — turn goblin_runner.sh artefacts into RunMetrics.

	Reads three files written by goblin_runner.sh (architecture.md §5):

	<out_dir>/trace.csv rocprofv3 kernel trace
	<out_dir>/torch_profile.json torch.profiler chrome trace
	<out_dir>/amd_smi.csv amd-smi telemetry, ~200 ms cadence

	and produces one `RunMetrics` with a populated `WasteBudget`.

	Each waste-budget bucket is computed with a deliberately simple, documented
	heuristic — these are best-effort signals for the agent, NOT measured
	ground-truth. See the docstring on `_waste_budget()` for the per-bucket
	formula and its known failure modes.

	This module is import-tolerant on machines without the rocprofv3 stack —
	it only reads files. Missing or unparseable files degrade individual
	metrics to zero and append a warning to RunMetrics.warnings rather than
	raising. LiveRunner ultimately decides what to do with parse failures.
	"""

	from __future__ import annotations

	import csv
	import json
	import logging
	import re
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any

	from agent.schemas import KernelEntry, RunMetrics, WasteBudget, WorkloadConfig

	_LOG = logging.getLogger(__name__)


	# MI300X has 192 GB HBM3. We treat sustained <70% utilisation as headroom.
	_HBM_TOTAL_GB = 192.0
	_HBM_HEALTHY_TARGET = 0.70

	# Kernel-name patterns used by the heuristics.
	#
	# We use `(?<![A-Za-z0-9])` / `(?![A-Za-z0-9])` instead of `\b` so that
	# underscores act as token separators — kernel names like
	# `rccl_AllReduce` and `hipBLASLt_generic_gemm` should match.
	_BOUND_L = r"(?<![A-Za-z0-9])"
	_BOUND_R = r"(?![A-Za-z0-9])"
	_RCCL_PATTERN = re.compile(
	_BOUND_L + r"(rccl\|nccl\|all[_-]?reduce\|broadcast\|reduce[_-]?scatter)" + _BOUND_R, re.I
	)
	_GEMM_PATTERN = re.compile(_BOUND_L + r"(gemm\|matmul\|hgemm\|sgemm\|hipblaslt)" + _BOUND_R, re.I)
	_GENERIC_GEMM_PATTERN = re.compile(
	_BOUND_L + r"(generic\|fallback\|naive\|reference)" + _BOUND_R, re.I
	)
	_FP16_PATTERN = re.compile(_BOUND_L + r"(fp16\|half\|f16)" + _BOUND_R, re.I)
	_BF16_PATTERN = re.compile(_BOUND_L + r"(bf16\|bfloat16)" + _BOUND_R, re.I)


	# ---------------------------------------------------------------------------
	# Public entry point
	# ---------------------------------------------------------------------------


	def parse(
	    out_dir: Path,
	    config: WorkloadConfig | None = None,
	    steps: int = 10,
	) -> RunMetrics:
	    """Assemble one `RunMetrics` from the artefacts found in `out_dir`.

	    Reads `trace.csv`, `torch_profile.json` and `amd_smi.csv` from `out_dir`
	    through the three reader helpers, which share a single warnings list
	    that ends up on the returned object.

	    `config` is forwarded to the waste-budget heuristics and may be None.
	    `steps` is copied into the result unchanged.

	    Metrics that could not be read are reported as 0.0 (or "unknown" for
	    the version strings) rather than raising here.
	    """
	    warnings: list[str] = []

	    kernels = _read_kernels(out_dir / "trace.csv", warnings)
	    torch_summary = _read_torch_profile(out_dir / "torch_profile.json", warnings)
	    smi = _read_amd_smi(out_dir / "amd_smi.csv", warnings)

	    # Resolve GPU utilisation once and store it back on `smi`, so that
	    # _waste_budget works from the same figure RunMetrics reports. If
	    # amd-smi gave no usable value the kernel-trace estimate is used;
	    # leaving None in place would be read as 0% by _waste_budget.
	    if smi.gpu_util_pct is None:
	        smi.gpu_util_pct = _gpu_util_from_kernels(kernels, torch_summary)
	    gpu_util_pct = smi.gpu_util_pct

	    budget = _waste_budget(
	        kernels=kernels,
	        torch_summary=torch_summary,
	        smi=smi,
	        config=config,
	    )

	    return RunMetrics(
	        steps=steps,
	        tokens_per_sec=torch_summary.tokens_per_sec or 0.0,
	        mfu_pct=torch_summary.mfu_pct or 0.0,
	        hbm_peak_gb=smi.hbm_peak_gb or 0.0,
	        hbm_avg_gb=smi.hbm_avg_gb or 0.0,
	        gpu_util_pct=gpu_util_pct,
	        top_kernels=_top_kernels(kernels, top_n=5),
	        attention_kernel_loaded=_detect_attention_kernel(kernels),
	        waste_budget=budget,
	        warnings=warnings,
	        rocm_version=smi.rocm_version or "unknown",
	        pytorch_version=torch_summary.pytorch_version or "unknown",
	        runner_kind="live",
	    )


	# ---------------------------------------------------------------------------
	# Internal data carriers
	# ---------------------------------------------------------------------------


	@dataclass
	class _Kernel:
	name: str
	duration_ns: int
	"""Kernel duration in nanoseconds."""

	is_collective: bool = False
	is_gemm: bool = False
	is_generic_gemm: bool = False


	@dataclass
	class _TorchSummary:
	tokens_per_sec: float \| None = None
	mfu_pct: float \| None = None
	pytorch_version: str \| None = None
	step_time_seconds: float \| None = None
	"""Average wall-clock seconds per training step (used by waste budget)."""

	host_busy_fraction: float \| None = None
	"""Fraction of step time the host (CPU) spent doing non-launch work.
	Heuristic: `cpu_self_time / step_time` reported by torch.profiler."""


	@dataclass
	class _SmiSummary:
	hbm_peak_gb: float \| None = None
	hbm_avg_gb: float \| None = None
	gpu_util_pct: float \| None = None
	rocm_version: str \| None = None


	# ---------------------------------------------------------------------------
	# trace.csv (rocprofv3) → list[_Kernel]
	# ---------------------------------------------------------------------------


	def _read_kernels(path: Path, warnings: list[str]) -> list[_Kernel]:
	    """Parse a rocprofv3 kernel-trace CSV into a list of `_Kernel` records.

	    Column names vary slightly between rocprofv3 versions, so columns are
	    located by header through `_pick_column` using the common aliases.

	    Args:
	        path: Location of the kernel-trace CSV.
	        warnings: Shared list; a message is appended for every failure mode.

	    Returns:
	        One `_Kernel` per row that has a non-empty name and a positive
	        duration. An empty list (plus a warning) is returned when the file
	        is missing, has no header, has no recognisable kernel-name column,
	        or cannot be read or decoded. This function does not raise for
	        those cases; the caller decides what to do.
	    """
	    if not path.exists():
	        warnings.append(f"profile_parser: kernel trace not found at {path}")
	        return []

	    kernels: list[_Kernel] = []
	    try:
	        with path.open(newline="") as f:
	            reader = csv.DictReader(f)
	            fieldnames = reader.fieldnames
	            if fieldnames is None:
	                warnings.append(f"profile_parser: empty kernel trace at {path}")
	                return []

	            name_col = _pick_column(
	                fieldnames, ["KernelName", "Kernel_Name", "kernel_name", "Name"]
	            )
	            if name_col is None:
	                # Without a name column every row would be skipped; say so
	                # instead of returning an unexplained empty list.
	                warnings.append(
	                    f"profile_parser: no kernel-name column in kernel trace at {path}"
	                )
	                return []
	            start_col = _pick_column(fieldnames, ["BeginNs", "start_ns", "BeginNS", "Start"])
	            end_col = _pick_column(fieldnames, ["EndNs", "end_ns", "EndNS", "End"])
	            duration_col = _pick_column(fieldnames, ["DurationNs", "duration_ns", "Duration"])

	            for row in reader:
	                name = (row.get(name_col) or "").strip()
	                if not name:
	                    continue
	                duration = _row_duration_ns(row, duration_col, start_col, end_col)
	                if duration <= 0:
	                    continue
	                # Evaluate the GEMM pattern once and reuse it for both flags.
	                is_gemm = bool(_GEMM_PATTERN.search(name))
	                kernels.append(
	                    _Kernel(
	                        name=name,
	                        duration_ns=duration,
	                        is_collective=bool(_RCCL_PATTERN.search(name)),
	                        is_gemm=is_gemm,
	                        is_generic_gemm=is_gemm and bool(_GENERIC_GEMM_PATTERN.search(name)),
	                    )
	                )
	    except (OSError, csv.Error, UnicodeDecodeError) as exc:
	        # Decoding happens lazily while iterating, so undecodable bytes
	        # surface here and must be reported like any other read failure.
	        warnings.append(f"profile_parser: failed to read kernel trace ({exc})")
	        return []
	    return kernels


	def _pick_column(fieldnames: list[str], candidates: list[str]) -> str \| None:
	"""Pick the first matching column name, with three fallback tiers:
	1. Exact match.
	2. Case-insensitive exact match.
	3. Substring match (case-insensitive) — every token of any candidate
	must appear somewhere in the field name. Tolerates the column-name
	drift between rocprofv3 / amd-smi versions (e.g. `VRAM_USED` vs
	`vram_used_mb` vs `VRAM USED MB`).
	"""
	for c in candidates:
	if c in fieldnames:
	return c
	lower = {f.lower(): f for f in fieldnames}
	for c in candidates:
	if c.lower() in lower:
	return lower[c.lower()]
	# Substring tier: split each candidate on _/space, require all tokens
	# appear in the (lowercased) field name. Avoids matching too eagerly
	# by requiring every token of the candidate.
	for c in candidates:
	tokens = [t for t in c.lower().replace("_", " ").split() if t]
	if not tokens:
	continue
	for fname in fieldnames:
	fl = fname.lower()
	if all(t in fl for t in tokens):
	return fname
	return None


	def _row_duration_ns(
	row: dict[str, str], duration_col: str \| None, start_col: str \| None, end_col: str \| None
	) -> int:
	if duration_col and row.get(duration_col):
	try:
	return int(float(row[duration_col]))
	except ValueError:
	return 0
	if start_col and end_col and row.get(start_col) and row.get(end_col):
	try:
	return int(float(row[end_col])) - int(float(row[start_col]))
	except ValueError:
	return 0
	return 0


	def _top_kernels(kernels: list[_Kernel], top_n: int) -> list[KernelEntry]:
	    """Rank kernel names by accumulated duration and return the `top_n` heaviest.

	    Durations are summed per kernel name. Each returned entry carries the
	    name's share of the total duration of all kernels, as a percentage.
	    Names with equal totals keep their first-seen order. An empty input
	    gives an empty list.
	    """
	    if not kernels:
	        return []

	    per_name_ns: dict[str, int] = {}
	    grand_total_ns = 0
	    for kernel in kernels:
	        per_name_ns[kernel.name] = per_name_ns.get(kernel.name, 0) + kernel.duration_ns
	        grand_total_ns += kernel.duration_ns
	    # Guard the division if every duration happens to be zero.
	    denominator = grand_total_ns or 1

	    leaderboard = list(per_name_ns.items())
	    leaderboard.sort(key=lambda item: item[1], reverse=True)
	    return [
	        KernelEntry(name=kernel_name, pct_time=name_ns / denominator * 100.0)
	        for kernel_name, name_ns in leaderboard[:top_n]
	    ]


	def _detect_attention_kernel(kernels: list[_Kernel]) -> str:
	for k in kernels:
	n = k.name.lower()
	if "flash_attn" in n and "rocm" in n:
	return "flash_rocm"
	if "flash" in n and "attn" in n:
	return "flash"
	if "scaled_dot_product_attention" in n:
	return "sdpa"
	if kernels:
	return "eager" # nothing flash-shaped; default conservative label
	return "unknown"


	def _gpu_util_from_kernels(kernels: list[_Kernel], torch_summary: _TorchSummary) -> float:
	"""Fallback GPU util when amd-smi is missing.

	util ≈ sum(kernel duration) / total wall-clock step time.
	"""
	if not kernels or torch_summary.step_time_seconds in (None, 0):
	return 0.0
	total_kernel_ns = sum(k.duration_ns for k in kernels)
	wall_ns = torch_summary.step_time_seconds * 1e9
	if wall_ns <= 0:
	return 0.0
	return min(100.0, total_kernel_ns / wall_ns * 100.0)


	# ---------------------------------------------------------------------------
	# torch_profile.json (torch.profiler chrome trace) → _TorchSummary
	# ---------------------------------------------------------------------------


	def _read_torch_profile(path: Path, warnings: list[str]) -> _TorchSummary:
	    """Pull tokens/sec, MFU, and step timing from a torch.profiler artefact.

	    The file is expected to be a chrome trace shaped like
	    `{"traceEvents": [...], "metadata": {...}}`. The user script is
	    responsible for embedding `tokens_per_sec`, `mfu_pct`,
	    `pytorch_version` (or `torch_version`), `step_time_seconds` and
	    `host_busy_fraction` in `metadata`. Where step time or host-busy
	    fraction are missing there, they are estimated from `traceEvents`.

	    Args:
	        path: Location of the JSON trace.
	        warnings: Shared list; a message is appended if the file is
	            missing or cannot be read, decoded or parsed.

	    Returns:
	        A `_TorchSummary`; fields that could not be determined stay None.
	    """
	    summary = _TorchSummary()
	    if not path.exists():
	        warnings.append(f"profile_parser: torch profile not found at {path}")
	        return summary
	    try:
	        # JSON is UTF-8 by specification; do not depend on the locale
	        # encoding. ValueError covers both json.JSONDecodeError and
	        # UnicodeDecodeError, so undecodable bytes degrade to a warning
	        # instead of escaping to the caller.
	        data = json.loads(path.read_text(encoding="utf-8"))
	    except (OSError, ValueError) as exc:
	        warnings.append(f"profile_parser: failed to read torch profile ({exc})")
	        return summary

	    if not isinstance(data, dict):
	        # Neither "metadata" nor "traceEvents" can be looked up.
	        return summary

	    metadata = data.get("metadata")
	    if isinstance(metadata, dict):
	        summary.tokens_per_sec = _coerce_float(metadata.get("tokens_per_sec"))
	        summary.mfu_pct = _coerce_float(metadata.get("mfu_pct"))
	        version = metadata.get("pytorch_version") or metadata.get("torch_version")
	        # Keep the field a string even if the producer wrote a bare number.
	        summary.pytorch_version = str(version) if version else None
	        summary.step_time_seconds = _coerce_float(metadata.get("step_time_seconds"))
	        summary.host_busy_fraction = _coerce_float(metadata.get("host_busy_fraction"))

	    events = data.get("traceEvents")
	    if isinstance(events, list):
	        if summary.step_time_seconds is None:
	            summary.step_time_seconds = _step_time_from_events(events)
	        if summary.host_busy_fraction is None:
	            summary.host_busy_fraction = _host_busy_from_events(events)

	    return summary


	def _coerce_float(v: Any) -> float \| None:
	if v is None:
	return None
	try:
	return float(v)
	except (TypeError, ValueError):
	return None


	def _step_time_from_events(events: list[dict]) -> float \| None:
	"""Estimate per-step wall-clock seconds from chrome-trace duration events.

	Looks for `name == "ProfilerStep#*"` complete events; falls back to the
	overall trace span if those aren't present.
	"""
	durations: list[float] = []
	overall_start: float \| None = None
	overall_end: float \| None = None
	for ev in events:
	if not isinstance(ev, dict):
	continue
	name = ev.get("name", "")
	ts = ev.get("ts")
	dur = ev.get("dur")
	if isinstance(name, str) and name.startswith("ProfilerStep") and isinstance(dur, (int, float)):
	durations.append(float(dur) / 1e6) # us → seconds
	if isinstance(ts, (int, float)) and isinstance(dur, (int, float)):
	start = float(ts)
	end = start + float(dur)
	overall_start = start if overall_start is None else min(overall_start, start)
	overall_end = end if overall_end is None else max(overall_end, end)
	if durations:
	return sum(durations) / len(durations)
	if overall_start is not None and overall_end is not None and overall_end > overall_start:
	return (overall_end - overall_start) / 1e6
	return None


	def _host_busy_from_events(events: list[dict]) -> float \| None:
	"""Heuristic: cpu_op event time / total event span.

	Used by the data_wait waste bucket to disambiguate "GPU idle because the
	host is busy preparing the next batch" from "GPU idle because nothing
	is running anywhere".
	"""
	cpu_op_us = 0.0
	span_min: float \| None = None
	span_max: float \| None = None
	for ev in events:
	if not isinstance(ev, dict):
	continue
	cat = ev.get("cat", "")
	ts = ev.get("ts")
	dur = ev.get("dur")
	if not (isinstance(ts, (int, float)) and isinstance(dur, (int, float))):
	continue
	ts_f = float(ts)
	dur_f = float(dur)
	end = ts_f + dur_f
	span_min = ts_f if span_min is None else min(span_min, ts_f)
	span_max = end if span_max is None else max(span_max, end)
	if isinstance(cat, str) and "cpu_op" in cat.lower():
	cpu_op_us += dur_f
	if span_min is None or span_max is None or span_max <= span_min:
	return None
	span = span_max - span_min
	if span <= 0:
	return None
	return min(1.0, cpu_op_us / span)


	# ---------------------------------------------------------------------------
	# amd_smi.csv → _SmiSummary
	# ---------------------------------------------------------------------------


	def _read_amd_smi(path: Path, warnings: list[str]) -> _SmiSummary:
	    """Aggregate amd-smi polling output into HBM peak/avg + GPU util.

	    The raw text is first cleaned by `_strip_amd_smi_preamble`, then parsed
	    as CSV. Columns are located by header name through `_pick_column`.

	    Args:
	        path: Location of the amd-smi CSV telemetry.
	        warnings: Shared list; a message is appended when the file is
	            missing, unreadable, undecodable, has no recognisable header,
	            or fails CSV parsing.

	    Returns:
	        A `_SmiSummary`. HBM peak/average are in GB, utilisation is the
	        mean of the per-sample values clamped to 0..100, and the ROCm
	        version is the first non-empty value seen. Fields with no usable
	        samples stay None, which lets the caller fall back to other
	        estimates.
	    """
	    import io as _io
	    import math as _math

	    summary = _SmiSummary()
	    if not path.exists():
	        warnings.append(f"profile_parser: amd-smi telemetry not found at {path}")
	        return summary

	    try:
	        raw = path.read_text()
	    except (OSError, UnicodeDecodeError) as exc:
	        # Undecodable bytes must degrade to a warning like any other read
	        # failure instead of propagating to the caller.
	        warnings.append(f"profile_parser: failed to read amd-smi telemetry ({exc})")
	        return summary

	    csv_text = _strip_amd_smi_preamble(raw)
	    if csv_text is None:
	        warnings.append(f"profile_parser: no parseable header in amd-smi telemetry at {path}")
	        return summary

	    try:
	        reader = csv.DictReader(_io.StringIO(csv_text))
	        if reader.fieldnames is None:
	            warnings.append(f"profile_parser: empty amd-smi telemetry at {path}")
	            return summary
	        hbm_col = _pick_column(
	            reader.fieldnames,
	            [
	                # `amd-smi metric --mem-usage --csv` (ROCm 7.x) emits
	                # "used_vram" in MB. Older `amd-smi monitor --vram-usage`
	                # emits "VRAM_USED" / "vram_used_mb" depending on minor
	                # version.
	                "used_vram",
	                "USED_VRAM",
	                "VRAM_USED_MB",
	                "vram_used_mb",
	                "VRAM_USED",
	                "VRAM_USED_GB",
	                "vram_used",
	                "VRAM Used",
	                # Older rocm 6.x naming
	                "MEM_USED",
	                "mem_used",
	            ],
	        )
	        util_col = _pick_column(
	            reader.fieldnames,
	            [
	                # `amd-smi metric --usage --csv` (ROCm 7.x) emits a single
	                # consolidated "usage" column. Some builds report N/A here
	                # — _coerce_float drops those silently and we fall back to
	                # the kernel-trace gpu_util estimate downstream.
	                "usage",
	                "USAGE",
	                # `amd-smi monitor --gfx` (ROCm 7.x) → "gfx_util"
	                "GFX_UTIL",
	                "gfx_util",
	                "GFX_UTILIZATION",
	                "gfx_utilization",
	                # rocm 6.x and older
	                "GFX_ACTIVITY",
	                "gfx_activity",
	                "GPU_USE",
	                "GFX %",
	                "Util",
	            ],
	        )
	        rocm_col = _pick_column(reader.fieldnames, ["ROCM_VERSION", "rocm_version"])

	        hbm_samples: list[float] = []
	        util_samples: list[float] = []
	        for row in reader:
	            if hbm_col:
	                hbm_gb = _hbm_to_gb(row.get(hbm_col), hbm_col)
	                # A NaN/inf sample would poison max() and the average.
	                if hbm_gb is not None and _math.isfinite(hbm_gb):
	                    hbm_samples.append(hbm_gb)
	            if util_col:
	                util = _coerce_float(row.get(util_col))
	                # Skip non-finite readings: min(100.0, nan) evaluates to
	                # 100.0, which would count a "nan" sample as full load.
	                # Clamp the rest into the valid percentage range.
	                if util is not None and _math.isfinite(util):
	                    util_samples.append(max(0.0, min(100.0, util)))
	            if rocm_col and summary.rocm_version is None:
	                val = (row.get(rocm_col) or "").strip()
	                if val:
	                    summary.rocm_version = val
	        if hbm_samples:
	            summary.hbm_peak_gb = max(hbm_samples)
	            summary.hbm_avg_gb = sum(hbm_samples) / len(hbm_samples)
	        if util_samples:
	            summary.gpu_util_pct = sum(util_samples) / len(util_samples)
	        return summary
	    except csv.Error as exc:
	        warnings.append(f"profile_parser: failed to parse amd-smi telemetry ({exc})")
	        return summary


	def _strip_amd_smi_preamble(raw: str) -> str \| None:
	"""Drop everything before the first real CSV header and dedupe repeated
	header lines — both noise produced by `amd-smi <subcmd> --watch`.

	--watch prints a "'CTRL' + 'C' to stop watching output:" banner once at
	the top, then re-emits the CSV header on every iteration. csv.DictReader
	naively reads the banner as fieldnames and treats every subsequent
	header as a misshapen data row. Pre-strip both before handing it off.

	Returns a CSV string ready for DictReader, or None if no header line
	is recognisable.
	"""
	lines = raw.splitlines()
	header_idx: int \| None = None
	for i, line in enumerate(lines):
	if "," not in line:
	continue
	lower = line.lower()
	# Recognised tokens come from the columns amd-smi metric / monitor
	# actually emit. Match conservatively — banners or other noise can
	# contain commas too.
	if any(tok in lower for tok in ("vram", "gfx_", "timestamp,", "gpu_use", "gpu,")):
	header_idx = i
	break
	if header_idx is None:
	return None

	header = lines[header_idx]
	data = [
	line
	for line in lines[header_idx + 1 :]
	if line.strip() and line != header
	]
	return header + "\n" + "\n".join(data) + ("\n" if data else "")


	def _hbm_to_gb(raw: str \| None, column_name: str \| None = None) -> float \| None:
	"""amd-smi sometimes reports VRAM in MB, sometimes in GB.

	First check the column name — `vram` / `*_mb` columns are MB-typed
	in every amd-smi build we've seen; `*_gb` is GB. Without a column-name
	hint, fall back to a value heuristic. The old "v > 1024 ⇒ MB" heuristic
	misclassified small idle samples (e.g. 285 MB at GPU idle) as GB and
	inflated the peak across the run, which then forced memory_headroom to
	zero downstream.
	"""
	if not raw:
	return None
	try:
	v = float(str(raw).strip().replace("MB", "").replace("GB", "").replace(",", ""))
	except ValueError:
	return None
	if column_name:
	lower = column_name.lower()
	if "_mb" in lower or lower.endswith("mb"):
	return v / 1024.0
	if "_gb" in lower or lower.endswith("gb"):
	return v
	if "vram" in lower or "mem" in lower:
	# amd-smi metric / monitor default to MB for the bare
	# `used_vram` / `mem_used` columns on every ROCm 6.x+ build.
	return v / 1024.0
	# No column hint — fall back to value heuristic.
	if v > 1024.0:
	return v / 1024.0
	return v


	# ---------------------------------------------------------------------------
	# Waste-budget heuristics
	# ---------------------------------------------------------------------------


	def _waste_budget(
	    *,
	    kernels: list[_Kernel],
	    torch_summary: _TorchSummary,
	    smi: _SmiSummary,
	    config: WorkloadConfig | None,
	) -> WasteBudget:
	    """Decompose step time into the seven WasteBudget buckets (architecture.md §3).

	    These are HEURISTICS, not measurements. Each bucket is in seconds-per-step
	    so they can be summed against `step_time_seconds`. If we can't observe a
	    bucket we set it to 0 — `evidence_coverage` in `propose_patch` then
	    discounts confidence accordingly.

	    GPU utilisation and the host-busy fraction are clamped to the range
	    0..1 before use, so out-of-range telemetry cannot produce negative
	    bucket values. If the step time is missing or not positive, a default
	    `WasteBudget()` is returned.

	    Per-bucket logic:

	    data_wait
	        Fraction of step time where GPU util was below 30% AND the host was
	        busy (host_busy_fraction > 0.5). Maps to dataloader / H2D copy stalls.

	    precision_path
	        If the user is already on bf16/fp8 we skip this bucket (recovery is 0).
	        Otherwise we estimate from kernel names: time spent in fp16-tagged
	        kernels is the recoverable surface; bf16-tagged kernels are not.

	    kernel_shape
	        Fraction of total GEMM kernel time spent on kernels whose names
	        match `generic|fallback|naive|reference` — i.e. hipBLASLt/MIOpen
	        couldn't pick a tuned tile size and fell back to a slow path.

	    host_gap
	        Time the GPU was idle while the host was NOT busy either — pure
	        launch latency / eager-mode kernel gaps. We approximate as
	        `(1 - gpu_util) * (1 - host_busy)` × step_time.

	    comm_excess
	        Sum of all collective-kernel duration (anything matching the rccl
	        pattern). Treated as 100% recoverable elsewhere — the rule decides
	        recovery_fraction.

	    memory_headroom
	        Relative shortfall of peak HBM use below the healthy 70% target,
	        times a small constant (0.05 of step time per unit of slack).

	    useful_gpu
	        Whatever step time is left after the other buckets, floored at 0.
	    """
	    step_t = torch_summary.step_time_seconds or 0.0
	    if step_t <= 0:
	        return WasteBudget()

	    # Clamp both fractions into [0, 1]: host_busy_fraction can come straight
	    # from user-supplied metadata and the utilisation from raw telemetry, and
	    # a value outside the range would turn (1 - x) negative below.
	    gpu_util = min(1.0, max(0.0, (smi.gpu_util_pct or 0.0) / 100.0))
	    host_busy = min(1.0, max(0.0, torch_summary.host_busy_fraction or 0.0))
	    gpu_idle = 1.0 - gpu_util

	    # data_wait: GPU under-utilised AND host busy → dataloader bottleneck.
	    if gpu_util < 0.30 and host_busy > 0.5:
	        data_wait = step_t * gpu_idle * host_busy
	    else:
	        data_wait = 0.0

	    # host_gap: GPU idle while host idle too → launch latency / kernel gaps.
	    host_gap = step_t * gpu_idle * (1.0 - host_busy)

	    # comm_excess: total time in collective kernels (in seconds).
	    comm_excess_ns = sum(k.duration_ns for k in kernels if k.is_collective)
	    comm_excess = comm_excess_ns / 1e9

	    # kernel_shape: fraction of GEMM time spent on un-tuned/generic kernels,
	    # scaled by gpu_util so it is capped at "real" GPU time.
	    gemm_total_ns = sum(k.duration_ns for k in kernels if k.is_gemm) or 1
	    generic_gemm_ns = sum(k.duration_ns for k in kernels if k.is_generic_gemm)
	    kernel_shape = (generic_gemm_ns / gemm_total_ns) * step_t * gpu_util

	    # precision_path: only meaningful if config is fp16/fp32. Estimate from
	    # kernel name tags. If config is bf16+ we leave as 0 — already optimal.
	    precision_path = 0.0
	    if config is None or config.precision in {"fp16", "fp32"}:
	        fp16_ns = sum(k.duration_ns for k in kernels if _FP16_PATTERN.search(k.name))
	        bf16_ns = sum(k.duration_ns for k in kernels if _BF16_PATTERN.search(k.name))
	        denom = fp16_ns + bf16_ns
	        if denom > 0:
	            # Fraction of compute still on fp16. On MI300X bf16 is faster +
	            # more numerically stable, so this fraction is the recoverable
	            # precision_path surface.
	            precision_path = (fp16_ns / denom) * step_t * gpu_util * 0.10

	    # memory_headroom: headroom past the 70% healthy target × small constant.
	    memory_headroom = 0.0
	    hbm_peak = smi.hbm_peak_gb
	    if hbm_peak is not None and hbm_peak > 0:
	        utilisation = hbm_peak / _HBM_TOTAL_GB
	        if utilisation < _HBM_HEALTHY_TARGET:
	            slack = (_HBM_HEALTHY_TARGET - utilisation) / _HBM_HEALTHY_TARGET
	            # Small constant: HBM slack is real but only enables a fraction of
	            # potential gain (you still need a larger batch to use it). 0.05
	            # of step time per "unit of slack" is a conservative anchor.
	            memory_headroom = slack * step_t * 0.05

	    # useful_gpu: everything else. Clamp to >= 0.
	    spent = data_wait + host_gap + comm_excess + kernel_shape + precision_path + memory_headroom
	    useful_gpu = max(0.0, step_t - spent)

	    return WasteBudget(
	        useful_gpu=useful_gpu,
	        data_wait=data_wait,
	        host_gap=host_gap,
	        comm_excess=comm_excess,
	        memory_headroom=memory_headroom,
	        precision_path=precision_path,
	        kernel_shape=kernel_shape,
	    )