bbkdevops's picture
download
raw
8.84 kB
"""Build/run TinyMind INT6 CUDA kernel and SASS boundary evidence."""
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
import re
import subprocess
import tempfile
# Parent of the directory that contains this file (parents[1] of the resolved
# path). Presumably the repository root -- TODO confirm against project layout.
ROOT = Path(__file__).resolve().parents[1]
# CUDA source of the packed INT6 sparse kernel that gets built and executed.
INT6_KERNEL = ROOT / "kernels" / "int6_sparse_ptx" / "int6_sparse_kernel.cu"
# CUDA source of the INT4 sparse MMA program whose SASS is inspected.
INT4_MMA = ROOT / "kernels" / "int4_sparse_ptx" / "int4_sparse_mma.cu"
def _run(command: list[str], cwd: Path, timeout: float = 120) -> dict:
    """Run *command* in *cwd* and capture the outcome without raising.

    Args:
        command: Program and arguments, handed to ``subprocess.run`` as a
            list (no shell is involved).
        cwd: Working directory for the child process.
        timeout: Seconds to wait before giving up on the child. Defaults to
            the 120 seconds that were previously hard-coded.

    Returns:
        A dict with the keys ``command``, ``exit_code``, ``stdout`` and
        ``stderr``. When the process cannot be started or exceeds *timeout*,
        ``exit_code`` is ``-1``, ``stdout`` is empty and ``stderr`` carries
        the exception text.
    """
    try:
        proc = subprocess.run(
            command,
            cwd=cwd,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # TimeoutExpired derives from SubprocessError, so timeouts end up
        # here together with "executable not found" style OS errors.
        return {"command": command, "exit_code": -1, "stdout": "", "stderr": str(exc)}
    return {
        "command": command,
        "exit_code": proc.returncode,
        "stdout": proc.stdout,
        "stderr": proc.stderr,
    }
def _run_msvc(command: list[str], cwd: Path) -> dict:
    """Run *command* after loading the MSVC x64 build environment, if present.

    When the Visual Studio 2022 Build Tools ``vcvars64.bat`` exists, a
    throw-away batch script is written that calls it and then runs *command*;
    the script is executed through ``cmd.exe``. Otherwise *command* is run
    directly via ``_run``.

    Args:
        command: Program and arguments to execute.
        cwd: Working directory for the child process.

    Returns:
        The same result dict shape as ``_run``. ``command`` is always the
        original *command*, not the ``cmd.exe`` wrapper invocation.
    """
    vcvars = Path("C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Auxiliary/Build/vcvars64.bat")
    if not vcvars.exists():
        return _run(command, cwd)
    # Quote every argument that looks like a path or contains a space.
    quoted = " ".join(f'"{part}"' if (" " in part or "\\" in part or ":" in part) else part for part in command)
    shell_command = f'@echo off\ncall "{vcvars}" >nul\n{quoted}\n'
    script_path = None
    try:
        # A unique file per invocation: a fixed name in the shared temp
        # directory lets concurrent runs overwrite each other's script.
        with tempfile.NamedTemporaryFile(
            "w",
            suffix=".bat",
            prefix="tinymind_cuda_build_env_",
            encoding="utf-8",
            delete=False,
        ) as script:
            script_path = Path(script.name)
            script.write(shell_command)
        # The file is closed here, so cmd.exe can open it on Windows.
        result = _run(["cmd.exe", "/d", "/c", str(script_path)], cwd)
    except OSError as exc:
        # Creating or writing the script failed; report it like a failed run.
        return {"command": command, "exit_code": -1, "stdout": "", "stderr": str(exc)}
    finally:
        if script_path is not None:
            try:
                script_path.unlink()
            except OSError:
                # Best-effort cleanup; a leftover temp file is harmless.
                pass
    result["command"] = command
    return result
def _find_tool(name: str) -> str:
    """Resolve a CUDA toolkit executable to a full path when one is installed.

    The known Windows toolkit ``bin`` directories are probed in order for
    ``<name>.exe``; the first existing file wins. If none exists, *name* is
    returned unchanged so the caller can rely on ``PATH`` lookup.
    """
    install_dirs = (
        Path("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/bin"),
        Path("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6/bin"),
    )
    executable = f"{name}.exe"
    located = next(
        (bin_dir / executable for bin_dir in install_dirs if (bin_dir / executable).exists()),
        None,
    )
    return name if located is None else str(located)
# An MMA opcode that carries a ``.SP`` modifier somewhere in its modifier
# chain, e.g. ``IMMA.SP.<shape>...`` or ``HMMA.SP.<shape>...``.
# NOTE(review): the exact SASS spelling is assumed from NVIDIA's opcode
# conventions -- confirm against a real cuobjdump of the INT4 program.
_SPARSE_MMA_SASS = re.compile(r"\b[A-Z]*MMA(?:\.\w+)*?\.SP\b")


def _skipped(command: list[str], reason: str) -> dict:
    """Return a placeholder result for a step that was not executed."""
    return {"command": command, "exit_code": -1, "stdout": "", "stderr": reason}


def _build_run_dump(nvcc: str, cuobjdump: str, source: Path, exe: Path, label: str) -> tuple[dict, dict, dict]:
    """Compile *source* to *exe*, then run it and dump its SASS if the build succeeded."""
    build = _run_msvc([nvcc, "-arch=sm_86", "-O3", "-std=c++17", str(source), "-o", str(exe)], ROOT)
    run_command = [str(exe)]
    sass_command = [cuobjdump, "--dump-sass", str(exe)]
    if build["exit_code"] != 0:
        reason = f"{label} build failed"
        return build, _skipped(run_command, reason), _skipped(sass_command, reason)
    return build, _run(run_command, ROOT), _run(sass_command, ROOT)


def _parse_max_abs_error(stdout: str) -> float | None:
    """Extract the ``max_abs_error`` value from kernel output, or ``None`` if absent/unparseable."""
    match = re.search(r"max_abs_error:\s*([0-9.eE+-]+)", stdout)
    if match is None:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        # The character class is permissive: a NaN printed as e.g.
        # "-nan(ind)" is captured as "-". Treat that as "no valid error
        # value" (which fails the gate) instead of aborting the report.
        return None


def build_int6_cuda_eval(out_dir: str | Path) -> dict:
    """Build and run the INT6 and INT4 CUDA programs and write an evidence report.

    Both sources are compiled with ``nvcc`` for ``sm_86``; each successfully
    built executable is run and disassembled with ``cuobjdump --dump-sass``.
    The results are combined into a report that is written to
    ``int6_cuda_eval_report.json`` and ``int6_cuda_eval_report.md`` inside
    *out_dir*.

    Args:
        out_dir: Directory for executables and reports; created if missing.

    Returns:
        The report dict, including ``json_path`` and ``markdown_path``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    nvcc = _find_tool("nvcc")
    cuobjdump = _find_tool("cuobjdump")
    int6_exe = out / "int6_sparse_kernel.exe"
    int4_exe = out / "int4_sparse_mma.exe"

    int6_build, int6_run, int6_sass = _build_run_dump(nvcc, cuobjdump, INT6_KERNEL, int6_exe, "int6")
    int4_build, int4_run, int4_sass = _build_run_dump(nvcc, cuobjdump, INT4_MMA, int4_exe, "int4")

    int6_stdout = int6_run.get("stdout", "")
    max_error = _parse_max_abs_error(int6_stdout)
    throughput = _parse_int6_throughput(int6_stdout)
    int4_sass_text = int4_sass.get("stdout", "") + int4_sass.get("stderr", "")
    int6_sass_text = int6_sass.get("stdout", "") + int6_sass.get("stderr", "")

    # Only an MMA instruction with the sparse modifier counts as evidence;
    # a dense MMA or a stray "SP" substring must not open the claim gate.
    sparse_mma_seen = bool(_SPARSE_MMA_SASS.search(int4_sass_text))
    int6_passed = (
        int6_build["exit_code"] == 0
        and int6_run["exit_code"] == 0
        and max_error is not None
        and max_error < 1e-5
    )
    int4_passed = int4_build["exit_code"] == 0 and int4_run["exit_code"] == 0 and sparse_mma_seen

    report = {
        "schema_version": "tinymind-int6-cuda-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "hardware_target": "RTX 3090 sm_86",
        "int6_cuda_kernel": {
            "source": str(INT6_KERNEL),
            "exe": str(int6_exe),
            "build": _trim(int6_build),
            "run": _trim(int6_run),
            "sass_contains_tensor_core_imma": bool(re.search(r"IMMA|HMMA|MMA", int6_sass_text)),
            "max_abs_error": max_error,
            "throughput": throughput,
            "passed": int6_passed,
        },
        "tensor_core_sparse_ptx_boundary": {
            "source": str(INT4_MMA),
            "exe": str(int4_exe),
            "build": _trim(int4_build),
            "run": _trim(int4_run),
            "sass_contains_sparse_mma": sparse_mma_seen,
            "passed": int4_passed,
        },
        "ampere_int6_tensor_core_native": {
            "available": False,
            "reason": "Ampere exposes INT4/INT8 Tensor Core MMA paths, not native signed INT6 Tensor Core MMA. TinyMind INT6 v1 uses packed INT6 sparse CUDA reference math and can later bridge through INT8/CUTLASS kernels.",
        },
        "claim_gate": {
            "real_cuda_kernel_passed": int6_passed,
            "real_sparse_tensor_core_sass_observed": int4_passed,
            "int6_native_tensor_core_claim_allowed": False,
            "world_best_kernel_claim_allowed": False,
        },
    }

    json_path = out / "int6_cuda_eval_report.json"
    md_path = out / "int6_cuda_eval_report.md"
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _trim(result: dict) -> dict:
    """Return a compact copy of a run result for embedding in the report.

    ``command`` and ``exit_code`` are copied as-is (both keys are required);
    ``stdout`` and ``stderr`` are optional and reduced to their last 2000
    characters under the keys ``stdout_tail`` and ``stderr_tail``.
    """
    tail = slice(-2000, None)
    command = result["command"]
    exit_code = result["exit_code"]
    captured_out = result.get("stdout", "")
    captured_err = result.get("stderr", "")
    return {
        "command": command,
        "exit_code": exit_code,
        "stdout_tail": captured_out[tail],
        "stderr_tail": captured_err[tail],
    }
def _parse_int6_throughput(text: str) -> dict:
    """Extract the INT6 benchmark figures from the kernel program's output.

    Each metric is looked up as ``"<label>: <number>"`` in *text*.

    Args:
        text: Captured stdout of the INT6 kernel executable.

    Returns:
        A dict of metric name to ``float``, or ``None`` when the label is
        missing or its value is not a parseable number, plus three fixed
        descriptive strings (``optimized_kernel``, ``reference_kernel``,
        ``meaning``).
    """

    def grab(name: str) -> float | None:
        match = re.search(rf"{re.escape(name)}:\s*([0-9.eE+-]+)", text)
        if match is None:
            return None
        try:
            return float(match.group(1))
        except ValueError:
            # The character class also accepts fragments such as "-" (from
            # "-nan(ind)") or "e"; report those as missing rather than
            # letting one bad metric abort the whole report.
            return None

    return {
        "benchmark_iters": grab("INT6 benchmark iters"),
        "elapsed_ms": grab("INT6 benchmark elapsed_ms"),
        "reference_elapsed_ms": grab("INT6 reference elapsed_ms"),
        "optimized_elapsed_ms": grab("INT6 optimized elapsed_ms"),
        "optimized_speedup_vs_reference": grab("INT6 optimized speedup_vs_reference"),
        "reference_actual_sparse_tops": grab("INT6 reference actual_sparse_tops"),
        "optimized_actual_sparse_tops": grab("INT6 optimized actual_sparse_tops"),
        "reference_dense_equivalent_tops": grab("INT6 reference dense_equivalent_tops"),
        "optimized_dense_equivalent_tops": grab("INT6 optimized dense_equivalent_tops"),
        "actual_sparse_tops": grab("INT6 actual_sparse_tops"),
        "dense_equivalent_tops": grab("INT6 dense_equivalent_tops"),
        "optimized_kernel": "shared_memory_tiled_cuda_core",
        "reference_kernel": "global_memory_cuda_core",
        "meaning": "actual_sparse_tops counts only non-zero 2:4 sparse multiply-add work; dense_equivalent_tops counts equivalent dense MxKxN multiply-add work. Optimized path is shared-memory tiled CUDA-core math, not native INT6 Tensor Core.",
    }
def _markdown(report: dict) -> str:
    """Render the evaluation report as a short Markdown bullet summary.

    Reads the ``int6_cuda_kernel``, ``tensor_core_sparse_ptx_boundary``,
    ``ampere_int6_tensor_core_native`` and ``claim_gate`` sections of
    *report* and returns a heading followed by one bullet per headline
    value, terminated by a trailing newline.
    """
    lines = ["# TinyMind INT6 CUDA Eval", ""]
    kernel = report["int6_cuda_kernel"]
    boundary = report["tensor_core_sparse_ptx_boundary"]
    lines.append(f"- INT6 CUDA kernel passed: {kernel['passed']}")
    lines.append(f"- INT6 max abs error: {kernel['max_abs_error']}")
    perf = kernel["throughput"]
    lines.append(f"- INT6 actual sparse TOPS: {perf['actual_sparse_tops']}")
    lines.append(f"- INT6 dense-equivalent TOPS: {perf['dense_equivalent_tops']}")
    lines.append(f"- INT6 optimized speedup vs reference: {perf['optimized_speedup_vs_reference']}")
    lines.append(f"- Sparse Tensor Core SASS observed: {boundary['passed']}")
    native = report["ampere_int6_tensor_core_native"]
    lines.append(f"- INT6 native Tensor Core available: {native['available']}")
    lines.append(f"- Reason: {native['reason']}")
    lines.append(f"- World-best kernel claim allowed: {report['claim_gate']['world_best_kernel_claim_allowed']}")
    # The final empty entry makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)

Xet Storage Details

Size:
8.84 kB
·
Xet hash:
371e51236fa1a625655ff5fca7c7a4425f5d06cd316f85aa881f22955e325601

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.