Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /int6_cuda_eval.py
| """Build/run TinyMind INT6 CUDA kernel and SASS boundary evidence.""" | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| import re | |
| import subprocess | |
| import tempfile | |
# Bundle root: the second ancestor of this module's resolved path.
ROOT = Path(__file__).resolve().parents[1]

# CUDA sources compiled by the evaluation, addressed relative to the bundle root.
INT6_KERNEL = ROOT.joinpath("kernels", "int6_sparse_ptx", "int6_sparse_kernel.cu")
INT4_MMA = ROOT.joinpath("kernels", "int4_sparse_ptx", "int4_sparse_mma.cu")
def _run(command: list[str], cwd: Path) -> dict:
    """Run ``command`` in ``cwd`` and capture the outcome as a plain dict.

    Args:
        command: Program and arguments, handed to ``subprocess.run`` as a list (no shell).
        cwd: Working directory for the child process.

    Returns:
        A dict with the keys ``command``, ``exit_code``, ``stdout`` and ``stderr``.
        If the process cannot be started, or does not finish within 120 seconds,
        ``exit_code`` is ``-1``, ``stdout`` is empty and ``stderr`` holds the
        exception text. A non-zero exit status is reported, never raised.
    """
    try:
        proc = subprocess.run(
            command,
            cwd=cwd,
            capture_output=True,
            text=True,
            # Tool output is not guaranteed to be valid in the locale encoding.
            # Without this, a single undecodable byte raises UnicodeDecodeError,
            # which is not covered by the except clause below.
            errors="replace",
            timeout=120,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        return {"command": command, "exit_code": -1, "stdout": "", "stderr": str(exc)}
    return {
        "command": command,
        "exit_code": proc.returncode,
        "stdout": proc.stdout,
        "stderr": proc.stderr,
    }
def _run_msvc(command: list[str], cwd: Path) -> dict:
    """Run ``command`` inside the MSVC build environment when it is installed.

    If the Visual Studio 2022 Build Tools ``vcvars64.bat`` is not present, the
    command is run directly through ``_run``. Otherwise a temporary batch file is
    written that first calls ``vcvars64.bat`` and then the command, and that batch
    file is executed with ``cmd.exe``.

    Args:
        command: Program and arguments to run.
        cwd: Working directory for the child process.

    Returns:
        A dict with the keys ``command`` (always the caller's ``command``, not the
        ``cmd.exe`` wrapper), ``exit_code``, ``stdout`` and ``stderr``. ``exit_code``
        is ``-1`` when the batch file cannot be written or the process cannot run.
    """
    vcvars = Path("C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Auxiliary/Build/vcvars64.bat")
    if not vcvars.exists():
        return _run(command, cwd)
    # Quote every argument that contains a space, a backslash or a drive colon.
    quoted = " ".join(f'"{part}"' if (" " in part or "\\" in part or ":" in part) else part for part in command)
    shell_command = f'@echo off\ncall "{vcvars}" >nul\n{quoted}\n'
    script_path = None
    try:
        # A uniquely named script avoids two concurrent evaluations overwriting
        # each other's batch file in the shared temp directory.
        with tempfile.NamedTemporaryFile(
            "w",
            suffix=".bat",
            prefix="tinymind_cuda_build_env_",
            delete=False,
            encoding="utf-8",
        ) as handle:
            script_path = Path(handle.name)
            handle.write(shell_command)
        # The file is closed here, so cmd.exe can open it.
        result = _run(["cmd.exe", "/d", "/c", str(script_path)], cwd)
    except OSError as exc:
        result = {"command": command, "exit_code": -1, "stdout": "", "stderr": str(exc)}
    finally:
        if script_path is not None:
            try:
                script_path.unlink()
            except OSError:
                pass  # Best-effort cleanup; a leftover script is harmless.
    # Report the logical command rather than the cmd.exe wrapper invocation.
    return {**result, "command": command}
def _find_tool(name: str) -> str:
    """Locate a CUDA toolkit executable, falling back to the bare tool name.

    Candidates are probed in this order:

    1. ``<name>.exe`` in the default Windows install directories for CUDA v13.2
       and v12.6.
    2. ``<name>.exe`` and ``<name>`` in ``$CUDA_PATH/bin`` when the ``CUDA_PATH``
       environment variable is set. This covers other toolkit versions and
       non-default install locations.

    Args:
        name: Tool name without extension, for example ``"nvcc"``.

    Returns:
        The full path of the first candidate that is an existing file, or ``name``
        unchanged so the operating system resolves it through ``PATH``.
    """
    import os  # Local import: only this helper needs the environment.

    exe_name = f"{name}.exe"
    candidates = [
        Path("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/bin") / exe_name,
        Path("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6/bin") / exe_name,
    ]
    cuda_root = os.environ.get("CUDA_PATH")
    if cuda_root:
        bin_dir = Path(cuda_root) / "bin"
        candidates.extend([bin_dir / exe_name, bin_dir / name])
    for candidate in candidates:
        # is_file() rather than exists(): a directory must never be returned.
        if candidate.is_file():
            return str(candidate)
    return name
def _build_run_and_dump(nvcc: str, cuobjdump: str, source: Path, exe: Path, label: str) -> tuple[dict, dict, dict]:
    """Compile ``source`` into ``exe`` and, only if that succeeds, run it and dump its SASS.

    Returns ``(build, run, sass)`` result dicts. When the build fails, the run and
    dump steps are skipped and replaced by placeholders with ``exit_code`` ``-1``
    and ``stderr`` set to ``"<label> build failed"``.
    """
    build = _run_msvc([nvcc, "-arch=sm_86", "-O3", "-std=c++17", str(source), "-o", str(exe)], ROOT)
    run_command = [str(exe)]
    sass_command = [cuobjdump, "--dump-sass", str(exe)]
    if build["exit_code"] != 0:
        reason = f"{label} build failed"
        skipped = [
            {"command": command, "exit_code": -1, "stdout": "", "stderr": reason}
            for command in (run_command, sass_command)
        ]
        return build, skipped[0], skipped[1]
    return build, _run(run_command, ROOT), _run(sass_command, ROOT)


def _parse_max_abs_error(stdout: str) -> float | None:
    """Extract the number following ``max_abs_error:`` from kernel output, or ``None``."""
    # The capture only accepts a well-formed decimal literal, so float() cannot
    # raise on text such as "max_abs_error: error" or a trailing sentence period.
    match = re.search(r"max_abs_error:\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", stdout)
    return float(match.group(1)) if match else None


def _sass_has_sparse_mma(sass: dict) -> bool:
    """Return True when a successful cuobjdump dump contains a sparse MMA opcode."""
    # A failed dump carries only an error message, which is not evidence of anything.
    if sass.get("exit_code") != 0:
        return False
    # NOTE(review): assumes sparse tensor-core SASS is spelled as an *MMA opcode
    # carrying an ".SP" modifier (e.g. "IMMA.SP...." / "HMMA.SP....") - confirm
    # against a real dump. A bare "MMA" or a stray "SP" substring is not accepted,
    # because dense MMA would otherwise satisfy the sparse claim gate.
    return re.search(r"\b[A-Z]*MMA(?:\.\w+)*\.SP\b", sass.get("stdout", "")) is not None


def build_int6_cuda_eval(out_dir: str | Path) -> dict:
    """Build and run the INT6 and INT4 CUDA programs and write an evaluation report.

    Both sources are compiled with nvcc for ``sm_86``. Each successfully built
    executable is run and disassembled with ``cuobjdump --dump-sass``. The results
    are written to ``int6_cuda_eval_report.json`` and ``int6_cuda_eval_report.md``
    inside ``out_dir``.

    Args:
        out_dir: Directory for the executables and the report files. It is created
            (including parents) if it does not exist.

    Returns:
        The report dict that was serialized to JSON, including ``json_path`` and
        ``markdown_path``. Tool failures are recorded in the report, not raised.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    nvcc = _find_tool("nvcc")
    cuobjdump = _find_tool("cuobjdump")
    int6_exe = out / "int6_sparse_kernel.exe"
    int4_exe = out / "int4_sparse_mma.exe"

    int6_build, int6_run, int6_sass = _build_run_and_dump(nvcc, cuobjdump, INT6_KERNEL, int6_exe, "int6")
    int4_build, int4_run, int4_sass = _build_run_and_dump(nvcc, cuobjdump, INT4_MMA, int4_exe, "int4")

    int6_stdout = int6_run.get("stdout", "")
    max_error = _parse_max_abs_error(int6_stdout)
    throughput = _parse_int6_throughput(int6_stdout)
    int6_sass_text = int6_sass.get("stdout", "") + int6_sass.get("stderr", "")

    int6_passed = (
        int6_build["exit_code"] == 0
        and int6_run["exit_code"] == 0
        and max_error is not None
        and max_error < 1e-5
    )
    sparse_mma_observed = _sass_has_sparse_mma(int4_sass)
    int4_passed = int4_build["exit_code"] == 0 and int4_run["exit_code"] == 0 and sparse_mma_observed

    report = {
        "schema_version": "tinymind-int6-cuda-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "hardware_target": "RTX 3090 sm_86",
        "int6_cuda_kernel": {
            "source": str(INT6_KERNEL),
            "exe": str(int6_exe),
            "build": _trim(int6_build),
            "run": _trim(int6_run),
            # "IMMA" and "HMMA" both contain "MMA", so one substring test covers them.
            "sass_contains_tensor_core_imma": "MMA" in int6_sass_text,
            "max_abs_error": max_error,
            "throughput": throughput,
            "passed": int6_passed,
        },
        "tensor_core_sparse_ptx_boundary": {
            "source": str(INT4_MMA),
            "exe": str(int4_exe),
            "build": _trim(int4_build),
            "run": _trim(int4_run),
            "sass_contains_sparse_mma": sparse_mma_observed,
            "passed": int4_passed,
        },
        "ampere_int6_tensor_core_native": {
            "available": False,
            "reason": "Ampere exposes INT4/INT8 Tensor Core MMA paths, not native signed INT6 Tensor Core MMA. TinyMind INT6 v1 uses packed INT6 sparse CUDA reference math and can later bridge through INT8/CUTLASS kernels.",
        },
        "claim_gate": {
            # The first two gates mirror the measured results above; the last two
            # are hard-coded to False.
            "real_cuda_kernel_passed": int6_passed,
            "real_sparse_tensor_core_sass_observed": int4_passed,
            "int6_native_tensor_core_claim_allowed": False,
            "world_best_kernel_claim_allowed": False,
        },
    }

    json_path = out / "int6_cuda_eval_report.json"
    md_path = out / "int6_cuda_eval_report.md"
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _trim(result: dict, tail_chars: int = 2000) -> dict:
    """Reduce a command result to its identifying fields plus the tail of each stream.

    Args:
        result: Dict with ``command`` and ``exit_code`` keys and optional ``stdout``
            and ``stderr`` strings. A missing or ``None`` stream is treated as empty.
        tail_chars: Maximum number of trailing characters kept per stream.
            Zero or a negative value keeps nothing.

    Returns:
        A dict with the keys ``command``, ``exit_code``, ``stdout_tail`` and
        ``stderr_tail``.
    """

    def tail(key: str) -> str:
        text = result.get(key) or ""
        # text[-0:] would return the whole string, so guard the non-positive case.
        return text[-tail_chars:] if tail_chars > 0 else ""

    return {
        "command": result["command"],
        "exit_code": result["exit_code"],
        "stdout_tail": tail("stdout"),
        "stderr_tail": tail("stderr"),
    }
def _parse_int6_throughput(text: str) -> dict:
    """Extract the INT6 benchmark figures printed by the kernel executable.

    Each figure is looked up as ``"<label>: <number>"`` in ``text``.

    Args:
        text: Captured standard output of the INT6 kernel run.

    Returns:
        A dict mapping each metric name to a float, or to ``None`` when the label
        is absent or is not followed by a well-formed number. It also carries three
        fixed descriptive strings (``optimized_kernel``, ``reference_kernel``,
        ``meaning``).
    """
    # Only a well-formed decimal literal is captured. A looser character class
    # would also capture fragments such as the "e" of "error" or "2.5." and make
    # float() raise ValueError.
    number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"

    def grab(name: str) -> float | None:
        match = re.search(rf"{re.escape(name)}:\s*({number})", text)
        return float(match.group(1)) if match else None

    return {
        "benchmark_iters": grab("INT6 benchmark iters"),
        "elapsed_ms": grab("INT6 benchmark elapsed_ms"),
        "reference_elapsed_ms": grab("INT6 reference elapsed_ms"),
        "optimized_elapsed_ms": grab("INT6 optimized elapsed_ms"),
        "optimized_speedup_vs_reference": grab("INT6 optimized speedup_vs_reference"),
        "reference_actual_sparse_tops": grab("INT6 reference actual_sparse_tops"),
        "optimized_actual_sparse_tops": grab("INT6 optimized actual_sparse_tops"),
        "reference_dense_equivalent_tops": grab("INT6 reference dense_equivalent_tops"),
        "optimized_dense_equivalent_tops": grab("INT6 optimized dense_equivalent_tops"),
        "actual_sparse_tops": grab("INT6 actual_sparse_tops"),
        "dense_equivalent_tops": grab("INT6 dense_equivalent_tops"),
        "optimized_kernel": "shared_memory_tiled_cuda_core",
        "reference_kernel": "global_memory_cuda_core",
        "meaning": "actual_sparse_tops counts only non-zero 2:4 sparse multiply-add work; dense_equivalent_tops counts equivalent dense MxKxN multiply-add work. Optimized path is shared-memory tiled CUDA-core math, not native INT6 Tensor Core.",
    }
def _markdown(report: dict) -> str:
    """Render the headline figures of an evaluation report as a Markdown bullet list.

    Args:
        report: Report dict containing the ``int6_cuda_kernel``,
            ``tensor_core_sparse_ptx_boundary``, ``ampere_int6_tensor_core_native``
            and ``claim_gate`` sections.

    Returns:
        Markdown text: a title, a blank line, one bullet per figure, and a trailing
        newline.
    """
    kernel = report["int6_cuda_kernel"]
    boundary = report["tensor_core_sparse_ptx_boundary"]
    # (label, value) pairs in the order they appear in the rendered document.
    bullets = [
        ("INT6 CUDA kernel passed", kernel["passed"]),
        ("INT6 max abs error", kernel["max_abs_error"]),
        ("INT6 actual sparse TOPS", kernel["throughput"]["actual_sparse_tops"]),
        ("INT6 dense-equivalent TOPS", kernel["throughput"]["dense_equivalent_tops"]),
        ("INT6 optimized speedup vs reference", kernel["throughput"]["optimized_speedup_vs_reference"]),
        ("Sparse Tensor Core SASS observed", boundary["passed"]),
        ("INT6 native Tensor Core available", report["ampere_int6_tensor_core_native"]["available"]),
        ("Reason", report["ampere_int6_tensor_core_native"]["reason"]),
        ("World-best kernel claim allowed", report["claim_gate"]["world_best_kernel_claim_allowed"]),
    ]
    document = ["# TinyMind INT6 CUDA Eval", ""]
    for label, value in bullets:
        document.append(f"- {label}: {value}")
    # The empty last element makes the joined text end with a newline.
    document.append("")
    return "\n".join(document)
Xet Storage Details
- Size: 8.84 kB
- Xet hash: 371e51236fa1a625655ff5fca7c7a4425f5d06cd316f85aa881f22955e325601
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.