Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket/tinymind-native-colab-handoff/bundle/evaluation/int6_cuda_eval.py

bbkdevops

about 1 month ago

download

raw

8.84 kB

	"""Build/run TinyMind INT6 CUDA kernel and SASS boundary evidence."""

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path
	import re
	import subprocess
	import tempfile


	# Directory one level above the folder that contains this file; the kernel
	# sources below are resolved relative to it.
	ROOT = Path(__file__).resolve().parents[1]
	# CUDA source that is compiled, executed and disassembled as the INT6 kernel.
	INT6_KERNEL = ROOT / "kernels" / "int6_sparse_ptx" / "int6_sparse_kernel.cu"
	# CUDA source used for the tensor-core sparse PTX boundary check.
	INT4_MMA = ROOT / "kernels" / "int4_sparse_ptx" / "int4_sparse_mma.cu"


	def _run(command: list[str], cwd: Path, timeout: float = 120) -> dict:
	    """Run *command* in *cwd* and return its outcome as a plain dict.

	    Launch failures and timeouts are reported through the returned dict
	    instead of being raised, so callers can record them in a report.

	    Args:
	        command: Program and arguments, passed to ``subprocess.run`` as a list
	            (no shell involved).
	        cwd: Working directory for the child process.
	        timeout: Seconds to wait before the child is killed.  Defaults to the
	            120 seconds this helper has always used.

	    Returns:
	        A dict with the keys ``command``, ``exit_code``, ``stdout`` and
	        ``stderr``.  ``exit_code`` is ``-1`` and ``stderr`` holds the exception
	        text when the process could not be started or timed out.
	    """
	    try:
	        proc = subprocess.run(
	            command,
	            cwd=cwd,
	            capture_output=True,
	            text=True,
	            # Tool output is not guaranteed to be valid in the locale encoding.
	            # Replace bad bytes rather than raise UnicodeDecodeError, which is
	            # a ValueError and would slip past the handler below.
	            errors="replace",
	            timeout=timeout,
	            check=False,
	        )
	    except (OSError, subprocess.SubprocessError) as exc:
	        return {"command": command, "exit_code": -1, "stdout": "", "stderr": str(exc)}
	    return {
	        "command": command,
	        "exit_code": proc.returncode,
	        "stdout": proc.stdout,
	        "stderr": proc.stderr,
	    }


	def _run_msvc(command: list[str], cwd: Path) -> dict:
	    """Run *command* after loading the MSVC x64 build environment, if present.

	    When the Visual Studio 2022 Build Tools ``vcvars64.bat`` is not found the
	    command is run directly through ``_run``.  Otherwise a throw-away batch
	    script that calls ``vcvars64.bat`` and then the command is executed with
	    ``cmd.exe``.

	    Args:
	        command: Program and arguments to run.
	        cwd: Working directory for the child process.

	    Returns:
	        The same dict shape as ``_run``; ``command`` always holds the original
	        *command*, not the ``cmd.exe`` wrapper.  ``exit_code`` is ``-1`` when
	        the script could not be written or the process could not be run.
	    """
	    vcvars = Path("C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Auxiliary/Build/vcvars64.bat")
	    if not vcvars.exists():
	        return _run(command, cwd)

	    # NOTE(review): arguments containing '"' or '%' are not escaped for the
	    # batch interpreter; the current callers only pass tool paths and flags.
	    quoted = " ".join(f'"{part}"' if (" " in part or "\\" in part or ":" in part) else part for part in command)
	    shell_command = f'@echo off\ncall "{vcvars}" >nul\n{quoted}\n'

	    script_path = None
	    try:
	        # A unique file per call: a fixed name in the shared temp directory
	        # lets concurrent runs overwrite each other's script.
	        with tempfile.NamedTemporaryFile(
	            "w",
	            prefix="tinymind_cuda_build_env_",
	            suffix=".bat",
	            delete=False,
	            encoding="utf-8",
	        ) as handle:
	            script_path = Path(handle.name)
	            handle.write(shell_command)
	        # The file is closed here so cmd.exe can open it on Windows.
	        result = _run(["cmd.exe", "/d", "/c", str(script_path)], cwd)
	    except OSError as exc:
	        return {"command": command, "exit_code": -1, "stdout": "", "stderr": str(exc)}
	    finally:
	        if script_path is not None:
	            try:
	                script_path.unlink()
	            except OSError:
	                # Best effort: a leftover temp script is harmless.
	                pass
	    # Report the caller's command rather than the cmd.exe wrapper.
	    return {**result, "command": command}


	def _find_tool(name: str) -> str:
	    """Return a path to the CUDA tool *name*, or *name* itself as a fallback.

	    The known Windows toolkit installs are checked first, in the same order as
	    before.  If neither has the tool, the ``bin`` directory under the
	    ``CUDA_PATH`` and ``CUDA_HOME`` environment variables is checked, both with
	    and without the ``.exe`` suffix.

	    Args:
	        name: Tool name without extension, e.g. ``"nvcc"``.

	    Returns:
	        The first existing candidate as a string, otherwise *name* unchanged so
	        that the operating system resolves it through ``PATH``.
	    """
	    import os  # local import keeps this block self-contained

	    candidates = [
	        Path("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/bin") / f"{name}.exe",
	        Path("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6/bin") / f"{name}.exe",
	    ]
	    for variable in ("CUDA_PATH", "CUDA_HOME"):
	        toolkit_root = os.environ.get(variable)
	        if toolkit_root:
	            bin_dir = Path(toolkit_root) / "bin"
	            candidates.append(bin_dir / f"{name}.exe")
	            candidates.append(bin_dir / name)

	    for candidate in candidates:
	        if candidate.is_file():
	            return str(candidate)
	    return name


	def build_int6_cuda_eval(out_dir: str | Path) -> dict:
	    """Build and run both CUDA probes, then write JSON and Markdown reports.

	    The INT6 kernel and the INT4 sparse-MMA probe are each compiled with nvcc
	    for ``sm_86``, executed, and disassembled with ``cuobjdump --dump-sass``.
	    The INT6 run output is parsed for ``max_abs_error`` and throughput figures,
	    and both SASS dumps are searched for tensor-core opcodes.

	    Args:
	        out_dir: Directory for the executables and both report files; it is
	            created (with parents) when missing.

	    Returns:
	        The report dict that was serialised to ``int6_cuda_eval_report.json``,
	        including the ``json_path`` and ``markdown_path`` entries.

	    Raises:
	        OSError: If the output directory or a report file cannot be written.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    nvcc = _find_tool("nvcc")
	    cuobjdump = _find_tool("cuobjdump")
	    int6_exe = out / "int6_sparse_kernel.exe"
	    int4_exe = out / "int4_sparse_mma.exe"

	    int6_build, int6_run, int6_sass = _build_run_dump(nvcc, cuobjdump, INT6_KERNEL, int6_exe, "int6")
	    int4_build, int4_run, int4_sass = _build_run_dump(nvcc, cuobjdump, INT4_MMA, int4_exe, "int4")

	    int6_stdout = int6_run.get("stdout", "")
	    max_error = None
	    match = re.search(r"max_abs_error:\s*([0-9.eE+-]+)", int6_stdout)
	    if match:
	        try:
	            max_error = float(match.group(1))
	        except ValueError:
	            # The character class also accepts tokens such as "." or "1e";
	            # treat those as "no measurement" instead of aborting the report.
	            max_error = None
	    throughput = _parse_int6_throughput(int6_stdout)

	    int4_sass_text = int4_sass.get("stdout", "") + int4_sass.get("stderr", "")
	    int6_sass_text = int6_sass.get("stdout", "") + int6_sass.get("stderr", "")

	    # Alternation must use a bare "|"; an escaped "\|" matches a literal pipe
	    # and would make this check permanently False.
	    any_mma_pattern = r"IMMA|HMMA|MMA"
	    # Require an MMA opcode that carries the sparse ".SP" modifier.  A bare
	    # "MMA" or "SP" substring would also accept dense MMA or unrelated text.
	    # NOTE(review): assumes cuobjdump prints sparse MMA as "<x>MMA.SP..." or
	    # with ".SP" later in the opcode - confirm against a real sm_86 dump.
	    sparse_mma_pattern = r"\b[A-Z]*MMA(?:\.\w+)*\.SP\b"
	    int6_has_mma = bool(re.search(any_mma_pattern, int6_sass_text))
	    int4_has_sparse_mma = bool(re.search(sparse_mma_pattern, int4_sass_text))

	    int6_passed = (
	        int6_build["exit_code"] == 0
	        and int6_run["exit_code"] == 0
	        and max_error is not None
	        and max_error < 1e-5
	    )
	    int4_passed = int4_build["exit_code"] == 0 and int4_run["exit_code"] == 0 and int4_has_sparse_mma

	    report = {
	        "schema_version": "tinymind-int6-cuda-eval-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "hardware_target": "RTX 3090 sm_86",
	        "int6_cuda_kernel": {
	            "source": str(INT6_KERNEL),
	            "exe": str(int6_exe),
	            "build": _trim(int6_build),
	            "run": _trim(int6_run),
	            "sass_contains_tensor_core_imma": int6_has_mma,
	            "max_abs_error": max_error,
	            "throughput": throughput,
	            "passed": int6_passed,
	        },
	        "tensor_core_sparse_ptx_boundary": {
	            "source": str(INT4_MMA),
	            "exe": str(int4_exe),
	            "build": _trim(int4_build),
	            "run": _trim(int4_run),
	            "sass_contains_sparse_mma": int4_has_sparse_mma,
	            "passed": int4_passed,
	        },
	        "ampere_int6_tensor_core_native": {
	            "available": False,
	            "reason": "Ampere exposes INT4/INT8 Tensor Core MMA paths, not native signed INT6 Tensor Core MMA. TinyMind INT6 v1 uses packed INT6 sparse CUDA reference math and can later bridge through INT8/CUTLASS kernels.",
	        },
	        "claim_gate": {
	            # The first two gates mirror the measured results; the last two
	            # are never enabled by this script.
	            "real_cuda_kernel_passed": int6_passed,
	            "real_sparse_tensor_core_sass_observed": int4_passed,
	            "int6_native_tensor_core_claim_allowed": False,
	            "world_best_kernel_claim_allowed": False,
	        },
	    }

	    json_path = out / "int6_cuda_eval_report.json"
	    md_path = out / "int6_cuda_eval_report.md"
	    report["json_path"] = str(json_path)
	    report["markdown_path"] = str(md_path)
	    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md_path.write_text(_markdown(report), encoding="utf-8")
	    return report


	def _build_run_dump(nvcc: str, cuobjdump: str, source: Path, exe: Path, label: str) -> tuple[dict, dict, dict]:
	    """Compile *source* to *exe*, then run it and dump its SASS.

	    Returns the ``(build, run, sass)`` result dicts.  When the build fails the
	    run and dump steps are skipped and reported with ``exit_code`` ``-1`` and
	    ``"<label> build failed"`` as ``stderr``.
	    """
	    build_cmd = [nvcc, "-arch=sm_86", "-O3", "-std=c++17", str(source), "-o", str(exe)]
	    run_cmd = [str(exe)]
	    sass_cmd = [cuobjdump, "--dump-sass", str(exe)]

	    build = _run_msvc(build_cmd, ROOT)
	    if build["exit_code"] != 0:
	        reason = f"{label} build failed"
	        skipped = [
	            {"command": cmd, "exit_code": -1, "stdout": "", "stderr": reason}
	            for cmd in (run_cmd, sass_cmd)
	        ]
	        return build, skipped[0], skipped[1]
	    return build, _run(run_cmd, ROOT), _run(sass_cmd, ROOT)


	def _trim(result: dict) -> dict:
	    """Return a report-sized copy of a command result.

	    ``command`` and ``exit_code`` are copied as they are; ``stdout`` and
	    ``stderr`` are cut down to their last 2000 characters and stored under
	    ``stdout_tail`` / ``stderr_tail``.  A missing stream counts as empty.
	    """
	    tail_chars = 2000
	    trimmed = {"command": result["command"], "exit_code": result["exit_code"]}
	    for stream in ("stdout", "stderr"):
	        trimmed[f"{stream}_tail"] = result.get(stream, "")[-tail_chars:]
	    return trimmed


	def _parse_int6_throughput(text: str) -> dict:
	    """Extract the INT6 benchmark figures printed by the kernel executable.

	    Each figure is looked up as ``"<label>: <number>"`` in *text*.

	    Args:
	        text: Captured standard output of the INT6 kernel run.

	    Returns:
	        A dict with one float (or ``None`` when the label is absent or is not
	        followed by a number) per metric, plus the fixed ``optimized_kernel``,
	        ``reference_kernel`` and ``meaning`` description strings.
	    """
	    # A well-formed decimal literal.  A loose character class would also
	    # capture tokens such as "12.5." or "1e", on which float() raises.
	    number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"

	    def grab(name: str) -> float | None:
	        match = re.search(rf"{re.escape(name)}:\s*({number})", text)
	        return float(match.group(1)) if match else None

	    labels = {
	        "benchmark_iters": "INT6 benchmark iters",
	        "elapsed_ms": "INT6 benchmark elapsed_ms",
	        "reference_elapsed_ms": "INT6 reference elapsed_ms",
	        "optimized_elapsed_ms": "INT6 optimized elapsed_ms",
	        "optimized_speedup_vs_reference": "INT6 optimized speedup_vs_reference",
	        "reference_actual_sparse_tops": "INT6 reference actual_sparse_tops",
	        "optimized_actual_sparse_tops": "INT6 optimized actual_sparse_tops",
	        "reference_dense_equivalent_tops": "INT6 reference dense_equivalent_tops",
	        "optimized_dense_equivalent_tops": "INT6 optimized dense_equivalent_tops",
	        "actual_sparse_tops": "INT6 actual_sparse_tops",
	        "dense_equivalent_tops": "INT6 dense_equivalent_tops",
	    }
	    parsed: dict = {key: grab(label) for key, label in labels.items()}
	    parsed["optimized_kernel"] = "shared_memory_tiled_cuda_core"
	    parsed["reference_kernel"] = "global_memory_cuda_core"
	    parsed["meaning"] = "actual_sparse_tops counts only non-zero 2:4 sparse multiply-add work; dense_equivalent_tops counts equivalent dense MxKxN multiply-add work. Optimized path is shared-memory tiled CUDA-core math, not native INT6 Tensor Core."
	    return parsed


	def _markdown(report: dict) -> str:
	    """Render the headline figures of *report* as a short Markdown summary.

	    The result is a level-one heading, a blank line, one bullet per figure and
	    a trailing empty line entry, joined with newlines.
	    """
	    kernel = report["int6_cuda_kernel"]
	    boundary = report["tensor_core_sparse_ptx_boundary"]
	    # Values are looked up in the same order in which the bullets are printed.
	    bullets = [
	        ("INT6 CUDA kernel passed", kernel["passed"]),
	        ("INT6 max abs error", kernel["max_abs_error"]),
	        ("INT6 actual sparse TOPS", kernel["throughput"]["actual_sparse_tops"]),
	        ("INT6 dense-equivalent TOPS", kernel["throughput"]["dense_equivalent_tops"]),
	        ("INT6 optimized speedup vs reference", kernel["throughput"]["optimized_speedup_vs_reference"]),
	        ("Sparse Tensor Core SASS observed", boundary["passed"]),
	        ("INT6 native Tensor Core available", report["ampere_int6_tensor_core_native"]["available"]),
	        ("Reason", report["ampere_int6_tensor_core_native"]["reason"]),
	        ("World-best kernel claim allowed", report["claim_gate"]["world_best_kernel_claim_allowed"]),
	    ]
	    lines = ["# TinyMind INT6 CUDA Eval", ""]
	    for label, value in bullets:
	        lines.append(f"- {label}: {value}")
	    lines.append("")
	    return "\n".join(lines)

Xet Storage Details

Size: 8.84 kB
Xet hash: 371e51236fa1a625655ff5fca7c7a4425f5d06cd316f85aa881f22955e325601

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.