bbkdevops's picture
download
raw
4.68 kB
"""INT6 to Tensor Core bridge estimator.

Ampere has no native INT6 Tensor Core instruction. The practical route is to
represent signed INT6 as two INT4 sparse Tensor Core passes:

    q6 = 4 * q_hi + q_lo

where q_hi and q_lo are signed INT4-compatible correction planes sharing the
same 2:4 pair mask. This preserves the custom INT6 artifact while moving the
hot matmul path toward IMMA.SP.
"""
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
def _load(path: str | Path) -> dict:
    """Read a JSON report from *path*, falling back to an empty dict.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark is
    tolerated.

    Args:
        path: Location of the JSON report.

    Returns:
        The parsed top-level JSON object.  ``{}`` is returned when the path
        does not exist or when the document's top-level value is not a JSON
        object, so callers can always use ``.get`` on the result.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    report_path = Path(path)
    if not report_path.exists():
        return {}
    payload = json.loads(report_path.read_text(encoding="utf-8-sig"))
    # A list/number/string at the top level would break every ``.get`` lookup
    # downstream; treat it the same as a missing report.
    return payload if isinstance(payload, dict) else {}
def _dict_or_empty(value: object) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _float_or_zero(value: object) -> float:
    """Convert a report field to float, mapping missing/null/falsy values to 0.0."""
    return float(value or 0.0)


def build_int6_tensorcore_bridge(
    out_dir: str | Path,
    tfw_report: str | Path = "reports/tfw_optimizer/tfw_optimizer_report.json",
    int6_report: str | Path = "reports/int6_cuda_eval_dll/int6_cuda_eval_dll_report.json",
    *,
    bridge_overhead: float = 0.62,
) -> dict:
    """Estimate INT6 throughput when emulated as two INT4 sparse passes.

    Reads the INT4 2:4-sparse measurement from *tfw_report* and the INT6
    reference throughput from *int6_report*, derives an estimated throughput
    for the two-pass bridge, and writes the result as JSON and Markdown into
    *out_dir* (created if necessary).

    Args:
        out_dir: Directory that receives ``int6_tensorcore_bridge_report.json``
            and ``int6_tensorcore_bridge_report.md``.
        tfw_report: JSON report whose ``candidates`` list is searched for the
            first row with ``format == "int4_2:4sp"`` and a truthy ``passed``.
        int6_report: JSON report providing
            ``int6_cuda_kernel.throughput.dense_equivalent_tops`` and
            ``int6_cuda_kernel.max_abs_error``.
        bridge_overhead: Multiplier applied to the INT4 figures before they
            are divided by the two passes.  It is an assumed value, not a
            measurement.

    Returns:
        The report dict that was serialised, including the ``json_path`` and
        ``markdown_path`` of the written files.  Missing or null source
        measurements are reported as ``0.0`` and yield ``0.0`` estimates.
    """
    tfw = _dict_or_empty(_load(tfw_report))
    int6 = _dict_or_empty(_load(int6_report))

    # First passing INT4 2:4-sparse candidate; {} when none qualifies.  Null
    # ``candidates`` and non-dict rows are skipped instead of raising.
    int4 = next(
        (
            row
            for row in tfw.get("candidates") or []
            if isinstance(row, dict) and row.get("format") == "int4_2:4sp" and row.get("passed")
        ),
        {},
    )
    int6_kernel = _dict_or_empty(int6.get("int6_cuda_kernel"))
    int6_throughput = _dict_or_empty(int6_kernel.get("throughput"))

    # All three measurements tolerate an absent key or a JSON null.
    int4_tops = _float_or_zero(int4.get("avg_effective_tops"))
    int4_tops_w = _float_or_zero(int4.get("avg_effective_tops_per_watt"))
    reference_tops = _float_or_zero(int6_throughput.get("dense_equivalent_tops"))

    # Two sparse MMA passes plus correction/scale overhead. This is an estimate,
    # not a measured CUDA kernel result.
    estimated_bridge_tops = int4_tops * bridge_overhead / 2.0 if int4_tops > 0 else 0.0
    estimated_bridge_tops_w = int4_tops_w * bridge_overhead / 2.0 if int4_tops_w > 0 else 0.0
    # Without a reference measurement the speedup is reported as 0.0 rather
    # than dividing by zero.
    speedup_vs_reference = estimated_bridge_tops / reference_tops if reference_tops > 0 else 0.0

    report = {
        "schema_version": "tinymind-int6-tensorcore-bridge-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "equation": "q6 = 4*q_hi + q_lo; compute y = 4*IMMA.SP(q_hi, x) + IMMA.SP(q_lo, x)",
        "inputs": {
            "tfw_report": str(tfw_report),
            "int6_report": str(int6_report),
        },
        "source_measurements": {
            "int4_avg_effective_tops": int4_tops,
            "int4_avg_effective_tops_per_watt": int4_tops_w,
            "int6_reference_dense_equivalent_tops": reference_tops,
            "int6_reference_max_abs_error": int6_kernel.get("max_abs_error"),
        },
        "bridge_estimate": {
            "passes": 2,
            "overhead_factor_after_two_passes": bridge_overhead,
            "estimated_dense_equivalent_tops": estimated_bridge_tops,
            "estimated_tops_per_watt": estimated_bridge_tops_w,
            "estimated_speedup_vs_int6_reference": speedup_vs_reference,
        },
        "implementation_steps": [
            "split packed INT6 weights into shared-mask q_hi and q_lo INT4 sparse planes",
            "reuse existing IMMA.SP INT4 sparse tile path for both planes",
            "fuse 4*high + low accumulation before dequant scale",
            "measure drift against INT6 reference layer and BF16 dense layer",
            "promote bridge only if measured TF/W and drift gates both pass",
        ],
        # The gate is hard-wired to False: nothing in this function measures a
        # kernel, so no performance claim may be derived from its output.
        "claim_gate": {
            "bridge_is_measured_cuda_kernel": False,
            "int6_bottleneck_removed": False,
            "world_highest_tfw_claim_allowed": False,
            "reason": "This report is an engineering estimate; a real fused two-pass kernel must be built and measured before promotion.",
        },
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "int6_tensorcore_bridge_report.json"
    md_path = out / "int6_tensorcore_bridge_report.md"
    # Record the output locations before serialising so they appear in the JSON.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _markdown(report: dict) -> str:
    """Render the bridge report as a short Markdown summary.

    Args:
        report: Report dict containing ``equation``, ``bridge_estimate`` and
            ``claim_gate`` entries.

    Returns:
        Markdown text with a title, one bullet per headline figure, and a
        trailing newline.
    """
    estimate = report["bridge_estimate"]
    # (label, key in ``bridge_estimate``, format spec, suffix)
    estimate_rows = (
        ("Estimated dense-equivalent TOPS", "estimated_dense_equivalent_tops", ".6f", ""),
        ("Estimated TOPS/W", "estimated_tops_per_watt", ".6f", ""),
        ("Estimated speedup vs INT6 reference", "estimated_speedup_vs_int6_reference", ".2f", "x"),
    )

    lines = ["# TinyMind INT6 Tensor Core Bridge", ""]
    lines.append(f"- Equation: `{report['equation']}`")
    for label, key, spec, suffix in estimate_rows:
        lines.append(f"- {label}: {format(estimate[key], spec)}{suffix}")
    lines.append(f"- Bottleneck removed: {report['claim_gate']['int6_bottleneck_removed']}")
    # The empty last element makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)

Xet Storage Details

Size:
4.68 kB
·
Xet hash:
52eedaa3be9fd09f3cd0af03c468e1bdd92897f560c178def2d01ad4f00513c1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.