Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /int6_tensorcore_bridge.py
| """INT6 to Tensor Core bridge estimator. | |
| Ampere has no native INT6 Tensor Core instruction. The practical route is to | |
| represent signed INT6 as two INT4 sparse Tensor Core passes: | |
| q6 = 4 * q_hi + q_lo | |
| where q_hi and q_lo are signed INT4-compatible correction planes sharing the | |
| same 2:4 pair mask. This preserves the custom INT6 artifact while moving the | |
| hot matmul path toward IMMA.SP. | |
| """ | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
def _load(path: str | Path) -> dict:
    """Read a JSON report file and return its top-level object.

    Args:
        path: Location of the JSON report.

    Returns:
        The parsed top-level JSON object. An empty dict is returned when
        ``path`` is not an existing regular file, or when the document's
        top-level value is not a JSON object.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    p = Path(path)
    # is_file() rather than exists(): a directory at this path would pass an
    # exists() check and then fail inside read_text().
    if not p.is_file():
        return {}
    # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM if present.
    payload = json.loads(p.read_text(encoding="utf-8-sig"))
    # The declared return type is dict; treat any other top-level JSON value
    # (array, string, number, null) the same as a missing report.
    return payload if isinstance(payload, dict) else {}
def build_int6_tensorcore_bridge(
    out_dir: str | Path,
    tfw_report: str | Path = "reports/tfw_optimizer/tfw_optimizer_report.json",
    int6_report: str | Path = "reports/int6_cuda_eval_dll/int6_cuda_eval_dll_report.json",
    *,
    bridge_overhead: float = 0.62,
) -> dict:
    """Estimate INT6 throughput when run as two INT4 sparse passes and write a report.

    The estimate takes the first passing ``int4_2:4sp`` candidate from the TFW
    optimizer report, divides its throughput by the two passes, and scales the
    result by ``bridge_overhead``. Nothing here is measured on a GPU; the
    ``claim_gate`` section of the report says so explicitly.

    Args:
        out_dir: Directory that receives ``int6_tensorcore_bridge_report.json``
            and ``int6_tensorcore_bridge_report.md``; created if missing.
        tfw_report: Path to the TFW optimizer report (JSON).
        int6_report: Path to the INT6 CUDA evaluation report (JSON).
        bridge_overhead: Multiplier applied to the per-pass throughput to
            account for correction/scale work. Despite the name it is the
            fraction of throughput that is kept, not the fraction lost.

    Returns:
        The report dict, including the ``json_path`` and ``markdown_path`` of
        the files that were written.

    Missing inputs, absent keys, and explicit JSON ``null`` values all degrade
    to zero-valued measurements instead of raising.
    """
    tfw = _load(tfw_report)
    int6 = _load(int6_report)

    # `.get(key) or default` (rather than `.get(key, default)`) also covers keys
    # that are present but hold JSON null, which arrive here as None.
    int4 = next(
        (
            row
            for row in tfw.get("candidates") or []
            if isinstance(row, dict) and row.get("format") == "int4_2:4sp" and row.get("passed")
        ),
        {},
    )
    int6_kernel = int6.get("int6_cuda_kernel") or {}
    int6_throughput = int6_kernel.get("throughput") or {}

    # All three measurements use the same null-safe coercion so a null field
    # yields 0.0 instead of a TypeError from float(None).
    int4_tops = float(int4.get("avg_effective_tops") or 0.0)
    int4_tops_w = float(int4.get("avg_effective_tops_per_watt") or 0.0)
    reference_tops = float(int6_throughput.get("dense_equivalent_tops") or 0.0)

    # Two sparse MMA passes plus correction/scale overhead. This is an estimate,
    # not a measured CUDA kernel result.
    passes = 2
    estimated_bridge_tops = int4_tops * bridge_overhead / passes if int4_tops > 0 else 0.0
    estimated_bridge_tops_w = int4_tops_w * bridge_overhead / passes if int4_tops_w > 0 else 0.0
    # Without a positive reference there is nothing to compare against.
    speedup_vs_reference = estimated_bridge_tops / reference_tops if reference_tops > 0 else 0.0

    report = {
        "schema_version": "tinymind-int6-tensorcore-bridge-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "equation": "q6 = 4*q_hi + q_lo; compute y = 4*IMMA.SP(q_hi, x) + IMMA.SP(q_lo, x)",
        "inputs": {
            "tfw_report": str(tfw_report),
            "int6_report": str(int6_report),
        },
        "source_measurements": {
            "int4_avg_effective_tops": int4_tops,
            "int4_avg_effective_tops_per_watt": int4_tops_w,
            "int6_reference_dense_equivalent_tops": reference_tops,
            "int6_reference_max_abs_error": int6_kernel.get("max_abs_error"),
        },
        "bridge_estimate": {
            "passes": passes,
            "overhead_factor_after_two_passes": bridge_overhead,
            "estimated_dense_equivalent_tops": estimated_bridge_tops,
            "estimated_tops_per_watt": estimated_bridge_tops_w,
            "estimated_speedup_vs_int6_reference": speedup_vs_reference,
        },
        "implementation_steps": [
            "split packed INT6 weights into shared-mask q_hi and q_lo INT4 sparse planes",
            "reuse existing IMMA.SP INT4 sparse tile path for both planes",
            "fuse 4*high + low accumulation before dequant scale",
            "measure drift against INT6 reference layer and BF16 dense layer",
            "promote bridge only if measured TF/W and drift gates both pass",
        ],
        "claim_gate": {
            "bridge_is_measured_cuda_kernel": False,
            "int6_bottleneck_removed": False,
            "world_highest_tfw_claim_allowed": False,
            "reason": "This report is an engineering estimate; a real fused two-pass kernel must be built and measured before promotion.",
        },
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "int6_tensorcore_bridge_report.json"
    md_path = out / "int6_tensorcore_bridge_report.md"
    # Record the output locations before serialising so they appear in the JSON too.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _markdown(report: dict) -> str:
    """Render the headline figures of a bridge report as a Markdown summary.

    Args:
        report: Report dict providing ``equation``, ``bridge_estimate`` and
            ``claim_gate`` entries.

    Returns:
        A Markdown document: a title, a blank line, one bullet per figure,
        and a trailing newline.
    """
    estimate = report["bridge_estimate"]
    lines = ["# TinyMind INT6 Tensor Core Bridge", ""]
    lines.append(f"- Equation: `{report['equation']}`")

    tops = estimate["estimated_dense_equivalent_tops"]
    lines.append(f"- Estimated dense-equivalent TOPS: {tops:.6f}")

    tops_per_watt = estimate["estimated_tops_per_watt"]
    lines.append(f"- Estimated TOPS/W: {tops_per_watt:.6f}")

    speedup = estimate["estimated_speedup_vs_int6_reference"]
    lines.append(f"- Estimated speedup vs INT6 reference: {speedup:.2f}x")

    removed = report["claim_gate"]["int6_bottleneck_removed"]
    lines.append(f"- Bottleneck removed: {removed}")

    # The empty final element makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)
Xet Storage Details
- Size:
- 4.68 kB
- Xet hash:
- 52eedaa3be9fd09f3cd0af03c468e1bdd92897f560c178def2d01ad4f00513c1
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.